| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-10 09:10:28 -0700 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-10 09:10:28 -0700 |
| commit | 30c999937f69abf935b0228b8411713737377d9e (patch) | |
| tree | 1ae97b8df4d61e83bdba2388b2254bb407418385 /kernel | |
| parent | 493ffd6605b2d3d4dc7008ab927dba319f36671f (diff) | |
| parent | fdf756f7127185eeffe00e918e66dfee797f3625 (diff) | |
Merge tag 'sched-core-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"Debuggability:
- Change most occurrences of BUG_ON() to WARN_ON_ONCE() (a representative hunk is shown after this summary)
- Reorganize & fix TASK_ state comparisons, turn it into a bitmap
- Update/fix misc scheduler debugging facilities
Load-balancing & regular scheduling:
- Improve the behavior of the scheduler in the presence of a lot of
SCHED_IDLE tasks - in particular, they should not impact other
scheduling classes.
- Optimize task load tracking, cleanups & fixes
- Clean up & simplify misc load-balancing code
Freezer:
- Rewrite the core freezer to behave better wrt thawing and be
simpler in general, by replacing PF_FROZEN with TASK_FROZEN &
fixing/adjusting all the fallout (the new wait pattern is sketched
after this summary).
Deadline scheduler:
- Fix the DL capacity-aware code
- Factor out dl_task_is_earliest_deadline() &
replenish_dl_new_period() (both reproduced after this summary)
- Relax/optimize locking in task_non_contending()
Cleanups:
- Factor out the update_current_exec_runtime() helper (sketched after this summary)
- Various cleanups, simplifications"
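The debuggability item above follows one mechanical pattern across the series; the representative hunk below is lifted from the kernel/sched/autogroup.c change in this merge, where a hard BUG_ON() becomes a WARN_ON_ONCE() plus a graceful bail-out:

	-	BUG_ON(!lock_task_sighand(p, &flags));
	+	if (WARN_ON_ONCE(!lock_task_sighand(p, &flags)))
	+		return;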
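A minimal sketch (not verbatim kernel code) of the freezable-sleep idiom that replaces freezable_schedule() after the core-freezer rewrite; the kernel/exit.c and kernel/futex/waitwake.c hunks in the diff below use exactly this shape, with 'condition' standing in for whatever the caller is waiting on:

	for (;;) {
		/* OR in TASK_FREEZABLE so the freezer can move us to TASK_FROZEN */
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		if (condition)		/* placeholder for the caller's wait condition */
			break;
		schedule();		/* plain schedule(); freezable_schedule() is gone */
	}
	__set_current_state(TASK_RUNNING);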
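The two deadline-scheduler helpers called out above are introduced in the kernel/sched/deadline.c hunks of this merge; reproduced here for reference:

	static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se,
						   struct rq *rq)
	{
		/* for non-boosted task, pi_of(dl_se) == dl_se */
		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
		dl_se->runtime = pi_of(dl_se)->dl_runtime;
	}

	static inline bool dl_task_is_earliest_deadline(struct task_struct *p,
							struct rq *rq)
	{
		return (!rq->dl.dl_nr_running ||
			dl_time_before(p->dl.deadline,
				       rq->dl.earliest_dl.curr));
	}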
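update_current_exec_runtime() itself lands in kernel/sched/sched.h, which appears in the diffstat but not in the hunks shown below. Reconstructed from the four bookkeeping lines it replaces in update_curr_dl(), it plausibly reads as follows (a sketch, not the verbatim header definition):

	static inline void update_current_exec_runtime(struct task_struct *curr,
						       u64 now, u64 delta_exec)
	{
		curr->se.sum_exec_runtime += delta_exec;
		account_group_exec_runtime(curr, delta_exec);

		curr->se.exec_start = now;
		cgroup_account_cputime(curr, delta_exec);
	}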
* tag 'sched-core-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
sched: Fix more TASK_state comparisons
sched: Fix TASK_state comparisons
sched/fair: Move call to list_last_entry() in detach_tasks
sched/fair: Cleanup loop_max and loop_break
sched/fair: Make sure to try to detach at least one movable task
sched: Show PF_flag holes
freezer,sched: Rewrite core freezer logic
sched: Widen TAKS_state literals
sched/wait: Add wait_event_state()
sched/completion: Add wait_for_completion_state()
sched: Add TASK_ANY for wait_task_inactive()
sched: Change wait_task_inactive()s match_state
freezer,umh: Clean up freezer/initrd interaction
freezer: Have {,un}lock_system_sleep() save/restore flags
sched: Rename task_running() to task_on_cpu()
sched/fair: Cleanup for SIS_PROP
sched/fair: Default to false in test_idle_cores()
sched/fair: Remove useless check in select_idle_core()
sched/fair: Avoid double search on same cpu
sched/fair: Remove redundant check in select_idle_smt()
...
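Two of the commits above add state-aware wait primitives: wait_event_state() and wait_for_completion_state(). The state mask below mirrors the one wait_for_vfork_done() passes in the kernel/fork.c hunk of this merge; the wait-queue, condition and completion names are placeholders for illustration:

	unsigned int state = TASK_UNINTERRUPTIBLE | TASK_KILLABLE | TASK_FREEZABLE;
	int err;

	/* wait_event() variant that sleeps in a caller-chosen state */
	err = wait_event_state(my_wq, my_condition, state);

	/* wait_for_completion() variant, as used by wait_for_vfork_done() */
	err = wait_for_completion_state(&my_completion, state);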
Diffstat (limited to 'kernel')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/cgroup/legacy_freezer.c | 23 |
| -rw-r--r-- | kernel/exit.c | 4 |
| -rw-r--r-- | kernel/fork.c | 5 |
| -rw-r--r-- | kernel/freezer.c | 133 |
| -rw-r--r-- | kernel/futex/waitwake.c | 8 |
| -rw-r--r-- | kernel/hung_task.c | 16 |
| -rw-r--r-- | kernel/power/hibernate.c | 35 |
| -rw-r--r-- | kernel/power/main.c | 18 |
| -rw-r--r-- | kernel/power/process.c | 10 |
| -rw-r--r-- | kernel/power/suspend.c | 12 |
| -rw-r--r-- | kernel/power/user.c | 24 |
| -rw-r--r-- | kernel/ptrace.c | 2 |
| -rw-r--r-- | kernel/sched/autogroup.c | 3 |
| -rw-r--r-- | kernel/sched/completion.c | 12 |
| -rw-r--r-- | kernel/sched/core.c | 103 |
| -rw-r--r-- | kernel/sched/core_sched.c | 4 |
| -rw-r--r-- | kernel/sched/cpudeadline.c | 2 |
| -rw-r--r-- | kernel/sched/cpupri.c | 2 |
| -rw-r--r-- | kernel/sched/deadline.c | 115 |
| -rw-r--r-- | kernel/sched/fair.c | 227 |
| -rw-r--r-- | kernel/sched/rt.c | 18 |
| -rw-r--r-- | kernel/sched/sched.h | 65 |
| -rw-r--r-- | kernel/sched/stop_task.c | 11 |
| -rw-r--r-- | kernel/signal.c | 14 |
| -rw-r--r-- | kernel/time/hrtimer.c | 4 |
| -rw-r--r-- | kernel/umh.c | 18 |
26 files changed, 470 insertions, 418 deletions
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 08236798d173..1b6b21851e9d 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -113,7 +113,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); } mutex_unlock(&freezer_mutex); @@ -134,7 +134,7 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); freezer->state = 0; @@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup_taskset *tset) __thaw_task(task); } else { freeze_task(task); + /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; @@ -271,16 +272,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css) css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. - */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } + if (freezing(task) && !frozen(task)) + goto out_iter_end; } freezer->state |= CGROUP_FROZEN; @@ -357,7 +350,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { @@ -366,9 +359,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, freezer->state &= ~state; if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); freezer->state &= ~CGROUP_FROZEN; + if (was_freezing) + static_branch_dec(&freezer_active); unfreeze_cgroup(freezer); } } diff --git a/kernel/exit.c b/kernel/exit.c index 4f7424523bac..4b8e7c94a3c0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -374,10 +374,10 @@ static void coredump_task_exit(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; - freezable_schedule(); + schedule(); } __set_current_state(TASK_RUNNING); } diff --git a/kernel/fork.c b/kernel/fork.c index cecab2735935..947eb1a6399a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1421,13 +1421,12 @@ static void complete_vfork_done(struct task_struct *tsk) static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { + unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE; int killed; - freezer_do_not_count(); cgroup_enter_frozen(); - killed = wait_for_completion_killable(vfork); + killed = wait_for_completion_state(vfork, state); cgroup_leave_frozen(false); - freezer_count(); if (killed) { task_lock(child); diff --git a/kernel/freezer.c b/kernel/freezer.c index 45ab36ffd0e7..4fad0e6fca64 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -13,10 +13,11 @@ #include <linux/kthread.h> /* total number of freezing conditions in effect */ -atomic_t system_freezing_cnt = ATOMIC_INIT(0); -EXPORT_SYMBOL(system_freezing_cnt); 
+DEFINE_STATIC_KEY_FALSE(freezer_active); +EXPORT_SYMBOL(freezer_active); -/* indicate whether PM freezing is in effect, protected by +/* + * indicate whether PM freezing is in effect, protected by * system_transition_mutex */ bool pm_freezing; @@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock); * freezing_slow_path - slow path for testing whether a task needs to be frozen * @p: task to be tested * - * This function is called by freezing() if system_freezing_cnt isn't zero + * This function is called by freezing() if freezer_active isn't zero * and tests whether @p needs to enter and stay in frozen state. Can be * called under any context. The freezers are responsible for ensuring the * target tasks see the updated state. @@ -52,41 +53,40 @@ bool freezing_slow_path(struct task_struct *p) } EXPORT_SYMBOL(freezing_slow_path); +bool frozen(struct task_struct *p) +{ + return READ_ONCE(p->__state) & TASK_FROZEN; +} + /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) { - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ + unsigned int state = get_current_state(); bool was_frozen = false; - unsigned int save = get_current_state(); pr_debug("%s entered refrigerator\n", current->comm); + WARN_ON_ONCE(state && !(state & TASK_NORMAL)); + for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + bool freeze; + + set_current_state(TASK_FROZEN); spin_lock_irq(&freezer_lock); - current->flags |= PF_FROZEN; - if (!freezing(current) || - (check_kthr_stop && kthread_should_stop())) - current->flags &= ~PF_FROZEN; + freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); spin_unlock_irq(&freezer_lock); - if (!(current->flags & PF_FROZEN)) + if (!freeze) break; + was_frozen = true; schedule(); } + __set_current_state(TASK_RUNNING); pr_debug("%s left refrigerator\n", current->comm); - /* - * Restore saved task state before returning. The mb'd version - * needs to be used; otherwise, it might silently break - * synchronization which depends on ordered task state change. - */ - set_current_state(save); - return was_frozen; } EXPORT_SYMBOL(__refrigerator); @@ -101,6 +101,44 @@ static void fake_signal_wake_up(struct task_struct *p) } } +static int __set_task_frozen(struct task_struct *p, void *arg) +{ + unsigned int state = READ_ONCE(p->__state); + + if (p->on_rq) + return 0; + + if (p != current && task_curr(p)) + return 0; + + if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED))) + return 0; + + /* + * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they + * can suffer spurious wakeups. + */ + if (state & TASK_FREEZABLE) + WARN_ON_ONCE(!(state & TASK_NORMAL)); + +#ifdef CONFIG_LOCKDEP + /* + * It's dangerous to freeze with locks held; there be dragons there. + */ + if (!(state & __TASK_FREEZABLE_UNSAFE)) + WARN_ON_ONCE(debug_locks && p->lockdep_depth); +#endif + + WRITE_ONCE(p->__state, TASK_FROZEN); + return TASK_FROZEN; +} + +static bool __freeze_task(struct task_struct *p) +{ + /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */ + return task_call_func(p, __set_task_frozen, NULL); +} + /** * freeze_task - send a freeze request to given task * @p: task to send the request to @@ -116,20 +154,8 @@ bool freeze_task(struct task_struct *p) { unsigned long flags; - /* - * This check can race with freezer_do_not_count, but worst case that - * will result in an extra wakeup being sent to the task. 
It does not - * race with freezer_count(), the barriers in freezer_count() and - * freezer_should_skip() ensure that either freezer_count() sees - * freezing == true in try_to_freeze() and freezes, or - * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task - * normally. - */ - if (freezer_should_skip(p)) - return false; - spin_lock_irqsave(&freezer_lock, flags); - if (!freezing(p) || frozen(p)) { + if (!freezing(p) || frozen(p) || __freeze_task(p)) { spin_unlock_irqrestore(&freezer_lock, flags); return false; } @@ -137,19 +163,52 @@ bool freeze_task(struct task_struct *p) if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); else - wake_up_state(p, TASK_INTERRUPTIBLE); + wake_up_state(p, TASK_NORMAL); spin_unlock_irqrestore(&freezer_lock, flags); return true; } +/* + * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical + * state in p->jobctl. If either of them got a wakeup that was missed because + * TASK_FROZEN, then their canonical state reflects that and the below will + * refuse to restore the special state and instead issue the wakeup. + */ +static int __set_task_special(struct task_struct *p, void *arg) +{ + unsigned int state = 0; + + if (p->jobctl & JOBCTL_TRACED) + state = TASK_TRACED; + + else if (p->jobctl & JOBCTL_STOPPED) + state = TASK_STOPPED; + + if (state) + WRITE_ONCE(p->__state, state); + + return state; +} + void __thaw_task(struct task_struct *p) { - unsigned long flags; + unsigned long flags, flags2; spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) - wake_up_process(p); + if (WARN_ON_ONCE(freezing(p))) + goto unlock; + + if (lock_task_sighand(p, &flags2)) { + /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */ + bool ret = task_call_func(p, __set_task_special, NULL); + unlock_task_sighand(p, &flags2); + if (ret) + goto unlock; + } + + wake_up_state(p, TASK_FROZEN); +unlock: spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 4ce0923f1ce3..ba01b9408203 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -334,7 +334,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb); /* Arm the timer */ @@ -352,7 +352,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) - freezable_schedule(); + schedule(); } __set_current_state(TASK_RUNNING); } @@ -430,7 +430,7 @@ retry: return ret; } - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; @@ -504,7 +504,7 @@ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, return; } - freezable_schedule(); + schedule(); } /** diff --git a/kernel/hung_task.c b/kernel/hung_task.c index bb2354f73ded..c71889f3f3fc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -95,8 +95,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * Ensure the task is not frozen. * Also, skip vfork and any other user process that freezer should skip. 
*/ - if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) - return; + if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) + return; /* * When a freshly created task is scheduled once, changes its state to @@ -191,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { + unsigned int state; + if (!max_count--) goto unlock; if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { @@ -198,8 +200,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) + /* + * skip the TASK_KILLABLE tasks -- these can be killed + * skip the TASK_IDLE tasks -- those are genuinely idle + */ + state = READ_ONCE(t->__state); + if ((state & TASK_UNINTERRUPTIBLE) && + !(state & TASK_WAKEKILL) && + !(state & TASK_NOLOAD)) check_hung_task(t, timeout); } unlock: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 89c71fce225d..f58a0aa92310 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -92,20 +92,24 @@ bool hibernation_available(void) */ void hibernation_set_ops(const struct platform_hibernation_ops *ops) { + unsigned int sleep_flags; + if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } - lock_system_sleep(); + + sleep_flags = lock_system_sleep(); + hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(hibernation_set_ops); @@ -713,6 +717,7 @@ static int load_image_and_restore(void) int hibernate(void) { bool snapshot_test = false; + unsigned int sleep_flags; int error; if (!hibernation_available()) { @@ -720,7 +725,7 @@ int hibernate(void) return -EPERM; } - lock_system_sleep(); + sleep_flags = lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; @@ -794,7 +799,7 @@ int hibernate(void) pm_restore_console(); hibernate_release(); Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); pr_info("hibernation exit\n"); return error; @@ -809,9 +814,10 @@ int hibernate(void) */ int hibernate_quiet_exec(int (*func)(void *data), void *data) { + unsigned int sleep_flags; int error; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; @@ -891,7 +897,7 @@ restore: hibernate_release(); unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error; } @@ -1100,11 +1106,12 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + int mode = HIBERNATION_INVALID; + unsigned int sleep_flags; int error = 0; - int i; int len; char *p; - int mode = HIBERNATION_INVALID; + int i; if (!hibernation_available()) return -EPERM; @@ -1112,7 +1119,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? 
p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { @@ -1142,7 +1149,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, if (!error) pm_pr_dbg("Hibernation mode set to '%s'\n", hibernation_modes[mode]); - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error ? error : n; } @@ -1158,9 +1165,10 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - dev_t res; + unsigned int sleep_flags; int len = n; char *name; + dev_t res; if (len && buf[len-1] == '\n') len--; @@ -1173,9 +1181,10 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (!res) return -EINVAL; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); swsusp_resume_device = res; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); + pm_pr_dbg("Configured hibernation resume from disk to %u\n", swsusp_resume_device); noresume = 0; diff --git a/kernel/power/main.c b/kernel/power/main.c index e3694034b753..31ec4a9b9d70 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,14 +21,16 @@ #ifdef CONFIG_PM_SLEEP -void lock_system_sleep(void) +unsigned int lock_system_sleep(void) { - current->flags |= PF_FREEZER_SKIP; + unsigned int flags = current->flags; + current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); + return flags; } EXPORT_SYMBOL_GPL(lock_system_sleep); -void unlock_system_sleep(void) +void unlock_system_sleep(unsigned int flags) { /* * Don't use freezer_count() because we don't want the call to @@ -46,7 +48,8 @@ void unlock_system_sleep(void) * Which means, if we use try_to_freeze() here, it would make them * enter the refrigerator, thus causing hibernation to lockup. */ - current->flags &= ~PF_FREEZER_SKIP; + if (!(flags & PF_NOFREEZE)) + current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); @@ -263,16 +266,17 @@ static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + unsigned int sleep_flags; const char * const *s; + int error = -EINVAL; int level; char *p; int len; - int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -282,7 +286,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error ? 
error : n; } diff --git a/kernel/power/process.c b/kernel/power/process.c index 3068601e585a..ddd9988327fe 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -50,8 +50,7 @@ static int try_to_freeze_tasks(bool user_only) if (p == current || !freeze_task(p)) continue; - if (!freezer_should_skip(p)) - todo++; + todo++; } read_unlock(&tasklist_lock); @@ -96,8 +95,7 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { - if (p != current && !freezer_should_skip(p) - && freezing(p) && !frozen(p)) + if (p != current && freezing(p) && !frozen(p)) sched_show_task(p); } read_unlock(&tasklist_lock); @@ -129,7 +127,7 @@ int freeze_processes(void) current->flags |= PF_SUSPEND_TASK; if (!pm_freezing) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); pm_wakeup_clear(0); pr_info("Freezing user space processes ... "); @@ -190,7 +188,7 @@ void thaw_processes(void) trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); pm_freezing = false; pm_nosig_freezing = false; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c6272d466e58..fa3bf161d13f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -75,9 +75,11 @@ EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { - lock_system_sleep(); + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); s2idle_ops = ops; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } static void s2idle_begin(void) @@ -203,7 +205,9 @@ __setup("mem_sleep_default=", mem_sleep_default_setup); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - lock_system_sleep(); + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); suspend_ops = ops; @@ -219,7 +223,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) mem_sleep_current = PM_SUSPEND_MEM; } - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(suspend_set_ops); diff --git a/kernel/power/user.c b/kernel/power/user.c index d43c2aa583b2..3a4e70366f35 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -47,12 +47,13 @@ int is_hibernate_resume_dev(dev_t dev) static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; int error; if (!hibernation_available()) return -EPERM; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; @@ -98,7 +99,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->dev = 0; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error; } @@ -106,8 +107,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); swsusp_free(); data = filp->private_data; @@ -124,7 +126,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) PM_POST_HIBERNATION : PM_POST_RESTORE); hibernate_release(); - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return 0; } @@ -132,11 +134,12 @@ static int snapshot_release(struct inode *inode, struct file *filp) static ssize_t snapshot_read(struct file *filp, char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = 
*offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned int sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); data = filp->private_data; if (!data->ready) { @@ -157,7 +160,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, *offp += res; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } @@ -165,16 +168,17 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, static ssize_t snapshot_write(struct file *filp, const char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned long sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; if (need_wait) { wait_for_device_probe(); need_wait = false; } - lock_system_sleep(); + sleep_flags = lock_system_sleep(); data = filp->private_data; @@ -196,7 +200,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res > 0) *offp += res; unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1893d909e45c..54482193e1ed 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -269,7 +269,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) read_unlock(&tasklist_lock); if (!ret && !ignore_state && - WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED))) + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN))) ret = -ESRCH; return ret; diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 4ebaf97f7bd8..991fc9002535 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -161,7 +161,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) struct task_struct *t; unsigned long flags; - BUG_ON(!lock_task_sighand(p, &flags)); + if (WARN_ON_ONCE(!lock_task_sighand(p, &flags))) + return; prev = p->signal->autogroup; if (prev == ag) { diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 35f15c26ed54..d57a5c1c1cd9 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -204,6 +204,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout); int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) return t; return 0; @@ -241,12 +242,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) return t; return 0; } EXPORT_SYMBOL(wait_for_completion_killable); +int __sched wait_for_completion_state(struct completion *x, unsigned int state) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state); + + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_state); + /** * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) * @x: holds the state of this particular completion diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 60fdc0faf1c9..8cd1b5a8f613 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -143,11 +143,7 @@ __read_mostly int sysctl_resched_latency_warn_once = 1; * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. 
*/ -#ifdef CONFIG_PREEMPT_RT -const_debug unsigned int sysctl_sched_nr_migrate = 8; -#else -const_debug unsigned int sysctl_sched_nr_migrate = 32; -#endif +const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; @@ -482,8 +478,7 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * p->se.load, p->rt_priority, * p->dl.dl_{runtime, deadline, period, flags, bw, density} * - sched_setnuma(): p->numa_preferred_nid - * - sched_move_task()/ - * cpu_cgroup_fork(): p->sched_task_group + * - sched_move_task(): p->sched_task_group * - uclamp_update_active() p->uclamp* * * p->state <- TASK_*: @@ -2329,7 +2324,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq = cpu_rq(new_cpu); rq_lock(rq, rf); - BUG_ON(task_cpu(p) != new_cpu); + WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); @@ -2779,7 +2774,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag return -EINVAL; } - if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { + if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { /* * MIGRATE_ENABLE gets here because 'p == current', but for * anything else we cannot do is_migration_disabled(), punt @@ -3255,12 +3250,12 @@ out: /* * wait_task_inactive - wait for a thread to unschedule. * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. + * Wait for the thread to block in any of the states set in @match_state. + * If it changes, i.e. @p might have woken up, then return zero. When we + * succeed in waiting for @p to be off its CPU, we return a positive number + * (its total switch count). If a second call a short while later returns the + * same number, the caller can be sure that @p has remained unscheduled the + * whole time. * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't @@ -3291,12 +3286,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * * NOTE! Since we don't hold any locks, it's not * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will + * But we don't care, since "task_on_cpu()" will * return false if the runqueue has changed and p * is actually now running somewhere else! 
*/ - while (task_running(rq, p)) { - if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) + while (task_on_cpu(rq, p)) { + if (!(READ_ONCE(p->__state) & match_state)) return 0; cpu_relax(); } @@ -3308,10 +3303,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state */ rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); - running = task_running(rq, p); + running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || READ_ONCE(p->__state) == match_state) + if (READ_ONCE(p->__state) & match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @@ -6430,7 +6425,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUPTIBLE) && !(prev_state & TASK_NOLOAD) && - !(prev->flags & PF_FROZEN); + !(prev_state & TASK_FROZEN); if (prev->sched_contributes_to_load) rq->nr_uninterruptible++; @@ -8650,7 +8645,7 @@ again: if (curr->sched_class != p->sched_class) goto out_unlock; - if (task_running(p_rq, p) || !task_is_running(p)) + if (task_on_cpu(p_rq, p) || !task_is_running(p)) goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p); @@ -8862,7 +8857,7 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", + pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", free, task_pid_nr(p), ppid, read_task_thread_flags(p)); @@ -8890,7 +8885,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p) * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows * TASK_KILLABLE). */ - if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE) + if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD)) return false; return true; @@ -9602,9 +9597,6 @@ LIST_HEAD(task_groups); static struct kmem_cache *task_group_cache __read_mostly; #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); -DECLARE_PER_CPU(cpumask_var_t, select_rq_mask); - void __init sched_init(void) { unsigned long ptr = 0; @@ -9648,14 +9640,6 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -10164,7 +10148,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static void sched_change_group(struct task_struct *tsk, int type) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -10180,7 +10164,7 @@ static void sched_change_group(struct task_struct *tsk, int type) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) - tsk->sched_class->task_change_group(tsk, type); + tsk->sched_class->task_change_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); @@ -10211,7 +10195,7 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, TASK_MOVE_GROUP); + sched_change_group(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10289,53 +10273,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 
sched_unregister_group(tg); } -/* - * This is called before wake_up_new_task(), therefore we really only - * have to set its group bits, all the other stuff does not apply. - */ -static void cpu_cgroup_fork(struct task_struct *task) -{ - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(task, &rf); - - update_rq_clock(rq); - sched_change_group(task, TASK_SET_GROUP); - - task_rq_unlock(rq, task, &rf); -} - +#ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; - int ret = 0; cgroup_taskset_for_each(task, css, tset) { -#ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#endif - /* - * Serialize against wake_up_new_task() such that if it's - * running, we're sure to observe its full state. - */ - raw_spin_lock_irq(&task->pi_lock); - /* - * Avoid calling sched_move_task() before wake_up_new_task() - * has happened. This would lead to problems with PELT, due to - * move wanting to detach+attach while we're not attached yet. - */ - if (READ_ONCE(task->__state) == TASK_NEW) - ret = -EINVAL; - raw_spin_unlock_irq(&task->pi_lock); - - if (ret) - break; } - return ret; + return 0; } +#endif static void cpu_cgroup_attach(struct cgroup_taskset *tset) { @@ -11171,8 +11121,9 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, - .fork = cpu_cgroup_fork, +#ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, +#endif .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_legacy_files, .dfl_cftypes = cpu_files, diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 93878cb2a46d..a57fd8f27498 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -88,7 +88,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * core has now entered/left forced idle state. Defer accounting to the * next scheduling edge, rather than always forcing a reschedule here. */ - if (task_running(rq, p)) + if (task_on_cpu(rq, p)) resched_curr(rq); task_rq_unlock(rq, p, &rf); @@ -205,7 +205,7 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, default: err = -EINVAL; goto out; - }; + } if (type == PIDTYPE_PID) { __sched_core_set(task, cookie); diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 02d970a879ed..57c92d751bcd 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -123,7 +123,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, unsigned long cap, max_cap = 0; int cpu, max_cpu = -1; - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return 1; /* Ensure the capacity of the CPUs fits the task. 
*/ diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index fa9ce9d83683..a286e726eb4b 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -147,7 +147,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, int task_pri = convert_prio(p->prio); int idx, cpu; - BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); + WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0ab79d819a0d..86dea6a05267 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -124,15 +124,12 @@ static inline int dl_bw_cpus(int i) return cpus; } -static inline unsigned long __dl_bw_capacity(int i) +static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) { - struct root_domain *rd = cpu_rq(i)->rd; unsigned long cap = 0; + int i; - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - - for_each_cpu_and(i, rd->span, cpu_active_mask) + for_each_cpu_and(i, mask, cpu_active_mask) cap += capacity_orig_of(i); return cap; @@ -144,11 +141,14 @@ static inline unsigned long __dl_bw_capacity(int i) */ static inline unsigned long dl_bw_capacity(int i) { - if (!static_branch_unlikely(&sched_asym_cpucapacity) && + if (!sched_asym_cpucap_active() && capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; } else { - return __dl_bw_capacity(i); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return __dl_bw_capacity(cpu_rq(i)->rd->span); } } @@ -310,7 +310,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; - BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); if (task_on_rq_queued(p)) return; @@ -431,8 +431,8 @@ static void task_non_contending(struct task_struct *p) sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); + __dl_clear_params(p); } return; @@ -607,7 +607,7 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct rb_node *leftmost; - BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); leftmost = rb_add_cached(&p->pushable_dl_tasks, &rq->dl.pushable_dl_tasks_root, @@ -684,7 +684,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * Failed to find any suitable CPU. * The task will never come back! */ - BUG_ON(dl_bandwidth_enabled()); + WARN_ON_ONCE(dl_bandwidth_enabled()); /* * If admission control is disabled we @@ -770,6 +770,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +{ + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; +} + /* * We are being explicitly informed that a new instance is starting, * and this means that: @@ -803,8 +811,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). 
*/ - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } /* @@ -830,16 +837,14 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_of(dl_se)->dl_runtime <= 0); + WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. */ - if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; - } + if (dl_se->dl_deadline == 0) + replenish_dl_new_period(dl_se, rq); if (dl_se->dl_yielded && dl_se->runtime > 0) dl_se->runtime = 0; @@ -866,8 +871,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; + replenish_dl_new_period(dl_se, rq); } if (dl_se->dl_yielded) @@ -1024,8 +1028,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) return; } - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; + replenish_dl_new_period(dl_se, rq); } } @@ -1333,11 +1336,7 @@ static void update_curr_dl(struct rq *rq) trace_sched_stat_runtime(curr, delta_exec, 0); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); if (dl_entity_is_special(dl_se)) return; @@ -1616,7 +1615,7 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); + WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node)); rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); @@ -1640,7 +1639,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { - BUG_ON(on_dl_rq(dl_se)); + WARN_ON_ONCE(on_dl_rq(dl_se)); update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); @@ -1814,6 +1813,14 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP +static inline bool dl_task_is_earliest_deadline(struct task_struct *p, + struct rq *rq) +{ + return (!rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + rq->dl.earliest_dl.curr)); +} + static int find_later_rq(struct task_struct *task); static int @@ -1849,16 +1856,14 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) * Take the capacity of the CPU into account to * ensure it fits the requirement of the task. 
*/ - if (static_branch_unlikely(&sched_asym_cpucapacity)) + if (sched_asym_cpucap_active()) select_rq |= !dl_task_fits_capacity(p, cpu); if (select_rq) { int target = find_later_rq(p); if (target != -1 && - (dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr) || - (cpu_rq(target)->dl.dl_nr_running == 0))) + dl_task_is_earliest_deadline(p, cpu_rq(target))) cpu = target; } rcu_read_unlock(); @@ -2017,7 +2022,7 @@ static struct task_struct *pick_task_dl(struct rq *rq) return NULL; dl_se = pick_next_dl_entity(dl_rq); - BUG_ON(!dl_se); + WARN_ON_ONCE(!dl_se); p = dl_task_of(dl_se); return p; @@ -2087,7 +2092,7 @@ static void task_fork_dl(struct task_struct *p) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; @@ -2225,9 +2230,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (later_rq->dl.dl_nr_running && - !dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) { + if (!dl_task_is_earliest_deadline(task, later_rq)) { /* * Target rq has tasks of equal or earlier deadline, * retrying does not release any lock and is unlikely @@ -2241,7 +2244,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || - task_running(rq, task) || + task_on_cpu(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); @@ -2255,9 +2258,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) * its earliest one has a later deadline than our * task, the rq is a good one. */ - if (!later_rq->dl.dl_nr_running || - dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) + if (dl_task_is_earliest_deadline(task, later_rq)) break; /* Otherwise we try again. */ @@ -2277,12 +2278,12 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); return p; } @@ -2428,9 +2429,7 @@ static void pull_dl_task(struct rq *this_rq) * - it will preempt the last one we pulled (if any). 
*/ if (p && dl_time_before(p->dl.deadline, dmin) && - (!this_rq->dl.dl_nr_running || - dl_time_before(p->dl.deadline, - this_rq->dl.earliest_dl.curr))) { + dl_task_is_earliest_deadline(p, this_rq)) { WARN_ON(p == src_rq->curr); WARN_ON(!task_on_rq_queued(p)); @@ -2475,7 +2474,7 @@ skip: */ static void task_woken_dl(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && dl_task(rq->curr) && @@ -2492,7 +2491,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, struct root_domain *src_rd; struct rq *rq; - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); src_rd = rq->rd; @@ -3007,17 +3006,15 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; + unsigned long flags, cap; struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - + cap = __dl_bw_capacity(trial); raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + if (__dl_overflow(cur_dl_b, cap, 0, 0)) ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 914096c5b1ae..5ffec4370602 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -799,8 +799,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static void attach_entity_cfs_rq(struct sched_entity *se); - /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -835,20 +833,6 @@ void post_init_entity_util_avg(struct task_struct *p) long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; - if (cap > 0) { - if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; - sa->util_avg /= (cfs_rq->avg.load_avg + 1); - - if (sa->util_avg > cap) - sa->util_avg = cap; - } else { - sa->util_avg = cap; - } - } - - sa->runnable_avg = sa->util_avg; - if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -864,7 +848,19 @@ void post_init_entity_util_avg(struct task_struct *p) return; } - attach_entity_cfs_rq(se); + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + } + + sa->runnable_avg = sa->util_avg; } #else /* !CONFIG_SMP */ @@ -1592,11 +1588,11 @@ numa_type numa_classify(unsigned int imbalance_pct, #ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ -static inline bool test_idle_cores(int cpu, bool def); +static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) { if (!static_branch_likely(&sched_smt_present) || - idle_core >= 0 || !test_idle_cores(cpu, false)) + idle_core >= 0 || !test_idle_cores(cpu)) return idle_core; /* @@ -2600,7 +2596,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - BUG_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled()); double_lock_irq(&my_grp->lock, 
&grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { @@ -3838,8 +3834,7 @@ static void migrate_se_pelt_lag(struct sched_entity *se) {} * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) - * avg. The immediate corollary is that all (fair) tasks must be attached, see - * post_init_entity_util_avg(). + * avg. The immediate corollary is that all (fair) tasks must be attached. * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * @@ -4003,6 +3998,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 #define DO_ATTACH 0x4 +#define DO_DETACH 0x8 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -4032,6 +4028,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq); + } else if (flags & DO_DETACH) { + /* + * DO_DETACH means we're here from dequeue_entity() + * and we are migrating task out of the CPU. + */ + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); @@ -4064,8 +4067,8 @@ static void remove_entity_load_avg(struct sched_entity *se) /* * tasks cannot exit without having gone through wake_up_new_task() -> - * post_init_entity_util_avg() which will have added things to the - * cfs_rq, so we can remove unconditionally. + * enqueue_task_fair() which will have added things to the cfs_rq, + * so we can remove unconditionally. */ sync_entity_load_avg(se); @@ -4262,7 +4265,7 @@ static inline int task_fits_capacity(struct task_struct *p, static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return; if (!p || p->nr_cpus_allowed == 1) { @@ -4292,6 +4295,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 +#define DO_DETACH 0x0 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { @@ -4434,7 +4438,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -4511,6 +4516,11 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + int action = UPDATE_TG; + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + /* * Update run-time statistics of the 'current'. */ @@ -4519,12 +4529,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Subtract its load from the cfs_rq->runnable_avg. + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. 
* - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(cfs_rq, se, action); se_update_runnable(se); update_stats_dequeue_fair(cfs_rq, se, flags); @@ -5893,8 +5904,8 @@ dequeue_throttle: #ifdef CONFIG_SMP /* Working cpumask for: load_balance, load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); +static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); #ifdef CONFIG_NO_HZ_COMMON @@ -6260,7 +6271,7 @@ static inline void set_idle_cores(int cpu, int val) WRITE_ONCE(sds->has_idle_cores, val); } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; @@ -6268,7 +6279,7 @@ static inline bool test_idle_cores(int cpu, bool def) if (sds) return READ_ONCE(sds->has_idle_cores); - return def; + return false; } /* @@ -6284,7 +6295,7 @@ void __update_idle_core(struct rq *rq) int cpu; rcu_read_lock(); - if (test_idle_cores(core, true)) + if (test_idle_cores(core)) goto unlock; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -6310,9 +6321,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu bool idle = true; int cpu; - if (!static_branch_likely(&sched_smt_present)) - return __select_idle_cpu(core, p); - for_each_cpu(cpu, cpu_smt_mask(core)) { if (!available_idle_cpu(cpu)) { idle = false; @@ -6339,13 +6347,12 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_smt(struct task_struct *p, int target) { int cpu; - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) + for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { + if (cpu == target) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6360,9 +6367,9 @@ static inline void set_idle_cores(int cpu, int val) { } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { - return def; + return false; } static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) @@ -6370,7 +6377,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_smt(struct task_struct *p, int target) { return -1; } @@ -6389,19 +6396,19 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct sched_domain_shared *sd_share; struct rq *this_rq = this_rq(); int this = smp_processor_id(); - struct sched_domain *this_sd; + struct sched_domain *this_sd = NULL; u64 time = 0; - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + /* * If we're busy, the assumption that the last idle period * predicts the future is flawed; age away the remaining @@ -6455,7 +6462,7 @@ static int 
select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && !has_idle_core) { + if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { time = cpu_clock(this) - time; /* @@ -6506,7 +6513,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) static inline bool asym_fits_capacity(unsigned long task_util, int cpu) { - if (static_branch_unlikely(&sched_asym_cpucapacity)) + if (sched_asym_cpucap_active()) return fits_capacity(task_util, capacity_of(cpu)); return true; @@ -6526,7 +6533,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * On asymmetric system, update task utilization because we will check * that the task fits with cpu's capacity. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); task_util = uclamp_task_util(p); } @@ -6580,7 +6587,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * For asymmetric CPU capacity systems, our domain of interest is * sd_asym_cpucapacity rather than sd_llc. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive @@ -6601,10 +6608,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; if (sched_smt_active()) { - has_idle_core = test_idle_cores(target, false); + has_idle_core = test_idle_cores(target); if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, sd, prev); + i = select_idle_smt(p, prev); if ((unsigned int)i < nr_cpumask_bits) return i; } @@ -7076,8 +7083,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) return new_cpu; } -static void detach_entity_cfs_rq(struct sched_entity *se); - /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -7099,15 +7104,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); } - if (p->on_rq == TASK_ON_RQ_MIGRATING) { - /* - * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' - * rq->lock and can modify state directly. - */ - lockdep_assert_rq_held(task_rq(p)); - detach_entity_cfs_rq(se); - - } else { + if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); /* @@ -7279,7 +7276,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); - BUG_ON(!pse); + WARN_ON_ONCE(!pse); cse_is_idle = se_is_idle(se); pse_is_idle = se_is_idle(pse); @@ -7938,7 +7935,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->src_rq, p)) { + if (task_on_cpu(env->src_rq, p)) { schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -8012,8 +8009,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) return NULL; } -static const unsigned int sched_nr_migrate_break = 32; - /* * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". 
@@ -8049,20 +8044,24 @@ static int detach_tasks(struct lb_env *env) if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; - p = list_last_entry(tasks, struct task_struct, se.group_node); - env->loop++; - /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) + /* + * We've more or less seen every task there is, call it quits + * unless we haven't found any movable task yet. + */ + if (env->loop > env->loop_max && + !(env->flags & LBF_ALL_PINNED)) break; /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sched_nr_migrate_break; + env->loop_break += SCHED_NR_MIGRATE_BREAK; env->flags |= LBF_NEED_BREAK; break; } + p = list_last_entry(tasks, struct task_struct, se.group_node); + if (!can_migrate_task(p, env)) goto next; @@ -8159,7 +8158,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) { lockdep_assert_rq_held(rq); - BUG_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); check_preempt_curr(rq, p, 0); } @@ -10099,14 +10098,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); - struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, .dst_grpmask = sched_group_span(sd->groups), .idle = idle, - .loop_break = sched_nr_migrate_break, + .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), @@ -10134,7 +10132,7 @@ redo: goto out_balanced; } - BUG_ON(busiest == env.dst_rq); + WARN_ON_ONCE(busiest == env.dst_rq); schedstat_add(sd->lb_imbalance[idle], env.imbalance); @@ -10182,7 +10180,9 @@ more_balance: if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; - goto more_balance; + /* Stop if we tried all running tasks */ + if (env.loop < busiest->nr_running) + goto more_balance; } /* @@ -10213,7 +10213,7 @@ more_balance: env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; /* * Go back to "more_balance" rather than "redo" since we @@ -10245,7 +10245,7 @@ more_balance: */ if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; goto redo; } goto out_all_pinned; @@ -10430,7 +10430,7 @@ static int active_load_balance_cpu_stop(void *data) * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-CPU setup. */ - BUG_ON(busiest_rq == target_rq); + WARN_ON_ONCE(busiest_rq == target_rq); /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); @@ -10916,8 +10916,7 @@ static bool update_nohz_stats(struct rq *rq) * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. */ -static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, - enum cpu_idle_type idle) +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) { /* Earliest time when we have to do rebalance again */ unsigned long now = jiffies; @@ -11032,7 +11031,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (idle != CPU_IDLE) return false; - _nohz_idle_balance(this_rq, flags, idle); + _nohz_idle_balance(this_rq, flags); return true; } @@ -11052,7 +11051,7 @@ void nohz_run_idle_balance(int cpu) * (ie NOHZ_STATS_KICK set) and will do the same. 
*/ if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) - _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE); + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); } static void nohz_newidle_balance(struct rq *this_rq) @@ -11552,6 +11551,17 @@ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); +#ifdef CONFIG_SMP + /* + * In case the task sched_avg hasn't been attached: + * - A forked task which hasn't been woken up by wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() but is + * waiting for actually being woken up by sched_ttwu_pending(). + */ + if (!se->avg.last_update_time) + return; +#endif + /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); @@ -11563,14 +11573,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); @@ -11666,39 +11668,25 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_set_group_fair(struct task_struct *p) +static void task_change_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - - set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; -} + /* + * We couldn't detach or attach a forked task which + * hasn't been woken up by wake_up_new_task(). + */ + if (READ_ONCE(p->__state) == TASK_NEW) + return; -static void task_move_group_fair(struct task_struct *p) -{ detach_task_cfs_rq(p); - set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif + set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } -static void task_change_group_fair(struct task_struct *p, int type) -{ - switch (type) { - case TASK_SET_GROUP: - task_set_group_fair(p); - break; - - case TASK_MOVE_GROUP: - task_move_group_fair(p); - break; - } -} - void free_fair_sched_group(struct task_group *tg) { int i; @@ -12075,6 +12063,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { #ifdef CONFIG_SMP + int i; + + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + } + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 55f39c8f4203..d869bcf898cc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -509,7 +509,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) unsigned int cpu_cap; /* Only heterogeneous systems can benefit from this check */ - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return true; min_cap = uclamp_eff_value(p, UCLAMP_MIN); @@ -843,7 +843,7 @@ static void __disable_runtime(struct rq *rq) * We cannot be left wanting - that would mean some runtime * leaked out of the system. 
*/ - BUG_ON(want); + WARN_ON_ONCE(want); balanced: /* * Disable all the borrow logic by pretending we have inf @@ -1062,11 +1062,7 @@ static void update_curr_rt(struct rq *rq) trace_sched_stat_runtime(curr, delta_exec, 0); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); if (!rt_bandwidth_enabled()) return; @@ -1849,7 +1845,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; @@ -1897,7 +1893,7 @@ static int find_lowest_rq(struct task_struct *task) * If we're on asym system ensure we consider the different capacities * of the CPUs when searching for the lowest_mask. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, task, lowest_mask, @@ -2004,7 +2000,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) */ if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_running(rq, task) || + task_on_cpu(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { @@ -2462,7 +2458,7 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - bool need_to_push = !task_running(rq, p) && + bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e26688d387ae..1fc198be1ffd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -321,21 +321,6 @@ struct dl_bw { u64 total_bw; }; -/* - * Verify the fitness of task @p to run on @cpu taking into account the - * CPU original capacity and the runtime/deadline ratio of the task. - * - * The function will return true if the CPU original capacity of the - * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the - * task and false otherwise. - */ -static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) -{ - unsigned long cap = arch_scale_cpu_capacity(cpu); - - return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; -} - extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); @@ -1815,6 +1800,11 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +static __always_inline bool sched_asym_cpucap_active(void) +{ + return static_branch_unlikely(&sched_asym_cpucapacity); +} + struct sched_group_capacity { atomic_t ref; /* @@ -1942,6 +1932,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; + p->se.depth = tg->se[cpu] ? 
tg->se[cpu]->depth + 1 : 0; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -2060,7 +2051,7 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -static inline int task_running(struct rq *rq, struct task_struct *p) +static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP return p->on_cpu; @@ -2204,11 +2195,8 @@ struct sched_class { void (*update_curr)(struct rq *rq); -#define TASK_SET_GROUP 0 -#define TASK_MOVE_GROUP 1 - #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_change_group)(struct task_struct *p, int type); + void (*task_change_group)(struct task_struct *p); #endif }; @@ -2435,6 +2423,12 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +#ifdef CONFIG_PREEMPT_RT +#define SCHED_NR_MIGRATE_BREAK 8 +#else +#define SCHED_NR_MIGRATE_BREAK 32 +#endif + extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; @@ -2709,8 +2703,8 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { - BUG_ON(!irqs_disabled()); - BUG_ON(rq1 != rq2); + WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE(rq1 != rq2); raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ double_rq_clock_clear_update(rq1, rq2); @@ -2726,7 +2720,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - BUG_ON(rq1 != rq2); + WARN_ON_ONCE(rq1 != rq2); raw_spin_rq_unlock(rq1); __release(rq2->lock); } @@ -2896,6 +2890,21 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, enum cpu_util_type type, struct task_struct *p); +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the original capacity of @cpu is + * greater than or equal to task's deadline density right shifted by + * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise. 
+ */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT); +} + static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -3157,4 +3166,14 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +static inline void update_current_exec_runtime(struct task_struct *curr, + u64 now, u64 delta_exec) +{ + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = now; + cgroup_account_cputime(curr, delta_exec); +} + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d04073a93eb4..85590599b4d6 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -71,20 +71,17 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *curr = rq->curr; - u64 delta_exec; + u64 now, delta_exec; - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; schedstat_set(curr->stats.exec_max, max(curr->stats.exec_max, delta_exec)); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = rq_clock_task(rq); - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); } /* diff --git a/kernel/signal.c b/kernel/signal.c index 8a0f114d00e0..d140672185a4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2305,7 +2305,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, read_unlock(&tasklist_lock); cgroup_enter_frozen(); preempt_enable_no_resched(); - freezable_schedule(); + schedule(); cgroup_leave_frozen(true); /* @@ -2474,7 +2474,7 @@ static bool do_signal_stop(int signr) /* Now we don't run again until woken by SIGCONT or SIGKILL */ cgroup_enter_frozen(); - freezable_schedule(); + schedule(); return true; } else { /* @@ -2549,11 +2549,11 @@ static void do_freezer_trap(void) * immediately (if there is a non-fatal signal pending), and * put the task into sleep. 
*/ - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); clear_thread_flag(TIF_SIGPENDING); spin_unlock_irq(¤t->sighand->siglock); cgroup_enter_frozen(); - freezable_schedule(); + schedule(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) @@ -3601,9 +3601,9 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - __set_current_state(TASK_INTERRUPTIBLE); - ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, - HRTIMER_MODE_REL); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, + HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 23af5eca11b1..3ae661ab6260 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2037,11 +2037,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod struct restart_block *restart; do { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) - freezable_schedule(); + schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; diff --git a/kernel/umh.c b/kernel/umh.c index b989736e8707..850631518665 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -28,6 +28,7 @@ #include <linux/async.h> #include <linux/uaccess.h> #include <linux/initrd.h> +#include <linux/freezer.h> #include <trace/events/module.h> @@ -403,6 +404,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { + unsigned int state = TASK_UNINTERRUPTIBLE; DECLARE_COMPLETION_ONSTACK(done); int retval = 0; @@ -436,18 +438,22 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; - if (wait & UMH_KILLABLE) { - retval = wait_for_completion_killable(&done); - if (!retval) - goto wait_done; + if (wait & UMH_KILLABLE) + state |= TASK_KILLABLE; + + if (wait & UMH_FREEZABLE) + state |= TASK_FREEZABLE; + retval = wait_for_completion_state(&done, state); + if (!retval) + goto wait_done; + + if (wait & UMH_KILLABLE) { /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; - /* fallthrough, umh_complete() was already called */ } - wait_for_completion(&done); wait_done: retval = sub_info->retval; out: |