diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/completion.c | 26 | ||||
-rw-r--r-- | kernel/sched/core.c | 501 | ||||
-rw-r--r-- | kernel/sched/debug.c | 49 | ||||
-rw-r--r-- | kernel/sched/fair.c | 1343 | ||||
-rw-r--r-- | kernel/sched/features.h | 24 | ||||
-rw-r--r-- | kernel/sched/psi.c | 2 | ||||
-rw-r--r-- | kernel/sched/rt.c | 5 | ||||
-rw-r--r-- | kernel/sched/sched.h | 70 | ||||
-rw-r--r-- | kernel/sched/swait.c | 8 | ||||
-rw-r--r-- | kernel/sched/topology.c | 15 | ||||
-rw-r--r-- | kernel/sched/wait.c | 5 |
11 files changed, 1133 insertions, 915 deletions
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index d57a5c1c1cd9..3561ab533dd4 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,23 @@ * Waiting for completion is a typically sync point, but not an exclusion point. */ +static void complete_with_flags(struct completion *x, int wake_flags) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&x->wait.lock, flags); + + if (x->done != UINT_MAX) + x->done++; + swake_up_locked(&x->wait, wake_flags); + raw_spin_unlock_irqrestore(&x->wait.lock, flags); +} + +void complete_on_current_cpu(struct completion *x) +{ + return complete_with_flags(x, WF_CURRENT_CPU); +} + /** * complete: - signals a single thread waiting on this completion * @x: holds the state of this particular completion @@ -27,14 +44,7 @@ */ void complete(struct completion *x) { - unsigned long flags; - - raw_spin_lock_irqsave(&x->wait.lock, flags); - - if (x->done != UINT_MAX) - x->done++; - swake_up_locked(&x->wait); - raw_spin_unlock_irqrestore(&x->wait.lock, flags); + complete_with_flags(x, 0); } EXPORT_SYMBOL(complete); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c52c2eba7c73..2299a5cfbfb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1097,25 +1097,22 @@ int get_nohz_timer_target(void) hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); - rcu_read_lock(); + guard(rcu)(); + for_each_domain(cpu, sd) { for_each_cpu_and(i, sched_domain_span(sd), hk_mask) { if (cpu == i) continue; - if (!idle_cpu(i)) { - cpu = i; - goto unlock; - } + if (!idle_cpu(i)) + return i; } } if (default_cpu == -1) default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); - cpu = default_cpu; -unlock: - rcu_read_unlock(); - return cpu; + + return default_cpu; } /* @@ -1194,6 +1191,20 @@ static void nohz_csd_func(void *info) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL +static inline bool __need_bw_check(struct rq *rq, struct task_struct *p) +{ + if (rq->nr_running != 1) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + if (!task_on_rq_queued(p)) + return false; + + return true; +} + bool sched_can_stop_tick(struct rq *rq) { int fifo_nr_running; @@ -1229,6 +1240,18 @@ bool sched_can_stop_tick(struct rq *rq) if (rq->nr_running > 1) return false; + /* + * If there is one task and it has CFS runtime bandwidth constraints + * and it's on the cpu now we don't want to stop the tick. + * This check prevents clearing the bit if a newly enqueued task here is + * dequeued by migrating while the constrained task continues to run. + * E.g. going from 2->1 without going through pick_next_task(). + */ + if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { + if (cfs_task_bw_constrained(rq->curr)) + return false; + } + return true; } #endif /* CONFIG_NO_HZ_FULL */ @@ -1804,7 +1827,8 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, int old_min, old_max, old_min_rt; int result; - mutex_lock(&uclamp_mutex); + guard(mutex)(&uclamp_mutex); + old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; old_min_rt = sysctl_sched_uclamp_util_min_rt_default; @@ -1813,7 +1837,7 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, if (result) goto undo; if (!write) - goto done; + return 0; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || @@ -1849,16 +1873,12 @@ static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, * Otherwise, keep it simple and do just a lazy update at each next * task enqueue time. */ - - goto done; + return 0; undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; sysctl_sched_uclamp_util_min_rt_default = old_min_rt; -done: - mutex_unlock(&uclamp_mutex); - return result; } #endif @@ -3413,7 +3433,6 @@ static int migrate_swap_stop(void *data) { struct migration_swap_arg *arg = data; struct rq *src_rq, *dst_rq; - int ret = -EAGAIN; if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) return -EAGAIN; @@ -3421,33 +3440,25 @@ static int migrate_swap_stop(void *data) src_rq = cpu_rq(arg->src_cpu); dst_rq = cpu_rq(arg->dst_cpu); - double_raw_lock(&arg->src_task->pi_lock, - &arg->dst_task->pi_lock); - double_rq_lock(src_rq, dst_rq); + guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock); + guard(double_rq_lock)(src_rq, dst_rq); if (task_cpu(arg->dst_task) != arg->dst_cpu) - goto unlock; + return -EAGAIN; if (task_cpu(arg->src_task) != arg->src_cpu) - goto unlock; + return -EAGAIN; if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) - goto unlock; + return -EAGAIN; if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) - goto unlock; + return -EAGAIN; __migrate_swap_task(arg->src_task, arg->dst_cpu); __migrate_swap_task(arg->dst_task, arg->src_cpu); - ret = 0; - -unlock: - double_rq_unlock(src_rq, dst_rq); - raw_spin_unlock(&arg->dst_task->pi_lock); - raw_spin_unlock(&arg->src_task->pi_lock); - - return ret; + return 0; } /* @@ -3722,14 +3733,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) struct sched_domain *sd; __schedstat_inc(p->stats.nr_wakeups_remote); - rcu_read_lock(); + + guard(rcu)(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { __schedstat_inc(sd->ttwu_wake_remote); break; } } - rcu_read_unlock(); } if (wake_flags & WF_MIGRATED) @@ -3928,21 +3939,13 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags void wake_up_if_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; - - rcu_read_lock(); - if (!is_idle_task(rcu_dereference(rq->curr))) - goto out; - - rq_lock_irqsave(rq, &rf); - if (is_idle_task(rq->curr)) - resched_curr(rq); - /* Else CPU is not idle, do nothing here: */ - rq_unlock_irqrestore(rq, &rf); - -out: - rcu_read_unlock(); + guard(rcu)(); + if (is_idle_task(rcu_dereference(rq->curr))) { + guard(rq_lock_irqsave)(rq); + if (is_idle_task(rq->curr)) + resched_curr(rq); + } } bool cpus_share_cache(int this_cpu, int that_cpu) @@ -4193,13 +4196,11 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) * Return: %true if @p->state changes (an actual wakeup was done), * %false otherwise. */ -static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - unsigned long flags; + guard(preempt)(); int cpu, success = 0; - preempt_disable(); if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -4226,129 +4227,127 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * reordered with p->state check below. This pairs with smp_store_mb() * in set_current_state() that the waiting thread does. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - smp_mb__after_spinlock(); - if (!ttwu_state_match(p, state, &success)) - goto unlock; + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + smp_mb__after_spinlock(); + if (!ttwu_state_match(p, state, &success)) + break; - trace_sched_waking(p); + trace_sched_waking(p); - /* - * Ensure we load p->on_rq _after_ p->state, otherwise it would - * be possible to, falsely, observe p->on_rq == 0 and get stuck - * in smp_cond_load_acquire() below. - * - * sched_ttwu_pending() try_to_wake_up() - * STORE p->on_rq = 1 LOAD p->state - * UNLOCK rq->lock - * - * __schedule() (switch to task 'p') - * LOCK rq->lock smp_rmb(); - * smp_mb__after_spinlock(); - * UNLOCK rq->lock - * - * [task p] - * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq - * - * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in - * __schedule(). See the comment for smp_mb__after_spinlock(). - * - * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). - */ - smp_rmb(); - if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) - goto unlock; + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * STORE p->on_rq = 1 LOAD p->state + * UNLOCK rq->lock + * + * __schedule() (switch to task 'p') + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * UNLOCK rq->lock + * + * [task p] + * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). + */ + smp_rmb(); + if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) + break; #ifdef CONFIG_SMP - /* - * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be - * possible to, falsely, observe p->on_cpu == 0. - * - * One must be running (->on_cpu == 1) in order to remove oneself - * from the runqueue. - * - * __schedule() (switch to task 'p') try_to_wake_up() - * STORE p->on_cpu = 1 LOAD p->on_rq - * UNLOCK rq->lock - * - * __schedule() (put 'p' to sleep) - * LOCK rq->lock smp_rmb(); - * smp_mb__after_spinlock(); - * STORE p->on_rq = 0 LOAD p->on_cpu - * - * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in - * __schedule(). See the comment for smp_mb__after_spinlock(). - * - * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer - * care about it's own p->state. See the comment in __schedule(). - */ - smp_acquire__after_ctrl_dep(); + /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * __schedule() (switch to task 'p') try_to_wake_up() + * STORE p->on_cpu = 1 LOAD p->on_rq + * UNLOCK rq->lock + * + * __schedule() (put 'p' to sleep) + * LOCK rq->lock smp_rmb(); + * smp_mb__after_spinlock(); + * STORE p->on_rq = 0 LOAD p->on_cpu + * + * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in + * __schedule(). See the comment for smp_mb__after_spinlock(). + * + * Form a control-dep-acquire with p->on_rq == 0 above, to ensure + * schedule()'s deactivate_task() has 'happened' and p will no longer + * care about it's own p->state. See the comment in __schedule(). + */ + smp_acquire__after_ctrl_dep(); - /* - * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq - * == 0), which means we need to do an enqueue, change p->state to - * TASK_WAKING such that we can unlock p->pi_lock before doing the - * enqueue, such as ttwu_queue_wakelist(). - */ - WRITE_ONCE(p->__state, TASK_WAKING); + /* + * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq + * == 0), which means we need to do an enqueue, change p->state to + * TASK_WAKING such that we can unlock p->pi_lock before doing the + * enqueue, such as ttwu_queue_wakelist(). + */ + WRITE_ONCE(p->__state, TASK_WAKING); - /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, considering queueing p on the remote CPUs wake_list - * which potentially sends an IPI instead of spinning on p->on_cpu to - * let the waker make forward progress. This is safe because IRQs are - * disabled and the IPI will deliver after on_cpu is cleared. - * - * Ensure we load task_cpu(p) after p->on_cpu: - * - * set_task_cpu(p, cpu); - * STORE p->cpu = @cpu - * __schedule() (switch to task 'p') - * LOCK rq->lock - * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) - * STORE p->on_cpu = 1 LOAD p->cpu - * - * to ensure we observe the correct CPU on which the task is currently - * scheduling. - */ - if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) - goto unlock; + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, considering queueing p on the remote CPUs wake_list + * which potentially sends an IPI instead of spinning on p->on_cpu to + * let the waker make forward progress. This is safe because IRQs are + * disabled and the IPI will deliver after on_cpu is cleared. + * + * Ensure we load task_cpu(p) after p->on_cpu: + * + * set_task_cpu(p, cpu); + * STORE p->cpu = @cpu + * __schedule() (switch to task 'p') + * LOCK rq->lock + * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) + * STORE p->on_cpu = 1 LOAD p->cpu + * + * to ensure we observe the correct CPU on which the task is currently + * scheduling. + */ + if (smp_load_acquire(&p->on_cpu) && + ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) + break; - /* - * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, wait until it's done referencing the task. - * - * Pairs with the smp_store_release() in finish_task(). - * - * This ensures that tasks getting woken will be fully ordered against - * their previous state and preserve Program Order. - */ - smp_cond_load_acquire(&p->on_cpu, !VAL); + /* + * If the owning (remote) CPU is still in the middle of schedule() with + * this task as prev, wait until it's done referencing the task. + * + * Pairs with the smp_store_release() in finish_task(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. + */ + smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); - if (task_cpu(p) != cpu) { - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } + cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); + if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } - wake_flags |= WF_MIGRATED; - psi_ttwu_dequeue(p); - set_task_cpu(p, cpu); - } + wake_flags |= WF_MIGRATED; + psi_ttwu_dequeue(p); + set_task_cpu(p, cpu); + } #else - cpu = task_cpu(p); + cpu = task_cpu(p); #endif /* CONFIG_SMP */ - ttwu_queue(p, cpu, wake_flags); -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + ttwu_queue(p, cpu, wake_flags); + } out: if (success) ttwu_stat(p, task_cpu(p), wake_flags); - preempt_enable(); return success; } @@ -4501,6 +4500,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + p->se.vlag = 0; + p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -5496,23 +5497,20 @@ unsigned int nr_iowait(void) void sched_exec(void) { struct task_struct *p = current; - unsigned long flags; + struct migration_arg arg; int dest_cpu; - raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); - if (dest_cpu == smp_processor_id()) - goto unlock; + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); + if (dest_cpu == smp_processor_id()) + return; - if (likely(cpu_active(dest_cpu))) { - struct migration_arg arg = { p, dest_cpu }; + if (unlikely(!cpu_active(dest_cpu))) + return; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); - return; + arg = (struct migration_arg){ p, dest_cpu }; } -unlock: - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } #endif @@ -5722,9 +5720,6 @@ static void sched_tick_remote(struct work_struct *work) struct tick_work *twork = container_of(dwork, struct tick_work, work); int cpu = twork->cpu; struct rq *rq = cpu_rq(cpu); - struct task_struct *curr; - struct rq_flags rf; - u64 delta; int os; /* @@ -5734,30 +5729,26 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (!tick_nohz_tick_stopped_cpu(cpu)) - goto out_requeue; + if (tick_nohz_tick_stopped_cpu(cpu)) { + guard(rq_lock_irq)(rq); + struct task_struct *curr = rq->curr; - rq_lock_irq(rq, &rf); - curr = rq->curr; - if (cpu_is_offline(cpu)) - goto out_unlock; + if (cpu_online(cpu)) { + update_rq_clock(rq); - update_rq_clock(rq); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a + * reasonable amount of time. + */ + u64 delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } + curr->sched_class->task_tick(rq, curr, 0); - if (!is_idle_task(curr)) { - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - delta = rq_clock_task(rq) - curr->se.exec_start; - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + calc_load_nohz_remote(rq); + } } - curr->sched_class->task_tick(rq, curr, 0); - - calc_load_nohz_remote(rq); -out_unlock: - rq_unlock_irq(rq, &rf); -out_requeue: /* * Run the remote tick once per second (1Hz). This arbitrary @@ -6306,19 +6297,19 @@ static bool try_steal_cookie(int this, int that) unsigned long cookie; bool success = false; - local_irq_disable(); - double_rq_lock(dst, src); + guard(irq)(); + guard(double_rq_lock)(dst, src); cookie = dst->core->core_cookie; if (!cookie) - goto unlock; + return false; if (dst->curr != dst->idle) - goto unlock; + return false; p = sched_core_find(src, cookie); if (!p) - goto unlock; + return false; do { if (p == src->core_pick || p == src->curr) @@ -6330,9 +6321,10 @@ static bool try_steal_cookie(int this, int that) if (p->core_occupation > dst->idle->core_occupation) goto next; /* - * sched_core_find() and sched_core_next() will ensure that task @p - * is not throttled now, we also need to check whether the runqueue - * of the destination CPU is being throttled. + * sched_core_find() and sched_core_next() will ensure + * that task @p is not throttled now, we also need to + * check whether the runqueue of the destination CPU is + * being throttled. */ if (sched_task_is_throttled(p, this)) goto next; @@ -6350,10 +6342,6 @@ next: p = sched_core_next(p, cookie); } while (p); -unlock: - double_rq_unlock(dst, src); - local_irq_enable(); - return success; } @@ -6411,20 +6399,24 @@ static void queue_core_balance(struct rq *rq) queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance); } +DEFINE_LOCK_GUARD_1(core_lock, int, + sched_core_lock(*_T->lock, &_T->flags), + sched_core_unlock(*_T->lock, &_T->flags), + unsigned long flags) + static void sched_core_cpu_starting(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; - unsigned long flags; int t; - sched_core_lock(cpu, &flags); + guard(core_lock)(&cpu); WARN_ON_ONCE(rq->core != rq); /* if we're the first, we'll be our own leader */ if (cpumask_weight(smt_mask) == 1) - goto unlock; + return; /* find the leader */ for_each_cpu(t, smt_mask) { @@ -6438,7 +6430,7 @@ static void sched_core_cpu_starting(unsigned int cpu) } if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ - goto unlock; + return; /* install and validate core_rq */ for_each_cpu(t, smt_mask) { @@ -6449,29 +6441,25 @@ static void sched_core_cpu_starting(unsigned int cpu) WARN_ON_ONCE(rq->core != core_rq); } - -unlock: - sched_core_unlock(cpu, &flags); } static void sched_core_cpu_deactivate(unsigned int cpu) { const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; - unsigned long flags; int t; - sched_core_lock(cpu, &flags); + guard(core_lock)(&cpu); /* if we're the last man standing, nothing to do */ if (cpumask_weight(smt_mask) == 1) { WARN_ON_ONCE(rq->core != rq); - goto unlock; + return; } /* if we're not the leader, nothing to do */ if (rq->core != rq) - goto unlock; + return; /* find a new leader */ for_each_cpu(t, smt_mask) { @@ -6482,7 +6470,7 @@ static void sched_core_cpu_deactivate(unsigned int cpu) } if (WARN_ON_ONCE(!core_rq)) /* impossible */ - goto unlock; + return; /* copy the shared state to the new leader */ core_rq->core_task_seq = rq->core_task_seq; @@ -6504,9 +6492,6 @@ static void sched_core_cpu_deactivate(unsigned int cpu) rq = cpu_rq(t); rq->core = core_rq; } - -unlock: - sched_core_unlock(cpu, &flags); } static inline void sched_core_cpu_dying(unsigned int cpu) @@ -7030,7 +7015,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); + WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU)); return try_to_wake_up(curr->private, mode, wake_flags); } EXPORT_SYMBOL(default_wake_function); @@ -7383,6 +7368,19 @@ struct task_struct *idle_task(int cpu) return cpu_rq(cpu)->idle; } +#ifdef CONFIG_SCHED_CORE +int sched_core_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (sched_core_enabled(rq) && rq->curr == rq->idle) + return 1; + + return idle_cpu(cpu); +} + +#endif + #ifdef CONFIG_SMP /* * This function computes an effective utilization for the given CPU, to be @@ -9940,7 +9938,7 @@ void __init sched_init(void) ptr += nr_cpu_ids * sizeof(void **); root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth); + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_entity **)ptr; @@ -11074,11 +11072,16 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) /* * Ensure max(child_quota) <= parent_quota. On cgroup2, - * always take the min. On cgroup1, only inherit when no - * limit is set: + * always take the non-RUNTIME_INF min. On cgroup1, only + * inherit when no limit is set. In both cases this is used + * by the scheduler to determine if a given CFS task has a + * bandwidth constraint at some higher level. */ if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { - quota = min(quota, parent_quota); + if (quota == RUNTIME_INF) + quota = parent_quota; + else if (parent_quota != RUNTIME_INF) + quota = min(quota, parent_quota); } else { if (quota == RUNTIME_INF) quota = parent_quota; @@ -11139,6 +11142,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) return 0; } + +static u64 throttled_time_self(struct task_group *tg) +{ + int i; + u64 total = 0; + + for_each_possible_cpu(i) { + total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); + } + + return total; +} + +static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg)); + + return 0; +} #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -11215,6 +11239,10 @@ static struct cftype cpu_legacy_files[] = { .name = "stat", .seq_show = cpu_cfs_stat_show, }, + { + .name = "stat.local", + .seq_show = cpu_cfs_local_stat_show, + }, #endif #ifdef CONFIG_RT_GROUP_SCHED { @@ -11271,6 +11299,24 @@ static int cpu_extra_stat_show(struct seq_file *sf, return 0; } +static int cpu_local_stat_show(struct seq_file *sf, + struct cgroup_subsys_state *css) +{ +#ifdef CONFIG_CFS_BANDWIDTH + { + struct task_group *tg = css_tg(css); + u64 throttled_self_usec; + + throttled_self_usec = throttled_time_self(tg); + do_div(throttled_self_usec, NSEC_PER_USEC); + + seq_printf(sf, "throttled_usec %llu\n", + throttled_self_usec); + } +#endif + return 0; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) @@ -11449,6 +11495,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, + .css_local_stat_show = cpu_local_stat_show, #ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, #endif diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 066ff1c8ae4e..4c3d0d9f3db6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -347,10 +347,7 @@ static __init int sched_init_debug(void) debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif - debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency); - debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity); - debugfs_create_u32("idle_min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_idle_min_granularity); - debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); @@ -427,6 +424,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) #undef SDM debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); + debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); } void update_sched_domain_debugfs(void) @@ -581,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); - SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", + SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), + entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', + SPLIT_NS(p->se.deadline), + SPLIT_NS(p->se.slice), + SPLIT_NS(p->se.sum_exec_runtime), (long long)(p->nvcsw + p->nivcsw), p->prio); @@ -626,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, - spread, rq0_min_vruntime, spread0; + s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread; + struct sched_entity *last, *first; struct rq *rq = cpu_rq(cpu); - struct sched_entity *last; unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -643,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); - if (rb_first_cached(&cfs_rq->tasks_timeline)) - MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; + first = __pick_first_entity(cfs_rq); + if (first) + left_vruntime = first->vruntime; last = __pick_last_entity(cfs_rq); if (last) - max_vruntime = last->vruntime; + right_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; - rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", - SPLIT_NS(MIN_vruntime)); + + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", + SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", SPLIT_NS(min_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", - SPLIT_NS(max_vruntime)); - spread = max_vruntime - MIN_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", - SPLIT_NS(spread)); - spread0 = min_vruntime - rq0_min_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", - SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", + SPLIT_NS(avg_vruntime(cfs_rq))); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", + SPLIT_NS(right_vruntime)); + spread = right_vruntime - left_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); @@ -863,10 +863,7 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) - PN(sysctl_sched_latency); - PN(sysctl_sched_min_granularity); - PN(sysctl_sched_idle_min_granularity); - PN(sysctl_sched_wakeup_granularity); + PN(sysctl_sched_base_slice); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 695f8e5c9e77..911d0063763c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ #include <linux/psi.h> #include <linux/ratelimit.h> #include <linux/task_work.h> +#include <linux/rbtree_augmented.h> #include <asm/switch_to.h> @@ -57,22 +58,6 @@ #include "autogroup.h" /* - * Targeted preemption latency for CPU-bound tasks: - * - * NOTE: this latency value is not the same as the concept of - * 'timeslice length' - timeslices in CFS are of variable length - * and have no persistent notion like in traditional, time-slice - * based scheduling concepts. - * - * (to see the precise effective timeslice length of your workload, - * run vmstat and monitor the context-switches (cs) field) - * - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_latency = 6000000ULL; -static unsigned int normalized_sysctl_sched_latency = 6000000ULL; - -/* * The initial- and re-scaling of tunables is configurable * * Options are: @@ -90,21 +75,8 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 750000ULL; -static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; - -/* - * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. - * Applies only when SCHED_IDLE tasks compete with normal tasks. - * - * (default: 0.75 msec) - */ -unsigned int sysctl_sched_idle_min_granularity = 750000ULL; - -/* - * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity - */ -static unsigned int sched_nr_latency = 8; +unsigned int sysctl_sched_base_slice = 750000ULL; +static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; /* * After fork, child runs first. If set to 0 (default) then @@ -112,18 +84,6 @@ static unsigned int sched_nr_latency = 8; */ unsigned int sysctl_sched_child_runs_first __read_mostly; -/* - * SCHED_OTHER wake-up granularity. - * - * This option delays the preemption effects of decoupled workloads - * and reduces their over-scheduling. Synchronous workloads will still - * have immediate wakeup/sleep latencies. - * - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -unsigned int sysctl_sched_wakeup_granularity = 1000000UL; -static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; int sched_thermal_decay_shift; @@ -277,9 +237,7 @@ static void update_sysctl(void) #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } @@ -347,6 +305,16 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight return mul_u64_u32_shr(delta_exec, fact, shift); } +/* + * delta /= w + */ +static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) +{ + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = __calc_delta(delta, NICE_0_LOAD, &se->load); + + return delta; +} const struct sched_class fair_sched_class; @@ -601,13 +569,198 @@ static inline bool entity_before(const struct sched_entity *a, return (s64)(a->vruntime - b->vruntime) < 0; } +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return (s64)(se->vruntime - cfs_rq->min_vruntime); +} + #define __node_2_se(node) \ rb_entry((node), struct sched_entity, run_node) +/* + * Compute virtual time from the per-task service numbers: + * + * Fair schedulers conserve lag: + * + * \Sum lag_i = 0 + * + * Where lag_i is given by: + * + * lag_i = S - s_i = w_i * (V - v_i) + * + * Where S is the ideal service time and V is it's virtual time counterpart. + * Therefore: + * + * \Sum lag_i = 0 + * \Sum w_i * (V - v_i) = 0 + * \Sum w_i * V - w_i * v_i = 0 + * + * From which we can solve an expression for V in v_i (which we have in + * se->vruntime): + * + * \Sum v_i * w_i \Sum v_i * w_i + * V = -------------- = -------------- + * \Sum w_i W + * + * Specifically, this is the weighted average of all entity virtual runtimes. + * + * [[ NOTE: this is only equal to the ideal scheduler under the condition + * that join/leave operations happen at lag_i = 0, otherwise the + * virtual time has non-continguous motion equivalent to: + * + * V +-= lag_i / W + * + * Also see the comment in place_entity() that deals with this. ]] + * + * However, since v_i is u64, and the multiplcation could easily overflow + * transform it into a relative form that uses smaller quantities: + * + * Substitute: v_i == (v_i - v0) + v0 + * + * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i + * V = ---------------------------- = --------------------- + v0 + * W W + * + * Which we track using: + * + * v0 := cfs_rq->min_vruntime + * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime + * \Sum w_i := cfs_rq->avg_load + * + * Since min_vruntime is a monotonic increasing variable that closely tracks + * the per-task service, these deltas: (v_i - v), will be in the order of the + * maximal (virtual) lag induced in the system due to quantisation. + * + * Also, we use scale_load_down() to reduce the size. + * + * As measured, the max (key * weight) value was ~44 bits for a kernel build. + */ +static void +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime += key * weight; + cfs_rq->avg_load += weight; +} + +static void +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = scale_load_down(se->load.weight); + s64 key = entity_key(cfs_rq, se); + + cfs_rq->avg_vruntime -= key * weight; + cfs_rq->avg_load -= weight; +} + +static inline +void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +{ + /* + * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load + */ + cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; +} + +u64 avg_vruntime(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { + unsigned long weight = scale_load_down(curr->load.weight); + + avg += entity_key(cfs_rq, curr) * weight; + load += weight; + } + + if (load) + avg = div_s64(avg, load); + + return cfs_rq->min_vruntime + avg; +} + +/* + * lag_i = S - s_i = w_i * (V - v_i) + * + * However, since V is approximated by the weighted average of all entities it + * is possible -- by addition/removal/reweight to the tree -- to move V around + * and end up with a larger lag than we started with. + * + * Limit this to either double the slice length with a minimum of TICK_NSEC + * since that is the timing granularity. + * + * EEVDF gives the following limit for a steady state system: + * + * -r_max < lag < max(r_max, q) + * + * XXX could add max_slice to the augmented data to track this. + */ +void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 lag, limit; + + SCHED_WARN_ON(!se->on_rq); + lag = avg_vruntime(cfs_rq) - se->vruntime; + + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); + se->vlag = clamp(lag, -limit, limit); +} + +/* + * Entity is eligible once it received less service than it ought to have, + * eg. lag >= 0. + * + * lag_i = S - s_i = w_i*(V - v_i) + * + * lag_i >= 0 -> V >= v_i + * + * \Sum (v_i - v)*w_i + * V = ------------------ + v + * \Sum w_i + * + * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) + * + * Note: using 'avg_vruntime() > se->vruntime' is inacurate due + * to the loss in precision caused by the division. + */ +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 avg = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; + + if (curr && curr->on_rq) { + unsigned long weight = scale_load_down(curr->load.weight); + + avg += entity_key(cfs_rq, curr) * weight; + load += weight; + } + + return avg >= entity_key(cfs_rq, se) * load; +} + +static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +{ + u64 min_vruntime = cfs_rq->min_vruntime; + /* + * open coded max_vruntime() to allow updating avg_vruntime + */ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) { + avg_vruntime_update(cfs_rq, delta); + min_vruntime = vruntime; + } + return min_vruntime; +} + static void update_min_vruntime(struct cfs_rq *cfs_rq) { + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; - struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); u64 vruntime = cfs_rq->min_vruntime; @@ -618,9 +771,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) curr = NULL; } - if (leftmost) { /* non-empty tree */ - struct sched_entity *se = __node_2_se(leftmost); - + if (se) { if (!curr) vruntime = se->vruntime; else @@ -629,7 +780,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) /* ensure we never gain time by being placed backwards. */ u64_u32_store(cfs_rq->min_vruntime, - max_vruntime(cfs_rq->min_vruntime, vruntime)); + __update_min_vruntime(cfs_rq, vruntime)); } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) @@ -637,17 +788,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } +#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) + +static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) +{ + if (node) { + struct sched_entity *rse = __node_2_se(node); + if (deadline_gt(min_deadline, se, rse)) + se->min_deadline = rse->min_deadline; + } +} + +/* + * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) + */ +static inline bool min_deadline_update(struct sched_entity *se, bool exit) +{ + u64 old_min_deadline = se->min_deadline; + struct rb_node *node = &se->run_node; + + se->min_deadline = se->deadline; + __update_min_deadline(se, node->rb_right); + __update_min_deadline(se, node->rb_left); + + return se->min_deadline == old_min_deadline; +} + +RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, + run_node, min_deadline, min_deadline_update); + /* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); + avg_vruntime_add(cfs_rq, se); + se->min_deadline = se->deadline; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + __entity_less, &min_deadline_cb); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); + rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + &min_deadline_cb); + avg_vruntime_sub(cfs_rq, se); } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) @@ -660,14 +845,88 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) return __node_2_se(left); } -static struct sched_entity *__pick_next_entity(struct sched_entity *se) +/* + * Earliest Eligible Virtual Deadline First + * + * In order to provide latency guarantees for different request sizes + * EEVDF selects the best runnable task from two criteria: + * + * 1) the task must be eligible (must be owed service) + * + * 2) from those tasks that meet 1), we select the one + * with the earliest virtual deadline. + * + * We can do this in O(log n) time due to an augmented RB-tree. The + * tree keeps the entries sorted on service, but also functions as a + * heap based on the deadline by keeping: + * + * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) + * + * Which allows an EDF like search on (sub)trees. + */ +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { - struct rb_node *next = rb_next(&se->run_node); + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *curr = cfs_rq->curr; + struct sched_entity *best = NULL; - if (!next) - return NULL; + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) + curr = NULL; + + /* + * Once selected, run a task until it either becomes non-eligible or + * until it gets a new slice. See the HACK in set_next_entity(). + */ + if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) + return curr; + + while (node) { + struct sched_entity *se = __node_2_se(node); + + /* + * If this entity is not eligible, try the left subtree. + */ + if (!entity_eligible(cfs_rq, se)) { + node = node->rb_left; + continue; + } + + /* + * If this entity has an earlier deadline than the previous + * best, take this one. If it also has the earliest deadline + * of its subtree, we're done. + */ + if (!best || deadline_gt(deadline, best, se)) { + best = se; + if (best->deadline == best->min_deadline) + break; + } + + /* + * If the earlest deadline in this subtree is in the fully + * eligible left half of our space, go there. + */ + if (node->rb_left && + __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { + node = node->rb_left; + continue; + } + + node = node->rb_right; + } + + if (!best || (curr && deadline_gt(deadline, best, curr))) + best = curr; + + if (unlikely(!best)) { + struct sched_entity *left = __pick_first_entity(cfs_rq); + if (left) { + pr_err("EEVDF scheduling fail, picking leftmost\n"); + return left; + } + } - return __node_2_se(next); + return best; } #ifdef CONFIG_SCHED_DEBUG @@ -689,14 +948,9 @@ int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); - sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, - sysctl_sched_min_granularity); - #define WRT_SYSCTL(name) \ (normalized_sysctl_##name = sysctl_##name / (factor)) - WRT_SYSCTL(sched_min_granularity); - WRT_SYSCTL(sched_latency); - WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_base_slice); #undef WRT_SYSCTL return 0; @@ -704,90 +958,36 @@ int sched_update_scaling(void) #endif #endif -/* - * delta /= w - */ -static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) -{ - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = __calc_delta(delta, NICE_0_LOAD, &se->load); - - return delta; -} - -/* - * The idea is to set a period in which each task runs once. - * - * When there are too many tasks (sched_nr_latency) we have to stretch - * this period because otherwise the slices get too small. - * - * p = (nr <= nl) ? l : l*nr/nl - */ -static u64 __sched_period(unsigned long nr_running) -{ - if (unlikely(nr_running > sched_nr_latency)) - return nr_running * sysctl_sched_min_granularity; - else - return sysctl_sched_latency; -} - -static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); /* - * We calculate the wall-time slice from the period by taking a part - * proportional to the weight. - * - * s = p*P[w/rw] + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i + * this is probably good enough. */ -static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned int nr_running = cfs_rq->nr_running; - struct sched_entity *init_se = se; - unsigned int min_gran; - u64 slice; - - if (sched_feat(ALT_PERIOD)) - nr_running = rq_of(cfs_rq)->cfs.h_nr_running; - - slice = __sched_period(nr_running + !se->on_rq); - - for_each_sched_entity(se) { - struct load_weight *load; - struct load_weight lw; - struct cfs_rq *qcfs_rq; - - qcfs_rq = cfs_rq_of(se); - load = &qcfs_rq->load; - - if (unlikely(!se->on_rq)) { - lw = qcfs_rq->load; + if ((s64)(se->vruntime - se->deadline) < 0) + return; - update_load_add(&lw, se->load.weight); - load = &lw; - } - slice = __calc_delta(slice, se->load.weight, load); - } + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. + */ + se->slice = sysctl_sched_base_slice; - if (sched_feat(BASE_SLICE)) { - if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) - min_gran = sysctl_sched_idle_min_granularity; - else - min_gran = sysctl_sched_min_granularity; + /* + * EEVDF: vd_i = ve_i + r_i / w_i + */ + se->deadline = se->vruntime + calc_delta_fair(se->slice, se); - slice = max_t(u64, slice, min_gran); + /* + * The task has consumed its request, reschedule. + */ + if (cfs_rq->nr_running > 1) { + resched_curr(rq_of(cfs_rq)); + clear_buddies(cfs_rq, se); } - - return slice; -} - -/* - * We calculate the vruntime slice of a to-be-inserted task. - * - * vs = s/w - */ -static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - return calc_delta_fair(sched_slice(cfs_rq, se), se); } #include "pelt.h" @@ -922,6 +1122,7 @@ static void update_curr(struct cfs_rq *cfs_rq) schedstat_add(cfs_rq->exec_clock, delta_exec); curr->vruntime += calc_delta_fair(delta_exec, curr); + update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { @@ -3376,16 +3577,36 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { + unsigned long old_weight = se->load.weight; + if (se->on_rq) { /* commit outstanding execution time */ if (cfs_rq->curr == se) update_curr(cfs_rq); + else + avg_vruntime_sub(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); update_load_set(&se->load, weight); + if (!se->on_rq) { + /* + * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), + * we need to scale se->vlag when w_i changes. + */ + se->vlag = div_s64(se->vlag * old_weight, weight); + } else { + s64 deadline = se->deadline - se->vruntime; + /* + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. + */ + deadline = div_s64(deadline * old_weight, weight); + se->deadline = se->vruntime + deadline; + } + #ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); @@ -3395,9 +3616,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, #endif enqueue_load_avg(cfs_rq, se); - if (se->on_rq) + if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - + if (cfs_rq->curr != se) + avg_vruntime_add(cfs_rq, se); + } } void reweight_task(struct task_struct *p, int prio) @@ -4693,159 +4916,125 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ -static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ -#ifdef CONFIG_SCHED_DEBUG - s64 d = se->vruntime - cfs_rq->min_vruntime; - - if (d < 0) - d = -d; - - if (d > 3*sysctl_sched_latency) - schedstat_inc(cfs_rq->nr_spread_over); -#endif -} - -static inline bool entity_is_long_sleeper(struct sched_entity *se) +static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - struct cfs_rq *cfs_rq; - u64 sleep_time; - - if (se->exec_start == 0) - return false; + u64 vslice = calc_delta_fair(se->slice, se); + u64 vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; - cfs_rq = cfs_rq_of(se); - - sleep_time = rq_clock_task(rq_of(cfs_rq)); + /* + * Due to how V is constructed as the weighted average of entities, + * adding tasks with positive lag, or removing tasks with negative lag + * will move 'time' backwards, this can screw around with the lag of + * other tasks. + * + * EEVDF: placement strategy #1 / #2 + */ + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; - /* Happen while migrating because of clock task divergence */ - if (sleep_time <= se->exec_start) - return false; + lag = se->vlag; - sleep_time -= se->exec_start; - if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) - return true; + /* + * If we want to place a task and preserve lag, we have to + * consider the effect of the new entity on the weighted + * average and compensate for this, otherwise lag can quickly + * evaporate. + * + * Lag is defined as: + * + * lag_i = S - s_i = w_i * (V - v_i) + * + * To avoid the 'w_i' term all over the place, we only track + * the virtual lag: + * + * vl_i = V - v_i <=> v_i = V - vl_i + * + * And we take V to be the weighted average of all v: + * + * V = (\Sum w_j*v_j) / W + * + * Where W is: \Sum w_j + * + * Then, the weighted average after adding an entity with lag + * vl_i is given by: + * + * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) + * = (W*V + w_i*(V - vl_i)) / (W + w_i) + * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) + * = (V*(W + w_i) - w_i*l) / (W + w_i) + * = V - w_i*vl_i / (W + w_i) + * + * And the actual lag after adding an entity with vl_i is: + * + * vl'_i = V' - v_i + * = V - w_i*vl_i / (W + w_i) - (V - vl_i) + * = vl_i - w_i*vl_i / (W + w_i) + * + * Which is strictly less than vl_i. So in order to preserve lag + * we should inflate the lag before placement such that the + * effective lag after placement comes out right. + * + * As such, invert the above relation for vl'_i to get the vl_i + * we need to use such that the lag after placement is the lag + * we computed before dequeue. + * + * vl'_i = vl_i - w_i*vl_i / (W + w_i) + * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i) + * + * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i + * = W*vl_i + * + * vl_i = (W + w_i)*vl'_i / W + */ + load = cfs_rq->avg_load; + if (curr && curr->on_rq) + load += scale_load_down(curr->load.weight); - return false; -} + lag *= load + scale_load_down(se->load.weight); + if (WARN_ON_ONCE(!load)) + load = 1; + lag = div_s64(lag, load); + } -static void -place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) -{ - u64 vruntime = cfs_rq->min_vruntime; + se->vruntime = vruntime - lag; /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. + * When joining the competition; the exisiting tasks will be, + * on average, halfway through their slice, as such start tasks + * off with half a slice to ease into the competition. */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); - - /* sleeps up to a single latency don't count. */ - if (!initial) { - unsigned long thresh; - - if (se_is_idle(se)) - thresh = sysctl_sched_min_granularity; - else - thresh = sysctl_sched_latency; + if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) + vslice /= 2; - /* - * Halve their sleep time's effect, to allow - * for a gentler effect of sleepers: - */ - if (sched_feat(GENTLE_FAIR_SLEEPERS)) - thresh >>= 1; - - vruntime -= thresh; - } - - /* - * Pull vruntime of the entity being placed to the base level of - * cfs_rq, to prevent boosting it if placed backwards. - * However, min_vruntime can advance much faster than real time, with - * the extreme being when an entity with the minimal weight always runs - * on the cfs_rq. If the waking entity slept for a long time, its - * vruntime difference from min_vruntime may overflow s64 and their - * comparison may get inversed, so ignore the entity's original - * vruntime in that case. - * The maximal vruntime speedup is given by the ratio of normal to - * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES. - * When placing a migrated waking entity, its exec_start has been set - * from a different rq. In order to take into account a possible - * divergence between new and prev rq's clocks task because of irq and - * stolen time, we take an additional margin. - * So, cutting off on the sleep time of - * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days - * should be safe. - */ - if (entity_is_long_sleeper(se)) - se->vruntime = vruntime; - else - se->vruntime = max_vruntime(se->vruntime, vruntime); + /* + * EEVDF: vd_i = ve_i + r_i/w_i + */ + se->deadline = se->vruntime + vslice; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); -/* - * MIGRATION - * - * dequeue - * update_curr() - * update_min_vruntime() - * vruntime -= min_vruntime - * - * enqueue - * update_curr() - * update_min_vruntime() - * vruntime += min_vruntime - * - * this way the vruntime transition between RQs is done when both - * min_vruntime are up-to-date. - * - * WAKEUP (remote) - * - * ->migrate_task_rq_fair() (p->state == TASK_WAKING) - * vruntime -= min_vruntime - * - * enqueue - * update_curr() - * update_min_vruntime() - * vruntime += min_vruntime - * - * this way we don't have the most up-to-date min_vruntime on the originating - * CPU and an up-to-date min_vruntime on the destination CPU. - */ - static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { - bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); bool curr = cfs_rq->curr == se; /* * If we're the current task, we must renormalise before calling * update_curr(). */ - if (renorm && curr) - se->vruntime += cfs_rq->min_vruntime; + if (curr) + place_entity(cfs_rq, se, flags); update_curr(cfs_rq); /* - * Otherwise, renormalise after, such that we're placed at the current - * moment in time, instead of some random moment in the past. Being - * placed in the past could significantly boost this task to the - * fairness detriment of existing tasks. - */ - if (renorm && !curr) - se->vruntime += cfs_rq->min_vruntime; - - /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. * - For group_entity, update its runnable_weight to reflect the new @@ -4856,37 +5045,46 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); se_update_runnable(se); + /* + * XXX update_load_avg() above will have attached us to the pelt sum; + * but update_cfs_group() here will re-adjust the weight and have to + * undo/redo all that. Seems wasteful. + */ update_cfs_group(se); + + /* + * XXX now that the entity has been re-weighted, and it's lag adjusted, + * we can place the entity. + */ + if (!curr) + place_entity(cfs_rq, se, flags); + account_entity_enqueue(cfs_rq, se); - if (flags & ENQUEUE_WAKEUP) - place_entity(cfs_rq, se, 0); /* Entity has migrated, no longer consider this task hot */ if (flags & ENQUEUE_MIGRATED) se->exec_start = 0; check_schedstat_required(); update_stats_enqueue_fair(cfs_rq, se, flags); - check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; if (cfs_rq->nr_running == 1) { check_enqueue_throttle(cfs_rq); - if (!throttled_hierarchy(cfs_rq)) + if (!throttled_hierarchy(cfs_rq)) { list_add_leaf_cfs_rq(cfs_rq); - } -} - -static void __clear_buddies_last(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last != se) - break; + } else { +#ifdef CONFIG_CFS_BANDWIDTH + struct rq *rq = rq_of(cfs_rq); - cfs_rq->last = NULL; + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) + cfs_rq->throttled_clock = rq_clock(rq); + if (!cfs_rq->throttled_clock_self) + cfs_rq->throttled_clock_self = rq_clock(rq); +#endif + } } } @@ -4901,27 +5099,10 @@ static void __clear_buddies_next(struct sched_entity *se) } } -static void __clear_buddies_skip(struct sched_entity *se) -{ - for_each_sched_entity(se) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip != se) - break; - - cfs_rq->skip = NULL; - } -} - static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->last == se) - __clear_buddies_last(se); - if (cfs_rq->next == se) __clear_buddies_next(se); - - if (cfs_rq->skip == se) - __clear_buddies_skip(se); } static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -4955,20 +5136,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); + update_entity_lag(cfs_rq, se); if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; account_entity_dequeue(cfs_rq, se); - /* - * Normalize after update_curr(); which will also have moved - * min_vruntime if @se is the one holding it back. But before doing - * update_min_vruntime() again, which will discount @se's position and - * can move min_vruntime forward still more. - */ - if (!(flags & DEQUEUE_SLEEP)) - se->vruntime -= cfs_rq->min_vruntime; - /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); @@ -4987,52 +5160,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_idle_cfs_rq_clock_pelt(cfs_rq); } -/* - * Preempt the current task with a newly woken task if needed: - */ -static void -check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - unsigned long ideal_runtime, delta_exec; - struct sched_entity *se; - s64 delta; - - /* - * When many tasks blow up the sched_period; it is possible that - * sched_slice() reports unusually large results (when many tasks are - * very light for example). Therefore impose a maximum. - */ - ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); - - delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; - if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); - /* - * The current task ran long enough, ensure it doesn't get - * re-elected due to buddy favours. - */ - clear_buddies(cfs_rq, curr); - return; - } - - /* - * Ensure that a task that missed wakeup preemption by a - * narrow margin doesn't have to wait for a full slice. - * This also mitigates buddy induced latencies under load. - */ - if (delta_exec < sysctl_sched_min_granularity) - return; - - se = __pick_first_entity(cfs_rq); - delta = curr->vruntime - se->vruntime; - - if (delta < 0) - return; - - if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); -} - static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -5048,6 +5175,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); + /* + * HACK, stash a copy of deadline at the point of pick in vlag, + * which isn't used until dequeue. + */ + se->vlag = se->deadline; } update_stats_curr_start(cfs_rq, se); @@ -5071,9 +5203,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); - /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups @@ -5084,50 +5213,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *left = __pick_first_entity(cfs_rq); - struct sched_entity *se; - /* - * If curr is set we have to see if its left of the leftmost entity - * still in the tree, provided there was anything in the tree at all. + * Enabling NEXT_BUDDY will affect latency but not fairness. */ - if (!left || (curr && entity_before(curr, left))) - left = curr; - - se = left; /* ideally we run the leftmost entity */ - - /* - * Avoid running the skip buddy, if running something else can - * be done without getting too unfair. - */ - if (cfs_rq->skip && cfs_rq->skip == se) { - struct sched_entity *second; - - if (se == curr) { - second = __pick_first_entity(cfs_rq); - } else { - second = __pick_next_entity(se); - if (!second || (curr && entity_before(curr, second))) - second = curr; - } - - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; - } + if (sched_feat(NEXT_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) + return cfs_rq->next; - if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) { - /* - * Someone really wants this to run. If it's not unfair, run it. - */ - se = cfs_rq->next; - } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) { - /* - * Prefer last buddy, try to return the CPU to a preempted task. - */ - se = cfs_rq->last; - } - - return se; + return pick_eevdf(cfs_rq); } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); @@ -5144,8 +5237,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); - if (prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ @@ -5186,9 +5277,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) return; #endif - - if (cfs_rq->nr_running > 1) - check_preempt_tick(cfs_rq, curr); } @@ -5378,6 +5466,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) /* Add cfs_rq with load or one or more already running entities to the list */ if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); + + if (cfs_rq->throttled_clock_self) { + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; + + cfs_rq->throttled_clock_self = 0; + + if (SCHED_WARN_ON((s64)delta < 0)) + delta = 0; + + cfs_rq->throttled_clock_self_time += delta; + } } return 0; @@ -5392,6 +5491,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) if (!cfs_rq->throttle_count) { cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); + + SCHED_WARN_ON(cfs_rq->throttled_clock_self); + if (cfs_rq->nr_running) + cfs_rq->throttled_clock_self = rq_clock(rq); } cfs_rq->throttle_count++; @@ -5481,7 +5584,9 @@ done: * throttled-list. rq->lock protects completion. */ cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq); + SCHED_WARN_ON(cfs_rq->throttled_clock); + if (cfs_rq->nr_running) + cfs_rq->throttled_clock = rq_clock(rq); return true; } @@ -5499,7 +5604,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) update_rq_clock(rq); raw_spin_lock(&cfs_b->lock); - cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + if (cfs_rq->throttled_clock) { + cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; + cfs_rq->throttled_clock = 0; + } list_del_rcu(&cfs_rq->throttled_list); raw_spin_unlock(&cfs_b->lock); @@ -6015,13 +6123,14 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) { raw_spin_lock_init(&cfs_b->lock); cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; cfs_b->period = ns_to_ktime(default_cfs_period()); cfs_b->burst = 0; + cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); @@ -6158,6 +6267,46 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) rq_clock_stop_loop_update(rq); } +bool cfs_task_bw_constrained(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + if (!cfs_bandwidth_used()) + return false; + + if (cfs_rq->runtime_enabled || + tg_cfs_bandwidth(cfs_rq->tg)->hierarchical_quota != RUNTIME_INF) + return true; + + return false; +} + +#ifdef CONFIG_NO_HZ_FULL +/* called from pick_next_task_fair() */ +static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) +{ + int cpu = cpu_of(rq); + + if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) + return; + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (rq->nr_running != 1) + return; + + /* + * We know there is only one task runnable and we've just picked it. The + * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will + * be otherwise able to stop the tick. Just need to check if we are using + * bandwidth control. + */ + if (cfs_task_bw_constrained(p)) + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#endif + #else /* CONFIG_CFS_BANDWIDTH */ static inline bool cfs_bandwidth_used(void) @@ -6188,7 +6337,7 @@ static inline int throttled_lb_pair(struct task_group *tg, } #ifdef CONFIG_FAIR_GROUP_SCHED -void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} #endif @@ -6199,9 +6348,18 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} - +#ifdef CONFIG_CGROUP_SCHED +bool cfs_task_bw_constrained(struct task_struct *p) +{ + return false; +} +#endif #endif /* CONFIG_CFS_BANDWIDTH */ +#if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) +static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} +#endif + /************************************************** * CFS operations on tasks: */ @@ -6210,13 +6368,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); SCHED_WARN_ON(task_rq(p) != rq); if (rq->cfs.h_nr_running > 1) { - u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + u64 slice = se->slice; s64 delta = slice - ran; if (delta < 0) { @@ -6240,8 +6397,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; - if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) - hrtick_start_fair(rq, curr); + hrtick_start_fair(rq, curr); } #else /* !CONFIG_SCHED_HRTICK */ static inline void @@ -6282,17 +6438,6 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -/* - * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use - * of idle_nr_running, which does not consider idle descendants of normal - * entities. - */ -static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq) -{ - return cfs_rq->nr_running && - cfs_rq->nr_running == cfs_rq->idle_nr_running; -} - #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { @@ -7065,7 +7210,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_min = uclamp_eff_value(p, UCLAMP_MIN); util_max = uclamp_eff_value(p, UCLAMP_MAX); - for_each_cpu_wrap(cpu, cpus, target + 1) { + for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) @@ -7289,9 +7434,6 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); - if (boost) - util_est = max(util_est, runnable); - /* * During wake-up @p isn't enqueued yet and doesn't contribute * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. @@ -7741,6 +7883,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (wake_flags & WF_TTWU) { record_wakee(p); + if ((wake_flags & WF_CURRENT_CPU) && + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + if (sched_energy_enabled()) { new_cpu = find_energy_efficient_cpu(p, prev_cpu); if (new_cpu >= 0) @@ -7798,18 +7944,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { struct sched_entity *se = &p->se; - /* - * As blocked tasks retain absolute vruntime the migration needs to - * deal with this by subtracting the old and adding the new - * min_vruntime -- the latter is done by enqueue_entity() when placing - * the task on the new runqueue. - */ - if (READ_ONCE(p->__state) == TASK_WAKING) { - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); - } - if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); @@ -7847,66 +7981,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ -static unsigned long wakeup_gran(struct sched_entity *se) -{ - unsigned long gran = sysctl_sched_wakeup_granularity; - - /* - * Since its curr running now, convert the gran from real-time - * to virtual-time in his units. - * - * By using 'se' instead of 'curr' we penalize light tasks, so - * they get preempted easier. That is, if 'se' < 'curr' then - * the resulting gran will be larger, therefore penalizing the - * lighter, if otoh 'se' > 'curr' then the resulting gran will - * be smaller, again penalizing the lighter task. - * - * This is especially important for buddies when the leftmost - * task is higher priority than the buddy. - */ - return calc_delta_fair(gran, se); -} - -/* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff <= 0) - return -1; - - gran = wakeup_gran(se); - if (vdiff > gran) - return 1; - - return 0; -} - -static void set_last_buddy(struct sched_entity *se) -{ - for_each_sched_entity(se) { - if (SCHED_WARN_ON(!se->on_rq)) - return; - if (se_is_idle(se)) - return; - cfs_rq_of(se)->last = se; - } -} - static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { @@ -7918,12 +7992,6 @@ static void set_next_buddy(struct sched_entity *se) } } -static void set_skip_buddy(struct sched_entity *se) -{ - for_each_sched_entity(se) - cfs_rq_of(se)->skip = se; -} - /* * Preempt the current task with a newly woken task if needed: */ @@ -7932,7 +8000,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int scale = cfs_rq->nr_running >= sched_nr_latency; int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; @@ -7948,7 +8015,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; - if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { set_next_buddy(pse); next_buddy_marked = 1; } @@ -7993,35 +8060,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (cse_is_idle != pse_is_idle) return; - update_curr(cfs_rq_of(se)); - if (wakeup_preempt_entity(se, pse) == 1) { - /* - * Bias pick_next to pick the sched entity that is - * triggering this preemption. - */ - if (!next_buddy_marked) - set_next_buddy(pse); + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + + /* + * XXX pick_eevdf(cfs_rq) != se ? + */ + if (pick_eevdf(cfs_rq) == pse) goto preempt; - } return; preempt: resched_curr(rq); - /* - * Only set the backward buddy when the current task is still - * on the rq. This can happen when a wakeup gets interleaved - * with schedule on the ->pre_schedule() or idle_balance() - * point, either of which can * drop the rq lock. - * - * Also, during early boot the idle thread is in the fair class, - * for obvious reasons its a bad idea to schedule back to it. - */ - if (unlikely(!se->on_rq || curr == rq->idle)) - return; - - if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) - set_last_buddy(se); } #ifdef CONFIG_SMP @@ -8172,6 +8223,7 @@ done: __maybe_unused; hrtick_start_fair(rq, p); update_misfit_status(p, rq); + sched_fair_update_stop_tick(rq, p); return p; @@ -8222,8 +8274,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) /* * sched_yield() is very simple - * - * The magic of dealing with the ->skip buddy is in pick_next_entity. */ static void yield_task_fair(struct rq *rq) { @@ -8239,21 +8289,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); - if (curr->policy != SCHED_BATCH) { - update_rq_clock(rq); - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); - /* - * Tell update_rq_clock() that we've just updated, - * so we don't do microscopic update in schedule() - * and double the fastpath cost. - */ - rq_clock_skip_update(rq); - } + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq_clock_skip_update(rq); - set_skip_buddy(se); + se->deadline += calc_delta_fair(se->slice, se); } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -8416,6 +8464,11 @@ enum group_type { */ group_misfit_task, /* + * Balance SMT group that's fully busy. Can benefit from migration + * a task on SMT with busy sibling to another CPU on idle core. + */ + group_smt_balance, + /* * SD_ASYM_PACKING only: One local CPU with higher capacity is available, * and the task should be migrated to it instead of running on the * current CPU. @@ -8496,8 +8549,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) * Buddy candidates are cache hot: */ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) + (&p->se == cfs_rq_of(&p->se)->next)) return 1; if (sysctl_sched_migration_cost == -1) @@ -9123,6 +9175,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -9396,6 +9449,9 @@ group_type group_classify(unsigned int imbalance_pct, if (sgs->group_asym_packing) return group_asym_packing; + if (sgs->group_smt_balance) + return group_smt_balance; + if (sgs->group_misfit_task_load) return group_misfit_task; @@ -9465,6 +9521,71 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); } +/* One group has more than one SMT CPU while the other group does not */ +static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, + struct sched_group *sg2) +{ + if (!sg1 || !sg2) + return false; + + return (sg1->flags & SD_SHARE_CPUCAPACITY) != + (sg2->flags & SD_SHARE_CPUCAPACITY); +} + +static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + if (env->idle == CPU_NOT_IDLE) + return false; + + /* + * For SMT source group, it is better to move a task + * to a CPU that doesn't have multiple tasks sharing its CPU capacity. + * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY + * will not be on. + */ + if (group->flags & SD_SHARE_CPUCAPACITY && + sgs->sum_h_nr_running > 1) + return true; + + return false; +} + +static inline long sibling_imbalance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sg_lb_stats *busiest, + struct sg_lb_stats *local) +{ + int ncores_busiest, ncores_local; + long imbalance; + + if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) + return 0; + + ncores_busiest = sds->busiest->cores; + ncores_local = sds->local->cores; + + if (ncores_busiest == ncores_local) { + imbalance = busiest->sum_nr_running; + lsub_positive(&imbalance, local->sum_nr_running); + return imbalance; + } + + /* Balance such that nr_running/ncores ratio are same on both groups */ + imbalance = ncores_local * busiest->sum_nr_running; + lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running); + /* Normalize imbalance and do rounding on normalization */ + imbalance = 2 * imbalance + ncores_local + ncores_busiest; + imbalance /= ncores_local + ncores_busiest; + + /* Take advantage of resource in an empty sched group */ + if (imbalance == 0 && local->sum_nr_running == 0 && + busiest->sum_nr_running > 1) + imbalance = 2; + + return imbalance; +} + static inline bool sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) { @@ -9557,6 +9678,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_asym_packing = 1; } + /* Check for loaded SMT group to be balanced to dst CPU */ + if (!local_group && smt_balance(env, sgs, group)) + sgs->group_smt_balance = 1; + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); /* Computing avg_load makes sense only when group is overloaded */ @@ -9641,6 +9766,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, return false; break; + case group_smt_balance: case group_fully_busy: /* * Select the fully busy group with highest avg_load. In @@ -9670,6 +9796,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, case group_has_spare: /* + * Do not pick sg with SMT CPUs over sg with pure CPUs, + * as we do not want to pull task off SMT core with one task + * and make the core idle. + */ + if (smt_vs_nonsmt_groups(sds->busiest, sg)) { + if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1) + return false; + else + return true; + } + + /* * Select not overloaded group with lowest number of idle cpus * and highest number of running tasks. We could also compare * the spare capacity which is more stable but it can end up @@ -9865,6 +10003,7 @@ static bool update_pick_idlest(struct sched_group *idlest, case group_imbalanced: case group_asym_packing: + case group_smt_balance: /* Those types are not used in the slow wakeup path */ return false; @@ -9996,6 +10135,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) case group_imbalanced: case group_asym_packing: + case group_smt_balance: /* Those type are not used in the slow wakeup path */ return NULL; @@ -10250,6 +10390,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return; } + if (busiest->group_type == group_smt_balance) { + /* Reduce number of tasks sharing CPU capacity */ + env->migration_type = migrate_task; + env->imbalance = 1; + return; + } + if (busiest->group_type == group_imbalanced) { /* * In the group_imb case we cannot rely on group-wide averages @@ -10297,14 +10444,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } if (busiest->group_weight == 1 || sds->prefer_sibling) { - unsigned int nr_diff = busiest->sum_nr_running; /* * When prefer sibling, evenly spread running tasks on * groups. */ env->migration_type = migrate_task; - lsub_positive(&nr_diff, local->sum_nr_running); - env->imbalance = nr_diff; + env->imbalance = sibling_imbalance(env, sds, busiest, local); } else { /* @@ -10501,20 +10646,27 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * group's child domain. */ if (sds.prefer_sibling && local->group_type == group_has_spare && - busiest->sum_nr_running > local->sum_nr_running + 1) + sibling_imbalance(env, &sds, busiest, local) > 1) goto force_balance; if (busiest->group_type != group_overloaded) { - if (env->idle == CPU_NOT_IDLE) + if (env->idle == CPU_NOT_IDLE) { /* * If the busiest group is not overloaded (and as a * result the local one too) but this CPU is already * busy, let another idle CPU try to pull task. */ goto out_balanced; + } + + if (busiest->group_type == group_smt_balance && + smt_vs_nonsmt_groups(sds.local, sds.busiest)) { + /* Let non SMT CPU pull from SMT CPU sharing with sibling */ + goto force_balance; + } if (busiest->group_weight > 1 && - local->idle_cpus <= (busiest->idle_cpus + 1)) + local->idle_cpus <= (busiest->idle_cpus + 1)) { /* * If the busiest group is not overloaded * and there is no imbalance between this and busiest @@ -10525,12 +10677,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * there is more than 1 CPU per group. */ goto out_balanced; + } - if (busiest->sum_h_nr_running == 1) + if (busiest->sum_h_nr_running == 1) { /* * busiest doesn't have any tasks waiting to run */ goto out_balanced; + } } force_balance: @@ -10764,7 +10918,7 @@ static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; - int cpu; + int cpu, idle_smt = -1; /* * Ensure the balancing environment is consistent; can happen @@ -10791,10 +10945,24 @@ static int should_we_balance(struct lb_env *env) if (!idle_cpu(cpu)) continue; + /* + * Don't balance to idle SMT in busy core right away when + * balancing cores, but remember the first idle SMT CPU for + * later consideration. Find CPU on an idle core first. + */ + if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { + if (idle_smt == -1) + idle_smt = cpu; + continue; + } + /* Are we the first idle CPU? */ return cpu == env->dst_cpu; } + if (idle_smt == env->dst_cpu) + return true; + /* Are we the first CPU of this group ? */ return group_balance_cpu(sg) == env->dst_cpu; } @@ -12007,8 +12175,8 @@ static void rq_offline_fair(struct rq *rq) static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { - u64 slice = sched_slice(cfs_rq_of(se), se); u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; + u64 slice = se->slice; return (rtime * min_nr_tasks > slice); } @@ -12164,8 +12332,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; + struct cfs_rq *cfs_rq; struct rq *rq = this_rq(); struct rq_flags rf; @@ -12174,22 +12342,9 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (curr) { + if (curr) update_curr(cfs_rq); - se->vruntime = curr->vruntime; - } - place_entity(cfs_rq, se, 1); - - if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { - /* - * Upon rescheduling, sched_class::put_prev_task() will place - * 'current' within the tree based on its new key value. - */ - swap(curr->vruntime, se->vruntime); - resched_curr(rq); - } - - se->vruntime -= cfs_rq->min_vruntime; + place_entity(cfs_rq, se, ENQUEUE_INITIAL); rq_unlock(rq, &rf); } @@ -12218,34 +12373,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static inline bool vruntime_normalized(struct task_struct *p) -{ - struct sched_entity *se = &p->se; - - /* - * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, - * the dequeue_entity(.flags=0) will already have normalized the - * vruntime. - */ - if (p->on_rq) - return true; - - /* - * When !on_rq, vruntime of the task has usually NOT been normalized. - * But there are some cases where it has already been normalized: - * - * - A forked child which is waiting for being woken up by - * wake_up_new_task(). - * - A task which has been woken up by try_to_wake_up() and - * waiting for actually being woken up by sched_ttwu_pending(). - */ - if (!se->sum_exec_runtime || - (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup)) - return true; - - return false; -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* * Propagate the changes of the sched_entity across the tg tree to make it @@ -12316,16 +12443,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); - - if (!vruntime_normalized(p)) { - /* - * Fix up our vruntime so that the current sleep doesn't - * cause 'unlimited' sleep bonus. - */ - place_entity(cfs_rq, se, 0); - se->vruntime -= cfs_rq->min_vruntime; - } detach_entity_cfs_rq(se); } @@ -12333,12 +12450,8 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); attach_entity_cfs_rq(se); - - if (!vruntime_normalized(p)) - se->vruntime += cfs_rq->min_vruntime; } static void switched_from_fair(struct rq *rq, struct task_struct *p) @@ -12450,7 +12563,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) tg->shares = NICE_0_LOAD; - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); for_each_possible_cpu(i) { cfs_rq = kzalloc_node(sizeof(struct cfs_rq), @@ -12703,7 +12816,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) - rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); + rr_interval = NS_TO_JIFFIES(se->slice); return rr_interval; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index ee7f23c76bd3..f770168230ae 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -1,16 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* - * Only give sleepers 50% of their service deficit. This allows - * them to run sooner, but does not allow tons of sleepers to - * rip the spread apart. - */ -SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) /* - * Place new tasks ahead so that they do not starve already running - * tasks + * Using the avg_vruntime, do the right thing and preserve lag across + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. */ -SCHED_FEAT(START_DEBIT, true) +SCHED_FEAT(PLACE_LAG, true) +SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) +SCHED_FEAT(RUN_TO_PARITY, true) /* * Prefer to schedule the task we woke last (assuming it failed @@ -20,13 +16,6 @@ SCHED_FEAT(START_DEBIT, true) SCHED_FEAT(NEXT_BUDDY, false) /* - * Prefer to schedule the task that ran last (when we did - * wake-preempt) as that likely will touch the same data, increases - * cache locality. - */ -SCHED_FEAT(LAST_BUDDY, true) - -/* * Consider buddies to be cache hot, decreases the likeliness of a * cache buddy being migrated away, increases cache locality. */ @@ -99,5 +88,4 @@ SCHED_FEAT(UTIL_EST_FASTUP, true) SCHED_FEAT(LATENCY_WARN, false) -SCHED_FEAT(ALT_PERIOD, true) -SCHED_FEAT(BASE_SLICE, true) +SCHED_FEAT(HZ_BW, true) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 9bb3f2b3ccfc..1d0f634725a6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -140,7 +140,7 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); -DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); +static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED static bool psi_enable; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 00e0e5074115..0597ba0f85ff 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -25,7 +25,7 @@ unsigned int sysctl_sched_rt_period = 1000000; int sysctl_sched_rt_runtime = 950000; #ifdef CONFIG_SYSCTL -static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; +static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ; static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, @@ -3062,6 +3062,9 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer, sched_rr_timeslice = sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE : msecs_to_jiffies(sysctl_sched_rr_timeslice); + + if (sysctl_sched_rr_timeslice <= 0) + sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE); } mutex_unlock(&mutex); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 44b540ad37b7..04846272409c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -454,11 +454,12 @@ extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); -extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); +extern bool cfs_task_bw_constrained(struct task_struct *p); extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int cpu, @@ -494,6 +495,7 @@ static inline void set_task_rq_fair(struct sched_entity *se, #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; +static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } #endif /* CONFIG_CGROUP_SCHED */ @@ -548,6 +550,9 @@ struct cfs_rq { unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + s64 avg_vruntime; + u64 avg_load; + u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE @@ -567,8 +572,6 @@ struct cfs_rq { */ struct sched_entity *curr; struct sched_entity *next; - struct sched_entity *last; - struct sched_entity *skip; #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; @@ -636,6 +639,8 @@ struct cfs_rq { u64 throttled_clock; u64 throttled_clock_pelt; u64 throttled_clock_pelt_time; + u64 throttled_clock_self; + u64 throttled_clock_self_time; int throttled; int throttle_count; struct list_head throttled_list; @@ -1701,6 +1706,21 @@ rq_unlock(struct rq *rq, struct rq_flags *rf) raw_spin_rq_unlock(rq); } +DEFINE_LOCK_GUARD_1(rq_lock, struct rq, + rq_lock(_T->lock, &_T->rf), + rq_unlock(_T->lock, &_T->rf), + struct rq_flags rf) + +DEFINE_LOCK_GUARD_1(rq_lock_irq, struct rq, + rq_lock_irq(_T->lock, &_T->rf), + rq_unlock_irq(_T->lock, &_T->rf), + struct rq_flags rf) + +DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq, + rq_lock_irqsave(_T->lock, &_T->rf), + rq_unlock_irqrestore(_T->lock, &_T->rf), + struct rq_flags rf) + static inline struct rq * this_rq_lock_irq(struct rq_flags *rf) __acquires(rq->lock) @@ -1883,6 +1903,7 @@ struct sched_group { atomic_t ref; unsigned int group_weight; + unsigned int cores; struct sched_group_capacity *sgc; int asym_prefer_cpu; /* CPU of highest priority in group */ int flags; @@ -2132,12 +2153,13 @@ static inline int task_on_rq_migrating(struct task_struct *p) } /* Wake flags. The first three directly map to some SD flag value */ -#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ -#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ -#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ -#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ -#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); @@ -2196,6 +2218,7 @@ extern const u32 sched_prio_to_wmult[40]; #else #define ENQUEUE_MIGRATED 0x00 #endif +#define ENQUEUE_INITIAL 0x80 #define RETRY_TASK ((void *)-1UL) @@ -2501,11 +2524,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; +extern unsigned int sysctl_sched_base_slice; + #ifdef CONFIG_SCHED_DEBUG -extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_min_granularity; -extern unsigned int sysctl_sched_idle_min_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; @@ -2611,6 +2632,12 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} #endif +#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ +__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \ +static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \ +{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ + _lock; return _t; } + #ifdef CONFIG_SMP static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) @@ -2740,6 +2767,16 @@ static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2) raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING); } +static inline void double_raw_unlock(raw_spinlock_t *l1, raw_spinlock_t *l2) +{ + raw_spin_unlock(l1); + raw_spin_unlock(l2); +} + +DEFINE_LOCK_GUARD_2(double_raw_spinlock, raw_spinlock_t, + double_raw_lock(_T->lock, _T->lock2), + double_raw_unlock(_T->lock, _T->lock2)) + /* * double_rq_unlock - safely unlock two runqueues * @@ -2797,6 +2834,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) #endif +DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, + double_rq_lock(_T->lock, _T->lock2), + double_rq_unlock(_T->lock, _T->lock2)) + extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); @@ -3231,6 +3272,8 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); +extern int try_to_wake_up(struct task_struct *tsk, unsigned int state, int wake_flags); + #ifdef CONFIG_PREEMPT_DYNAMIC extern int preempt_dynamic_mode; extern int sched_dynamic_mode(const char *str); @@ -3482,4 +3525,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } #endif +extern u64 avg_vruntime(struct cfs_rq *cfs_rq); +extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 76b9b796e695..72505cd3b60a 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -18,7 +18,7 @@ EXPORT_SYMBOL(__init_swait_queue_head); * If for some reason it would return 0, that means the previously waiting * task is already running, so it will observe condition true (or has already). */ -void swake_up_locked(struct swait_queue_head *q) +void swake_up_locked(struct swait_queue_head *q, int wake_flags) { struct swait_queue *curr; @@ -26,7 +26,7 @@ void swake_up_locked(struct swait_queue_head *q) return; curr = list_first_entry(&q->task_list, typeof(*curr), task_list); - wake_up_process(curr->task); + try_to_wake_up(curr->task, TASK_NORMAL, wake_flags); list_del_init(&curr->task_list); } EXPORT_SYMBOL(swake_up_locked); @@ -41,7 +41,7 @@ EXPORT_SYMBOL(swake_up_locked); void swake_up_all_locked(struct swait_queue_head *q) { while (!list_empty(&q->task_list)) - swake_up_locked(q); + swake_up_locked(q, 0); } void swake_up_one(struct swait_queue_head *q) @@ -49,7 +49,7 @@ void swake_up_one(struct swait_queue_head *q) unsigned long flags; raw_spin_lock_irqsave(&q->lock, flags); - swake_up_locked(q); + swake_up_locked(q, 0); raw_spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(swake_up_one); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d3a3b2646ec4..05a5bc678c08 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -722,8 +722,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (parent->parent) { parent->parent->child = tmp; - if (tmp->flags & SD_SHARE_CPUCAPACITY) - parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY; + parent->parent->groups->flags = tmp->flags; } /* @@ -1275,14 +1274,24 @@ build_sched_groups(struct sched_domain *sd, int cpu) static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) { struct sched_group *sg = sd->groups; + struct cpumask *mask = sched_domains_tmpmask2; WARN_ON(!sg); do { - int cpu, max_cpu = -1; + int cpu, cores = 0, max_cpu = -1; sg->group_weight = cpumask_weight(sched_group_span(sg)); + cpumask_copy(mask, sched_group_span(sg)); + for_each_cpu(cpu, mask) { + cores++; +#ifdef CONFIG_SCHED_SMT + cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); +#endif + } + sg->cores = cores; + if (!(sd->flags & SD_ASYM_PACKING)) goto next; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 48c53e4739ea..802d98cf2de3 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -161,6 +161,11 @@ int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, } EXPORT_SYMBOL(__wake_up); +void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key) +{ + __wake_up_common_lock(wq_head, mode, 1, WF_CURRENT_CPU, key); +} + /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ |