summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2024-09-11 08:43:26 -1000
committerTejun Heo <tj@kernel.org>2024-09-11 08:43:26 -1000
commit0b1777f0fa045c561fd26c8fda61f5eb7a930ed3 (patch)
tree7f61b85ef4ab6ba1f4d5b6ee477c9f5b36a92690 /kernel/sched
parent513ed0c7ccc103c2ff668154854ec410729a3170 (diff)
parentbc9057da1a220ff2cb6c8885fd5352558aceba2c (diff)
Merge branch 'tip/sched/core' into sched_ext/for-6.12
Pull in tip/sched/core to resolve two merge conflicts: - 96fd6c65efc6 ("sched: Factor out update_other_load_avgs() from __update_blocked_others()") 5d871a63997f ("sched/fair: Move effective_cpu_util() and effective_cpu_util() in fair.c") A simple context conflict. The former added __update_blocked_others() in the same #ifdef CONFIG_SMP block that effective_cpu_util() and sched_cpu_util() are in and the latter moved those functions to fair.c. This makes __update_blocked_others() more out of place. Will follow up with a patch to relocate. - 96fd6c65efc6 ("sched: Factor out update_other_load_avgs() from __update_blocked_others()") 84d265281d6c ("sched/pelt: Use rq_clock_task() for hw_pressure") The former factored out the body of __update_blocked_others() into update_other_load_avgs(). The latter changed how update_hw_load_avg() is called in the body. Resolved by applying the change to update_other_load_avgs() instead. Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c51
-rw-r--r--kernel/sched/cpufreq_schedutil.c6
-rw-r--r--kernel/sched/debug.c31
-rw-r--r--kernel/sched/fair.c115
-rw-r--r--kernel/sched/syscalls.c102
5 files changed, 168 insertions, 137 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b01b63e78d5e..c415d61a646b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -267,6 +267,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
+ if (p->se.sched_delayed)
+ return;
+
rq->core->core_task_seq++;
if (!p->core_cookie)
@@ -277,6 +280,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p)
void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
+ if (p->se.sched_delayed)
+ return;
+
rq->core->core_task_seq++;
if (sched_core_enqueued(p)) {
@@ -6477,19 +6483,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* Constants for the sched_mode argument of __schedule().
*
* The mode argument allows RT enabled kernels to differentiate a
- * preemption from blocking on an 'sleeping' spin/rwlock. Note that
- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
- * optimize the AND operation out and just check for zero.
+ * preemption from blocking on an 'sleeping' spin/rwlock.
*/
-#define SM_NONE 0x0
-#define SM_PREEMPT 0x1
-#define SM_RTLOCK_WAIT 0x2
-
-#ifndef CONFIG_PREEMPT_RT
-# define SM_MASK_PREEMPT (~0U)
-#else
-# define SM_MASK_PREEMPT SM_PREEMPT
-#endif
+#define SM_IDLE (-1)
+#define SM_NONE 0
+#define SM_PREEMPT 1
+#define SM_RTLOCK_WAIT 2
/*
* __schedule() is the main scheduler function.
@@ -6530,9 +6529,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* WARNING: must be called with preemption disabled!
*/
-static void __sched notrace __schedule(unsigned int sched_mode)
+static void __sched notrace __schedule(int sched_mode)
{
struct task_struct *prev, *next;
+ /*
+ * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted
+ * as a preemption by schedule_debug() and RCU.
+ */
+ bool preempt = sched_mode > SM_NONE;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
@@ -6543,13 +6547,13 @@ static void __sched notrace __schedule(unsigned int sched_mode)
rq = cpu_rq(cpu);
prev = rq->curr;
- schedule_debug(prev, !!sched_mode);
+ schedule_debug(prev, preempt);
if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
hrtick_clear(rq);
local_irq_disable();
- rcu_note_context_switch(!!sched_mode);
+ rcu_note_context_switch(preempt);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -6578,12 +6582,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
switch_count = &prev->nivcsw;
+ /* Task state changes only considers SM_PREEMPT as preemption */
+ preempt = sched_mode == SM_PREEMPT;
+
/*
* We must load prev->state once (task_struct::state is volatile), such
* that we form a control dependency vs deactivate_task() below.
*/
prev_state = READ_ONCE(prev->__state);
- if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
+ if (sched_mode == SM_IDLE) {
+ if (!rq->nr_running) {
+ next = prev;
+ goto picked;
+ }
+ } else if (!preempt && prev_state) {
if (signal_pending_state(prev_state, prev)) {
WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
@@ -6614,6 +6626,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
}
next = pick_next_task(rq, prev, &rf);
+picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
@@ -6655,7 +6668,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
+ trace_sched_switch(preempt, prev, next, prev_state);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -6731,7 +6744,7 @@ static void sched_update_worker(struct task_struct *tsk)
}
}
-static __always_inline void __schedule_loop(unsigned int sched_mode)
+static __always_inline void __schedule_loop(int sched_mode)
{
do {
preempt_disable();
@@ -6776,7 +6789,7 @@ void __sched schedule_idle(void)
*/
WARN_ON_ONCE(current->__state);
do {
- __schedule(SM_NONE);
+ __schedule(SM_IDLE);
} while (need_resched());
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e683e5d08daa..c6ba15388ea7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -662,9 +662,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
* Fake (unused) bandwidth; workaround to "fix"
* priority inheritance.
*/
- .sched_runtime = 1000000,
- .sched_deadline = 10000000,
- .sched_period = 10000000,
+ .sched_runtime = NSEC_PER_MSEC,
+ .sched_deadline = 10 * NSEC_PER_MSEC,
+ .sched_period = 10 * NSEC_PER_MSEC,
};
struct cpufreq_policy *policy = sg_policy->policy;
int ret;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8ae255a257a2..f4035c7a0fa1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -739,7 +739,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
else
SEQ_printf(m, " %c", task_state_to_char(p));
- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
+ SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
SPLIT_NS(p->se.vruntime),
entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N',
@@ -750,17 +750,16 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
(long long)(p->nvcsw + p->nivcsw),
p->prio);
- SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld",
+ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld",
SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)),
- SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)),
SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime)));
#ifdef CONFIG_NUMA_BALANCING
- SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
+ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
#ifdef CONFIG_CGROUP_SCHED
- SEQ_printf_task_group_path(m, task_group(p), " %s")
+ SEQ_printf_task_group_path(m, task_group(p), " %s")
#endif
SEQ_printf(m, "\n");
@@ -772,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m, "\n");
SEQ_printf(m, "runnable tasks:\n");
- SEQ_printf(m, " S task PID tree-key switches prio"
- " wait-time sum-exec sum-sleep\n");
+ SEQ_printf(m, " S task PID vruntime eligible "
+ "deadline slice sum-exec switches "
+ "prio wait-time sum-sleep sum-block"
+#ifdef CONFIG_NUMA_BALANCING
+ " node group-id"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ " group-path"
+#endif
+ "\n");
SEQ_printf(m, "-------------------------------------------------------"
- "------------------------------------------------------\n");
+ "------------------------------------------------------"
+ "------------------------------------------------------"
+#ifdef CONFIG_NUMA_BALANCING
+ "--------------"
+#endif
+#ifdef CONFIG_CGROUP_SCHED
+ "--------------"
+#endif
+ "\n");
rcu_read_lock();
for_each_process_thread(g, p) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 89f73ee3822c..dc98784049aa 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6949,18 +6949,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int rq_h_nr_running = rq->cfs.h_nr_running;
u64 slice = 0;
- if (flags & ENQUEUE_DELAYED) {
- requeue_delayed_entity(se);
- return;
- }
-
/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
* Let's add the task's estimated utilization to the cfs_rq's
* estimated utilization, before we update schedutil.
*/
- util_est_enqueue(&rq->cfs, p);
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE))))
+ util_est_enqueue(&rq->cfs, p);
+
+ if (flags & ENQUEUE_DELAYED) {
+ requeue_delayed_entity(se);
+ return;
+ }
/*
* If in_iowait is set, the code below may not trigger any cpufreq
@@ -7178,7 +7179,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
*/
static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
- util_est_dequeue(&rq->cfs, p);
+ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
+ util_est_dequeue(&rq->cfs, p);
if (dequeue_entities(rq, &p->se, flags) < 0) {
util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
@@ -8086,6 +8088,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
}
/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the IRQ utilization.
+ *
+ * The DL bandwidth number OTOH is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max)
+{
+ unsigned long util, irq, scale;
+ struct rq *rq = cpu_rq(cpu);
+
+ scale = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time to the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ util += cpu_util_dl(rq);
+
+ /*
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
+ */
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+ if (util >= scale)
+ return scale;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
+ */
+ util = scale_irq_capacity(util, irq, scale);
+ util += irq;
+
+ return min(scale, util);
+}
+
+unsigned long sched_cpu_util(int cpu)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+}
+
+/*
* energy_env - Utilization landscape for energy estimation.
* @task_busy_time: Utilization contribution by the task for which we test the
* placement. Given by eenv_task_busy_time().
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 7ecade89eada..b621e0050e42 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -272,110 +272,12 @@ bool update_other_load_avgs(struct rq *rq)
lockdep_assert_rq_held(rq);
+ /* hw_pressure doesn't care about invariance */
return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_hw_load_avg(now, rq, hw_pressure) |
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure) |
update_irq_load_avg(rq, 0);
}
-
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the IRQ utilization.
- *
- * The DL bandwidth number OTOH is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long *min,
- unsigned long *max)
-{
- unsigned long util, irq, scale;
- struct rq *rq = cpu_rq(cpu);
-
- scale = arch_scale_cpu_capacity(cpu);
-
- /*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= scale)) {
- if (min)
- *min = scale;
- if (max)
- *max = scale;
- return scale;
- }
-
- if (min) {
- /*
- * The minimum utilization returns the highest level between:
- * - the computed DL bandwidth needed with the IRQ pressure which
- * steals time to the deadline task.
- * - The minimum performance requirement for CFS and/or RT.
- */
- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-
- /*
- * When an RT task is runnable and uclamp is not used, we must
- * ensure that the task will run at maximum compute capacity.
- */
- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
- *min = max(*min, scale);
- }
-
- /*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- */
- util = util_cfs + cpu_util_rt(rq);
- util += cpu_util_dl(rq);
-
- /*
- * The maximum hint is a soft bandwidth requirement, which can be lower
- * than the actual utilization because of uclamp_max requirements.
- */
- if (max)
- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-
- if (util >= scale)
- return scale;
-
- /*
- * There is still idle time; further improve the number by using the
- * IRQ metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, scale);
- util += irq;
-
- return min(scale, util);
-}
-
-unsigned long sched_cpu_util(int cpu)
-{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
-}
#endif /* CONFIG_SMP */
/**