summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-16 17:00:50 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-16 17:00:50 -0700
commit4a996d90b9e046c6d59845acf00a54d464c34ff3 (patch)
tree52fa410c742aaec6b469a3907e71eefc75e999b3 /kernel
parent0c182ac2ebc5470a725632b08cee9a52065bbe71 (diff)
parentdb43a609d01e8bf9b812d45dc2945c65b57dd793 (diff)
Merge tag 'sched-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: - Update Daniel Bristot de Oliveira's entry in MAINTAINERS, and credit him in CREDITS - Harmonize the lock-yielding behavior on dynamically selected preemption models with static ones - Reorganize the code a bit: split out sched/syscalls.c to reduce the size of sched/core.c - Micro-optimize psi_group_change() - Fix set_load_weight() for SCHED_IDLE tasks - Misc cleanups & fixes * tag 'sched-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched: Update MAINTAINERS and CREDITS sched/fair: set_load_weight() must also call reweight_task() for SCHED_IDLE tasks sched/psi: Optimise psi_group_change a bit sched/core: Drop spinlocks on contention iff kernel is preemptible sched/core: Move preempt_model_*() helpers from sched.h to preempt.h sched/balance: Skip unnecessary updates to idle load balancer's flags idle: Remove stale RCU comment sched/headers: Move struct pre-declarations to the beginning of the header sched/core: Clean up kernel/sched/sched.h a bit sched/core: Simplify prefetch_curr_exec_start() sched: Fix spelling in comments sched/syscalls: Split out kernel/sched/syscalls.c from kernel/sched/core.c
Diffstat (limited to 'kernel')
-rw-r--r--kernel/sched/build_policy.c1
-rw-r--r--kernel/sched/clock.c4
-rw-r--r--kernel/sched/core.c1874
-rw-r--r--kernel/sched/core_sched.c2
-rw-r--r--kernel/sched/cputime.c14
-rw-r--r--kernel/sched/deadline.c8
-rw-r--r--kernel/sched/fair.c18
-rw-r--r--kernel/sched/idle.c12
-rw-r--r--kernel/sched/loadavg.c4
-rw-r--r--kernel/sched/pelt.c4
-rw-r--r--kernel/sched/psi.c60
-rw-r--r--kernel/sched/rt.c22
-rw-r--r--kernel/sched/sched.h434
-rw-r--r--kernel/sched/stats.h2
-rw-r--r--kernel/sched/syscalls.c1699
-rw-r--r--kernel/sched/topology.c12
-rw-r--r--kernel/sched/wait_bit.c4
17 files changed, 2130 insertions, 2044 deletions
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index d9dc9ab3773f..39c315182b35 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -52,3 +52,4 @@
#include "cputime.c"
#include "deadline.c"
+#include "syscalls.c"
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 3c6193de9cde..a09655b48140 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -340,7 +340,7 @@ again:
this_clock = sched_clock_local(my_scd);
/*
* We must enforce atomic readout on 32-bit, otherwise the
- * update on the remote CPU can hit inbetween the readout of
+ * update on the remote CPU can hit in between the readout of
* the low 32-bit and the high 32-bit portion.
*/
remote_clock = cmpxchg64(&scd->clock, 0, 0);
@@ -444,7 +444,7 @@ notrace void sched_clock_tick_stable(void)
}
/*
- * We are going deep-idle (irqs are disabled):
+ * We are going deep-idle (IRQs are disabled):
*/
notrace void sched_clock_idle_sleep_event(void)
{
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 35a35e36024b..ae5ef3013a55 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2,9 +2,10 @@
/*
* kernel/sched/core.c
*
- * Core kernel scheduler code and related syscalls
+ * Core kernel CPU scheduler code
*
* Copyright (C) 1991-2002 Linus Torvalds
+ * Copyright (C) 1998-2024 Ingo Molnar, Red Hat
*/
#include <linux/highmem.h>
#include <linux/hrtimer_api.h>
@@ -706,14 +707,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
/*
* Since irq_time is only updated on {soft,}irq_exit, we might run into
* this case when a previous update_rq_clock() happened inside a
- * {soft,}irq region.
+ * {soft,}IRQ region.
*
* When this happens, we stop ->clock_task and only update the
* prev_irq_time stamp to account for the part that fit, so that a next
* update will consume the rest. This ensures ->clock_task is
* monotonic.
*
- * It does however cause some slight miss-attribution of {soft,}irq
+ * It does however cause some slight miss-attribution of {soft,}IRQ
* time, a more accurate solution would be to update the irq_time using
* the current rq->clock timestamp, except that would require using
* atomic ops.
@@ -825,7 +826,7 @@ static void __hrtick_start(void *arg)
/*
* Called to set the hrtick timer state.
*
- * called with rq->lock held and irqs disabled
+ * called with rq->lock held and IRQs disabled
*/
void hrtick_start(struct rq *rq, u64 delay)
{
@@ -849,7 +850,7 @@ void hrtick_start(struct rq *rq, u64 delay)
/*
* Called to set the hrtick timer state.
*
- * called with rq->lock held and irqs disabled
+ * called with rq->lock held and IRQs disabled
*/
void hrtick_start(struct rq *rq, u64 delay)
{
@@ -883,7 +884,7 @@ static inline void hrtick_rq_init(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */
/*
- * cmpxchg based fetch_or, macro so it works for different integer types
+ * try_cmpxchg based fetch_or() macro so it works for different integer types:
*/
#define fetch_or(ptr, mask) \
({ \
@@ -1080,7 +1081,7 @@ void resched_cpu(int cpu)
*
* We don't do similar optimization for completely idle system, as
* selecting an idle CPU will add more delays to the timers than intended
- * (as that CPU's timer base may not be uptodate wrt jiffies etc).
+ * (as that CPU's timer base may not be up to date wrt jiffies etc).
*/
int get_nohz_timer_target(void)
{
@@ -1140,7 +1141,7 @@ static void wake_up_idle_cpu(int cpu)
* nohz functions that would need to follow TIF_NR_POLLING
* clearing:
*
- * - On most archs, a simple fetch_or on ti::flags with a
+ * - On most architectures, a simple fetch_or on ti::flags with a
* "0" value would be enough to know if an IPI needs to be sent.
*
* - x86 needs to perform a last need_resched() check between
@@ -1323,30 +1324,27 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p, bool update_load)
+void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
- struct load_weight *load = &p->se.load;
+ struct load_weight lw;
- /*
- * SCHED_IDLE tasks get minimal weight:
- */
if (task_has_idle_policy(p)) {
- load->weight = scale_load(WEIGHT_IDLEPRIO);
- load->inv_weight = WMULT_IDLEPRIO;
- return;
+ lw.weight = scale_load(WEIGHT_IDLEPRIO);
+ lw.inv_weight = WMULT_IDLEPRIO;
+ } else {
+ lw.weight = scale_load(sched_prio_to_weight[prio]);
+ lw.inv_weight = sched_prio_to_wmult[prio];
}
/*
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
- if (update_load && p->sched_class == &fair_sched_class) {
- reweight_task(p, prio);
- } else {
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
- }
+ if (update_load && p->sched_class == &fair_sched_class)
+ reweight_task(p, &lw);
+ else
+ p->se.load = lw;
}
#ifdef CONFIG_UCLAMP_TASK
@@ -1383,7 +1381,7 @@ static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY
* This knob will not override the system default sched_util_clamp_min defined
* above.
*/
-static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
+unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];
@@ -1408,32 +1406,6 @@ static struct uclamp_se uclamp_default[UCLAMP_CNT];
*/
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
-/* Integer rounded range for each bucket */
-#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
-
-#define for_each_clamp_id(clamp_id) \
- for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
-
-static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
-{
- return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
-}
-
-static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
-{
- if (clamp_id == UCLAMP_MIN)
- return 0;
- return SCHED_CAPACITY_SCALE;
-}
-
-static inline void uclamp_se_set(struct uclamp_se *uc_se,
- unsigned int value, bool user_defined)
-{
- uc_se->value = value;
- uc_se->bucket_id = uclamp_bucket_id(value);
- uc_se->user_defined = user_defined;
-}
-
static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
@@ -1675,7 +1647,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
rq_clamp = uclamp_rq_get(rq, clamp_id);
/*
* Defensive programming: this should never happen. If it happens,
- * e.g. due to future modification, warn and fixup the expected value.
+ * e.g. due to future modification, warn and fix up the expected value.
*/
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
@@ -1897,107 +1869,6 @@ undo:
}
#endif
-static int uclamp_validate(struct task_struct *p,
- const struct sched_attr *attr)
-{
- int util_min = p->uclamp_req[UCLAMP_MIN].value;
- int util_max = p->uclamp_req[UCLAMP_MAX].value;
-
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
- util_min = attr->sched_util_min;
-
- if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
- return -EINVAL;
- }
-
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
- util_max = attr->sched_util_max;
-
- if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
- return -EINVAL;
- }
-
- if (util_min != -1 && util_max != -1 && util_min > util_max)
- return -EINVAL;
-
- /*
- * We have valid uclamp attributes; make sure uclamp is enabled.
- *
- * We need to do that here, because enabling static branches is a
- * blocking operation which obviously cannot be done while holding
- * scheduler locks.
- */
- static_branch_enable(&sched_uclamp_used);
-
- return 0;
-}
-
-static bool uclamp_reset(const struct sched_attr *attr,
- enum uclamp_id clamp_id,
- struct uclamp_se *uc_se)
-{
- /* Reset on sched class change for a non user-defined clamp value. */
- if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
- !uc_se->user_defined)
- return true;
-
- /* Reset on sched_util_{min,max} == -1. */
- if (clamp_id == UCLAMP_MIN &&
- attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
- attr->sched_util_min == -1) {
- return true;
- }
-
- if (clamp_id == UCLAMP_MAX &&
- attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
- attr->sched_util_max == -1) {
- return true;
- }
-
- return false;
-}
-
-static void __setscheduler_uclamp(struct task_struct *p,
- const struct sched_attr *attr)
-{
- enum uclamp_id clamp_id;
-
- for_each_clamp_id(clamp_id) {
- struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
- unsigned int value;
-
- if (!uclamp_reset(attr, clamp_id, uc_se))
- continue;
-
- /*
- * RT by default have a 100% boost value that could be modified
- * at runtime.
- */
- if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- value = sysctl_sched_uclamp_util_min_rt_default;
- else
- value = uclamp_none(clamp_id);
-
- uclamp_se_set(uc_se, value, false);
-
- }
-
- if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
- return;
-
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
- attr->sched_util_min != -1) {
- uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
- attr->sched_util_min, true);
- }
-
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
- attr->sched_util_max != -1) {
- uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
- attr->sched_util_max, true);
- }
-}
-
static void uclamp_fork(struct task_struct *p)
{
enum uclamp_id clamp_id;
@@ -2065,13 +1936,6 @@ static void __init init_uclamp(void)
#else /* !CONFIG_UCLAMP_TASK */
static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
-static inline int uclamp_validate(struct task_struct *p,
- const struct sched_attr *attr)
-{
- return -EOPNOTSUPP;
-}
-static void __setscheduler_uclamp(struct task_struct *p,
- const struct sched_attr *attr) { }
static inline void uclamp_fork(struct task_struct *p) { }
static inline void uclamp_post_fork(struct task_struct *p) { }
static inline void init_uclamp(void) { }
@@ -2101,7 +1965,7 @@ unsigned long get_wchan(struct task_struct *p)
return ip;
}
-static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (!(flags & ENQUEUE_NOCLOCK))
update_rq_clock(rq);
@@ -2118,7 +1982,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
sched_core_enqueue(rq, p);
}
-static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
if (sched_core_enabled(rq))
sched_core_dequeue(rq, p, flags);
@@ -2156,52 +2020,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-static inline int __normal_prio(int policy, int rt_prio, int nice)
-{
- int prio;
-
- if (dl_policy(policy))
- prio = MAX_DL_PRIO - 1;
- else if (rt_policy(policy))
- prio = MAX_RT_PRIO - 1 - rt_prio;
- else
- prio = NICE_TO_PRIO(nice);
-
- return prio;
-}
-
-/*
- * Calculate the expected normal priority: i.e. priority
- * without taking RT-inheritance into account. Might be
- * boosted by interactivity modifiers. Changes upon fork,
- * setprio syscalls, and whenever the interactivity
- * estimator recalculates.
- */
-static inline int normal_prio(struct task_struct *p)
-{
- return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
-}
-
-/*
- * Calculate the current priority, i.e. the priority
- * taken into account by the scheduler. This value might
- * be boosted by RT tasks, or might be boosted by
- * interactivity modifiers. Will be RT if the task got
- * RT-boosted. If not then it returns p->normal_prio.
- */
-static int effective_prio(struct task_struct *p)
-{
- p->normal_prio = normal_prio(p);
- /*
- * If we are RT tasks or we were boosted to RT priority,
- * keep the priority unchanged. Otherwise, update priority
- * to the normal priority:
- */
- if (!rt_prio(p->prio))
- return p->normal_prio;
- return p->prio;
-}
-
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
@@ -2220,9 +2038,9 @@ inline int task_curr(const struct task_struct *p)
* this means any call to check_class_changed() must be followed by a call to
* balance_callback().
*/
-static inline void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
+void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
@@ -2391,9 +2209,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- struct affinity_context *ctx);
-
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
struct affinity_context ac = {
@@ -2408,7 +2223,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
return;
/*
- * Violates locking rules! see comment in __do_set_cpus_allowed().
+ * Violates locking rules! See comment in __do_set_cpus_allowed().
*/
__do_set_cpus_allowed(p, &ac);
}
@@ -2575,7 +2390,7 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
}
/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * migration_cpu_stop - this will be executed by a high-prio stopper thread
* and performs thread migration by bumping thread off CPU then
* 'pushing' onto another runqueue.
*/
@@ -2820,16 +2635,6 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
}
-static cpumask_t *alloc_user_cpus_ptr(int node)
-{
- /*
- * See do_set_cpus_allowed() above for the rcu_head usage.
- */
- int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
-
- return kmalloc_node(size, GFP_KERNEL, node);
-}
-
int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
int node)
{
@@ -3198,8 +3003,7 @@ out:
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- struct affinity_context *ctx)
+int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx)
{
struct rq_flags rf;
struct rq *rq;
@@ -3318,9 +3122,6 @@ out_free_mask:
free_cpumask_var(new_mask);
}
-static int
-__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-
/*
* Restore the affinity of a task @p which was previously restricted by a
* call to force_compatible_cpus_allowed_ptr().
@@ -3700,12 +3501,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
#else /* CONFIG_SMP */
-static inline int __set_cpus_allowed_ptr(struct task_struct *p,
- struct affinity_context *ctx)
-{
- return set_cpus_allowed_ptr(p, ctx->new_mask);
-}
-
static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
static inline bool rq_has_pinned_tasks(struct rq *rq)
@@ -3713,11 +3508,6 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
return false;
}
-static inline cpumask_t *alloc_user_cpus_ptr(int node)
-{
- return NULL;
-}
-
#endif /* !CONFIG_SMP */
static void
@@ -3900,8 +3690,8 @@ void sched_ttwu_pending(void *arg)
* it is possible for select_idle_siblings() to stack a number
* of tasks on this CPU during that window.
*
- * It is ok to clear ttwu_pending when another task pending.
- * We will receive IPI after local irq enabled and then enqueue it.
+ * It is OK to clear ttwu_pending when another task pending.
+ * We will receive IPI after local IRQ enabled and then enqueue it.
* Since now nr_running > 0, idle_cpu() will always get correct result.
*/
WRITE_ONCE(rq->ttwu_pending, 0);
@@ -5094,7 +4884,7 @@ __splice_balance_callbacks(struct rq *rq, bool split)
return head;
}
-static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
+struct balance_callback *splice_balance_callbacks(struct rq *rq)
{
return __splice_balance_callbacks(rq, true);
}
@@ -5104,7 +4894,7 @@ static void __balance_callbacks(struct rq *rq)
do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
}
-static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
+void balance_callbacks(struct rq *rq, struct balance_callback *head)
{
unsigned long flags;
@@ -5121,15 +4911,6 @@ static inline void __balance_callbacks(struct rq *rq)
{
}
-static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
-{
- return NULL;
-}
-
-static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
-{
-}
-
#endif
static inline void
@@ -5232,7 +5013,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
*
* The context switch have flipped the stack from under us and restored the
* local variables which were saved when this task called schedule() in the
- * past. prev == current is still correct but we need to recalculate this_rq
+ * past. 'prev == current' is still correct but we need to recalculate this_rq
* because prev may have moved to another CPU.
*/
static struct rq *finish_task_switch(struct task_struct *prev)
@@ -5555,9 +5336,9 @@ EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
static inline void prefetch_curr_exec_start(struct task_struct *p)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
- struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+ struct sched_entity *curr = p->se.cfs_rq->curr;
#else
- struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+ struct sched_entity *curr = task_rq(p)->cfs.curr;
#endif
prefetch(curr);
prefetch(&curr->exec_start);
@@ -5578,7 +5359,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
/*
* 64-bit doesn't need locks to atomically read a 64-bit value.
* So we have a optimization chance when the task's delta_exec is 0.
- * Reading ->on_cpu is racy, but this is ok.
+ * Reading ->on_cpu is racy, but this is OK.
*
* If we race with it leaving CPU, we'll take a lock. So we're correct.
* If we race with it entering CPU, unaccounted time is 0. This is
@@ -6856,7 +6637,7 @@ void __sched schedule_idle(void)
{
/*
* As this skips calling sched_submit_work(), which the idle task does
- * regardless because that function is a nop when the task is in a
+ * regardless because that function is a NOP when the task is in a
* TASK_RUNNING state, make sure this isn't used someplace that the
* current task can be in any other state. Note, idle is always in the
* TASK_RUNNING state.
@@ -7051,9 +6832,9 @@ EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
/*
* This is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
+ * off of IRQ context.
+ * Note, that this is called and return with IRQs disabled. This will
+ * protect us against recursive calling from IRQ contexts.
*/
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
@@ -7083,7 +6864,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
}
EXPORT_SYMBOL(default_wake_function);
-static void __setscheduler_prio(struct task_struct *p, int prio)
+void __setscheduler_prio(struct task_struct *p, int prio)
{
if (dl_prio(prio))
p->sched_class = &dl_sched_class;
@@ -7123,21 +6904,6 @@ void rt_mutex_post_schedule(void)
lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
}
-static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
-{
- if (pi_task)
- prio = min(prio, pi_task->prio);
-
- return prio;
-}
-
-static inline int rt_effective_prio(struct task_struct *p, int prio)
-{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
-
- return __rt_effective_prio(pi_task, prio);
-}
-
/*
* rt_mutex_setprio - set the current priority of a task
* @p: task to boost
@@ -7187,7 +6953,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
goto out_unlock;
/*
- * Idle task boosting is a nono in general. There is one
+ * Idle task boosting is a no-no in general. There is one
* exception, when PREEMPT_RT and NOHZ is active:
*
* The idle task calls get_next_timer_interrupt() and holds
@@ -7266,1325 +7032,8 @@ out_unlock:
preempt_enable();
}
-#else
-static inline int rt_effective_prio(struct task_struct *p, int prio)
-{
- return prio;
-}
-#endif
-
-void set_user_nice(struct task_struct *p, long nice)
-{
- bool queued, running;
- struct rq *rq;
- int old_prio;
-
- if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
- return;
- /*
- * We have to be careful, if called from sys_setpriority(),
- * the task might be in the middle of scheduling on another CPU.
- */
- CLASS(task_rq_lock, rq_guard)(p);
- rq = rq_guard.rq;
-
- update_rq_clock(rq);
-
- /*
- * The RT priorities are set via sched_setscheduler(), but we still
- * allow the 'normal' nice value to be set - but as expected
- * it won't have any effect on scheduling until the task is
- * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
- */
- if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
- p->static_prio = NICE_TO_PRIO(nice);
- return;
- }
-
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
- if (running)
- put_prev_task(rq, p);
-
- p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p, true);
- old_prio = p->prio;
- p->prio = effective_prio(p);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
-
- /*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
- */
- p->sched_class->prio_changed(rq, p, old_prio);
-}
-EXPORT_SYMBOL(set_user_nice);
-
-/*
- * is_nice_reduction - check if nice value is an actual reduction
- *
- * Similar to can_nice() but does not perform a capability check.
- *
- * @p: task
- * @nice: nice value
- */
-static bool is_nice_reduction(const struct task_struct *p, const int nice)
-{
- /* Convert nice value [19,-20] to rlimit style value [1,40]: */
- int nice_rlim = nice_to_rlimit(nice);
-
- return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
-}
-
-/*
- * can_nice - check if a task can reduce its nice value
- * @p: task
- * @nice: nice value
- */
-int can_nice(const struct task_struct *p, const int nice)
-{
- return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
-}
-
-#ifdef __ARCH_WANT_SYS_NICE
-
-/*
- * sys_nice - change the priority of the current process.
- * @increment: priority increment
- *
- * sys_setpriority is a more generic, but much slower function that
- * does similar things.
- */
-SYSCALL_DEFINE1(nice, int, increment)
-{
- long nice, retval;
-
- /*
- * Setpriority might change our priority at the same moment.
- * We don't have to worry. Conceptually one call occurs first
- * and we have a single winner.
- */
- increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
- nice = task_nice(current) + increment;
-
- nice = clamp_val(nice, MIN_NICE, MAX_NICE);
- if (increment < 0 && !can_nice(current, nice))
- return -EPERM;
-
- retval = security_task_setnice(current, nice);
- if (retval)
- return retval;
-
- set_user_nice(current, nice);
- return 0;
-}
-
-#endif
-
-/**
- * task_prio - return the priority value of a given task.
- * @p: the task in question.
- *
- * Return: The priority value as seen by users in /proc.
- *
- * sched policy return value kernel prio user prio/nice
- *
- * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
- * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
- * deadline -101 -1 0
- */
-int task_prio(const struct task_struct *p)
-{
- return p->prio - MAX_RT_PRIO;
-}
-
-/**
- * idle_cpu - is a given CPU idle currently?
- * @cpu: the processor in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- if (rq->curr != rq->idle)
- return 0;
-
- if (rq->nr_running)
- return 0;
-
-#ifdef CONFIG_SMP
- if (rq->ttwu_pending)
- return 0;
-#endif
-
- return 1;
-}
-
-/**
- * available_idle_cpu - is a given CPU idle for enqueuing work.
- * @cpu: the CPU in question.
- *
- * Return: 1 if the CPU is currently idle. 0 otherwise.
- */
-int available_idle_cpu(int cpu)
-{
- if (!idle_cpu(cpu))
- return 0;
-
- if (vcpu_is_preempted(cpu))
- return 0;
-
- return 1;
-}
-
-/**
- * idle_task - return the idle task for a given CPU.
- * @cpu: the processor in question.
- *
- * Return: The idle task for the CPU @cpu.
- */
-struct task_struct *idle_task(int cpu)
-{
- return cpu_rq(cpu)->idle;
-}
-
-#ifdef CONFIG_SCHED_CORE
-int sched_core_idle_cpu(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
-
- if (sched_core_enabled(rq) && rq->curr == rq->idle)
- return 1;
-
- return idle_cpu(cpu);
-}
-
-#endif
-
-#ifdef CONFIG_SMP
-/*
- * This function computes an effective utilization for the given CPU, to be
- * used for frequency selection given the linear relation: f = u * f_max.
- *
- * The scheduler tracks the following metrics:
- *
- * cpu_util_{cfs,rt,dl,irq}()
- * cpu_bw_dl()
- *
- * Where the cfs,rt and dl util numbers are tracked with the same metric and
- * synchronized windows and are thus directly comparable.
- *
- * The cfs,rt,dl utilization are the running times measured with rq->clock_task
- * which excludes things like IRQ and steal-time. These latter are then accrued
- * in the irq utilization.
- *
- * The DL bandwidth number otoh is not a measured metric but a value computed
- * based on the task model parameters and gives the minimal utilization
- * required to meet deadlines.
- */
-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- unsigned long *min,
- unsigned long *max)
-{
- unsigned long util, irq, scale;
- struct rq *rq = cpu_rq(cpu);
-
- scale = arch_scale_cpu_capacity(cpu);
-
- /*
- * Early check to see if IRQ/steal time saturates the CPU, can be
- * because of inaccuracies in how we track these -- see
- * update_irq_load_avg().
- */
- irq = cpu_util_irq(rq);
- if (unlikely(irq >= scale)) {
- if (min)
- *min = scale;
- if (max)
- *max = scale;
- return scale;
- }
-
- if (min) {
- /*
- * The minimum utilization returns the highest level between:
- * - the computed DL bandwidth needed with the IRQ pressure which
- * steals time to the deadline task.
- * - The minimum performance requirement for CFS and/or RT.
- */
- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
-
- /*
- * When an RT task is runnable and uclamp is not used, we must
- * ensure that the task will run at maximum compute capacity.
- */
- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
- *min = max(*min, scale);
- }
-
- /*
- * Because the time spend on RT/DL tasks is visible as 'lost' time to
- * CFS tasks and we use the same metric to track the effective
- * utilization (PELT windows are synchronized) we can directly add them
- * to obtain the CPU's actual utilization.
- */
- util = util_cfs + cpu_util_rt(rq);
- util += cpu_util_dl(rq);
-
- /*
- * The maximum hint is a soft bandwidth requirement, which can be lower
- * than the actual utilization because of uclamp_max requirements.
- */
- if (max)
- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
-
- if (util >= scale)
- return scale;
-
- /*
- * There is still idle time; further improve the number by using the
- * irq metric. Because IRQ/steal time is hidden from the task clock we
- * need to scale the task numbers:
- *
- * max - irq
- * U' = irq + --------- * U
- * max
- */
- util = scale_irq_capacity(util, irq, scale);
- util += irq;
-
- return min(scale, util);
-}
-
-unsigned long sched_cpu_util(int cpu)
-{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
-}
-#endif /* CONFIG_SMP */
-
-/**
- * find_process_by_pid - find a process with a matching PID value.
- * @pid: the pid in question.
- *
- * The task of @pid, if found. %NULL otherwise.
- */
-static struct task_struct *find_process_by_pid(pid_t pid)
-{
- return pid ? find_task_by_vpid(pid) : current;
-}
-
-static struct task_struct *find_get_task(pid_t pid)
-{
- struct task_struct *p;
- guard(rcu)();
-
- p = find_process_by_pid(pid);
- if (likely(p))
- get_task_struct(p);
-
- return p;
-}
-
-DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
- find_get_task(pid), pid_t pid)
-
-/*
- * sched_setparam() passes in -1 for its policy, to let the functions
- * it calls know not to change it.
- */
-#define SETPARAM_POLICY -1
-
-static void __setscheduler_params(struct task_struct *p,
- const struct sched_attr *attr)
-{
- int policy = attr->sched_policy;
-
- if (policy == SETPARAM_POLICY)
- policy = p->policy;
-
- p->policy = policy;
-
- if (dl_policy(policy))
- __setparam_dl(p, attr);
- else if (fair_policy(policy))
- p->static_prio = NICE_TO_PRIO(attr->sched_nice);
-
- /*
- * __sched_setscheduler() ensures attr->sched_priority == 0 when
- * !rt_policy. Always setting this ensures that things like
- * getparam()/getattr() don't report silly values for !rt tasks.
- */
- p->rt_priority = attr->sched_priority;
- p->normal_prio = normal_prio(p);
- set_load_weight(p, true);
-}
-
-/*
- * Check the target process has a UID that matches the current process's:
- */
-static bool check_same_owner(struct task_struct *p)
-{
- const struct cred *cred = current_cred(), *pcred;
- guard(rcu)();
-
- pcred = __task_cred(p);
- return (uid_eq(cred->euid, pcred->euid) ||
- uid_eq(cred->euid, pcred->uid));
-}
-
-/*
- * Allow unprivileged RT tasks to decrease priority.
- * Only issue a capable test if needed and only once to avoid an audit
- * event on permitted non-privileged operations:
- */
-static int user_check_sched_setscheduler(struct task_struct *p,
- const struct sched_attr *attr,
- int policy, int reset_on_fork)
-{
- if (fair_policy(policy)) {
- if (attr->sched_nice < task_nice(p) &&
- !is_nice_reduction(p, attr->sched_nice))
- goto req_priv;
- }
-
- if (rt_policy(policy)) {
- unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
-
- /* Can't set/change the rt policy: */
- if (policy != p->policy && !rlim_rtprio)
- goto req_priv;
-
- /* Can't increase priority: */
- if (attr->sched_priority > p->rt_priority &&
- attr->sched_priority > rlim_rtprio)
- goto req_priv;
- }
-
- /*
- * Can't set/change SCHED_DEADLINE policy at all for now
- * (safest behavior); in the future we would like to allow
- * unprivileged DL tasks to increase their relative deadline
- * or reduce their runtime (both ways reducing utilization)
- */
- if (dl_policy(policy))
- goto req_priv;
-
- /*
- * Treat SCHED_IDLE as nice 20. Only allow a switch to
- * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
- */
- if (task_has_idle_policy(p) && !idle_policy(policy)) {
- if (!is_nice_reduction(p, task_nice(p)))
- goto req_priv;
- }
-
- /* Can't change other user's priorities: */
- if (!check_same_owner(p))
- goto req_priv;
-
- /* Normal users shall not reset the sched_reset_on_fork flag: */
- if (p->sched_reset_on_fork && !reset_on_fork)
- goto req_priv;
-
- return 0;
-
-req_priv:
- if (!capable(CAP_SYS_NICE))
- return -EPERM;
-
- return 0;
-}
-
-static int __sched_setscheduler(struct task_struct *p,
- const struct sched_attr *attr,
- bool user, bool pi)
-{
- int oldpolicy = -1, policy = attr->sched_policy;
- int retval, oldprio, newprio, queued, running;
- const struct sched_class *prev_class;
- struct balance_callback *head;
- struct rq_flags rf;
- int reset_on_fork;
- int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct rq *rq;
- bool cpuset_locked = false;
-
- /* The pi code expects interrupts enabled */
- BUG_ON(pi && in_interrupt());
-recheck:
- /* Double check policy once rq lock held: */
- if (policy < 0) {
- reset_on_fork = p->sched_reset_on_fork;
- policy = oldpolicy = p->policy;
- } else {
- reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
-
- if (!valid_policy(policy))
- return -EINVAL;
- }
-
- if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
- return -EINVAL;
-
- /*
- * Valid priorities for SCHED_FIFO and SCHED_RR are
- * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
- * SCHED_BATCH and SCHED_IDLE is 0.
- */
- if (attr->sched_priority > MAX_RT_PRIO-1)
- return -EINVAL;
- if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
- (rt_policy(policy) != (attr->sched_priority != 0)))
- return -EINVAL;
-
- if (user) {
- retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
- if (retval)
- return retval;
-
- if (attr->sched_flags & SCHED_FLAG_SUGOV)
- return -EINVAL;
-
- retval = security_task_setscheduler(p);
- if (retval)
- return retval;
- }
-
- /* Update task specific "requested" clamps */
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
- retval = uclamp_validate(p, attr);
- if (retval)
- return retval;
- }
-
- /*
- * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
- * information.
- */
- if (dl_policy(policy) || dl_policy(p->policy)) {
- cpuset_locked = true;
- cpuset_lock();
- }
-
- /*
- * Make sure no PI-waiters arrive (or leave) while we are
- * changing the priority of the task:
- *
- * To be able to change p->policy safely, the appropriate
- * runqueue lock must be held.
- */
- rq = task_rq_lock(p, &rf);
- update_rq_clock(rq);
-
- /*
- * Changing the policy of the stop threads its a very bad idea:
- */
- if (p == rq->stop) {
- retval = -EINVAL;
- goto unlock;
- }
-
- /*
- * If not changing anything there's no need to proceed further,
- * but store a possible modification of reset_on_fork.
- */
- if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != task_nice(p))
- goto change;
- if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
- goto change;
- if (dl_policy(policy) && dl_param_changed(p, attr))
- goto change;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
- goto change;
-
- p->sched_reset_on_fork = reset_on_fork;
- retval = 0;
- goto unlock;
- }
-change:
-
- if (user) {
-#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Do not allow realtime tasks into groups that have no runtime
- * assigned.
- */
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0 &&
- !task_group_is_autogroup(task_group(p))) {
- retval = -EPERM;
- goto unlock;
- }
-#endif
-#ifdef CONFIG_SMP
- if (dl_bandwidth_enabled() && dl_policy(policy) &&
- !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
- cpumask_t *span = rq->rd->span;
-
- /*
- * Don't allow tasks with an affinity mask smaller than
- * the entire root_domain to become SCHED_DEADLINE. We
- * will also fail if there's no bandwidth available.
- */
- if (!cpumask_subset(span, p->cpus_ptr) ||
- rq->rd->dl_bw.bw == 0) {
- retval = -EPERM;
- goto unlock;
- }
- }
-#endif
- }
-
- /* Re-check policy now with rq lock held: */
- if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
- policy = oldpolicy = -1;
- task_rq_unlock(rq, p, &rf);
- if (cpuset_locked)
- cpuset_unlock();
- goto recheck;
- }
-
- /*
- * If setscheduling to SCHED_DEADLINE (or changing the parameters
- * of a SCHED_DEADLINE task) we need to check if enough bandwidth
- * is available.
- */
- if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
- retval = -EBUSY;
- goto unlock;
- }
-
- p->sched_reset_on_fork = reset_on_fork;
- oldprio = p->prio;
-
- newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
- if (pi) {
- /*
- * Take priority boosted tasks into account. If the new
- * effective priority is unchanged, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboost
- * itself.
- */
- newprio = rt_effective_prio(p, newprio);
- if (newprio == oldprio)
- queue_flags &= ~DEQUEUE_MOVE;
- }
-
- queued = task_on_rq_queued(p);
- running = task_current(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flags);
- if (running)
- put_prev_task(rq, p);
-
- prev_class = p->sched_class;
-
- if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
- __setscheduler_params(p, attr);
- __setscheduler_prio(p, newprio);
- }
- __setscheduler_uclamp(p, attr);
-
- if (queued) {
- /*
- * We enqueue to tail when the priority of a task is
- * increased (user space view).
- */
- if (oldprio < p->prio)
- queue_flags |= ENQUEUE_HEAD;
-
- enqueue_task(rq, p, queue_flags);
- }
- if (running)
- set_next_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
-
- /* Avoid rq from going away on us: */
- preempt_disable();
- head = splice_balance_callbacks(rq);
- task_rq_unlock(rq, p, &rf);
-
- if (pi) {
- if (cpuset_locked)
- cpuset_unlock();
- rt_mutex_adjust_pi(p);
- }
-
- /* Run balance callbacks after we've adjusted the PI chain: */
- balance_callbacks(rq, head);
- preempt_enable();
-
- return 0;
-
-unlock:
- task_rq_unlock(rq, p, &rf);
- if (cpuset_locked)
- cpuset_unlock();
- return retval;
-}
-
-static int _sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param, bool check)
-{
- struct sched_attr attr = {
- .sched_policy = policy,
- .sched_priority = param->sched_priority,
- .sched_nice = PRIO_TO_NICE(p->static_prio),
- };
-
- /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
- if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
- attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- policy &= ~SCHED_RESET_ON_FORK;
- attr.sched_policy = policy;
- }
-
- return __sched_setscheduler(p, &attr, check, true);
-}
-/**
- * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Use sched_set_fifo(), read its comment.
- *
- * Return: 0 on success. An error code otherwise.
- *
- * NOTE that the task may be already dead.
- */
-int sched_setscheduler(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return _sched_setscheduler(p, policy, param, true);
-}
-
-int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
-{
- return __sched_setscheduler(p, attr, true, true);
-}
-
-int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
-{
- return __sched_setscheduler(p, attr, false, true);
-}
-EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
-
-/**
- * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
- * @p: the task in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Just like sched_setscheduler, only don't bother checking if the
- * current context has permission. For example, this is needed in
- * stop_machine(): we create temporary high priority worker threads,
- * but our caller might not have that capability.
- *
- * Return: 0 on success. An error code otherwise.
- */
-int sched_setscheduler_nocheck(struct task_struct *p, int policy,
- const struct sched_param *param)
-{
- return _sched_setscheduler(p, policy, param, false);
-}
-
-/*
- * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
- * incapable of resource management, which is the one thing an OS really should
- * be doing.
- *
- * This is of course the reason it is limited to privileged users only.
- *
- * Worse still; it is fundamentally impossible to compose static priority
- * workloads. You cannot take two correctly working static prio workloads
- * and smash them together and still expect them to work.
- *
- * For this reason 'all' FIFO tasks the kernel creates are basically at:
- *
- * MAX_RT_PRIO / 2
- *
- * The administrator _MUST_ configure the system, the kernel simply doesn't
- * know enough information to make a sensible choice.
- */
-void sched_set_fifo(struct task_struct *p)
-{
- struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
- WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
-}
-EXPORT_SYMBOL_GPL(sched_set_fifo);
-
-/*
- * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
- */
-void sched_set_fifo_low(struct task_struct *p)
-{
- struct sched_param sp = { .sched_priority = 1 };
- WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
-}
-EXPORT_SYMBOL_GPL(sched_set_fifo_low);
-
-void sched_set_normal(struct task_struct *p, int nice)
-{
- struct sched_attr attr = {
- .sched_policy = SCHED_NORMAL,
- .sched_nice = nice,
- };
- WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
-}
-EXPORT_SYMBOL_GPL(sched_set_normal);
-
-static int
-do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
-{
- struct sched_param lparam;
-
- if (!param || pid < 0)
- return -EINVAL;
- if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
- return -EFAULT;
-
- CLASS(find_get_task, p)(pid);
- if (!p)
- return -ESRCH;
-
- return sched_setscheduler(p, policy, &lparam);
-}
-
-/*
- * Mimics kernel/events/core.c perf_copy_attr().
- */
-static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
-{
- u32 size;
- int ret;
-
- /* Zero the full structure, so that a short copy will be nice: */
- memset(attr, 0, sizeof(*attr));
-
- ret = get_user(size, &uattr->size);
- if (ret)
- return ret;
-
- /* ABI compatibility quirk: */
- if (!size)
- size = SCHED_ATTR_SIZE_VER0;
- if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
- goto err_size;
-
- ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
- if (ret) {
- if (ret == -E2BIG)
- goto err_size;
- return ret;
- }
-
- if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
- size < SCHED_ATTR_SIZE_VER1)
- return -EINVAL;
-
- /*
- * XXX: Do we want to be lenient like existing syscalls; or do we want
- * to be strict and return an error on out-of-bounds values?
- */
- attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
-
- return 0;
-
-err_size:
- put_user(sizeof(*attr), &uattr->size);
- return -E2BIG;
-}
-
-static void get_params(struct task_struct *p, struct sched_attr *attr)
-{
- if (task_has_dl_policy(p))
- __getparam_dl(p, attr);
- else if (task_has_rt_policy(p))
- attr->sched_priority = p->rt_priority;
- else
- attr->sched_nice = task_nice(p);
-}
-
-/**
- * sys_sched_setscheduler - set/change the scheduler policy and RT priority
- * @pid: the pid in question.
- * @policy: new policy.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
-{
- if (policy < 0)
- return -EINVAL;
-
- return do_sched_setscheduler(pid, policy, param);
-}
-
-/**
- * sys_sched_setparam - set/change the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the new RT priority.
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
-{
- return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
-}
-
-/**
- * sys_sched_setattr - same as above, but with extended sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, flags)
-{
- struct sched_attr attr;
- int retval;
-
- if (!uattr || pid < 0 || flags)
- return -EINVAL;
-
- retval = sched_copy_attr(uattr, &attr);
- if (retval)
- return retval;
-
- if ((int)attr.sched_policy < 0)
- return -EINVAL;
- if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
- attr.sched_policy = SETPARAM_POLICY;
-
- CLASS(find_get_task, p)(pid);
- if (!p)
- return -ESRCH;
-
- if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
- get_params(p, &attr);
-
- return sched_setattr(p, &attr);
-}
-
-/**
- * sys_sched_getscheduler - get the policy (scheduling class) of a thread
- * @pid: the pid in question.
- *
- * Return: On success, the policy of the thread. Otherwise, a negative error
- * code.
- */
-SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
-{
- struct task_struct *p;
- int retval;
-
- if (pid < 0)
- return -EINVAL;
-
- guard(rcu)();
- p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (!retval) {
- retval = p->policy;
- if (p->sched_reset_on_fork)
- retval |= SCHED_RESET_ON_FORK;
- }
- return retval;
-}
-
-/**
- * sys_sched_getparam - get the RT priority of a thread
- * @pid: the pid in question.
- * @param: structure containing the RT priority.
- *
- * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
- * code.
- */
-SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
-{
- struct sched_param lp = { .sched_priority = 0 };
- struct task_struct *p;
- int retval;
-
- if (!param || pid < 0)
- return -EINVAL;
-
- scoped_guard (rcu) {
- p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (retval)
- return retval;
-
- if (task_has_rt_policy(p))
- lp.sched_priority = p->rt_priority;
- }
-
- /*
- * This one might sleep, we cannot do it with a spinlock held ...
- */
- return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
-}
-
-/*
- * Copy the kernel size attribute structure (which might be larger
- * than what user-space knows about) to user-space.
- *
- * Note that all cases are valid: user-space buffer can be larger or
- * smaller than the kernel-space buffer. The usual case is that both
- * have the same size.
- */
-static int
-sched_attr_copy_to_user(struct sched_attr __user *uattr,
- struct sched_attr *kattr,
- unsigned int usize)
-{
- unsigned int ksize = sizeof(*kattr);
-
- if (!access_ok(uattr, usize))
- return -EFAULT;
-
- /*
- * sched_getattr() ABI forwards and backwards compatibility:
- *
- * If usize == ksize then we just copy everything to user-space and all is good.
- *
- * If usize < ksize then we only copy as much as user-space has space for,
- * this keeps ABI compatibility as well. We skip the rest.
- *
- * If usize > ksize then user-space is using a newer version of the ABI,
- * which part the kernel doesn't know about. Just ignore it - tooling can
- * detect the kernel's knowledge of attributes from the attr->size value
- * which is set to ksize in this case.
- */
- kattr->size = min(usize, ksize);
-
- if (copy_to_user(uattr, kattr, kattr->size))
- return -EFAULT;
-
- return 0;
-}
-
-/**
- * sys_sched_getattr - similar to sched_getparam, but with sched_attr
- * @pid: the pid in question.
- * @uattr: structure containing the extended parameters.
- * @usize: sizeof(attr) for fwd/bwd comp.
- * @flags: for future extension.
- */
-SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, usize, unsigned int, flags)
-{
- struct sched_attr kattr = { };
- struct task_struct *p;
- int retval;
-
- if (!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags)
- return -EINVAL;
-
- scoped_guard (rcu) {
- p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (retval)
- return retval;
-
- kattr.sched_policy = p->policy;
- if (p->sched_reset_on_fork)
- kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- get_params(p, &kattr);
- kattr.sched_flags &= SCHED_FLAG_ALL;
-
-#ifdef CONFIG_UCLAMP_TASK
- /*
- * This could race with another potential updater, but this is fine
- * because it'll correctly read the old or the new value. We don't need
- * to guarantee who wins the race as long as it doesn't return garbage.
- */
- kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
- kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
-#endif
- }
-
- return sched_attr_copy_to_user(uattr, &kattr, usize);
-}
-
-#ifdef CONFIG_SMP
-int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
-{
- /*
- * If the task isn't a deadline task or admission control is
- * disabled then we don't care about affinity changes.
- */
- if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
- return 0;
-
- /*
- * Since bandwidth control happens on root_domain basis,
- * if admission test is enabled, we only admit -deadline
- * tasks allowed to run on all the CPUs in the task's
- * root_domain.
- */
- guard(rcu)();
- if (!cpumask_subset(task_rq(p)->rd->span, mask))
- return -EBUSY;
-
- return 0;
-}
#endif
-static int
-__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
-{
- int retval;
- cpumask_var_t cpus_allowed, new_mask;
-
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
- return -ENOMEM;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_free_cpus_allowed;
- }
-
- cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
-
- ctx->new_mask = new_mask;
- ctx->flags |= SCA_CHECK;
-
- retval = dl_task_check_affinity(p, new_mask);
- if (retval)
- goto out_free_new_mask;
-
- retval = __set_cpus_allowed_ptr(p, ctx);
- if (retval)
- goto out_free_new_mask;
-
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset update.
- * Just reset the cpumask to the cpuset's cpus_allowed.
- */
- cpumask_copy(new_mask, cpus_allowed);
-
- /*
- * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
- * will restore the previous user_cpus_ptr value.
- *
- * In the unlikely event a previous user_cpus_ptr exists,
- * we need to further restrict the mask to what is allowed
- * by that old user_cpus_ptr.
- */
- if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
- bool empty = !cpumask_and(new_mask, new_mask,
- ctx->user_mask);
-
- if (WARN_ON_ONCE(empty))
- cpumask_copy(new_mask, cpus_allowed);
- }
- __set_cpus_allowed_ptr(p, ctx);
- retval = -EINVAL;
- }
-
-out_free_new_mask:
- free_cpumask_var(new_mask);
-out_free_cpus_allowed:
- free_cpumask_var(cpus_allowed);
- return retval;
-}
-
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
-{
- struct affinity_context ac;
- struct cpumask *user_mask;
- int retval;
-
- CLASS(find_get_task, p)(pid);
- if (!p)
- return -ESRCH;
-
- if (p->flags & PF_NO_SETAFFINITY)
- return -EINVAL;
-
- if (!check_same_owner(p)) {
- guard(rcu)();
- if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
- return -EPERM;
- }
-
- retval = security_task_setscheduler(p);
- if (retval)
- return retval;
-
- /*
- * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
- * alloc_user_cpus_ptr() returns NULL.
- */
- user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
- if (user_mask) {
- cpumask_copy(user_mask, in_mask);
- } else if (IS_ENABLED(CONFIG_SMP)) {
- return -ENOMEM;
- }
-
- ac = (struct affinity_context){
- .new_mask = in_mask,
- .user_mask = user_mask,
- .flags = SCA_USER,
- };
-
- retval = __sched_setaffinity(p, &ac);
- kfree(ac.user_mask);
-
- return retval;
-}
-
-static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
- struct cpumask *new_mask)
-{
- if (len < cpumask_size())
- cpumask_clear(new_mask);
- else if (len > cpumask_size())
- len = cpumask_size();
-
- return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
-}
-
-/**
- * sys_sched_setaffinity - set the CPU affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to the new CPU mask
- *
- * Return: 0 on success. An error code otherwise.
- */
-SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
-{
- cpumask_var_t new_mask;
- int retval;
-
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
- return -ENOMEM;
-
- retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
- if (retval == 0)
- retval = sched_setaffinity(pid, new_mask);
- free_cpumask_var(new_mask);
- return retval;
-}
-
-long sched_getaffinity(pid_t pid, struct cpumask *mask)
-{
- struct task_struct *p;
- int retval;
-
- guard(rcu)();
- p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (retval)
- return retval;
-
- guard(raw_spinlock_irqsave)(&p->pi_lock);
- cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
-
- return 0;
-}
-
-/**
- * sys_sched_getaffinity - get the CPU affinity of a process
- * @pid: pid of the process
- * @len: length in bytes of the bitmask pointed to by user_mask_ptr
- * @user_mask_ptr: user-space pointer to hold the current CPU mask
- *
- * Return: size of CPU mask copied to user_mask_ptr on success. An
- * error code otherwise.
- */
-SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
- unsigned long __user *, user_mask_ptr)
-{
- int ret;
- cpumask_var_t mask;
-
- if ((len * BITS_PER_BYTE) < nr_cpu_ids)
- return -EINVAL;
- if (len & (sizeof(unsigned long)-1))
- return -EINVAL;
-
- if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
-
- ret = sched_getaffinity(pid, mask);
- if (ret == 0) {
- unsigned int retlen = min(len, cpumask_size());
-
- if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
- ret = -EFAULT;
- else
- ret = retlen;
- }
- free_cpumask_var(mask);
-
- return ret;
-}
-
-static void do_sched_yield(void)
-{
- struct rq_flags rf;
- struct rq *rq;
-
- rq = this_rq_lock_irq(&rf);
-
- schedstat_inc(rq->yld_count);
- current->sched_class->yield_task(rq);
-
- preempt_disable();
- rq_unlock_irq(rq, &rf);
- sched_preempt_enable_no_resched();
-
- schedule();
-}
-
-/**
- * sys_sched_yield - yield the current processor to other threads.
- *
- * This function yields the current CPU to other tasks. If there are no
- * other threads running on this CPU then this function will return.
- *
- * Return: 0.
- */
-SYSCALL_DEFINE0(sched_yield)
-{
- do_sched_yield();
- return 0;
-}
-
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
{
@@ -8907,105 +7356,11 @@ PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
PREEMPT_MODEL_ACCESSOR(full);
-#else /* !CONFIG_PREEMPT_DYNAMIC */
+#else /* !CONFIG_PREEMPT_DYNAMIC: */
static inline void preempt_dynamic_init(void) { }
-#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
-
-/**
- * yield - yield the current processor to other threads.
- *
- * Do not ever use this function, there's a 99% chance you're doing it wrong.
- *
- * The scheduler is at all times free to pick the calling task as the most
- * eligible task to run, if removing the yield() call from your code breaks
- * it, it's already broken.
- *
- * Typical broken usage is:
- *
- * while (!event)
- * yield();
- *
- * where one assumes that yield() will let 'the other' process run that will
- * make event true. If the current task is a SCHED_FIFO task that will never
- * happen. Never use yield() as a progress guarantee!!
- *
- * If you want to use yield() to wait for something, use wait_event().
- * If you want to use yield() to be 'nice' for others, use cond_resched().
- * If you still want to use yield(), do not!
- */
-void __sched yield(void)
-{
- set_current_state(TASK_RUNNING);
- do_sched_yield();
-}
-EXPORT_SYMBOL(yield);
-
-/**
- * yield_to - yield the current processor to another thread in
- * your thread group, or accelerate that thread toward the
- * processor it's on.
- * @p: target task
- * @preempt: whether task preemption is allowed or not
- *
- * It's the caller's job to ensure that the target task struct
- * can't go away on us before we can do any checks.
- *
- * Return:
- * true (>0) if we indeed boosted the target task.
- * false (0) if we failed to boost the target.
- * -ESRCH if there's no task to yield to.
- */
-int __sched yield_to(struct task_struct *p, bool preempt)
-{
- struct task_struct *curr = current;
- struct rq *rq, *p_rq;
- int yielded = 0;
-
- scoped_guard (irqsave) {
- rq = this_rq();
-
-again:
- p_rq = task_rq(p);
- /*
- * If we're the only runnable task on the rq and target rq also
- * has only one task, there's absolutely no point in yielding.
- */
- if (rq->nr_running == 1 && p_rq->nr_running == 1)
- return -ESRCH;
-
- guard(double_rq_lock)(rq, p_rq);
- if (task_rq(p) != p_rq)
- goto again;
-
- if (!curr->sched_class->yield_to_task)
- return 0;
-
- if (curr->sched_class != p->sched_class)
- return 0;
-
- if (task_on_cpu(p_rq, p) || !task_is_running(p))
- return 0;
-
- yielded = curr->sched_class->yield_to_task(rq, p);
- if (yielded) {
- schedstat_inc(rq->yld_count);
- /*
- * Make p's CPU reschedule; pick_next_entity
- * takes care of fairness.
- */
- if (preempt && rq != p_rq)
- resched_curr(p_rq);
- }
- }
-
- if (yielded)
- schedule();
-
- return yielded;
-}
-EXPORT_SYMBOL_GPL(yield_to);
+#endif /* CONFIG_PREEMPT_DYNAMIC */
int io_schedule_prepare(void)
{
@@ -9048,123 +7403,6 @@ void __sched io_schedule(void)
}
EXPORT_SYMBOL(io_schedule);
-/**
- * sys_sched_get_priority_max - return maximum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the maximum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
-{
- int ret = -EINVAL;
-
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = MAX_RT_PRIO-1;
- break;
- case SCHED_DEADLINE:
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
- ret = 0;
- break;
- }
- return ret;
-}
-
-/**
- * sys_sched_get_priority_min - return minimum RT priority.
- * @policy: scheduling class.
- *
- * Return: On success, this syscall returns the minimum
- * rt_priority that can be used by a given scheduling class.
- * On failure, a negative error code is returned.
- */
-SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
-{
- int ret = -EINVAL;
-
- switch (policy) {
- case SCHED_FIFO:
- case SCHED_RR:
- ret = 1;
- break;
- case SCHED_DEADLINE:
- case SCHED_NORMAL:
- case SCHED_BATCH:
- case SCHED_IDLE:
- ret = 0;
- }
- return ret;
-}
-
-static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
-{
- unsigned int time_slice = 0;
- int retval;
-
- if (pid < 0)
- return -EINVAL;
-
- scoped_guard (rcu) {
- struct task_struct *p = find_process_by_pid(pid);
- if (!p)
- return -ESRCH;
-
- retval = security_task_getscheduler(p);
- if (retval)
- return retval;
-
- scoped_guard (task_rq_lock, p) {
- struct rq *rq = scope.rq;
- if (p->sched_class->get_rr_interval)
- time_slice = p->sched_class->get_rr_interval(rq, p);
- }
- }
-
- jiffies_to_timespec64(time_slice, t);
- return 0;
-}
-
-/**
- * sys_sched_rr_get_interval - return the default timeslice of a process.
- * @pid: pid of the process.
- * @interval: userspace pointer to the timeslice value.
- *
- * this syscall writes the default timeslice value of a given process
- * into the user-space timespec buffer. A value of '0' means infinity.
- *
- * Return: On success, 0 and the timeslice is in @interval. Otherwise,
- * an error code.
- */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct __kernel_timespec __user *, interval)
-{
- struct timespec64 t;
- int retval = sched_rr_get_interval(pid, &t);
-
- if (retval == 0)
- retval = put_timespec64(&t, interval);
-
- return retval;
-}
-
-#ifdef CONFIG_COMPAT_32BIT_TIME
-SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
- struct old_timespec32 __user *, interval)
-{
- struct timespec64 t;
- int retval = sched_rr_get_interval(pid, &t);
-
- if (retval == 0)
- retval = put_old_timespec32(&t, interval);
- return retval;
-}
-#endif
-
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
@@ -9732,7 +7970,7 @@ int sched_cpu_deactivate(unsigned int cpu)
* Specifically, we rely on ttwu to no longer target this CPU, see
* ttwu_queue_cond() and is_cpu_allowed().
*
- * Do sync before park smpboot threads to take care the rcu boost case.
+ * Do sync before park smpboot threads to take care the RCU boost case.
*/
synchronize_rcu();
@@ -9807,7 +8045,7 @@ int sched_cpu_wait_empty(unsigned int cpu)
* Since this CPU is going 'away' for a while, fold any nr_active delta we
* might have. Called from the CPU stopper task after ensuring that the
* stopper is the last running task on the CPU, so nr_active count is
- * stable. We need to take the teardown thread which is calling this into
+ * stable. We need to take the tear-down thread which is calling this into
* account, so we hand in adjust = 1 to the load calculation.
*
* Also see the comment "Global load-average calculations".
@@ -10001,7 +8239,7 @@ void __init sched_init(void)
/*
* How much CPU bandwidth does root_task_group get?
*
- * In case of task-groups formed thr' the cgroup filesystem, it
+ * In case of task-groups formed through the cgroup filesystem, it
* gets 100% of the CPU resources in the system. This overall
* system CPU resource is divided among the tasks of
* root_task_group and its child task-groups in a fair manner,
@@ -10303,7 +8541,7 @@ void normalize_rt_tasks(void)
#if defined(CONFIG_KGDB_KDB)
/*
- * These functions are only useful for kdb.
+ * These functions are only useful for KDB.
*
* They can only be called when the whole system has been
* stopped - every CPU needs to be quiescent, and no scheduling
@@ -10411,7 +8649,7 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
online_fair_sched_group(tg);
}
-/* rcu callback to free various structures associated with a task group */
+/* RCU callback to free various structures associated with a task group */
static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
/* Now it should be safe to free those cfs_rqs: */
@@ -11529,10 +9767,10 @@ const int sched_prio_to_weight[40] = {
};
/*
- * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
+ * Inverse (2^32/x) values of the sched_prio_to_weight[] array, pre-calculated.
*
* In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
+ * pre-calculated inverse to speed up arithmetics by turning divisions
* into multiplications:
*/
const u32 sched_prio_to_wmult[40] = {
@@ -11788,16 +10026,16 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
/*
* Move the src cid if the dst cid is unset. This keeps id
* allocation closest to 0 in cases where few threads migrate around
- * many cpus.
+ * many CPUs.
*
* If destination cid is already set, we may have to just clear
* the src cid to ensure compactness in frequent migrations
* scenarios.
*
* It is not useful to clear the src cid when the number of threads is
- * greater or equal to the number of allowed cpus, because user-space
+ * greater or equal to the number of allowed CPUs, because user-space
* can expect that the number of allowed cids can reach the number of
- * allowed cpus.
+ * allowed CPUs.
*/
dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
dst_cid = READ_ONCE(dst_pcpu_cid->cid);
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index a57fd8f27498..1ef98a93eb1d 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -279,7 +279,7 @@ void __sched_core_account_forceidle(struct rq *rq)
continue;
/*
- * Note: this will account forceidle to the current cpu, even
+ * Note: this will account forceidle to the current CPU, even
* if it comes from our SMT sibling.
*/
__account_forceidle_time(p, delta);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index aa48b2ec879d..a5e00293ae43 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -14,11 +14,11 @@
* They are only modified in vtime_account, on corresponding CPU
* with interrupts disabled. So, writes are safe.
* They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
+ * This may result in other CPU reading this CPU's IRQ time and can
* race with irq/vtime_account on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of IRQ time to wrong
+ * task when IRQ is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each IRQ in account_system_time.
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
@@ -269,7 +269,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
}
/*
- * Account how much elapsed time was spent in steal, irq, or softirq time.
+ * Account how much elapsed time was spent in steal, IRQ, or softirq time.
*/
static inline u64 account_other_time(u64 max)
{
@@ -370,7 +370,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
* Check for hardirq is done both for system and user time as there is
* no timer going off while we are on hardirq and hence we may never get an
* opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
+ * p->stime and friends are only updated on system time and not on IRQ
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
@@ -380,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
/*
* When returning from idle, many ticks can get accounted at
- * once, including some ticks of steal, irq, and softirq time.
+ * once, including some ticks of steal, IRQ, and softirq time.
* Subtract those ticks from the amount of time accounted to
* idle, or potentially user or system time. Due to rounding,
* other time can exceed ticks occasionally.
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9bedd148f007..f59e5c19d944 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -708,7 +708,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
}
/*
- * And we finally need to fixup root_domain(s) bandwidth accounting,
+ * And we finally need to fix up root_domain(s) bandwidth accounting,
* since p is still hanging out in the old (now moved to default) root
* domain.
*/
@@ -992,7 +992,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
* is detected, the runtime and deadline need to be updated.
*
* If the task has an implicit deadline, i.e., deadline == period, the Original
- * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * CBS is applied. The runtime is replenished and a new absolute deadline is
* set, as in the previous cases.
*
* However, the Original CBS does not work properly for tasks with
@@ -1294,7 +1294,7 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
* Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
* by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
* Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
- * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
* Since delta is a 64 bit variable, to have an overflow its value should be
* larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
* not an issue here.
@@ -2493,7 +2493,7 @@ static void pull_dl_task(struct rq *this_rq)
src_rq = cpu_rq(cpu);
/*
- * It looks racy, abd it is! However, as in sched_rt.c,
+ * It looks racy, and it is! However, as in sched_rt.c,
* we are fine with this.
*/
if (this_rq->dl.dl_nr_running &&
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 24dda708b699..9057584ec06d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -61,7 +61,7 @@
* Options are:
*
* SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
*
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -3835,15 +3835,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
}
}
-void reweight_task(struct task_struct *p, int prio)
+void reweight_task(struct task_struct *p, const struct load_weight *lw)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct load_weight *load = &se->load;
- unsigned long weight = scale_load(sched_prio_to_weight[prio]);
- reweight_entity(cfs_rq, se, weight);
- load->inv_weight = sched_prio_to_wmult[prio];
+ reweight_entity(cfs_rq, se, lw->weight);
+ load->inv_weight = lw->inv_weight;
}
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -8719,7 +8718,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
* topology where each level pairs two lower groups (or better). This results
* in O(log n) layers. Furthermore we reduce the number of CPUs going up the
* tree to only the first of the previous level and we decrease the frequency
- * of load-balance at each level inv. proportional to the number of CPUs in
+ * of load-balance at each level inversely proportional to the number of CPUs in
* the groups.
*
* This yields:
@@ -11886,6 +11885,13 @@ static void kick_ilb(unsigned int flags)
return;
/*
+ * Don't bother if no new NOHZ balance work items for ilb_cpu,
+ * i.e. all bits in flags are already set in ilb_cpu.
+ */
+ if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
+ return;
+
+ /*
* Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
* the first flag owns it; cleared by nohz_csd_func().
*/
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 6135fbe83d68..6e78d071beb5 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -172,19 +172,13 @@ static void cpuidle_idle_call(void)
/*
* Check if the idle task must be rescheduled. If it is the
- * case, exit the function after re-enabling the local irq.
+ * case, exit the function after re-enabling the local IRQ.
*/
if (need_resched()) {
local_irq_enable();
return;
}
- /*
- * The RCU framework needs to be told that we are entering an idle
- * section, so no more rcu read side critical sections and one more
- * step to the grace period
- */
-
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
@@ -244,7 +238,7 @@ exit_idle:
__current_set_polling();
/*
- * It is up to the idle functions to reenable local interrupts
+ * It is up to the idle functions to re-enable local interrupts
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
@@ -320,7 +314,7 @@ static void do_idle(void)
rcu_nocb_flush_deferred_wakeup();
/*
- * In poll mode we reenable interrupts and spin. Also if we
+ * In poll mode we re-enable interrupts and spin. Also if we
* detected in the wakeup from idle path that the tick
* broadcast device expired for us, we don't want to go deep
* idle as we know that the IPI is going to arrive right away.
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index ca9da66cc894..c48900b856a2 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -45,7 +45,7 @@
* again, being late doesn't loose the delta, just wrecks the sample.
*
* - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
- * this would add another cross-CPU cacheline miss and atomic operation
+ * this would add another cross-CPU cache-line miss and atomic operation
* to the wakeup path. Instead we increment on whatever CPU the task ran
* when it went into uninterruptible state and decrement on whatever CPU
* did the wakeup. This means that only the sum of nr_uninterruptible over
@@ -62,7 +62,7 @@ EXPORT_SYMBOL(avenrun); /* should be removed */
/**
* get_avenrun - get the load average array
- * @loads: pointer to dest load array
+ * @loads: pointer to destination load array
* @offset: offset to add
* @shift: shift count to shift the result left
*
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index ef00382de595..fa52906a4478 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -417,7 +417,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
- * irq:
+ * IRQ:
*
* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
* util_sum = cpu_scale * load_sum
@@ -432,7 +432,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
int ret = 0;
/*
- * We can't use clock_pelt because irq time is not accounted in
+ * We can't use clock_pelt because IRQ time is not accounted in
* clock_task. Instead we directly scale the running time to
* reflect the real amount of computation
*/
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 507d7b8d79af..020d58967d4e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -41,7 +41,7 @@
* What it means for a task to be productive is defined differently
* for each resource. For IO, productive means a running task. For
* memory, productive means a running task that isn't a reclaimer. For
- * CPU, productive means an oncpu task.
+ * CPU, productive means an on-CPU task.
*
* Naturally, the FULL state doesn't exist for the CPU resource at the
* system level, but exist at the cgroup level. At the cgroup level,
@@ -49,7 +49,7 @@
* resource which is being used by others outside of the cgroup or
* throttled by the cgroup cpu.max configuration.
*
- * The percentage of wallclock time spent in those compound stall
+ * The percentage of wall clock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
* where the SOME percentage indicates workload slowdowns and the FULL
* percentage indicates reduced CPU utilization:
@@ -218,28 +218,32 @@ void __init psi_init(void)
group_init(&psi_system);
}
-static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
+static u32 test_states(unsigned int *tasks, u32 state_mask)
{
- switch (state) {
- case PSI_IO_SOME:
- return unlikely(tasks[NR_IOWAIT]);
- case PSI_IO_FULL:
- return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
- case PSI_MEM_SOME:
- return unlikely(tasks[NR_MEMSTALL]);
- case PSI_MEM_FULL:
- return unlikely(tasks[NR_MEMSTALL] &&
- tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
- case PSI_CPU_SOME:
- return unlikely(tasks[NR_RUNNING] > oncpu);
- case PSI_CPU_FULL:
- return unlikely(tasks[NR_RUNNING] && !oncpu);
- case PSI_NONIDLE:
- return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
- tasks[NR_RUNNING];
- default:
- return false;
+ const bool oncpu = state_mask & PSI_ONCPU;
+
+ if (tasks[NR_IOWAIT]) {
+ state_mask |= BIT(PSI_IO_SOME);
+ if (!tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_IO_FULL);
+ }
+
+ if (tasks[NR_MEMSTALL]) {
+ state_mask |= BIT(PSI_MEM_SOME);
+ if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING])
+ state_mask |= BIT(PSI_MEM_FULL);
}
+
+ if (tasks[NR_RUNNING] > oncpu)
+ state_mask |= BIT(PSI_CPU_SOME);
+
+ if (tasks[NR_RUNNING] && !oncpu)
+ state_mask |= BIT(PSI_CPU_FULL);
+
+ if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING])
+ state_mask |= BIT(PSI_NONIDLE);
+
+ return state_mask;
}
static void get_recent_times(struct psi_group *group, int cpu,
@@ -345,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group,
/*
* Collect the per-cpu time buckets and average them into a
- * single time sample that is normalized to wallclock time.
+ * single time sample that is normalized to wall clock time.
*
* For averaging, each CPU is weighted by its non-idle time in
* the sampling period. This eliminates artifacts from uneven
@@ -770,7 +774,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
{
struct psi_group_cpu *groupc;
unsigned int t, m;
- enum psi_states s;
u32 state_mask;
lockdep_assert_rq_held(cpu_rq(cpu));
@@ -842,10 +845,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
return;
}
- for (s = 0; s < NR_PSI_STATES; s++) {
- if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
- state_mask |= (1 << s);
- }
+ state_mask = test_states(groupc->tasks, state_mask);
/*
* Since we care about lost potential, a memstall is FULL
@@ -1205,7 +1205,7 @@ void psi_cgroup_restart(struct psi_group *group)
/*
* After we disable psi_group->enabled, we don't actually
* stop percpu tasks accounting in each psi_group_cpu,
- * instead only stop test_state() loop, record_times()
+ * instead only stop test_states() loop, record_times()
* and averaging worker, see psi_group_change() for details.
*
* When disable cgroup PSI, this function has nothing to sync
@@ -1213,7 +1213,7 @@ void psi_cgroup_restart(struct psi_group *group)
* would see !psi_group->enabled and only do task accounting.
*
* When re-enable cgroup PSI, this function use psi_group_change()
- * to get correct state mask from test_state() loop on tasks[],
+ * to get correct state mask from test_states() loop on tasks[],
* and restart groupc->state_start from now, use .clear = .set = 0
* here since no task status really changed.
*/
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aa4c1c874fa4..63e49c8ffc4d 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -140,7 +140,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
INIT_LIST_HEAD(array->queue + i);
__clear_bit(i, array->bitmap);
}
- /* delimiter for bitsearch: */
+ /* delimiter for bit-search: */
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP
@@ -1135,7 +1135,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
/*
* This may have been our highest task, and therefore
- * we may have some recomputation to do
+ * we may have some re-computation to do
*/
if (prio == prev_prio) {
struct rt_prio_array *array = &rt_rq->active;
@@ -1571,7 +1571,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
*
* For equal prio tasks, we just let the scheduler sort it out.
*
- * Otherwise, just let it ride on the affined RQ and the
+ * Otherwise, just let it ride on the affine RQ and the
* post-schedule router will push the preempted task away
*
* This test is optimistic, if we get it wrong the load-balancer
@@ -2147,14 +2147,14 @@ static void push_rt_tasks(struct rq *rq)
* if its the only CPU with multiple RT tasks queued, and a large number
* of CPUs scheduling a lower priority task at the same time.
*
- * Each root domain has its own irq work function that can iterate over
+ * Each root domain has its own IRQ work function that can iterate over
* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
* task must be checked if there's one or many CPUs that are lowering
- * their priority, there's a single irq work iterator that will try to
+ * their priority, there's a single IRQ work iterator that will try to
* push off RT tasks that are waiting to run.
*
* When a CPU schedules a lower priority task, it will kick off the
- * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
* As it only takes the first CPU that schedules a lower priority task
* to start the process, the rto_start variable is incremented and if
* the atomic result is one, then that CPU will try to take the rto_lock.
@@ -2162,7 +2162,7 @@ static void push_rt_tasks(struct rq *rq)
* CPUs scheduling lower priority tasks.
*
* All CPUs that are scheduling a lower priority task will increment the
- * rt_loop_next variable. This will make sure that the irq work iterator
+ * rt_loop_next variable. This will make sure that the IRQ work iterator
* checks all RT overloaded CPUs whenever a CPU schedules a new lower
* priority task, even if the iterator is in the middle of a scan. Incrementing
* the rt_loop_next will cause the iterator to perform another scan.
@@ -2242,7 +2242,7 @@ static void tell_cpu_to_push(struct rq *rq)
* The rto_cpu is updated under the lock, if it has a valid CPU
* then the IPI is still running and will continue due to the
* update to loop_next, and nothing needs to be done here.
- * Otherwise it is finishing up and an ipi needs to be sent.
+ * Otherwise it is finishing up and an IPI needs to be sent.
*/
if (rq->rd->rto_cpu < 0)
cpu = rto_next_cpu(rq->rd);
@@ -2594,7 +2594,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
watchdog(rq, p);
/*
- * RR tasks need a special form of timeslice management.
+ * RR tasks need a special form of time-slice management.
* FIFO tasks have no timeslices.
*/
if (p->policy != SCHED_RR)
@@ -2900,7 +2900,7 @@ static int sched_rt_global_constraints(void)
int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
- /* Don't accept realtime tasks when there is no way for them to run */
+ /* Don't accept real-time tasks when there is no way for them to run */
if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
return 0;
@@ -3001,7 +3001,7 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
ret = proc_dointvec(table, write, buffer, lenp, ppos);
/*
* Make sure that internally we keep jiffies.
- * Also, writing zero resets the timeslice to default:
+ * Also, writing zero resets the time-slice to default:
*/
if (!ret && write) {
sched_rr_timeslice =
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ef20c61004eb..4c36cc680361 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -74,6 +74,12 @@
#include "../workqueue_internal.h"
+struct rq;
+struct cfs_rq;
+struct rt_rq;
+struct sched_group;
+struct cpuidle_state;
+
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
# include <asm/paravirt_api_clock.h>
@@ -90,9 +96,6 @@
# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
#endif
-struct rq;
-struct cpuidle_state;
-
/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED 1
#define TASK_ON_RQ_MIGRATING 2
@@ -128,12 +131,12 @@ extern struct list_head asym_cap_list;
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
-#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+#define NS_TO_JIFFIES(time) ((unsigned long)(time) / (NSEC_PER_SEC/HZ))
/*
* Increase resolution of nice-level calculations for 64-bit architectures.
* The extra resolution improves shares distribution and load balancing of
- * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper task-group
* hierarchies, especially on larger systems. This is not a user-visible change
* and does not change the user-interface for setting shares/weights.
*
@@ -147,12 +150,13 @@ extern struct list_head asym_cap_list;
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
-# define scale_load_down(w) \
-({ \
- unsigned long __w = (w); \
- if (__w) \
- __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
- __w; \
+# define scale_load_down(w) \
+({ \
+ unsigned long __w = (w); \
+ \
+ if (__w) \
+ __w = max(2UL, __w >> SCHED_FIXEDPOINT_SHIFT); \
+ __w; \
})
#else
# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
@@ -187,6 +191,7 @@ static inline int idle_policy(int policy)
{
return policy == SCHED_IDLE;
}
+
static inline int fair_policy(int policy)
{
return policy == SCHED_NORMAL || policy == SCHED_BATCH;
@@ -201,6 +206,7 @@ static inline int dl_policy(int policy)
{
return policy == SCHED_DEADLINE;
}
+
static inline bool valid_policy(int policy)
{
return idle_policy(policy) || fair_policy(policy) ||
@@ -222,11 +228,12 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy);
}
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
static inline void update_avg(u64 *avg, u64 sample)
{
s64 diff = sample - *avg;
+
*avg += diff / 8;
}
@@ -251,7 +258,7 @@ static inline void update_avg(u64 *avg, u64 sample)
*/
#define SCHED_FLAG_SUGOV 0x10000000
-#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se)
{
@@ -358,9 +365,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
#ifdef CONFIG_CGROUP_SCHED
-struct cfs_rq;
-struct rt_rq;
-
extern struct list_head task_groups;
struct cfs_bandwidth {
@@ -406,7 +410,7 @@ struct task_group {
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
- * it in its own cacheline separated from the fields above which
+ * it in its own cache-line separated from the fields above which
* will also be accessed at each tick.
*/
atomic_long_t load_avg ____cacheline_aligned;
@@ -536,6 +540,7 @@ static inline void set_task_rq_fair(struct sched_entity *se,
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
+
static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; }
#endif /* CONFIG_CGROUP_SCHED */
@@ -551,8 +556,8 @@ extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent
* applicable for 32-bits architectures.
*/
#ifdef CONFIG_64BIT
-# define u64_u32_load_copy(var, copy) var
-# define u64_u32_store_copy(var, copy, val) (var = val)
+# define u64_u32_load_copy(var, copy) var
+# define u64_u32_store_copy(var, copy, val) (var = val)
#else
# define u64_u32_load_copy(var, copy) \
({ \
@@ -580,8 +585,8 @@ do { \
copy = __val; \
} while (0)
#endif
-# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
-# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
+# define u64_u32_load(var) u64_u32_load_copy(var, var##_copy)
+# define u64_u32_store(var, val) u64_u32_store_copy(var, var##_copy, val)
/* CFS-related fields in a runqueue */
struct cfs_rq {
@@ -803,6 +808,7 @@ struct dl_rq {
};
#ifdef CONFIG_FAIR_GROUP_SCHED
+
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)
@@ -820,16 +826,18 @@ static inline long se_runnable(struct sched_entity *se)
return se->runnable_weight;
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
+
#define entity_is_task(se) 1
-static inline void se_update_runnable(struct sched_entity *se) {}
+static inline void se_update_runnable(struct sched_entity *se) { }
static inline long se_runnable(struct sched_entity *se)
{
return !!se->on_rq;
}
-#endif
+
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_SMP
/*
@@ -874,7 +882,7 @@ struct root_domain {
*/
bool overloaded;
- /* Indicate one or more cpus over-utilized (tipping point) */
+ /* Indicate one or more CPUs over-utilized (tipping point) */
bool overutilized;
/*
@@ -988,7 +996,6 @@ struct uclamp_rq {
DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
#endif /* CONFIG_UCLAMP_TASK */
-struct rq;
struct balance_callback {
struct balance_callback *next;
void (*func)(struct rq *rq);
@@ -1144,7 +1151,7 @@ struct rq {
call_single_data_t hrtick_csd;
#endif
struct hrtimer hrtick_timer;
- ktime_t hrtick_time;
+ ktime_t hrtick_time;
#endif
#ifdef CONFIG_SCHEDSTATS
@@ -1166,7 +1173,7 @@ struct rq {
#endif
#ifdef CONFIG_CPU_IDLE
- /* Must be inspected within a rcu lock section */
+ /* Must be inspected within a RCU lock section */
struct cpuidle_state *idle_state;
#endif
@@ -1228,7 +1235,7 @@ static inline int cpu_of(struct rq *rq)
#endif
}
-#define MDF_PUSH 0x01
+#define MDF_PUSH 0x01
static inline bool is_migration_disabled(struct task_struct *p)
{
@@ -1247,7 +1254,6 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
-struct sched_group;
#ifdef CONFIG_SCHED_CORE
static inline struct cpumask *sched_group_span(struct sched_group *sg);
@@ -1283,9 +1289,10 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
return &rq->__lock;
}
-bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
- bool fi);
-void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+extern bool
+cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool fi);
+
+extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
/*
* Helpers to check if the CPU's core cookie matches with the task's cookie
@@ -1353,7 +1360,7 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags);
extern void sched_core_get(void);
extern void sched_core_put(void);
-#else /* !CONFIG_SCHED_CORE */
+#else /* !CONFIG_SCHED_CORE: */
static inline bool sched_core_enabled(struct rq *rq)
{
@@ -1391,7 +1398,8 @@ static inline bool sched_group_cookie_match(struct rq *rq,
{
return true;
}
-#endif /* CONFIG_SCHED_CORE */
+
+#endif /* !CONFIG_SCHED_CORE */
static inline void lockdep_assert_rq_held(struct rq *rq)
{
@@ -1422,8 +1430,10 @@ static inline void raw_spin_rq_unlock_irq(struct rq *rq)
static inline unsigned long _raw_spin_rq_lock_irqsave(struct rq *rq)
{
unsigned long flags;
+
local_irq_save(flags);
raw_spin_rq_lock(rq);
+
return flags;
}
@@ -1452,6 +1462,7 @@ static inline void update_idle_core(struct rq *rq) { }
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+
static inline struct task_struct *task_of(struct sched_entity *se)
{
SCHED_WARN_ON(!entity_is_task(se));
@@ -1475,9 +1486,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
return grp->my_q;
}
-#else
+#else /* !CONFIG_FAIR_GROUP_SCHED: */
-#define task_of(_se) container_of(_se, struct task_struct, se)
+#define task_of(_se) container_of(_se, struct task_struct, se)
static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p)
{
@@ -1497,7 +1508,8 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
return NULL;
}
-#endif
+
+#endif /* !CONFIG_FAIR_GROUP_SCHED */
extern void update_rq_clock(struct rq *rq);
@@ -1623,9 +1635,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-#ifdef CONFIG_SMP
+# ifdef CONFIG_SMP
SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
-#endif
+# endif
#endif
}
@@ -1651,9 +1663,11 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
#endif
}
+extern
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(rq->lock);
+extern
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
__acquires(p->pi_lock)
__acquires(rq->lock);
@@ -1680,48 +1694,42 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct,
task_rq_unlock(_T->rq, _T->lock, &_T->rf),
struct rq *rq; struct rq_flags rf)
-static inline void
-rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_rq_lock_irqsave(rq, rf->flags);
rq_pin_lock(rq, rf);
}
-static inline void
-rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+static inline void rq_lock_irq(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_rq_lock_irq(rq);
rq_pin_lock(rq, rf);
}
-static inline void
-rq_lock(struct rq *rq, struct rq_flags *rf)
+static inline void rq_lock(struct rq *rq, struct rq_flags *rf)
__acquires(rq->lock)
{
raw_spin_rq_lock(rq);
rq_pin_lock(rq, rf);
}
-static inline void
-rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
raw_spin_rq_unlock_irqrestore(rq, rf->flags);
}
-static inline void
-rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+static inline void rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
raw_spin_rq_unlock_irq(rq);
}
-static inline void
-rq_unlock(struct rq *rq, struct rq_flags *rf)
+static inline void rq_unlock(struct rq *rq, struct rq_flags *rf)
__releases(rq->lock)
{
rq_unpin_lock(rq, rf);
@@ -1743,8 +1751,7 @@ DEFINE_LOCK_GUARD_1(rq_lock_irqsave, struct rq,
rq_unlock_irqrestore(_T->lock, &_T->rf),
struct rq_flags rf)
-static inline struct rq *
-this_rq_lock_irq(struct rq_flags *rf)
+static inline struct rq *this_rq_lock_irq(struct rq_flags *rf)
__acquires(rq->lock)
{
struct rq *rq;
@@ -1752,15 +1759,18 @@ this_rq_lock_irq(struct rq_flags *rf)
local_irq_disable();
rq = this_rq();
rq_lock(rq, rf);
+
return rq;
}
#ifdef CONFIG_NUMA
+
enum numa_topology_type {
NUMA_DIRECT,
NUMA_GLUELESS_MESH,
NUMA_BACKPLANE,
};
+
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
@@ -1769,18 +1779,23 @@ extern void sched_update_numa(int cpu, bool online);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
-#else
+
+#else /* !CONFIG_NUMA: */
+
static inline void sched_init_numa(int offline_node) { }
static inline void sched_update_numa(int cpu, bool online) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
+
static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
return nr_cpu_ids;
}
-#endif
+
+#endif /* !CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING
+
/* The regions in numa_faults array from task_struct */
enum numa_faults_stats {
NUMA_MEM = 0,
@@ -1788,17 +1803,21 @@ enum numa_faults_stats {
NUMA_MEMBUF,
NUMA_CPUBUF
};
+
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *p, struct task_struct *t,
int cpu, int scpu);
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
-#else
+
+#else /* !CONFIG_NUMA_BALANCING: */
+
static inline void
init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
}
-#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* !CONFIG_NUMA_BALANCING */
#ifdef CONFIG_SMP
@@ -1823,8 +1842,7 @@ queue_balance_callback(struct rq *rq,
}
#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), \
- lockdep_is_held(&sched_domains_mutex))
+ rcu_dereference_check((p), lockdep_is_held(&sched_domains_mutex))
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -1895,6 +1913,7 @@ DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
extern struct static_key_false sched_asym_cpucapacity;
extern struct static_key_false sched_cluster_active;
@@ -1958,15 +1977,11 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
extern int group_balance_cpu(struct sched_group *sg);
#ifdef CONFIG_SCHED_DEBUG
-void update_sched_domain_debugfs(void);
-void dirty_sched_domain_sysctl(int cpu);
+extern void update_sched_domain_debugfs(void);
+extern void dirty_sched_domain_sysctl(int cpu);
#else
-static inline void update_sched_domain_debugfs(void)
-{
-}
-static inline void dirty_sched_domain_sysctl(int cpu)
-{
-}
+static inline void update_sched_domain_debugfs(void) { }
+static inline void dirty_sched_domain_sysctl(int cpu) { }
#endif
extern int sched_update_scaling(void);
@@ -1977,6 +1992,7 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
return cpu_possible_mask; /* &init_task.cpus_mask */
return p->user_cpus_ptr;
}
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1999,13 +2015,13 @@ static inline void sched_core_tick(struct rq *rq)
__sched_core_tick(rq);
}
-#else
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
-static inline void sched_core_account_forceidle(struct rq *rq) {}
+static inline void sched_core_account_forceidle(struct rq *rq) { }
-static inline void sched_core_tick(struct rq *rq) {}
+static inline void sched_core_tick(struct rq *rq) { }
-#endif /* CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS */
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
#ifdef CONFIG_CGROUP_SCHED
@@ -2047,15 +2063,16 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
#endif
}
-#else /* CONFIG_CGROUP_SCHED */
+#else /* !CONFIG_CGROUP_SCHED: */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+
static inline struct task_group *task_group(struct task_struct *p)
{
return NULL;
}
-#endif /* CONFIG_CGROUP_SCHED */
+#endif /* !CONFIG_CGROUP_SCHED */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
@@ -2100,6 +2117,7 @@ enum {
extern const_debug unsigned int sysctl_sched_features;
#ifdef CONFIG_JUMP_LABEL
+
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
@@ -2112,13 +2130,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
-#else /* !CONFIG_JUMP_LABEL */
+#else /* !CONFIG_JUMP_LABEL: */
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-#endif /* CONFIG_JUMP_LABEL */
+#endif /* !CONFIG_JUMP_LABEL */
-#else /* !SCHED_DEBUG */
+#else /* !SCHED_DEBUG: */
/*
* Each translation unit has its own copy of sysctl_sched_features to allow
@@ -2134,7 +2152,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-#endif /* SCHED_DEBUG */
+#endif /* !SCHED_DEBUG */
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
@@ -2177,13 +2195,13 @@ static inline int task_on_rq_migrating(struct task_struct *p)
}
/* Wake flags. The first three directly map to some SD flag value */
-#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
-#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
-#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
+#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */
-#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
-#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
-#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
+#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
@@ -2253,9 +2271,9 @@ extern const u32 sched_prio_to_wmult[40];
#define RETRY_TASK ((void *)-1UL)
struct affinity_context {
- const struct cpumask *new_mask;
- struct cpumask *user_mask;
- unsigned int flags;
+ const struct cpumask *new_mask;
+ struct cpumask *user_mask;
+ unsigned int flags;
};
extern s64 update_curr_common(struct rq *rq);
@@ -2403,8 +2421,19 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void sched_balance_trigger(struct rq *rq);
+extern int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ctx);
extern void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx);
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ /*
+ * See do_set_cpus_allowed() above for the rcu_head usage.
+ */
+ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));
+
+ return kmalloc_node(size, GFP_KERNEL, node);
+}
+
static inline struct task_struct *get_push_task(struct rq *rq)
{
struct task_struct *p = rq->curr;
@@ -2426,9 +2455,23 @@ static inline struct task_struct *get_push_task(struct rq *rq)
extern int push_cpu_stop(void *arg);
-#endif
+#else /* !CONFIG_SMP: */
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+ struct affinity_context *ctx)
+{
+ return set_cpus_allowed_ptr(p, ctx->new_mask);
+}
+
+static inline cpumask_t *alloc_user_cpus_ptr(int node)
+{
+ return NULL;
+}
+
+#endif /* !CONFIG_SMP */
#ifdef CONFIG_CPU_IDLE
+
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
{
@@ -2441,7 +2484,9 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
return rq->idle_state;
}
-#else
+
+#else /* !CONFIG_CPU_IDLE: */
+
static inline void idle_set_state(struct rq *rq,
struct cpuidle_state *idle_state)
{
@@ -2451,7 +2496,8 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
return NULL;
}
-#endif
+
+#endif /* !CONFIG_CPU_IDLE */
extern void schedule_idle(void);
asmlinkage void schedule_user(void);
@@ -2464,7 +2510,7 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
-extern void reweight_task(struct task_struct *p, int prio);
+extern void reweight_task(struct task_struct *p, const struct load_weight *lw);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
@@ -2480,7 +2526,8 @@ extern void init_dl_entity(struct sched_dl_entity *dl_se);
#define RATIO_SHIFT 8
#define MAX_BW_BITS (64 - BW_SHIFT)
#define MAX_BW ((1ULL << MAX_BW_BITS) - 1)
-unsigned long to_ratio(u64 period, u64 runtime);
+
+extern unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
extern void post_init_entity_util_avg(struct task_struct *p);
@@ -2506,10 +2553,10 @@ static inline void sched_update_tick_dependency(struct rq *rq)
else
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
-#else
+#else /* !CONFIG_NO_HZ_FULL: */
static inline int sched_tick_offload_init(void) { return 0; }
static inline void sched_update_tick_dependency(struct rq *rq) { }
-#endif
+#endif /* !CONFIG_NO_HZ_FULL */
static inline void add_nr_running(struct rq *rq, unsigned count)
{
@@ -2545,9 +2592,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
#ifdef CONFIG_PREEMPT_RT
-#define SCHED_NR_MIGRATE_BREAK 8
+# define SCHED_NR_MIGRATE_BREAK 8
#else
-#define SCHED_NR_MIGRATE_BREAK 32
+# define SCHED_NR_MIGRATE_BREAK 32
#endif
extern const_debug unsigned int sysctl_sched_nr_migrate;
@@ -2596,9 +2643,9 @@ static inline int hrtick_enabled_dl(struct rq *rq)
return hrtick_enabled(rq);
}
-void hrtick_start(struct rq *rq, u64 delay);
+extern void hrtick_start(struct rq *rq, u64 delay);
-#else
+#else /* !CONFIG_SCHED_HRTICK: */
static inline int hrtick_enabled_fair(struct rq *rq)
{
@@ -2615,13 +2662,10 @@ static inline int hrtick_enabled(struct rq *rq)
return 0;
}
-#endif /* CONFIG_SCHED_HRTICK */
+#endif /* !CONFIG_SCHED_HRTICK */
#ifndef arch_scale_freq_tick
-static __always_inline
-void arch_scale_freq_tick(void)
-{
-}
+static __always_inline void arch_scale_freq_tick(void) { }
#endif
#ifndef arch_scale_freq_capacity
@@ -2658,13 +2702,13 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
#endif
}
#else
-static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { }
#endif
-#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
-__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
-static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
-{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
+#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
+__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
+static inline class_##name##_t class_##name##_constructor(type *lock, type *lock2) \
+{ class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \
_lock; return _t; }
#ifdef CONFIG_SMP
@@ -2718,7 +2762,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
return 1;
}
-#else
+#else /* !CONFIG_PREEMPTION: */
/*
* Unfair double_lock_balance: Optimizes throughput at the expense of
* latency by eliminating extra atomic operations when the locks are
@@ -2749,7 +2793,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
return 1;
}
-#endif /* CONFIG_PREEMPTION */
+#endif /* !CONFIG_PREEMPTION */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -2825,9 +2869,10 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
extern void set_rq_online (struct rq *rq);
extern void set_rq_offline(struct rq *rq);
+
extern bool sched_smp_initialized;
-#else /* CONFIG_SMP */
+#else /* !CONFIG_SMP: */
/*
* double_rq_lock - safely lock two runqueues
@@ -2861,7 +2906,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
-#endif
+#endif /* !CONFIG_SMP */
DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq,
double_rq_lock(_T->lock, _T->lock2),
@@ -2882,16 +2927,15 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
-#ifdef CONFIG_NUMA_BALANCING
-extern void
-show_numa_stats(struct task_struct *p, struct seq_file *m);
+# ifdef CONFIG_NUMA_BALANCING
+extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
- unsigned long tpf, unsigned long gsf, unsigned long gpf);
-#endif /* CONFIG_NUMA_BALANCING */
-#else
-static inline void resched_latency_warn(int cpu, u64 latency) {}
-#endif /* CONFIG_SCHED_DEBUG */
+ unsigned long tpf, unsigned long gsf, unsigned long gpf);
+# endif /* CONFIG_NUMA_BALANCING */
+#else /* !CONFIG_SCHED_DEBUG: */
+static inline void resched_latency_warn(int cpu, u64 latency) { }
+#endif /* !CONFIG_SCHED_DEBUG */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
@@ -2901,6 +2945,7 @@ extern void cfs_bandwidth_usage_inc(void);
extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
+
#define NOHZ_BALANCE_KICK_BIT 0
#define NOHZ_STATS_KICK_BIT 1
#define NOHZ_NEWILB_KICK_BIT 2
@@ -2915,14 +2960,14 @@ extern void cfs_bandwidth_usage_dec(void);
/* Update nohz.next_balance */
#define NOHZ_NEXT_KICK BIT(NOHZ_NEXT_KICK_BIT)
-#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK | NOHZ_NEXT_KICK)
-#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
extern void nohz_balance_exit_idle(struct rq *rq);
-#else
+#else /* !CONFIG_NO_HZ_COMMON: */
static inline void nohz_balance_exit_idle(struct rq *rq) { }
-#endif
+#endif /* !CONFIG_NO_HZ_COMMON */
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
extern void nohz_run_idle_balance(int cpu);
@@ -2931,6 +2976,7 @@ static inline void nohz_run_idle_balance(int cpu) { }
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
struct irqtime {
u64 total;
u64 tick_delta;
@@ -2958,9 +3004,11 @@ static inline u64 irq_time_read(int cpu)
return total;
}
+
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
+
DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
/**
@@ -2994,9 +3042,9 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
if (data)
data->func(data, rq_clock(rq), flags);
}
-#else
-static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
-#endif /* CONFIG_CPU_FREQ */
+#else /* !CONFIG_CPU_FREQ: */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { }
+#endif /* !CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
@@ -3007,6 +3055,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif
#ifdef CONFIG_SMP
+
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
unsigned long *min,
unsigned long *max);
@@ -3049,9 +3098,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
{
return READ_ONCE(rq->avg_rt.util_avg);
}
-#endif
+
+#endif /* CONFIG_SMP */
#ifdef CONFIG_UCLAMP_TASK
+
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static inline unsigned long uclamp_rq_get(struct rq *rq,
@@ -3098,9 +3149,40 @@ static inline bool uclamp_is_used(void)
{
return static_branch_likely(&sched_uclamp_used);
}
-#else /* CONFIG_UCLAMP_TASK */
-static inline unsigned long uclamp_eff_value(struct task_struct *p,
- enum uclamp_id clamp_id)
+
+#define for_each_clamp_id(clamp_id) \
+ for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
+
+extern unsigned int sysctl_sched_uclamp_util_min_rt_default;
+
+
+static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+ return SCHED_CAPACITY_SCALE;
+}
+
+/* Integer rounded range for each bucket */
+#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
+
+static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
+{
+ return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
+}
+
+static inline void
+uclamp_se_set(struct uclamp_se *uc_se, unsigned int value, bool user_defined)
+{
+ uc_se->value = value;
+ uc_se->bucket_id = uclamp_bucket_id(value);
+ uc_se->user_defined = user_defined;
+}
+
+#else /* !CONFIG_UCLAMP_TASK: */
+
+static inline unsigned long
+uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -3115,8 +3197,8 @@ static inline bool uclamp_is_used(void)
return false;
}
-static inline unsigned long uclamp_rq_get(struct rq *rq,
- enum uclamp_id clamp_id)
+static inline unsigned long
+uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@@ -3124,8 +3206,8 @@ static inline unsigned long uclamp_rq_get(struct rq *rq,
return SCHED_CAPACITY_SCALE;
}
-static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
- unsigned int value)
+static inline void
+uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id, unsigned int value)
{
}
@@ -3133,9 +3215,11 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
{
return false;
}
-#endif /* CONFIG_UCLAMP_TASK */
+
+#endif /* !CONFIG_UCLAMP_TASK */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
+
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return READ_ONCE(rq->avg_irq.util_avg);
@@ -3150,7 +3234,9 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
return util;
}
-#else
+
+#else /* !CONFIG_HAVE_SCHED_AVG_IRQ: */
+
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return 0;
@@ -3161,7 +3247,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
{
return util;
}
-#endif
+
+#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
@@ -3179,11 +3266,13 @@ extern struct cpufreq_governor schedutil_gov;
#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
#define perf_domain_span(pd) NULL
+
static inline bool sched_energy_enabled(void) { return false; }
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
#ifdef CONFIG_MEMBARRIER
+
/*
* The scheduler provides memory barriers required by membarrier between:
* - prior user-space memory accesses and store to rq->membarrier_state,
@@ -3205,13 +3294,16 @@ static inline void membarrier_switch_mm(struct rq *rq,
WRITE_ONCE(rq->membarrier_state, membarrier_state);
}
-#else
+
+#else /* !CONFIG_MEMBARRIER :*/
+
static inline void membarrier_switch_mm(struct rq *rq,
struct mm_struct *prev_mm,
struct mm_struct *next_mm)
{
}
-#endif
+
+#endif /* !CONFIG_MEMBARRIER */
#ifdef CONFIG_SMP
static inline bool is_per_cpu_kthread(struct task_struct *p)
@@ -3263,7 +3355,7 @@ static inline void __mm_cid_put(struct mm_struct *mm, int cid)
* be held to transition to other states.
*
* State transitions synchronized with cmpxchg or try_cmpxchg need to be
- * consistent across cpus, which prevents use of this_cpu_cmpxchg.
+ * consistent across CPUs, which prevents use of this_cpu_cmpxchg.
*/
static inline void mm_cid_put_lazy(struct task_struct *t)
{
@@ -3330,6 +3422,7 @@ static inline int __mm_cid_try_get(struct mm_struct *mm)
}
if (cpumask_test_and_set_cpu(cid, cpumask))
return -1;
+
return cid;
}
@@ -3394,6 +3487,7 @@ unlock:
raw_spin_unlock(&cid_lock);
end:
mm_cid_snapshot_time(rq, mm);
+
return cid;
}
@@ -3416,6 +3510,7 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
}
cid = __mm_cid_get(rq, mm);
__this_cpu_write(pcpu_cid->cid, cid);
+
return cid;
}
@@ -3470,15 +3565,68 @@ static inline void switch_mm_cid(struct rq *rq,
next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
}
-#else
+#else /* !CONFIG_SCHED_MM_CID: */
static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
static inline void init_sched_mm_cid(struct task_struct *t) { }
-#endif
+#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+#ifdef CONFIG_RT_MUTEXES
+
+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
+{
+ if (pi_task)
+ prio = min(prio, pi_task->prio);
+
+ return prio;
+}
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+
+ return __rt_effective_prio(pi_task, prio);
+}
+
+#else /* !CONFIG_RT_MUTEXES: */
+
+static inline int rt_effective_prio(struct task_struct *p, int prio)
+{
+ return prio;
+}
+
+#endif /* !CONFIG_RT_MUTEXES */
+
+extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
+extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
+extern void __setscheduler_prio(struct task_struct *p, int prio);
+extern void set_load_weight(struct task_struct *p, bool update_load);
+extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags);
+
+extern void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio);
+
+#ifdef CONFIG_SMP
+extern struct balance_callback *splice_balance_callbacks(struct rq *rq);
+extern void balance_callbacks(struct rq *rq, struct balance_callback *head);
+#else
+
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
+{
+ return NULL;
+}
+
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
+{
+}
+
+#endif
+
#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index b02dfc322951..237780aa3c53 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -224,7 +224,7 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
/*
* Called when a task finally hits the CPU. We can now calculate how
* long it was waiting to run. We also note when it began so that we
- * can keep stats on how long its timeslice is.
+ * can keep stats on how long its time-slice is.
*/
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
new file mode 100644
index 000000000000..ae1b42775ef9
--- /dev/null
+++ b/kernel/sched/syscalls.c
@@ -0,0 +1,1699 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kernel/sched/syscalls.c
+ *
+ * Core kernel scheduler syscalls related code
+ *
+ * Copyright (C) 1991-2002 Linus Torvalds
+ * Copyright (C) 1998-2024 Ingo Molnar, Red Hat
+ */
+#include <linux/sched.h>
+#include <linux/cpuset.h>
+#include <linux/sched/debug.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+#include "autogroup.h"
+
+static inline int __normal_prio(int policy, int rt_prio, int nice)
+{
+ int prio;
+
+ if (dl_policy(policy))
+ prio = MAX_DL_PRIO - 1;
+ else if (rt_policy(policy))
+ prio = MAX_RT_PRIO - 1 - rt_prio;
+ else
+ prio = NICE_TO_PRIO(nice);
+
+ return prio;
+}
+
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(struct task_struct *p)
+{
+ return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+ p->normal_prio = normal_prio(p);
+ /*
+ * If we are RT tasks or we were boosted to RT priority,
+ * keep the priority unchanged. Otherwise, update priority
+ * to the normal priority:
+ */
+ if (!rt_prio(p->prio))
+ return p->normal_prio;
+ return p->prio;
+}
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+ bool queued, running;
+ struct rq *rq;
+ int old_prio;
+
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+ return;
+ /*
+ * We have to be careful, if called from sys_setpriority(),
+ * the task might be in the middle of scheduling on another CPU.
+ */
+ CLASS(task_rq_lock, rq_guard)(p);
+ rq = rq_guard.rq;
+
+ update_rq_clock(rq);
+
+ /*
+ * The RT priorities are set via sched_setscheduler(), but we still
+ * allow the 'normal' nice value to be set - but as expected
+ * it won't have any effect on scheduling until the task is
+ * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
+ */
+ if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
+ p->static_prio = NICE_TO_PRIO(nice);
+ return;
+ }
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+ if (queued)
+ dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ if (running)
+ put_prev_task(rq, p);
+
+ p->static_prio = NICE_TO_PRIO(nice);
+ set_load_weight(p, true);
+ old_prio = p->prio;
+ p->prio = effective_prio(p);
+
+ if (queued)
+ enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+ if (running)
+ set_next_task(rq, p);
+
+ /*
+ * If the task increased its priority or is running and
+ * lowered its priority, then reschedule its CPU:
+ */
+ p->sched_class->prio_changed(rq, p, old_prio);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
+ * @p: task
+ * @nice: nice value
+ */
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
+{
+ /* Convert nice value [19,-20] to rlimit style value [1,40]: */
+ int nice_rlim = nice_to_rlimit(nice);
+
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+ return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+ long nice, retval;
+
+ /*
+ * Setpriority might change our priority at the same moment.
+ * We don't have to worry. Conceptually one call occurs first
+ * and we have a single winner.
+ */
+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+ nice = task_nice(current) + increment;
+
+ nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+ if (increment < 0 && !can_nice(current, nice))
+ return -EPERM;
+
+ retval = security_task_setnice(current, nice);
+ if (retval)
+ return retval;
+
+ set_user_nice(current, nice);
+ return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ *
+ * sched policy return value kernel prio user prio/nice
+ *
+ * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19]
+ * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99]
+ * deadline -101 -1 0
+ */
+int task_prio(const struct task_struct *p)
+{
+ return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * idle_cpu - is a given CPU idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->curr != rq->idle)
+ return 0;
+
+ if (rq->nr_running)
+ return 0;
+
+#ifdef CONFIG_SMP
+ if (rq->ttwu_pending)
+ return 0;
+#endif
+
+ return 1;
+}
+
+/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+ if (!idle_cpu(cpu))
+ return 0;
+
+ if (vcpu_is_preempted(cpu))
+ return 0;
+
+ return 1;
+}
+
+/**
+ * idle_task - return the idle task for a given CPU.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the CPU @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+ return cpu_rq(cpu)->idle;
+}
+
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+
+#endif
+
+#ifdef CONFIG_SMP
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ * cpu_util_{cfs,rt,dl,irq}()
+ * cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the IRQ utilization.
+ *
+ * The DL bandwidth number OTOH is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
+unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
+ unsigned long *min,
+ unsigned long *max)
+{
+ unsigned long util, irq, scale;
+ struct rq *rq = cpu_rq(cpu);
+
+ scale = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * Early check to see if IRQ/steal time saturates the CPU, can be
+ * because of inaccuracies in how we track these -- see
+ * update_irq_load_avg().
+ */
+ irq = cpu_util_irq(rq);
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization returns the highest level between:
+ * - the computed DL bandwidth needed with the IRQ pressure which
+ * steals time to the deadline task.
+ * - The minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
+
+ /*
+ * Because the time spend on RT/DL tasks is visible as 'lost' time to
+ * CFS tasks and we use the same metric to track the effective
+ * utilization (PELT windows are synchronized) we can directly add them
+ * to obtain the CPU's actual utilization.
+ */
+ util = util_cfs + cpu_util_rt(rq);
+ util += cpu_util_dl(rq);
+
+ /*
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
+ */
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
+
+ if (util >= scale)
+ return scale;
+
+ /*
+ * There is still idle time; further improve the number by using the
+ * IRQ metric. Because IRQ/steal time is hidden from the task clock we
+ * need to scale the task numbers:
+ *
+ * max - irq
+ * U' = irq + --------- * U
+ * max
+ */
+ util = scale_irq_capacity(util, irq, scale);
+ util += irq;
+
+ return min(scale, util);
+}
+
+unsigned long sched_cpu_util(int cpu)
+{
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
+}
+#endif /* CONFIG_SMP */
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+ return pid ? find_task_by_vpid(pid) : current;
+}
+
+static struct task_struct *find_get_task(pid_t pid)
+{
+ struct task_struct *p;
+ guard(rcu)();
+
+ p = find_process_by_pid(pid);
+ if (likely(p))
+ get_task_struct(p);
+
+ return p;
+}
+
+DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
+ find_get_task(pid), pid_t pid)
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int policy = attr->sched_policy;
+
+ if (policy == SETPARAM_POLICY)
+ policy = p->policy;
+
+ p->policy = policy;
+
+ if (dl_policy(policy))
+ __setparam_dl(p, attr);
+ else if (fair_policy(policy))
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+ /*
+ * __sched_setscheduler() ensures attr->sched_priority == 0 when
+ * !rt_policy. Always setting this ensures that things like
+ * getparam()/getattr() don't report silly values for !rt tasks.
+ */
+ p->rt_priority = attr->sched_priority;
+ p->normal_prio = normal_prio(p);
+ set_load_weight(p, true);
+}
+
+/*
+ * Check the target process has a UID that matches the current process's:
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ guard(rcu)();
+
+ pcred = __task_cred(p);
+ return (uid_eq(cred->euid, pcred->euid) ||
+ uid_eq(cred->euid, pcred->uid));
+}
+
+#ifdef CONFIG_UCLAMP_TASK
+
+static int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ util_min = attr->sched_util_min;
+
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ util_max = attr->sched_util_max;
+
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
+ return -EINVAL;
+
+ /*
+ * We have valid uclamp attributes; make sure uclamp is enabled.
+ *
+ * We need to do that here, because enabling static branches is a
+ * blocking operation which obviously cannot be done while holding
+ * scheduler locks.
+ */
+ static_branch_enable(&sched_uclamp_used);
+
+ return 0;
+}
+
+static bool uclamp_reset(const struct sched_attr *attr,
+ enum uclamp_id clamp_id,
+ struct uclamp_se *uc_se)
+{
+ /* Reset on sched class change for a non user-defined clamp value. */
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
+ !uc_se->user_defined)
+ return true;
+
+ /* Reset on sched_util_{min,max} == -1. */
+ if (clamp_id == UCLAMP_MIN &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min == -1) {
+ return true;
+ }
+
+ if (clamp_id == UCLAMP_MAX &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max == -1) {
+ return true;
+ }
+
+ return false;
+}
+
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ enum uclamp_id clamp_id;
+
+ for_each_clamp_id(clamp_id) {
+ struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+ unsigned int value;
+
+ if (!uclamp_reset(attr, clamp_id, uc_se))
+ continue;
+
+ /*
+ * RT by default have a 100% boost value that could be modified
+ * at runtime.
+ */
+ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
+ value = sysctl_sched_uclamp_util_min_rt_default;
+ else
+ value = uclamp_none(clamp_id);
+
+ uclamp_se_set(uc_se, value, false);
+
+ }
+
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
+ return;
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min != -1) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
+ attr->sched_util_min, true);
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max != -1) {
+ uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
+ attr->sched_util_max, true);
+ }
+}
+
+#else /* !CONFIG_UCLAMP_TASK: */
+
+static inline int uclamp_validate(struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ return -EOPNOTSUPP;
+}
+static void __setscheduler_uclamp(struct task_struct *p,
+ const struct sched_attr *attr) { }
+#endif
+
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ int policy, int reset_on_fork)
+{
+ if (fair_policy(policy)) {
+ if (attr->sched_nice < task_nice(p) &&
+ !is_nice_reduction(p, attr->sched_nice))
+ goto req_priv;
+ }
+
+ if (rt_policy(policy)) {
+ unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+ /* Can't set/change the rt policy: */
+ if (policy != p->policy && !rlim_rtprio)
+ goto req_priv;
+
+ /* Can't increase priority: */
+ if (attr->sched_priority > p->rt_priority &&
+ attr->sched_priority > rlim_rtprio)
+ goto req_priv;
+ }
+
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ goto req_priv;
+
+ /*
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+ */
+ if (task_has_idle_policy(p) && !idle_policy(policy)) {
+ if (!is_nice_reduction(p, task_nice(p)))
+ goto req_priv;
+ }
+
+ /* Can't change other user's priorities: */
+ if (!check_same_owner(p))
+ goto req_priv;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag: */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ goto req_priv;
+
+ return 0;
+
+req_priv:
+ if (!capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ return 0;
+}
+
+int __sched_setscheduler(struct task_struct *p,
+ const struct sched_attr *attr,
+ bool user, bool pi)
+{
+ int oldpolicy = -1, policy = attr->sched_policy;
+ int retval, oldprio, newprio, queued, running;
+ const struct sched_class *prev_class;
+ struct balance_callback *head;
+ struct rq_flags rf;
+ int reset_on_fork;
+ int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ struct rq *rq;
+ bool cpuset_locked = false;
+
+ /* The pi code expects interrupts enabled */
+ BUG_ON(pi && in_interrupt());
+recheck:
+ /* Double check policy once rq lock held: */
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
+ policy = oldpolicy = p->policy;
+ } else {
+ reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
+
+ if (!valid_policy(policy))
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+ return -EINVAL;
+
+ /*
+ * Valid priorities for SCHED_FIFO and SCHED_RR are
+ * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
+ * SCHED_BATCH and SCHED_IDLE is 0.
+ */
+ if (attr->sched_priority > MAX_RT_PRIO-1)
+ return -EINVAL;
+ if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+ (rt_policy(policy) != (attr->sched_priority != 0)))
+ return -EINVAL;
+
+ if (user) {
+ retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+ if (retval)
+ return retval;
+
+ if (attr->sched_flags & SCHED_FLAG_SUGOV)
+ return -EINVAL;
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+ }
+
+ /* Update task specific "requested" clamps */
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
+ retval = uclamp_validate(p, attr);
+ if (retval)
+ return retval;
+ }
+
+ /*
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+ * information.
+ */
+ if (dl_policy(policy) || dl_policy(p->policy)) {
+ cpuset_locked = true;
+ cpuset_lock();
+ }
+
+ /*
+ * Make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+ *
+ * To be able to change p->policy safely, the appropriate
+ * runqueue lock must be held.
+ */
+ rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
+
+ /*
+ * Changing the policy of the stop threads its a very bad idea:
+ */
+ if (p == rq->stop) {
+ retval = -EINVAL;
+ goto unlock;
+ }
+
+ /*
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
+ */
+ if (unlikely(policy == p->policy)) {
+ if (fair_policy(policy) && attr->sched_nice != task_nice(p))
+ goto change;
+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+ goto change;
+ if (dl_policy(policy) && dl_param_changed(p, attr))
+ goto change;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
+ goto change;
+
+ p->sched_reset_on_fork = reset_on_fork;
+ retval = 0;
+ goto unlock;
+ }
+change:
+
+ if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Do not allow real-time tasks into groups that have no runtime
+ * assigned.
+ */
+ if (rt_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ !task_group_is_autogroup(task_group(p))) {
+ retval = -EPERM;
+ goto unlock;
+ }
+#endif
+#ifdef CONFIG_SMP
+ if (dl_bandwidth_enabled() && dl_policy(policy) &&
+ !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
+ cpumask_t *span = rq->rd->span;
+
+ /*
+ * Don't allow tasks with an affinity mask smaller than
+ * the entire root_domain to become SCHED_DEADLINE. We
+ * will also fail if there's no bandwidth available.
+ */
+ if (!cpumask_subset(span, p->cpus_ptr) ||
+ rq->rd->dl_bw.bw == 0) {
+ retval = -EPERM;
+ goto unlock;
+ }
+ }
+#endif
+ }
+
+ /* Re-check policy now with rq lock held: */
+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+ policy = oldpolicy = -1;
+ task_rq_unlock(rq, p, &rf);
+ if (cpuset_locked)
+ cpuset_unlock();
+ goto recheck;
+ }
+
+ /*
+ * If setscheduling to SCHED_DEADLINE (or changing the parameters
+ * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+ * is available.
+ */
+ if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
+ retval = -EBUSY;
+ goto unlock;
+ }
+
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
+ if (pi) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ newprio = rt_effective_prio(p, newprio);
+ if (newprio == oldprio)
+ queue_flags &= ~DEQUEUE_MOVE;
+ }
+
+ queued = task_on_rq_queued(p);
+ running = task_current(rq, p);
+ if (queued)
+ dequeue_task(rq, p, queue_flags);
+ if (running)
+ put_prev_task(rq, p);
+
+ prev_class = p->sched_class;
+
+ if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
+ __setscheduler_params(p, attr);
+ __setscheduler_prio(p, newprio);
+ }
+ __setscheduler_uclamp(p, attr);
+
+ if (queued) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ if (oldprio < p->prio)
+ queue_flags |= ENQUEUE_HEAD;
+
+ enqueue_task(rq, p, queue_flags);
+ }
+ if (running)
+ set_next_task(rq, p);
+
+ check_class_changed(rq, p, prev_class, oldprio);
+
+ /* Avoid rq from going away on us: */
+ preempt_disable();
+ head = splice_balance_callbacks(rq);
+ task_rq_unlock(rq, p, &rf);
+
+ if (pi) {
+ if (cpuset_locked)
+ cpuset_unlock();
+ rt_mutex_adjust_pi(p);
+ }
+
+ /* Run balance callbacks after we've adjusted the PI chain: */
+ balance_callbacks(rq, head);
+ preempt_enable();
+
+ return 0;
+
+unlock:
+ task_rq_unlock(rq, p, &rf);
+ if (cpuset_locked)
+ cpuset_unlock();
+ return retval;
+}
+
+static int _sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param, bool check)
+{
+ struct sched_attr attr = {
+ .sched_policy = policy,
+ .sched_priority = param->sched_priority,
+ .sched_nice = PRIO_TO_NICE(p->static_prio),
+ };
+
+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ policy &= ~SCHED_RESET_ON_FORK;
+ attr.sched_policy = policy;
+ }
+
+ return __sched_setscheduler(p, &attr, check, true);
+}
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Use sched_set_fifo(), read its comment.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, true);
+}
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, true, true);
+}
+
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+ return __sched_setscheduler(p, attr, false, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+ const struct sched_param *param)
+{
+ return _sched_setscheduler(p, policy, param, false);
+}
+
+/*
+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
+ * incapable of resource management, which is the one thing an OS really should
+ * be doing.
+ *
+ * This is of course the reason it is limited to privileged users only.
+ *
+ * Worse still; it is fundamentally impossible to compose static priority
+ * workloads. You cannot take two correctly working static prio workloads
+ * and smash them together and still expect them to work.
+ *
+ * For this reason 'all' FIFO tasks the kernel creates are basically at:
+ *
+ * MAX_RT_PRIO / 2
+ *
+ * The administrator _MUST_ configure the system, the kernel simply doesn't
+ * know enough information to make a sensible choice.
+ */
+void sched_set_fifo(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo);
+
+/*
+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
+ */
+void sched_set_fifo_low(struct task_struct *p)
+{
+ struct sched_param sp = { .sched_priority = 1 };
+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_fifo_low);
+
+void sched_set_normal(struct task_struct *p, int nice)
+{
+ struct sched_attr attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = nice,
+ };
+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
+}
+EXPORT_SYMBOL_GPL(sched_set_normal);
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+ struct sched_param lparam;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+ if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+ return -EFAULT;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ return sched_setscheduler(p, policy, &lparam);
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
+{
+ u32 size;
+ int ret;
+
+ /* Zero the full structure, so that a short copy will be nice: */
+ memset(attr, 0, sizeof(*attr));
+
+ ret = get_user(size, &uattr->size);
+ if (ret)
+ return ret;
+
+ /* ABI compatibility quirk: */
+ if (!size)
+ size = SCHED_ATTR_SIZE_VER0;
+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
+ goto err_size;
+
+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
+ if (ret) {
+ if (ret == -E2BIG)
+ goto err_size;
+ return ret;
+ }
+
+ if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
+ size < SCHED_ATTR_SIZE_VER1)
+ return -EINVAL;
+
+ /*
+ * XXX: Do we want to be lenient like existing syscalls; or do we want
+ * to be strict and return an error on out-of-bounds values?
+ */
+ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
+
+ return 0;
+
+err_size:
+ put_user(sizeof(*attr), &uattr->size);
+ return -E2BIG;
+}
+
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, attr);
+ else if (task_has_rt_policy(p))
+ attr->sched_priority = p->rt_priority;
+ else
+ attr->sched_nice = task_nice(p);
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
+{
+ if (policy < 0)
+ return -EINVAL;
+
+ return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
+{
+ struct sched_attr attr;
+ int retval;
+
+ if (!uattr || pid < 0 || flags)
+ return -EINVAL;
+
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if ((int)attr.sched_policy < 0)
+ return -EINVAL;
+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
+ attr.sched_policy = SETPARAM_POLICY;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
+
+ return sched_setattr(p, &attr);
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+ struct task_struct *p;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ guard(rcu)();
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (!retval) {
+ retval = p->policy;
+ if (p->sched_reset_on_fork)
+ retval |= SCHED_RESET_ON_FORK;
+ }
+ return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+ struct sched_param lp = { .sched_priority = 0 };
+ struct task_struct *p;
+ int retval;
+
+ if (!param || pid < 0)
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
+ }
+
+ /*
+ * This one might sleep, we cannot do it with a spinlock held ...
+ */
+ return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+}
+
+/*
+ * Copy the kernel size attribute structure (which might be larger
+ * than what user-space knows about) to user-space.
+ *
+ * Note that all cases are valid: user-space buffer can be larger or
+ * smaller than the kernel-space buffer. The usual case is that both
+ * have the same size.
+ */
+static int
+sched_attr_copy_to_user(struct sched_attr __user *uattr,
+ struct sched_attr *kattr,
+ unsigned int usize)
+{
+ unsigned int ksize = sizeof(*kattr);
+
+ if (!access_ok(uattr, usize))
+ return -EFAULT;
+
+ /*
+ * sched_getattr() ABI forwards and backwards compatibility:
+ *
+ * If usize == ksize then we just copy everything to user-space and all is good.
+ *
+ * If usize < ksize then we only copy as much as user-space has space for,
+ * this keeps ABI compatibility as well. We skip the rest.
+ *
+ * If usize > ksize then user-space is using a newer version of the ABI,
+ * which part the kernel doesn't know about. Just ignore it - tooling can
+ * detect the kernel's knowledge of attributes from the attr->size value
+ * which is set to ksize in this case.
+ */
+ kattr->size = min(usize, ksize);
+
+ if (copy_to_user(uattr, kattr, kattr->size))
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @usize: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, usize, unsigned int, flags)
+{
+ struct sched_attr kattr = { };
+ struct task_struct *p;
+ int retval;
+
+ if (!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags)
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ kattr.sched_policy = p->policy;
+ if (p->sched_reset_on_fork)
+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
+
+#ifdef CONFIG_UCLAMP_TASK
+ /*
+ * This could race with another potential updater, but this is fine
+ * because it'll correctly read the old or the new value. We don't need
+ * to guarantee who wins the race as long as it doesn't return garbage.
+ */
+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
+#endif
+ }
+
+ return sched_attr_copy_to_user(uattr, &kattr, usize);
+}
+
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+{
+ /*
+ * If the task isn't a deadline task or admission control is
+ * disabled then we don't care about affinity changes.
+ */
+ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ return 0;
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+ guard(rcu)();
+ if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ return -EBUSY;
+
+ return 0;
+}
+#endif /* CONFIG_SMP */
+
+int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
+{
+ int retval;
+ cpumask_var_t cpus_allowed, new_mask;
+
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+ ctx->new_mask = new_mask;
+ ctx->flags |= SCA_CHECK;
+
+ retval = dl_task_check_affinity(p, new_mask);
+ if (retval)
+ goto out_free_new_mask;
+
+ retval = __set_cpus_allowed_ptr(p, ctx);
+ if (retval)
+ goto out_free_new_mask;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset update.
+ * Just reset the cpumask to the cpuset's cpus_allowed.
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+
+ /*
+ * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+ * will restore the previous user_cpus_ptr value.
+ *
+ * In the unlikely event a previous user_cpus_ptr exists,
+ * we need to further restrict the mask to what is allowed
+ * by that old user_cpus_ptr.
+ */
+ if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+ bool empty = !cpumask_and(new_mask, new_mask,
+ ctx->user_mask);
+
+ if (WARN_ON_ONCE(empty))
+ cpumask_copy(new_mask, cpus_allowed);
+ }
+ __set_cpus_allowed_ptr(p, ctx);
+ retval = -EINVAL;
+ }
+
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+ struct affinity_context ac;
+ struct cpumask *user_mask;
+ int retval;
+
+ CLASS(find_get_task, p)(pid);
+ if (!p)
+ return -ESRCH;
+
+ if (p->flags & PF_NO_SETAFFINITY)
+ return -EINVAL;
+
+ if (!check_same_owner(p)) {
+ guard(rcu)();
+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
+ return -EPERM;
+ }
+
+ retval = security_task_setscheduler(p);
+ if (retval)
+ return retval;
+
+ /*
+ * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
+ * alloc_user_cpus_ptr() returns NULL.
+ */
+ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
+ if (user_mask) {
+ cpumask_copy(user_mask, in_mask);
+ } else if (IS_ENABLED(CONFIG_SMP)) {
+ return -ENOMEM;
+ }
+
+ ac = (struct affinity_context){
+ .new_mask = in_mask,
+ .user_mask = user_mask,
+ .flags = SCA_USER,
+ };
+
+ retval = __sched_setaffinity(p, &ac);
+ kfree(ac.user_mask);
+
+ return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+ struct cpumask *new_mask)
+{
+ if (len < cpumask_size())
+ cpumask_clear(new_mask);
+ else if (len > cpumask_size())
+ len = cpumask_size();
+
+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new CPU mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ cpumask_var_t new_mask;
+ int retval;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval == 0)
+ retval = sched_setaffinity(pid, new_mask);
+ free_cpumask_var(new_mask);
+ return retval;
+}
+
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
+{
+ struct task_struct *p;
+ int retval;
+
+ guard(rcu)();
+ p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ guard(raw_spinlock_irqsave)(&p->pi_lock);
+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
+
+ return 0;
+}
+
+/**
+ * sys_sched_getaffinity - get the CPU affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current CPU mask
+ *
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+ unsigned long __user *, user_mask_ptr)
+{
+ int ret;
+ cpumask_var_t mask;
+
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ unsigned int retlen = min(len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
+ ret = -EFAULT;
+ else
+ ret = retlen;
+ }
+ free_cpumask_var(mask);
+
+ return ret;
+}
+
+static void do_sched_yield(void)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = this_rq_lock_irq(&rf);
+
+ schedstat_inc(rq->yld_count);
+ current->sched_class->yield_task(rq);
+
+ preempt_disable();
+ rq_unlock_irq(rq, &rf);
+ sched_preempt_enable_no_resched();
+
+ schedule();
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+ do_sched_yield();
+ return 0;
+}
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+ set_current_state(TASK_RUNNING);
+ do_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ * true (>0) if we indeed boosted the target task.
+ * false (0) if we failed to boost the target.
+ * -ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ int yielded = 0;
+
+ scoped_guard (irqsave) {
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (rq->nr_running == 1 && p_rq->nr_running == 1)
+ return -ESRCH;
+
+ guard(double_rq_lock)(rq, p_rq);
+ if (task_rq(p) != p_rq)
+ goto again;
+
+ if (!curr->sched_class->yield_to_task)
+ return 0;
+
+ if (curr->sched_class != p->sched_class)
+ return 0;
+
+ if (task_on_cpu(p_rq, p) || !task_is_running(p))
+ return 0;
+
+ yielded = curr->sched_class->yield_to_task(rq, p);
+ if (yielded) {
+ schedstat_inc(rq->yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity
+ * takes care of fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_curr(p_rq);
+ }
+ }
+
+ if (yielded)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = MAX_RT_PRIO-1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+ int ret = -EINVAL;
+
+ switch (policy) {
+ case SCHED_FIFO:
+ case SCHED_RR:
+ ret = 1;
+ break;
+ case SCHED_DEADLINE:
+ case SCHED_NORMAL:
+ case SCHED_BATCH:
+ case SCHED_IDLE:
+ ret = 0;
+ }
+ return ret;
+}
+
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
+{
+ unsigned int time_slice = 0;
+ int retval;
+
+ if (pid < 0)
+ return -EINVAL;
+
+ scoped_guard (rcu) {
+ struct task_struct *p = find_process_by_pid(pid);
+ if (!p)
+ return -ESRCH;
+
+ retval = security_task_getscheduler(p);
+ if (retval)
+ return retval;
+
+ scoped_guard (task_rq_lock, p) {
+ struct rq *rq = scope.rq;
+ if (p->sched_class->get_rr_interval)
+ time_slice = p->sched_class->get_rr_interval(rq, p);
+ }
+ }
+
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default time-slice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the time-slice value.
+ *
+ * this syscall writes the default time-slice value of a given process
+ * into the user-space timespec buffer. A value of '0' means infinity.
+ *
+ * Return: On success, 0 and the time-slice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+ struct __kernel_timespec __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_timespec64(&t, interval);
+
+ return retval;
+}
+
+#ifdef CONFIG_COMPAT_32BIT_TIME
+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
+ struct old_timespec32 __user *, interval)
+{
+ struct timespec64 t;
+ int retval = sched_rr_get_interval(pid, &t);
+
+ if (retval == 0)
+ retval = put_old_timespec32(&t, interval);
+ return retval;
+}
+#endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a6994a1fcc90..784a0be81e84 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -501,7 +501,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
cpumask_clear_cpu(rq->cpu, old_rd->span);
/*
- * If we dont want to free the old_rd yet then
+ * If we don't want to free the old_rd yet then
* set old_rd to NULL to skip the freeing later
* in this function:
*/
@@ -1176,7 +1176,7 @@ fail:
* uniquely identify each group (for a given domain):
*
* - The first is the balance_cpu (see should_we_balance() and the
- * load-balance blub in fair.c); for each group we only want 1 CPU to
+ * load-balance blurb in fair.c); for each group we only want 1 CPU to
* continue balancing at a higher domain.
*
* - The second is the sched_group_capacity; we want all identical groups
@@ -1388,7 +1388,7 @@ static inline void asym_cpu_capacity_update_data(int cpu)
/*
* Search if capacity already exits. If not, track which the entry
- * where we should insert to keep the list ordered descendingly.
+ * where we should insert to keep the list ordered descending.
*/
list_for_each_entry(entry, &asym_cap_list, link) {
if (capacity == entry->capacity)
@@ -1853,7 +1853,7 @@ void sched_init_numa(int offline_node)
struct cpumask ***masks;
/*
- * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+ * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
*/
distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
@@ -2750,7 +2750,7 @@ match2:
}
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
- /* Build perf. domains: */
+ /* Build perf domains: */
for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < n && !sched_energy_update; j++) {
if (cpumask_equal(doms_new[i], doms_cur[j]) &&
@@ -2759,7 +2759,7 @@ match2:
goto match3;
}
}
- /* No match - add perf. domains for a new rd */
+ /* No match - add perf domains for a new rd */
has_eas |= build_perf_domains(doms_new[i]);
match3:
;
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 0b1cd985dc27..134d7112ef71 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -33,7 +33,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
EXPORT_SYMBOL(wake_bit_function);
/*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * To allow interruptible waiting and asynchronous (i.e. non-blocking)
* waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
* permitted return codes. Nonzero return codes halt waiting and return.
*/
@@ -133,7 +133,7 @@ EXPORT_SYMBOL(__wake_up_bit);
* @bit: the bit of the word being waited on
*
* There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that wakes up waiters
+ * is the part of the hash-table's accessor API that wakes up waiters
* on a bit. For instance, if one were to have waiters on a bitflag,
* one would call wake_up_bit() after clearing the bit.
*