author     Linus Torvalds <torvalds@linux-foundation.org>  2024-05-19 11:38:15 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-05-19 11:38:15 -0700
commit     8dde191aabba42e9c16c8d9c853a72a062db27ee (patch)
tree       f6925e9996b8115ee7fab3a62d604b606af5a374
parent     fe0d43f23110ec80aea1f94eeb1e6ddab9ef453f (diff)
parent     49217ea147df7647cb89161b805c797487783fc0 (diff)
Merge tag 'sched-urgent-2024-05-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
- Fix a sched_balance_newidle setting bug
- Fix bug in the setting of /sys/fs/cgroup/test/cpu.max.burst
- Fix variable-shadowing build warning
- Extend sched-domains debug output
- Fix documentation
- Fix comments
* tag 'sched-urgent-2024-05-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/core: Fix incorrect initialization of the 'burst' parameter in cpu_max_write()
sched/fair: Remove stale FREQUENCY_UTIL comment
sched/fair: Fix initial util_avg calculation
docs: cgroup-v1: Clarify that domain levels are system-specific
sched/debug: Dump domains' level
sched/fair: Allow disabling sched_balance_newidle with sched_relax_domain_level
arch/topology: Fix variable naming to avoid shadowing
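
Of the fixes above, the cpu.max.burst one is easiest to see with the units made explicit: judging from the core.c hunk below, tg_get_cfs_burst() reports the burst scaled down to microseconds, while cpu_max_write() needs the raw nanosecond value it later stores back, so each write to cpu.max would silently shrink a previously configured burst by a factor of 1000. A standalone C model of that mismatch (simplified names and an assumed ns-to-us conversion; a sketch, not the kernel code itself):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

/* Toy stand-in for struct task_group: burst is stored in nanoseconds. */
struct task_group {
	uint64_t burst_ns;
};

/* Toy stand-in for tg_get_cfs_burst(): reports the burst in microseconds. */
static uint64_t tg_get_cfs_burst_us(const struct task_group *tg)
{
	return tg->burst_ns / NSEC_PER_USEC;
}

int main(void)
{
	struct task_group tg = { .burst_ns = 1000000 }; /* 1 ms burst */

	/* Buggy pattern: read via the us-scaled getter, store back as ns. */
	tg.burst_ns = tg_get_cfs_burst_us(&tg);
	printf("after buggy rewrite: %llu ns\n",
	       (unsigned long long)tg.burst_ns); /* 1000 ns, i.e. 1 us */

	/* Fixed pattern: read the raw nanosecond value directly, as the
	 * patched cpu_max_write() does with tg->cfs_bandwidth.burst. */
	tg.burst_ns = 1000000;
	uint64_t burst = tg.burst_ns;
	tg.burst_ns = burst;
	printf("after fixed rewrite: %llu ns\n",
	       (unsigned long long)tg.burst_ns); /* still 1000000 ns */

	return 0;
}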
 Documentation/admin-guide/cgroup-v1/cpusets.rst |  7 ++++++-
 drivers/base/arch_topology.c                    |  8 ++++----
 kernel/cgroup/cpuset.c                          |  2 +-
 kernel/sched/core.c                             |  2 +-
 kernel/sched/debug.c                            |  1 +
 kernel/sched/fair.c                             |  9 +++++----
 kernel/sched/topology.c                         |  2 +-
 7 files changed, 19 insertions(+), 12 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v1/cpusets.rst b/Documentation/admin-guide/cgroup-v1/cpusets.rst
index 7d3415eea05d..f401af5e2f09 100644
--- a/Documentation/admin-guide/cgroup-v1/cpusets.rst
+++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst
@@ -568,7 +568,7 @@ on the next tick. For some applications in special situation, waiting
 
 The 'cpuset.sched_relax_domain_level' file allows you to request changing
 this searching range as you like.  This file takes int value which
-indicates size of searching range in levels ideally as follows,
+indicates size of searching range in levels approximately as follows,
 otherwise initial value -1 that indicates the cpuset has no request.
 
 ====== ===========================================================
@@ -581,6 +581,11 @@ otherwise initial value -1 that indicates the cpuset has no request.
   5    search system wide [on NUMA system]
 ====== ===========================================================
 
+Not all levels can be present and values can change depending on the
+system architecture and kernel configuration. Check
+/sys/kernel/debug/sched/domains/cpu*/domain*/ for system-specific
+details.
+
 The system default is architecture dependent.  The system default
 can be changed using the relax_domain_level= boot parameter.
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 0248912ff687..c66d070207a0 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -179,7 +179,7 @@ DEFINE_PER_CPU(unsigned long, hw_pressure);
 void topology_update_hw_pressure(const struct cpumask *cpus,
 				 unsigned long capped_freq)
 {
-	unsigned long max_capacity, capacity, hw_pressure;
+	unsigned long max_capacity, capacity, pressure;
 	u32 max_freq;
 	int cpu;
 
@@ -196,12 +196,12 @@ void topology_update_hw_pressure(const struct cpumask *cpus,
 	else
 		capacity = mult_frac(max_capacity, capped_freq, max_freq);
 
-	hw_pressure = max_capacity - capacity;
+	pressure = max_capacity - capacity;
 
-	trace_hw_pressure_update(cpu, hw_pressure);
+	trace_hw_pressure_update(cpu, pressure);
 
 	for_each_cpu(cpu, cpus)
-		WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure);
+		WRITE_ONCE(per_cpu(hw_pressure, cpu), pressure);
 }
 EXPORT_SYMBOL_GPL(topology_update_hw_pressure);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a10e4bd0c0c1..c12b9fdb22a4 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2941,7 +2941,7 @@ bool current_cpuset_is_being_rebound(void)
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
-	if (val < -1 || val >= sched_domain_level_max)
+	if (val < -1 || val > sched_domain_level_max + 1)
 		return -EINVAL;
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 373eaeaf63b8..bcf2c4cc0522 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -11401,7 +11401,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
 {
 	struct task_group *tg = css_tg(of_css(of));
 	u64 period = tg_get_cfs_period(tg);
-	u64 burst = tg_get_cfs_burst(tg);
+	u64 burst = tg->cfs_bandwidth.burst;
 	u64 quota;
 	int ret;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8d5d98a5834d..c1eb9a1afd13 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -425,6 +425,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent)
 
 	debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
 	debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
+	debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level);
 }
 
 void update_sched_domain_debugfs(void)
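
The "level" file created by the debug.c hunk is what the updated cpusets.rst text points users at: the set of domain levels is system-specific, so it is now exported per domain. A small userspace sketch of how one might read it (a hypothetical helper, not part of the patch; assumes CONFIG_SCHED_DEBUG and debugfs mounted at /sys/kernel/debug):

#include <stdio.h>
#include <glob.h>

int main(void)
{
	glob_t g;
	size_t i;

	/* Path layout taken from the documentation hunk above. */
	if (glob("/sys/kernel/debug/sched/domains/cpu0/domain*/level",
		 0, NULL, &g) != 0) {
		fprintf(stderr, "no sched_domain debug files found\n");
		return 1;
	}
	for (i = 0; i < g.gl_pathc; i++) {
		FILE *f = fopen(g.gl_pathv[i], "r");
		int level;

		if (f && fscanf(f, "%d", &level) == 1)
			printf("%s: %d\n", g.gl_pathv[i], level);
		if (f)
			fclose(f);
	}
	globfree(&g);
	return 0;
}

The values it prints are the ones a cpuset.sched_relax_domain_level request is compared against in the topology.c hunk below.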
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4214df32ba45..8a5b1ae0aa55 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1030,7 +1030,8 @@ void init_entity_runnable_average(struct sched_entity *se)
  * With new tasks being created, their initial util_avgs are extrapolated
  * based on the cfs_rq's current util_avg:
  *
- *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *   util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1)
+ *		* se_weight(se)
  *
  * However, in many cases, the above util_avg does not give a desired
  * value. Moreover, the sum of the util_avgs may be divergent, such
@@ -1077,7 +1078,7 @@ void post_init_entity_util_avg(struct task_struct *p)
 
 	if (cap > 0) {
 		if (cfs_rq->avg.util_avg != 0) {
-			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
+			sa->util_avg  = cfs_rq->avg.util_avg * se_weight(se);
 			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
 
 			if (sa->util_avg > cap)
@@ -7898,8 +7899,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
 		 * Performance domain frequency: utilization clamping
 		 * must be considered since it affects the selection
 		 * of the performance domain frequency.
-		 * NOTE: in case RT tasks are running, by default the
-		 * FREQUENCY_UTIL's utilization can be max OPP.
+		 * NOTE: in case RT tasks are running, by default the min
+		 * utilization can be max OPP.
 		 */
 		eff_util = effective_cpu_util(cpu, util, &min, &max);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 683559831656..329c82faca9b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1474,7 +1474,7 @@ static void set_domain_attribute(struct sched_domain *sd,
 	} else
 		request = attr->relax_domain_level;
 
-	if (sd->level > request) {
+	if (sd->level >= request) {
 		/* Turn off idle balance on this domain: */
 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
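
Why the fair.c change matters: se->load.weight appears to hold the fixed-point scaled weight on 64-bit kernels (nice-0 is 1024 << 10 there), while cfs_rq->avg.load_avg is tracked against unscaled weights, so the old expression would overshoot by roughly 1024x and then be clamped to the capacity cap regardless of actual runqueue utilization; se_weight(se) keeps the units consistent. A worked example of the corrected estimate with made-up numbers (a sketch, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long cfs_util_avg = 200;  /* assumed cfs_rq->avg.util_avg */
	unsigned long cfs_load_avg = 1023; /* assumed cfs_rq->avg.load_avg */
	unsigned long weight = 1024;       /* se_weight() of a nice-0 task */
	unsigned long cap = 512;           /* assumed remaining capacity */
	unsigned long util_avg;

	/* Multiply before dividing, as post_init_entity_util_avg() does,
	 * so integer division does not discard the estimate. */
	util_avg = cfs_util_avg * weight / (cfs_load_avg + 1);
	if (util_avg > cap)
		util_avg = cap;

	printf("initial util_avg = %lu\n", util_avg); /* prints 200 */
	return 0;
}

With the old scaled weight, the same inputs would give 200 * 1048576 / 1024 = 204800, which always hits the clamp and starts every new task at the cap.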