author     Linus Torvalds <torvalds@linux-foundation.org>  2024-03-11 18:45:16 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-03-11 18:45:16 -0700
commit     89c572e2f30c4555da59c4d66dff62f30fb057fd
tree       9bca20f01b8ee0da4db42d85000eaa51fd0f2023
parent     a5b1a017cb76e4898dd62fcb97e8aee6a63b33b5
parent     54de442747037485da1fc4eca9636287a61e97e3
Merge tag 'sched-core-2024-03-11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Fix inconsistency in misfit task load-balancing
- Fix CPU isolation bugs in the task-wakeup logic
- Rework and unify the sched_use_asym_prio() and sched_asym_prefer()
logic
- Clean up and simplify ->avg_* accesses
- Misc cleanups and fixes
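
The asym-packing rework in the third item above boils down to two small helpers in kernel/sched/fair.c. The following is a condensed excerpt of the merged code (taken from the fair.c hunks in the diff below; it is kernel-internal code, not a standalone program, and relies on helpers such as sched_smt_active(), is_core_idle() and sched_asym_prefer()):

static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
{
	/* The SD_ASYM_PACKING check moved here so callers no longer open-code it. */
	if (!(sd->flags & SD_ASYM_PACKING))
		return false;

	/* Without SMT, CPU priority alone decides. */
	if (!sched_smt_active())
		return true;

	/* With SMT, only an SMT-level domain or a fully idle core may pull. */
	return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
}

static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
{
	/* @dst_cpu pulls only if it can do asym packing and has higher priority. */
	return sched_use_asym_prio(sd, dst_cpu) &&
	       sched_asym_prefer(dst_cpu, src_cpu);
}

Callers in find_busiest_queue(), nohz_balancer_kick() and the group-statistics path now go through sched_asym() / sched_group_asym() instead of repeating the SD_ASYM_PACKING and priority checks.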
* tag 'sched-core-2024-03-11' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/topology: Rename SD_SHARE_PKG_RESOURCES to SD_SHARE_LLC
sched/fair: Check the SD_ASYM_PACKING flag in sched_use_asym_prio()
sched/fair: Rework sched_use_asym_prio() and sched_asym_prefer()
sched/fair: Remove unused parameter from sched_asym()
sched/topology: Remove duplicate descriptions from TOPOLOGY_SD_FLAGS
sched/fair: Simplify the update_sd_pick_busiest() logic
sched/fair: Do strict inequality check for busiest misfit task group
sched/fair: Remove unnecessary goto in update_sd_lb_stats()
sched/fair: Take the scheduling domain into account in select_idle_core()
sched/fair: Take the scheduling domain into account in select_idle_smt()
sched/fair: Add READ_ONCE() and use existing helper function to access ->avg_irq
sched/fair: Use existing helper functions to access ->avg_rt and ->avg_dl
sched/core: Simplify code by removing duplicate #ifdefs
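
For the ->avg_* accessor cleanup (the two accessor commits above plus the sched.h change), the effect is easiest to see in others_have_blocked(). This is a condensed excerpt of the merged code from the fair.c and sched.h hunks below, again kernel-internal code rather than a standalone example:

static inline bool others_have_blocked(struct rq *rq)
{
	if (cpu_util_rt(rq))		/* was READ_ONCE(rq->avg_rt.util_avg) */
		return true;

	if (cpu_util_dl(rq))		/* was READ_ONCE(rq->avg_dl.util_avg) */
		return true;

	if (thermal_load_avg(rq))
		return true;

	/*
	 * cpu_util_irq() already returns 0 when CONFIG_HAVE_SCHED_AVG_IRQ is
	 * off, so the #ifdef that used to guard this check is gone; the helper
	 * itself now wraps the read in READ_ONCE() (see the sched.h hunk).
	 */
	if (cpu_util_irq(rq))
		return true;

	return false;
}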
-rw-r--r--  arch/powerpc/kernel/smp.c       |   6
-rw-r--r--  include/linux/sched/sd_flags.h  |   4
-rw-r--r--  include/linux/sched/topology.h  |   6
-rw-r--r--  kernel/sched/core.c             |   4
-rw-r--r--  kernel/sched/fair.c             | 110
-rw-r--r--  kernel/sched/sched.h            |   2
-rw-r--r--  kernel/sched/topology.c         |  35
7 files changed, 74 insertions, 93 deletions
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 693334c20d07..a60e4139214b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -984,7 +984,7 @@ static bool shared_caches __ro_after_init;
 /* cpumask of CPUs with asymmetric SMT dependency */
 static int powerpc_smt_flags(void)
 {
-	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;

 	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
 		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
@@ -1010,9 +1010,9 @@ static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack);
 static int powerpc_shared_cache_flags(void)
 {
 	if (static_branch_unlikely(&splpar_asym_pack))
-		return SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
+		return SD_SHARE_LLC | SD_ASYM_PACKING;

-	return SD_SHARE_PKG_RESOURCES;
+	return SD_SHARE_LLC;
 }

 static int powerpc_shared_proc_flags(void)
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index a8b28647aafc..b04a5d04dee9 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -117,13 +117,13 @@ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)

 /*
- * Domain members share CPU package resources (i.e. caches)
+ * Domain members share CPU Last Level Caches
  *
  * SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
  *               the same cache(s).
  * NEEDS_GROUPS: Caches are shared between groups.
  */
-SD_FLAG(SD_SHARE_PKG_RESOURCES, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_SHARE_LLC, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)

 /*
  * Only a single load balancing instance
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 11e0e00e0bb9..18572c9ea724 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -38,21 +38,21 @@ extern const struct sd_flag_debug sd_flag_debug[];
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
 {
-	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+	return SD_SHARE_CPUCAPACITY | SD_SHARE_LLC;
 }
 #endif

 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
+	return SD_CLUSTER | SD_SHARE_LLC;
 }
 #endif

 #ifdef CONFIG_SCHED_MC
 static inline int cpu_core_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_SHARE_LLC;
 }
 #endif

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 540f229700b6..d44efa0d0611 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1792,7 +1792,6 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css);
 #endif

 #ifdef CONFIG_SYSCTL
-#ifdef CONFIG_UCLAMP_TASK
 #ifdef CONFIG_UCLAMP_TASK_GROUP
 static void uclamp_update_root_tg(void)
 {
@@ -1898,7 +1897,6 @@ undo:
 	return result;
 }
 #endif
-#endif

 static int uclamp_validate(struct task_struct *p,
 			   const struct sched_attr *attr)
@@ -2065,7 +2063,7 @@ static void __init init_uclamp(void)
 	}
 }

-#else /* CONFIG_UCLAMP_TASK */
+#else /* !CONFIG_UCLAMP_TASK */
 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
 static inline int uclamp_validate(struct task_struct *p,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 533547e3c90a..6a16129f9a5c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7289,7 +7289,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
 		if (!available_idle_cpu(cpu)) {
 			idle = false;
 			if (*idle_cpu == -1) {
-				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
+				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
 					*idle_cpu = cpu;
 					break;
 				}
@@ -7297,7 +7297,7 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
 			}
 			break;
 		}
-		if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
+		if (*idle_cpu == -1 && cpumask_test_cpu(cpu, cpus))
 			*idle_cpu = cpu;
 	}

@@ -7311,13 +7311,19 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
 /*
  * Scan the local SMT mask for idle CPUs.
  */
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
 	int cpu;

 	for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
 		if (cpu == target)
 			continue;
+		/*
+		 * Check if the CPU is in the LLC scheduling domain of @target.
+		 * Due to isolcpus, there is no guarantee that all the siblings are in the domain.
+		 */
+		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+			continue;
 		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
 			return cpu;
 	}
@@ -7341,7 +7347,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
 	return __select_idle_cpu(core, p);
 }

-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
 {
 	return -1;
 }
@@ -7591,7 +7597,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		has_idle_core = test_idle_cores(target);

 		if (!has_idle_core && cpus_share_cache(prev, target)) {
-			i = select_idle_smt(p, prev);
+			i = select_idle_smt(p, sd, prev);
 			if ((unsigned int)i < nr_cpumask_bits)
 				return i;
 		}
@@ -9237,19 +9243,17 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)

 static inline bool others_have_blocked(struct rq *rq)
 {
-	if (READ_ONCE(rq->avg_rt.util_avg))
+	if (cpu_util_rt(rq))
 		return true;

-	if (READ_ONCE(rq->avg_dl.util_avg))
+	if (cpu_util_dl(rq))
 		return true;

 	if (thermal_load_avg(rq))
 		return true;

-#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
-	if (READ_ONCE(rq->avg_irq.util_avg))
+	if (cpu_util_irq(rq))
 		return true;
-#endif

 	return false;
 }
@@ -9506,8 +9510,8 @@ static unsigned long scale_rt_capacity(int cpu)
 	 * avg_thermal.load_avg tracks thermal pressure and the weighted
 	 * average uses the actual delta max capacity(load).
 	 */
-	used = READ_ONCE(rq->avg_rt.util_avg);
-	used += READ_ONCE(rq->avg_dl.util_avg);
+	used = cpu_util_rt(rq);
+	used += cpu_util_dl(rq);
 	used += thermal_load_avg(rq);

 	if (unlikely(used >= max))
@@ -9740,51 +9744,49 @@ group_type group_classify(unsigned int imbalance_pct,
  */
 static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
 {
+	if (!(sd->flags & SD_ASYM_PACKING))
+		return false;
+
 	if (!sched_smt_active())
 		return true;

 	return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
 }

+static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu)
+{
+	/*
+	 * First check if @dst_cpu can do asym_packing load balance. Only do it
+	 * if it has higher priority than @src_cpu.
+	 */
+	return sched_use_asym_prio(sd, dst_cpu) &&
+	       sched_asym_prefer(dst_cpu, src_cpu);
+}
+
 /**
- * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * sched_group_asym - Check if the destination CPU can do asym_packing balance
  * @env: The load balancing environment
- * @sds: Load-balancing data with statistics of the local group
  * @sgs: Load-balancing statistics of the candidate busiest group
  * @group: The candidate busiest group
  *
  * @env::dst_cpu can do asym_packing if it has higher priority than the
  * preferred CPU of @group.
  *
- * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
- * can do asym_packing balance only if all its SMT siblings are idle. Also, it
- * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
- * imbalances in the number of CPUS are dealt with in find_busiest_group().
- *
- * If we are balancing load within an SMT core, or at PKG domain level, always
- * proceed.
- *
  * Return: true if @env::dst_cpu can do with asym_packing load balance. False
  * otherwise.
  */
 static inline bool
-sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
-	   struct sched_group *group)
+sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group)
 {
-	/* Ensure that the whole local core is idle, if applicable. */
-	if (!sched_use_asym_prio(env->sd, env->dst_cpu))
-		return false;
-
 	/*
-	 * CPU priorities does not make sense for SMT cores with more than one
+	 * CPU priorities do not make sense for SMT cores with more than one
 	 * busy sibling.
 	 */
-	if (group->flags & SD_SHARE_CPUCAPACITY) {
-		if (sgs->group_weight - sgs->idle_cpus != 1)
-			return false;
-	}
+	if ((group->flags & SD_SHARE_CPUCAPACITY) &&
+	    (sgs->group_weight - sgs->idle_cpus != 1))
+		return false;

-	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
+	return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
 }

 /* One group has more than one SMT CPU while the other group does not */
@@ -9938,11 +9940,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	sgs->group_weight = group->group_weight;

 	/* Check if dst CPU is idle and preferred to this group */
-	if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
-	    env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
-	    sched_asym(env, sds, sgs, group)) {
+	if (!local_group && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
+	    sched_group_asym(env, sgs, group))
 		sgs->group_asym_packing = 1;
-	}

 	/* Check for loaded SMT group to be balanced to dst CPU */
 	if (!local_group && smt_balance(env, sgs, group))
@@ -10006,9 +10006,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	switch (sgs->group_type) {
 	case group_overloaded:
 		/* Select the overloaded group with highest avg_load. */
-		if (sgs->avg_load <= busiest->avg_load)
-			return false;
-		break;
+		return sgs->avg_load > busiest->avg_load;

 	case group_imbalanced:
 		/*
@@ -10019,18 +10017,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,

 	case group_asym_packing:
 		/* Prefer to move from lowest priority CPU's work */
-		if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
-			return false;
-		break;
+		return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);

 	case group_misfit_task:
 		/*
 		 * If we have more than one misfit sg go with the biggest
 		 * misfit.
 		 */
-		if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
-			return false;
-		break;
+		return sgs->group_misfit_task_load > busiest->group_misfit_task_load;

 	case group_smt_balance:
 		/*
@@ -10182,10 +10176,8 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
 	 * be computed and tested before calling idle_cpu_without().
 	 */
-#ifdef CONFIG_SMP
 	if (rq->ttwu_pending)
 		return 0;
-#endif

 	return 1;
 }
@@ -10578,16 +10570,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		update_sg_lb_stats(env, sds, sg, sgs, &sg_status);

-		if (local_group)
-			goto next_group;
-
-
-		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
+		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
 			sds->busiest_stat = *sgs;
 		}

-next_group:
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;
@@ -10691,7 +10678,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 */
 	if (local->group_type == group_has_spare) {
 		if ((busiest->group_type > group_fully_busy) &&
-		    !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
+		    !(env->sd->flags & SD_SHARE_LLC)) {
 			/*
 			 * If busiest is overloaded, try to fill spare
 			 * capacity. This might end up creating spare capacity
@@ -11038,10 +11025,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 * If balancing between cores, let lower priority CPUs help
 		 * SMT cores with more than one busy sibling.
 		 */
-		if ((env->sd->flags & SD_ASYM_PACKING) &&
-		    sched_use_asym_prio(env->sd, i) &&
-		    sched_asym_prefer(i, env->dst_cpu) &&
-		    nr_running == 1)
+		if (sched_asym(env->sd, i, env->dst_cpu) && nr_running == 1)
 			continue;

 		switch (env->migration_type) {
@@ -11137,8 +11121,7 @@ asym_active_balance(struct lb_env *env)
 	 * the lower priority @env::dst_cpu help it. Do not follow
 	 * CPU priority.
 	 */
-	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
-	       sched_use_asym_prio(env->sd, env->dst_cpu) &&
+	return env->idle != CPU_NOT_IDLE && sched_use_asym_prio(env->sd, env->dst_cpu) &&
 	       (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
 		!sched_use_asym_prio(env->sd, env->src_cpu));
 }
@@ -11910,8 +11893,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * preferred CPU must be idle.
 		 */
 		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
-			if (sched_use_asym_prio(sd, i) &&
-			    sched_asym_prefer(i, cpu)) {
+			if (sched_asym(sd, i, cpu)) {
 				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 				goto unlock;
 			}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001fe047bd5d..d2242679239e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3136,7 +3136,7 @@ static inline bool uclamp_rq_is_idle(struct rq *rq)
 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 static inline unsigned long cpu_util_irq(struct rq *rq)
 {
-	return rq->avg_irq.util_avg;
+	return READ_ONCE(rq->avg_irq.util_avg);
 }

 static inline
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 10d1391e7416..99ea5986038c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -657,13 +657,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
 }

 /*
- * Keep a special pointer to the highest sched_domain that has
- * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
- * allows us to avoid some pointer chasing select_idle_sibling().
+ * Keep a special pointer to the highest sched_domain that has SD_SHARE_LLC set
+ * (Last Level Cache Domain) for this allows us to avoid some pointer chasing
+ * select_idle_sibling().
  *
- * Also keep a unique ID per domain (we use the first CPU number in
- * the cpumask of the domain), this allows us to quickly tell if
- * two CPUs are in the same cache domain, see cpus_share_cache().
+ * Also keep a unique ID per domain (we use the first CPU number in the cpumask
+ * of the domain), this allows us to quickly tell if two CPUs are in the same
+ * cache domain, see cpus_share_cache().
  */
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
@@ -684,7 +684,7 @@ static void update_top_cache_domain(int cpu)
 	int id = cpu;
 	int size = 1;

-	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+	sd = highest_flag_domain(cpu, SD_SHARE_LLC);
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
@@ -1551,11 +1551,12 @@ static struct cpumask ***sched_domains_numa_masks;
  *
  * These flags are purely descriptive of the topology and do not prescribe
  * behaviour. Behaviour is artificial and mapped in the below sd_init()
- * function:
+ * function. For details, see include/linux/sched/sd_flags.h.
  *
- * SD_SHARE_CPUCAPACITY - describes SMT topologies
- * SD_SHARE_PKG_RESOURCES - describes shared caches
- * SD_NUMA - describes NUMA topologies
+ * SD_SHARE_CPUCAPACITY
+ * SD_SHARE_LLC
+ * SD_CLUSTER
+ * SD_NUMA
  *
  * Odd one out, which beside describing the topology has a quirk also
  * prescribes the desired behaviour that goes along with it:
@@ -1565,7 +1566,7 @@ static struct cpumask ***sched_domains_numa_masks;
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
 	 SD_CLUSTER		|	\
-	 SD_SHARE_PKG_RESOURCES	|	\
+	 SD_SHARE_LLC		|	\
 	 SD_NUMA		|	\
 	 SD_ASYM_PACKING)

@@ -1608,7 +1609,7 @@ sd_init(struct sched_domain_topology_level *tl,
 					| 0*SD_BALANCE_WAKE
 					| 1*SD_WAKE_AFFINE
 					| 0*SD_SHARE_CPUCAPACITY
-					| 0*SD_SHARE_PKG_RESOURCES
+					| 0*SD_SHARE_LLC
 					| 0*SD_SERIALIZE
 					| 1*SD_PREFER_SIBLING
 					| 0*SD_NUMA
@@ -1645,7 +1646,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->imbalance_pct = 110;

-	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+	} else if (sd->flags & SD_SHARE_LLC) {
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;

@@ -1670,7 +1671,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	 * For all levels sharing cache; connect a sched_domain_shared
 	 * instance.
 	 */
-	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+	if (sd->flags & SD_SHARE_LLC) {
 		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
 		atomic_inc(&sd->shared->ref);
 		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
@@ -2445,8 +2446,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			struct sched_domain *child = sd->child;

-			if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
-			    (child->flags & SD_SHARE_PKG_RESOURCES)) {
+			if (!(sd->flags & SD_SHARE_LLC) && child &&
+			    (child->flags & SD_SHARE_LLC)) {
 				struct sched_domain __rcu *top_p;
 				unsigned int nr_llcs;
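
One clarifying note on the CPU-isolation fix: with isolcpus=, the SMT siblings of a CPU are not guaranteed to sit in its LLC scheduling domain, so select_idle_smt() now takes the domain and skips siblings outside its span. The following is condensed from the fair.c hunk above (kernel-internal code, reproduced only to pull the change together in one place):

static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
	int cpu;

	for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) {
		if (cpu == target)
			continue;
		/* Skip siblings that isolcpus left outside the LLC domain. */
		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
			continue;
		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
			return cpu;
	}

	return -1;
}

The caller, select_idle_sibling(), passes its LLC domain pointer along with the previous CPU: i = select_idle_smt(p, sd, prev).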