summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/core.c57
-rw-r--r--kernel/sched/debug.c1
-rw-r--r--kernel/sched/fair.c176
-rw-r--r--kernel/sched/psi.c2
-rw-r--r--kernel/sched/sched.h3
-rw-r--r--kernel/sched/topology.c12
6 files changed, 237 insertions, 14 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c52c2eba7c73..83e36547af17 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7383,6 +7383,19 @@ struct task_struct *idle_task(int cpu)
return cpu_rq(cpu)->idle;
}
+#ifdef CONFIG_SCHED_CORE
+int sched_core_idle_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (sched_core_enabled(rq) && rq->curr == rq->idle)
+ return 1;
+
+ return idle_cpu(cpu);
+}
+
+#endif
+
#ifdef CONFIG_SMP
/*
* This function computes an effective utilization for the given CPU, to be
@@ -11139,6 +11152,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
return 0;
}
+
+static u64 throttled_time_self(struct task_group *tg)
+{
+ int i;
+ u64 total = 0;
+
+ for_each_possible_cpu(i) {
+ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time);
+ }
+
+ return total;
+}
+
+static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
+{
+ struct task_group *tg = css_tg(seq_css(sf));
+
+ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg));
+
+ return 0;
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -11215,6 +11249,10 @@ static struct cftype cpu_legacy_files[] = {
.name = "stat",
.seq_show = cpu_cfs_stat_show,
},
+ {
+ .name = "stat.local",
+ .seq_show = cpu_cfs_local_stat_show,
+ },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
@@ -11271,6 +11309,24 @@ static int cpu_extra_stat_show(struct seq_file *sf,
return 0;
}
+static int cpu_local_stat_show(struct seq_file *sf,
+ struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ struct task_group *tg = css_tg(css);
+ u64 throttled_self_usec;
+
+ throttled_self_usec = throttled_time_self(tg);
+ do_div(throttled_self_usec, NSEC_PER_USEC);
+
+ seq_printf(sf, "throttled_usec %llu\n",
+ throttled_self_usec);
+ }
+#endif
+ return 0;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
@@ -11449,6 +11505,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
+ .css_local_stat_show = cpu_local_stat_show,
#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 066ff1c8ae4e..aeeba46a096b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -427,6 +427,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent)
#undef SDM
debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
+ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops);
}
void update_sched_domain_debugfs(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3e25be58e2b..d3df5b1642a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4787,6 +4787,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
static inline bool cfs_bandwidth_used(void);
@@ -4873,8 +4874,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (cfs_rq->nr_running == 1) {
check_enqueue_throttle(cfs_rq);
- if (!throttled_hierarchy(cfs_rq))
+ if (!throttled_hierarchy(cfs_rq)) {
list_add_leaf_cfs_rq(cfs_rq);
+ } else {
+#ifdef CONFIG_CFS_BANDWIDTH
+ struct rq *rq = rq_of(cfs_rq);
+
+ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock)
+ cfs_rq->throttled_clock = rq_clock(rq);
+ if (!cfs_rq->throttled_clock_self)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
+#endif
+ }
}
}
@@ -5377,6 +5388,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
/* Add cfs_rq with load or one or more already running entities to the list */
if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
+
+ if (cfs_rq->throttled_clock_self) {
+ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self;
+
+ cfs_rq->throttled_clock_self = 0;
+
+ if (SCHED_WARN_ON((s64)delta < 0))
+ delta = 0;
+
+ cfs_rq->throttled_clock_self_time += delta;
+ }
}
return 0;
@@ -5391,6 +5413,10 @@ static int tg_throttle_down(struct task_group *tg, void *data)
if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
+
+ SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
@@ -5480,7 +5506,9 @@ done:
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
+ SCHED_WARN_ON(cfs_rq->throttled_clock);
+ if (cfs_rq->nr_running)
+ cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -5498,7 +5526,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
update_rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ if (cfs_rq->throttled_clock) {
+ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
+ cfs_rq->throttled_clock = 0;
+ }
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
@@ -7065,7 +7096,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
- for_each_cpu_wrap(cpu, cpus, target + 1) {
+ for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
@@ -8416,6 +8447,11 @@ enum group_type {
*/
group_misfit_task,
/*
+ * Balance SMT group that's fully busy. Can benefit from migration
+ * a task on SMT with busy sibling to another CPU on idle core.
+ */
+ group_smt_balance,
+ /*
* SD_ASYM_PACKING only: One local CPU with higher capacity is available,
* and the task should be migrated to it instead of running on the
* current CPU.
@@ -9123,6 +9159,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
+ unsigned int group_smt_balance; /* Task on busy SMT be moved */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
@@ -9396,6 +9433,9 @@ group_type group_classify(unsigned int imbalance_pct,
if (sgs->group_asym_packing)
return group_asym_packing;
+ if (sgs->group_smt_balance)
+ return group_smt_balance;
+
if (sgs->group_misfit_task_load)
return group_misfit_task;
@@ -9465,6 +9505,71 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
+/* One group has more than one SMT CPU while the other group does not */
+static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
+ struct sched_group *sg2)
+{
+ if (!sg1 || !sg2)
+ return false;
+
+ return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+ (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
+
+static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs,
+ struct sched_group *group)
+{
+ if (env->idle == CPU_NOT_IDLE)
+ return false;
+
+ /*
+ * For SMT source group, it is better to move a task
+ * to a CPU that doesn't have multiple tasks sharing its CPU capacity.
+ * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY
+ * will not be on.
+ */
+ if (group->flags & SD_SHARE_CPUCAPACITY &&
+ sgs->sum_h_nr_running > 1)
+ return true;
+
+ return false;
+}
+
+static inline long sibling_imbalance(struct lb_env *env,
+ struct sd_lb_stats *sds,
+ struct sg_lb_stats *busiest,
+ struct sg_lb_stats *local)
+{
+ int ncores_busiest, ncores_local;
+ long imbalance;
+
+ if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running)
+ return 0;
+
+ ncores_busiest = sds->busiest->cores;
+ ncores_local = sds->local->cores;
+
+ if (ncores_busiest == ncores_local) {
+ imbalance = busiest->sum_nr_running;
+ lsub_positive(&imbalance, local->sum_nr_running);
+ return imbalance;
+ }
+
+ /* Balance such that nr_running/ncores ratio are same on both groups */
+ imbalance = ncores_local * busiest->sum_nr_running;
+ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running);
+ /* Normalize imbalance and do rounding on normalization */
+ imbalance = 2 * imbalance + ncores_local + ncores_busiest;
+ imbalance /= ncores_local + ncores_busiest;
+
+ /* Take advantage of resource in an empty sched group */
+ if (imbalance == 0 && local->sum_nr_running == 0 &&
+ busiest->sum_nr_running > 1)
+ imbalance = 2;
+
+ return imbalance;
+}
+
static inline bool
sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
{
@@ -9557,6 +9662,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_asym_packing = 1;
}
+ /* Check for loaded SMT group to be balanced to dst CPU */
+ if (!local_group && smt_balance(env, sgs, group))
+ sgs->group_smt_balance = 1;
+
sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
/* Computing avg_load makes sense only when group is overloaded */
@@ -9641,6 +9750,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
return false;
break;
+ case group_smt_balance:
case group_fully_busy:
/*
* Select the fully busy group with highest avg_load. In
@@ -9670,6 +9780,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
case group_has_spare:
/*
+ * Do not pick sg with SMT CPUs over sg with pure CPUs,
+ * as we do not want to pull task off SMT core with one task
+ * and make the core idle.
+ */
+ if (smt_vs_nonsmt_groups(sds->busiest, sg)) {
+ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1)
+ return false;
+ else
+ return true;
+ }
+
+ /*
* Select not overloaded group with lowest number of idle cpus
* and highest number of running tasks. We could also compare
* the spare capacity which is more stable but it can end up
@@ -9865,6 +9987,7 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those types are not used in the slow wakeup path */
return false;
@@ -9996,6 +10119,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
case group_imbalanced:
case group_asym_packing:
+ case group_smt_balance:
/* Those type are not used in the slow wakeup path */
return NULL;
@@ -10250,6 +10374,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+ if (busiest->group_type == group_smt_balance) {
+ /* Reduce number of tasks sharing CPU capacity */
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
+ }
+
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
@@ -10297,14 +10428,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
}
if (busiest->group_weight == 1 || sds->prefer_sibling) {
- unsigned int nr_diff = busiest->sum_nr_running;
/*
* When prefer sibling, evenly spread running tasks on
* groups.
*/
env->migration_type = migrate_task;
- lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff;
+ env->imbalance = sibling_imbalance(env, sds, busiest, local);
} else {
/*
@@ -10501,20 +10630,27 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* group's child domain.
*/
if (sds.prefer_sibling && local->group_type == group_has_spare &&
- busiest->sum_nr_running > local->sum_nr_running + 1)
+ sibling_imbalance(env, &sds, busiest, local) > 1)
goto force_balance;
if (busiest->group_type != group_overloaded) {
- if (env->idle == CPU_NOT_IDLE)
+ if (env->idle == CPU_NOT_IDLE) {
/*
* If the busiest group is not overloaded (and as a
* result the local one too) but this CPU is already
* busy, let another idle CPU try to pull task.
*/
goto out_balanced;
+ }
+
+ if (busiest->group_type == group_smt_balance &&
+ smt_vs_nonsmt_groups(sds.local, sds.busiest)) {
+ /* Let non SMT CPU pull from SMT CPU sharing with sibling */
+ goto force_balance;
+ }
if (busiest->group_weight > 1 &&
- local->idle_cpus <= (busiest->idle_cpus + 1))
+ local->idle_cpus <= (busiest->idle_cpus + 1)) {
/*
* If the busiest group is not overloaded
* and there is no imbalance between this and busiest
@@ -10525,12 +10661,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* there is more than 1 CPU per group.
*/
goto out_balanced;
+ }
- if (busiest->sum_h_nr_running == 1)
+ if (busiest->sum_h_nr_running == 1) {
/*
* busiest doesn't have any tasks waiting to run
*/
goto out_balanced;
+ }
}
force_balance:
@@ -10764,7 +10902,7 @@ static int active_load_balance_cpu_stop(void *data);
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
- int cpu;
+ int cpu, idle_smt = -1;
/*
* Ensure the balancing environment is consistent; can happen
@@ -10791,10 +10929,24 @@ static int should_we_balance(struct lb_env *env)
if (!idle_cpu(cpu))
continue;
+ /*
+ * Don't balance to idle SMT in busy core right away when
+ * balancing cores, but remember the first idle SMT CPU for
+ * later consideration. Find CPU on an idle core first.
+ */
+ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) {
+ if (idle_smt == -1)
+ idle_smt = cpu;
+ continue;
+ }
+
/* Are we the first idle CPU? */
return cpu == env->dst_cpu;
}
+ if (idle_smt == env->dst_cpu)
+ return true;
+
/* Are we the first CPU of this group ? */
return group_balance_cpu(sg) == env->dst_cpu;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 9bb3f2b3ccfc..1d0f634725a6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -140,7 +140,7 @@
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
-DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
+static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e93e006a942b..9baeb1a2dfdd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -636,6 +636,8 @@ struct cfs_rq {
u64 throttled_clock;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
+ u64 throttled_clock_self;
+ u64 throttled_clock_self_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
@@ -1882,6 +1884,7 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
+ unsigned int cores;
struct sched_group_capacity *sgc;
int asym_prefer_cpu; /* CPU of highest priority in group */
int flags;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d3a3b2646ec4..7cfcfe5d27b9 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1275,14 +1275,24 @@ build_sched_groups(struct sched_domain *sd, int cpu)
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
+ struct cpumask *mask = sched_domains_tmpmask2;
WARN_ON(!sg);
do {
- int cpu, max_cpu = -1;
+ int cpu, cores = 0, max_cpu = -1;
sg->group_weight = cpumask_weight(sched_group_span(sg));
+ cpumask_copy(mask, sched_group_span(sg));
+ for_each_cpu(cpu, mask) {
+ cores++;
+#ifdef CONFIG_SCHED_SMT
+ cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
+#endif
+ }
+ sg->cores = cores;
+
if (!(sd->flags & SD_ASYM_PACKING))
goto next;