summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-10-23 15:00:03 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2018-10-23 15:00:03 +0100
commit42f52e1c59bdb78cad945b2dd34fa1f892239a39 (patch)
tree86b5e51d940ab0b1e8bb4d1158dc89d09d6f52d4 /kernel
parent0d1b82cd8ac2e8856ae9045c97782ac1c359929c (diff)
parent11e13696a08e838ba48c72404c2b3f41429b5b20 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "The main changes are: - Migrate CPU-intense 'misfit' tasks on asymmetric capacity systems, to better utilize (much) faster 'big core' CPUs. (Morten Rasmussen, Valentin Schneider) - Topology handling improvements, in particular when CPU capacity changes and related load-balancing fixes/improvements (Morten Rasmussen) - ... plus misc other improvements, fixes and updates" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (28 commits) sched/completions/Documentation: Add recommendation for dynamic and ONSTACK completions sched/completions/Documentation: Clean up the document some more sched/completions/Documentation: Fix a couple of punctuation nits cpu/SMT: State SMT is disabled even with nosmt and without "=force" sched/core: Fix comment regarding nr_iowait_cpu() and get_iowait_load() sched/fair: Remove setting task's se->runnable_weight during PELT update sched/fair: Disable LB_BIAS by default sched/pelt: Fix warning and clean up IRQ PELT config sched/topology: Make local variables static sched/debug: Use symbolic names for task state constants sched/numa: Remove unused numa_stats::nr_running field sched/numa: Remove unused code from update_numa_stats() sched/debug: Explicitly cast sched_feat() to bool sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains sched/fair: Don't move tasks to lower capacity CPUs unless necessary sched/fair: Set rq->rd->overload when misfit sched/fair: Wrap rq->rd->overload accesses with READ/WRITE_ONCE() sched/core: Change root_domain->overload type to int sched/fair: Change 'prefer_sibling' type to bool sched/fair: Kick nohz balance if rq->misfit_task_load ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpu.c1
-rw-r--r--kernel/sched/core.c17
-rw-r--r--kernel/sched/fair.c187
-rw-r--r--kernel/sched/features.h2
-rw-r--r--kernel/sched/pelt.c8
-rw-r--r--kernel/sched/pelt.h2
-rw-r--r--kernel/sched/sched.h23
-rw-r--r--kernel/sched/topology.c106
8 files changed, 262 insertions, 84 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index be4859f07153..e82920b8bee1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -383,6 +383,7 @@ void __init cpu_smt_disable(bool force)
pr_info("SMT: Force disabled\n");
cpu_smt_control = CPU_SMT_FORCE_DISABLED;
} else {
+ pr_info("SMT: disabled\n");
cpu_smt_control = CPU_SMT_DISABLED;
}
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad97f3ba5ec5..fe0223121883 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -135,9 +135,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
* In theory, the compile should just see 0 here, and optimize out the call
* to sched_rt_avg_update. But I don't trust it...
*/
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- s64 steal = 0, irq_delta = 0;
-#endif
+ s64 __maybe_unused steal = 0, irq_delta = 0;
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
@@ -177,7 +176,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
-#ifdef HAVE_SCHED_AVG_IRQ
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
update_irq_load_avg(rq, irq_delta + steal);
#endif
@@ -701,6 +700,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
if (idle_policy(p->policy)) {
load->weight = scale_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
+ p->se.runnable_weight = load->weight;
return;
}
@@ -713,6 +713,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
+ p->se.runnable_weight = load->weight;
}
}
@@ -2915,10 +2916,10 @@ unsigned long nr_iowait(void)
}
/*
- * Consumers of these two interfaces, like for example the cpufreq menu
- * governor are using nonsensical data. Boosting frequency for a CPU that has
- * IO-wait which might not even end up running the task when it does become
- * runnable.
+ * Consumers of these two interfaces, like for example the cpuidle menu
+ * governor, are using nonsensical data. Preferring shallow idle state selection
+ * for a CPU that has IO-wait which might not even end up running the task when
+ * it does become runnable.
*/
unsigned long nr_iowait_cpu(int cpu)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 908c9cdae2f0..ee271bb661cc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
+static unsigned long capacity_of(int cpu);
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
@@ -1456,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -1464,8 +1464,6 @@ struct numa_stats {
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
-
- unsigned int nr_running;
};
/*
@@ -1473,36 +1471,16 @@ struct numa_stats {
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
- int smt, cpu, cpus = 0;
- unsigned long capacity;
+ int cpu;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
- ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
-
- cpus++;
}
- /*
- * If we raced with hotplug and there are no CPUs left in our mask
- * the @ns structure is NULL'ed and task_numa_compare() will
- * not find this node attractive.
- *
- * We'll detect a huge imbalance and bail there.
- */
- if (!cpus)
- return;
-
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
- capacity = cpus / smt; /* cores */
-
- capacity = min_t(unsigned, capacity,
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
}
struct task_numa_env {
@@ -3723,6 +3701,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
WRITE_ONCE(p->se.avg.util_est, ue);
}
+static inline int task_fits_capacity(struct task_struct *p, long capacity)
+{
+ return capacity * 1024 > task_util_est(p) * capacity_margin;
+}
+
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+{
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return;
+
+ if (!p) {
+ rq->misfit_task_load = 0;
+ return;
+ }
+
+ if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ rq->misfit_task_load = 0;
+ return;
+ }
+
+ rq->misfit_task_load = task_h_load(p);
+}
+
#else /* CONFIG_SMP */
#define UPDATE_TG 0x0
@@ -3752,6 +3753,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
bool task_sleep) {}
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
@@ -6280,6 +6282,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
{
long min_cap, max_cap;
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return 0;
+
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
@@ -6290,7 +6295,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
/* Bring task utilization in sync with prev_cpu */
sync_entity_load_avg(&p->se);
- return min_cap * 1024 < task_util(p) * capacity_margin;
+ return !task_fits_capacity(p, min_cap);
}
/*
@@ -6709,9 +6714,12 @@ done: __maybe_unused;
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ update_misfit_status(p, rq);
+
return p;
idle:
+ update_misfit_status(NULL, rq);
new_tasks = idle_balance(rq, rf);
/*
@@ -6917,6 +6925,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+enum group_type {
+ group_other = 0,
+ group_misfit_task,
+ group_imbalanced,
+ group_overloaded,
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
@@ -6947,6 +6962,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum group_type src_grp_type;
struct list_head tasks;
};
@@ -7327,7 +7343,7 @@ static inline bool others_have_blocked(struct rq *rq)
if (READ_ONCE(rq->avg_dl.util_avg))
return true;
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if (READ_ONCE(rq->avg_irq.util_avg))
return true;
#endif
@@ -7490,12 +7506,6 @@ static unsigned long task_h_load(struct task_struct *p)
/********** Helpers for find_busiest_group ************************/
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
-
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -7511,6 +7521,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
+ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -7619,13 +7630,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, min_capacity;
+ unsigned long capacity, min_capacity, max_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -7639,6 +7651,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
capacity = 0;
min_capacity = ULONG_MAX;
+ max_capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -7669,6 +7682,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
min_capacity = min(capacity, min_capacity);
+ max_capacity = max(capacity, max_capacity);
}
} else {
/*
@@ -7682,12 +7696,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
capacity += sgc->capacity;
min_capacity = min(sgc->min_capacity, min_capacity);
+ max_capacity = max(sgc->max_capacity, max_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = min_capacity;
+ sdg->sgc->max_capacity = max_capacity;
}
/*
@@ -7783,16 +7799,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
}
/*
- * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
* per-CPU capacity than sched_group ref.
*/
static inline bool
-group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
return sg->sgc->min_capacity * capacity_margin <
ref->sgc->min_capacity * 1024;
}
+/*
+ * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity_orig than sched_group ref.
+ */
+static inline bool
+group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity * capacity_margin <
+ ref->sgc->max_capacity * 1024;
+}
+
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs)
@@ -7803,6 +7830,9 @@ group_type group_classify(struct sched_group *group,
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_misfit_task_load)
+ return group_misfit_task;
+
return group_other;
}
@@ -7835,7 +7865,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
- * @overload: Indicate more than one runnable task for any CPU.
+ * @overload: Indicate pullable load (e.g. >1 runnable task).
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
@@ -7877,6 +7907,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
*/
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
+
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *overload = 1;
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -7912,6 +7948,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ /*
+ * Don't try to pull misfit tasks we can't help.
+ * We can use max_capacity here as reduction in capacity on some
+ * CPUs in the group should either be possible to resolve
+ * internally or be covered by avg_load imbalance (eventually).
+ */
+ if (sgs->group_type == group_misfit_task &&
+ (!group_smaller_max_cpu_capacity(sg, sds->local) ||
+ !group_has_capacity(env, &sds->local_stat)))
+ return false;
+
if (sgs->group_type > busiest->group_type)
return true;
@@ -7931,7 +7978,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* power/energy consequences are not considered.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_cpu_capacity(sds->local, sg))
+ group_smaller_min_cpu_capacity(sds->local, sg))
+ return false;
+
+ /*
+ * If we have more than one misfit sg go with the biggest misfit.
+ */
+ if (sgs->group_type == group_misfit_task &&
+ sgs->group_misfit_task_load < busiest->group_misfit_task_load)
return false;
asym_packing:
@@ -8002,11 +8056,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- int load_idx, prefer_sibling = 0;
+ int load_idx;
bool overload = false;
-
- if (child && child->flags & SD_PREFER_SIBLING)
- prefer_sibling = 1;
+ bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
@@ -8080,8 +8132,8 @@ next_group:
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
- if (env->dst_rq->rd->overload != overload)
- env->dst_rq->rd->overload = overload;
+ if (READ_ONCE(env->dst_rq->rd->overload) != overload)
+ WRITE_ONCE(env->dst_rq->rd->overload, overload);
}
}
@@ -8231,8 +8283,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* factors in sg capacity and sgs with smaller group_type are
* skipped when updating the busiest sg:
*/
- if (busiest->avg_load <= sds->avg_load ||
- local->avg_load >= sds->avg_load) {
+ if (busiest->group_type != group_misfit_task &&
+ (busiest->avg_load <= sds->avg_load ||
+ local->avg_load >= sds->avg_load)) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
@@ -8266,6 +8319,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
+ /* Boost imbalance to allow misfit task to be balanced. */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = max_t(long, env->imbalance,
+ busiest->group_misfit_task_load);
+ }
+
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
@@ -8332,6 +8391,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
busiest->group_no_capacity)
goto force_balance;
+ /* Misfit tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task)
+ goto force_balance;
+
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -8369,6 +8432,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
force_balance:
/* Looks like there is an imbalance. Compute it */
+ env->src_grp_type = busiest->group_type;
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
@@ -8416,8 +8480,32 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
+ /*
+ * For ASYM_CPUCAPACITY domains with misfit tasks we simply
+ * seek the "biggest" misfit task.
+ */
+ if (env->src_grp_type == group_misfit_task) {
+ if (rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ }
+
+ continue;
+ }
+
capacity = capacity_of(i);
+ /*
+ * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
+ * eventually lead to active_balancing high->low capacity.
+ * Higher per-CPU capacity is considered better than balancing
+ * average load.
+ */
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ capacity_of(env->dst_cpu) < capacity &&
+ rq->nr_running == 1)
+ continue;
+
wl = weighted_cpuload(rq);
/*
@@ -8485,6 +8573,9 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ if (env->src_grp_type == group_misfit_task)
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -9127,7 +9218,7 @@ static void nohz_balancer_kick(struct rq *rq)
if (time_before(now, nohz.next_balance))
goto out;
- if (rq->nr_running >= 2) {
+ if (rq->nr_running >= 2 || rq->misfit_task_load) {
flags = NOHZ_KICK_MASK;
goto out;
}
@@ -9496,7 +9587,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
rq_unpin_lock(this_rq, rf);
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ !READ_ONCE(this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
@@ -9658,6 +9749,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
+ update_misfit_status(curr, rq);
}
/*
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 85ae8488039c..858589b83377 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,7 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
-SCHED_FEAT(LB_BIAS, true)
+SCHED_FEAT(LB_BIAS, false)
/*
* Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 35475c0c5419..90fb5bc12ad4 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -269,9 +269,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
- if (entity_is_task(se))
- se->runnable_weight = se->load.weight;
-
if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
return 1;
@@ -282,9 +279,6 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (entity_is_task(se))
- se->runnable_weight = se->load.weight;
-
if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
cfs_rq->curr == se)) {
@@ -358,7 +352,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
/*
* irq:
*
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index d2894db28955..7e56b489ff32 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -6,7 +6,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
int update_irq_load_avg(struct rq *rq, u64 running);
#else
static inline int
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9683f458aec7..fc24f2b8c646 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -717,8 +717,12 @@ struct root_domain {
cpumask_var_t span;
cpumask_var_t online;
- /* Indicate more than one runnable task for any CPU */
- bool overload;
+ /*
+ * Indicate pullable load on at least one CPU, e.g:
+ * - More than one runnable task
+ * - Running task is misfit
+ */
+ int overload;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
@@ -845,6 +849,8 @@ struct rq {
unsigned char idle_balance;
+ unsigned long misfit_task_load;
+
/* For active balancing */
int active_balance;
int push_cpu;
@@ -858,8 +864,7 @@ struct rq {
struct sched_avg avg_rt;
struct sched_avg avg_dl;
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
-#define HAVE_SCHED_AVG_IRQ
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
u64 idle_stamp;
@@ -1188,6 +1193,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+extern struct static_key_false sched_asym_cpucapacity;
struct sched_group_capacity {
atomic_t ref;
@@ -1197,6 +1203,7 @@ struct sched_group_capacity {
*/
unsigned long capacity;
unsigned long min_capacity; /* Min per-CPU capacity in group */
+ unsigned long max_capacity; /* Max per-CPU capacity in group */
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
@@ -1396,7 +1403,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
0;
#undef SCHED_FEAT
-#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
@@ -1696,8 +1703,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
if (prev_nr < 2 && rq->nr_running >= 2) {
#ifdef CONFIG_SMP
- if (!rq->rd->overload)
- rq->rd->overload = true;
+ if (!READ_ONCE(rq->rd->overload))
+ WRITE_ONCE(rq->rd->overload, 1);
#endif
}
@@ -2217,7 +2224,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
}
#endif
-#ifdef HAVE_SCHED_AVG_IRQ
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
{
return rq->avg_irq.util_avg;
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 505a41c42b96..9d74371e4aad 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -7,8 +7,8 @@
DEFINE_MUTEX(sched_domains_mutex);
/* Protected by sched_domains_mutex: */
-cpumask_var_t sched_domains_tmpmask;
-cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_domains_tmpmask;
+static cpumask_var_t sched_domains_tmpmask2;
#ifdef CONFIG_SCHED_DEBUG
@@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id);
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
static void update_top_cache_domain(int cpu)
{
@@ -692,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
sg_span = sched_group_span(sg);
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
}
static int
@@ -851,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
return sg;
}
@@ -1061,7 +1064,6 @@ static struct cpumask ***sched_domains_numa_masks;
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
- * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -1073,13 +1075,12 @@ static struct cpumask ***sched_domains_numa_masks;
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
- SD_ASYM_CPUCAPACITY | \
SD_SHARE_POWERDOMAIN)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map,
- struct sched_domain *child, int cpu)
+ struct sched_domain *child, int dflags, int cpu)
{
struct sd_data *sdd = &tl->data;
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -1100,6 +1101,9 @@ sd_init(struct sched_domain_topology_level *tl,
"wrong sd_flags in topology description\n"))
sd_flags &= ~TOPOLOGY_SD_FLAGS;
+ /* Apply detected topology flags */
+ sd_flags |= dflags;
+
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
@@ -1122,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl,
| 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
- | 0*SD_PREFER_SIBLING
+ | 1*SD_PREFER_SIBLING
| 0*SD_NUMA
| sd_flags
,
@@ -1148,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl,
if (sd->flags & SD_ASYM_CPUCAPACITY) {
struct sched_domain *t = sd;
+ /*
+ * Don't attempt to spread across CPUs of different capacities.
+ */
+ if (sd->child)
+ sd->child->flags &= ~SD_PREFER_SIBLING;
+
for_each_lower_domain(t)
t->flags |= SD_BALANCE_WAKE;
}
if (sd->flags & SD_SHARE_CPUCAPACITY) {
- sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
- sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
@@ -1169,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->busy_idx = 3;
sd->idle_idx = 2;
+ sd->flags &= ~SD_PREFER_SIBLING;
sd->flags |= SD_SERIALIZE;
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
sd->flags &= ~(SD_BALANCE_EXEC |
@@ -1178,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl,
#endif
} else {
- sd->flags |= SD_PREFER_SIBLING;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
sd->idle_idx = 1;
@@ -1604,9 +1612,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *child, int cpu)
+ struct sched_domain *child, int dflags, int cpu)
{
- struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
+ struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
if (child) {
sd->level = child->level + 1;
@@ -1633,6 +1641,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
}
/*
+ * Find the sched_domain_topology_level where all CPU capacities are visible
+ * for all CPUs.
+ */
+static struct sched_domain_topology_level
+*asym_cpu_capacity_level(const struct cpumask *cpu_map)
+{
+ int i, j, asym_level = 0;
+ bool asym = false;
+ struct sched_domain_topology_level *tl, *asym_tl = NULL;
+ unsigned long cap;
+
+ /* Is there any asymmetry? */
+ cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+
+ for_each_cpu(i, cpu_map) {
+ if (arch_scale_cpu_capacity(NULL, i) != cap) {
+ asym = true;
+ break;
+ }
+ }
+
+ if (!asym)
+ return NULL;
+
+ /*
+ * Examine topology from all CPU's point of views to detect the lowest
+ * sched_domain_topology_level where a highest capacity CPU is visible
+ * to everyone.
+ */
+ for_each_cpu(i, cpu_map) {
+ unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+ int tl_id = 0;
+
+ for_each_sd_topology(tl) {
+ if (tl_id < asym_level)
+ goto next_level;
+
+ for_each_cpu_and(j, tl->mask(i), cpu_map) {
+ unsigned long capacity;
+
+ capacity = arch_scale_cpu_capacity(NULL, j);
+
+ if (capacity <= max_capacity)
+ continue;
+
+ max_capacity = capacity;
+ asym_level = tl_id;
+ asym_tl = tl;
+ }
+next_level:
+ tl_id++;
+ }
+ }
+
+ return asym_tl;
+}
+
+
+/*
* Build sched domains for a given set of CPUs and attach the sched domains
* to the individual CPUs
*/
@@ -1644,18 +1711,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
+ struct sched_domain_topology_level *tl_asym;
+ bool has_asym = false;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
+ tl_asym = asym_cpu_capacity_level(cpu_map);
+
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
sd = NULL;
for_each_sd_topology(tl) {
- sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+ int dflags = 0;
+
+ if (tl == tl_asym) {
+ dflags |= SD_ASYM_CPUCAPACITY;
+ has_asym = true;
+ }
+
+ sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
+
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP)
@@ -1704,6 +1783,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
rcu_read_unlock();
+ if (has_asym)
+ static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
+
if (rq && sched_debug_enabled) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);