Diffstat (limited to 'kernel/sched/fair.c')
 -rw-r--r--  kernel/sched/fair.c | 496
 1 file changed, 301 insertions(+), 195 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f6a05d9b5443..13950beb01a2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -60,6 +60,14 @@ unsigned int sysctl_sched_min_granularity = 750000ULL; static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* + * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks. + * Applies only when SCHED_IDLE tasks compete with normal tasks. + * + * (default: 0.75 msec) + */ +unsigned int sysctl_sched_idle_min_granularity = 750000ULL; + +/* * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity */ static unsigned int sched_nr_latency = 8; @@ -665,6 +673,8 @@ static u64 __sched_period(unsigned long nr_running) return sysctl_sched_latency; } +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq); + /* * We calculate the wall-time slice from the period by taking a part * proportional to the weight. @@ -674,6 +684,8 @@ static u64 __sched_period(unsigned long nr_running) static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { unsigned int nr_running = cfs_rq->nr_running; + struct sched_entity *init_se = se; + unsigned int min_gran; u64 slice; if (sched_feat(ALT_PERIOD)) @@ -684,12 +696,13 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) for_each_sched_entity(se) { struct load_weight *load; struct load_weight lw; + struct cfs_rq *qcfs_rq; - cfs_rq = cfs_rq_of(se); - load = &cfs_rq->load; + qcfs_rq = cfs_rq_of(se); + load = &qcfs_rq->load; if (unlikely(!se->on_rq)) { - lw = cfs_rq->load; + lw = qcfs_rq->load; update_load_add(&lw, se->load.weight); load = &lw; @@ -697,8 +710,14 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) slice = __calc_delta(slice, se->load.weight, load); } - if (sched_feat(BASE_SLICE)) - slice = max(slice, (u64)sysctl_sched_min_granularity); + if (sched_feat(BASE_SLICE)) { + if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq)) + min_gran = sysctl_sched_idle_min_granularity; + else + min_gran = sysctl_sched_min_granularity; + + slice = max_t(u64, slice, min_gran); + } return slice; } @@ -837,8 +856,13 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->exec_start = now; - schedstat_set(curr->statistics.exec_max, - max(delta_exec, curr->statistics.exec_max)); + if (schedstat_enabled()) { + struct sched_statistics *stats; + + stats = __schedstats_from_se(curr); + __schedstat_set(stats->exec_max, + max(delta_exec, stats->exec_max)); + } curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); @@ -863,137 +887,70 @@ static void update_curr_fair(struct rq *rq) } static inline void -update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 wait_start, prev_wait_start; + struct sched_statistics *stats; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; - wait_start = rq_clock(rq_of(cfs_rq)); - prev_wait_start = schedstat_val(se->statistics.wait_start); + stats = __schedstats_from_se(se); - if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && - likely(wait_start > prev_wait_start)) - wait_start -= prev_wait_start; + if (entity_is_task(se)) + p = task_of(se); - __schedstat_set(se->statistics.wait_start, wait_start); + __update_stats_wait_start(rq_of(cfs_rq), p, stats); } static inline void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct task_struct *p; 
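The BASE_SLICE change above picks a smaller floor for a SCHED_IDLE entity only while it shares a runqueue with non-idle entities; a fully idle cfs_rq keeps the normal minimum. A minimal stand-alone sketch of that selection, using simplified stand-in types rather than the kernel's structures:

/* Sketch of the BASE_SLICE floor selection; simplified stand-in types. */
#include <stdbool.h>
#include <stdint.h>

#define SCHED_MIN_GRANULARITY_NS	750000ULL	/* 0.75 ms */
#define SCHED_IDLE_MIN_GRANULARITY_NS	750000ULL	/* 0.75 ms */

struct entity { bool idle; };
struct runqueue { unsigned int nr_running, idle_nr_running; };

/* True only when every queued entity on this runqueue is SCHED_IDLE. */
static bool rq_is_all_idle(const struct runqueue *rq)
{
	return rq->nr_running && rq->nr_running == rq->idle_nr_running;
}

/*
 * An idle entity competing with normal entities gets the (possibly smaller)
 * idle floor; everything else keeps the regular minimum granularity.
 */
static uint64_t apply_base_slice(uint64_t slice, const struct entity *se,
				 const struct runqueue *rq)
{
	uint64_t min_gran;

	if (se->idle && !rq_is_all_idle(rq))
		min_gran = SCHED_IDLE_MIN_GRANULARITY_NS;
	else
		min_gran = SCHED_MIN_GRANULARITY_NS;

	return slice > min_gran ? slice : min_gran;
}

With both sysctls left at their 0.75 msec defaults the behaviour is unchanged; lowering sysctl_sched_idle_min_granularity only shortens the slices of idle entities that are competing with normal tasks.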
- u64 delta; + struct sched_statistics *stats; + struct task_struct *p = NULL; if (!schedstat_enabled()) return; + stats = __schedstats_from_se(se); + /* * When the sched_schedstat changes from 0 to 1, some sched se * maybe already in the runqueue, the se->statistics.wait_start * will be 0.So it will let the delta wrong. We need to avoid this * scenario. */ - if (unlikely(!schedstat_val(se->statistics.wait_start))) + if (unlikely(!schedstat_val(stats->wait_start))) return; - delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); - - if (entity_is_task(se)) { + if (entity_is_task(se)) p = task_of(se); - if (task_on_rq_migrating(p)) { - /* - * Preserve migrating task's wait time so wait_start - * time stamp can be adjusted to accumulate wait time - * prior to migration. - */ - __schedstat_set(se->statistics.wait_start, delta); - return; - } - trace_sched_stat_wait(p, delta); - } - __schedstat_set(se->statistics.wait_max, - max(schedstat_val(se->statistics.wait_max), delta)); - __schedstat_inc(se->statistics.wait_count); - __schedstat_add(se->statistics.wait_sum, delta); - __schedstat_set(se->statistics.wait_start, 0); + __update_stats_wait_end(rq_of(cfs_rq), p, stats); } static inline void -update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { + struct sched_statistics *stats; struct task_struct *tsk = NULL; - u64 sleep_start, block_start; if (!schedstat_enabled()) return; - sleep_start = schedstat_val(se->statistics.sleep_start); - block_start = schedstat_val(se->statistics.block_start); + stats = __schedstats_from_se(se); if (entity_is_task(se)) tsk = task_of(se); - if (sleep_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - __schedstat_set(se->statistics.sleep_max, delta); - - __schedstat_set(se->statistics.sleep_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - account_scheduler_latency(tsk, delta >> 10, 1); - trace_sched_stat_sleep(tsk, delta); - } - } - if (block_start) { - u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; - - if ((s64)delta < 0) - delta = 0; - - if (unlikely(delta > schedstat_val(se->statistics.block_max))) - __schedstat_set(se->statistics.block_max, delta); - - __schedstat_set(se->statistics.block_start, 0); - __schedstat_add(se->statistics.sum_sleep_runtime, delta); - - if (tsk) { - if (tsk->in_iowait) { - __schedstat_add(se->statistics.iowait_sum, delta); - __schedstat_inc(se->statistics.iowait_count); - trace_sched_stat_iowait(tsk, delta); - } - - trace_sched_stat_blocked(tsk, delta); - - /* - * Blocking time is in units of nanosecs, so shift by - * 20 to get a milliseconds-range estimation of the - * amount of time that the task spent sleeping: - */ - if (unlikely(prof_on == SLEEP_PROFILING)) { - profile_hits(SLEEP_PROFILING, - (void *)get_wchan(tsk), - delta >> 20); - } - account_scheduler_latency(tsk, delta >> 10, 0); - } - } + __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats); } /* * Task is being enqueued - update stats: */ static inline void -update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) return; @@ -1003,14 +960,14 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * a dequeue/enqueue event is 
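The wait-time wrappers above now only resolve the sched_statistics block (and the owning task, if any) and defer to shared __update_stats_wait_start()/__update_stats_wait_end() helpers. The bookkeeping those helpers take over from the removed open-coded versions looks roughly like the following sketch; the stats struct and the "migrating" flag are simplified stand-ins:

/* Sketch of the wait-time bookkeeping performed by the shared helpers. */
#include <stdbool.h>
#include <stdint.h>

struct wait_stats { uint64_t wait_start, wait_max, wait_count, wait_sum; };

static void wait_start(struct wait_stats *st, uint64_t now, bool migrating)
{
	uint64_t start = now;

	/* A migrating task carries its accumulated wait time in wait_start,
	 * so subtract it to keep the accounting continuous across the move. */
	if (migrating && now > st->wait_start)
		start -= st->wait_start;

	st->wait_start = start;
}

static void wait_end(struct wait_stats *st, uint64_t now, bool migrating)
{
	uint64_t delta;

	if (!st->wait_start)	/* schedstats were enabled after enqueue */
		return;

	delta = now - st->wait_start;

	if (migrating) {
		/* Preserve the wait accumulated so far; wait_start is
		 * re-adjusted when the task is enqueued on the new CPU. */
		st->wait_start = delta;
		return;
	}

	if (delta > st->wait_max)
		st->wait_max = delta;
	st->wait_count++;
	st->wait_sum += delta;
	st->wait_start = 0;
}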
a NOP) */ if (se != cfs_rq->curr) - update_stats_wait_start(cfs_rq, se); + update_stats_wait_start_fair(cfs_rq, se); if (flags & ENQUEUE_WAKEUP) - update_stats_enqueue_sleeper(cfs_rq, se); + update_stats_enqueue_sleeper_fair(cfs_rq, se); } static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { if (!schedstat_enabled()) @@ -1021,7 +978,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * waiting task: */ if (se != cfs_rq->curr) - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -1030,10 +987,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* XXX racy against TTWU */ state = READ_ONCE(tsk->__state); if (state & TASK_INTERRUPTIBLE) - __schedstat_set(se->statistics.sleep_start, + __schedstat_set(tsk->stats.sleep_start, rq_clock(rq_of(cfs_rq))); if (state & TASK_UNINTERRUPTIBLE) - __schedstat_set(se->statistics.block_start, + __schedstat_set(tsk->stats.block_start, rq_clock(rq_of(cfs_rq))); } } @@ -1081,11 +1038,12 @@ struct numa_group { unsigned long total_faults; unsigned long max_faults_cpu; /* + * faults[] array is split into two regions: faults_mem and faults_cpu. + * * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted * more by CPU use than by memory faults. */ - unsigned long *faults_cpu; unsigned long faults[]; }; @@ -1259,8 +1217,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { - return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] + - group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; + return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] + + group->faults[task_faults_idx(NUMA_CPU, nid, 1)]; } static inline unsigned long group_faults_priv(struct numa_group *ng) @@ -2116,7 +2074,7 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find out how many nodes on the workload is actively running on. Do this by + * Find out how many nodes the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. @@ -2170,7 +2128,7 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is - * completely idle or all activity is areas that are not of interest + * completely idle or all activity is in areas that are not of interest * to automatic numa balancing. Related to that, if there were failed * migration then it implies we are migrating too quickly or the local * node is overloaded. In either case, scan slower @@ -2427,7 +2385,7 @@ static void task_numa_placement(struct task_struct *p) * is at the beginning of the numa_faults array. 
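With faults_cpu folded into the tail of faults[], group_faults_cpu() above indexes the same flat array with NUMA_CPU instead of dereferencing a second pointer. A self-contained sketch of that addressing scheme, mirroring task_faults_idx() but with local stand-in definitions:

/* Sketch of addressing one flat faults[] array that holds both the memory
 * and the CPU regions. */
#include <stddef.h>

enum fault_stat { STAT_MEM, STAT_CPU };
#define NR_FAULT_TYPES	2		/* private and shared counters */

static int nr_node_ids = 4;		/* stand-in for the kernel global */

static size_t faults_idx(enum fault_stat s, int nid, int priv)
{
	/* [MEM: node0 priv,shared][MEM: node1 ...]...[CPU: node0 priv,shared]... */
	return NR_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

/* CPU faults for a node: private plus shared entries of the CPU region. */
static unsigned long faults_cpu(const unsigned long *faults, int nid)
{
	return faults[faults_idx(STAT_CPU, nid, 0)] +
	       faults[faults_idx(STAT_CPU, nid, 1)];
}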
*/ ng->faults[mem_idx] += diff; - ng->faults_cpu[mem_idx] += f_diff; + ng->faults[cpu_idx] += f_diff; ng->total_faults += diff; group_faults += ng->faults[mem_idx]; } @@ -2481,7 +2439,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!deref_curr_numa_group(p))) { unsigned int size = sizeof(struct numa_group) + - 4*nr_node_ids*sizeof(unsigned long); + NR_NUMA_HINT_FAULT_STATS * + nr_node_ids * sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -2492,9 +2451,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; - /* Second half of the array tracks nids where faults happen */ - grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * - nr_node_ids; for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2995,6 +2951,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running++; + if (se_is_idle(se)) + cfs_rq->idle_nr_running++; } static void @@ -3008,6 +2966,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) } #endif cfs_rq->nr_running--; + if (se_is_idle(se)) + cfs_rq->idle_nr_running--; } /* @@ -4207,7 +4167,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) /* sleeps up to a single latency don't count. */ if (!initial) { - unsigned long thresh = sysctl_sched_latency; + unsigned long thresh; + + if (se_is_idle(se)) + thresh = sysctl_sched_min_granularity; + else + thresh = sysctl_sched_latency; /* * Halve their sleep time's effect, to allow @@ -4225,26 +4190,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); -static inline void check_schedstat_required(void) -{ -#ifdef CONFIG_SCHEDSTATS - if (schedstat_enabled()) - return; - - /* Force schedstat enabled if a dependent tracepoint is active */ - if (trace_sched_stat_wait_enabled() || - trace_sched_stat_sleep_enabled() || - trace_sched_stat_iowait_enabled() || - trace_sched_stat_blocked_enabled() || - trace_sched_stat_runtime_enabled()) { - printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " - "stat_blocked and stat_runtime require the " - "kernel parameter schedstats=enable or " - "kernel.sched_schedstats=1\n"); - } -#endif -} - static inline bool cfs_bandwidth_used(void); /* @@ -4318,7 +4263,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) place_entity(cfs_rq, se, 0); check_schedstat_required(); - update_stats_enqueue(cfs_rq, se, flags); + update_stats_enqueue_fair(cfs_rq, se, flags); check_spread(cfs_rq, se); if (!curr) __enqueue_entity(cfs_rq, se); @@ -4402,7 +4347,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_load_avg(cfs_rq, se, UPDATE_TG); se_update_runnable(se); - update_stats_dequeue(cfs_rq, se, flags); + update_stats_dequeue_fair(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -4487,7 +4432,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. 
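The kzalloc size in task_numa_group() above is unchanged in value: NR_NUMA_HINT_FAULT_STATS spells out the old literal 4 (private/shared counters for both the memory and CPU regions), and the separate faults_cpu pointer into the second half of the array goes away. A sketch of the resulting single-allocation layout, with stand-in types:

/* Sketch of the single-allocation faults[] layout. */
#include <stdlib.h>

#define NR_NUMA_HINT_FAULT_TYPES	2	/* private, shared */
#define NR_NUMA_HINT_FAULT_STATS	(NR_NUMA_HINT_FAULT_TYPES * 2)	/* mem + cpu */

struct numa_group_sketch {
	unsigned long total_faults;
	unsigned long faults[];	/* memory region first, then the CPU region */
};

static struct numa_group_sketch *alloc_group(int nr_node_ids)
{
	size_t size = sizeof(struct numa_group_sketch) +
		      NR_NUMA_HINT_FAULT_STATS * nr_node_ids *
		      sizeof(unsigned long);

	/* The CPU counters now live in the tail of faults[] and are reached
	 * by index, so no separate faults_cpu pointer needs to be set up. */
	return calloc(1, size);
}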
*/ - update_stats_wait_end(cfs_rq, se); + update_stats_wait_end_fair(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); } @@ -4502,9 +4447,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (schedstat_enabled() && rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { - schedstat_set(se->statistics.slice_max, - max((u64)schedstat_val(se->statistics.slice_max), - se->sum_exec_runtime - se->prev_sum_exec_runtime)); + struct sched_statistics *stats; + + stats = __schedstats_from_se(se); + __schedstat_set(stats->slice_max, + max((u64)stats->slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime)); } se->prev_sum_exec_runtime = se->sum_exec_runtime; @@ -4586,7 +4534,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) check_spread(cfs_rq, prev); if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); + update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -4687,11 +4635,20 @@ static inline u64 sched_cfs_bandwidth_slice(void) */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { + s64 runtime; + if (unlikely(cfs_b->quota == RUNTIME_INF)) return; cfs_b->runtime += cfs_b->quota; + runtime = cfs_b->runtime_snap - cfs_b->runtime; + if (runtime > 0) { + cfs_b->burst_time += runtime; + cfs_b->nr_burst++; + } + cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst); + cfs_b->runtime_snap = cfs_b->runtime; } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) @@ -5577,6 +5534,17 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } +/* + * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use + * of idle_nr_running, which does not consider idle descendants of normal + * entities. 
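The reworked __refill_cfs_bandwidth_runtime() above lets unused runtime carry over up to the configured burst and records how much of that burst was actually consumed since the previous refill. A stand-alone sketch of that logic; the struct is a simplified stand-in, but the field names follow the patch:

/* Sketch of the burst-aware bandwidth refill. */
#include <stdint.h>

struct bandwidth {
	uint64_t quota, burst;
	uint64_t runtime, runtime_snap;
	uint64_t burst_time, nr_burst;
};

static void refill_runtime(struct bandwidth *b)
{
	int64_t over;

	b->runtime += b->quota;

	/* If more than one quota's worth ran since the previous refill, the
	 * group dipped into its burst allowance: record by how much. */
	over = (int64_t)(b->runtime_snap - b->runtime);
	if (over > 0) {
		b->burst_time += over;
		b->nr_burst++;
	}

	/* Unused runtime carries over, but never beyond quota + burst. */
	if (b->runtime > b->quota + b->burst)
		b->runtime = b->quota + b->burst;
	b->runtime_snap = b->runtime;
}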
+ */ +static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq) +{ + return cfs_rq->nr_running && + cfs_rq->nr_running == cfs_rq->idle_nr_running; +} + #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { @@ -5787,6 +5755,7 @@ static struct { cpumask_var_t idle_cpus_mask; atomic_t nr_cpus; int has_blocked; /* Idle CPUS has blocked load */ + int needs_update; /* Newly idle CPUs need their next_balance collated */ unsigned long next_balance; /* in jiffy units */ unsigned long next_blocked; /* Next update of blocked load in jiffies */ } nohz ____cacheline_aligned; @@ -5997,12 +5966,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); - schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); + schedstat_inc(p->stats.nr_wakeups_affine_attempts); if (target == nr_cpumask_bits) return prev_cpu; schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); + schedstat_inc(p->stats.nr_wakeups_affine); return target; } @@ -6443,11 +6412,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && asym_fits_capacity(task_util, recent_used_cpu)) { - /* - * Replace recent_used_cpu with prev as it is a potential - * candidate for the next wake: - */ - p->recent_used_cpu = prev; return recent_used_cpu; } @@ -7806,7 +7770,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; - schedstat_inc(p->se.statistics.nr_failed_migrations_affine); + schedstat_inc(p->stats.nr_failed_migrations_affine); env->flags |= LBF_SOME_PINNED; @@ -7840,7 +7804,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->flags &= ~LBF_ALL_PINNED; if (task_running(env->src_rq, p)) { - schedstat_inc(p->se.statistics.nr_failed_migrations_running); + schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -7862,12 +7826,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) env->sd->nr_balance_failed > env->sd->cache_nice_tries) { if (tsk_cache_hot == 1) { schedstat_inc(env->sd->lb_hot_gained[env->idle]); - schedstat_inc(p->se.statistics.nr_forced_migrations); + schedstat_inc(p->stats.nr_forced_migrations); } return 1; } - schedstat_inc(p->se.statistics.nr_failed_migrations_hot); + schedstat_inc(p->stats.nr_failed_migrations_hot); return 0; } @@ -8602,6 +8566,99 @@ group_type group_classify(unsigned int imbalance_pct, } /** + * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks + * @dst_cpu: Destination CPU of the load balancing + * @sds: Load-balancing data with statistics of the local group + * @sgs: Load-balancing statistics of the candidate busiest group + * @sg: The candidate busiest group + * + * Check the state of the SMT siblings of both @sds::local and @sg and decide + * if @dst_cpu can pull tasks. + * + * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of + * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks + * only if @dst_cpu has higher priority. + * + * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more + * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. + * Bigger imbalances in the number of busy CPUs will be dealt with in + * update_sd_pick_busiest(). 
+ * + * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings + * of @dst_cpu are idle and @sg has lower priority. + */ +static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds, + struct sg_lb_stats *sgs, + struct sched_group *sg) +{ +#ifdef CONFIG_SCHED_SMT + bool local_is_smt, sg_is_smt; + int sg_busy_cpus; + + local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY; + sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY; + + sg_busy_cpus = sgs->group_weight - sgs->idle_cpus; + + if (!local_is_smt) { + /* + * If we are here, @dst_cpu is idle and does not have SMT + * siblings. Pull tasks if candidate group has two or more + * busy CPUs. + */ + if (sg_busy_cpus >= 2) /* implies sg_is_smt */ + return true; + + /* + * @dst_cpu does not have SMT siblings. @sg may have SMT + * siblings and only one is busy. In such case, @dst_cpu + * can help if it has higher priority and is idle (i.e., + * it has no running tasks). + */ + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + } + + /* @dst_cpu has SMT siblings. */ + + if (sg_is_smt) { + int local_busy_cpus = sds->local->group_weight - + sds->local_stat.idle_cpus; + int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; + + if (busy_cpus_delta == 1) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; + } + + /* + * @sg does not have SMT siblings. Ensure that @sds::local does not end + * up with more than one busy SMT sibling and only pull tasks if there + * are not busy CPUs (i.e., no CPU has running tasks). + */ + if (!sds->local_stat.sum_nr_running) + return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); + + return false; +#else + /* Always return false so that callers deal with non-SMT cases. */ + return false; +#endif +} + +static inline bool +sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, + struct sched_group *group) +{ + /* Only do SMT checks if either local or candidate have SMT siblings */ + if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || + (group->flags & SD_SHARE_CPUCAPACITY)) + return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); + + return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); +} + +/** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. 
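The decision rules documented above are easiest to check against concrete inputs. The following distilled, stand-alone version (prio_higher stands in for sched_asym_prefer(), and plain busy-CPU counts replace the sg_lb_stats fields) exercises two of the scenarios from the comment:

/* Distilled version of asym_smt_can_pull_tasks() for worked examples. */
#include <stdbool.h>
#include <stdio.h>

static bool can_pull(bool dst_is_smt, bool sg_is_smt, int sg_busy_cpus,
		     int local_busy_cpus, bool prio_higher)
{
	if (!dst_is_smt)
		return sg_busy_cpus >= 2 ? true : prio_higher;

	if (sg_is_smt)
		return sg_busy_cpus - local_busy_cpus == 1 && prio_higher;

	/* @sg has no SMT siblings: pull only if the local core is idle. */
	return local_busy_cpus == 0 && prio_higher;
}

int main(void)
{
	/* Non-SMT dst vs. an SMT group with both siblings busy: pull. */
	printf("%d\n", can_pull(false, true, 2, 0, false));	/* 1 */
	/* SMT dst vs. SMT group with equal busy counts: do not pull. */
	printf("%d\n", can_pull(true, true, 1, 1, true));	/* 0 */
	return 0;
}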
@@ -8609,6 +8666,7 @@ group_type group_classify(unsigned int imbalance_pct, * @sg_status: Holds flag indicating the status of the sched_group */ static inline void update_sg_lb_stats(struct lb_env *env, + struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, int *sg_status) @@ -8617,7 +8675,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, memset(sgs, 0, sizeof(*sgs)); - local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + local_group = group == sds->local; for_each_cpu_and(i, sched_group_span(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -8660,18 +8718,17 @@ static inline void update_sg_lb_stats(struct lb_env *env, } } - /* Check if dst CPU is idle and preferred to this group */ - if (env->sd->flags & SD_ASYM_PACKING && - env->idle != CPU_NOT_IDLE && - sgs->sum_h_nr_running && - sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) { - sgs->group_asym_packing = 1; - } - sgs->group_capacity = group->sgc->capacity; sgs->group_weight = group->group_weight; + /* Check if dst CPU is idle and preferred to this group */ + if (!local_group && env->sd->flags & SD_ASYM_PACKING && + env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running && + sched_asym(env, sds, sgs, group)) { + sgs->group_asym_packing = 1; + } + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); /* Computing avg_load makes sense only when group is overloaded */ @@ -9180,7 +9237,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, sgs, &sg_status); + update_sg_lb_stats(env, sds, sg, sgs, &sg_status); if (local_group) goto next_group; @@ -9603,6 +9660,12 @@ static struct rq *find_busiest_queue(struct lb_env *env, nr_running == 1) continue; + /* Make sure we only pull tasks from a CPU of lower priority */ + if ((env->sd->flags & SD_ASYM_PACKING) && + sched_asym_prefer(i, env->dst_cpu) && + nr_running == 1) + continue; + switch (env->migration_type) { case migrate_load: /* @@ -10176,6 +10239,30 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } +static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +{ + if (cost > sd->max_newidle_lb_cost) { + /* + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. + */ + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = jiffies; + } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + /* + * Decay the newidle max times by ~1% per second to ensure that + * it is not outdated and the current max cost is actually + * shorter. + */ + sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; + sd->last_decay_max_lb_cost = jiffies; + + return true; + } + + return false; +} + /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -10199,14 +10286,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) for_each_domain(cpu, sd) { /* * Decay the newidle max times here because this is a regular - * visit to all the domains. Decay ~1% per second. + * visit to all the domains. 
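update_newidle_cost() above decays max_newidle_lb_cost by a factor of 253/256 at most once per second, which works out to roughly a 1% reduction per step and about a halving per minute of regular balancing. A quick stand-alone check of that arithmetic:

/* Quick check of the 253/256 decay used by update_newidle_cost(). */
#include <stdio.h>

int main(void)
{
	unsigned long long cost = 1000000ULL;	/* 1 ms in ns, arbitrary */
	int sec;

	for (sec = 0; sec < 60; sec++)
		cost = cost * 253 / 256;

	printf("after ~60 decays: %llu ns\n", cost);	/* ~493000 ns */
	return 0;
}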
*/ - if (time_after(jiffies, sd->next_decay_max_lb_cost)) { - sd->max_newidle_lb_cost = - (sd->max_newidle_lb_cost * 253) / 256; - sd->next_decay_max_lb_cost = jiffies + HZ; - need_decay = 1; - } + need_decay = update_newidle_cost(sd, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -10375,7 +10457,7 @@ static void nohz_balancer_kick(struct rq *rq) goto out; if (rq->nr_running >= 2) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto out; } @@ -10389,7 +10471,7 @@ static void nohz_balancer_kick(struct rq *rq) * on. */ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -10403,7 +10485,7 @@ static void nohz_balancer_kick(struct rq *rq) */ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { if (sched_asym_prefer(i, cpu)) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } @@ -10416,7 +10498,7 @@ static void nohz_balancer_kick(struct rq *rq) * to run the misfit task on. */ if (check_misfit_status(rq, sd)) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } @@ -10443,13 +10525,16 @@ static void nohz_balancer_kick(struct rq *rq) */ nr_busy = atomic_read(&sds->nr_busy_cpus); if (nr_busy > 1) { - flags = NOHZ_KICK_MASK; + flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; goto unlock; } } unlock: rcu_read_unlock(); out: + if (READ_ONCE(nohz.needs_update)) + flags |= NOHZ_NEXT_KICK; + if (flags) kick_ilb(flags); } @@ -10546,12 +10631,13 @@ void nohz_balance_enter_idle(int cpu) /* * Ensures that if nohz_idle_balance() fails to observe our * @idle_cpus_mask store, it must observe the @has_blocked - * store. + * and @needs_update stores. */ smp_mb__after_atomic(); set_cpu_sd_state_idle(cpu); + WRITE_ONCE(nohz.needs_update, 1); out: /* * Each time a cpu enter idle, we assume that it has blocked load and @@ -10600,12 +10686,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, /* * We assume there will be no idle load after this update and clear * the has_blocked flag. If a cpu enters idle in the mean time, it will - * set the has_blocked flag and trig another update of idle load. + * set the has_blocked flag and trigger another update of idle load. * Because a cpu that becomes idle, is added to idle_cpus_mask before * setting the flag, we are sure to not clear the state and not * check the load of an idle cpu. + * + * Same applies to idle_cpus_mask vs needs_update. */ - WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.has_blocked, 0); + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 0); /* * Ensures that if we miss the CPU, we must see the has_blocked @@ -10627,13 +10718,17 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, * balancing owner will pick it up. 
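nohz_balancer_kick() above now composes the kick from individual NOHZ_STATS_KICK / NOHZ_BALANCE_KICK bits (plus NOHZ_NEXT_KICK when nohz.needs_update is set) instead of the blanket NOHZ_KICK_MASK, and _nohz_idle_balance() only performs the work the set bits ask for. A rough sketch of that gating; the flag values here are illustrative, not the kernel's definitions:

/* Sketch of gating the idle-balance work on individual kick bits. */
#include <stdio.h>

#define NOHZ_BALANCE_KICK	0x1	/* run load balancing for idle CPUs */
#define NOHZ_STATS_KICK		0x2	/* update blocked-load statistics */
#define NOHZ_NEXT_KICK		0x4	/* collate next_balance of newly idle CPUs */

static void nohz_idle_balance_sketch(unsigned int flags)
{
	if (flags & NOHZ_STATS_KICK)
		printf("update blocked load averages\n");
	if (flags & NOHZ_NEXT_KICK)
		printf("refresh nohz.next_balance and clear needs_update\n");
	if (flags & NOHZ_BALANCE_KICK)
		printf("balance on behalf of the idle CPUs\n");
}

int main(void)
{
	/* A CPU with two or more runnable tasks kicks stats + balance,
	 * as in the hunks above. */
	nohz_idle_balance_sketch(NOHZ_STATS_KICK | NOHZ_BALANCE_KICK);
	return 0;
}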
*/ if (need_resched()) { - has_blocked_load = true; + if (flags & NOHZ_STATS_KICK) + has_blocked_load = true; + if (flags & NOHZ_NEXT_KICK) + WRITE_ONCE(nohz.needs_update, 1); goto abort; } rq = cpu_rq(balance_cpu); - has_blocked_load |= update_nohz_stats(rq); + if (flags & NOHZ_STATS_KICK) + has_blocked_load |= update_nohz_stats(rq); /* * If time for next balance is due, @@ -10664,8 +10759,9 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, if (likely(update_next_balance)) nohz.next_balance = next_balance; - WRITE_ONCE(nohz.next_blocked, - now + msecs_to_jiffies(LOAD_AVG_PERIOD)); + if (flags & NOHZ_STATS_KICK) + WRITE_ONCE(nohz.next_blocked, + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); abort: /* There is still blocked load, enable periodic update */ @@ -10763,9 +10859,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; + u64 t0, t1, curr_cost = 0; struct sched_domain *sd; int pulled_task = 0; - u64 curr_cost = 0; update_misfit_status(NULL, this_rq); @@ -10796,47 +10892,49 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) */ rq_unpin_lock(this_rq, rf); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !READ_ONCE(this_rq->rd->overload)) { + rcu_read_lock(); + sd = rcu_dereference_check_sched_domain(this_rq->sd); + + if (!READ_ONCE(this_rq->rd->overload) || + (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { - rcu_read_lock(); - sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) update_next_balance(sd, &next_balance); rcu_read_unlock(); goto out; } + rcu_read_unlock(); raw_spin_rq_unlock(this_rq); + t0 = sched_clock_cpu(this_cpu); update_blocked_averages(this_cpu); + rcu_read_lock(); for_each_domain(this_cpu, sd) { int continue_balancing = 1; - u64 t0, domain_cost; + u64 domain_cost; - if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); + + if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) break; - } if (sd->flags & SD_BALANCE_NEWIDLE) { - t0 = sched_clock_cpu(this_cpu); pulled_task = load_balance(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, &continue_balancing); - domain_cost = sched_clock_cpu(this_cpu) - t0; - if (domain_cost > sd->max_newidle_lb_cost) - sd->max_newidle_lb_cost = domain_cost; + t1 = sched_clock_cpu(this_cpu); + domain_cost = t1 - t0; + update_newidle_cost(sd, domain_cost); curr_cost += domain_cost; + t0 = t1; } - update_next_balance(sd, &next_balance); - /* * Stop searching for tasks to pull if there are * now runnable tasks on this rq. 
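The restructured newidle_balance() above measures each domain's balancing cost with a pair of timestamps, feeds it to update_newidle_cost(), and stops walking up the hierarchy once the expected idle time can no longer cover the accumulated cost. A simplified stand-alone sketch of that loop, with stand-in domain and clock types:

/* Sketch of the per-domain cost bookkeeping in the reworked loop. */
#include <stdbool.h>
#include <stdint.h>

struct domain {
	uint64_t max_newidle_lb_cost;
	bool balance_newidle;
	struct domain *parent;
};

/*
 * Walk the domain hierarchy, timing each balance attempt and stopping as
 * soon as the expected idle time cannot cover the accumulated cost.
 */
static int newidle_walk(struct domain *sd, uint64_t avg_idle,
			uint64_t (*clock)(void))
{
	uint64_t t0 = clock(), t1, curr_cost = 0;
	int balanced = 0;

	for (; sd; sd = sd->parent) {
		if (avg_idle < curr_cost + sd->max_newidle_lb_cost)
			break;

		if (sd->balance_newidle) {
			/* load_balance(sd) would run here */
			t1 = clock();
			if (t1 - t0 > sd->max_newidle_lb_cost)
				sd->max_newidle_lb_cost = t1 - t0;
			curr_cost += t1 - t0;
			t0 = t1;
			balanced++;
		}
	}
	return balanced;
}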
@@ -11394,7 +11492,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!cfs_rq) goto err; - se = kzalloc_node(sizeof(struct sched_entity), + se = kzalloc_node(sizeof(struct sched_entity_stats), GFP_KERNEL, cpu_to_node(i)); if (!se) goto err_free_rq; @@ -11560,7 +11658,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg->se[i]; - struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i]; bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf; @@ -11571,6 +11669,14 @@ int sched_group_set_idle(struct task_group *tg, long idle) if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq))) goto next_cpu; + if (se->on_rq) { + parent_cfs_rq = cfs_rq_of(se); + if (cfs_rq_is_idle(grp_cfs_rq)) + parent_cfs_rq->idle_nr_running++; + else + parent_cfs_rq->idle_nr_running--; + } + idle_task_delta = grp_cfs_rq->h_nr_running - grp_cfs_rq->idle_h_nr_running; if (!cfs_rq_is_idle(grp_cfs_rq)) |
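alloc_fair_sched_group() above now sizes group entities as struct sched_entity_stats, which co-locates a statistics block directly behind the sched_entity. The helper __schedstats_from_se() used throughout this diff is defined outside this file; the sketch below shows one layout-based lookup consistent with that allocation and should be read as an assumption rather than the kernel's exact definition:

/* Assumed layout: statistics co-located right behind the group entity,
 * so a bare entity pointer can be mapped back to its stats. */
#include <stddef.h>

struct entity_sketch { int dummy; };
struct stats_sketch { unsigned long long wait_sum; };

struct entity_stats_sketch {
	struct entity_sketch	se;
	struct stats_sketch	stats;
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct stats_sketch *stats_from_se(struct entity_sketch *se)
{
	/* Group entities are allocated as the combined struct, so the
	 * statistics block sits immediately after the entity. */
	return &container_of(se, struct entity_stats_sketch, se)->stats;
}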