From 106dd5afde3cd10db7e1370b6ddc77f0b2496a75 Mon Sep 17 00:00:00 2001 From: Michael wang Date: Wed, 13 Nov 2013 11:10:56 +0800 Subject: sched: Fix endless sync_sched/rcu() loop inside _cpu_down() Commit 6acce3ef8: sched: Remove get_online_cpus() usage tries to do sync_sched/rcu() inside _cpu_down() but triggers: INFO: task swapper/0:1 blocked for more than 120 seconds. ... [] synchronize_rcu+0x2c/0x30 [] _cpu_down+0x2b2/0x340 ... It was caused by that in the rcu boost case we rely on smpboot thread to finish the rcu callback, which has already been parked before sync in here and leads to the endless sync_sched/rcu(). This patch exchanges the sequence of smpboot_park_threads() and sync_sched/rcu() to fix the bug. Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5282EDC0.6060003@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/cpu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 63aa50d7ce1e..2227b58734a7 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -306,7 +306,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __func__, cpu); goto out_release; } - smpboot_park_threads(cpu); /* * By now we've cleared cpu_active_mask, wait for all preempt-disabled @@ -315,12 +314,16 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might * not imply sync_sched(), so explicitly call both. + * + * Do sync before park smpboot threads to take care the rcu boost case. */ #ifdef CONFIG_PREEMPT synchronize_sched(); #endif synchronize_rcu(); + smpboot_park_threads(cpu); + /* * So now all preempt/rcu users must observe !cpu_active(). */ -- cgit v1.2.3-58-ga151 From 46a73e8a1c1720f7713b5e2df68e9dd272015b5d Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 11 Nov 2013 19:29:25 -0500 Subject: sched/numa: Fix NULL pointer dereference in task_numa_migrate() The cpusets code can split up the scheduler's domain tree into smaller domains. Some of those smaller domains may not cross NUMA nodes at all, leading to a NULL pointer dereference on the per-cpu sd_numa pointer. Tasks cannot be migrated out of their domain, so the patch also sets p->numa_preferred_nid to whereever they are, to prevent the migration from being retried over and over again. Reported-by: Prarit Bhargava Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra Cc: Mel Gorman Link: http://lkml.kernel.org/n/tip-oosqomw0Jput0Jkvoowhrqtu@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df77c605c7a6..c11e36ff5ea0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1201,9 +1201,21 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + if (sd) + env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; rcu_read_unlock(); + /* + * Cpusets can break the scheduler domain tree into smaller + * balance domains, some of which do not cross NUMA boundaries. + * Tasks that are "trapped" in such domains cannot be migrated + * elsewhere, so there is no point in (re)trying. + */ + if (unlikely(!sd)) { + p->numa_preferred_nid = cpu_to_node(task_cpu(p)); + return -EINVAL; + } + taskweight = task_weight(p, env.src_nid); groupweight = group_weight(p, env.src_nid); update_numa_stats(&env.src_stats, env.src_nid); -- cgit v1.2.3-58-ga151 From 5eca82a9ac2c961cfbd26a4b6f43e6e3747a71dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Nov 2013 18:47:57 +0100 Subject: sched/numa: Cure update_numa_stats() vs. hotplug Because we're completely unserialized against hotplug its well possible to try and generate numa stats for an offlined node. Bail out early (and avoid a /0) in this case. The resulting stats are all 0 which should result in an undesirable balance target -- not to mention that actually trying to migrate to an offline CPU will fail. Reported-by: Prarit Bhargava Signed-off-by: Peter Zijlstra Cc: Mel Gorman Link: http://lkml.kernel.org/n/tip-orja0qylcvyhxfsuebcyL5sI@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c11e36ff5ea0..201be782b5b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1000,7 +1000,7 @@ struct numa_stats { */ static void update_numa_stats(struct numa_stats *ns, int nid) { - int cpu; + int cpu, cpus = 0; memset(ns, 0, sizeof(*ns)); for_each_cpu(cpu, cpumask_of_node(nid)) { @@ -1009,8 +1009,21 @@ static void update_numa_stats(struct numa_stats *ns, int nid) ns->nr_running += rq->nr_running; ns->load += weighted_cpuload(cpu); ns->power += power_of(cpu); + + cpus++; } + /* + * If we raced with hotplug and there are no CPUs left in our mask + * the @ns structure is NULL'ed and task_numa_compare() will + * not find this node attractive. + * + * We'll either bail at !has_capacity, or we'll detect a huge imbalance + * and bail there. + */ + if (!cpus) + return; + ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power; ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE); ns->has_capacity = (ns->nr_running < ns->capacity); -- cgit v1.2.3-58-ga151 From 911b2898b3c9fe0048e9485ad1629ed4fce330fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 Nov 2013 18:21:56 +0100 Subject: sched: Optimize task_sched_runtime() Large multi-threaded apps like to hit this using do_sys_times() and then queue up on the rq->lock. Avoid when possible. Larry reported ~20% performance increase his test case. Reported-by: Larry Woodman Suggested-by: Paul Turner Signed-off-by: Peter Zijlstra Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20131111172925.GG26898@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1deccd78be98..c1808606ee5f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2253,6 +2253,20 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns = 0; +#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) + /* + * 64-bit doesn't need locks to atomically read a 64bit value. + * So we have a optimization chance when the task's delta_exec is 0. + * Reading ->on_cpu is racy, but this is ok. + * + * If we race with it leaving cpu, we'll take a lock. So we're correct. + * If we race with it entering cpu, unaccounted time is 0. This is + * indistinguishable from the read occurring a few cycles earlier. + */ + if (!p->on_cpu) + return p->se.sum_exec_runtime; +#endif + rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); task_rq_unlock(rq, p, &flags); -- cgit v1.2.3-58-ga151 From 85b088e934b9943322bfe37077289ae60f1b3414 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Sun, 10 Nov 2013 20:42:01 +0100 Subject: sched/fair: Avoid integer overflow sa->runnable_avg_sum is of type u32 but after shifting it by NICE_0_SHIFT bits it is promoted to u64. This of course makes no sense, since the result will never be more then 32-bit long. Casting sa->runnable_avg_sum to u64 before it is shifted, fixes this problem. Reviewed-by: Ben Segall Signed-off-by: Michal Nazarewicz Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1384112521-25177-1-git-send-email-mpn@google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 201be782b5b3..e8b652ebe027 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2178,7 +2178,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, long contrib; /* The fraction of a cpu used by this cfs_rq */ - contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT, + contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, sa->runnable_avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; -- cgit v1.2.3-58-ga151