From 45226e944ce071d0231949f2fea90969437cd2dc Mon Sep 17 00:00:00 2001 From: Sameer Nanda Date: Mon, 30 Jul 2012 14:40:00 -0700 Subject: NMI watchdog: fix for lockup detector breakage on resume On the suspend/resume path the boot CPU does not go though an offline->online transition. This breaks the NMI detector post-resume since it depends on PMU state that is lost when the system gets suspended. Fix this by forcing a CPU offline->online transition for the lockup detector on the boot CPU during resume. To provide more context, we enable NMI watchdog on Chrome OS. We have seen several reports of systems freezing up completely which indicated that the NMI watchdog was not firing for some reason. Debugging further, we found a simple way of repro'ing system freezes -- issuing the command 'tasket 1 sh -c "echo nmilockup > /proc/breakme"' after the system has been suspended/resumed one or more times. With this patch in place, the system freeze result in panics, as expected. These panics provide a nice stack trace for us to debug the actual issue causing the freeze. [akpm@linux-foundation.org: fiddle with code comment] [akpm@linux-foundation.org: make lockup_detector_bootcpu_resume() conditional on CONFIG_SUSPEND] [akpm@linux-foundation.org: fix section errors] Signed-off-by: Sameer Nanda Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Don Zickus Cc: Mandeep Singh Baines Cc: Srivatsa S. Bhat Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 8 ++++++++ kernel/power/suspend.c | 3 +++ kernel/watchdog.c | 21 +++++++++++++++++++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1e26a5e45aa6..68dcffaa62a0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -334,6 +334,14 @@ static inline void lockup_detector_init(void) } #endif +#if defined(CONFIG_LOCKUP_DETECTOR) && defined(CONFIG_SUSPEND) +void lockup_detector_bootcpu_resume(void); +#else +static inline void lockup_detector_bootcpu_resume(void) +{ +} +#endif + #ifdef CONFIG_DETECT_HUNG_TASK extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_check_count; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446b27df..1da39ea248fd 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -178,6 +178,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + /* Kick the lockup detector */ + lockup_detector_bootcpu_resume(); + Enable_cpus: enable_nonboot_cpus(); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4b1dfba70f7c..69add8a9da68 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -575,7 +575,7 @@ out: /* * Create/destroy watchdog threads as CPUs come and go: */ -static int __cpuinit +static int cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -610,10 +610,27 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cpu_nfb = { +static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; +#ifdef CONFIG_SUSPEND +/* + * On exit from suspend we force an offline->online transition on the boot CPU + * so that the PMU state that was lost while in suspended state gets set up + * properly for the boot CPU. This information is required for restarting the + * NMI watchdog. + */ +void lockup_detector_bootcpu_resume(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu); + cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu); +} +#endif + void __init lockup_detector_init(void) { void *cpu = (void *)(long)smp_processor_id(); -- cgit v1.2.3-58-ga151