author    | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-23 09:28:37 -0800
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-23 09:28:37 -0800
commit    | 525445efacdfeed71329ce8bc5f558859a894b8b (patch)
tree      | 6cdf76560eae119b561bade1846dffa3b182da21 /arch/x86
parent    | 192a5e0a19712a079f456954c203ce9dd2b889fa (diff)
parent    | 344da544f177f919cf6919e5abcd388f27aa53db (diff)
Merge tag 'nmi.2023.02.14a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu
Pull x86 NMI diagnostics from Paul McKenney:
"Add diagnostics to the x86 NMI handler to help detect NMI-handler bugs
on the one hand and failing hardware on the other"
* tag 'nmi.2023.02.14a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu:
x86/nmi: Print reasons why backtrace NMIs are ignored
x86/nmi: Accumulate NMI-progress evidence in exc_nmi()
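The mechanism these commits add is simple enough to demonstrate outside the kernel: exc_nmi() bumps a per-CPU sequence counter on entry (making it odd) and again on exit (making it even), so a watchdog that snapshots the counter and re-reads it later can tell a handler that is stuck from one that is making progress or was never invoked. Below is a minimal user-space sketch of that even/odd protocol, using C11 atomics rather than the kernel's READ_ONCE()/WRITE_ONCE(); every name in it is illustrative, not a kernel API.

/*
 * Sketch of the even/odd progress counter: odd means "inside the
 * handler", even means "idle".  A watchdog comparing a snapshot
 * against the current value can classify the CPU's state.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong idt_seq;	/* stand-in for nmi_stats::idt_seq */

static void handler_entry(void)
{
	/* counter becomes odd: we are inside the handler */
	atomic_fetch_add_explicit(&idt_seq, 1, memory_order_relaxed);
}

static void handler_exit(void)
{
	/* counter becomes even: handler has completed */
	atomic_fetch_add_explicit(&idt_seq, 1, memory_order_relaxed);
}

static void watchdog_report(unsigned long snap)
{
	unsigned long now = atomic_load_explicit(&idt_seq, memory_order_relaxed);

	if (now != snap)
		printf("handler made progress (%lu -> %lu)\n", snap, now);
	else if (now & 0x1)
		printf("stuck inside handler\n");
	else
		printf("handler never ran\n");
}

int main(void)
{
	unsigned long snap = atomic_load(&idt_seq);

	handler_entry();
	watchdog_report(snap);	/* progress: entry bumped the counter */
	snap = atomic_load(&idt_seq);
	watchdog_report(snap);	/* odd and unchanged: reported as stuck */
	handler_exit();
	return 0;
}

The real handler keeps two such counters, idt_seq around all of exc_nmi() and idt_nmi_seq around default_do_nmi(), which is what lets the checker in the diff below report those two cases separately.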
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/kernel/nmi.c | 108
1 file changed, 107 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index cec0bfa3bc04..c315b18ec7c8 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -69,6 +69,15 @@ struct nmi_stats {
 	unsigned int unknown;
 	unsigned int external;
 	unsigned int swallow;
+	unsigned long recv_jiffies;
+	unsigned long idt_seq;
+	unsigned long idt_nmi_seq;
+	unsigned long idt_ignored;
+	atomic_long_t idt_calls;
+	unsigned long idt_seq_snap;
+	unsigned long idt_nmi_seq_snap;
+	unsigned long idt_ignored_snap;
+	long idt_calls_snap;
 };
 
 static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
@@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
 DEFINE_IDTENTRY_RAW(exc_nmi)
 {
 	irqentry_state_t irq_state;
+	struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
 
 	/*
 	 * Re-enable NMIs right here when running as an SEV-ES guest. This might
 	 * cause nested NMIs, but those can be handled safely.
 	 */
 	sev_es_nmi_complete();
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
+		arch_atomic_long_inc(&nsp->idt_calls);
 
 	if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
 		return;
@@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 	}
 	this_cpu_write(nmi_state, NMI_EXECUTING);
 	this_cpu_write(nmi_cr2, read_cr2());
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+		WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
+		WRITE_ONCE(nsp->recv_jiffies, jiffies);
+	}
 nmi_restart:
 
 	/*
@@ -509,8 +526,19 @@ nmi_restart:
 
 	inc_irq_stat(__nmi_count);
 
-	if (!ignore_nmis)
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
+		WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
+	} else if (!ignore_nmis) {
+		if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+			WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+			WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
+		}
 		default_do_nmi(regs);
+		if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+			WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+			WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
+		}
+	}
 
 	irqentry_nmi_exit(regs, irq_state);
 
@@ -525,6 +553,11 @@ nmi_restart:
 
 	if (user_mode(regs))
 		mds_user_clear_cpu_buffers();
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+		WARN_ON_ONCE(nsp->idt_seq & 0x1);
+		WRITE_ONCE(nsp->recv_jiffies, jiffies);
+	}
 }
 
 #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
@@ -537,6 +570,79 @@ DEFINE_IDTENTRY_RAW(exc_nmi_noist)
 EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
 #endif
 
+#ifdef CONFIG_NMI_CHECK_CPU
+
+static char *nmi_check_stall_msg[] = {
+/*                                                                      */
+/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler.           */
+/* | +------ cpu_is_offline(cpu)                                        */
+/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls):   */
+/* | | |       NMI handler has been invoked.                            */
+/* | | |                                                                */
+/* V V V                                                                */
+/* 0 0 0 */ "NMIs are not reaching exc_nmi() handler",
+/* 0 0 1 */ "exc_nmi() handler is ignoring NMIs",
+/* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler",
+/* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs",
+/* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler",
+/* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs",
+/* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler",
+/* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs",
+};
+
+void nmi_backtrace_stall_snap(const struct cpumask *btp)
+{
+	int cpu;
+	struct nmi_stats *nsp;
+
+	for_each_cpu(cpu, btp) {
+		nsp = per_cpu_ptr(&nmi_stats, cpu);
+		nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq);
+		nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq);
+		nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored);
+		nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls);
+	}
+}
+
+void nmi_backtrace_stall_check(const struct cpumask *btp)
+{
+	int cpu;
+	int idx;
+	unsigned long nmi_seq;
+	unsigned long j = jiffies;
+	char *modp;
+	char *msgp;
+	char *msghp;
+	struct nmi_stats *nsp;
+
+	for_each_cpu(cpu, btp) {
+		nsp = per_cpu_ptr(&nmi_stats, cpu);
+		modp = "";
+		msghp = "";
+		nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
+		if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
+			msgp = "CPU entered NMI handler function, but has not exited";
+		} else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) {
+			msgp = "CPU is handling NMIs";
+		} else {
+			idx = ((nsp->idt_seq_snap & 0x1) << 2) |
+			      (cpu_is_offline(cpu) << 1) |
+			      (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
+			msgp = nmi_check_stall_msg[idx];
+			if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
+				modp = ", but OK because ignore_nmis was set";
+			if (nmi_seq & ~0x1)
+				msghp = " (CPU currently in NMI handler function)";
+			else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+				msghp = " (CPU exited one NMI handler function)";
+		}
+		pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n",
+			 __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies));
+	}
+}
+
+#endif
+
 void stop_nmi(void)
 {
 	ignore_nmis++;