summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/arm64/include/asm/kvm_asm.h16
-rw-r--r--arch/arm64/include/asm/memory.h8
-rw-r--r--arch/arm64/include/asm/stacktrace.h62
-rw-r--r--arch/arm64/include/asm/stacktrace/common.h199
-rw-r--r--arch/arm64/include/asm/stacktrace/nvhe.h55
-rw-r--r--arch/arm64/kernel/Makefile5
-rw-r--r--arch/arm64/kernel/stacktrace.c184
-rw-r--r--arch/arm64/kvm/Kconfig13
-rw-r--r--arch/arm64/kvm/Makefile2
-rw-r--r--arch/arm64/kvm/arm.c2
-rw-r--r--arch/arm64/kvm/handle_exit.c4
-rw-r--r--arch/arm64/kvm/hyp/nvhe/Makefile2
-rw-r--r--arch/arm64/kvm/hyp/nvhe/host.S9
-rw-r--r--arch/arm64/kvm/hyp/nvhe/stacktrace.c160
-rw-r--r--arch/arm64/kvm/hyp/nvhe/switch.c6
-rw-r--r--arch/arm64/kvm/stacktrace.c218
16 files changed, 775 insertions, 170 deletions
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 2e277f2ed671..53035763e48e 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -176,6 +176,22 @@ struct kvm_nvhe_init_params {
unsigned long vtcr;
};
+/*
+ * Used by the host in EL1 to dump the nVHE hypervisor backtrace on
+ * hyp_panic() in non-protected mode.
+ *
+ * @stack_base: hyp VA of the hyp_stack base.
+ * @overflow_stack_base: hyp VA of the hyp_overflow_stack base.
+ * @fp: hyp FP where the backtrace begins.
+ * @pc: hyp PC where the backtrace begins.
+ */
+struct kvm_nvhe_stacktrace_info {
+ unsigned long stack_base;
+ unsigned long overflow_stack_base;
+ unsigned long fp;
+ unsigned long pc;
+};
+
/* Translate a kernel address @ptr into its equivalent linear mapping */
#define kvm_ksym_ref(ptr) \
({ \
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 0af70d9abede..cab80a9a4086 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -114,6 +114,14 @@
#define OVERFLOW_STACK_SIZE SZ_4K
/*
+ * With the minimum frame size of [x29, x30], exactly half the combined
+ * sizes of the hyp and overflow stacks is the maximum size needed to
+ * save the unwinded stacktrace; plus an additional entry to delimit the
+ * end.
+ */
+#define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + PAGE_SIZE) / 2 + sizeof(long))
+
+/*
* Alignment of kernel segments (e.g. .text, .data).
*
* 4 KB granule: 16 level 3 entries, with contiguous bit
diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index aec9315bf156..6ebdcdff77f5 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -8,52 +8,20 @@
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
-#include <linux/types.h>
#include <linux/llist.h>
#include <asm/memory.h>
+#include <asm/pointer_auth.h>
#include <asm/ptrace.h>
#include <asm/sdei.h>
-enum stack_type {
- STACK_TYPE_UNKNOWN,
- STACK_TYPE_TASK,
- STACK_TYPE_IRQ,
- STACK_TYPE_OVERFLOW,
- STACK_TYPE_SDEI_NORMAL,
- STACK_TYPE_SDEI_CRITICAL,
- __NR_STACK_TYPES
-};
-
-struct stack_info {
- unsigned long low;
- unsigned long high;
- enum stack_type type;
-};
+#include <asm/stacktrace/common.h>
extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
const char *loglvl);
DECLARE_PER_CPU(unsigned long *, irq_stack_ptr);
-static inline bool on_stack(unsigned long sp, unsigned long size,
- unsigned long low, unsigned long high,
- enum stack_type type, struct stack_info *info)
-{
- if (!low)
- return false;
-
- if (sp < low || sp + size < sp || sp + size > high)
- return false;
-
- if (info) {
- info->low = low;
- info->high = high;
- info->type = type;
- }
- return true;
-}
-
static inline bool on_irq_stack(unsigned long sp, unsigned long size,
struct stack_info *info)
{
@@ -89,30 +57,4 @@ static inline bool on_overflow_stack(unsigned long sp, unsigned long size,
struct stack_info *info) { return false; }
#endif
-
-/*
- * We can only safely access per-cpu stacks from current in a non-preemptible
- * context.
- */
-static inline bool on_accessible_stack(const struct task_struct *tsk,
- unsigned long sp, unsigned long size,
- struct stack_info *info)
-{
- if (info)
- info->type = STACK_TYPE_UNKNOWN;
-
- if (on_task_stack(tsk, sp, size, info))
- return true;
- if (tsk != current || preemptible())
- return false;
- if (on_irq_stack(sp, size, info))
- return true;
- if (on_overflow_stack(sp, size, info))
- return true;
- if (on_sdei_stack(sp, size, info))
- return true;
-
- return false;
-}
-
#endif /* __ASM_STACKTRACE_H */
diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h
new file mode 100644
index 000000000000..f58eb944c46f
--- /dev/null
+++ b/arch/arm64/include/asm/stacktrace/common.h
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Common arm64 stack unwinder code.
+ *
+ * To implement a new arm64 stack unwinder:
+ * 1) Include this header
+ *
+ * 2) Call into unwind_next_common() from your top level unwind
+ * function, passing it the validation and translation callbacks
+ * (though the later can be NULL if no translation is required).
+ *
+ * See: arch/arm64/kernel/stacktrace.c for the reference implementation.
+ *
+ * Copyright (C) 2012 ARM Ltd.
+ */
+#ifndef __ASM_STACKTRACE_COMMON_H
+#define __ASM_STACKTRACE_COMMON_H
+
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/kprobes.h>
+#include <linux/types.h>
+
+enum stack_type {
+ STACK_TYPE_UNKNOWN,
+ STACK_TYPE_TASK,
+ STACK_TYPE_IRQ,
+ STACK_TYPE_OVERFLOW,
+ STACK_TYPE_SDEI_NORMAL,
+ STACK_TYPE_SDEI_CRITICAL,
+ STACK_TYPE_HYP,
+ __NR_STACK_TYPES
+};
+
+struct stack_info {
+ unsigned long low;
+ unsigned long high;
+ enum stack_type type;
+};
+
+/*
+ * A snapshot of a frame record or fp/lr register values, along with some
+ * accounting information necessary for robust unwinding.
+ *
+ * @fp: The fp value in the frame record (or the real fp)
+ * @pc: The lr value in the frame record (or the real lr)
+ *
+ * @stacks_done: Stacks which have been entirely unwound, for which it is no
+ * longer valid to unwind to.
+ *
+ * @prev_fp: The fp that pointed to this frame record, or a synthetic value
+ * of 0. This is used to ensure that within a stack, each
+ * subsequent frame record is at an increasing address.
+ * @prev_type: The type of stack this frame record was on, or a synthetic
+ * value of STACK_TYPE_UNKNOWN. This is used to detect a
+ * transition from one stack to another.
+ *
+ * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance
+ * associated with the most recently encountered replacement lr
+ * value.
+ *
+ * @task: The task being unwound.
+ */
+struct unwind_state {
+ unsigned long fp;
+ unsigned long pc;
+ DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES);
+ unsigned long prev_fp;
+ enum stack_type prev_type;
+#ifdef CONFIG_KRETPROBES
+ struct llist_node *kr_cur;
+#endif
+ struct task_struct *task;
+};
+
+static inline bool on_stack(unsigned long sp, unsigned long size,
+ unsigned long low, unsigned long high,
+ enum stack_type type, struct stack_info *info)
+{
+ if (!low)
+ return false;
+
+ if (sp < low || sp + size < sp || sp + size > high)
+ return false;
+
+ if (info) {
+ info->low = low;
+ info->high = high;
+ info->type = type;
+ }
+ return true;
+}
+
+static inline void unwind_init_common(struct unwind_state *state,
+ struct task_struct *task)
+{
+ state->task = task;
+#ifdef CONFIG_KRETPROBES
+ state->kr_cur = NULL;
+#endif
+
+ /*
+ * Prime the first unwind.
+ *
+ * In unwind_next() we'll check that the FP points to a valid stack,
+ * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be
+ * treated as a transition to whichever stack that happens to be. The
+ * prev_fp value won't be used, but we set it to 0 such that it is
+ * definitely not an accessible stack address.
+ */
+ bitmap_zero(state->stacks_done, __NR_STACK_TYPES);
+ state->prev_fp = 0;
+ state->prev_type = STACK_TYPE_UNKNOWN;
+}
+
+/*
+ * stack_trace_translate_fp_fn() - Translates a non-kernel frame pointer to
+ * a kernel address.
+ *
+ * @fp: the frame pointer to be updated to its kernel address.
+ * @type: the stack type associated with frame pointer @fp
+ *
+ * Returns true and success and @fp is updated to the corresponding
+ * kernel virtual address; otherwise returns false.
+ */
+typedef bool (*stack_trace_translate_fp_fn)(unsigned long *fp,
+ enum stack_type type);
+
+/*
+ * on_accessible_stack_fn() - Check whether a stack range is on any
+ * of the possible stacks.
+ *
+ * @tsk: task whose stack is being unwound
+ * @sp: stack address being checked
+ * @size: size of the stack range being checked
+ * @info: stack unwinding context
+ */
+typedef bool (*on_accessible_stack_fn)(const struct task_struct *tsk,
+ unsigned long sp, unsigned long size,
+ struct stack_info *info);
+
+static inline int unwind_next_common(struct unwind_state *state,
+ struct stack_info *info,
+ on_accessible_stack_fn accessible,
+ stack_trace_translate_fp_fn translate_fp)
+{
+ unsigned long fp = state->fp, kern_fp = fp;
+ struct task_struct *tsk = state->task;
+
+ if (fp & 0x7)
+ return -EINVAL;
+
+ if (!accessible(tsk, fp, 16, info))
+ return -EINVAL;
+
+ if (test_bit(info->type, state->stacks_done))
+ return -EINVAL;
+
+ /*
+ * If fp is not from the current address space perform the necessary
+ * translation before dereferencing it to get the next fp.
+ */
+ if (translate_fp && !translate_fp(&kern_fp, info->type))
+ return -EINVAL;
+
+ /*
+ * As stacks grow downward, any valid record on the same stack must be
+ * at a strictly higher address than the prior record.
+ *
+ * Stacks can nest in several valid orders, e.g.
+ *
+ * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL
+ * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW
+ * HYP -> OVERFLOW
+ *
+ * ... but the nesting itself is strict. Once we transition from one
+ * stack to another, it's never valid to unwind back to that first
+ * stack.
+ */
+ if (info->type == state->prev_type) {
+ if (fp <= state->prev_fp)
+ return -EINVAL;
+ } else {
+ __set_bit(state->prev_type, state->stacks_done);
+ }
+
+ /*
+ * Record this frame record's values and location. The prev_fp and
+ * prev_type are only meaningful to the next unwind_next() invocation.
+ */
+ state->fp = READ_ONCE(*(unsigned long *)(kern_fp));
+ state->pc = READ_ONCE(*(unsigned long *)(kern_fp + 8));
+ state->prev_fp = fp;
+ state->prev_type = info->type;
+
+ return 0;
+}
+
+#endif /* __ASM_STACKTRACE_COMMON_H */
diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h
new file mode 100644
index 000000000000..d5527b600390
--- /dev/null
+++ b/arch/arm64/include/asm/stacktrace/nvhe.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM nVHE hypervisor stack tracing support.
+ *
+ * The unwinder implementation depends on the nVHE mode:
+ *
+ * 1) Non-protected nVHE mode - the host can directly access the
+ * HYP stack pages and unwind the HYP stack in EL1. This saves having
+ * to allocate shared buffers for the host to read the unwinded
+ * stacktrace.
+ *
+ * 2) pKVM (protected nVHE) mode - the host cannot directly access
+ * the HYP memory. The stack is unwinded in EL2 and dumped to a shared
+ * buffer where the host can read and print the stacktrace.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+#ifndef __ASM_STACKTRACE_NVHE_H
+#define __ASM_STACKTRACE_NVHE_H
+
+#include <asm/stacktrace/common.h>
+
+/*
+ * kvm_nvhe_unwind_init - Start an unwind from the given nVHE HYP fp and pc
+ *
+ * @state : unwind_state to initialize
+ * @fp : frame pointer at which to start the unwinding.
+ * @pc : program counter at which to start the unwinding.
+ */
+static inline void kvm_nvhe_unwind_init(struct unwind_state *state,
+ unsigned long fp,
+ unsigned long pc)
+{
+ unwind_init_common(state, NULL);
+
+ state->fp = fp;
+ state->pc = pc;
+}
+
+#ifndef __KVM_NVHE_HYPERVISOR__
+/*
+ * Conventional (non-protected) nVHE HYP stack unwinder
+ *
+ * In non-protected mode, the unwinding is done from kernel proper context
+ * (by the host in EL1).
+ */
+
+DECLARE_KVM_NVHE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack);
+DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info);
+DECLARE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+
+void kvm_nvhe_dump_backtrace(unsigned long hyp_offset);
+
+#endif /* __KVM_NVHE_HYPERVISOR__ */
+#endif /* __ASM_STACKTRACE_NVHE_H */
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index fa7981d0d917..7075a9c6a4a6 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -14,6 +14,11 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong
CFLAGS_syscall.o += -fno-stack-protector
+# When KASAN is enabled, a stack trace is recorded for every alloc/free, which
+# can significantly impact performance. Avoid instrumenting the stack trace
+# collection code to minimize this impact.
+KASAN_SANITIZE_stacktrace.o := n
+
# It's not safe to invoke KCOV when portions of the kernel environment aren't
# available or are out-of-sync with HW state. Since `noinstr` doesn't always
# inhibit KCOV instrumentation, disable it for the entire compilation unit.
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 0467cb79f080..ce190ee18a20 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -7,72 +7,90 @@
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ftrace.h>
-#include <linux/kprobes.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>
#include <asm/irq.h>
-#include <asm/pointer_auth.h>
#include <asm/stack_pointer.h>
#include <asm/stacktrace.h>
/*
- * A snapshot of a frame record or fp/lr register values, along with some
- * accounting information necessary for robust unwinding.
+ * Start an unwind from a pt_regs.
*
- * @fp: The fp value in the frame record (or the real fp)
- * @pc: The lr value in the frame record (or the real lr)
+ * The unwind will begin at the PC within the regs.
*
- * @stacks_done: Stacks which have been entirely unwound, for which it is no
- * longer valid to unwind to.
+ * The regs must be on a stack currently owned by the calling task.
+ */
+static inline void unwind_init_from_regs(struct unwind_state *state,
+ struct pt_regs *regs)
+{
+ unwind_init_common(state, current);
+
+ state->fp = regs->regs[29];
+ state->pc = regs->pc;
+}
+
+/*
+ * Start an unwind from a caller.
*
- * @prev_fp: The fp that pointed to this frame record, or a synthetic value
- * of 0. This is used to ensure that within a stack, each
- * subsequent frame record is at an increasing address.
- * @prev_type: The type of stack this frame record was on, or a synthetic
- * value of STACK_TYPE_UNKNOWN. This is used to detect a
- * transition from one stack to another.
+ * The unwind will begin at the caller of whichever function this is inlined
+ * into.
*
- * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance
- * associated with the most recently encountered replacement lr
- * value.
+ * The function which invokes this must be noinline.
*/
-struct unwind_state {
- unsigned long fp;
- unsigned long pc;
- DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES);
- unsigned long prev_fp;
- enum stack_type prev_type;
-#ifdef CONFIG_KRETPROBES
- struct llist_node *kr_cur;
-#endif
-};
+static __always_inline void unwind_init_from_caller(struct unwind_state *state)
+{
+ unwind_init_common(state, current);
+
+ state->fp = (unsigned long)__builtin_frame_address(1);
+ state->pc = (unsigned long)__builtin_return_address(0);
+}
-static notrace void unwind_init(struct unwind_state *state, unsigned long fp,
- unsigned long pc)
+/*
+ * Start an unwind from a blocked task.
+ *
+ * The unwind will begin at the blocked tasks saved PC (i.e. the caller of
+ * cpu_switch_to()).
+ *
+ * The caller should ensure the task is blocked in cpu_switch_to() for the
+ * duration of the unwind, or the unwind will be bogus. It is never valid to
+ * call this for the current task.
+ */
+static inline void unwind_init_from_task(struct unwind_state *state,
+ struct task_struct *task)
{
- state->fp = fp;
- state->pc = pc;
-#ifdef CONFIG_KRETPROBES
- state->kr_cur = NULL;
-#endif
+ unwind_init_common(state, task);
+
+ state->fp = thread_saved_fp(task);
+ state->pc = thread_saved_pc(task);
+}
- /*
- * Prime the first unwind.
- *
- * In unwind_next() we'll check that the FP points to a valid stack,
- * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be
- * treated as a transition to whichever stack that happens to be. The
- * prev_fp value won't be used, but we set it to 0 such that it is
- * definitely not an accessible stack address.
- */
- bitmap_zero(state->stacks_done, __NR_STACK_TYPES);
- state->prev_fp = 0;
- state->prev_type = STACK_TYPE_UNKNOWN;
+/*
+ * We can only safely access per-cpu stacks from current in a non-preemptible
+ * context.
+ */
+static bool on_accessible_stack(const struct task_struct *tsk,
+ unsigned long sp, unsigned long size,
+ struct stack_info *info)
+{
+ if (info)
+ info->type = STACK_TYPE_UNKNOWN;
+
+ if (on_task_stack(tsk, sp, size, info))
+ return true;
+ if (tsk != current || preemptible())
+ return false;
+ if (on_irq_stack(sp, size, info))
+ return true;
+ if (on_overflow_stack(sp, size, info))
+ return true;
+ if (on_sdei_stack(sp, size, info))
+ return true;
+
+ return false;
}
-NOKPROBE_SYMBOL(unwind_init);
/*
* Unwind from one frame record (A) to the next frame record (B).
@@ -81,53 +99,20 @@ NOKPROBE_SYMBOL(unwind_init);
* records (e.g. a cycle), determined based on the location and fp value of A
* and the location (but not the fp value) of B.
*/
-static int notrace unwind_next(struct task_struct *tsk,
- struct unwind_state *state)
+static int notrace unwind_next(struct unwind_state *state)
{
+ struct task_struct *tsk = state->task;
unsigned long fp = state->fp;
struct stack_info info;
+ int err;
/* Final frame; nothing to unwind */
if (fp == (unsigned long)task_pt_regs(tsk)->stackframe)
return -ENOENT;
- if (fp & 0x7)
- return -EINVAL;
-
- if (!on_accessible_stack(tsk, fp, 16, &info))
- return -EINVAL;
-
- if (test_bit(info.type, state->stacks_done))
- return -EINVAL;
-
- /*
- * As stacks grow downward, any valid record on the same stack must be
- * at a strictly higher address than the prior record.
- *
- * Stacks can nest in several valid orders, e.g.
- *
- * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL
- * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW
- *
- * ... but the nesting itself is strict. Once we transition from one
- * stack to another, it's never valid to unwind back to that first
- * stack.
- */
- if (info.type == state->prev_type) {
- if (fp <= state->prev_fp)
- return -EINVAL;
- } else {
- set_bit(state->prev_type, state->stacks_done);
- }
-
- /*
- * Record this frame record's values and location. The prev_fp and
- * prev_type are only meaningful to the next unwind_next() invocation.
- */
- state->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp));
- state->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8));
- state->prev_fp = fp;
- state->prev_type = info.type;
+ err = unwind_next_common(state, &info, on_accessible_stack, NULL);
+ if (err)
+ return err;
state->pc = ptrauth_strip_insn_pac(state->pc);
@@ -157,8 +142,7 @@ static int notrace unwind_next(struct task_struct *tsk,
}
NOKPROBE_SYMBOL(unwind_next);
-static void notrace unwind(struct task_struct *tsk,
- struct unwind_state *state,
+static void notrace unwind(struct unwind_state *state,
stack_trace_consume_fn consume_entry, void *cookie)
{
while (1) {
@@ -166,7 +150,7 @@ static void notrace unwind(struct task_struct *tsk,
if (!consume_entry(cookie, state->pc))
break;
- ret = unwind_next(tsk, state);
+ ret = unwind_next(state);
if (ret < 0)
break;
}
@@ -212,15 +196,15 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry,
{
struct unwind_state state;
- if (regs)
- unwind_init(&state, regs->regs[29], regs->pc);
- else if (task == current)
- unwind_init(&state,
- (unsigned long)__builtin_frame_address(1),
- (unsigned long)__builtin_return_address(0));
- else
- unwind_init(&state, thread_saved_fp(task),
- thread_saved_pc(task));
-
- unwind(task, &state, consume_entry, cookie);
+ if (regs) {
+ if (task != current)
+ return;
+ unwind_init_from_regs(&state, regs);
+ } else if (task == current) {
+ unwind_init_from_caller(&state);
+ } else {
+ unwind_init_from_task(&state, task);
+ }
+
+ unwind(&state, consume_entry, cookie);
}
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 8a5fbbf084df..815cc118c675 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -56,4 +56,17 @@ config NVHE_EL2_DEBUG
If unsure, say N.
+config PROTECTED_NVHE_STACKTRACE
+ bool "Protected KVM hypervisor stacktraces"
+ depends on NVHE_EL2_DEBUG
+ default n
+ help
+ Say Y here to enable pKVM hypervisor stacktraces on hyp_panic()
+
+ If using protected nVHE mode, but cannot afford the associated
+ memory cost (less than 0.75 page per CPU) of pKVM stacktraces,
+ say N.
+
+ If unsure, or not using protected nVHE (pKVM), say N.
+
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index aa127ae9f675..5e33c2d4645a 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_KVM) += hyp/
kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
inject_fault.o va_layout.o handle_exit.o \
- guest.o debug.o reset.o sys_regs.o \
+ guest.o debug.o reset.o sys_regs.o stacktrace.o \
vgic-sys-reg-v3.o fpsimd.o pkvm.o \
arch_timer.o trng.o vmid.o \
vgic/vgic.o vgic/vgic-init.o \
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index b81f34800d3c..8fe73ee5fa84 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -49,7 +49,7 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);
DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);
-static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
+DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
unsigned long kvm_arm_hyp_percpu_base[NR_CPUS];
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index cb0bc77c16f9..bbe5b393d689 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -17,6 +17,7 @@
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/debug-monitors.h>
+#include <asm/stacktrace/nvhe.h>
#include <asm/traps.h>
#include <kvm/arm_hypercalls.h>
@@ -353,6 +354,9 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
(void *)(panic_addr + kaslr_offset()));
}
+ /* Dump the nVHE hypervisor backtrace */
+ kvm_nvhe_dump_backtrace(hyp_offset);
+
/*
* Hyp has panicked and we're going to handle that by panicking the
* kernel. The kernel offset will be revealed in the panic so we're
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index a2b0d043dddf..ed5d222e2826 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -14,7 +14,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs))
hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \
hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \
- cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o
+ cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o
hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \
../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o
hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o
diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S
index ea6a397b64a6..b6c0188c4b35 100644
--- a/arch/arm64/kvm/hyp/nvhe/host.S
+++ b/arch/arm64/kvm/hyp/nvhe/host.S
@@ -177,13 +177,8 @@ SYM_FUNC_END(__host_hvc)
b hyp_panic
.L__hyp_sp_overflow\@:
- /*
- * Reset SP to the top of the stack, to allow handling the hyp_panic.
- * This corrupts the stack but is ok, since we won't be attempting
- * any unwinding here.
- */
- ldr_this_cpu x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1
- mov sp, x0
+ /* Switch to the overflow stack */
+ adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0
b hyp_panic_bad_stack
ASM_BUG()
diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
new file mode 100644
index 000000000000..58f645ad66bc
--- /dev/null
+++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM nVHE hypervisor stack tracing support.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+#include <asm/memory.h>
+#include <asm/percpu.h>
+
+DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)
+ __aligned(16);
+
+DEFINE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info);
+
+/*
+ * hyp_prepare_backtrace - Prepare non-protected nVHE backtrace.
+ *
+ * @fp : frame pointer at which to start the unwinding.
+ * @pc : program counter at which to start the unwinding.
+ *
+ * Save the information needed by the host to unwind the non-protected
+ * nVHE hypervisor stack in EL1.
+ */
+static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc)
+{
+ struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info);
+ struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+
+ stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE);
+ stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack);
+ stacktrace_info->fp = fp;
+ stacktrace_info->pc = pc;
+}
+
+#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE
+#include <asm/stacktrace/nvhe.h>
+
+DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace);
+
+static bool on_overflow_stack(unsigned long sp, unsigned long size,
+ struct stack_info *info)
+{
+ unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack);
+ unsigned long high = low + OVERFLOW_STACK_SIZE;
+
+ return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info);
+}
+
+static bool on_hyp_stack(unsigned long sp, unsigned long size,
+ struct stack_info *info)
+{
+ struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params);
+ unsigned long high = params->stack_hyp_va;
+ unsigned long low = high - PAGE_SIZE;
+
+ return on_stack(sp, size, low, high, STACK_TYPE_HYP, info);
+}
+
+static bool on_accessible_stack(const struct task_struct *tsk,
+ unsigned long sp, unsigned long size,
+ struct stack_info *info)
+{
+ if (info)
+ info->type = STACK_TYPE_UNKNOWN;
+
+ return (on_overflow_stack(sp, size, info) ||
+ on_hyp_stack(sp, size, info));
+}
+
+static int unwind_next(struct unwind_state *state)
+{
+ struct stack_info info;
+
+ return unwind_next_common(state, &info, on_accessible_stack, NULL);
+}
+
+static void notrace unwind(struct unwind_state *state,
+ stack_trace_consume_fn consume_entry,
+ void *cookie)
+{
+ while (1) {
+ int ret;
+
+ if (!consume_entry(cookie, state->pc))
+ break;
+ ret = unwind_next(state);
+ if (ret < 0)
+ break;
+ }
+}
+
+/*
+ * pkvm_save_backtrace_entry - Saves a protected nVHE HYP stacktrace entry
+ *
+ * @arg : index of the entry in the stacktrace buffer
+ * @where : the program counter corresponding to the stack frame
+ *
+ * Save the return address of a stack frame to the shared stacktrace buffer.
+ * The host can access this shared buffer from EL1 to dump the backtrace.
+ */
+static bool pkvm_save_backtrace_entry(void *arg, unsigned long where)
+{
+ unsigned long *stacktrace = this_cpu_ptr(pkvm_stacktrace);
+ int *idx = (int *)arg;
+
+ /*
+ * Need 2 free slots: 1 for current entry and 1 for the
+ * delimiter.
+ */
+ if (*idx > ARRAY_SIZE(pkvm_stacktrace) - 2)
+ return false;
+
+ stacktrace[*idx] = where;
+ stacktrace[++*idx] = 0UL;
+
+ return true;
+}
+
+/*
+ * pkvm_save_backtrace - Saves the protected nVHE HYP stacktrace
+ *
+ * @fp : frame pointer at which to start the unwinding.
+ * @pc : program counter at which to start the unwinding.
+ *
+ * Save the unwinded stack addresses to the shared stacktrace buffer.
+ * The host can access this shared buffer from EL1 to dump the backtrace.
+ */
+static void pkvm_save_backtrace(unsigned long fp, unsigned long pc)
+{
+ struct unwind_state state;
+ int idx = 0;
+
+ kvm_nvhe_unwind_init(&state, fp, pc);
+
+ unwind(&state, pkvm_save_backtrace_entry, &idx);
+}
+#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */
+static void pkvm_save_backtrace(unsigned long fp, unsigned long pc)
+{
+}
+#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */
+
+/*
+ * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace
+ *
+ * @fp : frame pointer at which to start the unwinding.
+ * @pc : program counter at which to start the unwinding.
+ *
+ * Saves the information needed by the host to dump the nVHE hypervisor
+ * backtrace.
+ */
+void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc)
+{
+ if (is_protected_kvm_enabled())
+ pkvm_save_backtrace(fp, pc);
+ else
+ hyp_prepare_backtrace(fp, pc);
+}
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index ab0e19117dfe..9f6385702061 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -34,6 +34,8 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);
+extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
+
static void __activate_traps(struct kvm_vcpu *vcpu)
{
u64 val;
@@ -375,6 +377,10 @@ asmlinkage void __noreturn hyp_panic(void)
__sysreg_restore_state_nvhe(host_ctxt);
}
+ /* Prepare to dump kvm nvhe hyp stacktrace */
+ kvm_nvhe_prepare_backtrace((unsigned long)__builtin_frame_address(0),
+ _THIS_IP_);
+
__hyp_do_panic(host_ctxt, spsr, elr, par);
unreachable();
}
diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c
new file mode 100644
index 000000000000..949d19d603fb
--- /dev/null
+++ b/arch/arm64/kvm/stacktrace.c
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * KVM nVHE hypervisor stack tracing support.
+ *
+ * The unwinder implementation depends on the nVHE mode:
+ *
+ * 1) Non-protected nVHE mode - the host can directly access the
+ * HYP stack pages and unwind the HYP stack in EL1. This saves having
+ * to allocate shared buffers for the host to read the unwinded
+ * stacktrace.
+ *
+ * 2) pKVM (protected nVHE) mode - the host cannot directly access
+ * the HYP memory. The stack is unwinded in EL2 and dumped to a shared
+ * buffer where the host can read and print the stacktrace.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+
+#include <asm/stacktrace/nvhe.h>
+
+/*
+ * kvm_nvhe_stack_kern_va - Convert KVM nVHE HYP stack addresses to a kernel VAs
+ *
+ * The nVHE hypervisor stack is mapped in the flexible 'private' VA range, to
+ * allow for guard pages below the stack. Consequently, the fixed offset address
+ * translation macros won't work here.
+ *
+ * The kernel VA is calculated as an offset from the kernel VA of the hypervisor
+ * stack base.
+ *
+ * Returns true on success and updates @addr to its corresponding kernel VA;
+ * otherwise returns false.
+ */
+static bool kvm_nvhe_stack_kern_va(unsigned long *addr,
+ enum stack_type type)
+{
+ struct kvm_nvhe_stacktrace_info *stacktrace_info;
+ unsigned long hyp_base, kern_base, hyp_offset;
+
+ stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
+
+ switch (type) {
+ case STACK_TYPE_HYP:
+ kern_base = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page);
+ hyp_base = (unsigned long)stacktrace_info->stack_base;
+ break;
+ case STACK_TYPE_OVERFLOW:
+ kern_base = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack);
+ hyp_base = (unsigned long)stacktrace_info->overflow_stack_base;
+ break;
+ default:
+ return false;
+ }
+
+ hyp_offset = *addr - hyp_base;
+
+ *addr = kern_base + hyp_offset;
+
+ return true;
+}
+
+static bool on_overflow_stack(unsigned long sp, unsigned long size,
+ struct stack_info *info)
+{
+ struct kvm_nvhe_stacktrace_info *stacktrace_info
+ = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
+ unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base;
+ unsigned long high = low + OVERFLOW_STACK_SIZE;
+
+ return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info);
+}
+
+static bool on_hyp_stack(unsigned long sp, unsigned long size,
+ struct stack_info *info)
+{
+ struct kvm_nvhe_stacktrace_info *stacktrace_info
+ = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
+ unsigned long low = (unsigned long)stacktrace_info->stack_base;
+ unsigned long high = low + PAGE_SIZE;
+
+ return on_stack(sp, size, low, high, STACK_TYPE_HYP, info);
+}
+
+static bool on_accessible_stack(const struct task_struct *tsk,
+ unsigned long sp, unsigned long size,
+ struct stack_info *info)
+{
+ if (info)
+ info->type = STACK_TYPE_UNKNOWN;
+
+ return (on_overflow_stack(sp, size, info) ||
+ on_hyp_stack(sp, size, info));
+}
+
+static int unwind_next(struct unwind_state *state)
+{
+ struct stack_info info;
+
+ return unwind_next_common(state, &info, on_accessible_stack,
+ kvm_nvhe_stack_kern_va);
+}
+
+static void unwind(struct unwind_state *state,
+ stack_trace_consume_fn consume_entry, void *cookie)
+{
+ while (1) {
+ int ret;
+
+ if (!consume_entry(cookie, state->pc))
+ break;
+ ret = unwind_next(state);
+ if (ret < 0)
+ break;
+ }
+}
+
+/*
+ * kvm_nvhe_dump_backtrace_entry - Symbolize and print an nVHE backtrace entry
+ *
+ * @arg : the hypervisor offset, used for address translation
+ * @where : the program counter corresponding to the stack frame
+ */
+static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where)
+{
+ unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0);
+ unsigned long hyp_offset = (unsigned long)arg;
+
+ /* Mask tags and convert to kern addr */
+ where = (where & va_mask) + hyp_offset;
+ kvm_err(" [<%016lx>] %pB\n", where, (void *)(where + kaslr_offset()));
+
+ return true;
+}
+
+static void kvm_nvhe_dump_backtrace_start(void)
+{
+ kvm_err("nVHE call trace:\n");
+}
+
+static void kvm_nvhe_dump_backtrace_end(void)
+{
+ kvm_err("---[ end nVHE call trace ]---\n");
+}
+
+/*
+ * hyp_dump_backtrace - Dump the non-protected nVHE backtrace.
+ *
+ * @hyp_offset: hypervisor offset, used for address translation.
+ *
+ * The host can directly access HYP stack pages in non-protected
+ * mode, so the unwinding is done directly from EL1. This removes
+ * the need for shared buffers between host and hypervisor for
+ * the stacktrace.
+ */
+static void hyp_dump_backtrace(unsigned long hyp_offset)
+{
+ struct kvm_nvhe_stacktrace_info *stacktrace_info;
+ struct unwind_state state;
+
+ stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info);
+
+ kvm_nvhe_unwind_init(&state, stacktrace_info->fp, stacktrace_info->pc);
+
+ kvm_nvhe_dump_backtrace_start();
+ unwind(&state, kvm_nvhe_dump_backtrace_entry, (void *)hyp_offset);
+ kvm_nvhe_dump_backtrace_end();
+}
+
+#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE
+DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)],
+ pkvm_stacktrace);
+
+/*
+ * pkvm_dump_backtrace - Dump the protected nVHE HYP backtrace.
+ *
+ * @hyp_offset: hypervisor offset, used for address translation.
+ *
+ * Dumping of the pKVM HYP backtrace is done by reading the
+ * stack addresses from the shared stacktrace buffer, since the
+ * host cannot directly access hypervisor memory in protected
+ * mode.
+ */
+static void pkvm_dump_backtrace(unsigned long hyp_offset)
+{
+ unsigned long *stacktrace
+ = (unsigned long *) this_cpu_ptr_nvhe_sym(pkvm_stacktrace);
+ int i;
+
+ kvm_nvhe_dump_backtrace_start();
+ /* The saved stacktrace is terminated by a null entry */
+ for (i = 0;
+ i < ARRAY_SIZE(kvm_nvhe_sym(pkvm_stacktrace)) && stacktrace[i];
+ i++)
+ kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]);
+ kvm_nvhe_dump_backtrace_end();
+}
+#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */
+static void pkvm_dump_backtrace(unsigned long hyp_offset)
+{
+ kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n");
+}
+#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */
+
+/*
+ * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace.
+ *
+ * @hyp_offset: hypervisor offset, used for address translation.
+ */
+void kvm_nvhe_dump_backtrace(unsigned long hyp_offset)
+{
+ if (is_protected_kvm_enabled())
+ pkvm_dump_backtrace(hyp_offset);
+ else
+ hyp_dump_backtrace(hyp_offset);
+}