Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/boot/compressed/Makefile | 11
-rw-r--r--  arch/x86/boot/compressed/cpuflags.c | 4
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 33
-rw-r--r--  arch/x86/boot/compressed/ident_map_64.c | 349
-rw-r--r--  arch/x86/boot/compressed/idt_64.c | 54
-rw-r--r--  arch/x86/boot/compressed/idt_handlers_64.S | 77
-rw-r--r--  arch/x86/boot/compressed/kaslr.c | 36
-rw-r--r--  arch/x86/boot/compressed/kaslr_64.c | 153
-rw-r--r--  arch/x86/boot/compressed/misc.c | 7
-rw-r--r--  arch/x86/boot/compressed/misc.h | 50
-rw-r--r--  arch/x86/boot/compressed/sev-es.c | 214
-rw-r--r--  arch/x86/entry/entry_64.S | 80
-rw-r--r--  arch/x86/include/asm/cpu_entry_area.h | 33
-rw-r--r--  arch/x86/include/asm/cpufeatures.h | 1
-rw-r--r--  arch/x86/include/asm/desc.h | 27
-rw-r--r--  arch/x86/include/asm/desc_defs.h | 10
-rw-r--r--  arch/x86/include/asm/fpu/internal.h | 30
-rw-r--r--  arch/x86/include/asm/fpu/xcr.h | 34
-rw-r--r--  arch/x86/include/asm/idtentry.h | 50
-rw-r--r--  arch/x86/include/asm/insn-eval.h | 6
-rw-r--r--  arch/x86/include/asm/mem_encrypt.h | 5
-rw-r--r--  arch/x86/include/asm/msr-index.h | 3
-rw-r--r--  arch/x86/include/asm/page_64_types.h | 1
-rw-r--r--  arch/x86/include/asm/pgtable.h | 2
-rw-r--r--  arch/x86/include/asm/processor.h | 1
-rw-r--r--  arch/x86/include/asm/proto.h | 1
-rw-r--r--  arch/x86/include/asm/realmode.h | 7
-rw-r--r--  arch/x86/include/asm/segment.h | 2
-rw-r--r--  arch/x86/include/asm/setup.h | 6
-rw-r--r--  arch/x86/include/asm/sev-es.h | 114
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 2
-rw-r--r--  arch/x86/include/asm/svm.h | 106
-rw-r--r--  arch/x86/include/asm/trap_pf.h | 24
-rw-r--r--  arch/x86/include/asm/trapnr.h | 1
-rw-r--r--  arch/x86/include/asm/traps.h | 20
-rw-r--r--  arch/x86/include/asm/x86_init.h | 16
-rw-r--r--  arch/x86/include/uapi/asm/svm.h | 11
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 3
-rw-r--r--  arch/x86/kernel/cpu/common.c | 25
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 1
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 50
-rw-r--r--  arch/x86/kernel/dumpstack.c | 7
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 46
-rw-r--r--  arch/x86/kernel/head64.c | 122
-rw-r--r--  arch/x86/kernel/head_64.S | 165
-rw-r--r--  arch/x86/kernel/idt.c | 41
-rw-r--r--  arch/x86/kernel/kvm.c | 35
-rw-r--r--  arch/x86/kernel/nmi.c | 15
-rw-r--r--  arch/x86/kernel/sev-es-shared.c | 507
-rw-r--r--  arch/x86/kernel/sev-es.c | 1404
-rw-r--r--  arch/x86/kernel/smpboot.c | 2
-rw-r--r--  arch/x86/kernel/traps.c | 48
-rw-r--r--  arch/x86/kernel/umip.c | 49
-rw-r--r--  arch/x86/kvm/svm/nested.c | 47
-rw-r--r--  arch/x86/kvm/svm/svm.c | 2
-rw-r--r--  arch/x86/lib/insn-eval.c | 130
-rw-r--r--  arch/x86/mm/cpu_entry_area.c | 3
-rw-r--r--  arch/x86/mm/extable.c | 1
-rw-r--r--  arch/x86/mm/mem_encrypt.c | 38
-rw-r--r--  arch/x86/mm/mem_encrypt_identity.c | 3
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 10
-rw-r--r--  arch/x86/realmode/init.c | 24
-rw-r--r--  arch/x86/realmode/rm/header.S | 3
-rw-r--r--  arch/x86/realmode/rm/trampoline_64.S | 20
-rw-r--r--  arch/x86/tools/gen-insn-attr-x86.awk | 50
67 files changed, 3986 insertions, 450 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 255084c65138..f6946b81f74a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1523,6 +1523,7 @@ config AMD_MEM_ENCRYPT
select DYNAMIC_PHYSICAL_MASK
select ARCH_USE_MEMREMAP_PROT
select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+ select INSTRUCTION_DECODER
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 4fb989ef5665..ee249088cbfe 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -32,7 +32,7 @@ KBUILD_CFLAGS := -m$(BITS) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIE
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
cflags-$(CONFIG_X86_32) := -march=i386
-cflags-$(CONFIG_X86_64) := -mcmodel=small
+cflags-$(CONFIG_X86_64) := -mcmodel=small -mno-red-zone
KBUILD_CFLAGS += $(cflags-y)
KBUILD_CFLAGS += -mno-mmx -mno-sse
KBUILD_CFLAGS += -ffreestanding
@@ -47,6 +47,11 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
+# sev-es.c indirectly includes inat-table.h which is generated during
+# compilation and stored in $(objtree). Add the directory to the includes so
+# that the compiler finds it even with out-of-tree builds (make O=/some/path).
+CFLAGS_sev-es.o += -I$(objtree)/arch/x86/lib/
+
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
UBSAN_SANITIZE :=n
@@ -81,9 +86,11 @@ vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o
vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
ifdef CONFIG_X86_64
- vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr_64.o
+ vmlinux-objs-y += $(obj)/ident_map_64.o
+ vmlinux-objs-y += $(obj)/idt_64.o $(obj)/idt_handlers_64.o
vmlinux-objs-y += $(obj)/mem_encrypt.o
vmlinux-objs-y += $(obj)/pgtable_64.o
+ vmlinux-objs-$(CONFIG_AMD_MEM_ENCRYPT) += $(obj)/sev-es.o
endif
vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
diff --git a/arch/x86/boot/compressed/cpuflags.c b/arch/x86/boot/compressed/cpuflags.c
index 6448a8196d32..0cc1323896d1 100644
--- a/arch/x86/boot/compressed/cpuflags.c
+++ b/arch/x86/boot/compressed/cpuflags.c
@@ -1,6 +1,4 @@
// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_RANDOMIZE_BASE
-
#include "../cpuflags.c"
bool has_cpuflag(int flag)
@@ -9,5 +7,3 @@ bool has_cpuflag(int flag)
return test_bit(flag, cpu.flags);
}
-
-#endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 9e46729cf162..1c80f1738fd9 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -33,6 +33,7 @@
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
#include <asm/bootparam.h>
+#include <asm/desc_defs.h>
#include "pgtable.h"
/*
@@ -415,6 +416,10 @@ SYM_CODE_START(startup_64)
.Lon_kernel_cs:
+ pushq %rsi
+ call load_stage1_idt
+ popq %rsi
+
/*
* paging_prepare() sets up the trampoline and checks if we need to
* enable 5-level paging.
@@ -528,6 +533,21 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
rep stosq
/*
+ * If running as an SEV guest, the encryption mask is required in the
+ * page-table setup code below. When the guest also has SEV-ES enabled
+ * set_sev_encryption_mask() will cause #VC exceptions, but the stage2
+ * handler can't map its GHCB because the page-table is not set up yet.
+ * So set up the encryption mask here while still on the stage1 #VC
+ * handler. Then load stage2 IDT and switch to the kernel's own
+ * page-table.
+ */
+ pushq %rsi
+ call set_sev_encryption_mask
+ call load_stage2_idt
+ call initialize_identity_maps
+ popq %rsi
+
+/*
* Do the extraction, and jump to the new kernel..
*/
pushq %rsi /* Save the real mode argument */
@@ -659,10 +679,21 @@ SYM_DATA_START_LOCAL(gdt)
.quad 0x0000000000000000 /* TS continued */
SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
+SYM_DATA_START(boot_idt_desc)
+ .word boot_idt_end - boot_idt - 1
+ .quad 0
+SYM_DATA_END(boot_idt_desc)
+ .balign 8
+SYM_DATA_START(boot_idt)
+ .rept BOOT_IDT_ENTRIES
+ .quad 0
+ .quad 0
+ .endr
+SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
+
#ifdef CONFIG_EFI_STUB
SYM_DATA(image_offset, .long 0)
#endif
-
#ifdef CONFIG_EFI_MIXED
SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0, 0)
SYM_DATA(efi_is64, .byte 1)
diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c
new file mode 100644
index 000000000000..063a60edcf99
--- /dev/null
+++ b/arch/x86/boot/compressed/ident_map_64.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This code is used on x86_64 to create page table identity mappings on
+ * demand by building up a new set of page tables (or appending to the
+ * existing ones), and then switching over to them when ready.
+ *
+ * Copyright (C) 2015-2016 Yinghai Lu
+ * Copyright (C) 2016 Kees Cook
+ */
+
+/*
+ * Since we're dealing with identity mappings, physical and virtual
+ * addresses are the same, so override these defines which are ultimately
+ * used by the headers in misc.h.
+ */
+#define __pa(x) ((unsigned long)(x))
+#define __va(x) ((void *)((unsigned long)(x)))
+
+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
+#include "error.h"
+#include "misc.h"
+
+/* These actually do the work of building the kernel identity maps. */
+#include <linux/pgtable.h>
+#include <asm/cmpxchg.h>
+#include <asm/trap_pf.h>
+#include <asm/trapnr.h>
+#include <asm/init.h>
+/* Use the static base for this part of the boot process */
+#undef __PAGE_OFFSET
+#define __PAGE_OFFSET __PAGE_OFFSET_BASE
+#include "../../mm/ident_map.c"
+
+#ifdef CONFIG_X86_5LEVEL
+unsigned int __pgtable_l5_enabled;
+unsigned int pgdir_shift = 39;
+unsigned int ptrs_per_p4d = 1;
+#endif
+
+/* Used by PAGE_KERN* macros: */
+pteval_t __default_kernel_pte_mask __read_mostly = ~0;
+
+/* Used to track our page table allocation area. */
+struct alloc_pgt_data {
+ unsigned char *pgt_buf;
+ unsigned long pgt_buf_size;
+ unsigned long pgt_buf_offset;
+};
+
+/*
+ * Allocates space for a page table entry, using struct alloc_pgt_data
+ * above. Besides the local callers, this is used as the allocation
+ * callback in mapping_info below.
+ */
+static void *alloc_pgt_page(void *context)
+{
+ struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
+ unsigned char *entry;
+
+ /* Validate there is space available for a new page. */
+ if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
+ debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
+ debug_putaddr(pages->pgt_buf_offset);
+ debug_putaddr(pages->pgt_buf_size);
+ return NULL;
+ }
+
+ entry = pages->pgt_buf + pages->pgt_buf_offset;
+ pages->pgt_buf_offset += PAGE_SIZE;
+
+ return entry;
+}
+
+/* Used to track our allocated page tables. */
+static struct alloc_pgt_data pgt_data;
+
+/* The top level page table entry pointer. */
+static unsigned long top_level_pgt;
+
+phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
+
+/*
+ * Mapping information structure passed to kernel_ident_mapping_init().
+ * Due to relocation, pointers must be assigned at run time not build time.
+ */
+static struct x86_mapping_info mapping_info;
+
+/*
+ * Adds the specified range to the identity mappings.
+ */
+static void add_identity_map(unsigned long start, unsigned long end)
+{
+ int ret;
+
+ /* Align boundary to 2M. */
+ start = round_down(start, PMD_SIZE);
+ end = round_up(end, PMD_SIZE);
+ if (start >= end)
+ return;
+
+ /* Build the mapping. */
+ ret = kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt, start, end);
+ if (ret)
+ error("Error: kernel_ident_mapping_init() failed\n");
+}
+
+/* Locates and clears a region for a new top level page table. */
+void initialize_identity_maps(void)
+{
+ /* Exclude the encryption mask from __PHYSICAL_MASK */
+ physical_mask &= ~sme_me_mask;
+
+ /* Init mapping_info with run-time function/buffer pointers. */
+ mapping_info.alloc_pgt_page = alloc_pgt_page;
+ mapping_info.context = &pgt_data;
+ mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
+ mapping_info.kernpg_flag = _KERNPG_TABLE;
+
+ /*
+ * It should be impossible for this not to already be true,
+ * but since calling this a second time would rewind the other
+ * counters, let's just make sure this is reset too.
+ */
+ pgt_data.pgt_buf_offset = 0;
+
+ /*
+ * If we came here via startup_32(), cr3 will be _pgtable already
+ * and we must append to the existing area instead of entirely
+ * overwriting it.
+ *
+ * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
+ * the top-level page table is allocated separately.
+ *
+ * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
+ * cases. On 4-level paging it's equal to 'top_level_pgt'.
+ */
+ top_level_pgt = read_cr3_pa();
+ if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
+ pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
+ pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
+ memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+ } else {
+ pgt_data.pgt_buf = _pgtable;
+ pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
+ memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
+ top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
+ }
+
+ /*
+ * New page-table is set up - map the kernel image and load it
+ * into cr3.
+ */
+ add_identity_map((unsigned long)_head, (unsigned long)_end);
+ write_cr3(top_level_pgt);
+}
+
+/*
+ * This switches the page tables to the new level4 that has been built
+ * via calls to add_identity_map() above. If booted via startup_32(),
+ * this is effectively a no-op.
+ */
+void finalize_identity_maps(void)
+{
+ write_cr3(top_level_pgt);
+}
+
+static pte_t *split_large_pmd(struct x86_mapping_info *info,
+ pmd_t *pmdp, unsigned long __address)
+{
+ unsigned long page_flags;
+ unsigned long address;
+ pte_t *pte;
+ pmd_t pmd;
+ int i;
+
+ pte = (pte_t *)info->alloc_pgt_page(info->context);
+ if (!pte)
+ return NULL;
+
+ address = __address & PMD_MASK;
+ /* No large page - clear PSE flag */
+ page_flags = info->page_flag & ~_PAGE_PSE;
+
+ /* Populate the PTEs */
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ set_pte(&pte[i], __pte(address | page_flags));
+ address += PAGE_SIZE;
+ }
+
+ /*
+ * Ideally we need to clear the large PMD first and do a TLB
+ * flush before we write the new PMD. But the 2M range of the
+ * PMD might contain the code we execute and/or the stack
+ * we are on, so we can't do that. But that should be safe here
+ * because we are going from large to small mappings and we are
+ * also the only user of the page-table, so there is no chance
+ * of a TLB multihit.
+ */
+ pmd = __pmd((unsigned long)pte | info->kernpg_flag);
+ set_pmd(pmdp, pmd);
+ /* Flush TLB to establish the new PMD */
+ write_cr3(top_level_pgt);
+
+ return pte + pte_index(__address);
+}
+
+static void clflush_page(unsigned long address)
+{
+ unsigned int flush_size;
+ char *cl, *start, *end;
+
+ /*
+ * Hardcode cl-size to 64 - CPUID can't be used here because that might
+ * cause another #VC exception and the GHCB is not ready to use yet.
+ */
+ flush_size = 64;
+ start = (char *)(address & PAGE_MASK);
+ end = start + PAGE_SIZE;
+
+ /*
+ * First make sure there are no pending writes on the cache-lines to
+ * flush.
+ */
+ asm volatile("mfence" : : : "memory");
+
+ for (cl = start; cl != end; cl += flush_size)
+ clflush(cl);
+}
+
+static int set_clr_page_flags(struct x86_mapping_info *info,
+ unsigned long address,
+ pteval_t set, pteval_t clr)
+{
+ pgd_t *pgdp = (pgd_t *)top_level_pgt;
+ p4d_t *p4dp;
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep, pte;
+
+ /*
+ * First make sure there is a PMD mapping for 'address'.
+ * It should already exist, but keep things generic.
+ *
+ * To map the page just read from it and fault it in if there is no
+ * mapping yet. add_identity_map() can't be called here because that
+ * would unconditionally map the address on PMD level, destroying any
+ * PTE-level mappings that might already exist. Use assembly here so
+ * the access won't be optimized away.
+ */
+ asm volatile("mov %[address], %%r9"
+ :: [address] "g" (*(unsigned long *)address)
+ : "r9", "memory");
+
+ /*
+ * The page is mapped at least with PMD size - so skip checks and walk
+ * directly to the PMD.
+ */
+ p4dp = p4d_offset(pgdp, address);
+ pudp = pud_offset(p4dp, address);
+ pmdp = pmd_offset(pudp, address);
+
+ if (pmd_large(*pmdp))
+ ptep = split_large_pmd(info, pmdp, address);
+ else
+ ptep = pte_offset_kernel(pmdp, address);
+
+ if (!ptep)
+ return -ENOMEM;
+
+ /*
+ * Changing encryption attributes of a page requires to flush it from
+ * the caches.
+ */
+ if ((set | clr) & _PAGE_ENC)
+ clflush_page(address);
+
+ /* Update PTE */
+ pte = *ptep;
+ pte = pte_set_flags(pte, set);
+ pte = pte_clear_flags(pte, clr);
+ set_pte(ptep, pte);
+
+ /* Flush TLB after changing encryption attribute */
+ write_cr3(top_level_pgt);
+
+ return 0;
+}
+
+int set_page_decrypted(unsigned long address)
+{
+ return set_clr_page_flags(&mapping_info, address, 0, _PAGE_ENC);
+}
+
+int set_page_encrypted(unsigned long address)
+{
+ return set_clr_page_flags(&mapping_info, address, _PAGE_ENC, 0);
+}
+
+int set_page_non_present(unsigned long address)
+{
+ return set_clr_page_flags(&mapping_info, address, 0, _PAGE_PRESENT);
+}
+
+static void do_pf_error(const char *msg, unsigned long error_code,
+ unsigned long address, unsigned long ip)
+{
+ error_putstr(msg);
+
+ error_putstr("\nError Code: ");
+ error_puthex(error_code);
+ error_putstr("\nCR2: 0x");
+ error_puthex(address);
+ error_putstr("\nRIP relative to _head: 0x");
+ error_puthex(ip - (unsigned long)_head);
+ error_putstr("\n");
+
+ error("Stopping.\n");
+}
+
+void do_boot_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ unsigned long address = native_read_cr2();
+ unsigned long end;
+ bool ghcb_fault;
+
+ ghcb_fault = sev_es_check_ghcb_fault(address);
+
+ address &= PMD_MASK;
+ end = address + PMD_SIZE;
+
+ /*
+ * Check for unexpected error codes. Unexpected are:
+ * - Faults on present pages
+ * - User faults
+ * - Reserved bits set
+ */
+ if (error_code & (X86_PF_PROT | X86_PF_USER | X86_PF_RSVD))
+ do_pf_error("Unexpected page-fault:", error_code, address, regs->ip);
+ else if (ghcb_fault)
+ do_pf_error("Page-fault on GHCB page:", error_code, address, regs->ip);
+
+ /*
+ * Error code is sane - now identity map the 2M region around
+ * the faulting address.
+ */
+ add_identity_map(address, end);
+}
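
The three helpers above (set_page_decrypted(), set_page_encrypted(), set_page_non_present()) are the interface the boot stub uses to share a page with the hypervisor and to take it back. A minimal usage sketch, mirroring what arch/x86/boot/compressed/sev-es.c does further down; the function and page names are made up for illustration:

	/* Sketch: share a page with the hypervisor, then reclaim it. */
	static char example_page[PAGE_SIZE] __aligned(PAGE_SIZE);

	static int example_share_page(void)
	{
		if (set_page_decrypted((unsigned long)example_page))
			return -EFAULT;			/* could not clear the C-bit */

		memset(example_page, 0, PAGE_SIZE);	/* now shared - wipe stale encrypted data */
		/* ... exchange data with the hypervisor ... */

		if (set_page_encrypted((unsigned long)example_page))
			return -EFAULT;			/* re-encrypt before normal use resumes */

		/* Catch stray accesses after the page is retired */
		return set_page_non_present((unsigned long)example_page);
	}
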
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
new file mode 100644
index 000000000000..804a502ee0d2
--- /dev/null
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <asm/trap_pf.h>
+#include <asm/segment.h>
+#include <asm/trapnr.h>
+#include "misc.h"
+
+static void set_idt_entry(int vector, void (*handler)(void))
+{
+ unsigned long address = (unsigned long)handler;
+ gate_desc entry;
+
+ memset(&entry, 0, sizeof(entry));
+
+ entry.offset_low = (u16)(address & 0xffff);
+ entry.segment = __KERNEL_CS;
+ entry.bits.type = GATE_TRAP;
+ entry.bits.p = 1;
+ entry.offset_middle = (u16)((address >> 16) & 0xffff);
+ entry.offset_high = (u32)(address >> 32);
+
+ memcpy(&boot_idt[vector], &entry, sizeof(entry));
+}
+
+/* Have this here so we don't need to include <asm/desc.h> */
+static void load_boot_idt(const struct desc_ptr *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+/* Set up the IDT before the kernel jumps to .Lrelocated */
+void load_stage1_idt(void)
+{
+ boot_idt_desc.address = (unsigned long)boot_idt;
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ set_idt_entry(X86_TRAP_VC, boot_stage1_vc);
+
+ load_boot_idt(&boot_idt_desc);
+}
+
+/* Set up the IDT after the kernel has jumped to .Lrelocated */
+void load_stage2_idt(void)
+{
+ boot_idt_desc.address = (unsigned long)boot_idt;
+
+ set_idt_entry(X86_TRAP_PF, boot_page_fault);
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ set_idt_entry(X86_TRAP_VC, boot_stage2_vc);
+#endif
+
+ load_boot_idt(&boot_idt_desc);
+}
diff --git a/arch/x86/boot/compressed/idt_handlers_64.S b/arch/x86/boot/compressed/idt_handlers_64.S
new file mode 100644
index 000000000000..22890e199f5b
--- /dev/null
+++ b/arch/x86/boot/compressed/idt_handlers_64.S
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Early IDT handler entry points
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#include <asm/segment.h>
+
+/* For ORIG_RAX */
+#include "../../entry/calling.h"
+
+.macro EXCEPTION_HANDLER name function error_code=0
+SYM_FUNC_START(\name)
+
+ /* Build pt_regs */
+ .if \error_code == 0
+ pushq $0
+ .endif
+
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ /* Call handler with pt_regs */
+ movq %rsp, %rdi
+ /* Error code is second parameter */
+ movq ORIG_RAX(%rsp), %rsi
+ call \function
+
+ /* Restore regs */
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
+
+ /* Remove error code and return */
+ addq $8, %rsp
+
+ iretq
+SYM_FUNC_END(\name)
+ .endm
+
+ .text
+ .code64
+
+EXCEPTION_HANDLER boot_page_fault do_boot_page_fault error_code=1
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+EXCEPTION_HANDLER boot_stage1_vc do_vc_no_ghcb error_code=1
+EXCEPTION_HANDLER boot_stage2_vc do_boot_stage2_vc error_code=1
+#endif
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index 877970d76249..b59547ce5b19 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -40,17 +40,8 @@
#include <asm/setup.h> /* For COMMAND_LINE_SIZE */
#undef _SETUP
-#ifdef CONFIG_X86_5LEVEL
-unsigned int __pgtable_l5_enabled;
-unsigned int pgdir_shift __ro_after_init = 39;
-unsigned int ptrs_per_p4d __ro_after_init = 1;
-#endif
-
extern unsigned long get_cmd_line_ptr(void);
-/* Used by PAGE_KERN* macros: */
-pteval_t __default_kernel_pte_mask __read_mostly = ~0;
-
/* Simplified build-specific string for starting entropy. */
static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
@@ -406,8 +397,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
*/
mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
- add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
- mem_avoid[MEM_AVOID_ZO_RANGE].size);
/* Avoid initrd. */
initrd_start = (u64)boot_params->ext_ramdisk_image << 32;
@@ -425,15 +414,11 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
cmd_line_size = strnlen((char *)cmd_line, COMMAND_LINE_SIZE-1) + 1;
mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
- add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
- mem_avoid[MEM_AVOID_CMDLINE].size);
}
/* Avoid boot parameters. */
mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
- add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
- mem_avoid[MEM_AVOID_BOOTPARAMS].size);
/* We don't need to set a mapping for setup_data. */
@@ -442,11 +427,6 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
/* Enumerate the immovable memory regions */
num_immovable_mem = count_immovable_mem_regions();
-
-#ifdef CONFIG_X86_VERBOSE_BOOTUP
- /* Make sure video RAM can be used. */
- add_identity_map(0, PMD_SIZE);
-#endif
}
/*
@@ -870,9 +850,6 @@ void choose_random_location(unsigned long input,
boot_params->hdr.loadflags |= KASLR_FLAG;
- /* Prepare to add new identity pagetables on demand. */
- initialize_identity_maps();
-
if (IS_ENABLED(CONFIG_X86_32))
mem_limit = KERNEL_IMAGE_SIZE;
else
@@ -896,19 +873,8 @@ void choose_random_location(unsigned long input,
warn("Physical KASLR disabled: no suitable memory region!");
} else {
/* Update the new physical address location. */
- if (*output != random_addr) {
- add_identity_map(random_addr, output_size);
+ if (*output != random_addr)
*output = random_addr;
- }
-
- /*
- * This loads the identity mapping page table.
- * This should only be done if a new physical address
- * is found for the kernel, otherwise we should keep
- * the old page table to make it be like the "nokaslr"
- * case.
- */
- finalize_identity_maps();
}
diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c
deleted file mode 100644
index f9c5c13d979b..000000000000
--- a/arch/x86/boot/compressed/kaslr_64.c
+++ /dev/null
@@ -1,153 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This code is used on x86_64 to create page table identity mappings on
- * demand by building up a new set of page tables (or appending to the
- * existing ones), and then switching over to them when ready.
- *
- * Copyright (C) 2015-2016 Yinghai Lu
- * Copyright (C) 2016 Kees Cook
- */
-
-/*
- * Since we're dealing with identity mappings, physical and virtual
- * addresses are the same, so override these defines which are ultimately
- * used by the headers in misc.h.
- */
-#define __pa(x) ((unsigned long)(x))
-#define __va(x) ((void *)((unsigned long)(x)))
-
-/* No PAGE_TABLE_ISOLATION support needed either: */
-#undef CONFIG_PAGE_TABLE_ISOLATION
-
-#include "misc.h"
-
-/* These actually do the work of building the kernel identity maps. */
-#include <linux/pgtable.h>
-#include <asm/init.h>
-/* Use the static base for this part of the boot process */
-#undef __PAGE_OFFSET
-#define __PAGE_OFFSET __PAGE_OFFSET_BASE
-#include "../../mm/ident_map.c"
-
-/* Used to track our page table allocation area. */
-struct alloc_pgt_data {
- unsigned char *pgt_buf;
- unsigned long pgt_buf_size;
- unsigned long pgt_buf_offset;
-};
-
-/*
- * Allocates space for a page table entry, using struct alloc_pgt_data
- * above. Besides the local callers, this is used as the allocation
- * callback in mapping_info below.
- */
-static void *alloc_pgt_page(void *context)
-{
- struct alloc_pgt_data *pages = (struct alloc_pgt_data *)context;
- unsigned char *entry;
-
- /* Validate there is space available for a new page. */
- if (pages->pgt_buf_offset >= pages->pgt_buf_size) {
- debug_putstr("out of pgt_buf in " __FILE__ "!?\n");
- debug_putaddr(pages->pgt_buf_offset);
- debug_putaddr(pages->pgt_buf_size);
- return NULL;
- }
-
- entry = pages->pgt_buf + pages->pgt_buf_offset;
- pages->pgt_buf_offset += PAGE_SIZE;
-
- return entry;
-}
-
-/* Used to track our allocated page tables. */
-static struct alloc_pgt_data pgt_data;
-
-/* The top level page table entry pointer. */
-static unsigned long top_level_pgt;
-
-phys_addr_t physical_mask = (1ULL << __PHYSICAL_MASK_SHIFT) - 1;
-
-/*
- * Mapping information structure passed to kernel_ident_mapping_init().
- * Due to relocation, pointers must be assigned at run time not build time.
- */
-static struct x86_mapping_info mapping_info;
-
-/* Locates and clears a region for a new top level page table. */
-void initialize_identity_maps(void)
-{
- /* If running as an SEV guest, the encryption mask is required. */
- set_sev_encryption_mask();
-
- /* Exclude the encryption mask from __PHYSICAL_MASK */
- physical_mask &= ~sme_me_mask;
-
- /* Init mapping_info with run-time function/buffer pointers. */
- mapping_info.alloc_pgt_page = alloc_pgt_page;
- mapping_info.context = &pgt_data;
- mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask;
- mapping_info.kernpg_flag = _KERNPG_TABLE;
-
- /*
- * It should be impossible for this not to already be true,
- * but since calling this a second time would rewind the other
- * counters, let's just make sure this is reset too.
- */
- pgt_data.pgt_buf_offset = 0;
-
- /*
- * If we came here via startup_32(), cr3 will be _pgtable already
- * and we must append to the existing area instead of entirely
- * overwriting it.
- *
- * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
- * the top-level page table is allocated separately.
- *
- * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
- * cases. On 4-level paging it's equal to 'top_level_pgt'.
- */
- top_level_pgt = read_cr3_pa();
- if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
- debug_putstr("booted via startup_32()\n");
- pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
- pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
- memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
- } else {
- debug_putstr("booted via startup_64()\n");
- pgt_data.pgt_buf = _pgtable;
- pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
- memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
- top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
- }
-}
-
-/*
- * Adds the specified range to what will become the new identity mappings.
- * Once all ranges have been added, the new mapping is activated by calling
- * finalize_identity_maps() below.
- */
-void add_identity_map(unsigned long start, unsigned long size)
-{
- unsigned long end = start + size;
-
- /* Align boundary to 2M. */
- start = round_down(start, PMD_SIZE);
- end = round_up(end, PMD_SIZE);
- if (start >= end)
- return;
-
- /* Build the mapping. */
- kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
- start, end);
-}
-
-/*
- * This switches the page tables to the new level4 that has been built
- * via calls to add_identity_map() above. If booted via startup_32(),
- * this is effectively a no-op.
- */
-void finalize_identity_maps(void)
-{
- write_cr3(top_level_pgt);
-}
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index e478e40fbe5a..267e7f93050e 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -442,6 +442,13 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
parse_elf(output);
handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n");
+
+ /*
+ * Flush GHCB from cache and map it encrypted again when running as
+ * SEV-ES guest.
+ */
+ sev_es_shutdown_ghcb();
+
return output;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3efce27ba35c..6d31f1b4c4d1 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -23,6 +23,7 @@
#include <asm/page.h>
#include <asm/boot.h>
#include <asm/bootparam.h>
+#include <asm/desc_defs.h>
#define BOOT_CTYPE_H
#include <linux/acpi.h>
@@ -36,6 +37,9 @@
#define memptr unsigned
#endif
+/* boot/compressed/vmlinux start and end markers */
+extern char _head[], _end[];
+
/* misc.c */
extern memptr free_mem_ptr;
extern memptr free_mem_end_ptr;
@@ -81,8 +85,6 @@ void choose_random_location(unsigned long input,
unsigned long *output,
unsigned long output_size,
unsigned long *virt_addr);
-/* cpuflags.c */
-bool has_cpuflag(int flag);
#else
static inline void choose_random_location(unsigned long input,
unsigned long input_size,
@@ -93,18 +95,14 @@ static inline void choose_random_location(unsigned long input,
}
#endif
+/* cpuflags.c */
+bool has_cpuflag(int flag);
+
#ifdef CONFIG_X86_64
-void initialize_identity_maps(void);
-void add_identity_map(unsigned long start, unsigned long size);
-void finalize_identity_maps(void);
+extern int set_page_decrypted(unsigned long address);
+extern int set_page_encrypted(unsigned long address);
+extern int set_page_non_present(unsigned long address);
extern unsigned char _pgtable[];
-#else
-static inline void initialize_identity_maps(void)
-{ }
-static inline void add_identity_map(unsigned long start, unsigned long size)
-{ }
-static inline void finalize_identity_maps(void)
-{ }
#endif
#ifdef CONFIG_EARLY_PRINTK
@@ -119,6 +117,17 @@ static inline void console_init(void)
void set_sev_encryption_mask(void);
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+void sev_es_shutdown_ghcb(void);
+extern bool sev_es_check_ghcb_fault(unsigned long address);
+#else
+static inline void sev_es_shutdown_ghcb(void) { }
+static inline bool sev_es_check_ghcb_fault(unsigned long address)
+{
+ return false;
+}
+#endif
+
/* acpi.c */
#ifdef CONFIG_ACPI
acpi_physical_address get_rsdp_addr(void);
@@ -133,4 +142,21 @@ int count_immovable_mem_regions(void);
static inline int count_immovable_mem_regions(void) { return 0; }
#endif
+/* ident_map_64.c */
+#ifdef CONFIG_X86_5LEVEL
+extern unsigned int __pgtable_l5_enabled, pgdir_shift, ptrs_per_p4d;
+#endif
+
+/* Used by PAGE_KERN* macros: */
+extern pteval_t __default_kernel_pte_mask;
+
+/* idt_64.c */
+extern gate_desc boot_idt[BOOT_IDT_ENTRIES];
+extern struct desc_ptr boot_idt_desc;
+
+/* IDT Entry Points */
+void boot_page_fault(void);
+void boot_stage1_vc(void);
+void boot_stage2_vc(void);
+
#endif /* BOOT_COMPRESSED_MISC_H */
diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c
new file mode 100644
index 000000000000..954cb2702e23
--- /dev/null
+++ b/arch/x86/boot/compressed/sev-es.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+/*
+ * misc.h needs to be first because it knows how to include the other kernel
+ * headers in the pre-decompression code in a way that does not break
+ * compilation.
+ */
+#include "misc.h"
+
+#include <asm/pgtable_types.h>
+#include <asm/sev-es.h>
+#include <asm/trapnr.h>
+#include <asm/trap_pf.h>
+#include <asm/msr-index.h>
+#include <asm/fpu/xcr.h>
+#include <asm/ptrace.h>
+#include <asm/svm.h>
+
+#include "error.h"
+
+struct ghcb boot_ghcb_page __aligned(PAGE_SIZE);
+struct ghcb *boot_ghcb;
+
+/*
+ * Copy a version of this function here - insn-eval.c can't be used in
+ * pre-decompression code.
+ */
+static bool insn_has_rep_prefix(struct insn *insn)
+{
+ int i;
+
+ insn_get_prefixes(insn);
+
+ for (i = 0; i < insn->prefixes.nbytes; i++) {
+ insn_byte_t p = insn->prefixes.bytes[i];
+
+ if (p == 0xf2 || p == 0xf3)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Only a dummy for insn_get_seg_base() - early boot code is 64-bit only and
+ * doesn't use segments.
+ */
+static unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+ return 0UL;
+}
+
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ unsigned long low, high;
+
+ asm volatile("rdmsr" : "=a" (low), "=d" (high) :
+ "c" (MSR_AMD64_SEV_ES_GHCB));
+
+ return ((high << 32) | low);
+}
+
+static inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ u32 low, high;
+
+ low = val & 0xffffffffUL;
+ high = val >> 32;
+
+ asm volatile("wrmsr" : : "c" (MSR_AMD64_SEV_ES_GHCB),
+ "a"(low), "d" (high) : "memory");
+}
+
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ enum es_result ret;
+
+ memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+
+ insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1);
+ insn_get_length(&ctxt->insn);
+
+ ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
+
+ return ret;
+}
+
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ void *dst, char *buf, size_t size)
+{
+ memcpy(dst, buf, size);
+
+ return ES_OK;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ void *src, char *buf, size_t size)
+{
+ memcpy(buf, src, size);
+
+ return ES_OK;
+}
+
+#undef __init
+#undef __pa
+#define __init
+#define __pa(x) ((unsigned long)(x))
+
+#define __BOOT_COMPRESSED
+
+/* Basic instruction decoding support needed */
+#include "../../lib/inat.c"
+#include "../../lib/insn.c"
+
+/* Include code for early handlers */
+#include "../../kernel/sev-es-shared.c"
+
+static bool early_setup_sev_es(void)
+{
+ if (!sev_es_negotiate_protocol())
+ sev_es_terminate(GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED);
+
+ if (set_page_decrypted((unsigned long)&boot_ghcb_page))
+ return false;
+
+ /* Page is now mapped decrypted, clear it */
+ memset(&boot_ghcb_page, 0, sizeof(boot_ghcb_page));
+
+ boot_ghcb = &boot_ghcb_page;
+
+ /* Initialize lookup tables for the instruction decoder */
+ inat_init_tables();
+
+ return true;
+}
+
+void sev_es_shutdown_ghcb(void)
+{
+ if (!boot_ghcb)
+ return;
+
+ if (!sev_es_check_cpu_features())
+ error("SEV-ES CPU Features missing.");
+
+ /*
+ * GHCB Page must be flushed from the cache and mapped encrypted again.
+ * Otherwise the running kernel will see strange cache effects when
+ * trying to use that page.
+ */
+ if (set_page_encrypted((unsigned long)&boot_ghcb_page))
+ error("Can't map GHCB page encrypted");
+
+ /*
+ * GHCB page is mapped encrypted again and flushed from the cache.
+ * Mark it non-present now to catch bugs when #VC exceptions trigger
+ * after this point.
+ */
+ if (set_page_non_present((unsigned long)&boot_ghcb_page))
+ error("Can't unmap GHCB page");
+}
+
+bool sev_es_check_ghcb_fault(unsigned long address)
+{
+ /* Check whether the fault was on the GHCB page */
+ return ((address & PAGE_MASK) == (unsigned long)&boot_ghcb_page);
+}
+
+void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
+{
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ if (!boot_ghcb && !early_setup_sev_es())
+ sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+
+ vc_ghcb_invalidate(boot_ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result != ES_OK)
+ goto finish;
+
+ switch (exit_code) {
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(boot_ghcb, &ctxt, exit_code);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(boot_ghcb, &ctxt);
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(boot_ghcb, &ctxt);
+ break;
+ default:
+ result = ES_UNSUPPORTED;
+ break;
+ }
+
+finish:
+ if (result == ES_OK) {
+ vc_finish_insn(&ctxt);
+ } else if (result != ES_RETRY) {
+ /*
+ * For now, just halt the machine. That makes debugging easier;
+ * later this can simply call sev_es_terminate() instead.
+ */
+ while (true)
+ asm volatile("hlt\n");
+ }
+}
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 826e73488308..cad08703c4ad 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -101,6 +101,8 @@ SYM_CODE_START(entry_SYSCALL_64)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
+
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
@@ -446,6 +448,84 @@ _ASM_NOKPROBE(\asmsym)
SYM_CODE_END(\asmsym)
.endm
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/**
+ * idtentry_vc - Macro to generate entry stub for #VC
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ *
+ * The macro emits code to set up the kernel context for #VC. The #VC handler
+ * runs on an IST stack and needs to be able to cause nested #VC exceptions.
+ *
+ * To make this work the #VC entry code tries its best to pretend it doesn't use
+ * an IST stack by switching to the task stack if coming from user-space (which
+ * includes early SYSCALL entry path) or back to the stack in the IRET frame if
+ * entered from kernel-mode.
+ *
+ * If entered from kernel-mode the return stack is validated first, and if it is
+ * not safe to use (e.g. because it points to the entry stack) the #VC handler
+ * will switch to a fall-back stack (VC2) and call a special handler function.
+ *
+ * The macro is only used for one vector, but it is planned to be extended in
+ * the future for the #HV exception.
+ */
+.macro idtentry_vc vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS
+ ASM_CLAC
+
+ /*
+ * If the entry is from userspace, switch stacks and treat it as
+ * a normal entry.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_switch_stack_\@
+
+ /*
+ * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
+ */
+ call paranoid_entry
+
+ UNWIND_HINT_REGS
+
+ /*
+ * Switch off the IST stack to make it free for nested exceptions. The
+ * vc_switch_off_ist() function will switch back to the interrupted
+ * stack if it is safe to do so. If not it switches to the VC fall-back
+ * stack.
+ */
+ movq %rsp, %rdi /* pt_regs pointer */
+ call vc_switch_off_ist
+ movq %rax, %rsp /* Switch to new stack */
+
+ UNWIND_HINT_REGS
+
+ /* Update pt_regs */
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+
+ movq %rsp, %rdi /* pt_regs pointer */
+
+ call \cfunc
+
+ /*
+ * No need to switch back to the IST stack. The current stack is either
+ * identical to the stack in the IRET frame or the VC fall-back stack,
+ * so it is definitely mapped even with PTI enabled.
+ */
+ jmp paranoid_exit
+
+ /* Switch to the regular task stack */
+.Lfrom_usermode_switch_stack_\@:
+ idtentry_body safe_stack_\cfunc, has_error_code=1
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
+#endif
+
/*
* Double fault entry. Straight paranoid. No checks from which context
* this comes because for the espfix induced #DF this would do the wrong
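
The stack policy documented in the idtentry_vc comment boils down to a three-way choice. The snippet below is a simplified illustration of that decision only, not the kernel's vc_switch_off_ist() implementation; all parameter names are placeholders:

	/* Sketch: which stack should the #VC handler continue on? */
	static unsigned long choose_vc_stack(bool from_user, bool ret_stack_safe,
					     unsigned long task_stack,
					     unsigned long ret_stack,
					     unsigned long vc2_stack)
	{
		if (from_user)
			return task_stack;	/* treat it like a normal entry */
		if (ret_stack_safe)
			return ret_stack;	/* continue on the interrupted stack */
		return vc2_stack;		/* fall back to the VC2 IST stack */
	}
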
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 8902fdb7de13..3d52b094850a 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -11,25 +11,29 @@
#ifdef CONFIG_X86_64
/* Macro to enforce the same ordering and stack sizes */
-#define ESTACKS_MEMBERS(guardsize) \
- char DF_stack_guard[guardsize]; \
- char DF_stack[EXCEPTION_STKSZ]; \
- char NMI_stack_guard[guardsize]; \
- char NMI_stack[EXCEPTION_STKSZ]; \
- char DB_stack_guard[guardsize]; \
- char DB_stack[EXCEPTION_STKSZ]; \
- char MCE_stack_guard[guardsize]; \
- char MCE_stack[EXCEPTION_STKSZ]; \
- char IST_top_guard[guardsize]; \
+#define ESTACKS_MEMBERS(guardsize, optional_stack_size) \
+ char DF_stack_guard[guardsize]; \
+ char DF_stack[EXCEPTION_STKSZ]; \
+ char NMI_stack_guard[guardsize]; \
+ char NMI_stack[EXCEPTION_STKSZ]; \
+ char DB_stack_guard[guardsize]; \
+ char DB_stack[EXCEPTION_STKSZ]; \
+ char MCE_stack_guard[guardsize]; \
+ char MCE_stack[EXCEPTION_STKSZ]; \
+ char VC_stack_guard[guardsize]; \
+ char VC_stack[optional_stack_size]; \
+ char VC2_stack_guard[guardsize]; \
+ char VC2_stack[optional_stack_size]; \
+ char IST_top_guard[guardsize]; \
/* The exception stacks' physical storage. No guard pages required */
struct exception_stacks {
- ESTACKS_MEMBERS(0)
+ ESTACKS_MEMBERS(0, 0)
};
/* The effective cpu entry area mapping with guard pages. */
struct cea_exception_stacks {
- ESTACKS_MEMBERS(PAGE_SIZE)
+ ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
};
/*
@@ -40,6 +44,8 @@ enum exception_stack_ordering {
ESTACK_NMI,
ESTACK_DB,
ESTACK_MCE,
+ ESTACK_VC,
+ ESTACK_VC2,
N_EXCEPTION_STACKS
};
@@ -139,4 +145,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu)
#define __this_cpu_ist_top_va(name) \
CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name)
+#define __this_cpu_ist_bottom_va(name) \
+ CEA_ESTACK_BOT(__this_cpu_read(cea_exception_stacks), name)
+
#endif
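
Together with the existing __this_cpu_ist_top_va(), the new bottom-of-stack helper makes range checks against an IST stack straightforward. A short sketch for the VC stack added above (the function name is illustrative):

	/* Sketch: is 'addr' on this CPU's #VC IST stack? */
	static bool on_vc_stack(unsigned long addr)
	{
		unsigned long top = __this_cpu_ist_top_va(VC);
		unsigned long bot = __this_cpu_ist_bottom_va(VC);

		return addr >= bot && addr < top;
	}
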
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 7b0afd5e6c57..dad350d42ecf 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -236,6 +236,7 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
+#define X86_FEATURE_SEV_ES ( 8*32+20) /* AMD Secure Encrypted Virtualization - Encrypted State */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 1ced11d31932..476082a83d1c 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -383,6 +383,33 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit)
void alloc_intr_gate(unsigned int n, const void *addr);
+static inline void init_idt_data(struct idt_data *data, unsigned int n,
+ const void *addr)
+{
+ BUG_ON(n > 0xFF);
+
+ memset(data, 0, sizeof(*data));
+ data->vector = n;
+ data->addr = addr;
+ data->segment = __KERNEL_CS;
+ data->bits.type = GATE_INTERRUPT;
+ data->bits.p = 1;
+}
+
+static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
+{
+ unsigned long addr = (unsigned long) d->addr;
+
+ gate->offset_low = (u16) addr;
+ gate->segment = (u16) d->segment;
+ gate->bits = d->bits;
+ gate->offset_middle = (u16) (addr >> 16);
+#ifdef CONFIG_X86_64
+ gate->offset_high = (u32) (addr >> 32);
+ gate->reserved = 0;
+#endif
+}
+
extern unsigned long system_vectors[];
extern void load_current_idt(void);
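
A minimal sketch of how the two new helpers combine to build a single gate descriptor; the vector number and handler are made up for illustration:

	/* Sketch: build a gate descriptor for an arbitrary vector. */
	static void example_build_gate(gate_desc *gate, const void *handler)
	{
		struct idt_data d;

		init_idt_data(&d, 0xf0, handler);	/* GATE_INTERRUPT, DPL 0, __KERNEL_CS */
		idt_init_desc(gate, &d);		/* pack into the hardware layout */
	}
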
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index a91f3b6e4f2a..f7e7099af595 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -74,6 +74,13 @@ struct idt_bits {
p : 1;
} __attribute__((packed));
+struct idt_data {
+ unsigned int vector;
+ unsigned int segment;
+ struct idt_bits bits;
+ const void *addr;
+};
+
struct gate_struct {
u16 offset_low;
u16 segment;
@@ -109,6 +116,9 @@ struct desc_ptr {
#endif /* !__ASSEMBLY__ */
+/* Boot IDT definitions */
+#define BOOT_IDT_ENTRIES 32
+
/* Access rights as returned by LAR */
#define AR_TYPE_RODATA (0 * (1 << 9))
#define AR_TYPE_RWDATA (1 * (1 << 9))
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index eb1ed3bd8d96..8d33ad80704f 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -19,6 +19,7 @@
#include <asm/user.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xstate.h>
+#include <asm/fpu/xcr.h>
#include <asm/cpufeature.h>
#include <asm/trace/fpu.h>
@@ -592,33 +593,4 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
update_pasid();
}
-/*
- * MXCSR and XCR definitions:
- */
-
-static inline void ldmxcsr(u32 mxcsr)
-{
- asm volatile("ldmxcsr %0" :: "m" (mxcsr));
-}
-
-extern unsigned int mxcsr_feature_mask;
-
-#define XCR_XFEATURE_ENABLED_MASK 0x00000000
-
-static inline u64 xgetbv(u32 index)
-{
- u32 eax, edx;
-
- asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index));
- return eax + ((u64)edx << 32);
-}
-
-static inline void xsetbv(u32 index, u64 value)
-{
- u32 eax = value;
- u32 edx = value >> 32;
-
- asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
-}
-
#endif /* _ASM_X86_FPU_INTERNAL_H */
diff --git a/arch/x86/include/asm/fpu/xcr.h b/arch/x86/include/asm/fpu/xcr.h
new file mode 100644
index 000000000000..1c7ab8d95da5
--- /dev/null
+++ b/arch/x86/include/asm/fpu/xcr.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_FPU_XCR_H
+#define _ASM_X86_FPU_XCR_H
+
+/*
+ * MXCSR and XCR definitions:
+ */
+
+static inline void ldmxcsr(u32 mxcsr)
+{
+ asm volatile("ldmxcsr %0" :: "m" (mxcsr));
+}
+
+extern unsigned int mxcsr_feature_mask;
+
+#define XCR_XFEATURE_ENABLED_MASK 0x00000000
+
+static inline u64 xgetbv(u32 index)
+{
+ u32 eax, edx;
+
+ asm volatile("xgetbv" : "=a" (eax), "=d" (edx) : "c" (index));
+ return eax + ((u64)edx << 32);
+}
+
+static inline void xsetbv(u32 index, u64 value)
+{
+ u32 eax = value;
+ u32 edx = value >> 32;
+
+ asm volatile("xsetbv" :: "a" (eax), "d" (edx), "c" (index));
+}
+
+#endif /* _ASM_X86_FPU_XCR_H */
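
A short usage sketch for the relocated helpers, assuming XSAVE/XGETBV are available; the helper name is made up and the feature bit is passed in by the caller:

	/* Sketch: set an extra xfeature bit in XCR0 via xgetbv()/xsetbv(). */
	static void example_enable_xfeature(u64 feature_bit)
	{
		u64 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);

		xsetbv(XCR_XFEATURE_ENABLED_MASK, xcr0 | feature_bit);
	}
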
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index cdd41d039cd1..b2442eb0ac2f 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -309,6 +309,19 @@ static __always_inline void __##func(struct pt_regs *regs)
__visible void noist_##func(struct pt_regs *regs)
/**
+ * DECLARE_IDTENTRY_VC - Declare functions for the VC entry point
+ * @vector: Vector number (ignored for C)
+ * @func: Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_RAW_ERRORCODE, but declares also the
+ * safe_stack C handler.
+ */
+#define DECLARE_IDTENTRY_VC(vector, func) \
+ DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func); \
+ __visible noinstr void ist_##func(struct pt_regs *regs, unsigned long error_code); \
+ __visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned long error_code)
+
+/**
* DEFINE_IDTENTRY_IST - Emit code for IST entry points
* @func: Function name of the entry point
*
@@ -347,6 +360,35 @@ static __always_inline void __##func(struct pt_regs *regs)
#define DEFINE_IDTENTRY_DF(func) \
DEFINE_IDTENTRY_RAW_ERRORCODE(func)
+/**
+ * DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler
+ *			       which runs on a safe stack.
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
+ */
+#define DEFINE_IDTENTRY_VC_SAFE_STACK(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func)
+
+/**
+ * DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler
+ *			 which runs on the VC fall-back stack
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
+ */
+#define DEFINE_IDTENTRY_VC_IST(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func)
+
+/**
+ * DEFINE_IDTENTRY_VC - Emit code for VMM communication handler
+ * @func: Function name of the entry point
+ *
+ * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
+ */
+#define DEFINE_IDTENTRY_VC(func) \
+ DEFINE_IDTENTRY_RAW_ERRORCODE(func)
+
#else /* CONFIG_X86_64 */
/**
@@ -433,6 +475,9 @@ __visible noinstr void func(struct pt_regs *regs, \
# define DECLARE_IDTENTRY_XENCB(vector, func) \
DECLARE_IDTENTRY(vector, func)
+# define DECLARE_IDTENTRY_VC(vector, func) \
+ idtentry_vc vector asm_##func func
+
#else
# define DECLARE_IDTENTRY_MCE(vector, func) \
DECLARE_IDTENTRY(vector, func)
@@ -564,6 +609,11 @@ DECLARE_IDTENTRY_RAW(X86_TRAP_DB, xenpv_exc_debug);
/* #DF */
DECLARE_IDTENTRY_DF(X86_TRAP_DF, exc_double_fault);
+/* #VC */
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+DECLARE_IDTENTRY_VC(X86_TRAP_VC, exc_vmm_communication);
+#endif
+
#ifdef CONFIG_XEN_PV
DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
#endif
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 2b6ccf2c49f1..a0f839aa144d 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -15,9 +15,15 @@
#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))
+bool insn_has_rep_prefix(struct insn *insn);
void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
+int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs);
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
int insn_get_code_seg_params(struct pt_regs *regs);
+int insn_fetch_from_user(struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE]);
+bool insn_decode(struct insn *insn, struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE], int buf_size);
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 5049f6c22683..c9f5df0a1c10 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -19,6 +19,7 @@
#ifdef CONFIG_AMD_MEM_ENCRYPT
extern u64 sme_me_mask;
+extern u64 sev_status;
extern bool sev_enabled;
void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr,
@@ -48,8 +49,10 @@ void __init mem_encrypt_free_decrypted_mem(void);
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void);
+void __init sev_es_init_vc_handling(void);
bool sme_active(void);
bool sev_active(void);
+bool sev_es_active(void);
#define __bss_decrypted __attribute__((__section__(".bss..decrypted")))
@@ -70,8 +73,10 @@ static inline void __init sme_early_init(void) { }
static inline void __init sme_encrypt_kernel(struct boot_params *bp) { }
static inline void __init sme_enable(struct boot_params *bp) { }
+static inline void sev_es_init_vc_handling(void) { }
static inline bool sme_active(void) { return false; }
static inline bool sev_active(void) { return false; }
+static inline bool sev_es_active(void) { return false; }
static inline int __init
early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c07a70ce7ffd..972a34d93505 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -470,9 +470,12 @@
#define MSR_AMD64_ICIBSEXTDCTL 0xc001103c
#define MSR_AMD64_IBSOPDATA4 0xc001103d
#define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_SEV_ES_GHCB 0xc0010130
#define MSR_AMD64_SEV 0xc0010131
#define MSR_AMD64_SEV_ENABLED_BIT 0
+#define MSR_AMD64_SEV_ES_ENABLED_BIT 1
#define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
+#define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT)
#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f
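
A sketch of how the new SEV-ES bit can be tested by reading the SEV status MSR (assuming <asm/msr.h>); the helper name is made up, and the kernel itself caches this value in sev_status rather than re-reading the MSR:

	/* Sketch: query the SEV status MSR for the SEV-ES enabled bit. */
	static bool example_sev_es_enabled(void)
	{
		u64 status;

		/* Only valid when CPUID has already reported SEV support */
		rdmsrl(MSR_AMD64_SEV, status);
		return status & MSR_AMD64_SEV_ES_ENABLED;
	}
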
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 288b065955b7..d0c6c10c18a0 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -28,6 +28,7 @@
#define IST_INDEX_NMI 1
#define IST_INDEX_DB 2
#define IST_INDEX_MCE 3
+#define IST_INDEX_VC 4
/*
* Set __PAGE_OFFSET to the most negative possible address +
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5e0dcc20614d..a02c67291cfc 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,7 +28,7 @@
#include <asm-generic/pgtable_uffd.h>
extern pgd_t early_top_pgt[PTRS_PER_PGD];
-int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
+bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d8a82e650810..5ac507586769 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -696,6 +696,7 @@ extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
+extern void cpu_init_exception_handling(void);
extern void cr4_init(void);
static inline unsigned long get_debugctlmsr(void)
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 28996fe19301..2c35f1c01a2d 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -10,6 +10,7 @@ void syscall_init(void);
#ifdef CONFIG_X86_64
void entry_SYSCALL_64(void);
+void entry_SYSCALL_64_safe_stack(void);
long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2);
#endif
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index b35030eeec36..5db5d083c873 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -21,6 +21,9 @@ struct real_mode_header {
/* SMP trampoline */
u32 trampoline_start;
u32 trampoline_header;
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ u32 sev_es_trampoline_start;
+#endif
#ifdef CONFIG_X86_64
u32 trampoline_pgd;
#endif
@@ -57,6 +60,9 @@ extern unsigned char real_mode_blob_end[];
extern unsigned long initial_code;
extern unsigned long initial_gs;
extern unsigned long initial_stack;
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+extern unsigned long initial_vc_handler;
+#endif
extern unsigned char real_mode_blob[];
extern unsigned char real_mode_relocs[];
@@ -66,6 +72,7 @@ extern unsigned char startup_32_smp[];
extern unsigned char boot_gdt[];
#else
extern unsigned char secondary_startup_64[];
+extern unsigned char secondary_startup_64_no_verify[];
#endif
static inline size_t real_mode_size_needed(void)
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 517920928989..7fdd4facfce7 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -226,7 +226,7 @@
#define NUM_EXCEPTION_VECTORS 32
/* Bitmask of exception vectors which push an error code on the stack: */
-#define EXCEPTION_ERRCODE_MASK 0x00027d00
+#define EXCEPTION_ERRCODE_MASK 0x20027d00
#define GDT_SIZE (GDT_ENTRIES*8)
#define GDT_ENTRY_TLS_ENTRIES 3
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 84b645cc8bc9..7d7a064af6ff 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -39,6 +39,8 @@ void vsmp_init(void);
static inline void vsmp_init(void) { }
#endif
+struct pt_regs;
+
void setup_bios_corruption_check(void);
void early_platform_quirks(void);
@@ -48,7 +50,9 @@ extern void reserve_standard_io_resources(void);
extern void i386_reserve_resources(void);
extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
extern unsigned long __startup_secondary_64(void);
-extern int early_make_pgtable(unsigned long address);
+extern void startup_64_setup_env(unsigned long physbase);
+extern void early_setup_idt(void);
+extern void __init do_early_exception(struct pt_regs *regs, int trapnr);
#ifdef CONFIG_X86_INTEL_MID
extern void x86_intel_mid_early_setup(void);
diff --git a/arch/x86/include/asm/sev-es.h b/arch/x86/include/asm/sev-es.h
new file mode 100644
index 000000000000..cf1d957c7091
--- /dev/null
+++ b/arch/x86/include/asm/sev-es.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#ifndef __ASM_ENCRYPTED_STATE_H
+#define __ASM_ENCRYPTED_STATE_H
+
+#include <linux/types.h>
+#include <asm/insn.h>
+
+#define GHCB_SEV_INFO 0x001UL
+#define GHCB_SEV_INFO_REQ 0x002UL
+#define GHCB_INFO(v) ((v) & 0xfffUL)
+#define GHCB_PROTO_MAX(v) (((v) >> 48) & 0xffffUL)
+#define GHCB_PROTO_MIN(v) (((v) >> 32) & 0xffffUL)
+#define GHCB_PROTO_OUR 0x0001UL
+#define GHCB_SEV_CPUID_REQ 0x004UL
+#define GHCB_CPUID_REQ_EAX 0
+#define GHCB_CPUID_REQ_EBX 1
+#define GHCB_CPUID_REQ_ECX 2
+#define GHCB_CPUID_REQ_EDX 3
+#define GHCB_CPUID_REQ(fn, reg) (GHCB_SEV_CPUID_REQ | \
+ (((unsigned long)reg & 3) << 30) | \
+ (((unsigned long)fn) << 32))
+
+#define GHCB_PROTOCOL_MAX 0x0001UL
+#define GHCB_DEFAULT_USAGE 0x0000UL
+
+#define GHCB_SEV_CPUID_RESP 0x005UL
+#define GHCB_SEV_TERMINATE 0x100UL
+#define GHCB_SEV_TERMINATE_REASON(reason_set, reason_val) \
+ (((((u64)reason_set) & 0x7) << 12) | \
+ ((((u64)reason_val) & 0xff) << 16))
+#define GHCB_SEV_ES_REASON_GENERAL_REQUEST 0
+#define GHCB_SEV_ES_REASON_PROTOCOL_UNSUPPORTED 1
+
+#define GHCB_SEV_GHCB_RESP_CODE(v) ((v) & 0xfff)
+#define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); }
+
+enum es_result {
+ ES_OK, /* All good */
+ ES_UNSUPPORTED, /* Requested operation not supported */
+ ES_VMM_ERROR, /* Unexpected state from the VMM */
+ ES_DECODE_FAILED, /* Instruction decoding failed */
+ ES_EXCEPTION, /* Instruction caused exception */
+ ES_RETRY, /* Retry instruction emulation */
+};
+
+struct es_fault_info {
+ unsigned long vector;
+ unsigned long error_code;
+ unsigned long cr2;
+};
+
+struct pt_regs;
+
+/* ES instruction emulation context */
+struct es_em_ctxt {
+ struct pt_regs *regs;
+ struct insn insn;
+ struct es_fault_info fi;
+};
+
+void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code);
+
+static inline u64 lower_bits(u64 val, unsigned int bits)
+{
+ u64 mask = (1ULL << bits) - 1;
+
+ return (val & mask);
+}
+
+struct real_mode_header;
+enum stack_type;
+
+/* Early IDT entry points for #VC handler */
+extern void vc_no_ghcb(void);
+extern void vc_boot_ghcb(void);
+extern bool handle_vc_boot_ghcb(struct pt_regs *regs);
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+extern struct static_key_false sev_es_enable_key;
+extern void __sev_es_ist_enter(struct pt_regs *regs);
+extern void __sev_es_ist_exit(void);
+static __always_inline void sev_es_ist_enter(struct pt_regs *regs)
+{
+ if (static_branch_unlikely(&sev_es_enable_key))
+ __sev_es_ist_enter(regs);
+}
+static __always_inline void sev_es_ist_exit(void)
+{
+ if (static_branch_unlikely(&sev_es_enable_key))
+ __sev_es_ist_exit();
+}
+extern int sev_es_setup_ap_jump_table(struct real_mode_header *rmh);
+extern void __sev_es_nmi_complete(void);
+static __always_inline void sev_es_nmi_complete(void)
+{
+ if (static_branch_unlikely(&sev_es_enable_key))
+ __sev_es_nmi_complete();
+}
+extern int __init sev_es_efi_map_ghcbs(pgd_t *pgd);
+#else
+static inline void sev_es_ist_enter(struct pt_regs *regs) { }
+static inline void sev_es_ist_exit(void) { }
+static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; }
+static inline void sev_es_nmi_complete(void) { }
+static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; }
+#endif
+
+#endif
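
The macros above implement the MSR-based GHCB protocol: a 12-bit request/response code in the low bits and the payload in the upper bits. As a minimal sketch (annotation only, not part of the patch; it reuses the sev_es_wr_ghcb_msr()/sev_es_rd_ghcb_msr() helpers and sev_es_terminate() added later in this diff), a single-register CPUID query over that protocol could look like:

static u32 ghcb_msr_cpuid_eax(u32 fn)
{
	u64 val;

	/* Ask the hypervisor for EAX of CPUID leaf 'fn' */
	sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
	VMGEXIT();
	val = sev_es_rd_ghcb_msr();

	/* Low 12 bits: response code; bits 63:32: the requested register */
	if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
		sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);

	return val >> 32;
}
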
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 5ae5a68e469d..49600643faba 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -35,6 +35,8 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info);
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask);
+bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info);
const char *stack_type_name(enum stack_type type);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 8a1f5382a4ea..cf13f9e78585 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -150,14 +150,14 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
-struct __attribute__ ((__packed__)) vmcb_seg {
+struct vmcb_seg {
u16 selector;
u16 attrib;
u32 limit;
u64 base;
-};
+} __packed;
-struct __attribute__ ((__packed__)) vmcb_save_area {
+struct vmcb_save_area {
struct vmcb_seg es;
struct vmcb_seg cs;
struct vmcb_seg ss;
@@ -200,20 +200,67 @@ struct __attribute__ ((__packed__)) vmcb_save_area {
u64 br_to;
u64 last_excp_from;
u64 last_excp_to;
-};
+ /*
+ * The following part of the save area is valid only for
+ * SEV-ES guests when referenced through the GHCB.
+ */
+ u8 reserved_7[104];
+ u64 reserved_8; /* rax already available at 0x01f8 */
+ u64 rcx;
+ u64 rdx;
+ u64 rbx;
+ u64 reserved_9; /* rsp already available at 0x01d8 */
+ u64 rbp;
+ u64 rsi;
+ u64 rdi;
+ u64 r8;
+ u64 r9;
+ u64 r10;
+ u64 r11;
+ u64 r12;
+ u64 r13;
+ u64 r14;
+ u64 r15;
+ u8 reserved_10[16];
+ u64 sw_exit_code;
+ u64 sw_exit_info_1;
+ u64 sw_exit_info_2;
+ u64 sw_scratch;
+ u8 reserved_11[56];
+ u64 xcr0;
+ u8 valid_bitmap[16];
+ u64 x87_state_gpa;
+} __packed;
+
+struct ghcb {
+ struct vmcb_save_area save;
+ u8 reserved_save[2048 - sizeof(struct vmcb_save_area)];
+
+ u8 shared_buffer[2032];
+
+ u8 reserved_1[10];
+ u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */
+ u32 ghcb_usage;
+} __packed;
+
+
+#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032
+#define EXPECTED_VMCB_CONTROL_AREA_SIZE 256
+#define EXPECTED_GHCB_SIZE PAGE_SIZE
static inline void __unused_size_checks(void)
{
- BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298);
- BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256);
+ BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE);
+ BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE);
}
-struct __attribute__ ((__packed__)) vmcb {
+struct vmcb {
struct vmcb_control_area control;
u8 reserved_control[1024 - sizeof(struct vmcb_control_area)];
struct vmcb_save_area save;
-};
+} __packed;
#define SVM_CPUID_FUNC 0x8000000a
@@ -298,4 +345,47 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
+/* GHCB Accessor functions */
+
+#define GHCB_BITMAP_IDX(field) \
+ (offsetof(struct vmcb_save_area, field) / sizeof(u64))
+
+#define DEFINE_GHCB_ACCESSORS(field) \
+ static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \
+ { \
+ return test_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&ghcb->save.valid_bitmap); \
+ } \
+ \
+ static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \
+ { \
+ __set_bit(GHCB_BITMAP_IDX(field), \
+ (unsigned long *)&ghcb->save.valid_bitmap); \
+ ghcb->save.field = value; \
+ }
+
+DEFINE_GHCB_ACCESSORS(cpl)
+DEFINE_GHCB_ACCESSORS(rip)
+DEFINE_GHCB_ACCESSORS(rsp)
+DEFINE_GHCB_ACCESSORS(rax)
+DEFINE_GHCB_ACCESSORS(rcx)
+DEFINE_GHCB_ACCESSORS(rdx)
+DEFINE_GHCB_ACCESSORS(rbx)
+DEFINE_GHCB_ACCESSORS(rbp)
+DEFINE_GHCB_ACCESSORS(rsi)
+DEFINE_GHCB_ACCESSORS(rdi)
+DEFINE_GHCB_ACCESSORS(r8)
+DEFINE_GHCB_ACCESSORS(r9)
+DEFINE_GHCB_ACCESSORS(r10)
+DEFINE_GHCB_ACCESSORS(r11)
+DEFINE_GHCB_ACCESSORS(r12)
+DEFINE_GHCB_ACCESSORS(r13)
+DEFINE_GHCB_ACCESSORS(r14)
+DEFINE_GHCB_ACCESSORS(r15)
+DEFINE_GHCB_ACCESSORS(sw_exit_code)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_1)
+DEFINE_GHCB_ACCESSORS(sw_exit_info_2)
+DEFINE_GHCB_ACCESSORS(sw_scratch)
+DEFINE_GHCB_ACCESSORS(xcr0)
+
#endif
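
For reference, DEFINE_GHCB_ACCESSORS(rax) above expands to roughly the pair of helpers below; the valid_bitmap records which save-area fields were actually written, so the other side of the GHCB only consumes state that was explicitly set:

static inline bool ghcb_rax_is_valid(const struct ghcb *ghcb)
{
	return test_bit(GHCB_BITMAP_IDX(rax),
			(unsigned long *)&ghcb->save.valid_bitmap);
}

static inline void ghcb_set_rax(struct ghcb *ghcb, u64 value)
{
	__set_bit(GHCB_BITMAP_IDX(rax),
		  (unsigned long *)&ghcb->save.valid_bitmap);
	ghcb->save.rax = value;
}
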
diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
new file mode 100644
index 000000000000..305bc1214aef
--- /dev/null
+++ b/arch/x86/include/asm/trap_pf.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_TRAP_PF_H
+#define _ASM_X86_TRAP_PF_H
+
+/*
+ * Page fault error code bits:
+ *
+ * bit 0 == 0: no page found 1: protection fault
+ * bit 1 == 0: read access 1: write access
+ * bit 2 == 0: kernel-mode access 1: user-mode access
+ * bit 3 == 1: use of reserved bit detected
+ * bit 4 == 1: fault was an instruction fetch
+ * bit 5 == 1: protection keys block access
+ */
+enum x86_pf_error_code {
+ X86_PF_PROT = 1 << 0,
+ X86_PF_WRITE = 1 << 1,
+ X86_PF_USER = 1 << 2,
+ X86_PF_RSVD = 1 << 3,
+ X86_PF_INSTR = 1 << 4,
+ X86_PF_PK = 1 << 5,
+};
+
+#endif /* _ASM_X86_TRAP_PF_H */
diff --git a/arch/x86/include/asm/trapnr.h b/arch/x86/include/asm/trapnr.h
index 082f45631fa9..f5d2325aa0b7 100644
--- a/arch/x86/include/asm/trapnr.h
+++ b/arch/x86/include/asm/trapnr.h
@@ -26,6 +26,7 @@
#define X86_TRAP_XF 19 /* SIMD Floating-Point Exception */
#define X86_TRAP_VE 20 /* Virtualization Exception */
#define X86_TRAP_CP 21 /* Control Protection Exception */
+#define X86_TRAP_VC 29 /* VMM Communication Exception */
#define X86_TRAP_IRET 32 /* IRET Exception */
#endif
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index df0b7bfc1234..7f7200021bd1 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -8,12 +8,14 @@
#include <asm/debugreg.h>
#include <asm/idtentry.h>
#include <asm/siginfo.h> /* TRAP_TRACE, ... */
+#include <asm/trap_pf.h>
#ifdef CONFIG_X86_64
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
asmlinkage __visible notrace
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s);
void __init trap_init(void);
+asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs);
#endif
#ifdef CONFIG_X86_F00F_BUG
@@ -43,22 +45,4 @@ void __noreturn handle_stack_overflow(const char *message,
unsigned long fault_address);
#endif
-/*
- * Page fault error code bits:
- *
- * bit 0 == 0: no page found 1: protection fault
- * bit 1 == 0: read access 1: write access
- * bit 2 == 0: kernel-mode access 1: user-mode access
- * bit 3 == 1: use of reserved bit detected
- * bit 4 == 1: fault was an instruction fetch
- * bit 5 == 1: protection keys block access
- */
-enum x86_pf_error_code {
- X86_PF_PROT = 1 << 0,
- X86_PF_WRITE = 1 << 1,
- X86_PF_USER = 1 << 2,
- X86_PF_RSVD = 1 << 3,
- X86_PF_INSTR = 1 << 4,
- X86_PF_PK = 1 << 5,
-};
#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 397196fae24d..dde5b3f1e7cd 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -4,8 +4,10 @@
#include <asm/bootparam.h>
+struct ghcb;
struct mpc_bus;
struct mpc_cpu;
+struct pt_regs;
struct mpc_table;
struct cpuinfo_x86;
struct irq_domain;
@@ -229,10 +231,22 @@ struct x86_legacy_features {
/**
* struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks
*
- * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely)
+ * @pin_vcpu: pin current vcpu to specified physical
+ * cpu (run rarely)
+ * @sev_es_hcall_prepare: Load additional hypervisor-specific
+ * state into the GHCB when doing a VMMCALL under
+ * SEV-ES. Called from the #VC exception handler.
+ * @sev_es_hcall_finish: Copies state from the GHCB back into the
+ * processor (or pt_regs). Also runs checks on the
+ * state returned from the hypervisor after a
+ * VMMCALL under SEV-ES. Needs to return 'false'
+ * if the checks fail. Called from the #VC
+ * exception handler.
*/
struct x86_hyper_runtime {
void (*pin_vcpu)(int cpu);
+ void (*sev_es_hcall_prepare)(struct ghcb *ghcb, struct pt_regs *regs);
+ bool (*sev_es_hcall_finish)(struct ghcb *ghcb, struct pt_regs *regs);
};
/**
diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h
index 2e8a30f06c74..a7a3403645e5 100644
--- a/arch/x86/include/uapi/asm/svm.h
+++ b/arch/x86/include/uapi/asm/svm.h
@@ -29,6 +29,7 @@
#define SVM_EXIT_WRITE_DR6 0x036
#define SVM_EXIT_WRITE_DR7 0x037
#define SVM_EXIT_EXCP_BASE 0x040
+#define SVM_EXIT_LAST_EXCP 0x05f
#define SVM_EXIT_INTR 0x060
#define SVM_EXIT_NMI 0x061
#define SVM_EXIT_SMI 0x062
@@ -80,6 +81,16 @@
#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401
#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402
+/* SEV-ES software-defined VMGEXIT events */
+#define SVM_VMGEXIT_MMIO_READ 0x80000001
+#define SVM_VMGEXIT_MMIO_WRITE 0x80000002
+#define SVM_VMGEXIT_NMI_COMPLETE 0x80000003
+#define SVM_VMGEXIT_AP_HLT_LOOP 0x80000004
+#define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005
+#define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0
+#define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1
+#define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff
+
#define SVM_EXIT_ERR -1
#define SVM_EXIT_REASONS \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index de09af019e23..04ceea8f4a89 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -20,6 +20,7 @@ CFLAGS_REMOVE_kvmclock.o = -pg
CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
CFLAGS_REMOVE_head64.o = -pg
+CFLAGS_REMOVE_sev-es.o = -pg
endif
KASAN_SANITIZE_head$(BITS).o := n
@@ -27,6 +28,7 @@ KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
KASAN_SANITIZE_stacktrace.o := n
KASAN_SANITIZE_paravirt.o := n
+KASAN_SANITIZE_sev-es.o := n
# With some compiler versions the generated code results in boot hangs, caused
# by several compilation units. To be safe, disable all instrumentation.
@@ -146,6 +148,7 @@ obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
+obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev-es.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dcc3d943c68f..6062ce586b95 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -614,7 +614,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
* If BIOS has not enabled SME then don't advertise the
* SME feature (set in scattered.c).
* For SEV: If BIOS has not enabled SEV then don't advertise the
- * SEV feature (set in scattered.c).
+ * SEV and SEV_ES features (set in scattered.c).
*
* In all cases, since support for SME and SEV requires long mode,
* don't advertise the feature under CONFIG_X86_32.
@@ -645,6 +645,7 @@ clear_all:
setup_clear_cpu_cap(X86_FEATURE_SME);
clear_sev:
setup_clear_cpu_cap(X86_FEATURE_SEV);
+ setup_clear_cpu_cap(X86_FEATURE_SEV_ES);
}
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c51158914ea2..35ad8480c464 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1876,6 +1876,8 @@ static inline void tss_setup_ist(struct tss_struct *tss)
tss->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
tss->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
tss->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
+ /* Only mapped when SEV-ES is active */
+ tss->x86_tss.ist[IST_INDEX_VC] = __this_cpu_ist_top_va(VC);
}
#else /* CONFIG_X86_64 */
@@ -1908,6 +1910,29 @@ static inline void tss_setup_io_bitmap(struct tss_struct *tss)
}
/*
+ * Setup everything needed to handle exceptions from the IDT, including the IST
+ * exceptions which use paranoid_entry().
+ */
+void cpu_init_exception_handling(void)
+{
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+ int cpu = raw_smp_processor_id();
+
+ /* paranoid_entry() gets the CPU number from the GDT */
+ setup_getcpu(cpu);
+
+ /* IST vectors need TSS to be set up. */
+ tss_setup_ist(tss);
+ tss_setup_io_bitmap(tss);
+ set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
+
+ load_TR_desc();
+
+ /* Finally load the IDT */
+ load_current_idt();
+}
+
+/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 2eb0a8c44b35..866c9a9bcdee 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -42,6 +42,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
{ X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
+ { X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
{ X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};
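
The new table entry places X86_FEATURE_SEV_ES in bit 3 of CPUID leaf 0x8000001f, EAX. A hedged user-space sketch of the same check (illustration only; cpu_supports_sev_es() is a hypothetical helper, not kernel code):

#include <stdbool.h>
#include <cpuid.h>

static bool cpu_supports_sev_es(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx))
		return false;

	/* SEV-ES: CPUID 0x8000001f, EAX, bit 3 (matches the table entry above) */
	return eax & (1u << 3);
}
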
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 9b6fafa69be9..924571fe5864 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -33,6 +33,7 @@
#include <asm/timer.h>
#include <asm/apic.h>
#include <asm/vmware.h>
+#include <asm/svm.h>
#undef pr_fmt
#define pr_fmt(fmt) "vmware: " fmt
@@ -476,10 +477,49 @@ static bool __init vmware_legacy_x2apic_available(void)
(eax & (1 << VMWARE_CMD_LEGACY_X2APIC)) != 0;
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+static void vmware_sev_es_hcall_prepare(struct ghcb *ghcb,
+ struct pt_regs *regs)
+{
+ /* Copy VMware-specific hypercall parameters to the GHCB */
+ ghcb_set_rip(ghcb, regs->ip);
+ ghcb_set_rbx(ghcb, regs->bx);
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_rsi(ghcb, regs->si);
+ ghcb_set_rdi(ghcb, regs->di);
+ ghcb_set_rbp(ghcb, regs->bp);
+}
+
+static bool vmware_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ if (!(ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb) &&
+ ghcb_rsi_is_valid(ghcb) &&
+ ghcb_rdi_is_valid(ghcb) &&
+ ghcb_rbp_is_valid(ghcb)))
+ return false;
+
+ regs->bx = ghcb->save.rbx;
+ regs->cx = ghcb->save.rcx;
+ regs->dx = ghcb->save.rdx;
+ regs->si = ghcb->save.rsi;
+ regs->di = ghcb->save.rdi;
+ regs->bp = ghcb->save.rbp;
+
+ return true;
+}
+#endif
+
const __initconst struct hypervisor_x86 x86_hyper_vmware = {
- .name = "VMware",
- .detect = vmware_platform,
- .type = X86_HYPER_VMWARE,
- .init.init_platform = vmware_platform_setup,
- .init.x2apic_available = vmware_legacy_x2apic_available,
+ .name = "VMware",
+ .detect = vmware_platform,
+ .type = X86_HYPER_VMWARE,
+ .init.init_platform = vmware_platform_setup,
+ .init.x2apic_available = vmware_legacy_x2apic_available,
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .runtime.sev_es_hcall_prepare = vmware_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = vmware_sev_es_hcall_finish,
+#endif
};
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ea8d51ec251b..25c06b67e7e0 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -29,8 +29,8 @@ static int die_counter;
static struct pt_regs exec_summary_regs;
-bool in_task_stack(unsigned long *stack, struct task_struct *task,
- struct stack_info *info)
+bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
{
unsigned long *begin = task_stack_page(task);
unsigned long *end = task_stack_page(task) + THREAD_SIZE;
@@ -46,7 +46,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
return true;
}
-bool in_entry_stack(unsigned long *stack, struct stack_info *info)
+/* Called from get_stack_info_noinstr - so must be noinstr too */
+bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info)
{
struct entry_stack *ss = cpu_entry_stack(smp_processor_id());
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 4a94d38cd141..1dd851397bd9 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -24,11 +24,13 @@ static const char * const exception_stack_names[] = {
[ ESTACK_NMI ] = "NMI",
[ ESTACK_DB ] = "#DB",
[ ESTACK_MCE ] = "#MC",
+ [ ESTACK_VC ] = "#VC",
+ [ ESTACK_VC2 ] = "#VC2",
};
const char *stack_type_name(enum stack_type type)
{
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
if (type == STACK_TYPE_IRQ)
return "IRQ";
@@ -79,16 +81,18 @@ struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
EPAGERANGE(NMI),
EPAGERANGE(DB),
EPAGERANGE(MCE),
+ EPAGERANGE(VC),
+ EPAGERANGE(VC2),
};
-static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
+static __always_inline bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{
unsigned long begin, end, stk = (unsigned long)stack;
const struct estack_pages *ep;
struct pt_regs *regs;
unsigned int k;
- BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
+ BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
/*
@@ -122,7 +126,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
return true;
}
-static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
+static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
@@ -147,32 +151,38 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
return true;
}
-int get_stack_info(unsigned long *stack, struct task_struct *task,
- struct stack_info *info, unsigned long *visit_mask)
+bool noinstr get_stack_info_noinstr(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info)
{
- if (!stack)
- goto unknown;
-
- task = task ? : current;
-
if (in_task_stack(stack, task, info))
- goto recursion_check;
+ return true;
if (task != current)
- goto unknown;
+ return false;
if (in_exception_stack(stack, info))
- goto recursion_check;
+ return true;
if (in_irq_stack(stack, info))
- goto recursion_check;
+ return true;
if (in_entry_stack(stack, info))
- goto recursion_check;
+ return true;
+
+ return false;
+}
+
+int get_stack_info(unsigned long *stack, struct task_struct *task,
+ struct stack_info *info, unsigned long *visit_mask)
+{
+ task = task ? : current;
- goto unknown;
+ if (!stack)
+ goto unknown;
+
+ if (!get_stack_info_noinstr(stack, task, info))
+ goto unknown;
-recursion_check:
/*
* Make sure we don't iterate through any given stack more than once.
* If it comes up a second time then there's something wrong going on:
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index cbb71c1b574f..4199f25c0063 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -36,6 +36,11 @@
#include <asm/microcode.h>
#include <asm/kasan.h>
#include <asm/fixmap.h>
+#include <asm/realmode.h>
+#include <asm/desc.h>
+#include <asm/extable.h>
+#include <asm/trapnr.h>
+#include <asm/sev-es.h>
/*
* Manage page tables very early on.
@@ -61,6 +66,24 @@ unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4;
EXPORT_SYMBOL(vmemmap_base);
#endif
+/*
+ * GDT used on the boot CPU before switching to virtual addresses.
+ */
+static struct desc_struct startup_gdt[GDT_ENTRIES] = {
+ [GDT_ENTRY_KERNEL32_CS] = GDT_ENTRY_INIT(0xc09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_CS] = GDT_ENTRY_INIT(0xa09b, 0, 0xfffff),
+ [GDT_ENTRY_KERNEL_DS] = GDT_ENTRY_INIT(0xc093, 0, 0xfffff),
+};
+
+/*
+ * Address needs to be set at runtime because it references the startup_gdt
+ * while the kernel still uses a direct mapping.
+ */
+static struct desc_ptr startup_gdt_descr = {
+ .size = sizeof(startup_gdt),
+ .address = 0,
+};
+
#define __head __section(.head.text)
static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
@@ -297,7 +320,7 @@ static void __init reset_early_page_tables(void)
}
/* Create a new PMD entry */
-int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
+bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pgdval_t pgd, *pgd_p;
@@ -307,7 +330,7 @@ int __init __early_make_pgtable(unsigned long address, pmdval_t pmd)
/* Invalid address or early pgt is done ? */
if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
- return -1;
+ return false;
again:
pgd_p = &early_top_pgt[pgd_index(address)].pgd;
@@ -364,10 +387,10 @@ again:
}
pmd_p[pmd_index(address)] = pmd;
- return 0;
+ return true;
}
-int __init early_make_pgtable(unsigned long address)
+static bool __init early_make_pgtable(unsigned long address)
{
unsigned long physaddr = address - __PAGE_OFFSET;
pmdval_t pmd;
@@ -377,6 +400,19 @@ int __init early_make_pgtable(unsigned long address)
return __early_make_pgtable(address, pmd);
}
+void __init do_early_exception(struct pt_regs *regs, int trapnr)
+{
+ if (trapnr == X86_TRAP_PF &&
+ early_make_pgtable(native_read_cr2()))
+ return;
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT) &&
+ trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs))
+ return;
+
+ early_fixup_exception(regs, trapnr);
+}
+
/* Don't add a printk in there. printk relies on the PDA which is not initialized
yet. */
static void __init clear_bss(void)
@@ -489,3 +525,81 @@ void __init x86_64_start_reservations(char *real_mode_data)
start_kernel();
}
+
+/*
+ * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is
+ * used until the idt_table takes over. On the boot CPU this happens in
+ * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases
+ * this happens in the functions called from head_64.S.
+ *
+ * The idt_table can't be used that early because all the code modifying it is
+ * in idt.c and can be instrumented by tracing or KASAN, which both don't work
+ * during early CPU bringup. Also the idt_table has the runtime vectors
+ * configured which require certain CPU state to be setup already (like TSS),
+ * which also hasn't happened yet in early CPU bringup.
+ */
+static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data;
+
+static struct desc_ptr bringup_idt_descr = {
+ .size = (NUM_EXCEPTION_VECTORS * sizeof(gate_desc)) - 1,
+ .address = 0, /* Set at runtime */
+};
+
+static void set_bringup_idt_handler(gate_desc *idt, int n, void *handler)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ struct idt_data data;
+ gate_desc desc;
+
+ init_idt_data(&data, n, handler);
+ idt_init_desc(&desc, &data);
+ native_write_idt_entry(idt, n, &desc);
+#endif
+}
+
+/* This runs while still in the direct mapping */
+static void startup_64_load_idt(unsigned long physbase)
+{
+ struct desc_ptr *desc = fixup_pointer(&bringup_idt_descr, physbase);
+ gate_desc *idt = fixup_pointer(bringup_idt_table, physbase);
+
+
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) {
+ void *handler;
+
+ /* VMM Communication Exception */
+ handler = fixup_pointer(vc_no_ghcb, physbase);
+ set_bringup_idt_handler(idt, X86_TRAP_VC, handler);
+ }
+
+ desc->address = (unsigned long)idt;
+ native_load_idt(desc);
+}
+
+/* This is used when running on kernel addresses */
+void early_setup_idt(void)
+{
+ /* VMM Communication Exception */
+ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT))
+ set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb);
+
+ bringup_idt_descr.address = (unsigned long)bringup_idt_table;
+ native_load_idt(&bringup_idt_descr);
+}
+
+/*
+ * Setup boot CPU state needed before kernel switches to virtual addresses.
+ */
+void __head startup_64_setup_env(unsigned long physbase)
+{
+ /* Load GDT */
+ startup_gdt_descr.address = (unsigned long)fixup_pointer(startup_gdt, physbase);
+ native_load_gdt(&startup_gdt_descr);
+
+ /* New GDT is live - reload data segment registers */
+ asm volatile("movl %%eax, %%ds\n"
+ "movl %%eax, %%ss\n"
+ "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory");
+
+ startup_64_load_idt(physbase);
+}
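
startup_64_load_idt() and startup_64_setup_env() run before the switch to the kernel's virtual mapping, so every link-time address they touch has to be rebased with fixup_pointer() first. The body of fixup_pointer() is outside this hunk; conceptually it does roughly the following (sketch, not necessarily the literal implementation; _text is the linker-provided start-of-kernel symbol):

static void *fixup_pointer_sketch(void *ptr, unsigned long physaddr)
{
	/* Rebase a link-time address onto the physical load address */
	return ptr - (void *)_text + (void *)physaddr;
}
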
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 16da4ac01597..7eb2a1c87969 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -73,6 +73,20 @@ SYM_CODE_START_NOALIGN(startup_64)
/* Set up the stack for verify_cpu(), similar to initial_stack below */
leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
+ leaq _text(%rip), %rdi
+ pushq %rsi
+ call startup_64_setup_env
+ popq %rsi
+
+ /* Now switch to __KERNEL_CS so IRET works reliably */
+ pushq $__KERNEL_CS
+ leaq .Lon_kernel_cs(%rip), %rax
+ pushq %rax
+ lretq
+
+.Lon_kernel_cs:
+ UNWIND_HINT_EMPTY
+
/* Sanitize CPU configuration */
call verify_cpu
@@ -112,6 +126,18 @@ SYM_CODE_START(secondary_startup_64)
call verify_cpu
/*
+ * The secondary_startup_64_no_verify entry point is only used by
+ * SEV-ES guests. In those guests the call to verify_cpu() would cause
+ * #VC exceptions which cannot be handled at this stage of secondary
+ * CPU bringup.
+ *
+ * All non-SEV-ES systems, especially Intel systems, need to execute
+ * verify_cpu() above to make sure NX is enabled.
+ */
+SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
+ UNWIND_HINT_EMPTY
+
+ /*
* Retrieve the modifier (SME encryption mask if SME is active) to be
* added to the initial pgdir entry that will be programmed into CR3.
*/
@@ -144,33 +170,6 @@ SYM_CODE_START(secondary_startup_64)
1:
UNWIND_HINT_EMPTY
- /* Check if nx is implemented */
- movl $0x80000001, %eax
- cpuid
- movl %edx,%edi
-
- /* Setup EFER (Extended Feature Enable Register) */
- movl $MSR_EFER, %ecx
- rdmsr
- btsl $_EFER_SCE, %eax /* Enable System Call */
- btl $20,%edi /* No Execute supported? */
- jnc 1f
- btsl $_EFER_NX, %eax
- btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
-1: wrmsr /* Make changes effective */
-
- /* Setup cr0 */
- movl $CR0_STATE, %eax
- /* Make changes effective */
- movq %rax, %cr0
-
- /* Setup a boot time stack */
- movq initial_stack(%rip), %rsp
-
- /* zero EFLAGS after setting rsp */
- pushq $0
- popfq
-
/*
* We must switch to a new descriptor in kernel space for the GDT
* because soon the kernel won't have access anymore to the userspace
@@ -205,6 +204,41 @@ SYM_CODE_START(secondary_startup_64)
movl initial_gs+4(%rip),%edx
wrmsr
+ /*
+ * Setup a boot time stack - Any secondary CPU will have lost its stack
+ * by now because the cr3-switch above unmaps the real-mode stack
+ */
+ movq initial_stack(%rip), %rsp
+
+ /* Setup and Load IDT */
+ pushq %rsi
+ call early_setup_idt
+ popq %rsi
+
+ /* Check if nx is implemented */
+ movl $0x80000001, %eax
+ cpuid
+ movl %edx,%edi
+
+ /* Setup EFER (Extended Feature Enable Register) */
+ movl $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_SCE, %eax /* Enable System Call */
+ btl $20,%edi /* No Execute supported? */
+ jnc 1f
+ btsl $_EFER_NX, %eax
+ btsq $_PAGE_BIT_NX,early_pmd_flags(%rip)
+1: wrmsr /* Make changes effective */
+
+ /* Setup cr0 */
+ movl $CR0_STATE, %eax
+ /* Make changes effective */
+ movq %rax, %cr0
+
+ /* zero EFLAGS after setting rsp */
+ pushq $0
+ popfq
+
/* rsi is pointer to real mode structure with interesting info.
pass it to C */
movq %rsi, %rdi
@@ -259,11 +293,47 @@ SYM_CODE_START(start_cpu0)
SYM_CODE_END(start_cpu0)
#endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * VC Exception handler used during early boot when running on kernel
+ * addresses, but before the switch to the idt_table can be made.
+ * The early_idt_handler_array can't be used here because it calls into a lot
+ * of __init code and this handler is also used during CPU offlining/onlining.
+ * Therefore this handler ends up in the .text section so that it stays around
+ * when .init.text is freed.
+ */
+SYM_CODE_START_NOALIGN(vc_boot_ghcb)
+ UNWIND_HINT_IRET_REGS offset=8
+
+ /* Build pt_regs */
+ PUSH_AND_CLEAR_REGS
+
+ /* Call C handler */
+ movq %rsp, %rdi
+ movq ORIG_RAX(%rsp), %rsi
+ movq initial_vc_handler(%rip), %rax
+ ANNOTATE_RETPOLINE_SAFE
+ call *%rax
+
+ /* Unwind pt_regs */
+ POP_REGS
+
+ /* Remove Error Code */
+ addq $8, %rsp
+
+ /* Pure iret required here - don't use INTERRUPT_RETURN */
+ iretq
+SYM_CODE_END(vc_boot_ghcb)
+#endif
+
/* Both SMP bootup and ACPI suspend change these variables */
__REFDATA
.balign 8
SYM_DATA(initial_code, .quad x86_64_start_kernel)
SYM_DATA(initial_gs, .quad INIT_PER_CPU_VAR(fixed_percpu_data))
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+SYM_DATA(initial_vc_handler, .quad handle_vc_boot_ghcb)
+#endif
/*
* The SIZEOF_PTREGS gap is a convention which helps the in-kernel unwinder
@@ -319,22 +389,43 @@ SYM_CODE_START_LOCAL(early_idt_handler_common)
pushq %r15 /* pt_regs->r15 */
UNWIND_HINT_REGS
- cmpq $14,%rsi /* Page fault? */
- jnz 10f
- GET_CR2_INTO(%rdi) /* can clobber %rax if pv */
- call early_make_pgtable
- andl %eax,%eax
- jz 20f /* All good */
-
-10:
movq %rsp,%rdi /* RDI = pt_regs; RSI is already trapnr */
- call early_fixup_exception
+ call do_early_exception
-20:
decl early_recursion_flag(%rip)
jmp restore_regs_and_return_to_kernel
SYM_CODE_END(early_idt_handler_common)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * VC Exception handler used during very early boot. The
+ * early_idt_handler_array can't be used because it returns via the
+ * paravirtualized INTERRUPT_RETURN and pv-ops don't work that early.
+ *
+ * This handler will end up in the .init.text section and not be
+ * available to boot secondary CPUs.
+ */
+SYM_CODE_START_NOALIGN(vc_no_ghcb)
+ UNWIND_HINT_IRET_REGS offset=8
+
+ /* Build pt_regs */
+ PUSH_AND_CLEAR_REGS
+
+ /* Call C handler */
+ movq %rsp, %rdi
+ movq ORIG_RAX(%rsp), %rsi
+ call do_vc_no_ghcb
+
+ /* Unwind pt_regs */
+ POP_REGS
+
+ /* Remove Error Code */
+ addq $8, %rsp
+
+ /* Pure iret required here - don't use INTERRUPT_RETURN */
+ iretq
+SYM_CODE_END(vc_no_ghcb)
+#endif
#define SYM_DATA_START_PAGE_ALIGNED(name) \
SYM_START(name, SYM_L_GLOBAL, .balign PAGE_SIZE)
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 1bffb87dcfdc..ee1a283f8e96 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -11,13 +11,6 @@
#include <asm/desc.h>
#include <asm/hw_irq.h>
-struct idt_data {
- unsigned int vector;
- unsigned int segment;
- struct idt_bits bits;
- const void *addr;
-};
-
#define DPL0 0x0
#define DPL3 0x3
@@ -175,20 +168,6 @@ bool idt_is_f00f_address(unsigned long address)
}
#endif
-static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
-{
- unsigned long addr = (unsigned long) d->addr;
-
- gate->offset_low = (u16) addr;
- gate->segment = (u16) d->segment;
- gate->bits = d->bits;
- gate->offset_middle = (u16) (addr >> 16);
-#ifdef CONFIG_X86_64
- gate->offset_high = (u32) (addr >> 32);
- gate->reserved = 0;
-#endif
-}
-
static __init void
idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
{
@@ -206,14 +185,7 @@ static __init void set_intr_gate(unsigned int n, const void *addr)
{
struct idt_data data;
- BUG_ON(n > 0xFF);
-
- memset(&data, 0, sizeof(data));
- data.vector = n;
- data.addr = addr;
- data.segment = __KERNEL_CS;
- data.bits.type = GATE_INTERRUPT;
- data.bits.p = 1;
+ init_idt_data(&data, n, addr);
idt_setup_from_table(idt_table, &data, 1, false);
}
@@ -254,11 +226,14 @@ static const __initconst struct idt_data early_pf_idts[] = {
* cpu_init() when the TSS has been initialized.
*/
static const __initconst struct idt_data ist_idts[] = {
- ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
- ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
- ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
+ ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
+ ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
+ ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
#ifdef CONFIG_X86_MCE
- ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+ ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
+#endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ ISTG(X86_TRAP_VC, asm_exc_vmm_communication, IST_INDEX_VC),
#endif
};
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9663ba31347c..1c0f2560a41c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -36,6 +36,8 @@
#include <asm/hypervisor.h>
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
+#include <asm/ptrace.h>
+#include <asm/svm.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
@@ -744,13 +746,34 @@ static void __init kvm_init_platform(void)
x86_platform.apic_post_init = kvm_apic_init;
}
+#if defined(CONFIG_AMD_MEM_ENCRYPT)
+static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* RAX and CPL are already in the GHCB */
+ ghcb_set_rbx(ghcb, regs->bx);
+ ghcb_set_rcx(ghcb, regs->cx);
+ ghcb_set_rdx(ghcb, regs->dx);
+ ghcb_set_rsi(ghcb, regs->si);
+}
+
+static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs)
+{
+ /* No checking of the return state needed */
+ return true;
+}
+#endif
+
const __initconst struct hypervisor_x86 x86_hyper_kvm = {
- .name = "KVM",
- .detect = kvm_detect,
- .type = X86_HYPER_KVM,
- .init.guest_late_init = kvm_guest_init,
- .init.x2apic_available = kvm_para_available,
- .init.init_platform = kvm_init_platform,
+ .name = "KVM",
+ .detect = kvm_detect,
+ .type = X86_HYPER_KVM,
+ .init.guest_late_init = kvm_guest_init,
+ .init.x2apic_available = kvm_para_available,
+ .init.init_platform = kvm_init_platform,
+#if defined(CONFIG_AMD_MEM_ENCRYPT)
+ .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
+ .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish,
+#endif
};
static __init int activate_jump_labels(void)
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 47381666d6a5..4bc77aaf1303 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -33,6 +33,7 @@
#include <asm/reboot.h>
#include <asm/cache.h>
#include <asm/nospec-branch.h>
+#include <asm/sev-es.h>
#define CREATE_TRACE_POINTS
#include <trace/events/nmi.h>
@@ -476,6 +477,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
{
bool irq_state;
+ /*
+ * Re-enable NMIs right here when running as an SEV-ES guest. This might
+ * cause nested NMIs, but those can be handled safely.
+ */
+ sev_es_nmi_complete();
+
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
return;
@@ -487,6 +494,12 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
this_cpu_write(nmi_cr2, read_cr2());
nmi_restart:
+ /*
+ * Needs to happen before DR7 is accessed, because the hypervisor can
+ * intercept DR7 reads/writes, turning those into #VC exceptions.
+ */
+ sev_es_ist_enter(regs);
+
this_cpu_write(nmi_dr7, local_db_save());
irq_state = idtentry_enter_nmi(regs);
@@ -500,6 +513,8 @@ nmi_restart:
local_db_restore(this_cpu_read(nmi_dr7));
+ sev_es_ist_exit();
+
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
write_cr2(this_cpu_read(nmi_cr2));
if (this_cpu_dec_return(nmi_state))
diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
new file mode 100644
index 000000000000..5f83ccaab877
--- /dev/null
+++ b/arch/x86/kernel/sev-es-shared.c
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AMD Encrypted Register State Support
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ *
+ * This file is not compiled stand-alone. It contains code shared
+ * between the pre-decompression boot code and the running Linux kernel
+ * and is included directly into both code-bases.
+ */
+
+#ifndef __BOOT_COMPRESSED
+#define error(v) pr_err(v)
+#define has_cpuflag(f) boot_cpu_has(f)
+#endif
+
+static bool __init sev_es_check_cpu_features(void)
+{
+ if (!has_cpuflag(X86_FEATURE_RDRAND)) {
+ error("RDRAND instruction not supported - no trusted source of randomness available\n");
+ return false;
+ }
+
+ return true;
+}
+
+static void sev_es_terminate(unsigned int reason)
+{
+ u64 val = GHCB_SEV_TERMINATE;
+
+ /*
+ * Tell the hypervisor what went wrong - only reason-set 0 is
+ * currently supported.
+ */
+ val |= GHCB_SEV_TERMINATE_REASON(0, reason);
+
+ /* Request Guest Termination from Hypervisor */
+ sev_es_wr_ghcb_msr(val);
+ VMGEXIT();
+
+ while (true)
+ asm volatile("hlt\n" : : : "memory");
+}
+
+static bool sev_es_negotiate_protocol(void)
+{
+ u64 val;
+
+ /* Do the GHCB protocol version negotiation */
+ sev_es_wr_ghcb_msr(GHCB_SEV_INFO_REQ);
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+
+ if (GHCB_INFO(val) != GHCB_SEV_INFO)
+ return false;
+
+ if (GHCB_PROTO_MAX(val) < GHCB_PROTO_OUR ||
+ GHCB_PROTO_MIN(val) > GHCB_PROTO_OUR)
+ return false;
+
+ return true;
+}
+
+static __always_inline void vc_ghcb_invalidate(struct ghcb *ghcb)
+{
+ memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap));
+}
+
+static bool vc_decoding_needed(unsigned long exit_code)
+{
+ /* Exceptions don't require decoding the instruction */
+ return !(exit_code >= SVM_EXIT_EXCP_BASE &&
+ exit_code <= SVM_EXIT_LAST_EXCP);
+}
+
+static enum es_result vc_init_em_ctxt(struct es_em_ctxt *ctxt,
+ struct pt_regs *regs,
+ unsigned long exit_code)
+{
+ enum es_result ret = ES_OK;
+
+ memset(ctxt, 0, sizeof(*ctxt));
+ ctxt->regs = regs;
+
+ if (vc_decoding_needed(exit_code))
+ ret = vc_decode_insn(ctxt);
+
+ return ret;
+}
+
+static void vc_finish_insn(struct es_em_ctxt *ctxt)
+{
+ ctxt->regs->ip += ctxt->insn.length;
+}
+
+static enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ u64 exit_code, u64 exit_info_1,
+ u64 exit_info_2)
+{
+ enum es_result ret;
+
+ /* Fill in protocol and format specifiers */
+ ghcb->protocol_version = GHCB_PROTOCOL_MAX;
+ ghcb->ghcb_usage = GHCB_DEFAULT_USAGE;
+
+ ghcb_set_sw_exit_code(ghcb, exit_code);
+ ghcb_set_sw_exit_info_1(ghcb, exit_info_1);
+ ghcb_set_sw_exit_info_2(ghcb, exit_info_2);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ if ((ghcb->save.sw_exit_info_1 & 0xffffffff) == 1) {
+ u64 info = ghcb->save.sw_exit_info_2;
+ unsigned long v;
+
+ info = ghcb->save.sw_exit_info_2;
+ v = info & SVM_EVTINJ_VEC_MASK;
+
+ /* Check if exception information from hypervisor is sane. */
+ if ((info & SVM_EVTINJ_VALID) &&
+ ((v == X86_TRAP_GP) || (v == X86_TRAP_UD)) &&
+ ((info & SVM_EVTINJ_TYPE_MASK) == SVM_EVTINJ_TYPE_EXEPT)) {
+ ctxt->fi.vector = v;
+ if (info & SVM_EVTINJ_VALID_ERR)
+ ctxt->fi.error_code = info >> 32;
+ ret = ES_EXCEPTION;
+ } else {
+ ret = ES_VMM_ERROR;
+ }
+ } else {
+ ret = ES_OK;
+ }
+
+ return ret;
+}
+
+/*
+ * Boot VC Handler - This is the first VC handler during boot, there is no GHCB
+ * page yet, so it only supports the MSR based communication with the
+ * hypervisor and only the CPUID exit-code.
+ */
+void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
+{
+ unsigned int fn = lower_bits(regs->ax, 32);
+ unsigned long val;
+
+ /* Only CPUID is supported via MSR protocol */
+ if (exit_code != SVM_EXIT_CPUID)
+ goto fail;
+
+ sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EAX));
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
+ goto fail;
+ regs->ax = val >> 32;
+
+ sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EBX));
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
+ goto fail;
+ regs->bx = val >> 32;
+
+ sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_ECX));
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
+ goto fail;
+ regs->cx = val >> 32;
+
+ sev_es_wr_ghcb_msr(GHCB_CPUID_REQ(fn, GHCB_CPUID_REQ_EDX));
+ VMGEXIT();
+ val = sev_es_rd_ghcb_msr();
+ if (GHCB_SEV_GHCB_RESP_CODE(val) != GHCB_SEV_CPUID_RESP)
+ goto fail;
+ regs->dx = val >> 32;
+
+ /* Skip over the CPUID two-byte opcode */
+ regs->ip += 2;
+
+ return;
+
+fail:
+ sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE);
+ VMGEXIT();
+
+ /* Shouldn't get here - if we do, halt the machine */
+ while (true)
+ asm volatile("hlt\n");
+}
+
+static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
+ void *src, char *buf,
+ unsigned int data_size,
+ unsigned int count,
+ bool backwards)
+{
+ int i, b = backwards ? -1 : 1;
+ enum es_result ret = ES_OK;
+
+ for (i = 0; i < count; i++) {
+ void *s = src + (i * data_size * b);
+ char *d = buf + (i * data_size);
+
+ ret = vc_read_mem(ctxt, s, d, data_size);
+ if (ret != ES_OK)
+ break;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_insn_string_write(struct es_em_ctxt *ctxt,
+ void *dst, char *buf,
+ unsigned int data_size,
+ unsigned int count,
+ bool backwards)
+{
+ int i, s = backwards ? -1 : 1;
+ enum es_result ret = ES_OK;
+
+ for (i = 0; i < count; i++) {
+ void *d = dst + (i * data_size * s);
+ char *b = buf + (i * data_size);
+
+ ret = vc_write_mem(ctxt, d, b, data_size);
+ if (ret != ES_OK)
+ break;
+ }
+
+ return ret;
+}
+
+#define IOIO_TYPE_STR BIT(2)
+#define IOIO_TYPE_IN 1
+#define IOIO_TYPE_INS (IOIO_TYPE_IN | IOIO_TYPE_STR)
+#define IOIO_TYPE_OUT 0
+#define IOIO_TYPE_OUTS (IOIO_TYPE_OUT | IOIO_TYPE_STR)
+
+#define IOIO_REP BIT(3)
+
+#define IOIO_ADDR_64 BIT(9)
+#define IOIO_ADDR_32 BIT(8)
+#define IOIO_ADDR_16 BIT(7)
+
+#define IOIO_DATA_32 BIT(6)
+#define IOIO_DATA_16 BIT(5)
+#define IOIO_DATA_8 BIT(4)
+
+#define IOIO_SEG_ES (0 << 10)
+#define IOIO_SEG_DS (3 << 10)
+
+static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
+{
+ struct insn *insn = &ctxt->insn;
+ *exitinfo = 0;
+
+ switch (insn->opcode.bytes[0]) {
+ /* INS opcodes */
+ case 0x6c:
+ case 0x6d:
+ *exitinfo |= IOIO_TYPE_INS;
+ *exitinfo |= IOIO_SEG_ES;
+ *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ break;
+
+ /* OUTS opcodes */
+ case 0x6e:
+ case 0x6f:
+ *exitinfo |= IOIO_TYPE_OUTS;
+ *exitinfo |= IOIO_SEG_DS;
+ *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ break;
+
+ /* IN immediate opcodes */
+ case 0xe4:
+ case 0xe5:
+ *exitinfo |= IOIO_TYPE_IN;
+ *exitinfo |= (u64)insn->immediate.value << 16;
+ break;
+
+ /* OUT immediate opcodes */
+ case 0xe6:
+ case 0xe7:
+ *exitinfo |= IOIO_TYPE_OUT;
+ *exitinfo |= (u64)insn->immediate.value << 16;
+ break;
+
+ /* IN register opcodes */
+ case 0xec:
+ case 0xed:
+ *exitinfo |= IOIO_TYPE_IN;
+ *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ break;
+
+ /* OUT register opcodes */
+ case 0xee:
+ case 0xef:
+ *exitinfo |= IOIO_TYPE_OUT;
+ *exitinfo |= (ctxt->regs->dx & 0xffff) << 16;
+ break;
+
+ default:
+ return ES_DECODE_FAILED;
+ }
+
+ switch (insn->opcode.bytes[0]) {
+ case 0x6c:
+ case 0x6e:
+ case 0xe4:
+ case 0xe6:
+ case 0xec:
+ case 0xee:
+ /* Single byte opcodes */
+ *exitinfo |= IOIO_DATA_8;
+ break;
+ default:
+ /* Length determined by instruction parsing */
+ *exitinfo |= (insn->opnd_bytes == 2) ? IOIO_DATA_16
+ : IOIO_DATA_32;
+ }
+ switch (insn->addr_bytes) {
+ case 2:
+ *exitinfo |= IOIO_ADDR_16;
+ break;
+ case 4:
+ *exitinfo |= IOIO_ADDR_32;
+ break;
+ case 8:
+ *exitinfo |= IOIO_ADDR_64;
+ break;
+ }
+
+ if (insn_has_rep_prefix(insn))
+ *exitinfo |= IOIO_REP;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ u64 exit_info_1, exit_info_2;
+ enum es_result ret;
+
+ ret = vc_ioio_exitinfo(ctxt, &exit_info_1);
+ if (ret != ES_OK)
+ return ret;
+
+ if (exit_info_1 & IOIO_TYPE_STR) {
+
+ /* (REP) INS/OUTS */
+
+ bool df = ((regs->flags & X86_EFLAGS_DF) == X86_EFLAGS_DF);
+ unsigned int io_bytes, exit_bytes;
+ unsigned int ghcb_count, op_count;
+ unsigned long es_base;
+ u64 sw_scratch;
+
+ /*
+ * For the string variants with rep prefix the number of in/out
+ * operations per #VC exception is limited so that the kernel
+ * has a chance to take interrupts and re-schedule while the
+ * instruction is emulated.
+ */
+ io_bytes = (exit_info_1 >> 4) & 0x7;
+ ghcb_count = sizeof(ghcb->shared_buffer) / io_bytes;
+
+ op_count = (exit_info_1 & IOIO_REP) ? regs->cx : 1;
+ exit_info_2 = min(op_count, ghcb_count);
+ exit_bytes = exit_info_2 * io_bytes;
+
+ es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
+
+ /* Read bytes of OUTS into the shared buffer */
+ if (!(exit_info_1 & IOIO_TYPE_IN)) {
+ ret = vc_insn_string_read(ctxt,
+ (void *)(es_base + regs->si),
+ ghcb->shared_buffer, io_bytes,
+ exit_info_2, df);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Issue a VMGEXIT to the HV to consume the bytes from the
+ * shared buffer or to have it write them into the shared buffer
+ * depending on the instruction: OUTS or INS.
+ */
+ sw_scratch = __pa(ghcb) + offsetof(struct ghcb, shared_buffer);
+ ghcb_set_sw_scratch(ghcb, sw_scratch);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO,
+ exit_info_1, exit_info_2);
+ if (ret != ES_OK)
+ return ret;
+
+ /* Read bytes from shared buffer into the guest's destination. */
+ if (exit_info_1 & IOIO_TYPE_IN) {
+ ret = vc_insn_string_write(ctxt,
+ (void *)(es_base + regs->di),
+ ghcb->shared_buffer, io_bytes,
+ exit_info_2, df);
+ if (ret)
+ return ret;
+
+ if (df)
+ regs->di -= exit_bytes;
+ else
+ regs->di += exit_bytes;
+ } else {
+ if (df)
+ regs->si -= exit_bytes;
+ else
+ regs->si += exit_bytes;
+ }
+
+ if (exit_info_1 & IOIO_REP)
+ regs->cx -= exit_info_2;
+
+ ret = regs->cx ? ES_RETRY : ES_OK;
+
+ } else {
+
+ /* IN/OUT into/from rAX */
+
+ int bits = (exit_info_1 & 0x70) >> 1;
+ u64 rax = 0;
+
+ if (!(exit_info_1 & IOIO_TYPE_IN))
+ rax = lower_bits(regs->ax, bits);
+
+ ghcb_set_rax(ghcb, rax);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_IOIO, exit_info_1, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (exit_info_1 & IOIO_TYPE_IN) {
+ if (!ghcb_rax_is_valid(ghcb))
+ return ES_VMM_ERROR;
+ regs->ax = lower_bits(ghcb->save.rax, bits);
+ }
+ }
+
+ return ret;
+}
+
+static enum es_result vc_handle_cpuid(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ u32 cr4 = native_read_cr4();
+ enum es_result ret;
+
+ ghcb_set_rax(ghcb, regs->ax);
+ ghcb_set_rcx(ghcb, regs->cx);
+
+ if (cr4 & X86_CR4_OSXSAVE)
+ /* Safe to read xcr0 */
+ ghcb_set_xcr0(ghcb, xgetbv(XCR_XFEATURE_ENABLED_MASK));
+ else
+ /* xgetbv will cause #GP - use reset value for xcr0 */
+ ghcb_set_xcr0(ghcb, 1);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_CPUID, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) &&
+ ghcb_rbx_is_valid(ghcb) &&
+ ghcb_rcx_is_valid(ghcb) &&
+ ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ regs->ax = ghcb->save.rax;
+ regs->bx = ghcb->save.rbx;
+ regs->cx = ghcb->save.rcx;
+ regs->dx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_rdtsc(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt,
+ unsigned long exit_code)
+{
+ bool rdtscp = (exit_code == SVM_EXIT_RDTSCP);
+ enum es_result ret;
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb) &&
+ (!rdtscp || ghcb_rcx_is_valid(ghcb))))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+ ctxt->regs->dx = ghcb->save.rdx;
+ if (rdtscp)
+ ctxt->regs->cx = ghcb->save.rcx;
+
+ return ES_OK;
+}
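
As a worked example of the IOIO encoding above (annotation, not part of the patch): for "outb %al, $0x80" (opcode 0xe6, immediate 0x80, 64-bit address size) vc_ioio_exitinfo() builds

	u64 exitinfo = IOIO_TYPE_OUT		/* bit 0 clear: OUT		*/
		     | (0x80ULL << 16)		/* port 0x80 in bits 31:16	*/
		     | IOIO_DATA_8		/* single-byte transfer		*/
		     | IOIO_ADDR_64;		/* 64-bit address size		*/

which evaluates to 0x00800210 and is passed to the hypervisor as exit_info_1 of the SVM_EXIT_IOIO VMGEXIT.
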
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
new file mode 100644
index 000000000000..4a96726fbaf8
--- /dev/null
+++ b/arch/x86/kernel/sev-es.c
@@ -0,0 +1,1404 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2019 SUSE
+ *
+ * Author: Joerg Roedel <jroedel@suse.de>
+ */
+
+#define pr_fmt(fmt) "SEV-ES: " fmt
+
+#include <linux/sched/debug.h> /* For show_regs() */
+#include <linux/percpu-defs.h>
+#include <linux/mem_encrypt.h>
+#include <linux/lockdep.h>
+#include <linux/printk.h>
+#include <linux/mm_types.h>
+#include <linux/set_memory.h>
+#include <linux/memblock.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+
+#include <asm/cpu_entry_area.h>
+#include <asm/stacktrace.h>
+#include <asm/sev-es.h>
+#include <asm/insn-eval.h>
+#include <asm/fpu/internal.h>
+#include <asm/processor.h>
+#include <asm/realmode.h>
+#include <asm/traps.h>
+#include <asm/svm.h>
+#include <asm/smp.h>
+#include <asm/cpu.h>
+
+#define DR7_RESET_VALUE 0x400
+
+/* For early boot hypervisor communication in SEV-ES enabled guests */
+static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE);
+
+/*
+ * Needs to be in the .data section because we need it NULL before bss is
+ * cleared
+ */
+static struct ghcb __initdata *boot_ghcb;
+
+/* #VC handler runtime per-CPU data */
+struct sev_es_runtime_data {
+ struct ghcb ghcb_page;
+
+ /* Physical storage for the per-CPU IST stack of the #VC handler */
+ char ist_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
+
+ /*
+ * Physical storage for the per-CPU fall-back stack of the #VC handler.
+ * The fall-back stack is used when it is not safe to switch back to the
+ * interrupted stack in the #VC entry code.
+ */
+ char fallback_stack[EXCEPTION_STKSZ] __aligned(PAGE_SIZE);
+
+ /*
+ * Reserve one page per CPU as backup storage for the unencrypted GHCB.
+ * It is needed when an NMI happens while the #VC handler uses the real
+ * GHCB, and the NMI handler itself is causing another #VC exception. In
+ * that case the GHCB content of the first handler needs to be backed up
+ * and restored.
+ */
+ struct ghcb backup_ghcb;
+
+ /*
+ * Mark the per-cpu GHCBs as in-use to detect nested #VC exceptions.
+ * There is no need for it to be atomic, because nothing is written to
+ * the GHCB between the read and the write of ghcb_active. So it is safe
+ * to use it when a nested #VC exception happens before the write.
+ *
+ * This is necessary for example in the #VC->NMI->#VC case when the NMI
+ * happens while the first #VC handler uses the GHCB. When the NMI code
+ * raises a second #VC handler it might overwrite the contents of the
+ * GHCB written by the first handler. To avoid this the content of the
+ * GHCB is saved and restored when the GHCB is detected to be in use
+ * already.
+ */
+ bool ghcb_active;
+ bool backup_ghcb_active;
+
+ /*
+ * Cached DR7 value - write it on DR7 writes and return it on reads.
+ * That value will never make it to the real hardware DR7 as debugging
+ * is currently unsupported in SEV-ES guests.
+ */
+ unsigned long dr7;
+};
+
+struct ghcb_state {
+ struct ghcb *ghcb;
+};
+
+static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data);
+DEFINE_STATIC_KEY_FALSE(sev_es_enable_key);
+
+/* Needed in vc_early_forward_exception */
+void do_early_exception(struct pt_regs *regs, int trapnr);
+
+static void __init setup_vc_stacks(int cpu)
+{
+ struct sev_es_runtime_data *data;
+ struct cpu_entry_area *cea;
+ unsigned long vaddr;
+ phys_addr_t pa;
+
+ data = per_cpu(runtime_data, cpu);
+ cea = get_cpu_entry_area(cpu);
+
+ /* Map #VC IST stack */
+ vaddr = CEA_ESTACK_BOT(&cea->estacks, VC);
+ pa = __pa(data->ist_stack);
+ cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
+
+ /* Map VC fall-back stack */
+ vaddr = CEA_ESTACK_BOT(&cea->estacks, VC2);
+ pa = __pa(data->fallback_stack);
+ cea_set_pte((void *)vaddr, pa, PAGE_KERNEL);
+}
+
+static __always_inline bool on_vc_stack(unsigned long sp)
+{
+ return ((sp >= __this_cpu_ist_bottom_va(VC)) && (sp < __this_cpu_ist_top_va(VC)));
+}
+
+/*
+ * This function handles the case when an NMI is raised in the #VC exception
+ * handler entry code. In this case, the IST entry for #VC must be adjusted, so
+ * that any subsequent #VC exception will not overwrite the stack contents of the
+ * interrupted #VC handler.
+ *
+ * The IST entry is adjusted unconditionally so that it can also be
+ * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested
+ * sev_es_ist_exit() call may adjust back the IST entry too early.
+ */
+void noinstr __sev_es_ist_enter(struct pt_regs *regs)
+{
+ unsigned long old_ist, new_ist;
+
+ /* Read old IST entry */
+ old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+ /* Make room on the IST stack */
+ if (on_vc_stack(regs->sp))
+ new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
+ else
+ new_ist = old_ist - sizeof(old_ist);
+
+ /* Store old IST entry */
+ *(unsigned long *)new_ist = old_ist;
+
+ /* Set new IST entry */
+ this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
+}
+
+void noinstr __sev_es_ist_exit(void)
+{
+ unsigned long ist;
+
+ /* Read IST entry */
+ ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+
+ if (WARN_ON(ist == __this_cpu_ist_top_va(VC)))
+ return;
+
+ /* Read back old IST entry and write it to the TSS */
+ this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long *)ist);
+}
+
+static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (unlikely(data->ghcb_active)) {
+ /* GHCB is already in use - save its contents */
+
+ if (unlikely(data->backup_ghcb_active))
+ return NULL;
+
+ /* Mark backup_ghcb active before writing to it */
+ data->backup_ghcb_active = true;
+
+ state->ghcb = &data->backup_ghcb;
+
+ /* Backup GHCB content */
+ *state->ghcb = *ghcb;
+ } else {
+ state->ghcb = NULL;
+ data->ghcb_active = true;
+ }
+
+ return ghcb;
+}
+
+static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
+{
+ struct sev_es_runtime_data *data;
+ struct ghcb *ghcb;
+
+ data = this_cpu_read(runtime_data);
+ ghcb = &data->ghcb_page;
+
+ if (state->ghcb) {
+ /* Restore GHCB from Backup */
+ *ghcb = *state->ghcb;
+ data->backup_ghcb_active = false;
+ state->ghcb = NULL;
+ } else {
+ data->ghcb_active = false;
+ }
+}
+
+/* Needed in vc_early_forward_exception */
+void do_early_exception(struct pt_regs *regs, int trapnr);
+
+static inline u64 sev_es_rd_ghcb_msr(void)
+{
+ return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
+}
+
+static inline void sev_es_wr_ghcb_msr(u64 val)
+{
+ u32 low, high;
+
+ low = (u32)(val);
+ high = (u32)(val >> 32);
+
+ native_wrmsr(MSR_AMD64_SEV_ES_GHCB, low, high);
+}
+
+static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
+ unsigned char *buffer)
+{
+ return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
+}
+
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ enum es_result ret;
+ int res;
+
+ if (user_mode(ctxt->regs)) {
+ res = insn_fetch_from_user(ctxt->regs, buffer);
+ if (!res) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
+ }
+
+ if (!insn_decode(&ctxt->insn, ctxt->regs, buffer, res))
+ return ES_DECODE_FAILED;
+ } else {
+ res = vc_fetch_insn_kernel(ctxt, buffer);
+ if (res) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
+ }
+
+ insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
+ insn_get_length(&ctxt->insn);
+ }
+
+ ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
+
+ return ret;
+}
+
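+/*
+ * The memory access helpers below use put_user()/get_user() so that a
+ * faulting access is caught by the exception tables and reported back to
+ * the caller through ctxt->fi instead of crashing the #VC handler.
+ */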
+static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
+ char *dst, char *buf, size_t size)
+{
+ unsigned long error_code = X86_PF_PROT | X86_PF_WRITE;
+ char __user *target = (char __user *)dst;
+ u64 d8;
+ u32 d4;
+ u16 d2;
+ u8 d1;
+
+ switch (size) {
+ case 1:
+ memcpy(&d1, buf, 1);
+ if (put_user(d1, target))
+ goto fault;
+ break;
+ case 2:
+ memcpy(&d2, buf, 2);
+ if (put_user(d2, target))
+ goto fault;
+ break;
+ case 4:
+ memcpy(&d4, buf, 4);
+ if (put_user(d4, target))
+ goto fault;
+ break;
+ case 8:
+ memcpy(&d8, buf, 8);
+ if (put_user(d8, target))
+ goto fault;
+ break;
+ default:
+ WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+ return ES_UNSUPPORTED;
+ }
+
+ return ES_OK;
+
+fault:
+ if (user_mode(ctxt->regs))
+ error_code |= X86_PF_USER;
+
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = error_code;
+ ctxt->fi.cr2 = (unsigned long)dst;
+
+ return ES_EXCEPTION;
+}
+
+static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
+ char *src, char *buf, size_t size)
+{
+ unsigned long error_code = X86_PF_PROT;
+ char __user *s = (char __user *)src;
+ u64 d8;
+ u32 d4;
+ u16 d2;
+ u8 d1;
+
+ switch (size) {
+ case 1:
+ if (get_user(d1, s))
+ goto fault;
+ memcpy(buf, &d1, 1);
+ break;
+ case 2:
+ if (get_user(d2, s))
+ goto fault;
+ memcpy(buf, &d2, 2);
+ break;
+ case 4:
+ if (get_user(d4, s))
+ goto fault;
+ memcpy(buf, &d4, 4);
+ break;
+ case 8:
+ if (get_user(d8, s))
+ goto fault;
+ memcpy(buf, &d8, 8);
+ break;
+ default:
+ WARN_ONCE(1, "%s: Invalid size: %zu\n", __func__, size);
+ return ES_UNSUPPORTED;
+ }
+
+ return ES_OK;
+
+fault:
+ if (user_mode(ctxt->regs))
+ error_code |= X86_PF_USER;
+
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = error_code;
+ ctxt->fi.cr2 = (unsigned long)src;
+
+ return ES_EXCEPTION;
+}
+
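+/*
+ * Translate a virtual address to the physical address behind it by walking
+ * the current page-tables. This is needed because the access which caused
+ * the #VC exception can target any kind of mapping, so a simple __pa() is
+ * not sufficient here.
+ */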
+static bool vc_slow_virt_to_phys(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned long vaddr, phys_addr_t *paddr)
+{
+ unsigned long va = (unsigned long)vaddr;
+ unsigned int level;
+ phys_addr_t pa;
+ pgd_t *pgd;
+ pte_t *pte;
+
+ pgd = __va(read_cr3_pa());
+ pgd = &pgd[pgd_index(va)];
+ pte = lookup_address_in_pgd(pgd, va, &level);
+ if (!pte) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.cr2 = vaddr;
+ ctxt->fi.error_code = 0;
+
+ if (user_mode(ctxt->regs))
+ ctxt->fi.error_code |= X86_PF_USER;
+
+ return false;
+ }
+
+ pa = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
+ pa |= va & ~page_level_mask(level);
+
+ *paddr = pa;
+
+ return true;
+}
+
+/* Include code shared with pre-decompression boot stage */
+#include "sev-es-shared.c"
+
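+/* Tell the hypervisor that the guest is done handling an NMI */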
+void noinstr __sev_es_nmi_complete(void)
+{
+ struct ghcb_state state;
+ struct ghcb *ghcb;
+
+ ghcb = sev_es_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
+ VMGEXIT();
+
+ sev_es_put_ghcb(&state);
+}
+
+static u64 get_jump_table_addr(void)
+{
+ struct ghcb_state state;
+ unsigned long flags;
+ struct ghcb *ghcb;
+ u64 ret = 0;
+
+ local_irq_save(flags);
+
+ ghcb = sev_es_get_ghcb(&state);
+
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
+ ghcb_set_sw_exit_info_1(ghcb, SVM_VMGEXIT_GET_AP_JUMP_TABLE);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ if (ghcb_sw_exit_info_1_is_valid(ghcb) &&
+ ghcb_sw_exit_info_2_is_valid(ghcb))
+ ret = ghcb->save.sw_exit_info_2;
+
+ sev_es_put_ghcb(&state);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
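+/*
+ * Write the CS:IP of the real-mode SEV-ES trampoline into the AP Jump
+ * Table reported by the hypervisor, so that APs which enter through the
+ * jump table start in the kernel's trampoline code.
+ */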
+int sev_es_setup_ap_jump_table(struct real_mode_header *rmh)
+{
+ u16 startup_cs, startup_ip;
+ phys_addr_t jump_table_pa;
+ u64 jump_table_addr;
+ u16 __iomem *jump_table;
+
+ jump_table_addr = get_jump_table_addr();
+
+ /* On UP guests there is no jump table so this is not a failure */
+ if (!jump_table_addr)
+ return 0;
+
+ /* Check if AP Jump Table is page-aligned */
+ if (jump_table_addr & ~PAGE_MASK)
+ return -EINVAL;
+
+ jump_table_pa = jump_table_addr & PAGE_MASK;
+
+ startup_cs = (u16)(rmh->trampoline_start >> 4);
+ startup_ip = (u16)(rmh->sev_es_trampoline_start -
+ rmh->trampoline_start);
+
+ jump_table = ioremap_encrypted(jump_table_pa, PAGE_SIZE);
+ if (!jump_table)
+ return -EIO;
+
+ writew(startup_ip, &jump_table[0]);
+ writew(startup_cs, &jump_table[1]);
+
+ iounmap(jump_table);
+
+ return 0;
+}
+
+/*
+ * This is needed by the OVMF UEFI firmware which will use whatever it finds in
+ * the GHCB MSR as its GHCB to talk to the hypervisor. So make sure the per-cpu
+ * runtime GHCBs used by the kernel are also mapped in the EFI page-table.
+ */
+int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
+{
+ struct sev_es_runtime_data *data;
+ unsigned long address, pflags;
+ int cpu;
+ u64 pfn;
+
+ if (!sev_es_active())
+ return 0;
+
+ pflags = _PAGE_NX | _PAGE_RW;
+
+ for_each_possible_cpu(cpu) {
+ data = per_cpu(runtime_data, cpu);
+
+ address = __pa(&data->ghcb_page);
+ pfn = address >> PAGE_SHIFT;
+
+ if (kernel_map_pages_in_pgd(pgd, pfn, address, 1, pflags))
+ return 1;
+ }
+
+ return 0;
+}
+
+static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ struct pt_regs *regs = ctxt->regs;
+ enum es_result ret;
+ u64 exit_info_1;
+
+ /* Is it a WRMSR (0x0f 0x30)? Otherwise it is an RDMSR (0x0f 0x32). */
+ exit_info_1 = (ctxt->insn.opcode.bytes[1] == 0x30) ? 1 : 0;
+
+ ghcb_set_rcx(ghcb, regs->cx);
+ if (exit_info_1) {
+ ghcb_set_rax(ghcb, regs->ax);
+ ghcb_set_rdx(ghcb, regs->dx);
+ }
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_MSR, exit_info_1, 0);
+
+ if ((ret == ES_OK) && (!exit_info_1)) {
+ regs->ax = ghcb->save.rax;
+ regs->dx = ghcb->save.rdx;
+ }
+
+ return ret;
+}
+
+/*
+ * This function runs on the first #VC exception after the kernel
+ * switched to virtual addresses.
+ */
+static bool __init sev_es_setup_ghcb(void)
+{
+ /* First make sure the hypervisor talks a supported protocol. */
+ if (!sev_es_negotiate_protocol())
+ return false;
+
+ /*
+ * Clear the boot_ghcb. The first exception comes in before the bss
+ * section is cleared.
+ */
+ memset(&boot_ghcb_page, 0, PAGE_SIZE);
+
+ /* Alright - Make the boot-ghcb public */
+ boot_ghcb = &boot_ghcb_page;
+
+ return true;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sev_es_ap_hlt_loop(void)
+{
+ struct ghcb_state state;
+ struct ghcb *ghcb;
+
+ ghcb = sev_es_get_ghcb(&state);
+
+ while (true) {
+ vc_ghcb_invalidate(ghcb);
+ ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_HLT_LOOP);
+ ghcb_set_sw_exit_info_1(ghcb, 0);
+ ghcb_set_sw_exit_info_2(ghcb, 0);
+
+ sev_es_wr_ghcb_msr(__pa(ghcb));
+ VMGEXIT();
+
+ /* Wakeup signal? */
+ if (ghcb_sw_exit_info_2_is_valid(ghcb) &&
+ ghcb->save.sw_exit_info_2)
+ break;
+ }
+
+ sev_es_put_ghcb(&state);
+}
+
+/*
+ * Play_dead handler when running under SEV-ES. This is needed because
+ * the hypervisor can't deliver an SIPI request to restart the AP.
+ * Instead the kernel has to issue a VMGEXIT to halt the VCPU until the
+ * hypervisor wakes it up again.
+ */
+static void sev_es_play_dead(void)
+{
+ play_dead_common();
+
+ /* IRQs now disabled */
+
+ sev_es_ap_hlt_loop();
+
+ /*
+ * If we get here, the VCPU was woken up again. Jump to CPU
+ * startup code to get it back online.
+ */
+ start_cpu0();
+}
+#else /* CONFIG_HOTPLUG_CPU */
+#define sev_es_play_dead native_play_dead
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_SMP
+static void __init sev_es_setup_play_dead(void)
+{
+ smp_ops.play_dead = sev_es_play_dead;
+}
+#else
+static inline void sev_es_setup_play_dead(void) { }
+#endif
+
+static void __init alloc_runtime_data(int cpu)
+{
+ struct sev_es_runtime_data *data;
+
+ data = memblock_alloc(sizeof(*data), PAGE_SIZE);
+ if (!data)
+ panic("Can't allocate SEV-ES runtime data");
+
+ per_cpu(runtime_data, cpu) = data;
+}
+
+static void __init init_ghcb(int cpu)
+{
+ struct sev_es_runtime_data *data;
+ int err;
+
+ data = per_cpu(runtime_data, cpu);
+
+ err = early_set_memory_decrypted((unsigned long)&data->ghcb_page,
+ sizeof(data->ghcb_page));
+ if (err)
+ panic("Can't map GHCBs unencrypted");
+
+ memset(&data->ghcb_page, 0, sizeof(data->ghcb_page));
+
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+}
+
+void __init sev_es_init_vc_handling(void)
+{
+ int cpu;
+
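+ /* The GHCB page is mapped decrypted per-page, so it must be page-aligned */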
+ BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % PAGE_SIZE);
+
+ if (!sev_es_active())
+ return;
+
+ if (!sev_es_check_cpu_features())
+ panic("SEV-ES CPU Features missing");
+
+ /* Enable SEV-ES special handling */
+ static_branch_enable(&sev_es_enable_key);
+
+ /* Initialize per-cpu GHCB pages */
+ for_each_possible_cpu(cpu) {
+ alloc_runtime_data(cpu);
+ init_ghcb(cpu);
+ setup_vc_stacks(cpu);
+ }
+
+ sev_es_setup_play_dead();
+
+ /* Secondary CPUs use the runtime #VC handler */
+ initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
+}
+
+static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
+{
+ int trapnr = ctxt->fi.vector;
+
+ if (trapnr == X86_TRAP_PF)
+ native_write_cr2(ctxt->fi.cr2);
+
+ ctxt->regs->orig_ax = ctxt->fi.error_code;
+ do_early_exception(ctxt->regs, trapnr);
+}
+
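+/*
+ * The two helpers below return a pointer into pt_regs for the register
+ * encoded in the reg respectively r/m part of the instruction's ModRM
+ * byte. The insn_get_modrm_*_off() helpers return the register's byte
+ * offset from the base of pt_regs, hence the division by sizeof(long).
+ */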
+static long *vc_insn_get_reg(struct es_em_ctxt *ctxt)
+{
+ long *reg_array;
+ int offset;
+
+ reg_array = (long *)ctxt->regs;
+ offset = insn_get_modrm_reg_off(&ctxt->insn, ctxt->regs);
+
+ if (offset < 0)
+ return NULL;
+
+ offset /= sizeof(long);
+
+ return reg_array + offset;
+}
+
+static long *vc_insn_get_rm(struct es_em_ctxt *ctxt)
+{
+ long *reg_array;
+ int offset;
+
+ reg_array = (long *)ctxt->regs;
+ offset = insn_get_modrm_rm_off(&ctxt->insn, ctxt->regs);
+
+ if (offset < 0)
+ return NULL;
+
+ offset /= sizeof(long);
+
+ return reg_array + offset;
+}
+
+static enum es_result vc_do_mmio(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+ unsigned int bytes, bool read)
+{
+ u64 exit_code, exit_info_1, exit_info_2;
+ unsigned long ghcb_pa = __pa(ghcb);
+ phys_addr_t paddr;
+ void __user *ref;
+
+ ref = insn_get_addr_ref(&ctxt->insn, ctxt->regs);
+ if (ref == (void __user *)-1L)
+ return ES_UNSUPPORTED;
+
+ exit_code = read ? SVM_VMGEXIT_MMIO_READ : SVM_VMGEXIT_MMIO_WRITE;
+
+ if (!vc_slow_virt_to_phys(ghcb, ctxt, (unsigned long)ref, &paddr)) {
+ if (!read)
+ ctxt->fi.error_code |= X86_PF_WRITE;
+
+ return ES_EXCEPTION;
+ }
+
+ exit_info_1 = paddr;
+ /* Can never be greater than 8 */
+ exit_info_2 = bytes;
+
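+ /* The MMIO data is exchanged through the GHCB's shared buffer */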
+ ghcb_set_sw_scratch(ghcb, ghcb_pa + offsetof(struct ghcb, shared_buffer));
+
+ return sev_es_ghcb_hv_call(ghcb, ctxt, exit_code, exit_info_1, exit_info_2);
+}
+
+static enum es_result vc_handle_mmio_twobyte_ops(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct insn *insn = &ctxt->insn;
+ unsigned int bytes = 0;
+ enum es_result ret;
+ int sign_byte;
+ long *reg_data;
+
+ switch (insn->opcode.bytes[1]) {
+ /* MMIO Read w/ zero-extension */
+ case 0xb6:
+ bytes = 1;
+ fallthrough;
+ case 0xb7:
+ if (!bytes)
+ bytes = 2;
+
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ /* Zero extend based on operand size */
+ reg_data = vc_insn_get_reg(ctxt);
+ if (!reg_data)
+ return ES_DECODE_FAILED;
+
+ memset(reg_data, 0, insn->opnd_bytes);
+
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+
+ /* MMIO Read w/ sign-extension */
+ case 0xbe:
+ bytes = 1;
+ fallthrough;
+ case 0xbf:
+ if (!bytes)
+ bytes = 2;
+
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ /* Sign extend based on operand size */
+ reg_data = vc_insn_get_reg(ctxt);
+ if (!reg_data)
+ return ES_DECODE_FAILED;
+
+ if (bytes == 1) {
+ u8 *val = (u8 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x80) ? 0xff : 0x00;
+ } else {
+ u16 *val = (u16 *)ghcb->shared_buffer;
+
+ sign_byte = (*val & 0x8000) ? 0xff : 0x00;
+ }
+ memset(reg_data, sign_byte, insn->opnd_bytes);
+
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+
+ default:
+ ret = ES_UNSUPPORTED;
+ }
+
+ return ret;
+}
+
+/*
+ * The MOVS instruction has two memory operands, which raises the
+ * problem that it is not known whether the access to the source or the
+ * destination caused the #VC exception (and hence whether an MMIO read
+ * or write operation needs to be emulated).
+ *
+ * Instead of playing games with walking page-tables and trying to guess
+ * whether the source or destination is an MMIO range, split the move
+ * into two operations, a read and a write with only one memory operand.
+ * This will cause a nested #VC exception on the MMIO address which can
+ * then be handled.
+ *
+ * This implementation has the benefit that it also supports MOVS where
+ * source _and_ destination are MMIO regions.
+ *
+ * It will slow MOVS on MMIO down a lot, but in SEV-ES guests it is a
+ * rare operation. If it turns out to be a performance problem the split
+ * operations can be moved to memcpy_fromio() and memcpy_toio().
+ */
+static enum es_result vc_handle_mmio_movs(struct es_em_ctxt *ctxt,
+ unsigned int bytes)
+{
+ unsigned long ds_base, es_base;
+ unsigned char *src, *dst;
+ unsigned char buffer[8];
+ enum es_result ret;
+ bool rep;
+ int off;
+
+ ds_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_DS);
+ es_base = insn_get_seg_base(ctxt->regs, INAT_SEG_REG_ES);
+
+ if (ds_base == -1L || es_base == -1L) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
+
+ src = ds_base + (unsigned char *)ctxt->regs->si;
+ dst = es_base + (unsigned char *)ctxt->regs->di;
+
+ ret = vc_read_mem(ctxt, src, buffer, bytes);
+ if (ret != ES_OK)
+ return ret;
+
+ ret = vc_write_mem(ctxt, dst, buffer, bytes);
+ if (ret != ES_OK)
+ return ret;
+
+ if (ctxt->regs->flags & X86_EFLAGS_DF)
+ off = -bytes;
+ else
+ off = bytes;
+
+ ctxt->regs->si += off;
+ ctxt->regs->di += off;
+
+ rep = insn_has_rep_prefix(&ctxt->insn);
+ if (rep)
+ ctxt->regs->cx -= 1;
+
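+ /*
+ * Returning ES_RETRY leaves the instruction pointer untouched, so the
+ * MOVS is re-executed with the updated SI/DI/CX and raises the next #VC
+ * exception for the following iteration.
+ */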
+ if (!rep || ctxt->regs->cx == 0)
+ return ES_OK;
+ else
+ return ES_RETRY;
+}
+
+static enum es_result vc_handle_mmio(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct insn *insn = &ctxt->insn;
+ unsigned int bytes = 0;
+ enum es_result ret;
+ long *reg_data;
+
+ switch (insn->opcode.bytes[0]) {
+ /* MMIO Write */
+ case 0x88:
+ bytes = 1;
+ fallthrough;
+ case 0x89:
+ if (!bytes)
+ bytes = insn->opnd_bytes;
+
+ reg_data = vc_insn_get_reg(ctxt);
+ if (!reg_data)
+ return ES_DECODE_FAILED;
+
+ memcpy(ghcb->shared_buffer, reg_data, bytes);
+
+ ret = vc_do_mmio(ghcb, ctxt, bytes, false);
+ break;
+
+ case 0xc6:
+ bytes = 1;
+ fallthrough;
+ case 0xc7:
+ if (!bytes)
+ bytes = insn->opnd_bytes;
+
+ memcpy(ghcb->shared_buffer, insn->immediate1.bytes, bytes);
+
+ ret = vc_do_mmio(ghcb, ctxt, bytes, false);
+ break;
+
+ /* MMIO Read */
+ case 0x8a:
+ bytes = 1;
+ fallthrough;
+ case 0x8b:
+ if (!bytes)
+ bytes = insn->opnd_bytes;
+
+ ret = vc_do_mmio(ghcb, ctxt, bytes, true);
+ if (ret)
+ break;
+
+ reg_data = vc_insn_get_reg(ctxt);
+ if (!reg_data)
+ return ES_DECODE_FAILED;
+
+ /* Zero-extend for 32-bit operation */
+ if (bytes == 4)
+ *reg_data = 0;
+
+ memcpy(reg_data, ghcb->shared_buffer, bytes);
+ break;
+
+ /* MOVS instruction */
+ case 0xa4:
+ bytes = 1;
+ fallthrough;
+ case 0xa5:
+ if (!bytes)
+ bytes = insn->opnd_bytes;
+
+ ret = vc_handle_mmio_movs(ctxt, bytes);
+ break;
+ /* Two-Byte Opcodes */
+ case 0x0f:
+ ret = vc_handle_mmio_twobyte_ops(ghcb, ctxt);
+ break;
+ default:
+ ret = ES_UNSUPPORTED;
+ }
+
+ return ret;
+}
+
+static enum es_result vc_handle_dr7_write(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ long val, *reg = vc_insn_get_rm(ctxt);
+ enum es_result ret;
+
+ if (!reg)
+ return ES_DECODE_FAILED;
+
+ val = *reg;
+
+ /* Upper 32 bits must be written as zeroes */
+ if (val >> 32) {
+ ctxt->fi.vector = X86_TRAP_GP;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+ }
+
+ /* Clear out other reserved bits and set bit 10 - reserved, must be one */
+ val = (val & 0xffff23ffL) | BIT(10);
+
+ /* Early non-zero writes to DR7 are not supported */
+ if (!data && (val & ~DR7_RESET_VALUE))
+ return ES_UNSUPPORTED;
+
+ /* Using a value of 0 for ExitInfo1 means RAX holds the value */
+ ghcb_set_rax(ghcb, val);
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WRITE_DR7, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (data)
+ data->dr7 = val;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_dr7_read(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ long *reg = vc_insn_get_rm(ctxt);
+
+ if (!reg)
+ return ES_DECODE_FAILED;
+
+ if (data)
+ *reg = data->dr7;
+ else
+ *reg = DR7_RESET_VALUE;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_wbinvd(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ return sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_WBINVD, 0, 0);
+}
+
+static enum es_result vc_handle_rdpmc(struct ghcb *ghcb, struct es_em_ctxt *ctxt)
+{
+ enum es_result ret;
+
+ ghcb_set_rcx(ghcb, ctxt->regs->cx);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_RDPMC, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!(ghcb_rax_is_valid(ghcb) && ghcb_rdx_is_valid(ghcb)))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+ ctxt->regs->dx = ghcb->save.rdx;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_monitor(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /*
+ * Treat it as a NOP and do not leak a physical address to the
+ * hypervisor.
+ */
+ return ES_OK;
+}
+
+static enum es_result vc_handle_mwait(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /* Treat the same as MONITOR/MONITORX */
+ return ES_OK;
+}
+
+static enum es_result vc_handle_vmmcall(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ enum es_result ret;
+
+ ghcb_set_rax(ghcb, ctxt->regs->ax);
+ ghcb_set_cpl(ghcb, user_mode(ctxt->regs) ? 3 : 0);
+
+ if (x86_platform.hyper.sev_es_hcall_prepare)
+ x86_platform.hyper.sev_es_hcall_prepare(ghcb, ctxt->regs);
+
+ ret = sev_es_ghcb_hv_call(ghcb, ctxt, SVM_EXIT_VMMCALL, 0, 0);
+ if (ret != ES_OK)
+ return ret;
+
+ if (!ghcb_rax_is_valid(ghcb))
+ return ES_VMM_ERROR;
+
+ ctxt->regs->ax = ghcb->save.rax;
+
+ /*
+ * Call sev_es_hcall_finish() after regs->ax is already set.
+ * This allows the hypervisor handler to overwrite it again if
+ * necessary.
+ */
+ if (x86_platform.hyper.sev_es_hcall_finish &&
+ !x86_platform.hyper.sev_es_hcall_finish(ghcb, ctxt->regs))
+ return ES_VMM_ERROR;
+
+ return ES_OK;
+}
+
+static enum es_result vc_handle_trap_ac(struct ghcb *ghcb,
+ struct es_em_ctxt *ctxt)
+{
+ /*
+ * Calling exc_alignment_check() directly does not work, because it
+ * enables IRQs and the GHCB is active. Forward the exception and call
+ * it later from vc_forward_exception().
+ */
+ ctxt->fi.vector = X86_TRAP_AC;
+ ctxt->fi.error_code = 0;
+ return ES_EXCEPTION;
+}
+
+static __always_inline void vc_handle_trap_db(struct pt_regs *regs)
+{
+ if (user_mode(regs))
+ noist_exc_debug(regs);
+ else
+ exc_debug(regs);
+}
+
+static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
+ struct ghcb *ghcb,
+ unsigned long exit_code)
+{
+ enum es_result result;
+
+ switch (exit_code) {
+ case SVM_EXIT_READ_DR7:
+ result = vc_handle_dr7_read(ghcb, ctxt);
+ break;
+ case SVM_EXIT_WRITE_DR7:
+ result = vc_handle_dr7_write(ghcb, ctxt);
+ break;
+ case SVM_EXIT_EXCP_BASE + X86_TRAP_AC:
+ result = vc_handle_trap_ac(ghcb, ctxt);
+ break;
+ case SVM_EXIT_RDTSC:
+ case SVM_EXIT_RDTSCP:
+ result = vc_handle_rdtsc(ghcb, ctxt, exit_code);
+ break;
+ case SVM_EXIT_RDPMC:
+ result = vc_handle_rdpmc(ghcb, ctxt);
+ break;
+ case SVM_EXIT_INVD:
+ pr_err_ratelimited("#VC exception for INVD??? Seriously???\n");
+ result = ES_UNSUPPORTED;
+ break;
+ case SVM_EXIT_CPUID:
+ result = vc_handle_cpuid(ghcb, ctxt);
+ break;
+ case SVM_EXIT_IOIO:
+ result = vc_handle_ioio(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MSR:
+ result = vc_handle_msr(ghcb, ctxt);
+ break;
+ case SVM_EXIT_VMMCALL:
+ result = vc_handle_vmmcall(ghcb, ctxt);
+ break;
+ case SVM_EXIT_WBINVD:
+ result = vc_handle_wbinvd(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MONITOR:
+ result = vc_handle_monitor(ghcb, ctxt);
+ break;
+ case SVM_EXIT_MWAIT:
+ result = vc_handle_mwait(ghcb, ctxt);
+ break;
+ case SVM_EXIT_NPF:
+ result = vc_handle_mmio(ghcb, ctxt);
+ break;
+ default:
+ /*
+ * Unexpected #VC exception
+ */
+ result = ES_UNSUPPORTED;
+ }
+
+ return result;
+}
+
+static __always_inline void vc_forward_exception(struct es_em_ctxt *ctxt)
+{
+ long error_code = ctxt->fi.error_code;
+ int trapnr = ctxt->fi.vector;
+
+ ctxt->regs->orig_ax = ctxt->fi.error_code;
+
+ switch (trapnr) {
+ case X86_TRAP_GP:
+ exc_general_protection(ctxt->regs, error_code);
+ break;
+ case X86_TRAP_UD:
+ exc_invalid_op(ctxt->regs);
+ break;
+ case X86_TRAP_AC:
+ exc_alignment_check(ctxt->regs, error_code);
+ break;
+ default:
+ pr_emerg("Unsupported exception in #VC instruction emulation - can't continue\n");
+ BUG();
+ }
+}
+
+static __always_inline bool on_vc_fallback_stack(struct pt_regs *regs)
+{
+ unsigned long sp = (unsigned long)regs;
+
+ return (sp >= __this_cpu_ist_bottom_va(VC2) && sp < __this_cpu_ist_top_va(VC2));
+}
+
+/*
+ * Main #VC exception handler. It is called when the entry code was able to
+ * switch off the IST to a safe kernel stack.
+ *
+ * With the current implementation it is always possible to switch to a safe
+ * stack because #VC exceptions only happen at known places, like intercepted
+ * instructions or accesses to MMIO areas/IO ports. They can also happen with
+ * code instrumentation when the hypervisor intercepts #DB, but the critical
+ * paths are forbidden to be instrumented, so #DB exceptions currently also
+ * only happen in safe places.
+ */
+DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
+{
+ struct sev_es_runtime_data *data = this_cpu_read(runtime_data);
+ struct ghcb_state state;
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+ struct ghcb *ghcb;
+
+ lockdep_assert_irqs_disabled();
+
+ /*
+ * Handle #DB before calling into !noinstr code to avoid recursive #DB.
+ */
+ if (error_code == SVM_EXIT_EXCP_BASE + X86_TRAP_DB) {
+ vc_handle_trap_db(regs);
+ return;
+ }
+
+ instrumentation_begin();
+
+ /*
+ * This is invoked through an interrupt gate, so IRQs are disabled. The
+ * code below might walk page-tables for user or kernel addresses, so
+ * keep the IRQs disabled to protect us against concurrent TLB flushes.
+ */
+
+ ghcb = sev_es_get_ghcb(&state);
+ if (!ghcb) {
+ /*
+ * Mark GHCBs inactive so that panic() is able to print the
+ * message.
+ */
+ data->ghcb_active = false;
+ data->backup_ghcb_active = false;
+
+ panic("Unable to handle #VC exception! GHCB and Backup GHCB are already in use");
+ }
+
+ vc_ghcb_invalidate(ghcb);
+ result = vc_init_em_ctxt(&ctxt, regs, error_code);
+
+ if (result == ES_OK)
+ result = vc_handle_exitcode(&ctxt, ghcb, error_code);
+
+ sev_es_put_ghcb(&state);
+
+ /* Done - now check the result */
+ switch (result) {
+ case ES_OK:
+ vc_finish_insn(&ctxt);
+ break;
+ case ES_UNSUPPORTED:
+ pr_err_ratelimited("Unsupported exit-code 0x%02lx in #VC exception (IP: 0x%lx)\n",
+ error_code, regs->ip);
+ goto fail;
+ case ES_VMM_ERROR:
+ pr_err_ratelimited("Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
+ error_code, regs->ip);
+ goto fail;
+ case ES_DECODE_FAILED:
+ pr_err_ratelimited("Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
+ error_code, regs->ip);
+ goto fail;
+ case ES_EXCEPTION:
+ vc_forward_exception(&ctxt);
+ break;
+ case ES_RETRY:
+ /* Nothing to do */
+ break;
+ default:
+ pr_emerg("Unknown result in %s():%d\n", __func__, result);
+ /*
+ * Emulating the instruction which caused the #VC exception
+ * failed - can't continue so print debug information
+ */
+ BUG();
+ }
+
+out:
+ instrumentation_end();
+
+ return;
+
+fail:
+ if (user_mode(regs)) {
+ /*
+ * Do not kill the machine if user-space triggered the
+ * exception. Send SIGBUS instead and let user-space deal with
+ * it.
+ */
+ force_sig_fault(SIGBUS, BUS_OBJERR, (void __user *)0);
+ } else {
+ pr_emerg("PANIC: Unhandled #VC exception in kernel space (result=%d)\n",
+ result);
+
+ /* Show some debug info */
+ show_regs(regs);
+
+ /* Ask hypervisor to sev_es_terminate */
+ sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+
+ /* If that fails and we get here - just panic */
+ panic("Returned from Terminate-Request to Hypervisor\n");
+ }
+
+ goto out;
+}
+
+/* This handler runs on the #VC fall-back stack. It can cause further #VC exceptions */
+DEFINE_IDTENTRY_VC_IST(exc_vmm_communication)
+{
+ instrumentation_begin();
+ panic("Can't handle #VC exception from unsupported context\n");
+ instrumentation_end();
+}
+
+DEFINE_IDTENTRY_VC(exc_vmm_communication)
+{
+ if (likely(!on_vc_fallback_stack(regs)))
+ safe_stack_exc_vmm_communication(regs, error_code);
+ else
+ ist_exc_vmm_communication(regs, error_code);
+}
+
+bool __init handle_vc_boot_ghcb(struct pt_regs *regs)
+{
+ unsigned long exit_code = regs->orig_ax;
+ struct es_em_ctxt ctxt;
+ enum es_result result;
+
+ /* Do initial setup or terminate the guest */
+ if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb()))
+ sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
+
+ vc_ghcb_invalidate(boot_ghcb);
+
+ result = vc_init_em_ctxt(&ctxt, regs, exit_code);
+ if (result == ES_OK)
+ result = vc_handle_exitcode(&ctxt, boot_ghcb, exit_code);
+
+ /* Done - now check the result */
+ switch (result) {
+ case ES_OK:
+ vc_finish_insn(&ctxt);
+ break;
+ case ES_UNSUPPORTED:
+ early_printk("PANIC: Unsupported exit-code 0x%02lx in early #VC exception (IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_VMM_ERROR:
+ early_printk("PANIC: Failure in communication with VMM (exit-code 0x%02lx IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_DECODE_FAILED:
+ early_printk("PANIC: Failed to decode instruction (exit-code 0x%02lx IP: 0x%lx)\n",
+ exit_code, regs->ip);
+ goto fail;
+ case ES_EXCEPTION:
+ vc_early_forward_exception(&ctxt);
+ break;
+ case ES_RETRY:
+ /* Nothing to do */
+ break;
+ default:
+ BUG();
+ }
+
+ return true;
+
+fail:
+ show_regs(regs);
+
+ while (true)
+ halt();
+}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f5ef689dd62a..de776b2e6046 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -227,7 +227,7 @@ static void notrace start_secondary(void *unused)
load_cr3(swapper_pg_dir);
__flush_tlb_all();
#endif
- load_current_idt();
+ cpu_init_exception_handling();
cpu_init();
x86_cpuinit.early_percpu_clock_init();
preempt_disable();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ec3a2572843f..3c70fb34028b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -43,6 +43,7 @@
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
+#include <asm/realmode.h>
#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
@@ -673,6 +674,50 @@ asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
return regs;
}
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *regs)
+{
+ unsigned long sp, *stack;
+ struct stack_info info;
+ struct pt_regs *regs_ret;
+
+ /*
+ * In the SYSCALL entry path the RSP value comes from user-space - don't
+ * trust it and switch to the current kernel stack
+ */
+ if (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
+ regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack) {
+ sp = this_cpu_read(cpu_current_top_of_stack);
+ goto sync;
+ }
+
+ /*
+ * From here on the RSP value is trusted. Now check whether entry
+ * happened from a safe stack. Not safe are the entry or unknown stacks,
+ * use the fall-back stack instead in this case.
+ */
+ sp = regs->sp;
+ stack = (unsigned long *)sp;
+
+ if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY ||
+ info.type >= STACK_TYPE_EXCEPTION_LAST)
+ sp = __this_cpu_ist_top_va(VC2);
+
+sync:
+ /*
+ * Found a safe stack - switch to it as if the entry didn't happen via
+ * IST stack. The code below only copies pt_regs, the real switch happens
+ * in assembly code.
+ */
+ sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
+
+ regs_ret = (struct pt_regs *)sp;
+ *regs_ret = *regs;
+
+ return regs_ret;
+}
+#endif
+
struct bad_iret_stack {
void *error_entry_ret;
struct pt_regs regs;
@@ -1082,6 +1127,9 @@ void __init trap_init(void)
/* Init cpu_entry_area before IST entries are set up */
setup_cpu_entry_areas();
+ /* Init GHCB memory pages when running as an SEV-ES guest */
+ sev_es_init_vc_handling();
+
idt_setup_traps();
/*
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 2c304fd0bb1a..f6225bf22c02 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -335,63 +335,28 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
*/
bool fixup_umip_exception(struct pt_regs *regs)
{
- int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst;
- unsigned long seg_base = 0, *reg_addr;
+ int nr_copied, reg_offset, dummy_data_size, umip_inst;
/* 10 bytes is the maximum size of the result of UMIP instructions */
unsigned char dummy_data[10] = { 0 };
unsigned char buf[MAX_INSN_SIZE];
+ unsigned long *reg_addr;
void __user *uaddr;
struct insn insn;
- int seg_defs;
if (!regs)
return false;
- /*
- * If not in user-space long mode, a custom code segment could be in
- * use. This is true in protected mode (if the process defined a local
- * descriptor table), or virtual-8086 mode. In most of the cases
- * seg_base will be zero as in USER_CS.
- */
- if (!user_64bit_mode(regs))
- seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
-
- if (seg_base == -1L)
- return false;
-
- not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
- sizeof(buf));
- nr_copied = sizeof(buf) - not_copied;
+ nr_copied = insn_fetch_from_user(regs, buf);
/*
- * The copy_from_user above could have failed if user code is protected
- * by a memory protection key. Give up on emulation in such a case.
- * Should we issue a page fault?
+ * The insn_fetch_from_user above could have failed if user code
+ * is protected by a memory protection key. Give up on emulation
+ * in such a case. Should we issue a page fault?
*/
if (!nr_copied)
return false;
- insn_init(&insn, buf, nr_copied, user_64bit_mode(regs));
-
- /*
- * Override the default operand and address sizes with what is specified
- * in the code segment descriptor. The instruction decoder only sets
- * the address size it to either 4 or 8 address bytes and does nothing
- * for the operand bytes. This OK for most of the cases, but we could
- * have special cases where, for instance, a 16-bit code segment
- * descriptor is used.
- * If there is an address override prefix, the instruction decoder
- * correctly updates these values, even for 16-bit defaults.
- */
- seg_defs = insn_get_code_seg_params(regs);
- if (seg_defs == -EINVAL)
- return false;
-
- insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
- insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
-
- insn_get_length(&insn);
- if (nr_copied < insn.length)
+ if (!insn_decode(&insn, regs, buf, nr_copied))
return false;
umip_inst = identify_insn(&insn);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e90bc436f584..598a769f1961 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1062,10 +1062,14 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
struct vmcb *hsave = svm->nested.hsave;
struct vmcb __user *user_vmcb = (struct vmcb __user *)
&user_kvm_nested_state->data.svm[0];
- struct vmcb_control_area ctl;
- struct vmcb_save_area save;
+ struct vmcb_control_area *ctl;
+ struct vmcb_save_area *save;
+ int ret;
u32 cr0;
+ BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
+ KVM_STATE_NESTED_SVM_VMCB_SIZE);
+
if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
return -EINVAL;
@@ -1097,13 +1101,22 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
return -EINVAL;
- if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl)))
- return -EFAULT;
- if (copy_from_user(&save, &user_vmcb->save, sizeof(save)))
- return -EFAULT;
- if (!nested_vmcb_check_controls(&ctl))
- return -EINVAL;
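+ /* Allocate ctl and save from the heap to avoid a large stack frame */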
+ ret = -ENOMEM;
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
+ save = kzalloc(sizeof(*save), GFP_KERNEL);
+ if (!ctl || !save)
+ goto out_free;
+
+ ret = -EFAULT;
+ if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
+ goto out_free;
+ if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
+ goto out_free;
+
+ ret = -EINVAL;
+ if (!nested_vmcb_check_controls(ctl))
+ goto out_free;
/*
* Processor state contains L2 state. Check that it is
@@ -1111,15 +1124,15 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
*/
cr0 = kvm_read_cr0(vcpu);
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
- return -EINVAL;
+ goto out_free;
/*
* Validate host state saved from before VMRUN (see
* nested_svm_check_permissions).
* TODO: validate reserved bits for all saved state.
*/
- if (!(save.cr0 & X86_CR0_PG))
- return -EINVAL;
+ if (!(save->cr0 & X86_CR0_PG))
+ goto out_free;
/*
* All checks done, we can enter guest mode. L1 control fields
@@ -1128,10 +1141,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
* contains saved L1 state.
*/
copy_vmcb_control_area(&hsave->control, &svm->vmcb->control);
- hsave->save = save;
+ hsave->save = *save;
svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa;
- load_nested_vmcb_control(svm, &ctl);
+ load_nested_vmcb_control(svm, ctl);
nested_prepare_vmcb_control(svm);
if (!nested_svm_vmrun_msrpm(svm))
@@ -1139,7 +1152,13 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
out_set_gif:
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
- return 0;
+
+ ret = 0;
+out_free:
+ kfree(save);
+ kfree(ctl);
+
+ return ret;
}
struct kvm_x86_nested_ops svm_nested_ops = {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index dec6d5bc603c..9709c98d0d6c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4176,6 +4176,8 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
static int __init svm_init(void)
{
+ __unused_size_checks();
+
return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
__alignof__(struct vcpu_svm), THIS_MODULE);
}
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 5e69603ff63f..58f7fb95c7f4 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -20,6 +20,7 @@
enum reg_type {
REG_TYPE_RM = 0,
+ REG_TYPE_REG,
REG_TYPE_INDEX,
REG_TYPE_BASE,
};
@@ -53,6 +54,30 @@ static bool is_string_insn(struct insn *insn)
}
/**
+ * insn_has_rep_prefix() - Determine if instruction has a REP prefix
+ * @insn: Instruction containing the prefix to inspect
+ *
+ * Returns:
+ *
+ * true if the instruction has a REP prefix, false if not.
+ */
+bool insn_has_rep_prefix(struct insn *insn)
+{
+ int i;
+
+ insn_get_prefixes(insn);
+
+ for (i = 0; i < insn->prefixes.nbytes; i++) {
+ insn_byte_t p = insn->prefixes.bytes[i];
+
+ if (p == 0xf2 || p == 0xf3)
+ return true;
+ }
+
+ return false;
+}
+
+/**
* get_seg_reg_override_idx() - obtain segment register override index
* @insn: Valid instruction with segment override prefixes
*
@@ -439,6 +464,13 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
regno += 8;
break;
+ case REG_TYPE_REG:
+ regno = X86_MODRM_REG(insn->modrm.value);
+
+ if (X86_REX_R(insn->rex_prefix.value))
+ regno += 8;
+ break;
+
case REG_TYPE_INDEX:
regno = X86_SIB_INDEX(insn->sib.value);
if (X86_REX_X(insn->rex_prefix.value))
@@ -808,6 +840,21 @@ int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs)
}
/**
+ * insn_get_modrm_reg_off() - Obtain register in reg part of the ModRM byte
+ * @insn: Instruction containing the ModRM byte
+ * @regs: Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the reg part of the ModRM byte. The
+ * register is obtained as an offset from the base of pt_regs.
+ */
+int insn_get_modrm_reg_off(struct insn *insn, struct pt_regs *regs)
+{
+ return get_reg_offset(insn, regs, REG_TYPE_REG);
+}
+
+/**
* get_seg_base_limit() - obtain base address and limit of a segment
* @insn: Instruction. Must be valid.
* @regs: Register values as seen when entering kernel mode
@@ -1367,3 +1414,86 @@ void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
return (void __user *)-1L;
}
}
+
+/**
+ * insn_fetch_from_user() - Copy instruction bytes from user-space memory
+ * @regs: Structure with register values as seen when entering kernel mode
+ * @buf: Array to store the fetched instruction
+ *
+ * Gets the linear address of the instruction and copies the instruction bytes
+ * to the buf.
+ *
+ * Returns:
+ *
+ * Number of instruction bytes copied.
+ *
+ * 0 if nothing was copied.
+ */
+int insn_fetch_from_user(struct pt_regs *regs, unsigned char buf[MAX_INSN_SIZE])
+{
+ unsigned long seg_base = 0;
+ int not_copied;
+
+ /*
+ * If not in user-space long mode, a custom code segment could be in
+ * use. This is true in protected mode (if the process defined a local
+ * descriptor table), or virtual-8086 mode. In most of the cases
+ * seg_base will be zero as in USER_CS.
+ */
+ if (!user_64bit_mode(regs)) {
+ seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
+ if (seg_base == -1L)
+ return 0;
+ }
+
+ not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
+ MAX_INSN_SIZE);
+
+ return MAX_INSN_SIZE - not_copied;
+}
+
+/**
+ * insn_decode() - Decode an instruction
+ * @insn: Structure to store decoded instruction
+ * @regs: Structure with register values as seen when entering kernel mode
+ * @buf: Buffer containing the instruction bytes
+ * @buf_size: Number of instruction bytes available in buf
+ *
+ * Decodes the instruction provided in buf and stores the decoding results in
+ * insn. Also determines the correct address and operand sizes.
+ *
+ * Returns:
+ *
+ * True if instruction was decoded, False otherwise.
+ */
+bool insn_decode(struct insn *insn, struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE], int buf_size)
+{
+ int seg_defs;
+
+ insn_init(insn, buf, buf_size, user_64bit_mode(regs));
+
+ /*
+ * Override the default operand and address sizes with what is specified
+ * in the code segment descriptor. The instruction decoder only sets
+ * the address size to either 4 or 8 address bytes and does nothing
+ * for the operand bytes. This is OK for most cases, but we could
+ * have special cases where, for instance, a 16-bit code segment
+ * descriptor is used.
+ * If there is an address override prefix, the instruction decoder
+ * correctly updates these values, even for 16-bit defaults.
+ */
+ seg_defs = insn_get_code_seg_params(regs);
+ if (seg_defs == -EINVAL)
+ return false;
+
+ insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
+ insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
+
+ insn_get_length(insn);
+ if (buf_size < insn->length)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index 770b613790b3..f5e1e60c9095 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -21,7 +21,8 @@ DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
#endif
-struct cpu_entry_area *get_cpu_entry_area(int cpu)
+/* Is called from entry code, so must be noinstr */
+noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
{
unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 5829457f7ca3..b93d6cd08a7f 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -5,6 +5,7 @@
#include <xen/xen.h>
#include <asm/fpu/internal.h>
+#include <asm/sev-es.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 9f1177edc2e7..ebb7edc8bc0a 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -38,6 +38,7 @@
* section is later cleared.
*/
u64 sme_me_mask __section(.data) = 0;
+u64 sev_status __section(.data) = 0;
EXPORT_SYMBOL(sme_me_mask);
DEFINE_STATIC_KEY_FALSE(sev_enable_key);
EXPORT_SYMBOL_GPL(sev_enable_key);
@@ -347,7 +348,13 @@ bool sme_active(void)
bool sev_active(void)
{
- return sme_me_mask && sev_enabled;
+ return sev_status & MSR_AMD64_SEV_ENABLED;
+}
+
+/* Needs to be called from non-instrumentable code */
+bool noinstr sev_es_active(void)
+{
+ return sev_status & MSR_AMD64_SEV_ES_ENABLED;
}
/* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
@@ -400,6 +407,31 @@ void __init mem_encrypt_free_decrypted_mem(void)
free_init_pages("unused decrypted", vaddr, vaddr_end);
}
+static void print_mem_encrypt_feature_info(void)
+{
+ pr_info("AMD Memory Encryption Features active:");
+
+ /* Secure Memory Encryption */
+ if (sme_active()) {
+ /*
+ * SME is mutually exclusive with any of the SEV
+ * features below.
+ */
+ pr_cont(" SME\n");
+ return;
+ }
+
+ /* Secure Encrypted Virtualization */
+ if (sev_active())
+ pr_cont(" SEV");
+
+ /* Encrypted Register State */
+ if (sev_es_active())
+ pr_cont(" SEV-ES");
+
+ pr_cont("\n");
+}
+
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void)
{
@@ -415,8 +447,6 @@ void __init mem_encrypt_init(void)
if (sev_active())
static_branch_enable(&sev_enable_key);
- pr_info("AMD %s active\n",
- sev_active() ? "Secure Encrypted Virtualization (SEV)"
- : "Secure Memory Encryption (SME)");
+ print_mem_encrypt_feature_info();
}
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index e2b0e2ac07bb..68d75379e06a 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -540,6 +540,9 @@ void __init sme_enable(struct boot_params *bp)
if (!(msr & MSR_AMD64_SEV_ENABLED))
return;
+ /* Save SEV_STATUS to avoid reading MSR again */
+ sev_status = msr;
+
/* SEV state cannot be controlled by a command line option */
sme_me_mask = me_mask;
sev_enabled = true;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 6af4da1149ba..8f5759df7776 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -47,6 +47,7 @@
#include <asm/realmode.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
+#include <asm/sev-es.h>
/*
* We allocate runtime services regions top-down, starting from -4G, i.e.
@@ -230,6 +231,15 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
}
/*
+ * When SEV-ES is active, the GHCB as set by the kernel will be used
+ * by firmware. Create a 1:1 unencrypted mapping for each GHCB.
+ */
+ if (sev_es_efi_map_ghcbs(pgd)) {
+ pr_err("Failed to create 1:1 mapping for the GHCBs!\n");
+ return 1;
+ }
+
+ /*
* When making calls to the firmware everything needs to be 1:1
* mapped and addressable with 32-bit pointers. Map the kernel
* text and allocate a new stack because we can't rely on the
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 1ed1208931e0..22fda7d99159 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -9,6 +9,7 @@
#include <asm/realmode.h>
#include <asm/tlbflush.h>
#include <asm/crash.h>
+#include <asm/sev-es.h>
struct real_mode_header *real_mode_header;
u32 *trampoline_cr4_features;
@@ -38,6 +39,25 @@ void __init reserve_real_mode(void)
crash_reserve_low_1M();
}
+static void sme_sev_setup_real_mode(struct trampoline_header *th)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ if (sme_active())
+ th->flags |= TH_FLAGS_SME_ACTIVE;
+
+ if (sev_es_active()) {
+ /*
+ * Skip the call to verify_cpu() in secondary_startup_64 as it
+ * will cause #VC exceptions when the AP can't handle them yet.
+ */
+ th->start = (u64) secondary_startup_64_no_verify;
+
+ if (sev_es_setup_ap_jump_table(real_mode_header))
+ panic("Failed to get/update SEV-ES AP Jump Table");
+ }
+#endif
+}
+
static void __init setup_real_mode(void)
{
u16 real_mode_seg;
@@ -104,13 +124,13 @@ static void __init setup_real_mode(void)
*trampoline_cr4_features = mmu_cr4_features;
trampoline_header->flags = 0;
- if (sme_active())
- trampoline_header->flags |= TH_FLAGS_SME_ACTIVE;
trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
trampoline_pgd[0] = trampoline_pgd_entry.pgd;
trampoline_pgd[511] = init_top_pgt[511].pgd;
#endif
+
+ sme_sev_setup_real_mode(trampoline_header);
}
/*
diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S
index af04512c02d9..8c1db5bf5d78 100644
--- a/arch/x86/realmode/rm/header.S
+++ b/arch/x86/realmode/rm/header.S
@@ -20,6 +20,9 @@ SYM_DATA_START(real_mode_header)
/* SMP trampoline */
.long pa_trampoline_start
.long pa_trampoline_header
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ .long pa_sev_es_trampoline_start
+#endif
#ifdef CONFIG_X86_64
.long pa_trampoline_pgd;
#endif
diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S
index 251758ed7443..84c5d1b33d10 100644
--- a/arch/x86/realmode/rm/trampoline_64.S
+++ b/arch/x86/realmode/rm/trampoline_64.S
@@ -56,6 +56,7 @@ SYM_CODE_START(trampoline_start)
testl %eax, %eax # Check for return code
jnz no_longmode
+.Lswitch_to_protected:
/*
* GDT tables in non default location kernel can be beyond 16MB and
* lgdt will not be able to load the address as in real mode default
@@ -80,6 +81,25 @@ no_longmode:
jmp no_longmode
SYM_CODE_END(trampoline_start)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/* SEV-ES supports non-zero IP for entry points - no alignment needed */
+SYM_CODE_START(sev_es_trampoline_start)
+ cli # We should be safe anyway
+
+ LJMPW_RM(1f)
+1:
+ mov %cs, %ax # Code and data in the same place
+ mov %ax, %ds
+ mov %ax, %es
+ mov %ax, %ss
+
+ # Setup stack
+ movl $rm_stack_end, %esp
+
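+ # Skip the verify_cpu() check - it uses CPUID, which would cause a #VC
+ # exception the AP cannot handle at this point.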
+ jmp .Lswitch_to_protected
+SYM_CODE_END(sev_es_trampoline_start)
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
#include "../kernel/verify_cpu.S"
.section ".text32","ax"
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
index a42015b305f4..af38469afd14 100644
--- a/arch/x86/tools/gen-insn-attr-x86.awk
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -362,6 +362,9 @@ function convert_operands(count,opnd, i,j,imm,mod)
END {
if (awkchecked != "")
exit 1
+
+ print "#ifndef __BOOT_COMPRESSED\n"
+
# print escape opcode map's array
print "/* Escape opcode map array */"
print "const insn_attr_t * const inat_escape_tables[INAT_ESC_MAX + 1]" \
@@ -388,6 +391,51 @@ END {
for (j = 0; j < max_lprefix; j++)
if (atable[i,j])
print " ["i"]["j"] = "atable[i,j]","
- print "};"
+ print "};\n"
+
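+ # The __BOOT_COMPRESSED variant declares the same tables without static
+ # initializers and fills them at runtime via inat_init_tables().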
+ print "#else /* !__BOOT_COMPRESSED */\n"
+
+ print "/* Escape opcode map array */"
+ print "static const insn_attr_t *inat_escape_tables[INAT_ESC_MAX + 1]" \
+ "[INAT_LSTPFX_MAX + 1];"
+ print ""
+
+ print "/* Group opcode map array */"
+ print "static const insn_attr_t *inat_group_tables[INAT_GRP_MAX + 1]"\
+ "[INAT_LSTPFX_MAX + 1];"
+ print ""
+
+ print "/* AVX opcode map array */"
+ print "static const insn_attr_t *inat_avx_tables[X86_VEX_M_MAX + 1]"\
+ "[INAT_LSTPFX_MAX + 1];"
+ print ""
+
+ print "static void inat_init_tables(void)"
+ print "{"
+
+ # print escape opcode map's array
+ print "\t/* Print Escape opcode map array */"
+ for (i = 0; i < geid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (etable[i,j])
+ print "\tinat_escape_tables["i"]["j"] = "etable[i,j]";"
+ print ""
+
+ # print group opcode map's array
+ print "\t/* Print Group opcode map array */"
+ for (i = 0; i < ggid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (gtable[i,j])
+ print "\tinat_group_tables["i"]["j"] = "gtable[i,j]";"
+ print ""
+ # print AVX opcode map's array
+ print "\t/* Print AVX opcode map array */"
+ for (i = 0; i < gaid; i++)
+ for (j = 0; j < max_lprefix; j++)
+ if (atable[i,j])
+ print "\tinat_avx_tables["i"]["j"] = "atable[i,j]";"
+
+ print "}"
+ print "#endif"
}