diff options
-rw-r--r-- | arch/x86/hyperv/hv_apic.c | 15 | ||||
-rw-r--r-- | arch/x86/hyperv/hv_init.c | 105 | ||||
-rw-r--r-- | arch/x86/hyperv/ivm.c | 263 | ||||
-rw-r--r-- | arch/x86/include/asm/hyperv-tlfs.h | 10 | ||||
-rw-r--r-- | arch/x86/include/asm/mshyperv.h | 71 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mshyperv.c | 91 | ||||
-rw-r--r-- | drivers/clocksource/hyperv_timer.c | 2 | ||||
-rw-r--r-- | drivers/hv/connection.c | 16 | ||||
-rw-r--r-- | drivers/hv/hv.c | 131 | ||||
-rw-r--r-- | drivers/hv/hv_balloon.c | 82 | ||||
-rw-r--r-- | drivers/hv/hv_common.c | 48 | ||||
-rw-r--r-- | drivers/hv/hyperv_vmbus.h | 11 | ||||
-rw-r--r-- | drivers/hv/vmbus_drv.c | 3 | ||||
-rw-r--r-- | include/asm-generic/hyperv-tlfs.h | 1 | ||||
-rw-r--r-- | include/asm-generic/mshyperv.h | 17 | ||||
-rw-r--r-- | include/linux/hyperv.h | 6 |
16 files changed, 759 insertions, 113 deletions
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 187e13b15e9a..97bfe5f0531f 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -175,8 +175,11 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector, (exclude_self && weight == 1 && cpumask_test_cpu(this_cpu, mask))) return true; - if (!hv_hypercall_pg) - return false; + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ + if (!hv_hypercall_pg) { + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) + return false; + } if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return false; @@ -229,9 +232,15 @@ static bool __send_ipi_one(int cpu, int vector) trace_hyperv_send_ipi_one(cpu, vector); - if (!hv_hypercall_pg || (vp == VP_INVAL)) + if (vp == VP_INVAL) return false; + /* A fully enlightened TDX VM uses GHCI rather than hv_hypercall_pg. */ + if (!hv_hypercall_pg) { + if (ms_hyperv.paravisor_present || !hv_isolation_type_tdx()) + return false; + } + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) return false; diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 507d98331e7c..783ed339f341 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -19,6 +19,7 @@ #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> #include <asm/idtentry.h> +#include <asm/set_memory.h> #include <linux/kexec.h> #include <linux/version.h> #include <linux/vmalloc.h> @@ -52,7 +53,7 @@ static int hyperv_init_ghcb(void) void *ghcb_va; void **ghcb_base; - if (!hv_isolation_type_snp()) + if (!ms_hyperv.paravisor_present || !hv_isolation_type_snp()) return 0; if (!hv_ghcb_pg) @@ -80,7 +81,7 @@ static int hyperv_init_ghcb(void) static int hv_cpu_init(unsigned int cpu) { union hv_vp_assist_msr_contents msr = { 0 }; - struct hv_vp_assist_page **hvp = &hv_vp_assist_page[cpu]; + struct hv_vp_assist_page **hvp; int ret; ret = hv_common_cpu_init(cpu); @@ -90,6 +91,7 @@ static int 
hv_cpu_init(unsigned int cpu) if (!hv_vp_assist_page) return 0; + hvp = &hv_vp_assist_page[cpu]; if (hv_root_partition) { /* * For root partition we get the hypervisor provided VP assist @@ -107,8 +109,21 @@ static int hv_cpu_init(unsigned int cpu) * in hv_cpu_die(), otherwise a CPU may not be stopped in the * case of CPU offlining and the VM will hang. */ - if (!*hvp) + if (!*hvp) { *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); + + /* + * Hyper-V should never specify a VM that is a Confidential + * VM and also running in the root partition. Root partition + * is blocked to run in Confidential VM. So only decrypt assist + * page in non-root partition here. + */ + if (*hvp && !ms_hyperv.paravisor_present && hv_isolation_type_snp()) { + WARN_ON_ONCE(set_memory_decrypted((unsigned long)(*hvp), 1)); + memset(*hvp, 0, PAGE_SIZE); + } + } + if (*hvp) msr.pfn = vmalloc_to_pfn(*hvp); @@ -379,6 +394,36 @@ static void __init hv_get_partition_id(void) local_irq_restore(flags); } +static u8 __init get_vtl(void) +{ + u64 control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_REGISTERS; + struct hv_get_vp_registers_input *input; + struct hv_get_vp_registers_output *output; + unsigned long flags; + u64 ret; + + local_irq_save(flags); + input = *this_cpu_ptr(hyperv_pcpu_input_arg); + output = (struct hv_get_vp_registers_output *)input; + + memset(input, 0, struct_size(input, element, 1)); + input->header.partitionid = HV_PARTITION_ID_SELF; + input->header.vpindex = HV_VP_INDEX_SELF; + input->header.inputvtl = 0; + input->element[0].name0 = HV_X64_REGISTER_VSM_VP_STATUS; + + ret = hv_do_hypercall(control, input, output); + if (hv_result_success(ret)) { + ret = output->as64.low & HV_X64_VTL_MASK; + } else { + pr_err("Failed to get VTL(%lld) and set VTL to zero by default.\n", ret); + ret = 0; + } + + local_irq_restore(flags); + return ret; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. 
@@ -399,14 +444,24 @@ void __init hyperv_init(void) if (hv_common_init()) return; - hv_vp_assist_page = kcalloc(num_possible_cpus(), - sizeof(*hv_vp_assist_page), GFP_KERNEL); + /* + * The VP assist page is useless to a TDX guest: the only use we + * would have for it is lazy EOI, which can not be used with TDX. + */ + if (hv_isolation_type_tdx()) + hv_vp_assist_page = NULL; + else + hv_vp_assist_page = kcalloc(num_possible_cpus(), + sizeof(*hv_vp_assist_page), + GFP_KERNEL); if (!hv_vp_assist_page) { ms_hyperv.hints &= ~HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; - goto common_free; + + if (!hv_isolation_type_tdx()) + goto common_free; } - if (hv_isolation_type_snp()) { + if (ms_hyperv.paravisor_present && hv_isolation_type_snp()) { /* Negotiate GHCB Version. */ if (!hv_ghcb_negotiate_protocol()) hv_ghcb_terminate(SEV_TERM_SET_GEN, @@ -426,12 +481,32 @@ void __init hyperv_init(void) * Setup the hypercall page and enable hypercalls. * 1. Register the guest ID * 2. Enable the hypercall and register the hypercall page + * + * A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg: + * when the hypercall input is a page, such a VM must pass a decrypted + * page to Hyper-V, e.g. hv_post_message() uses the per-CPU page + * hyperv_pcpu_input_arg, which is decrypted if no paravisor is present. + * + * A TDX VM with the paravisor uses hv_hypercall_pg for most hypercalls, + * which are handled by the paravisor and the VM must use an encrypted + * input page: in such a VM, the hyperv_pcpu_input_arg is encrypted and + * used in the hypercalls, e.g. see hv_mark_gpa_visibility() and + * hv_arch_irq_unmask(). Such a VM uses TDX GHCI for two hypercalls: + * 1. HVCALL_SIGNAL_EVENT: see vmbus_set_event() and _hv_do_fast_hypercall8(). + * 2. HVCALL_POST_MESSAGE: the input page must be a decrypted page, i.e. 
+ * hv_post_message() in such a VM can't use the encrypted hyperv_pcpu_input_arg; + * instead, hv_post_message() uses the post_msg_page, which is decrypted + * in such a VM and is only used in such a VM. */ guest_id = hv_generate_guest_id(LINUX_VERSION_CODE); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - /* Hyper-V requires to write guest os id via ghcb in SNP IVM. */ - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); + /* With the paravisor, the VM must also write the ID via GHCB/GHCI */ + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, guest_id); + + /* A TDX VM with no paravisor only uses TDX GHCI rather than hv_hypercall_pg */ + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) + goto skip_hypercall_pg_init; hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX, @@ -472,6 +547,7 @@ void __init hyperv_init(void) wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); } +skip_hypercall_pg_init: /* * Some versions of Hyper-V that provide IBT in guest VMs have a bug * in that there's no ENDBR64 instruction at the entry to the @@ -527,11 +603,15 @@ void __init hyperv_init(void) /* Query the VMs extended capability once, so that it can be cached. 
*/ hv_query_ext_cap(0); + /* Find the VTL */ + if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) + ms_hyperv.vtl = get_vtl(); + return; clean_guest_os_id: wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); cpuhp_remove_state(cpuhp); free_ghcb_page: free_percpu(hv_ghcb_pg); @@ -552,7 +632,7 @@ void hyperv_cleanup(void) /* Reset our OS id */ wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - hv_ghcb_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); + hv_ivm_msr_write(HV_X64_MSR_GUEST_OS_ID, 0); /* * Reset hypercall page reference before reset the page, @@ -615,6 +695,9 @@ bool hv_is_hyperv_initialized(void) if (x86_hyper_type != X86_HYPER_MS_HYPERV) return false; + /* A TDX VM with no paravisor uses TDX GHCI call rather than hv_hypercall_pg */ + if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present) + return true; /* * Verify that earlier initialization succeeded by checking * that the hypercall page is setup diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c index 28be6df88063..8c6bf07f7d2b 100644 --- a/arch/x86/hyperv/ivm.c +++ b/arch/x86/hyperv/ivm.c @@ -18,6 +18,11 @@ #include <asm/mshyperv.h> #include <asm/hypervisor.h> #include <asm/mtrr.h> +#include <asm/io_apic.h> +#include <asm/realmode.h> +#include <asm/e820/api.h> +#include <asm/desc.h> +#include <uapi/asm/vmx.h> #ifdef CONFIG_AMD_MEM_ENCRYPT @@ -56,8 +61,10 @@ union hv_ghcb { } hypercall; } __packed __aligned(HV_HYP_PAGE_SIZE); +/* Only used in an SNP VM with the paravisor */ static u16 hv_ghcb_version __ro_after_init; +/* Functions only used in an SNP VM with the paravisor go here. 
*/ u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size) { union hv_ghcb *hv_ghcb; @@ -175,7 +182,7 @@ bool hv_ghcb_negotiate_protocol(void) return true; } -void hv_ghcb_msr_write(u64 msr, u64 value) +static void hv_ghcb_msr_write(u64 msr, u64 value) { union hv_ghcb *hv_ghcb; void **ghcb_base; @@ -203,9 +210,8 @@ void hv_ghcb_msr_write(u64 msr, u64 value) local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(hv_ghcb_msr_write); -void hv_ghcb_msr_read(u64 msr, u64 *value) +static void hv_ghcb_msr_read(u64 msr, u64 *value) { union hv_ghcb *hv_ghcb; void **ghcb_base; @@ -235,7 +241,217 @@ void hv_ghcb_msr_read(u64 msr, u64 *value) | ((u64)lower_32_bits(hv_ghcb->ghcb.save.rdx) << 32); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(hv_ghcb_msr_read); + +/* Only used in a fully enlightened SNP VM, i.e. without the paravisor */ +static u8 ap_start_input_arg[PAGE_SIZE] __bss_decrypted __aligned(PAGE_SIZE); +static u8 ap_start_stack[PAGE_SIZE] __aligned(PAGE_SIZE); +static DEFINE_PER_CPU(struct sev_es_save_area *, hv_sev_vmsa); + +/* Functions only used in an SNP VM without the paravisor go here. */ + +#define hv_populate_vmcb_seg(seg, gdtr_base) \ +do { \ + if (seg.selector) { \ + seg.base = 0; \ + seg.limit = HV_AP_SEGMENT_LIMIT; \ + seg.attrib = *(u16 *)(gdtr_base + seg.selector + 5); \ + seg.attrib = (seg.attrib & 0xFF) | ((seg.attrib >> 4) & 0xF00); \ + } \ +} while (0) \ + +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). 
+ */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +int hv_snp_boot_ap(int cpu, unsigned long start_ip) +{ + struct sev_es_save_area *vmsa = (struct sev_es_save_area *) + __get_free_page(GFP_KERNEL | __GFP_ZERO); + struct sev_es_save_area *cur_vmsa; + struct desc_ptr gdtr; + u64 ret, retry = 5; + struct hv_enable_vp_vtl *start_vp_input; + unsigned long flags; + + if (!vmsa) + return -ENOMEM; + + native_store_gdt(&gdtr); + + vmsa->gdtr.base = gdtr.address; + vmsa->gdtr.limit = gdtr.size; + + asm volatile("movl %%es, %%eax;" : "=a" (vmsa->es.selector)); + hv_populate_vmcb_seg(vmsa->es, vmsa->gdtr.base); + + asm volatile("movl %%cs, %%eax;" : "=a" (vmsa->cs.selector)); + hv_populate_vmcb_seg(vmsa->cs, vmsa->gdtr.base); + + asm volatile("movl %%ss, %%eax;" : "=a" (vmsa->ss.selector)); + hv_populate_vmcb_seg(vmsa->ss, vmsa->gdtr.base); + + asm volatile("movl %%ds, %%eax;" : "=a" (vmsa->ds.selector)); + hv_populate_vmcb_seg(vmsa->ds, vmsa->gdtr.base); + + vmsa->efer = native_read_msr(MSR_EFER); + + asm volatile("movq %%cr4, %%rax;" : "=a" (vmsa->cr4)); + asm volatile("movq %%cr3, %%rax;" : "=a" (vmsa->cr3)); + asm volatile("movq %%cr0, %%rax;" : "=a" (vmsa->cr0)); + + vmsa->xcr0 = 1; + vmsa->g_pat = HV_AP_INIT_GPAT_DEFAULT; + vmsa->rip = (u64)secondary_startup_64_no_verify; + vmsa->rsp = (u64)&ap_start_stack[PAGE_SIZE]; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + ret = snp_set_vmsa(vmsa, true); + if (ret) { /* non-zero means RMPADJUST failed, as in snp_cleanup_vmsa() */ + pr_err("RMPADJUST(%llx) failed: %llx\n", (u64)vmsa, ret); + free_page((u64)vmsa); + return ret; + 
} + + local_irq_save(flags); + start_vp_input = (struct hv_enable_vp_vtl *)ap_start_input_arg; + memset(start_vp_input, 0, sizeof(*start_vp_input)); + start_vp_input->partition_id = -1; + start_vp_input->vp_index = cpu; + start_vp_input->target_vtl.target_vtl = ms_hyperv.vtl; + *(u64 *)&start_vp_input->vp_context = __pa(vmsa) | 1; + + do { + ret = hv_do_hypercall(HVCALL_START_VP, + start_vp_input, NULL); + } while (hv_result(ret) == HV_STATUS_TIME_OUT && retry--); + + local_irq_restore(flags); + + if (!hv_result_success(ret)) { + pr_err("HvCallStartVirtualProcessor failed: %llx\n", ret); + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + cur_vmsa = per_cpu(hv_sev_vmsa, cpu); + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(hv_sev_vmsa, cpu) = vmsa; + + return ret; +} + +#else +static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} +static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} +#endif /* CONFIG_AMD_MEM_ENCRYPT */ + +#ifdef CONFIG_INTEL_TDX_GUEST +static void hv_tdx_msr_write(u64 msr, u64 val) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_MSR_WRITE, + .r12 = msr, + .r13 = val, + }; + + u64 ret = __tdx_hypercall(&args); + + WARN_ONCE(ret, "Failed to emulate MSR write: %lld\n", ret); +} + +static void hv_tdx_msr_read(u64 msr, u64 *val) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_MSR_READ, + .r12 = msr, + }; + + u64 ret = __tdx_hypercall_ret(&args); + + if (WARN_ONCE(ret, "Failed to emulate MSR read: %lld\n", ret)) + *val = 0; + else + *val = args.r11; +} + +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) +{ + struct tdx_hypercall_args args = { }; + + args.r10 = control; + args.rdx = param1; + args.r8 = param2; + + (void)__tdx_hypercall_ret(&args); + + return args.r11; +} + +#else +static inline void hv_tdx_msr_write(u64 msr, u64 value) {} +static inline void 
hv_tdx_msr_read(u64 msr, u64 *value) {} +#endif /* CONFIG_INTEL_TDX_GUEST */ + +#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) +void hv_ivm_msr_write(u64 msr, u64 value) +{ + if (!ms_hyperv.paravisor_present) + return; + + if (hv_isolation_type_tdx()) + hv_tdx_msr_write(msr, value); + else if (hv_isolation_type_snp()) + hv_ghcb_msr_write(msr, value); +} + +void hv_ivm_msr_read(u64 msr, u64 *value) +{ + if (!ms_hyperv.paravisor_present) + return; + + if (hv_isolation_type_tdx()) + hv_tdx_msr_read(msr, value); + else if (hv_isolation_type_snp()) + hv_ghcb_msr_read(msr, value); +} /* * hv_mark_gpa_visibility - Set pages visible to host via hvcall. @@ -358,13 +574,34 @@ static bool hv_is_private_mmio(u64 addr) void __init hv_vtom_init(void) { + enum hv_isolation_type type = hv_get_isolation_type(); + + switch (type) { + case HV_ISOLATION_TYPE_VBS: + fallthrough; /* * By design, a VM using vTOM doesn't see the SEV setting, * so SEV initialization is bypassed and sev_status isn't set. * Set it here to indicate a vTOM VM. + * + * Note: if CONFIG_AMD_MEM_ENCRYPT is not set, sev_status is + * defined as 0ULL, to which we can't assign a value. 
*/ - sev_status = MSR_AMD64_SNP_VTOM; - cc_vendor = CC_VENDOR_AMD; +#ifdef CONFIG_AMD_MEM_ENCRYPT + case HV_ISOLATION_TYPE_SNP: + sev_status = MSR_AMD64_SNP_VTOM; + cc_vendor = CC_VENDOR_AMD; + break; +#endif + + case HV_ISOLATION_TYPE_TDX: + cc_vendor = CC_VENDOR_INTEL; + break; + + default: + panic("hv_vtom_init: unsupported isolation type %d\n", type); + } + cc_set_mask(ms_hyperv.shared_gpa_boundary); physical_mask &= ms_hyperv.shared_gpa_boundary - 1; @@ -377,7 +614,7 @@ void __init hv_vtom_init(void) mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); } -#endif /* CONFIG_AMD_MEM_ENCRYPT */ +#endif /* defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) */ enum hv_isolation_type hv_get_isolation_type(void) { @@ -405,10 +642,20 @@ bool hv_is_isolation_supported(void) DEFINE_STATIC_KEY_FALSE(isolation_type_snp); /* - * hv_isolation_type_snp - Check system runs in the AMD SEV-SNP based + * hv_isolation_type_snp - Check if the system runs in an AMD SEV-SNP based * isolation VM. */ bool hv_isolation_type_snp(void) { return static_branch_unlikely(&isolation_type_snp); } + +DEFINE_STATIC_KEY_FALSE(isolation_type_tdx); +/* + * hv_isolation_type_tdx - Check if the system runs in an Intel TDX based + * isolated VM. 
+ */ +bool hv_isolation_type_tdx(void) +{ + return static_branch_unlikely(&isolation_type_tdx); +} diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index cea95dcd27c2..2ff26f53cd62 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -169,7 +169,8 @@ enum hv_isolation_type { HV_ISOLATION_TYPE_NONE = 0, HV_ISOLATION_TYPE_VBS = 1, - HV_ISOLATION_TYPE_SNP = 2 + HV_ISOLATION_TYPE_SNP = 2, + HV_ISOLATION_TYPE_TDX = 3 }; /* Hyper-V specific model specific registers (MSRs) */ @@ -301,6 +302,13 @@ enum hv_isolation_type { #define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT #define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC +/* + * Registers are only accessible via HVCALL_GET_VP_REGISTERS hvcall and + * there is no associated MSR address. + */ +#define HV_X64_REGISTER_VSM_VP_STATUS 0x000D0003 +#define HV_X64_VTL_MASK GENMASK(3, 0) + /* Hyper-V memory host visibility */ enum hv_mem_host_visibility { VMBUS_PAGE_NOT_VISIBLE = 0, diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index fa83d88e4c99..033b53f993c6 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -26,6 +26,7 @@ union hv_ghcb; DECLARE_STATIC_KEY_FALSE(isolation_type_snp); +DECLARE_STATIC_KEY_FALSE(isolation_type_tdx); typedef int (*hyperv_fill_flush_list_func)( struct hv_guest_mapping_flush_list *flush, @@ -40,6 +41,7 @@ static inline unsigned char hv_get_nmi_reason(void) #if IS_ENABLED(CONFIG_HYPERV) extern int hyperv_init_cpuhp; +extern bool hyperv_paravisor_present; extern void *hv_hypercall_pg; @@ -47,10 +49,25 @@ extern u64 hv_current_partition_id; extern union hv_ghcb * __percpu *hv_ghcb_pg; +bool hv_isolation_type_snp(void); +bool hv_isolation_type_tdx(void); +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); + +/* + * DEFAULT INIT GPAT and SEGMENT LIMIT value in struct VMSA + * to start AP in enlightened SEV guest. 
+ */ +#define HV_AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define HV_AP_SEGMENT_LIMIT 0xffffffff + int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages); int hv_call_add_logical_proc(int node, u32 lp_index, u32 acpi_id); int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags); +/* + * If the hypercall involves no input or output parameters, the hypervisor + * ignores the corresponding GPA pointer. + */ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) { u64 input_address = input ? virt_to_phys(input) : 0; @@ -58,6 +75,19 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output) u64 hv_status; #ifdef CONFIG_X86_64 + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input_address, output_address); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__("mov %4, %%r8\n" + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input_address) + : "r" (output_address) + : "cc", "memory", "r8", "r9", "r10", "r11"); + return hv_status; + } + if (!hv_hypercall_pg) return U64_MAX; @@ -101,7 +131,16 @@ static inline u64 _hv_do_fast_hypercall8(u64 control, u64 input1) u64 hv_status; #ifdef CONFIG_X86_64 - { + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input1, 0); + + if (hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__( + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + :: "cc", "r8", "r9", "r10", "r11"); + } else { __asm__ __volatile__(CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, "+c" (control), "+d" (input1) @@ -146,7 +185,17 @@ static inline u64 _hv_do_fast_hypercall16(u64 control, u64 input1, u64 input2) u64 hv_status; #ifdef CONFIG_X86_64 - { + if (hv_isolation_type_tdx() && !hyperv_paravisor_present) + return hv_tdx_hypercall(control, input1, input2); + + if 
(hv_isolation_type_snp() && !hyperv_paravisor_present) { + __asm__ __volatile__("mov %4, %%r8\n" + "vmmcall" + : "=a" (hv_status), ASM_CALL_CONSTRAINT, + "+c" (control), "+d" (input1) + : "r" (input2) + : "cc", "r8", "r9", "r10", "r11"); + } else { __asm__ __volatile__("mov %4, %%r8\n" CALL_NOSPEC : "=a" (hv_status), ASM_CALL_CONSTRAINT, @@ -225,20 +274,24 @@ int hv_map_ioapic_interrupt(int ioapic_id, bool level, int vcpu, int vector, int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry); #ifdef CONFIG_AMD_MEM_ENCRYPT -void hv_ghcb_msr_write(u64 msr, u64 value); -void hv_ghcb_msr_read(u64 msr, u64 *value); bool hv_ghcb_negotiate_protocol(void); void __noreturn hv_ghcb_terminate(unsigned int set, unsigned int reason); -void hv_vtom_init(void); +int hv_snp_boot_ap(int cpu, unsigned long start_ip); #else -static inline void hv_ghcb_msr_write(u64 msr, u64 value) {} -static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {} static inline bool hv_ghcb_negotiate_protocol(void) { return false; } static inline void hv_ghcb_terminate(unsigned int set, unsigned int reason) {} -static inline void hv_vtom_init(void) {} +static inline int hv_snp_boot_ap(int cpu, unsigned long start_ip) { return 0; } #endif -extern bool hv_isolation_type_snp(void); +#if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST) +void hv_vtom_init(void); +void hv_ivm_msr_write(u64 msr, u64 value); +void hv_ivm_msr_read(u64 msr, u64 *value); +#else +static inline void hv_vtom_init(void) {} +static inline void hv_ivm_msr_write(u64 msr, u64 value) {} +static inline void hv_ivm_msr_read(u64 msr, u64 *value) {} +#endif static inline bool hv_is_synic_reg(unsigned int reg) { diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 0100468e72ca..e6bba12c759c 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -32,6 +32,7 @@ #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> #include <asm/numa.h> +#include 
<asm/svm.h> /* Is Linux running as the root partition? */ bool hv_root_partition; @@ -39,6 +40,10 @@ bool hv_root_partition; bool hv_nested; struct ms_hyperv_info ms_hyperv; +/* Used in modules via hv_do_hypercall(): see arch/x86/include/asm/mshyperv.h */ +bool hyperv_paravisor_present __ro_after_init; +EXPORT_SYMBOL_GPL(hyperv_paravisor_present); + #if IS_ENABLED(CONFIG_HYPERV) static inline unsigned int hv_get_nested_reg(unsigned int reg) { @@ -65,8 +70,8 @@ u64 hv_get_non_nested_register(unsigned int reg) { u64 value; - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) - hv_ghcb_msr_read(reg, &value); + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) + hv_ivm_msr_read(reg, &value); else rdmsrl(reg, value); return value; @@ -75,8 +80,8 @@ EXPORT_SYMBOL_GPL(hv_get_non_nested_register); void hv_set_non_nested_register(unsigned int reg, u64 value) { - if (hv_is_synic_reg(reg) && hv_isolation_type_snp()) { - hv_ghcb_msr_write(reg, value); + if (hv_is_synic_reg(reg) && ms_hyperv.paravisor_present) { + hv_ivm_msr_write(reg, value); /* Write proxy bit via wrmsl instruction */ if (hv_is_sint_reg(reg)) @@ -295,6 +300,15 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) native_smp_prepare_cpus(max_cpus); + /* + * Override wakeup_secondary_cpu_64 callback for SEV-SNP + * enlightened guest. + */ + if (!ms_hyperv.paravisor_present && hv_isolation_type_snp()) { + apic->wakeup_secondary_cpu_64 = hv_snp_boot_ap; + return; + } + #ifdef CONFIG_X86_64 for_each_present_cpu(i) { if (i == 0) @@ -313,6 +327,26 @@ static void __init hv_smp_prepare_cpus(unsigned int max_cpus) } #endif +/* + * When a fully enlightened TDX VM runs on Hyper-V, the firmware sets the + * HW_REDUCED flag: refer to acpi_tb_create_local_fadt(). Consequently ttyS0 + * interrupts can't work because request_irq() -> ... -> irq_to_desc() returns + * NULL for ttyS0. 
This happens because mp_config_acpi_legacy_irqs() sees a + * nr_legacy_irqs() of 0, so it doesn't initialize the array 'mp_irqs[]', and + * later setup_IO_APIC_irqs() -> find_irq_entry() fails to find the legacy irqs + * from the array and hence doesn't create the necessary irq description info. + * + * Clone arch/x86/kernel/acpi/boot.c: acpi_generic_reduced_hw_init() here, + * except don't change 'legacy_pic', which keeps its default value + * 'default_legacy_pic'. This way, mp_config_acpi_legacy_irqs() sees a non-zero + * nr_legacy_irqs() and eventually serial console interrupts work properly. + */ +static void __init reduced_hw_init(void) +{ + x86_init.timers.timer_init = x86_init_noop; + x86_init.irqs.pre_vector_init = x86_init_noop; +} + static void __init ms_hyperv_init_platform(void) { int hv_max_functions_eax; @@ -399,11 +433,33 @@ static void __init ms_hyperv_init_platform(void) ms_hyperv.shared_gpa_boundary = BIT_ULL(ms_hyperv.shared_gpa_boundary_bits); + hyperv_paravisor_present = !!ms_hyperv.paravisor_present; + pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n", ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b); - if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) + + if (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP) { static_branch_enable(&isolation_type_snp); + } else if (hv_get_isolation_type() == HV_ISOLATION_TYPE_TDX) { + static_branch_enable(&isolation_type_tdx); + + /* A TDX VM must use x2APIC and doesn't use lazy EOI. */ + ms_hyperv.hints &= ~HV_X64_APIC_ACCESS_RECOMMENDED; + + if (!ms_hyperv.paravisor_present) { + /* To be supported: more work is required. */ + ms_hyperv.features &= ~HV_MSR_REFERENCE_TSC_AVAILABLE; + + /* HV_REGISTER_CRASH_CTL is unsupported. */ + ms_hyperv.misc_features &= ~HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + + /* Don't trust Hyper-V's TLB-flushing hypercalls. 
*/ + ms_hyperv.hints &= ~HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; + + x86_init.acpi.reduced_hw_early_init = reduced_hw_init; + } + } } if (hv_max_functions_eax >= HYPERV_CPUID_NESTED_FEATURES) { @@ -473,7 +529,7 @@ static void __init ms_hyperv_init_platform(void) #if IS_ENABLED(CONFIG_HYPERV) if ((hv_get_isolation_type() == HV_ISOLATION_TYPE_VBS) || - (hv_get_isolation_type() == HV_ISOLATION_TYPE_SNP)) + ms_hyperv.paravisor_present) hv_vtom_init(); /* * Setup the hook to get control post apic initialization. @@ -497,7 +553,8 @@ static void __init ms_hyperv_init_platform(void) # ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu; - if (hv_root_partition) + if (hv_root_partition || + (!ms_hyperv.paravisor_present && hv_isolation_type_snp())) smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus; # endif @@ -560,6 +617,22 @@ static bool __init ms_hyperv_msi_ext_dest_id(void) return eax & HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE; } +#ifdef CONFIG_AMD_MEM_ENCRYPT +static void hv_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_r8(ghcb, regs->r8); +} + +static bool hv_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, @@ -567,4 +640,8 @@ const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { .init.x2apic_available = ms_hyperv_x2apic_available, .init.msi_ext_dest_id = ms_hyperv_msi_ext_dest_id, .init.init_platform = ms_hyperv_init_platform, +#ifdef CONFIG_AMD_MEM_ENCRYPT + .runtime.sev_es_hcall_prepare = hv_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = hv_sev_es_hcall_finish, +#endif }; diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 
e56307a81f4d..8ff7cd4e20bb 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -390,7 +390,7 @@ static __always_inline u64 read_hv_clock_msr(void) static union { struct ms_hyperv_tsc_page page; u8 reserved[PAGE_SIZE]; -} tsc_pg __aligned(PAGE_SIZE); +} tsc_pg __bss_decrypted __aligned(PAGE_SIZE); static struct ms_hyperv_tsc_page *tsc_page = &tsc_pg.page; static unsigned long tsc_pfn; diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index ebf15f31d97e..3cabeeabb1ca 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -98,6 +98,7 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) */ if (version >= VERSION_WIN10_V5) { msg->msg_sint = VMBUS_MESSAGE_SINT; + msg->msg_vtl = ms_hyperv.vtl; vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID_4; } else { msg->interrupt_page = virt_to_phys(vmbus_connection.int_page); @@ -482,10 +483,17 @@ void vmbus_set_event(struct vmbus_channel *channel) ++channel->sig_events; - if (hv_isolation_type_snp()) - hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event, - NULL, sizeof(channel->sig_event)); - else + if (ms_hyperv.paravisor_present) { + if (hv_isolation_type_snp()) + hv_ghcb_hypercall(HVCALL_SIGNAL_EVENT, &channel->sig_event, + NULL, sizeof(channel->sig_event)); + else if (hv_isolation_type_tdx()) + hv_tdx_hypercall(HVCALL_SIGNAL_EVENT | HV_HYPERCALL_FAST_BIT, + channel->sig_event, 0); + else + WARN_ON_ONCE(1); + } else { hv_do_fast_hypercall8(HVCALL_SIGNAL_EVENT, channel->sig_event); + } } EXPORT_SYMBOL_GPL(vmbus_set_event); diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index de6708dbe0df..51e5018ac9b2 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -20,6 +20,7 @@ #include <linux/interrupt.h> #include <clocksource/hyperv_timer.h> #include <asm/mshyperv.h> +#include <linux/set_memory.h> #include "hyperv_vmbus.h" /* The one and only */ @@ -56,20 +57,37 @@ int hv_post_message(union hv_connection_id connection_id, 
local_irq_save(flags); - aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg); + /* + * A TDX VM with the paravisor must use the decrypted post_msg_page: see + * the comment in struct hv_per_cpu_context. A SNP VM with the paravisor + * can use the encrypted hyperv_pcpu_input_arg because it copies the + * input into the GHCB page, which has been decrypted by the paravisor. + */ + if (hv_isolation_type_tdx() && ms_hyperv.paravisor_present) + aligned_msg = this_cpu_ptr(hv_context.cpu_context)->post_msg_page; + else + aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg); + aligned_msg->connectionid = connection_id; aligned_msg->reserved = 0; aligned_msg->message_type = message_type; aligned_msg->payload_size = payload_size; memcpy((void *)aligned_msg->payload, payload, payload_size); - if (hv_isolation_type_snp()) - status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE, - (void *)aligned_msg, NULL, - sizeof(*aligned_msg)); - else + if (ms_hyperv.paravisor_present) { + if (hv_isolation_type_tdx()) + status = hv_tdx_hypercall(HVCALL_POST_MESSAGE, + virt_to_phys(aligned_msg), 0); + else if (hv_isolation_type_snp()) + status = hv_ghcb_hypercall(HVCALL_POST_MESSAGE, + aligned_msg, NULL, + sizeof(*aligned_msg)); + else + status = HV_STATUS_INVALID_PARAMETER; + } else { status = hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL); + } local_irq_restore(flags); @@ -78,7 +96,7 @@ int hv_post_message(union hv_connection_id connection_id, int hv_synic_alloc(void) { - int cpu; + int cpu, ret = -ENOMEM; struct hv_per_cpu_context *hv_cpu; /* @@ -104,11 +122,29 @@ int hv_synic_alloc(void) tasklet_init(&hv_cpu->msg_dpc, vmbus_on_msg_dpc, (unsigned long) hv_cpu); + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { + hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); + if (hv_cpu->post_msg_page == NULL) { + pr_err("Unable to allocate post msg page\n"); + goto err; + } + + ret = set_memory_decrypted((unsigned long)hv_cpu->post_msg_page, 1); + if (ret) { + pr_err("Failed to 
decrypt post msg page: %d\n", ret); + /* Just leak the page, as it's unsafe to free the page. */ + hv_cpu->post_msg_page = NULL; + goto err; + } + + memset(hv_cpu->post_msg_page, 0, PAGE_SIZE); + } + /* * Synic message and event pages are allocated by paravisor. * Skip these pages allocation here. */ - if (!hv_isolation_type_snp() && !hv_root_partition) { + if (!ms_hyperv.paravisor_present && !hv_root_partition) { hv_cpu->synic_message_page = (void *)get_zeroed_page(GFP_ATOMIC); if (hv_cpu->synic_message_page == NULL) { @@ -120,29 +156,96 @@ int hv_synic_alloc(void) (void *)get_zeroed_page(GFP_ATOMIC); if (hv_cpu->synic_event_page == NULL) { pr_err("Unable to allocate SYNIC event page\n"); + + free_page((unsigned long)hv_cpu->synic_message_page); + hv_cpu->synic_message_page = NULL; goto err; } } + + if (!ms_hyperv.paravisor_present && + (hv_isolation_type_snp() || hv_isolation_type_tdx())) { + ret = set_memory_decrypted((unsigned long) + hv_cpu->synic_message_page, 1); + if (ret) { + pr_err("Failed to decrypt SYNIC msg page: %d\n", ret); + hv_cpu->synic_message_page = NULL; + + /* + * Free the event page here so that hv_synic_free() + * won't later try to re-encrypt it. + */ + free_page((unsigned long)hv_cpu->synic_event_page); + hv_cpu->synic_event_page = NULL; + goto err; + } + + ret = set_memory_decrypted((unsigned long) + hv_cpu->synic_event_page, 1); + if (ret) { + pr_err("Failed to decrypt SYNIC event page: %d\n", ret); + hv_cpu->synic_event_page = NULL; + goto err; + } + + memset(hv_cpu->synic_message_page, 0, PAGE_SIZE); + memset(hv_cpu->synic_event_page, 0, PAGE_SIZE); + } } return 0; + err: /* * Any memory allocations that succeeded will be freed when * the caller cleans up by calling hv_synic_free() */ - return -ENOMEM; + return ret; } void hv_synic_free(void) { - int cpu; + int cpu, ret; for_each_present_cpu(cpu) { struct hv_per_cpu_context *hv_cpu = per_cpu_ptr(hv_context.cpu_context, cpu); + /* It's better to leak the page if the encryption fails. 
*/ + if (ms_hyperv.paravisor_present && hv_isolation_type_tdx()) { + if (hv_cpu->post_msg_page) { + ret = set_memory_encrypted((unsigned long) + hv_cpu->post_msg_page, 1); + if (ret) { + pr_err("Failed to encrypt post msg page: %d\n", ret); + hv_cpu->post_msg_page = NULL; + } + } + } + + if (!ms_hyperv.paravisor_present && + (hv_isolation_type_snp() || hv_isolation_type_tdx())) { + if (hv_cpu->synic_message_page) { + ret = set_memory_encrypted((unsigned long) + hv_cpu->synic_message_page, 1); + if (ret) { + pr_err("Failed to encrypt SYNIC msg page: %d\n", ret); + hv_cpu->synic_message_page = NULL; + } + } + + if (hv_cpu->synic_event_page) { + ret = set_memory_encrypted((unsigned long) + hv_cpu->synic_event_page, 1); + if (ret) { + pr_err("Failed to encrypt SYNIC event page: %d\n", ret); + hv_cpu->synic_event_page = NULL; + } + } + } + + free_page((unsigned long)hv_cpu->post_msg_page); free_page((unsigned long)hv_cpu->synic_event_page); free_page((unsigned long)hv_cpu->synic_message_page); } @@ -170,7 +273,7 @@ void hv_synic_enable_regs(unsigned int cpu) simp.as_uint64 = hv_get_register(HV_REGISTER_SIMP); simp.simp_enabled = 1; - if (hv_isolation_type_snp() || hv_root_partition) { + if (ms_hyperv.paravisor_present || hv_root_partition) { /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; @@ -189,7 +292,7 @@ void hv_synic_enable_regs(unsigned int cpu) siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); siefp.siefp_enabled = 1; - if (hv_isolation_type_snp() || hv_root_partition) { + if (ms_hyperv.paravisor_present || hv_root_partition) { /* Mask out vTOM bit. ioremap_cache() maps decrypted */ u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & ~ms_hyperv.shared_gpa_boundary; @@ -272,7 +375,7 @@ void hv_synic_disable_regs(unsigned int cpu) * addresses. 
*/ simp.simp_enabled = 0; - if (hv_isolation_type_snp() || hv_root_partition) { + if (ms_hyperv.paravisor_present || hv_root_partition) { iounmap(hv_cpu->synic_message_page); hv_cpu->synic_message_page = NULL; } else { @@ -284,7 +387,7 @@ void hv_synic_disable_regs(unsigned int cpu) siefp.as_uint64 = hv_get_register(HV_REGISTER_SIEFP); siefp.siefp_enabled = 0; - if (hv_isolation_type_snp() || hv_root_partition) { + if (ms_hyperv.paravisor_present || hv_root_partition) { iounmap(hv_cpu->synic_event_page); hv_cpu->synic_event_page = NULL; } else { diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 0d7a3ba66396..e000fa3b9f97 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cleanup.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/mman.h> @@ -646,7 +647,7 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) { struct memory_notify *mem = (struct memory_notify *)v; - unsigned long flags, pfn_count; + unsigned long pfn_count; switch (val) { case MEM_ONLINE: @@ -655,21 +656,22 @@ static int hv_memory_notifier(struct notifier_block *nb, unsigned long val, break; case MEM_OFFLINE: - spin_lock_irqsave(&dm_device.ha_lock, flags); - pfn_count = hv_page_offline_check(mem->start_pfn, - mem->nr_pages); - if (pfn_count <= dm_device.num_pages_onlined) { - dm_device.num_pages_onlined -= pfn_count; - } else { - /* - * We're offlining more pages than we managed to online. - * This is unexpected. In any case don't let - * num_pages_onlined wrap around zero. - */ - WARN_ON_ONCE(1); - dm_device.num_pages_onlined = 0; + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + pfn_count = hv_page_offline_check(mem->start_pfn, + mem->nr_pages); + if (pfn_count <= dm_device.num_pages_onlined) { + dm_device.num_pages_onlined -= pfn_count; + } else { + /* + * We're offlining more pages than we + * managed to online. 
This is + * unexpected. In any case don't let + * num_pages_onlined wrap around zero. + */ + WARN_ON_ONCE(1); + dm_device.num_pages_onlined = 0; + } } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); break; case MEM_GOING_ONLINE: case MEM_GOING_OFFLINE: @@ -721,24 +723,23 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, unsigned long start_pfn; unsigned long processed_pfn; unsigned long total_pfn = pfn_count; - unsigned long flags; for (i = 0; i < (size/HA_CHUNK); i++) { start_pfn = start + (i * HA_CHUNK); - spin_lock_irqsave(&dm_device.ha_lock, flags); - has->ha_end_pfn += HA_CHUNK; + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + has->ha_end_pfn += HA_CHUNK; - if (total_pfn > HA_CHUNK) { - processed_pfn = HA_CHUNK; - total_pfn -= HA_CHUNK; - } else { - processed_pfn = total_pfn; - total_pfn = 0; - } + if (total_pfn > HA_CHUNK) { + processed_pfn = HA_CHUNK; + total_pfn -= HA_CHUNK; + } else { + processed_pfn = total_pfn; + total_pfn = 0; + } - has->covered_end_pfn += processed_pfn; - spin_unlock_irqrestore(&dm_device.ha_lock, flags); + has->covered_end_pfn += processed_pfn; + } reinit_completion(&dm_device.ol_waitevent); @@ -758,10 +759,10 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, */ do_hot_add = false; } - spin_lock_irqsave(&dm_device.ha_lock, flags); - has->ha_end_pfn -= HA_CHUNK; - has->covered_end_pfn -= processed_pfn; - spin_unlock_irqrestore(&dm_device.ha_lock, flags); + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + has->ha_end_pfn -= HA_CHUNK; + has->covered_end_pfn -= processed_pfn; + } break; } @@ -781,10 +782,9 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, static void hv_online_page(struct page *pg, unsigned int order) { struct hv_hotadd_state *has; - unsigned long flags; unsigned long pfn = page_to_pfn(pg); - spin_lock_irqsave(&dm_device.ha_lock, flags); + guard(spinlock_irqsave)(&dm_device.ha_lock); list_for_each_entry(has, &dm_device.ha_region_list, list) 
{ /* The page belongs to a different HAS. */ if ((pfn < has->start_pfn) || @@ -794,7 +794,6 @@ static void hv_online_page(struct page *pg, unsigned int order) hv_bring_pgs_online(has, pfn, 1UL << order); break; } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); } static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) @@ -803,9 +802,8 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) struct hv_hotadd_gap *gap; unsigned long residual, new_inc; int ret = 0; - unsigned long flags; - spin_lock_irqsave(&dm_device.ha_lock, flags); + guard(spinlock_irqsave)(&dm_device.ha_lock); list_for_each_entry(has, &dm_device.ha_region_list, list) { /* * If the pfn range we are dealing with is not in the current @@ -852,7 +850,6 @@ static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) ret = 1; break; } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); return ret; } @@ -947,7 +944,6 @@ static unsigned long process_hot_add(unsigned long pg_start, { struct hv_hotadd_state *ha_region = NULL; int covered; - unsigned long flags; if (pfn_cnt == 0) return 0; @@ -979,9 +975,9 @@ static unsigned long process_hot_add(unsigned long pg_start, ha_region->covered_end_pfn = pg_start; ha_region->end_pfn = rg_start + rg_size; - spin_lock_irqsave(&dm_device.ha_lock, flags); - list_add_tail(&ha_region->list, &dm_device.ha_region_list); - spin_unlock_irqrestore(&dm_device.ha_lock, flags); + scoped_guard(spinlock_irqsave, &dm_device.ha_lock) { + list_add_tail(&ha_region->list, &dm_device.ha_region_list); + } } do_pg_range: @@ -2047,7 +2043,6 @@ static void balloon_remove(struct hv_device *dev) struct hv_dynmem_device *dm = hv_get_drvdata(dev); struct hv_hotadd_state *has, *tmp; struct hv_hotadd_gap *gap, *tmp_gap; - unsigned long flags; if (dm->num_pages_ballooned != 0) pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); @@ -2073,7 +2068,7 @@ static void balloon_remove(struct hv_device *dev) #endif } - 
spin_lock_irqsave(&dm_device.ha_lock, flags); + guard(spinlock_irqsave)(&dm_device.ha_lock); list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) { list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) { list_del(&gap->list); @@ -2082,7 +2077,6 @@ static void balloon_remove(struct hv_device *dev) list_del(&has->list); kfree(has); } - spin_unlock_irqrestore(&dm_device.ha_lock, flags); } static int balloon_suspend(struct hv_device *hv_dev) diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 6a2258fef1fe..ccad7bca3fd3 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -24,6 +24,7 @@ #include <linux/kmsg_dump.h> #include <linux/slab.h> #include <linux/dma-map-ops.h> +#include <linux/set_memory.h> #include <asm/hyperv-tlfs.h> #include <asm/mshyperv.h> @@ -359,6 +360,8 @@ int hv_common_cpu_init(unsigned int cpu) u64 msr_vp_index; gfp_t flags; int pgcount = hv_root_partition ? 2 : 1; + void *mem; + int ret; /* hv_cpu_init() can be called with IRQs disabled from hv_resume() */ flags = irqs_disabled() ? 
GFP_ATOMIC : GFP_KERNEL; @@ -370,14 +373,41 @@ int hv_common_cpu_init(unsigned int cpu) * allocated if this CPU was previously online and then taken offline */ if (!*inputarg) { - *inputarg = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); - if (!(*inputarg)) + mem = kmalloc(pgcount * HV_HYP_PAGE_SIZE, flags); + if (!mem) return -ENOMEM; if (hv_root_partition) { outputarg = (void **)this_cpu_ptr(hyperv_pcpu_output_arg); - *outputarg = (char *)(*inputarg) + HV_HYP_PAGE_SIZE; + *outputarg = (char *)mem + HV_HYP_PAGE_SIZE; + } + + if (!ms_hyperv.paravisor_present && + (hv_isolation_type_snp() || hv_isolation_type_tdx())) { + ret = set_memory_decrypted((unsigned long)mem, pgcount); + if (ret) { + /* It may be unsafe to free 'mem' */ + return ret; + } + + memset(mem, 0x00, pgcount * HV_HYP_PAGE_SIZE); } + + /* + * In a fully enlightened TDX/SNP VM with more than 64 VPs, if + * hyperv_pcpu_input_arg is not NULL, set_memory_decrypted() -> + * ... -> cpa_flush()-> ... -> __send_ipi_mask_ex() tries to + * use hyperv_pcpu_input_arg as the hypercall input page, which + * must be a decrypted page in such a VM, but the page is still + * encrypted before set_memory_decrypted() returns. Fix this by + * setting *inputarg after the above set_memory_decrypted(): if + * hyperv_pcpu_input_arg is NULL, __send_ipi_mask_ex() returns + * HV_STATUS_INVALID_PARAMETER immediately, and the function + * hv_send_ipi_mask() falls back to orig_apic.send_IPI_mask(), + * which may be slightly slower than the hypercall, but still + * works correctly in such a VM. 
+ */ + *inputarg = mem; } msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX); @@ -502,6 +532,12 @@ bool __weak hv_isolation_type_snp(void) } EXPORT_SYMBOL_GPL(hv_isolation_type_snp); +bool __weak hv_isolation_type_tdx(void) +{ + return false; +} +EXPORT_SYMBOL_GPL(hv_isolation_type_tdx); + void __weak hv_setup_vmbus_handler(void (*handler)(void)) { } @@ -542,3 +578,9 @@ u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_s return HV_STATUS_INVALID_PARAMETER; } EXPORT_SYMBOL_GPL(hv_ghcb_hypercall); + +u64 __weak hv_tdx_hypercall(u64 control, u64 param1, u64 param2) +{ + return HV_STATUS_INVALID_PARAMETER; +} +EXPORT_SYMBOL_GPL(hv_tdx_hypercall); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 55f2086841ae..f6b1e710f805 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -124,6 +124,17 @@ struct hv_per_cpu_context { void *synic_event_page; /* + * The page is only used in hv_post_message() for a TDX VM (with the + * paravisor) to post a messages to Hyper-V: when such a VM calls + * HVCALL_POST_MESSAGE, it can't use the hyperv_pcpu_input_arg (which + * is encrypted in such a VM) as the hypercall input page, because + * the input page for HVCALL_POST_MESSAGE must be decrypted in such a + * VM, so post_msg_page (which is decrypted in hv_synic_alloc()) is + * introduced for this purpose. See hyperv_init() for more comments. + */ + void *post_msg_page; + + /* * Starting with win8, we can take channel interrupts on any CPU; * we will manage the tasklet that handles events messages on a per CPU * basis. diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 67f95a29aeca..edbb38f6956b 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -2287,7 +2287,8 @@ static int vmbus_acpi_add(struct platform_device *pdev) * Some ancestor of the vmbus acpi device (Gen1 or Gen2 * firmware) is the VMOD that has the mmio ranges. Get that. 
*/ - for (ancestor = acpi_dev_parent(device); ancestor; + for (ancestor = acpi_dev_parent(device); + ancestor && ancestor->handle != ACPI_ROOT_OBJECT; ancestor = acpi_dev_parent(ancestor)) { result = acpi_walk_resources(ancestor->handle, METHOD_NAME__CRS, vmbus_walk_resources, NULL); diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h index f4e4cc4f965f..fdac4a1714ec 100644 --- a/include/asm-generic/hyperv-tlfs.h +++ b/include/asm-generic/hyperv-tlfs.h @@ -223,6 +223,7 @@ enum HV_GENERIC_SET_FORMAT { #define HV_STATUS_INVALID_PORT_ID 17 #define HV_STATUS_INVALID_CONNECTION_ID 18 #define HV_STATUS_INSUFFICIENT_BUFFERS 19 +#define HV_STATUS_TIME_OUT 120 #define HV_STATUS_VTL_ALREADY_ENABLED 134 /* diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index a8f4b653ef4e..cecd2b7bd033 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -36,18 +36,25 @@ struct ms_hyperv_info { u32 nested_features; u32 max_vp_index; u32 max_lp_index; - u32 isolation_config_a; + union { + u32 isolation_config_a; + struct { + u32 paravisor_present : 1; + u32 reserved_a1 : 31; + }; + }; union { u32 isolation_config_b; struct { u32 cvm_type : 4; - u32 reserved1 : 1; + u32 reserved_b1 : 1; u32 shared_gpa_boundary_active : 1; u32 shared_gpa_boundary_bits : 6; - u32 reserved2 : 20; + u32 reserved_b2 : 20; }; }; u64 shared_gpa_boundary; + u8 vtl; }; extern struct ms_hyperv_info ms_hyperv; extern bool hv_nested; @@ -57,7 +64,8 @@ extern void * __percpu *hyperv_pcpu_output_arg; extern u64 hv_do_hypercall(u64 control, void *inputaddr, void *outputaddr); extern u64 hv_do_fast_hypercall8(u16 control, u64 input8); -extern bool hv_isolation_type_snp(void); +bool hv_isolation_type_snp(void); +bool hv_isolation_type_tdx(void); /* Helper functions that provide a consistent pattern for checking Hyper-V hypercall status. 
*/ static inline int hv_result(u64 status) @@ -274,6 +282,7 @@ enum hv_isolation_type hv_get_isolation_type(void); bool hv_is_isolation_supported(void); bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); +u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 3ac3974b3c78..2b00faf98017 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -348,7 +348,7 @@ struct vmtransfer_page_packet_header { u8 sender_owns_set; u8 reserved; u32 range_cnt; - struct vmtransfer_page_range ranges[1]; + struct vmtransfer_page_range ranges[]; } __packed; struct vmgpadl_packet_header { @@ -665,8 +665,8 @@ struct vmbus_channel_initiate_contact { u64 interrupt_page; struct { u8 msg_sint; - u8 padding1[3]; - u32 padding2; + u8 msg_vtl; + u8 reserved[6]; }; }; u64 monitor_page1; |