diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-27 17:17:12 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-04-27 17:17:12 -0700 |
commit | da46b58ff884146f6153064f18d276806f3c114c (patch) | |
tree | cdf15e3e00571771fb337dea79d3cf474efeafc4 /drivers | |
parent | 8ccd54fe45713cd458015b5b08d6098545e70543 (diff) | |
parent | a494aef23dfc732945cb42e22246a5c31174e4a5 (diff) |
Merge tag 'hyperv-next-signed-20230424' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux
Pull hyperv updates from Wei Liu:
- PCI passthrough for Hyper-V confidential VMs (Michael Kelley)
- Hyper-V VTL mode support (Saurabh Sengar)
- Move panic report initialization code earlier (Long Li)
- Various improvements and bug fixes (Dexuan Cui and Michael Kelley)
* tag 'hyperv-next-signed-20230424' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux: (22 commits)
PCI: hv: Replace retarget_msi_interrupt_params with hyperv_pcpu_input_arg
Drivers: hv: move panic report code from vmbus to hv early init code
x86/hyperv: VTL support for Hyper-V
Drivers: hv: Kconfig: Add HYPERV_VTL_MODE
x86/hyperv: Make hv_get_nmi_reason public
x86/hyperv: Add VTL specific structs and hypercalls
x86/init: Make get/set_rtc_noop() public
x86/hyperv: Exclude lazy TLB mode CPUs from enlightened TLB flushes
x86/hyperv: Add callback filter to cpumask_to_vpset()
Drivers: hv: vmbus: Remove the per-CPU post_msg_page
clocksource: hyper-v: make sure Invariant-TSC is used if it is available
PCI: hv: Enable PCI pass-thru devices in Confidential VMs
Drivers: hv: Don't remap addresses that are above shared_gpa_boundary
hv_netvsc: Remove second mapping of send and recv buffers
Drivers: hv: vmbus: Remove second way of mapping ring buffers
Drivers: hv: vmbus: Remove second mapping of VMBus monitor pages
swiotlb: Remove bounce buffer remapping for Hyper-V
Driver: VMBus: Add Devicetree support
dt-bindings: bus: Add Hyper-V VMBus
Drivers: hv: vmbus: Convert acpi_device to more generic platform_device
...
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/clocksource/hyperv_timer.c | 21 | ||||
-rw-r--r-- | drivers/hv/Kconfig | 30 | ||||
-rw-r--r-- | drivers/hv/channel_mgmt.c | 2 | ||||
-rw-r--r-- | drivers/hv/connection.c | 113 | ||||
-rw-r--r-- | drivers/hv/hv.c | 79 | ||||
-rw-r--r-- | drivers/hv/hv_common.c | 242 | ||||
-rw-r--r-- | drivers/hv/hyperv_vmbus.h | 6 | ||||
-rw-r--r-- | drivers/hv/ring_buffer.c | 62 | ||||
-rw-r--r-- | drivers/hv/vmbus_drv.c | 312 | ||||
-rw-r--r-- | drivers/net/hyperv/hyperv_net.h | 2 | ||||
-rw-r--r-- | drivers/net/hyperv/netvsc.c | 48 | ||||
-rw-r--r-- | drivers/pci/controller/pci-hyperv.c | 280 |
12 files changed, 614 insertions, 583 deletions
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index c0cef92b12b8..bcd9042a0c9f 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -49,7 +49,7 @@ static bool direct_mode_enabled; static int stimer0_irq = -1; static int stimer0_message_sint; -static DEFINE_PER_CPU(long, stimer0_evt); +static __maybe_unused DEFINE_PER_CPU(long, stimer0_evt); /* * Common code for stimer0 interrupts coming via Direct Mode or @@ -68,7 +68,7 @@ EXPORT_SYMBOL_GPL(hv_stimer0_isr); * stimer0 interrupt handler for architectures that support * per-cpu interrupts, which also implies Direct Mode. */ -static irqreturn_t hv_stimer0_percpu_isr(int irq, void *dev_id) +static irqreturn_t __maybe_unused hv_stimer0_percpu_isr(int irq, void *dev_id) { hv_stimer0_isr(); return IRQ_HANDLED; @@ -196,6 +196,7 @@ void __weak hv_remove_stimer0_handler(void) { }; +#ifdef CONFIG_ACPI /* Called only on architectures with per-cpu IRQs (i.e., not x86/x64) */ static int hv_setup_stimer0_irq(void) { @@ -230,6 +231,16 @@ static void hv_remove_stimer0_irq(void) stimer0_irq = -1; } } +#else +static int hv_setup_stimer0_irq(void) +{ + return 0; +} + +static void hv_remove_stimer0_irq(void) +{ +} +#endif /* hv_stimer_alloc - Global initialization of the clockevent and stimer0 */ int hv_stimer_alloc(bool have_percpu_irqs) @@ -506,9 +517,6 @@ static bool __init hv_init_tsc_clocksource(void) { union hv_reference_tsc_msr tsc_msr; - if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) - return false; - /* * If Hyper-V offers TSC_INVARIANT, then the virtualized TSC correctly * handles frequency and offset changes due to live migration, @@ -525,6 +533,9 @@ static bool __init hv_init_tsc_clocksource(void) hyperv_cs_msr.rating = 250; } + if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) + return false; + hv_read_reference_counter = read_hv_clock_tsc; /* diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 0747a8f1fcee..00242107d62e 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -4,15 +4,39 @@ menu "Microsoft Hyper-V guest support" config HYPERV tristate "Microsoft Hyper-V client drivers" - depends on ACPI && ((X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \ - || (ARM64 && !CPU_BIG_ENDIAN)) + depends on (X86 && X86_LOCAL_APIC && HYPERVISOR_GUEST) \ + || (ACPI && ARM64 && !CPU_BIG_ENDIAN) select PARAVIRT select X86_HV_CALLBACK_VECTOR if X86 - select VMAP_PFN + select OF_EARLY_FLATTREE if OF help Select this option to run Linux as a Hyper-V client operating system. +config HYPERV_VTL_MODE + bool "Enable Linux to boot in VTL context" + depends on X86_64 && HYPERV + default n + help + Virtual Secure Mode (VSM) is a set of hypervisor capabilities and + enlightenments offered to host and guest partitions which enables + the creation and management of new security boundaries within + operating system software. + + VSM achieves and maintains isolation through Virtual Trust Levels + (VTLs). Virtual Trust Levels are hierarchical, with higher levels + being more privileged than lower levels. VTL0 is the least privileged + level, and currently only other level supported is VTL2. + + Select this option to build a Linux kernel to run at a VTL other than + the normal VTL0, which currently is only VTL2. This option + initializes the x86 platform for VTL2, and adds the ability to boot + secondary CPUs directly into 64-bit context as required for VTLs other + than 0. A kernel built with this option must run at VTL2, and will + not run as a normal guest. + + If unsure, say N + config HYPERV_TIMER def_bool HYPERV && X86 diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index cc23b90cae02..007f26d5f1a4 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -67,7 +67,7 @@ const struct vmbus_device vmbus_devs[] = { { .dev_type = HV_PCIE, HV_PCIE_GUID, .perf_device = false, - .allowed_in_isolated = false, + .allowed_in_isolated = true, }, /* Synthetic Frame Buffer */ diff --git a/drivers/hv/connection.c b/drivers/hv/connection.c index da51b50787df..5978e9dbc286 100644 --- a/drivers/hv/connection.c +++ b/drivers/hv/connection.c @@ -104,8 +104,14 @@ int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) vmbus_connection.msg_conn_id = VMBUS_MESSAGE_CONNECTION_ID; } - msg->monitor_page1 = vmbus_connection.monitor_pages_pa[0]; - msg->monitor_page2 = vmbus_connection.monitor_pages_pa[1]; + /* + * shared_gpa_boundary is zero in non-SNP VMs, so it's safe to always + * bitwise OR it + */ + msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]) | + ms_hyperv.shared_gpa_boundary; + msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]) | + ms_hyperv.shared_gpa_boundary; msg->target_vcpu = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); @@ -219,72 +225,27 @@ int vmbus_connect(void) * Setup the monitor notification facility. The 1st page for * parent->child and the 2nd page for child->parent */ - vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_zeroed_page(); - vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_zeroed_page(); + vmbus_connection.monitor_pages[0] = (void *)hv_alloc_hyperv_page(); + vmbus_connection.monitor_pages[1] = (void *)hv_alloc_hyperv_page(); if ((vmbus_connection.monitor_pages[0] == NULL) || (vmbus_connection.monitor_pages[1] == NULL)) { ret = -ENOMEM; goto cleanup; } - vmbus_connection.monitor_pages_original[0] - = vmbus_connection.monitor_pages[0]; - vmbus_connection.monitor_pages_original[1] - = vmbus_connection.monitor_pages[1]; - vmbus_connection.monitor_pages_pa[0] - = virt_to_phys(vmbus_connection.monitor_pages[0]); - vmbus_connection.monitor_pages_pa[1] - = virt_to_phys(vmbus_connection.monitor_pages[1]); - - if (hv_is_isolation_supported()) { - ret = set_memory_decrypted((unsigned long) - vmbus_connection.monitor_pages[0], - 1); - ret |= set_memory_decrypted((unsigned long) - vmbus_connection.monitor_pages[1], - 1); - if (ret) - goto cleanup; - - /* - * Isolation VM with AMD SNP needs to access monitor page via - * address space above shared gpa boundary. - */ - if (hv_isolation_type_snp()) { - vmbus_connection.monitor_pages_pa[0] += - ms_hyperv.shared_gpa_boundary; - vmbus_connection.monitor_pages_pa[1] += - ms_hyperv.shared_gpa_boundary; - - vmbus_connection.monitor_pages[0] - = memremap(vmbus_connection.monitor_pages_pa[0], - HV_HYP_PAGE_SIZE, - MEMREMAP_WB); - if (!vmbus_connection.monitor_pages[0]) { - ret = -ENOMEM; - goto cleanup; - } - - vmbus_connection.monitor_pages[1] - = memremap(vmbus_connection.monitor_pages_pa[1], - HV_HYP_PAGE_SIZE, - MEMREMAP_WB); - if (!vmbus_connection.monitor_pages[1]) { - ret = -ENOMEM; - goto cleanup; - } - } - - /* - * Set memory host visibility hvcall smears memory - * and so zero monitor pages here. - */ - memset(vmbus_connection.monitor_pages[0], 0x00, - HV_HYP_PAGE_SIZE); - memset(vmbus_connection.monitor_pages[1], 0x00, - HV_HYP_PAGE_SIZE); + ret = set_memory_decrypted((unsigned long) + vmbus_connection.monitor_pages[0], 1); + ret |= set_memory_decrypted((unsigned long) + vmbus_connection.monitor_pages[1], 1); + if (ret) + goto cleanup; - } + /* + * Set_memory_decrypted() will change the memory contents if + * decryption occurs, so zero monitor pages here. + */ + memset(vmbus_connection.monitor_pages[0], 0x00, HV_HYP_PAGE_SIZE); + memset(vmbus_connection.monitor_pages[1], 0x00, HV_HYP_PAGE_SIZE); msginfo = kzalloc(sizeof(*msginfo) + sizeof(struct vmbus_channel_initiate_contact), @@ -376,31 +337,13 @@ void vmbus_disconnect(void) vmbus_connection.int_page = NULL; } - if (hv_is_isolation_supported()) { - /* - * memunmap() checks input address is ioremap address or not - * inside. It doesn't unmap any thing in the non-SNP CVM and - * so not check CVM type here. - */ - memunmap(vmbus_connection.monitor_pages[0]); - memunmap(vmbus_connection.monitor_pages[1]); - - set_memory_encrypted((unsigned long) - vmbus_connection.monitor_pages_original[0], - 1); - set_memory_encrypted((unsigned long) - vmbus_connection.monitor_pages_original[1], - 1); - } + set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[0], 1); + set_memory_encrypted((unsigned long)vmbus_connection.monitor_pages[1], 1); - hv_free_hyperv_page((unsigned long) - vmbus_connection.monitor_pages_original[0]); - hv_free_hyperv_page((unsigned long) - vmbus_connection.monitor_pages_original[1]); - vmbus_connection.monitor_pages_original[0] = - vmbus_connection.monitor_pages[0] = NULL; - vmbus_connection.monitor_pages_original[1] = - vmbus_connection.monitor_pages[1] = NULL; + hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[0]); + hv_free_hyperv_page((unsigned long)vmbus_connection.monitor_pages[1]); + vmbus_connection.monitor_pages[0] = NULL; + vmbus_connection.monitor_pages[1] = NULL; } /* diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 8b0dd8e5244d..de6708dbe0df 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -39,42 +39,6 @@ int hv_init(void) } /* - * Functions for allocating and freeing memory with size and - * alignment HV_HYP_PAGE_SIZE. These functions are needed because - * the guest page size may not be the same as the Hyper-V page - * size. We depend upon kmalloc() aligning power-of-two size - * allocations to the allocation size boundary, so that the - * allocated memory appears to Hyper-V as a page of the size - * it expects. - */ - -void *hv_alloc_hyperv_page(void) -{ - BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); - - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - return (void *)__get_free_page(GFP_KERNEL); - else - return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); -} - -void *hv_alloc_hyperv_zeroed_page(void) -{ - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - else - return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); -} - -void hv_free_hyperv_page(unsigned long addr) -{ - if (PAGE_SIZE == HV_HYP_PAGE_SIZE) - free_page(addr); - else - kfree((void *)addr); -} - -/* * hv_post_message - Post a message using the hypervisor message IPC. * * This involves a hypercall. @@ -84,14 +48,15 @@ int hv_post_message(union hv_connection_id connection_id, void *payload, size_t payload_size) { struct hv_input_post_message *aligned_msg; - struct hv_per_cpu_context *hv_cpu; + unsigned long flags; u64 status; if (payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) return -EMSGSIZE; - hv_cpu = get_cpu_ptr(hv_context.cpu_context); - aligned_msg = hv_cpu->post_msg_page; + local_irq_save(flags); + + aligned_msg = *this_cpu_ptr(hyperv_pcpu_input_arg); aligned_msg->connectionid = connection_id; aligned_msg->reserved = 0; aligned_msg->message_type = message_type; @@ -106,11 +71,7 @@ int hv_post_message(union hv_connection_id connection_id, status = hv_do_hypercall(HVCALL_POST_MESSAGE, aligned_msg, NULL); - /* Preemption must remain disabled until after the hypercall - * so some other thread can't get scheduled onto this cpu and - * corrupt the per-cpu post_msg_page - */ - put_cpu_ptr(hv_cpu); + local_irq_restore(flags); return hv_result(status); } @@ -162,12 +123,6 @@ int hv_synic_alloc(void) goto err; } } - - hv_cpu->post_msg_page = (void *)get_zeroed_page(GFP_ATOMIC); - if (hv_cpu->post_msg_page == NULL) { - pr_err("Unable to allocate post msg page\n"); - goto err; - } } return 0; @@ -190,7 +145,6 @@ void hv_synic_free(void) free_page((unsigned long)hv_cpu->synic_event_page); free_page((unsigned long)hv_cpu->synic_message_page); - free_page((unsigned long)hv_cpu->post_msg_page); } kfree(hv_context.hv_numa_map); @@ -217,11 +171,13 @@ void hv_synic_enable_regs(unsigned int cpu) simp.simp_enabled = 1; if (hv_isolation_type_snp() || hv_root_partition) { + /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + u64 base = (simp.base_simp_gpa << HV_HYP_PAGE_SHIFT) & + ~ms_hyperv.shared_gpa_boundary; hv_cpu->synic_message_page - = memremap(simp.base_simp_gpa << HV_HYP_PAGE_SHIFT, - HV_HYP_PAGE_SIZE, MEMREMAP_WB); + = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); if (!hv_cpu->synic_message_page) - pr_err("Fail to map syinc message page.\n"); + pr_err("Fail to map synic message page.\n"); } else { simp.base_simp_gpa = virt_to_phys(hv_cpu->synic_message_page) >> HV_HYP_PAGE_SHIFT; @@ -234,12 +190,13 @@ void hv_synic_enable_regs(unsigned int cpu) siefp.siefp_enabled = 1; if (hv_isolation_type_snp() || hv_root_partition) { - hv_cpu->synic_event_page = - memremap(siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT, - HV_HYP_PAGE_SIZE, MEMREMAP_WB); - + /* Mask out vTOM bit. ioremap_cache() maps decrypted */ + u64 base = (siefp.base_siefp_gpa << HV_HYP_PAGE_SHIFT) & + ~ms_hyperv.shared_gpa_boundary; + hv_cpu->synic_event_page + = (void *)ioremap_cache(base, HV_HYP_PAGE_SIZE); if (!hv_cpu->synic_event_page) - pr_err("Fail to map syinc event page.\n"); + pr_err("Fail to map synic event page.\n"); } else { siefp.base_siefp_gpa = virt_to_phys(hv_cpu->synic_event_page) >> HV_HYP_PAGE_SHIFT; @@ -316,7 +273,7 @@ void hv_synic_disable_regs(unsigned int cpu) */ simp.simp_enabled = 0; if (hv_isolation_type_snp() || hv_root_partition) { - memunmap(hv_cpu->synic_message_page); + iounmap(hv_cpu->synic_message_page); hv_cpu->synic_message_page = NULL; } else { simp.base_simp_gpa = 0; @@ -328,7 +285,7 @@ void hv_synic_disable_regs(unsigned int cpu) siefp.siefp_enabled = 0; if (hv_isolation_type_snp() || hv_root_partition) { - memunmap(hv_cpu->synic_event_page); + iounmap(hv_cpu->synic_event_page); hv_cpu->synic_event_page = NULL; } else { siefp.base_siefp_gpa = 0; diff --git a/drivers/hv/hv_common.c b/drivers/hv/hv_common.c index 52a6f89ccdbd..64f9ceca887b 100644 --- a/drivers/hv/hv_common.c +++ b/drivers/hv/hv_common.c @@ -17,8 +17,11 @@ #include <linux/export.h> #include <linux/bitfield.h> #include <linux/cpumask.h> +#include <linux/sched/task_stack.h> #include <linux/panic_notifier.h> #include <linux/ptrace.h> +#include <linux/kdebug.h> +#include <linux/kmsg_dump.h> #include <linux/slab.h> #include <linux/dma-map-ops.h> #include <asm/hyperv-tlfs.h> @@ -54,6 +57,10 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_input_arg); void * __percpu *hyperv_pcpu_output_arg; EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); +static void hv_kmsg_dump_unregister(void); + +static struct ctl_table_header *hv_ctl_table_hdr; + /* * Hyper-V specific initialization and shutdown code that is * common across all architectures. Called from architecture @@ -62,6 +69,12 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg); void __init hv_common_free(void) { + unregister_sysctl_table(hv_ctl_table_hdr); + hv_ctl_table_hdr = NULL; + + if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) + hv_kmsg_dump_unregister(); + kfree(hv_vp_index); hv_vp_index = NULL; @@ -72,10 +85,203 @@ void __init hv_common_free(void) hyperv_pcpu_input_arg = NULL; } +/* + * Functions for allocating and freeing memory with size and + * alignment HV_HYP_PAGE_SIZE. These functions are needed because + * the guest page size may not be the same as the Hyper-V page + * size. We depend upon kmalloc() aligning power-of-two size + * allocations to the allocation size boundary, so that the + * allocated memory appears to Hyper-V as a page of the size + * it expects. + */ + +void *hv_alloc_hyperv_page(void) +{ + BUILD_BUG_ON(PAGE_SIZE < HV_HYP_PAGE_SIZE); + + if (PAGE_SIZE == HV_HYP_PAGE_SIZE) + return (void *)__get_free_page(GFP_KERNEL); + else + return kmalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page); + +void *hv_alloc_hyperv_zeroed_page(void) +{ + if (PAGE_SIZE == HV_HYP_PAGE_SIZE) + return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + else + return kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); +} +EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page); + +void hv_free_hyperv_page(unsigned long addr) +{ + if (PAGE_SIZE == HV_HYP_PAGE_SIZE) + free_page(addr); + else + kfree((void *)addr); +} +EXPORT_SYMBOL_GPL(hv_free_hyperv_page); + +static void *hv_panic_page; + +/* + * Boolean to control whether to report panic messages over Hyper-V. + * + * It can be set via /proc/sys/kernel/hyperv_record_panic_msg + */ +static int sysctl_record_panic_msg = 1; + +/* + * sysctl option to allow the user to control whether kmsg data should be + * reported to Hyper-V on panic. + */ +static struct ctl_table hv_ctl_table[] = { + { + .procname = "hyperv_record_panic_msg", + .data = &sysctl_record_panic_msg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, + {} +}; + +static int hv_die_panic_notify_crash(struct notifier_block *self, + unsigned long val, void *args); + +static struct notifier_block hyperv_die_report_block = { + .notifier_call = hv_die_panic_notify_crash, +}; + +static struct notifier_block hyperv_panic_report_block = { + .notifier_call = hv_die_panic_notify_crash, +}; + +/* + * The following callback works both as die and panic notifier; its + * goal is to provide panic information to the hypervisor unless the + * kmsg dumper is used [see hv_kmsg_dump()], which provides more + * information but isn't always available. + * + * Notice that both the panic/die report notifiers are registered only + * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set. + */ +static int hv_die_panic_notify_crash(struct notifier_block *self, + unsigned long val, void *args) +{ + struct pt_regs *regs; + bool is_die; + + /* Don't notify Hyper-V unless we have a die oops event or panic. */ + if (self == &hyperv_panic_report_block) { + is_die = false; + regs = current_pt_regs(); + } else { /* die event */ + if (val != DIE_OOPS) + return NOTIFY_DONE; + + is_die = true; + regs = ((struct die_args *)args)->regs; + } + + /* + * Hyper-V should be notified only once about a panic/die. If we will + * be calling hv_kmsg_dump() later with kmsg data, don't do the + * notification here. + */ + if (!sysctl_record_panic_msg || !hv_panic_page) + hyperv_report_panic(regs, val, is_die); + + return NOTIFY_DONE; +} + +/* + * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg + * buffer and call into Hyper-V to transfer the data. + */ +static void hv_kmsg_dump(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason) +{ + struct kmsg_dump_iter iter; + size_t bytes_written; + + /* We are only interested in panics. */ + if (reason != KMSG_DUMP_PANIC || !sysctl_record_panic_msg) + return; + + /* + * Write dump contents to the page. No need to synchronize; panic should + * be single-threaded. + */ + kmsg_dump_rewind(&iter); + kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, + &bytes_written); + if (!bytes_written) + return; + /* + * P3 to contain the physical address of the panic page & P4 to + * contain the size of the panic data in that page. Rest of the + * registers are no-op when the NOTIFY_MSG flag is set. + */ + hv_set_register(HV_REGISTER_CRASH_P0, 0); + hv_set_register(HV_REGISTER_CRASH_P1, 0); + hv_set_register(HV_REGISTER_CRASH_P2, 0); + hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page)); + hv_set_register(HV_REGISTER_CRASH_P4, bytes_written); + + /* + * Let Hyper-V know there is crash data available along with + * the panic message. + */ + hv_set_register(HV_REGISTER_CRASH_CTL, + (HV_CRASH_CTL_CRASH_NOTIFY | + HV_CRASH_CTL_CRASH_NOTIFY_MSG)); +} + +static struct kmsg_dumper hv_kmsg_dumper = { + .dump = hv_kmsg_dump, +}; + +static void hv_kmsg_dump_unregister(void) +{ + kmsg_dump_unregister(&hv_kmsg_dumper); + unregister_die_notifier(&hyperv_die_report_block); + atomic_notifier_chain_unregister(&panic_notifier_list, + &hyperv_panic_report_block); + + hv_free_hyperv_page((unsigned long)hv_panic_page); + hv_panic_page = NULL; +} + +static void hv_kmsg_dump_register(void) +{ + int ret; + + hv_panic_page = hv_alloc_hyperv_zeroed_page(); + if (!hv_panic_page) { + pr_err("Hyper-V: panic message page memory allocation failed\n"); + return; + } + + ret = kmsg_dump_register(&hv_kmsg_dumper); + if (ret) { + pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); + hv_free_hyperv_page((unsigned long)hv_panic_page); + hv_panic_page = NULL; + } +} + int __init hv_common_init(void) { int i; + if (hv_is_isolation_supported()) + sysctl_record_panic_msg = 0; + /* * Hyper-V expects to get crash register data or kmsg when * crash enlightment is available and system crashes. Set @@ -84,8 +290,33 @@ int __init hv_common_init(void) * kernel. */ if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { + u64 hyperv_crash_ctl; + crash_kexec_post_notifiers = true; pr_info("Hyper-V: enabling crash_kexec_post_notifiers\n"); + + /* + * Panic message recording (sysctl_record_panic_msg) + * is enabled by default in non-isolated guests and + * disabled by default in isolated guests; the panic + * message recording won't be available in isolated + * guests should the following registration fail. + */ + hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table); + if (!hv_ctl_table_hdr) + pr_err("Hyper-V: sysctl table register error"); + + /* + * Register for panic kmsg callback only if the right + * capability is supported by the hypervisor. + */ + hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL); + if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) + hv_kmsg_dump_register(); + + register_die_notifier(&hyperv_die_report_block); + atomic_notifier_chain_register(&panic_notifier_list, + &hyperv_panic_report_block); } /* @@ -311,14 +542,3 @@ u64 __weak hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_s return HV_STATUS_INVALID_PARAMETER; } EXPORT_SYMBOL_GPL(hv_ghcb_hypercall); - -void __weak *hv_map_memory(void *addr, unsigned long size) -{ - return NULL; -} -EXPORT_SYMBOL_GPL(hv_map_memory); - -void __weak hv_unmap_memory(void *addr) -{ -} -EXPORT_SYMBOL_GPL(hv_unmap_memory); diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index dc673edf053c..55f2086841ae 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -122,10 +122,6 @@ enum { struct hv_per_cpu_context { void *synic_message_page; void *synic_event_page; - /* - * buffer to post messages to the host. - */ - void *post_msg_page; /* * Starting with win8, we can take channel interrupts on any CPU; @@ -241,8 +237,6 @@ struct vmbus_connection { * is child->parent notification */ struct hv_monitor_page *monitor_pages[2]; - void *monitor_pages_original[2]; - phys_addr_t monitor_pages_pa[2]; struct list_head chn_msg_list; spinlock_t channelmsg_lock; diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index 2111e97c3b63..3c9b02471760 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -186,8 +186,6 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, struct page *pages, u32 page_cnt, u32 max_pkt_size) { struct page **pages_wraparound; - unsigned long *pfns_wraparound; - u64 pfn; int i; BUILD_BUG_ON((sizeof(struct hv_ring_buffer) != PAGE_SIZE)); @@ -196,50 +194,30 @@ int hv_ringbuffer_init(struct hv_ring_buffer_info *ring_info, * First page holds struct hv_ring_buffer, do wraparound mapping for * the rest. */ - if (hv_isolation_type_snp()) { - pfn = page_to_pfn(pages) + - PFN_DOWN(ms_hyperv.shared_gpa_boundary); + pages_wraparound = kcalloc(page_cnt * 2 - 1, + sizeof(struct page *), + GFP_KERNEL); + if (!pages_wraparound) + return -ENOMEM; - pfns_wraparound = kcalloc(page_cnt * 2 - 1, - sizeof(unsigned long), GFP_KERNEL); - if (!pfns_wraparound) - return -ENOMEM; - - pfns_wraparound[0] = pfn; - for (i = 0; i < 2 * (page_cnt - 1); i++) - pfns_wraparound[i + 1] = pfn + i % (page_cnt - 1) + 1; - - ring_info->ring_buffer = (struct hv_ring_buffer *) - vmap_pfn(pfns_wraparound, page_cnt * 2 - 1, - pgprot_decrypted(PAGE_KERNEL)); - kfree(pfns_wraparound); - - if (!ring_info->ring_buffer) - return -ENOMEM; - - /* Zero ring buffer after setting memory host visibility. */ - memset(ring_info->ring_buffer, 0x00, PAGE_SIZE * page_cnt); - } else { - pages_wraparound = kcalloc(page_cnt * 2 - 1, - sizeof(struct page *), - GFP_KERNEL); - if (!pages_wraparound) - return -ENOMEM; - - pages_wraparound[0] = pages; - for (i = 0; i < 2 * (page_cnt - 1); i++) - pages_wraparound[i + 1] = - &pages[i % (page_cnt - 1) + 1]; + pages_wraparound[0] = pages; + for (i = 0; i < 2 * (page_cnt - 1); i++) + pages_wraparound[i + 1] = + &pages[i % (page_cnt - 1) + 1]; - ring_info->ring_buffer = (struct hv_ring_buffer *) - vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, - PAGE_KERNEL); + ring_info->ring_buffer = (struct hv_ring_buffer *) + vmap(pages_wraparound, page_cnt * 2 - 1, VM_MAP, + pgprot_decrypted(PAGE_KERNEL)); - kfree(pages_wraparound); - if (!ring_info->ring_buffer) - return -ENOMEM; - } + kfree(pages_wraparound); + if (!ring_info->ring_buffer) + return -ENOMEM; + /* + * Ensure the header page is zero'ed since + * encryption status may have changed. + */ + memset(ring_info->ring_buffer, 0, HV_HYP_PAGE_SIZE); ring_info->ring_buffer->read_index = ring_info->ring_buffer->write_index = 0; diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index 734fff86c8dc..1c65a6dfb9fa 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/device.h> +#include <linux/platform_device.h> #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/slab.h> @@ -19,6 +20,7 @@ #include <linux/completion.h> #include <linux/hyperv.h> #include <linux/kernel_stat.h> +#include <linux/of_address.h> #include <linux/clockchips.h> #include <linux/cpu.h> #include <linux/sched/isolation.h> @@ -28,7 +30,6 @@ #include <linux/panic_notifier.h> #include <linux/ptrace.h> #include <linux/screen_info.h> -#include <linux/kdebug.h> #include <linux/efi.h> #include <linux/random.h> #include <linux/kernel.h> @@ -44,12 +45,10 @@ struct vmbus_dynid { struct hv_vmbus_device_id id; }; -static struct acpi_device *hv_acpi_dev; +static struct device *hv_dev; static int hyperv_cpuhp_online; -static void *hv_panic_page; - static long __percpu *vmbus_evt; /* Values parsed from ACPI DSDT */ @@ -57,18 +56,6 @@ int vmbus_irq; int vmbus_interrupt; /* - * Boolean to control whether to report panic messages over Hyper-V. - * - * It can be set via /proc/sys/kernel/hyperv_record_panic_msg - */ -static int sysctl_record_panic_msg = 1; - -static int hyperv_report_reg(void) -{ - return !sysctl_record_panic_msg || !hv_panic_page; -} - -/* * The panic notifier below is responsible solely for unloading the * vmbus connection, which is necessary in a panic event. * @@ -88,54 +75,6 @@ static struct notifier_block hyperv_panic_vmbus_unload_block = { .priority = INT_MIN + 1, /* almost the latest one to execute */ }; -static int hv_die_panic_notify_crash(struct notifier_block *self, - unsigned long val, void *args); - -static struct notifier_block hyperv_die_report_block = { - .notifier_call = hv_die_panic_notify_crash, -}; -static struct notifier_block hyperv_panic_report_block = { - .notifier_call = hv_die_panic_notify_crash, -}; - -/* - * The following callback works both as die and panic notifier; its - * goal is to provide panic information to the hypervisor unless the - * kmsg dumper is used [see hv_kmsg_dump()], which provides more - * information but isn't always available. - * - * Notice that both the panic/die report notifiers are registered only - * if we have the capability HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE set. - */ -static int hv_die_panic_notify_crash(struct notifier_block *self, - unsigned long val, void *args) -{ - struct pt_regs *regs; - bool is_die; - - /* Don't notify Hyper-V unless we have a die oops event or panic. */ - if (self == &hyperv_panic_report_block) { - is_die = false; - regs = current_pt_regs(); - } else { /* die event */ - if (val != DIE_OOPS) - return NOTIFY_DONE; - - is_die = true; - regs = ((struct die_args *)args)->regs; - } - - /* - * Hyper-V should be notified only once about a panic/die. If we will - * be calling hv_kmsg_dump() later with kmsg data, don't do the - * notification here. - */ - if (hyperv_report_reg()) - hyperv_report_panic(regs, val, is_die); - - return NOTIFY_DONE; -} - static const char *fb_mmio_name = "fb_range"; static struct resource *fb_mmio; static struct resource *hyperv_mmio; @@ -143,7 +82,7 @@ static DEFINE_MUTEX(hyperv_mmio_lock); static int vmbus_exists(void) { - if (hv_acpi_dev == NULL) + if (hv_dev == NULL) return -ENODEV; return 0; @@ -932,7 +871,7 @@ static int vmbus_dma_configure(struct device *child_device) * On x86/x64 coherence is assumed and these calls have no effect. */ hv_setup_dma_ops(child_device, - device_get_dma_attr(&hv_acpi_dev->dev) == DEV_DMA_COHERENT); + device_get_dma_attr(hv_dev) == DEV_DMA_COHERENT); return 0; } @@ -1378,89 +1317,6 @@ static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id) } /* - * Callback from kmsg_dump. Grab as much as possible from the end of the kmsg - * buffer and call into Hyper-V to transfer the data. - */ -static void hv_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) -{ - struct kmsg_dump_iter iter; - size_t bytes_written; - - /* We are only interested in panics. */ - if ((reason != KMSG_DUMP_PANIC) || (!sysctl_record_panic_msg)) - return; - - /* - * Write dump contents to the page. No need to synchronize; panic should - * be single-threaded. - */ - kmsg_dump_rewind(&iter); - kmsg_dump_get_buffer(&iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, - &bytes_written); - if (!bytes_written) - return; - /* - * P3 to contain the physical address of the panic page & P4 to - * contain the size of the panic data in that page. Rest of the - * registers are no-op when the NOTIFY_MSG flag is set. - */ - hv_set_register(HV_REGISTER_CRASH_P0, 0); - hv_set_register(HV_REGISTER_CRASH_P1, 0); - hv_set_register(HV_REGISTER_CRASH_P2, 0); - hv_set_register(HV_REGISTER_CRASH_P3, virt_to_phys(hv_panic_page)); - hv_set_register(HV_REGISTER_CRASH_P4, bytes_written); - - /* - * Let Hyper-V know there is crash data available along with - * the panic message. - */ - hv_set_register(HV_REGISTER_CRASH_CTL, - (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG)); -} - -static struct kmsg_dumper hv_kmsg_dumper = { - .dump = hv_kmsg_dump, -}; - -static void hv_kmsg_dump_register(void) -{ - int ret; - - hv_panic_page = hv_alloc_hyperv_zeroed_page(); - if (!hv_panic_page) { - pr_err("Hyper-V: panic message page memory allocation failed\n"); - return; - } - - ret = kmsg_dump_register(&hv_kmsg_dumper); - if (ret) { - pr_err("Hyper-V: kmsg dump register error 0x%x\n", ret); - hv_free_hyperv_page((unsigned long)hv_panic_page); - hv_panic_page = NULL; - } -} - -static struct ctl_table_header *hv_ctl_table_hdr; - -/* - * sysctl option to allow the user to control whether kmsg data should be - * reported to Hyper-V on panic. - */ -static struct ctl_table hv_ctl_table[] = { - { - .procname = "hyperv_record_panic_msg", - .data = &sysctl_record_panic_msg, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE - }, - {} -}; - -/* * vmbus_bus_init -Main vmbus driver initialization routine. * * Here, we @@ -1523,38 +1379,6 @@ static int vmbus_bus_init(void) if (ret) goto err_connect; - if (hv_is_isolation_supported()) - sysctl_record_panic_msg = 0; - - /* - * Only register if the crash MSRs are available - */ - if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { - u64 hyperv_crash_ctl; - /* - * Panic message recording (sysctl_record_panic_msg) - * is enabled by default in non-isolated guests and - * disabled by default in isolated guests; the panic - * message recording won't be available in isolated - * guests should the following registration fail. - */ - hv_ctl_table_hdr = register_sysctl("kernel", hv_ctl_table); - if (!hv_ctl_table_hdr) - pr_err("Hyper-V: sysctl table register error"); - - /* - * Register for panic kmsg callback only if the right - * capability is supported by the hypervisor. - */ - hyperv_crash_ctl = hv_get_register(HV_REGISTER_CRASH_CTL); - if (hyperv_crash_ctl & HV_CRASH_CTL_CRASH_NOTIFY_MSG) - hv_kmsg_dump_register(); - - register_die_notifier(&hyperv_die_report_block); - atomic_notifier_chain_register(&panic_notifier_list, - &hyperv_panic_report_block); - } - /* * Always register the vmbus unload panic notifier because we * need to shut the VMbus channel connection on panic. @@ -1579,8 +1403,6 @@ err_alloc: } err_setup: bus_unregister(&hv_bus); - unregister_sysctl_table(hv_ctl_table_hdr); - hv_ctl_table_hdr = NULL; return ret; } @@ -2081,7 +1903,7 @@ int vmbus_device_register(struct hv_device *child_device_obj) &child_device_obj->channel->offermsg.offer.if_instance); child_device_obj->device.bus = &hv_bus; - child_device_obj->device.parent = &hv_acpi_dev->dev; + child_device_obj->device.parent = hv_dev; child_device_obj->device.release = vmbus_device_release; child_device_obj->device.dma_parms = &child_device_obj->dma_parms; @@ -2142,7 +1964,7 @@ void vmbus_device_unregister(struct hv_device *device_obj) device_unregister(&device_obj->device); } - +#ifdef CONFIG_ACPI /* * VMBUS is an acpi enumerated device. Get the information we * need from DSDT. @@ -2251,8 +2073,9 @@ static acpi_status vmbus_walk_resources(struct acpi_resource *res, void *ctx) return AE_OK; } +#endif -static void vmbus_acpi_remove(struct acpi_device *device) +static void vmbus_mmio_remove(void) { struct resource *cur_res; struct resource *next_res; @@ -2271,7 +2094,7 @@ static void vmbus_acpi_remove(struct acpi_device *device) } } -static void vmbus_reserve_fb(void) +static void __maybe_unused vmbus_reserve_fb(void) { resource_size_t start = 0, size; struct pci_dev *pdev; @@ -2431,13 +2254,15 @@ void vmbus_free_mmio(resource_size_t start, resource_size_t size) } EXPORT_SYMBOL_GPL(vmbus_free_mmio); -static int vmbus_acpi_add(struct acpi_device *device) +#ifdef CONFIG_ACPI +static int vmbus_acpi_add(struct platform_device *pdev) { acpi_status result; int ret_val = -ENODEV; struct acpi_device *ancestor; + struct acpi_device *device = ACPI_COMPANION(&pdev->dev); - hv_acpi_dev = device; + hv_dev = &device->dev; /* * Older versions of Hyper-V for ARM64 fail to include the _CCA @@ -2479,9 +2304,64 @@ static int vmbus_acpi_add(struct acpi_device *device) acpi_walk_err: if (ret_val) - vmbus_acpi_remove(device); + vmbus_mmio_remove(); return ret_val; } +#else +static int vmbus_acpi_add(struct platform_device *pdev) +{ + return 0; +} +#endif + +static int vmbus_device_add(struct platform_device *pdev) +{ + struct resource **cur_res = &hyperv_mmio; + struct of_range range; + struct of_range_parser parser; + struct device_node *np = pdev->dev.of_node; + int ret; + + hv_dev = &pdev->dev; + + ret = of_range_parser_init(&parser, np); + if (ret) + return ret; + + for_each_of_range(&parser, &range) { + struct resource *res; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) { + vmbus_mmio_remove(); + return -ENOMEM; + } + + res->name = "hyperv mmio"; + res->flags = range.flags; + res->start = range.cpu_addr; + res->end = range.cpu_addr + range.size; + + *cur_res = res; + cur_res = &res->sibling; + } + + return ret; +} + +static int vmbus_platform_driver_probe(struct platform_device *pdev) +{ + if (acpi_disabled) + return vmbus_device_add(pdev); + else + return vmbus_acpi_add(pdev); +} + +static int vmbus_platform_driver_remove(struct platform_device *pdev) +{ + vmbus_mmio_remove(); + return 0; +} #ifdef CONFIG_PM_SLEEP static int vmbus_bus_suspend(struct device *dev) @@ -2620,7 +2500,17 @@ static int vmbus_bus_resume(struct device *dev) #define vmbus_bus_resume NULL #endif /* CONFIG_PM_SLEEP */ -static const struct acpi_device_id vmbus_acpi_device_ids[] = { +static const __maybe_unused struct of_device_id vmbus_of_match[] = { + { + .compatible = "microsoft,vmbus", + }, + { + /* sentinel */ + }, +}; +MODULE_DEVICE_TABLE(of, vmbus_of_match); + +static const __maybe_unused struct acpi_device_id vmbus_acpi_device_ids[] = { {"VMBUS", 0}, {"VMBus", 0}, {"", 0}, @@ -2648,15 +2538,16 @@ static const struct dev_pm_ops vmbus_bus_pm = { .restore_noirq = vmbus_bus_resume }; -static struct acpi_driver vmbus_acpi_driver = { - .name = "vmbus", - .ids = vmbus_acpi_device_ids, - .ops = { - .add = vmbus_acpi_add, - .remove = vmbus_acpi_remove, - }, - .drv.pm = &vmbus_bus_pm, - .drv.probe_type = PROBE_FORCE_SYNCHRONOUS, +static struct platform_driver vmbus_platform_driver = { + .probe = vmbus_platform_driver_probe, + .remove = vmbus_platform_driver_remove, + .driver = { + .name = "vmbus", + .acpi_match_table = ACPI_PTR(vmbus_acpi_device_ids), + .of_match_table = of_match_ptr(vmbus_of_match), + .pm = &vmbus_bus_pm, + .probe_type = PROBE_FORCE_SYNCHRONOUS, + } }; static void hv_kexec_handler(void) @@ -2740,12 +2631,11 @@ static int __init hv_acpi_init(void) /* * Get ACPI resources first. */ - ret = acpi_bus_register_driver(&vmbus_acpi_driver); - + ret = platform_driver_register(&vmbus_platform_driver); if (ret) return ret; - if (!hv_acpi_dev) { + if (!hv_dev) { ret = -ENODEV; goto cleanup; } @@ -2775,8 +2665,8 @@ static int __init hv_acpi_init(void) return 0; cleanup: - acpi_bus_unregister_driver(&vmbus_acpi_driver); - hv_acpi_dev = NULL; + platform_driver_unregister(&vmbus_platform_driver); + hv_dev = NULL; return ret; } @@ -2808,13 +2698,6 @@ static void __exit vmbus_exit(void) vmbus_free_channels(); kfree(vmbus_connection.channels); - if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { - kmsg_dump_unregister(&hv_kmsg_dumper); - unregister_die_notifier(&hyperv_die_report_block); - atomic_notifier_chain_unregister(&panic_notifier_list, - &hyperv_panic_report_block); - } - /* * The vmbus panic notifier is always registered, hence we should * also unconditionally unregister it here as well. @@ -2822,14 +2705,11 @@ static void __exit vmbus_exit(void) atomic_notifier_chain_unregister(&panic_notifier_list, &hyperv_panic_vmbus_unload_block); - free_page((unsigned long)hv_panic_page); - unregister_sysctl_table(hv_ctl_table_hdr); - hv_ctl_table_hdr = NULL; bus_unregister(&hv_bus); cpuhp_remove_state(hyperv_cpuhp_online); hv_synic_free(); - acpi_bus_unregister_driver(&vmbus_acpi_driver); + platform_driver_unregister(&vmbus_platform_driver); } diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index dd5919ec408b..33d51e363913 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -1139,7 +1139,6 @@ struct netvsc_device { /* Receive buffer allocated by us but manages by NetVSP */ void *recv_buf; - void *recv_original_buf; u32 recv_buf_size; /* allocated bytes */ struct vmbus_gpadl recv_buf_gpadl_handle; u32 recv_section_cnt; @@ -1148,7 +1147,6 @@ struct netvsc_device { /* Send buffer allocated by us */ void *send_buf; - void *send_original_buf; u32 send_buf_size; struct vmbus_gpadl send_buf_gpadl_handle; u32 send_section_cnt; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index da737d959e81..82e9796c8f5e 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -154,17 +154,8 @@ static void free_netvsc_device(struct rcu_head *head) int i; kfree(nvdev->extension); - - if (nvdev->recv_original_buf) - vfree(nvdev->recv_original_buf); - else - vfree(nvdev->recv_buf); - - if (nvdev->send_original_buf) - vfree(nvdev->send_original_buf); - else - vfree(nvdev->send_buf); - + vfree(nvdev->recv_buf); + vfree(nvdev->send_buf); bitmap_free(nvdev->send_section_map); for (i = 0; i < VRSS_CHANNEL_MAX; i++) { @@ -347,7 +338,6 @@ static int netvsc_init_buf(struct hv_device *device, struct nvsp_message *init_packet; unsigned int buf_size; int i, ret = 0; - void *vaddr; /* Get receive buffer area. */ buf_size = device_info->recv_sections * device_info->recv_section_size; @@ -383,17 +373,6 @@ static int netvsc_init_buf(struct hv_device *device, goto cleanup; } - if (hv_isolation_type_snp()) { - vaddr = hv_map_memory(net_device->recv_buf, buf_size); - if (!vaddr) { - ret = -ENOMEM; - goto cleanup; - } - - net_device->recv_original_buf = net_device->recv_buf; - net_device->recv_buf = vaddr; - } - /* Notify the NetVsp of the gpadl handle */ init_packet = &net_device->channel_init_pkt; memset(init_packet, 0, sizeof(struct nvsp_message)); @@ -497,17 +476,6 @@ static int netvsc_init_buf(struct hv_device *device, goto cleanup; } - if (hv_isolation_type_snp()) { - vaddr = hv_map_memory(net_device->send_buf, buf_size); - if (!vaddr) { - ret = -ENOMEM; - goto cleanup; - } - - net_device->send_original_buf = net_device->send_buf; - net_device->send_buf = vaddr; - } - /* Notify the NetVsp of the gpadl handle */ init_packet = &net_device->channel_init_pkt; memset(init_packet, 0, sizeof(struct nvsp_message)); @@ -762,12 +730,6 @@ void netvsc_device_remove(struct hv_device *device) netvsc_teardown_send_gpadl(device, net_device, ndev); } - if (net_device->recv_original_buf) - hv_unmap_memory(net_device->recv_buf); - - if (net_device->send_original_buf) - hv_unmap_memory(net_device->send_buf); - /* Release all resources */ free_netvsc_device_rcu(net_device); } @@ -1844,12 +1806,6 @@ cleanup: netif_napi_del(&net_device->chan_table[0].napi); cleanup2: - if (net_device->recv_original_buf) - hv_unmap_memory(net_device->recv_buf); - - if (net_device->send_original_buf) - hv_unmap_memory(net_device->send_buf); - free_netvsc_device(&net_device->rcu); return ERR_PTR(ret); diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index f33370b75628..bc32662c6bb7 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -508,19 +508,11 @@ struct hv_pcibus_device { struct msi_domain_info msi_info; struct irq_domain *irq_domain; - spinlock_t retarget_msi_interrupt_lock; - struct workqueue_struct *wq; /* Highest slot of child device with resources allocated */ int wslot_res_allocated; - - /* hypercall arg, must not cross page boundary */ - struct hv_retarget_device_interrupt retarget_msi_interrupt_params; - - /* - * Don't put anything here: retarget_msi_interrupt_params must be last - */ + bool use_calls; /* Use hypercalls to access mmio cfg space */ }; /* @@ -644,9 +636,9 @@ static void hv_arch_irq_unmask(struct irq_data *data) hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); int_desc = data->chip_data; - spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); + local_irq_save(flags); - params = &hbus->retarget_msi_interrupt_params; + params = *this_cpu_ptr(hyperv_pcpu_input_arg); memset(params, 0, sizeof(*params)); params->partition_id = HV_PARTITION_ID_SELF; params->int_entry.source = HV_INTERRUPT_SOURCE_MSI; @@ -679,7 +671,7 @@ static void hv_arch_irq_unmask(struct irq_data *data) if (!alloc_cpumask_var(&tmp, GFP_ATOMIC)) { res = 1; - goto exit_unlock; + goto out; } cpumask_and(tmp, dest, cpu_online_mask); @@ -688,7 +680,7 @@ static void hv_arch_irq_unmask(struct irq_data *data) if (nr_bank <= 0) { res = 1; - goto exit_unlock; + goto out; } /* @@ -707,8 +699,8 @@ static void hv_arch_irq_unmask(struct irq_data *data) res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17), params, NULL); -exit_unlock: - spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags); +out: + local_irq_restore(flags); /* * During hibernation, when a CPU is offlined, the kernel tries @@ -1041,6 +1033,70 @@ static int wslot_to_devfn(u32 wslot) return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func); } +static void hv_pci_read_mmio(struct device *dev, phys_addr_t gpa, int size, u32 *val) +{ + struct hv_mmio_read_input *in; + struct hv_mmio_read_output *out; + u64 ret; + + /* + * Must be called with interrupts disabled so it is safe + * to use the per-cpu input argument page. Use it for + * both input and output. + */ + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + out = *this_cpu_ptr(hyperv_pcpu_input_arg) + sizeof(*in); + in->gpa = gpa; + in->size = size; + + ret = hv_do_hypercall(HVCALL_MMIO_READ, in, out); + if (hv_result_success(ret)) { + switch (size) { + case 1: + *val = *(u8 *)(out->data); + break; + case 2: + *val = *(u16 *)(out->data); + break; + default: + *val = *(u32 *)(out->data); + break; + } + } else + dev_err(dev, "MMIO read hypercall error %llx addr %llx size %d\n", + ret, gpa, size); +} + +static void hv_pci_write_mmio(struct device *dev, phys_addr_t gpa, int size, u32 val) +{ + struct hv_mmio_write_input *in; + u64 ret; + + /* + * Must be called with interrupts disabled so it is safe + * to use the per-cpu input argument memory. + */ + in = *this_cpu_ptr(hyperv_pcpu_input_arg); + in->gpa = gpa; + in->size = size; + switch (size) { + case 1: + *(u8 *)(in->data) = val; + break; + case 2: + *(u16 *)(in->data) = val; + break; + default: + *(u32 *)(in->data) = val; + break; + } + + ret = hv_do_hypercall(HVCALL_MMIO_WRITE, in, NULL); + if (!hv_result_success(ret)) + dev_err(dev, "MMIO write hypercall error %llx addr %llx size %d\n", + ret, gpa, size); +} + /* * PCI Configuration Space for these root PCI buses is implemented as a pair * of pages in memory-mapped I/O space. Writing to the first page chooses @@ -1059,8 +1115,10 @@ static int wslot_to_devfn(u32 wslot) static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size, u32 *val) { + struct hv_pcibus_device *hbus = hpdev->hbus; + struct device *dev = &hbus->hdev->device; + int offset = where + CFG_PAGE_OFFSET; unsigned long flags; - void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; /* * If the attempt is to read the IDs or the ROM BAR, simulate that. @@ -1088,56 +1146,79 @@ static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, */ *val = 0; } else if (where + size <= CFG_PAGE_SIZE) { - spin_lock_irqsave(&hpdev->hbus->config_lock, flags); - /* Choose the function to be read. (See comment above) */ - writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); - /* Make sure the function was chosen before we start reading. */ - mb(); - /* Read from that function's config space. */ - switch (size) { - case 1: - *val = readb(addr); - break; - case 2: - *val = readw(addr); - break; - default: - *val = readl(addr); - break; + + spin_lock_irqsave(&hbus->config_lock, flags); + if (hbus->use_calls) { + phys_addr_t addr = hbus->mem_config->start + offset; + + hv_pci_write_mmio(dev, hbus->mem_config->start, 4, + hpdev->desc.win_slot.slot); + hv_pci_read_mmio(dev, addr, size, val); + } else { + void __iomem *addr = hbus->cfg_addr + offset; + + /* Choose the function to be read. (See comment above) */ + writel(hpdev->desc.win_slot.slot, hbus->cfg_addr); + /* Make sure the function was chosen before reading. */ + mb(); + /* Read from that function's config space. */ + switch (size) { + case 1: + *val = readb(addr); + break; + case 2: + *val = readw(addr); + break; + default: + *val = readl(addr); + break; + } + /* + * Make sure the read was done before we release the + * spinlock allowing consecutive reads/writes. + */ + mb(); } - /* - * Make sure the read was done before we release the spinlock - * allowing consecutive reads/writes. - */ - mb(); - spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); + spin_unlock_irqrestore(&hbus->config_lock, flags); } else { - dev_err(&hpdev->hbus->hdev->device, - "Attempt to read beyond a function's config space.\n"); + dev_err(dev, "Attempt to read beyond a function's config space.\n"); } } static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) { + struct hv_pcibus_device *hbus = hpdev->hbus; + struct device *dev = &hbus->hdev->device; + u32 val; u16 ret; unsigned long flags; - void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + - PCI_VENDOR_ID; - spin_lock_irqsave(&hpdev->hbus->config_lock, flags); + spin_lock_irqsave(&hbus->config_lock, flags); - /* Choose the function to be read. (See comment above) */ - writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); - /* Make sure the function was chosen before we start reading. */ - mb(); - /* Read from that function's config space. */ - ret = readw(addr); - /* - * mb() is not required here, because the spin_unlock_irqrestore() - * is a barrier. - */ + if (hbus->use_calls) { + phys_addr_t addr = hbus->mem_config->start + + CFG_PAGE_OFFSET + PCI_VENDOR_ID; - spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); + hv_pci_write_mmio(dev, hbus->mem_config->start, 4, + hpdev->desc.win_slot.slot); + hv_pci_read_mmio(dev, addr, 2, &val); + ret = val; /* Truncates to 16 bits */ + } else { + void __iomem *addr = hbus->cfg_addr + CFG_PAGE_OFFSET + + PCI_VENDOR_ID; + /* Choose the function to be read. (See comment above) */ + writel(hpdev->desc.win_slot.slot, hbus->cfg_addr); + /* Make sure the function was chosen before we start reading. */ + mb(); + /* Read from that function's config space. */ + ret = readw(addr); + /* + * mb() is not required here, because the + * spin_unlock_irqrestore() is a barrier. + */ + } + + spin_unlock_irqrestore(&hbus->config_lock, flags); return ret; } @@ -1152,39 +1233,51 @@ static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev) static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size, u32 val) { + struct hv_pcibus_device *hbus = hpdev->hbus; + struct device *dev = &hbus->hdev->device; + int offset = where + CFG_PAGE_OFFSET; unsigned long flags; - void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where; if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <= PCI_CAPABILITY_LIST) { /* SSIDs and ROM BARs are read-only */ } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) { - spin_lock_irqsave(&hpdev->hbus->config_lock, flags); - /* Choose the function to be written. (See comment above) */ - writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr); - /* Make sure the function was chosen before we start writing. */ - wmb(); - /* Write to that function's config space. */ - switch (size) { - case 1: - writeb(val, addr); - break; - case 2: - writew(val, addr); - break; - default: - writel(val, addr); - break; + spin_lock_irqsave(&hbus->config_lock, flags); + + if (hbus->use_calls) { + phys_addr_t addr = hbus->mem_config->start + offset; + + hv_pci_write_mmio(dev, hbus->mem_config->start, 4, + hpdev->desc.win_slot.slot); + hv_pci_write_mmio(dev, addr, size, val); + } else { + void __iomem *addr = hbus->cfg_addr + offset; + + /* Choose the function to write. (See comment above) */ + writel(hpdev->desc.win_slot.slot, hbus->cfg_addr); + /* Make sure the function was chosen before writing. */ + wmb(); + /* Write to that function's config space. */ + switch (size) { + case 1: + writeb(val, addr); + break; + case 2: + writew(val, addr); + break; + default: + writel(val, addr); + break; + } + /* + * Make sure the write was done before we release the + * spinlock allowing consecutive reads/writes. + */ + mb(); } - /* - * Make sure the write was done before we release the spinlock - * allowing consecutive reads/writes. - */ - mb(); - spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags); + spin_unlock_irqrestore(&hbus->config_lock, flags); } else { - dev_err(&hpdev->hbus->hdev->device, - "Attempt to write beyond a function's config space.\n"); + dev_err(dev, "Attempt to write beyond a function's config space.\n"); } } @@ -3496,35 +3589,11 @@ static int hv_pci_probe(struct hv_device *hdev, bool enter_d0_retry = true; int ret; - /* - * hv_pcibus_device contains the hypercall arguments for retargeting in - * hv_irq_unmask(). Those must not cross a page boundary. - */ - BUILD_BUG_ON(sizeof(*hbus) > HV_HYP_PAGE_SIZE); - bridge = devm_pci_alloc_host_bridge(&hdev->device, 0); if (!bridge) return -ENOMEM; - /* - * With the recent 59bb47985c1d ("mm, sl[aou]b: guarantee natural - * alignment for kmalloc(power-of-two)"), kzalloc() is able to allocate - * a 4KB buffer that is guaranteed to be 4KB-aligned. Here the size and - * alignment of hbus is important because hbus's field - * retarget_msi_interrupt_params must not cross a 4KB page boundary. - * - * Here we prefer kzalloc to get_zeroed_page(), because a buffer - * allocated by the latter is not tracked and scanned by kmemleak, and - * hence kmemleak reports the pointer contained in the hbus buffer - * (i.e. the hpdev struct, which is created in new_pcichild_device() and - * is tracked by hbus->children) as memory leak (false positive). - * - * If the kernel doesn't have 59bb47985c1d, get_zeroed_page() *must* be - * used to allocate the hbus buffer and we can avoid the kmemleak false - * positive by using kmemleak_alloc() and kmemleak_free() to ask - * kmemleak to track and scan the hbus buffer. - */ - hbus = kzalloc(HV_HYP_PAGE_SIZE, GFP_KERNEL); + hbus = kzalloc(sizeof(*hbus), GFP_KERNEL); if (!hbus) return -ENOMEM; @@ -3563,6 +3632,7 @@ static int hv_pci_probe(struct hv_device *hdev, hbus->bridge->domain_nr = dom; #ifdef CONFIG_X86 hbus->sysdata.domain = dom; + hbus->use_calls = !!(ms_hyperv.hints & HV_X64_USE_MMIO_HYPERCALLS); #elif defined(CONFIG_ARM64) /* * Set the PCI bus parent to be the corresponding VMbus @@ -3572,6 +3642,7 @@ static int hv_pci_probe(struct hv_device *hdev, * information to devices created on the bus. */ hbus->sysdata.parent = hdev->device.parent; + hbus->use_calls = false; #endif hbus->hdev = hdev; @@ -3579,7 +3650,6 @@ static int hv_pci_probe(struct hv_device *hdev, INIT_LIST_HEAD(&hbus->dr_list); spin_lock_init(&hbus->config_lock); spin_lock_init(&hbus->device_list_lock); - spin_lock_init(&hbus->retarget_msi_interrupt_lock); hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0, hbus->bridge->domain_nr); if (!hbus->wq) { |