From 0bbadafdc49d11a1836e5946f517d18cceaea6c8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Mar 2021 13:19:51 +0100 Subject: um: allow disabling NO_IOMEM Adjust the kconfig a little to allow disabling NO_IOMEM in UML. To make an "allyesconfig" with CONFIG_NO_IOMEM=n build, adjust a few Kconfig things elsewhere and add dummy asm/fb.h and asm/vga.h files. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 4 ++++ arch/um/include/asm/Kbuild | 2 ++ 2 files changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 57cfd9a1c082..e72b4b393367 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -27,6 +27,10 @@ config MMU default y config NO_IOMEM + bool "disable IOMEM" if EXPERT + default y + +config NO_IOPORT_MAP def_bool y config ISA diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index d7492e5a1bbb..10b7228b3aee 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -7,6 +7,7 @@ generic-y += device.h generic-y += emergency-restart.h generic-y += exec.h generic-y += extable.h +generic-y += fb.h generic-y += ftrace.h generic-y += futex.h generic-y += hw_irq.h @@ -27,3 +28,4 @@ generic-y += trace_clock.h generic-y += word-at-a-time.h generic-y += kprobes.h generic-y += mm_hooks.h +generic-y += vga.h -- cgit v1.2.3-58-ga151 From 2efea7dfaa67eba020e88bcb07a10030de63dfa5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Mar 2021 13:19:53 +0100 Subject: um: remove unused smp_sigio_handler() declaration This function doesn't exist, remove its declaration. Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/shared/kern_util.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 2888ec812f6e..a2cfd42608a0 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -33,7 +33,6 @@ extern int handle_page_fault(unsigned long address, unsigned long ip, int is_write, int is_user, int *code_out); extern unsigned int do_IRQ(int irq, struct uml_pt_regs *regs); -extern int smp_sigio_handler(void); extern void initial_thread_cb(void (*proc)(void *), void *arg); extern int is_syscall(unsigned long addr); -- cgit v1.2.3-58-ga151 From fbb42e7fe2c4962cb51776fff5462e6264d9716b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Mar 2021 13:19:54 +0100 Subject: um: export signals_enabled directly Use signals_enabled instead of always jumping through a function call to read it, there's not much point in that. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/include/asm/irqflags.h | 10 +++++----- arch/um/include/shared/longjmp.h | 14 +++++++------- arch/um/include/shared/os.h | 1 - arch/um/kernel/ksyms.c | 2 +- arch/um/os-Linux/signal.c | 7 +------ 5 files changed, 14 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/um/include/asm/irqflags.h b/arch/um/include/asm/irqflags.h index 0642ad9035d1..dab5744e9253 100644 --- a/arch/um/include/asm/irqflags.h +++ b/arch/um/include/asm/irqflags.h @@ -2,15 +2,15 @@ #ifndef __UM_IRQFLAGS_H #define __UM_IRQFLAGS_H -extern int get_signals(void); -extern int set_signals(int enable); -extern void block_signals(void); -extern void unblock_signals(void); +extern int signals_enabled; +int set_signals(int enable); +void block_signals(void); +void unblock_signals(void); #define arch_local_save_flags arch_local_save_flags static inline unsigned long arch_local_save_flags(void) { - return get_signals(); + return signals_enabled; } #define arch_local_irq_restore arch_local_irq_restore diff --git a/arch/um/include/shared/longjmp.h b/arch/um/include/shared/longjmp.h index 85a1cc290ecb..bdb2869b72b3 100644 --- a/arch/um/include/shared/longjmp.h +++ b/arch/um/include/shared/longjmp.h @@ -5,6 +5,7 @@ #include #include +extern int signals_enabled; extern int setjmp(jmp_buf); extern void longjmp(jmp_buf, int); @@ -12,13 +13,12 @@ extern void longjmp(jmp_buf, int); longjmp(*buf, val); \ } while(0) -#define UML_SETJMP(buf) ({ \ - int n; \ - volatile int enable; \ - enable = get_signals(); \ - n = setjmp(*buf); \ - if(n != 0) \ - set_signals_trace(enable); \ +#define UML_SETJMP(buf) ({ \ + int n, enable; \ + enable = *(volatile int *)&signals_enabled; \ + n = setjmp(*buf); \ + if(n != 0) \ + set_signals_trace(enable); \ n; }) #endif diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 13d86f94cf0f..f9fbbddc38bb 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -237,7 +237,6 @@ extern void send_sigio_to_self(void); extern int change_sig(int signal, int on); extern void block_signals(void); extern void unblock_signals(void); -extern int get_signals(void); extern int set_signals(int enable); extern int set_signals_trace(int enable); extern int os_is_signal_stack(void); diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 8ade54a86a7e..b1e5634398d0 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -7,7 +7,7 @@ #include EXPORT_SYMBOL(set_signals); -EXPORT_SYMBOL(get_signals); +EXPORT_SYMBOL(signals_enabled); EXPORT_SYMBOL(os_stat_fd); EXPORT_SYMBOL(os_stat_file); diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 96f511d1aabe..8c9d162e6c51 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -62,7 +62,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) #define SIGALRM_BIT 1 #define SIGALRM_MASK (1 << SIGALRM_BIT) -static int signals_enabled; +int signals_enabled; static unsigned int signals_pending; static unsigned int signals_active = 0; @@ -334,11 +334,6 @@ void unblock_signals(void) } } -int get_signals(void) -{ - return signals_enabled; -} - int set_signals(int enable) { int ret; -- cgit v1.2.3-58-ga151 From 33c7d0616a0482def19d7f981d4eaa429086c771 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Mar 2021 13:19:55 +0100 Subject: um: expose time-travel mode to userspace side This will be necessary in the userspace side to fix the signal/interrupt handling in time-travel=ext mode, which is the next patch. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/include/linux/time-internal.h | 12 +----------- arch/um/include/shared/timetravel.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 arch/um/include/shared/timetravel.h (limited to 'arch') diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h index 759956ab0108..b22226634ff6 100644 --- a/arch/um/include/linux/time-internal.h +++ b/arch/um/include/linux/time-internal.h @@ -8,17 +8,11 @@ #define __TIMER_INTERNAL_H__ #include #include +#include #define TIMER_MULTIPLIER 256 #define TIMER_MIN_DELTA 500 -enum time_travel_mode { - TT_MODE_OFF, - TT_MODE_BASIC, - TT_MODE_INFCPU, - TT_MODE_EXTERNAL, -}; - #ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT struct time_travel_event { unsigned long long time; @@ -27,8 +21,6 @@ struct time_travel_event { bool pending, onstack; }; -extern enum time_travel_mode time_travel_mode; - void time_travel_sleep(void); static inline void @@ -62,8 +54,6 @@ bool time_travel_del_event(struct time_travel_event *e); struct time_travel_event { }; -#define time_travel_mode TT_MODE_OFF - static inline void time_travel_sleep(void) { } diff --git a/arch/um/include/shared/timetravel.h b/arch/um/include/shared/timetravel.h new file mode 100644 index 000000000000..e5c3d69f1b69 --- /dev/null +++ b/arch/um/include/shared/timetravel.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2019-2021 Intel Corporation + */ +#ifndef _UM_TIME_TRAVEL_H_ +#define _UM_TIME_TRAVEL_H_ + +enum time_travel_mode { + TT_MODE_OFF, + TT_MODE_BASIC, + TT_MODE_INFCPU, + TT_MODE_EXTERNAL, +}; + +#if defined(UML_CONFIG_UML_TIME_TRAVEL_SUPPORT) || \ + defined(CONFIG_UML_TIME_TRAVEL_SUPPORT) +extern enum time_travel_mode time_travel_mode; +#else +#define time_travel_mode TT_MODE_OFF +#endif /* (UML_)CONFIG_UML_TIME_TRAVEL_SUPPORT */ + +#endif /* _UM_TIME_TRAVEL_H_ */ -- cgit v1.2.3-58-ga151 From d6b399a0e02a9063a5812af6cb8b657a4a1ecf68 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Mar 2021 13:19:56 +0100 Subject: um: time-travel/signals: fix ndelay() in interrupt We should be able to ndelay() from any context, even from an interrupt context! However, this is broken (not functionally, but locking-wise) in time-travel because we'll get into the time-travel code and enable interrupts to handle messages on other time-travel aware subsystems (only virtio for now). Luckily, I've already reworked the time-travel aware signal (interrupt) delivery for suspend/resume to have a time travel handler, which runs directly in the context of the signal and not from the Linux interrupt. In order to fix this time-travel issue then, we need to do a few things: 1) rework the signal handling code to call time-travel handlers (only) if interrupts are disabled but signals aren't blocked, instead of marking it only pending there. This is needed to not deadlock other communication. 2) rework time-travel to not enable interrupts while it's waiting for a message; 3) rework time-travel to not (just) disable interrupts but rather block signals at a lower level while it needs them disabled for communicating with the controller. Finally, since now we can actually spend even virtual time in interrupts-disabled sections, the delay warning when we deliver a time-travel delayed interrupt is no longer valid, things can (and should) now get delayed. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/include/shared/irq_user.h | 1 + arch/um/include/shared/os.h | 3 +++ arch/um/kernel/irq.c | 35 ++++++++++++++++++++----- arch/um/kernel/time.c | 35 +++++++++---------------- arch/um/os-Linux/signal.c | 55 ++++++++++++++++++++++++++++++++++++--- 5 files changed, 96 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index 07239e801a5b..065829f443ae 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -17,6 +17,7 @@ enum um_irq_type { struct siginfo; extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +void sigio_run_timetravel_handlers(void); extern void free_irq_by_fd(int fd); extern void deactivate_fd(int fd, int irqnum); extern int deactivate_all_fds(void); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index f9fbbddc38bb..453b13369533 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -242,6 +242,9 @@ extern int set_signals_trace(int enable); extern int os_is_signal_stack(void); extern void deliver_alarm(void); extern void register_pm_wake_signal(void); +extern void block_signals_hard(void); +extern void unblock_signals_hard(void); +extern void mark_sigio_pending(void); /* util.c */ extern void stack_protections(unsigned long address); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 82af5191e73d..1c448ea4729e 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -123,7 +123,8 @@ static bool irq_do_timetravel_handler(struct irq_entry *entry, #endif static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type t, - struct uml_pt_regs *regs) + struct uml_pt_regs *regs, + bool timetravel_handlers_only) { struct irq_reg *reg = &entry->reg[t]; @@ -136,18 +137,29 @@ static void sigio_reg_handler(int idx, struct irq_entry *entry, enum um_irq_type if (irq_do_timetravel_handler(entry, t)) return; - if (irqs_suspended) + /* + * If we're called to only run time-travel handlers then don't + * actually proceed but mark sigio as pending (if applicable). + * For suspend/resume, timetravel_handlers_only may be true + * despite time-travel not being configured and used. + */ + if (timetravel_handlers_only) { +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT + mark_sigio_pending(); +#endif return; + } irq_io_loop(reg, regs); } -void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +static void _sigio_handler(struct uml_pt_regs *regs, + bool timetravel_handlers_only) { struct irq_entry *irq_entry; int n, i; - if (irqs_suspended && !um_irq_timetravel_handler_used()) + if (timetravel_handlers_only && !um_irq_timetravel_handler_used()) return; while (1) { @@ -172,14 +184,20 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) irq_entry = os_epoll_get_data_pointer(i); for (t = 0; t < NUM_IRQ_TYPES; t++) - sigio_reg_handler(i, irq_entry, t, regs); + sigio_reg_handler(i, irq_entry, t, regs, + timetravel_handlers_only); } } - if (!irqs_suspended) + if (!timetravel_handlers_only) free_irqs(); } +void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +{ + _sigio_handler(regs, irqs_suspended); +} + static struct irq_entry *get_irq_entry_by_fd(int fd) { struct irq_entry *walk; @@ -467,6 +485,11 @@ int um_request_irq_tt(int irq, int fd, enum um_irq_type type, devname, dev_id, timetravel_handler); } EXPORT_SYMBOL(um_request_irq_tt); + +void sigio_run_timetravel_handlers(void) +{ + _sigio_handler(NULL, true); +} #endif #ifdef CONFIG_PM_SLEEP diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index e0cdb9694fb8..fddd1dec27e6 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -68,23 +68,15 @@ static void time_travel_handle_message(struct um_timetravel_msg *msg, int ret; /* - * Poll outside the locked section (if we're not called to only read - * the response) so we can get interrupts for e.g. virtio while we're - * here, but then we need to lock to not get interrupted between the - * read of the message and write of the ACK. + * We can't unlock here, but interrupt signals with a timetravel_handler + * (see um_request_irq_tt) get to the timetravel_handler anyway. */ if (mode != TTMH_READ) { - bool disabled = irqs_disabled(); + BUG_ON(mode == TTMH_IDLE && !irqs_disabled()); - BUG_ON(mode == TTMH_IDLE && !disabled); - - if (disabled) - local_irq_enable(); while (os_poll(1, &time_travel_ext_fd) != 0) { /* nothing */ } - if (disabled) - local_irq_disable(); } ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg)); @@ -123,15 +115,15 @@ static u64 time_travel_ext_req(u32 op, u64 time) .time = time, .seq = mseq, }; - unsigned long flags; /* - * We need to save interrupts here and only restore when we - * got the ACK - otherwise we can get interrupted and send - * another request while we're still waiting for an ACK, but - * the peer doesn't know we got interrupted and will send - * the ACKs in the same order as the message, but we'd need - * to see them in the opposite order ... + * We need to block even the timetravel handlers of SIGIO here and + * only restore their use when we got the ACK - otherwise we may + * (will) get interrupted by that, try to queue the IRQ for future + * processing and thus send another request while we're still waiting + * for an ACK, but the peer doesn't know we got interrupted and will + * send the ACKs in the same order as the message, but we'd need to + * see them in the opposite order ... * * This wouldn't matter *too* much, but some ACKs carry the * current time (for UM_TIMETRAVEL_GET) and getting another @@ -140,7 +132,7 @@ static u64 time_travel_ext_req(u32 op, u64 time) * The sequence number assignment that happens here lets us * debug such message handling issues more easily. */ - local_irq_save(flags); + block_signals_hard(); os_write_file(time_travel_ext_fd, &msg, sizeof(msg)); while (msg.op != UM_TIMETRAVEL_ACK) @@ -152,7 +144,7 @@ static u64 time_travel_ext_req(u32 op, u64 time) if (op == UM_TIMETRAVEL_GET) time_travel_set_time(msg.time); - local_irq_restore(flags); + unblock_signals_hard(); return msg.time; } @@ -352,9 +344,6 @@ void deliver_time_travel_irqs(void) while ((e = list_first_entry_or_null(&time_travel_irqs, struct time_travel_event, list))) { - WARN(e->time != time_travel_time, - "time moved from %lld to %lld before IRQ delivery\n", - time_travel_time, e->time); list_del(&e->list); e->pending = false; e->fn(e); diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 8c9d162e6c51..1d501acb22ee 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -18,6 +18,7 @@ #include #include #include +#include void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { [SIGTRAP] = relay_signal, @@ -63,16 +64,29 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) #define SIGALRM_MASK (1 << SIGALRM_BIT) int signals_enabled; +#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT +static int signals_blocked; +#else +#define signals_blocked false +#endif static unsigned int signals_pending; static unsigned int signals_active = 0; void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) { - int enabled; + int enabled = signals_enabled; - enabled = signals_enabled; - if (!enabled && (sig == SIGIO)) { - signals_pending |= SIGIO_MASK; + if ((signals_blocked || !enabled) && (sig == SIGIO)) { + /* + * In TT_MODE_EXTERNAL, need to still call time-travel + * handlers unless signals are also blocked for the + * external time message processing. This will mark + * signals_pending by itself (only if necessary.) + */ + if (!signals_blocked && time_travel_mode == TT_MODE_EXTERNAL) + sigio_run_timetravel_handlers(); + else + signals_pending |= SIGIO_MASK; return; } @@ -363,6 +377,39 @@ int set_signals_trace(int enable) return ret; } +#ifdef UML_CONFIG_UML_TIME_TRAVEL_SUPPORT +void mark_sigio_pending(void) +{ + signals_pending |= SIGIO_MASK; +} + +void block_signals_hard(void) +{ + if (signals_blocked) + return; + signals_blocked = 1; + barrier(); +} + +void unblock_signals_hard(void) +{ + if (!signals_blocked) + return; + /* Must be set to 0 before we check the pending bits etc. */ + signals_blocked = 0; + barrier(); + + if (signals_pending && signals_enabled) { + /* this is a bit inefficient, but that's not really important */ + block_signals(); + unblock_signals(); + } else if (signals_pending & SIGIO_MASK) { + /* we need to run time-travel handlers even if not enabled */ + sigio_run_timetravel_handlers(); + } +} +#endif + int os_is_signal_stack(void) { stack_t ss; -- cgit v1.2.3-58-ga151 From a5ab7c8467f1934236e33d5fa1c3c6de831a6648 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Mar 2021 13:19:57 +0100 Subject: um: irqs: allow invoking time-travel handler multiple times If we happen to get multiple messages while IRQS are already suspended, we still need to handle them, since otherwise the simulation blocks. Remove the "prevent nesting" part, time_travel_add_irq_event() will deal with being called multiple times just fine. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/kernel/irq.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 1c448ea4729e..f3f19da7b876 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -101,10 +101,12 @@ static bool irq_do_timetravel_handler(struct irq_entry *entry, if (!reg->timetravel_handler) return false; - /* prevent nesting - we'll get it again later when we SIGIO ourselves */ - if (reg->pending_on_resume) - return true; - + /* + * Handle all messages - we might get multiple even while + * interrupts are already suspended, due to suspend order + * etc. Note that time_travel_add_irq_event() will not add + * an event twice, if it's pending already "first wins". + */ reg->timetravel_handler(reg->irq, entry->fd, reg->id, ®->event); if (!reg->event.pending) -- cgit v1.2.3-58-ga151 From 68f5d3f3b6543266b29e047cfaf9842333019b4c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Mar 2021 13:19:58 +0100 Subject: um: add PCI over virtio emulation driver To support testing of PCI/PCIe drivers in UML, add a PCI bus support driver. This driver uses virtio, which in UML is really just vhost-user, to talk to devices, and adds the devices to the virtual PCI bus in the system. Since virtio already allows DMA/bus mastering this really isn't all that hard, of course we need the logic_iomem infrastructure that was added by a previous patch. The protocol to talk to the device is has a few fairly simple messages for reading to/writing from config and IO spaces, and messages for the device to send the various interrupts (INT#, MSI/MSI-X and while suspended PME#). Note that currently no offical virtio device ID is assigned for this protocol, as a consequence this patch requires defining it in the Kconfig, with a default that makes the driver refuse to work at all. Finally, in order to add support for MSI/MSI-X interrupts, some small changes are needed in the UML IRQ code, it needs to have more interrupts, changing NR_IRQS from 64 to 128 if this driver is enabled, but not actually use them for anything so that the generic IRQ domain/MSI infrastructure can allocate IRQ numbers. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 13 +- arch/um/drivers/Kconfig | 20 + arch/um/drivers/Makefile | 1 + arch/um/drivers/virt-pci.c | 885 +++++++++++++++++++++++++++++++++++++ arch/um/include/asm/Kbuild | 1 - arch/um/include/asm/io.h | 7 + arch/um/include/asm/irq.h | 8 +- arch/um/include/asm/msi.h | 1 + arch/um/include/asm/pci.h | 39 ++ arch/um/kernel/Makefile | 1 + arch/um/kernel/ioport.c | 13 + arch/um/kernel/irq.c | 7 +- include/uapi/linux/virtio_pcidev.h | 64 +++ 13 files changed, 1054 insertions(+), 6 deletions(-) create mode 100644 arch/um/drivers/virt-pci.c create mode 100644 arch/um/include/asm/msi.h create mode 100644 arch/um/include/asm/pci.h create mode 100644 arch/um/kernel/ioport.c create mode 100644 include/uapi/linux/virtio_pcidev.h (limited to 'arch') diff --git a/arch/um/Kconfig b/arch/um/Kconfig index e72b4b393367..328ba973c07d 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -15,7 +15,7 @@ config UML select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_BUGVERBOSE - select NO_DMA + select NO_DMA if !UML_DMA_EMULATION select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES select HAVE_GCC_PLUGINS @@ -26,10 +26,21 @@ config MMU bool default y +config UML_DMA_EMULATION + bool + config NO_IOMEM bool "disable IOMEM" if EXPERT + depends on !INDIRECT_IOMEM default y +config UML_IOMEM_EMULATION + bool + select INDIRECT_IOMEM + select GENERIC_PCI_IOMAP + select GENERIC_IOMAP + select NO_GENERIC_PCI_IOPORT_MAP + config NO_IOPORT_MAP def_bool y diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index 03ba34b61115..f145842c40b9 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -357,3 +357,23 @@ config UML_RTC rtcwake, especially in time-travel mode. This driver enables that by providing a fake RTC clock that causes a wakeup at the right time. + +config UML_PCI_OVER_VIRTIO + bool "Enable PCI over VIRTIO device simulation" + # in theory, just VIRTIO is enough, but that causes recursion + depends on VIRTIO_UML + select FORCE_PCI + select UML_IOMEM_EMULATION + select UML_DMA_EMULATION + select PCI_MSI + select PCI_MSI_IRQ_DOMAIN + select PCI_LOCKLESS_CONFIG + +config UML_PCI_OVER_VIRTIO_DEVICE_ID + int "set the virtio device ID for PCI emulation" + default -1 + depends on UML_PCI_OVER_VIRTIO + help + There's no official device ID assigned (yet), set the one you + wish to use for experimentation here. The default of -1 is + not valid and will cause the driver to fail at probe. diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index dcc64a02f81f..803666e85414 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o obj-$(CONFIG_UML_RANDOM) += random.o obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o obj-$(CONFIG_UML_RTC) += rtc.o +obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virt-pci.o # pcap_user.o must be added explicitly. USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o pcap_user.o vde_user.o vector_user.o diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c new file mode 100644 index 000000000000..dd85f36197aa --- /dev/null +++ b/arch/um/drivers/virt-pci.c @@ -0,0 +1,885 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Intel Corporation + * Author: Johannes Berg + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_DEVICES 8 +#define MAX_MSI_VECTORS 32 +#define CFG_SPACE_SIZE 4096 + +/* for MSI-X we have a 32-bit payload */ +#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) +#define NUM_IRQ_MSGS 10 + +#define HANDLE_NO_FREE(ptr) ((void *)((unsigned long)(ptr) | 1)) +#define HANDLE_IS_NO_FREE(ptr) ((unsigned long)(ptr) & 1) + +struct um_pci_device { + struct virtio_device *vdev; + + /* for now just standard BARs */ + u8 resptr[PCI_STD_NUM_BARS]; + + struct virtqueue *cmd_vq, *irq_vq; + +#define UM_PCI_STAT_WAITING 0 + unsigned long status; + + int irq; +}; + +struct um_pci_device_reg { + struct um_pci_device *dev; + void __iomem *iomem; +}; + +static struct pci_host_bridge *bridge; +static DEFINE_MUTEX(um_pci_mtx); +static struct um_pci_device_reg um_pci_devices[MAX_DEVICES]; +static struct fwnode_handle *um_pci_fwnode; +static struct irq_domain *um_pci_inner_domain; +static struct irq_domain *um_pci_msi_domain; +static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; + +#define UM_VIRT_PCI_MAXDELAY 40000 + +static int um_pci_send_cmd(struct um_pci_device *dev, + struct virtio_pcidev_msg *cmd, + unsigned int cmd_size, + const void *extra, unsigned int extra_size, + void *out, unsigned int out_size) +{ + struct scatterlist out_sg, extra_sg, in_sg; + struct scatterlist *sgs_list[] = { + [0] = &out_sg, + [1] = extra ? &extra_sg : &in_sg, + [2] = extra ? &in_sg : NULL, + }; + int delay_count = 0; + int ret, len; + bool posted; + + if (WARN_ON(cmd_size < sizeof(*cmd))) + return -EINVAL; + + switch (cmd->op) { + case VIRTIO_PCIDEV_OP_CFG_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_MEMSET: + /* in PCI, writes are posted, so don't wait */ + posted = !out; + WARN_ON(!posted); + break; + default: + posted = false; + break; + } + + if (posted) { + u8 *ncmd = kmalloc(cmd_size + extra_size, GFP_ATOMIC); + + if (ncmd) { + memcpy(ncmd, cmd, cmd_size); + if (extra) + memcpy(ncmd + cmd_size, extra, extra_size); + cmd = (void *)ncmd; + cmd_size += extra_size; + extra = NULL; + extra_size = 0; + } else { + /* try without allocating memory */ + posted = false; + } + } + + sg_init_one(&out_sg, cmd, cmd_size); + if (extra) + sg_init_one(&extra_sg, extra, extra_size); + if (out) + sg_init_one(&in_sg, out, out_size); + + /* add to internal virtio queue */ + ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, + extra ? 2 : 1, + out ? 1 : 0, + posted ? cmd : HANDLE_NO_FREE(cmd), + GFP_ATOMIC); + if (ret) + return ret; + + if (posted) { + virtqueue_kick(dev->cmd_vq); + return 0; + } + + /* kick and poll for getting a response on the queue */ + set_bit(UM_PCI_STAT_WAITING, &dev->status); + virtqueue_kick(dev->cmd_vq); + + while (1) { + void *completed = virtqueue_get_buf(dev->cmd_vq, &len); + + if (completed == HANDLE_NO_FREE(cmd)) + break; + + if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || + ++delay_count > UM_VIRT_PCI_MAXDELAY, + "um virt-pci delay: %d", delay_count)) { + ret = -EIO; + break; + } + udelay(1); + } + clear_bit(UM_PCI_STAT_WAITING, &dev->status); + + return ret; +} + +static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, + int size) +{ + struct um_pci_device_reg *reg = priv; + struct um_pci_device *dev = reg->dev; + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_READ, + .size = size, + .addr = offset, + }; + /* maximum size - we may only use parts of it */ + u8 data[8]; + + if (!dev) + return ~0ULL; + + memset(data, 0xff, sizeof(data)); + + switch (size) { + case 1: + case 2: + case 4: +#ifdef CONFIG_64BIT + case 8: +#endif + break; + default: + WARN(1, "invalid config space read size %d\n", size); + return ~0ULL; + } + + if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, + data, sizeof(data))) + return ~0ULL; + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ~0ULL; + } +} + +static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, + unsigned long val) +{ + struct um_pci_device_reg *reg = priv; + struct um_pci_device *dev = reg->dev; + struct { + struct virtio_pcidev_msg hdr; + /* maximum size - we may only use parts of it */ + u8 data[8]; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .size = size, + .addr = offset, + }, + }; + + if (!dev) + return; + + switch (size) { + case 1: + msg.data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)msg.data); + break; + case 4: + put_unaligned_le32(val, (void *)msg.data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)msg.data); + break; +#endif + default: + WARN(1, "invalid config space write size %d\n", size); + return; + } + + WARN_ON(um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0)); +} + +static const struct logic_iomem_ops um_pci_device_cfgspace_ops = { + .read = um_pci_cfgspace_read, + .write = um_pci_cfgspace_write, +}; + +static void um_pci_bar_copy_from(void *priv, void *buffer, + unsigned int offset, int size) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_READ, + .bar = *resptr, + .size = size, + .addr = offset, + }; + + memset(buffer, 0xff, size); + + um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, buffer, size); +} + +static unsigned long um_pci_bar_read(void *priv, unsigned int offset, + int size) +{ + /* maximum size - we may only use parts of it */ + u8 data[8]; + + switch (size) { + case 1: + case 2: + case 4: +#ifdef CONFIG_64BIT + case 8: +#endif + break; + default: + WARN(1, "invalid config space read size %d\n", size); + return ~0ULL; + } + + um_pci_bar_copy_from(priv, data, offset, size); + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ~0ULL; + } +} + +static void um_pci_bar_copy_to(void *priv, unsigned int offset, + const void *buffer, int size) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_WRITE, + .bar = *resptr, + .size = size, + .addr = offset, + }; + + um_pci_send_cmd(dev, &hdr, sizeof(hdr), buffer, size, NULL, 0); +} + +static void um_pci_bar_write(void *priv, unsigned int offset, int size, + unsigned long val) +{ + /* maximum size - we may only use parts of it */ + u8 data[8]; + + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + default: + WARN(1, "invalid config space write size %d\n", size); + return; + } + + um_pci_bar_copy_to(priv, offset, data, size); +} + +static void um_pci_bar_set(void *priv, unsigned int offset, u8 value, int size) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + struct { + struct virtio_pcidev_msg hdr; + u8 data; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .bar = *resptr, + .size = size, + .addr = offset, + }, + .data = value, + }; + + um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0); +} + +static const struct logic_iomem_ops um_pci_device_bar_ops = { + .read = um_pci_bar_read, + .write = um_pci_bar_write, + .set = um_pci_bar_set, + .copy_from = um_pci_bar_copy_from, + .copy_to = um_pci_bar_copy_to, +}; + +static void __iomem *um_pci_map_bus(struct pci_bus *bus, unsigned int devfn, + int where) +{ + struct um_pci_device_reg *dev; + unsigned int busn = bus->number; + + if (busn > 0) + return NULL; + + /* not allowing functions for now ... */ + if (devfn % 8) + return NULL; + + if (devfn / 8 >= ARRAY_SIZE(um_pci_devices)) + return NULL; + + dev = &um_pci_devices[devfn / 8]; + if (!dev) + return NULL; + + return (void __iomem *)((unsigned long)dev->iomem + where); +} + +static struct pci_ops um_pci_ops = { + .map_bus = um_pci_map_bus, + .read = pci_generic_config_read, + .write = pci_generic_config_write, +}; + +static void um_pci_rescan(void) +{ + pci_lock_rescan_remove(); + pci_rescan_bus(bridge->bus); + pci_unlock_rescan_remove(); +} + +static void um_pci_irq_vq_addbuf(struct virtqueue *vq, void *buf, bool kick) +{ + struct scatterlist sg[1]; + + sg_init_one(sg, buf, MAX_IRQ_MSG_SIZE); + if (virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC)) + kfree(buf); + else if (kick) + virtqueue_kick(vq); +} + +static void um_pci_handle_irq_message(struct virtqueue *vq, + struct virtio_pcidev_msg *msg) +{ + struct virtio_device *vdev = vq->vdev; + struct um_pci_device *dev = vdev->priv; + + /* we should properly chain interrupts, but on ARCH=um we don't care */ + + switch (msg->op) { + case VIRTIO_PCIDEV_OP_INT: + generic_handle_irq(dev->irq); + break; + case VIRTIO_PCIDEV_OP_MSI: + /* our MSI message is just the interrupt number */ + if (msg->size == sizeof(u32)) + generic_handle_irq(le32_to_cpup((void *)msg->data)); + else + generic_handle_irq(le16_to_cpup((void *)msg->data)); + break; + case VIRTIO_PCIDEV_OP_PME: + /* nothing to do - we already woke up due to the message */ + break; + default: + dev_err(&vdev->dev, "unexpected virt-pci message %d\n", msg->op); + break; + } +} + +static void um_pci_cmd_vq_cb(struct virtqueue *vq) +{ + struct virtio_device *vdev = vq->vdev; + struct um_pci_device *dev = vdev->priv; + void *cmd; + int len; + + if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) + return; + + while ((cmd = virtqueue_get_buf(vq, &len))) { + if (WARN_ON(HANDLE_IS_NO_FREE(cmd))) + continue; + kfree(cmd); + } +} + +static void um_pci_irq_vq_cb(struct virtqueue *vq) +{ + struct virtio_pcidev_msg *msg; + int len; + + while ((msg = virtqueue_get_buf(vq, &len))) { + if (len >= sizeof(*msg)) + um_pci_handle_irq_message(vq, msg); + + /* recycle the message buffer */ + um_pci_irq_vq_addbuf(vq, msg, true); + } +} + +static int um_pci_init_vqs(struct um_pci_device *dev) +{ + struct virtqueue *vqs[2]; + static const char *const names[2] = { "cmd", "irq" }; + vq_callback_t *cbs[2] = { um_pci_cmd_vq_cb, um_pci_irq_vq_cb }; + int err, i; + + err = virtio_find_vqs(dev->vdev, 2, vqs, cbs, names, NULL); + if (err) + return err; + + dev->cmd_vq = vqs[0]; + dev->irq_vq = vqs[1]; + + for (i = 0; i < NUM_IRQ_MSGS; i++) { + void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL); + + if (msg) + um_pci_irq_vq_addbuf(dev->irq_vq, msg, false); + } + + virtqueue_kick(dev->irq_vq); + + return 0; +} + +static int um_pci_virtio_probe(struct virtio_device *vdev) +{ + struct um_pci_device *dev; + int i, free = -1; + int err = -ENOSPC; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->vdev = vdev; + vdev->priv = dev; + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev) + continue; + free = i; + break; + } + + if (free < 0) + goto error; + + err = um_pci_init_vqs(dev); + if (err) + goto error; + + dev->irq = irq_alloc_desc(numa_node_id()); + if (dev->irq < 0) { + err = dev->irq; + goto error; + } + um_pci_devices[free].dev = dev; + vdev->priv = dev; + + mutex_unlock(&um_pci_mtx); + + device_set_wakeup_enable(&vdev->dev, true); + + um_pci_rescan(); + return 0; +error: + mutex_unlock(&um_pci_mtx); + kfree(dev); + return err; +} + +static void um_pci_virtio_remove(struct virtio_device *vdev) +{ + struct um_pci_device *dev = vdev->priv; + int i; + + /* Stop all virtqueues */ + vdev->config->reset(vdev); + vdev->config->del_vqs(vdev); + + device_set_wakeup_enable(&vdev->dev, false); + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev != dev) + continue; + um_pci_devices[i].dev = NULL; + irq_free_desc(dev->irq); + } + mutex_unlock(&um_pci_mtx); + + um_pci_rescan(); + + kfree(dev); +} + +static struct virtio_device_id id_table[] = { + { CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; +MODULE_DEVICE_TABLE(virtio, id_table); + +static struct virtio_driver um_pci_virtio_driver = { + .driver.name = "virtio-pci", + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = um_pci_virtio_probe, + .remove = um_pci_virtio_remove, +}; + +static struct resource virt_cfgspace_resource = { + .name = "PCI config space", + .start = 0xf0000000 - MAX_DEVICES * CFG_SPACE_SIZE, + .end = 0xf0000000 - 1, + .flags = IORESOURCE_MEM, +}; + +static long um_pci_map_cfgspace(unsigned long offset, size_t size, + const struct logic_iomem_ops **ops, + void **priv) +{ + if (WARN_ON(size > CFG_SPACE_SIZE || offset % CFG_SPACE_SIZE)) + return -EINVAL; + + if (offset / CFG_SPACE_SIZE < MAX_DEVICES) { + *ops = &um_pci_device_cfgspace_ops; + *priv = &um_pci_devices[offset / CFG_SPACE_SIZE]; + return 0; + } + + WARN(1, "cannot map offset 0x%lx/0x%zx\n", offset, size); + return -ENOENT; +} + +static const struct logic_iomem_region_ops um_pci_cfgspace_ops = { + .map = um_pci_map_cfgspace, +}; + +static struct resource virt_iomem_resource = { + .name = "PCI iomem", + .start = 0xf0000000, + .end = 0xffffffff, + .flags = IORESOURCE_MEM, +}; + +struct um_pci_map_iomem_data { + unsigned long offset; + size_t size; + const struct logic_iomem_ops **ops; + void **priv; + long ret; +}; + +static int um_pci_map_iomem_walk(struct pci_dev *pdev, void *_data) +{ + struct um_pci_map_iomem_data *data = _data; + struct um_pci_device_reg *reg = &um_pci_devices[pdev->devfn / 8]; + struct um_pci_device *dev; + int i; + + if (!reg->dev) + return 0; + + for (i = 0; i < ARRAY_SIZE(dev->resptr); i++) { + struct resource *r = &pdev->resource[i]; + + if ((r->flags & IORESOURCE_TYPE_BITS) != IORESOURCE_MEM) + continue; + + /* + * must be the whole or part of the resource, + * not allowed to only overlap + */ + if (data->offset < r->start || data->offset > r->end) + continue; + if (data->offset + data->size - 1 > r->end) + continue; + + dev = reg->dev; + *data->ops = &um_pci_device_bar_ops; + dev->resptr[i] = i; + *data->priv = &dev->resptr[i]; + data->ret = data->offset - r->start; + + /* no need to continue */ + return 1; + } + + return 0; +} + +static long um_pci_map_iomem(unsigned long offset, size_t size, + const struct logic_iomem_ops **ops, + void **priv) +{ + struct um_pci_map_iomem_data data = { + /* we want the full address here */ + .offset = offset + virt_iomem_resource.start, + .size = size, + .ops = ops, + .priv = priv, + .ret = -ENOENT, + }; + + pci_walk_bus(bridge->bus, um_pci_map_iomem_walk, &data); + return data.ret; +} + +static const struct logic_iomem_region_ops um_pci_iomem_ops = { + .map = um_pci_map_iomem, +}; + +static void um_pci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + /* + * This is a very low address and not actually valid 'physical' memory + * in UML, so we can simply map MSI(-X) vectors to there, it cannot be + * legitimately written to by the device in any other way. + * We use the (virtual) IRQ number here as the message to simplify the + * code that receives the message, where for now we simply trust the + * device to send the correct message. + */ + msg->address_hi = 0; + msg->address_lo = 0xa0000; + msg->data = data->irq; +} + +static struct irq_chip um_pci_msi_bottom_irq_chip = { + .name = "UM virtio MSI", + .irq_compose_msi_msg = um_pci_compose_msi_msg, +}; + +static int um_pci_inner_domain_alloc(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs, + void *args) +{ + unsigned long bit; + + WARN_ON(nr_irqs != 1); + + mutex_lock(&um_pci_mtx); + bit = find_first_zero_bit(um_pci_msi_used, MAX_MSI_VECTORS); + if (bit >= MAX_MSI_VECTORS) { + mutex_unlock(&um_pci_mtx); + return -ENOSPC; + } + + set_bit(bit, um_pci_msi_used); + mutex_unlock(&um_pci_mtx); + + irq_domain_set_info(domain, virq, bit, &um_pci_msi_bottom_irq_chip, + domain->host_data, handle_simple_irq, + NULL, NULL); + + return 0; +} + +static void um_pci_inner_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + + mutex_lock(&um_pci_mtx); + + if (!test_bit(d->hwirq, um_pci_msi_used)) + pr_err("trying to free unused MSI#%lu\n", d->hwirq); + else + __clear_bit(d->hwirq, um_pci_msi_used); + + mutex_unlock(&um_pci_mtx); +} + +static const struct irq_domain_ops um_pci_inner_domain_ops = { + .alloc = um_pci_inner_domain_alloc, + .free = um_pci_inner_domain_free, +}; + +static struct irq_chip um_pci_msi_irq_chip = { + .name = "UM virtio PCIe MSI", + .irq_mask = pci_msi_mask_irq, + .irq_unmask = pci_msi_unmask_irq, +}; + +static struct msi_domain_info um_pci_msi_domain_info = { + .flags = MSI_FLAG_USE_DEF_DOM_OPS | + MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_PCI_MSIX, + .chip = &um_pci_msi_irq_chip, +}; + +static struct resource busn_resource = { + .name = "PCI busn", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUS, +}; + +static int um_pci_map_irq(const struct pci_dev *pdev, u8 slot, u8 pin) +{ + struct um_pci_device_reg *reg = &um_pci_devices[pdev->devfn / 8]; + + if (WARN_ON(!reg->dev)) + return -EINVAL; + + /* Yes, we map all pins to the same IRQ ... doesn't matter for now. */ + return reg->dev->irq; +} + +void *pci_root_bus_fwnode(struct pci_bus *bus) +{ + return um_pci_fwnode; +} + +int um_pci_init(void) +{ + int err, i; + + WARN_ON(logic_iomem_add_region(&virt_cfgspace_resource, + &um_pci_cfgspace_ops)); + WARN_ON(logic_iomem_add_region(&virt_iomem_resource, + &um_pci_iomem_ops)); + + if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0, + "No virtio device ID configured for PCI - no PCI support\n")) + return 0; + + bridge = pci_alloc_host_bridge(0); + if (!bridge) + return -ENOMEM; + + um_pci_fwnode = irq_domain_alloc_named_fwnode("um-pci"); + if (!um_pci_fwnode) { + err = -ENOMEM; + goto free; + } + + um_pci_inner_domain = __irq_domain_add(um_pci_fwnode, MAX_MSI_VECTORS, + MAX_MSI_VECTORS, 0, + &um_pci_inner_domain_ops, NULL); + if (!um_pci_inner_domain) { + err = -ENOMEM; + goto free; + } + + um_pci_msi_domain = pci_msi_create_irq_domain(um_pci_fwnode, + &um_pci_msi_domain_info, + um_pci_inner_domain); + if (!um_pci_msi_domain) { + err = -ENOMEM; + goto free; + } + + pci_add_resource(&bridge->windows, &virt_iomem_resource); + pci_add_resource(&bridge->windows, &busn_resource); + bridge->ops = &um_pci_ops; + bridge->map_irq = um_pci_map_irq; + + for (i = 0; i < MAX_DEVICES; i++) { + resource_size_t start; + + start = virt_cfgspace_resource.start + i * CFG_SPACE_SIZE; + um_pci_devices[i].iomem = ioremap(start, CFG_SPACE_SIZE); + if (WARN(!um_pci_devices[i].iomem, "failed to map %d\n", i)) { + err = -ENOMEM; + goto free; + } + } + + err = pci_host_probe(bridge); + if (err) + goto free; + + err = register_virtio_driver(&um_pci_virtio_driver); + if (err) + goto free; + return 0; +free: + if (um_pci_inner_domain) + irq_domain_remove(um_pci_inner_domain); + if (um_pci_fwnode) + irq_domain_free_fwnode(um_pci_fwnode); + pci_free_resource_list(&bridge->windows); + pci_free_host_bridge(bridge); + return err; +} +module_init(um_pci_init); + +void um_pci_exit(void) +{ + unregister_virtio_driver(&um_pci_virtio_driver); + irq_domain_remove(um_pci_msi_domain); + irq_domain_remove(um_pci_inner_domain); + pci_free_resource_list(&bridge->windows); + pci_free_host_bridge(bridge); +} +module_exit(um_pci_exit); diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 10b7228b3aee..0c31d19a7a9c 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -18,7 +18,6 @@ generic-y += mcs_spinlock.h generic-y += mmiowb.h generic-y += module.lds.h generic-y += param.h -generic-y += pci.h generic-y += percpu.h generic-y += preempt.h generic-y += softirq_stack.h diff --git a/arch/um/include/asm/io.h b/arch/um/include/asm/io.h index 6ce18d343997..9ea42cc746d9 100644 --- a/arch/um/include/asm/io.h +++ b/arch/um/include/asm/io.h @@ -3,16 +3,23 @@ #define _ASM_UM_IO_H #include +/* get emulated iomem (if desired) */ +#include + +#ifndef ioremap #define ioremap ioremap static inline void __iomem *ioremap(phys_addr_t offset, size_t size) { return NULL; } +#endif /* ioremap */ +#ifndef iounmap #define iounmap iounmap static inline void iounmap(void __iomem *addr) { } +#endif /* iounmap */ #include diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h index 3f5d3e8228fc..e187c789369d 100644 --- a/arch/um/include/asm/irq.h +++ b/arch/um/include/asm/irq.h @@ -31,7 +31,13 @@ #endif -#define NR_IRQS 64 +#define UM_LAST_SIGNAL_IRQ 64 +/* If we have (simulated) PCI MSI, allow 64 more interrupt numbers for it */ +#ifdef CONFIG_PCI_MSI +#define NR_IRQS (UM_LAST_SIGNAL_IRQ + 64) +#else +#define NR_IRQS UM_LAST_SIGNAL_IRQ +#endif /* CONFIG_PCI_MSI */ #include #endif diff --git a/arch/um/include/asm/msi.h b/arch/um/include/asm/msi.h new file mode 100644 index 000000000000..c8c6c381f394 --- /dev/null +++ b/arch/um/include/asm/msi.h @@ -0,0 +1 @@ +#include diff --git a/arch/um/include/asm/pci.h b/arch/um/include/asm/pci.h new file mode 100644 index 000000000000..da13fd5519ef --- /dev/null +++ b/arch/um/include/asm/pci.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_UM_PCI_H +#define __ASM_UM_PCI_H +#include +#include + +#define PCIBIOS_MIN_IO 0 +#define PCIBIOS_MIN_MEM 0 + +#define pcibios_assign_all_busses() 1 + +extern int isa_dma_bridge_buggy; + +#ifdef CONFIG_PCI +static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) +{ + /* no legacy IRQs */ + return -ENODEV; +} +#endif + +#ifdef CONFIG_PCI_DOMAINS +static inline int pci_proc_domain(struct pci_bus *bus) +{ + /* always show the domain in /proc */ + return 1; +} +#endif /* CONFIG_PCI */ + +#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN +/* + * This is a bit of an annoying hack, and it assumes we only have + * the virt-pci (if anything). Which is true, but still. + */ +void *pci_root_bus_fwnode(struct pci_bus *bus); +#define pci_root_bus_fwnode pci_root_bus_fwnode +#endif + +#endif /* __ASM_UM_PCI_H */ diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index e698e0c7dbdc..2fdd00de5a72 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -23,6 +23,7 @@ obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o obj-$(CONFIG_GPROF) += gprof_syms.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o USER_OBJS := config.o diff --git a/arch/um/kernel/ioport.c b/arch/um/kernel/ioport.c new file mode 100644 index 000000000000..7220615b3beb --- /dev/null +++ b/arch/um/kernel/ioport.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Intel Corporation + * Author: Johannes Berg + */ +#include +#include + +void __iomem *__pci_ioport_map(struct pci_dev *dev, unsigned long port, + unsigned int nr) +{ + return NULL; +} diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index f3f19da7b876..a8873d9bc28b 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -56,7 +56,7 @@ struct irq_entry { static DEFINE_SPINLOCK(irq_lock); static LIST_HEAD(active_fds); -static DECLARE_BITMAP(irqs_allocated, NR_IRQS); +static DECLARE_BITMAP(irqs_allocated, UM_LAST_SIGNAL_IRQ); static bool irqs_suspended; static void irq_io_loop(struct irq_reg *irq, struct uml_pt_regs *regs) @@ -419,7 +419,8 @@ unsigned int do_IRQ(int irq, struct uml_pt_regs *regs) void um_free_irq(int irq, void *dev) { - if (WARN(irq < 0 || irq > NR_IRQS, "freeing invalid irq %d", irq)) + if (WARN(irq < 0 || irq > UM_LAST_SIGNAL_IRQ, + "freeing invalid irq %d", irq)) return; free_irq_by_irq_and_dev(irq, dev); @@ -648,7 +649,7 @@ void __init init_IRQ(void) irq_set_chip_and_handler(TIMER_IRQ, &alarm_irq_type, handle_edge_irq); - for (i = 1; i < NR_IRQS; i++) + for (i = 1; i < UM_LAST_SIGNAL_IRQ; i++) irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq); /* Initialize EPOLL Loop */ os_setup_epoll(); diff --git a/include/uapi/linux/virtio_pcidev.h b/include/uapi/linux/virtio_pcidev.h new file mode 100644 index 000000000000..89daa88bcfef --- /dev/null +++ b/include/uapi/linux/virtio_pcidev.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * Copyright (C) 2021 Intel Corporation + * Author: Johannes Berg + */ +#ifndef _UAPI_LINUX_VIRTIO_PCIDEV_H +#define _UAPI_LINUX_VIRTIO_PCIDEV_H +#include + +/** + * enum virtio_pcidev_ops - virtual PCI device operations + * @VIRTIO_PCIDEV_OP_CFG_READ: read config space, size is 1, 2, 4 or 8; + * the @data field should be filled in by the device (in little endian). + * @VIRTIO_PCIDEV_OP_CFG_WRITE: write config space, size is 1, 2, 4 or 8; + * the @data field contains the data to write (in little endian). + * @VIRTIO_PCIDEV_OP_BAR_READ: read BAR mem/pio, size can be variable; + * the @data field should be filled in by the device (in little endian). + * @VIRTIO_PCIDEV_OP_BAR_WRITE: write BAR mem/pio, size can be variable; + * the @data field contains the data to write (in little endian). + * @VIRTIO_PCIDEV_OP_MMIO_MEMSET: memset MMIO, size is variable but + * the @data field only has one byte (unlike @VIRTIO_PCIDEV_OP_MMIO_WRITE) + * @VIRTIO_PCIDEV_OP_INT: legacy INTx# pin interrupt, the addr field is 1-4 for + * the number + * @VIRTIO_PCIDEV_OP_MSI: MSI(-X) interrupt, this message basically transports + * the 16- or 32-bit write that would otherwise be done into memory, + * analogous to the write messages (@VIRTIO_PCIDEV_OP_MMIO_WRITE) above + * @VIRTIO_PCIDEV_OP_PME: Dummy message whose content is ignored (and should be + * all zeroes) to signal the PME# pin. + */ +enum virtio_pcidev_ops { + VIRTIO_PCIDEV_OP_RESERVED = 0, + VIRTIO_PCIDEV_OP_CFG_READ, + VIRTIO_PCIDEV_OP_CFG_WRITE, + VIRTIO_PCIDEV_OP_MMIO_READ, + VIRTIO_PCIDEV_OP_MMIO_WRITE, + VIRTIO_PCIDEV_OP_MMIO_MEMSET, + VIRTIO_PCIDEV_OP_INT, + VIRTIO_PCIDEV_OP_MSI, + VIRTIO_PCIDEV_OP_PME, +}; + +/** + * struct virtio_pcidev_msg - virtio PCI device operation + * @op: the operation to do + * @bar: the bar (only with BAR read/write messages) + * @reserved: reserved + * @size: the size of the read/write (in bytes) + * @addr: the address to read/write + * @data: the data, normally @size long, but just one byte for + * %VIRTIO_PCIDEV_OP_MMIO_MEMSET + * + * Note: the fields are all in native (CPU) endian, however, the + * @data values will often be in little endian (see the ops above.) + */ +struct virtio_pcidev_msg { + __u8 op; + __u8 bar; + __u16 reserved; + __u32 size; + __u64 addr; + __u8 data[]; +}; + +#endif /* _UAPI_LINUX_VIRTIO_PCIDEV_H */ -- cgit v1.2.3-58-ga151 From 43c590cb86665be702c0af0231a10ec813df9cfd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 5 Mar 2021 13:19:59 +0100 Subject: um: virtio/pci: enable suspend/resume The UM virtual PCI devices currently cannot be suspended properly since the virtio driver already disables VQs well before the PCI bus's suspend_noirq wants to complete the transition by writing to PCI config space. After trying around for a long time with moving the devices on the DPM list, trying to create dependencies between them, etc. I gave up and instead added UML specific cross-driver API that lets the virt-pci code enable not suspending/resuming VQs for its devices. This then allows the PCI bus suspend_noirq to still talk to the device, and suspend/resume works properly. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/virt-pci.c | 10 ++++++++++ arch/um/drivers/virtio_uml.c | 40 ++++++++++++++++++++++++++++---------- arch/um/include/linux/virtio-uml.h | 13 +++++++++++++ 3 files changed, 53 insertions(+), 10 deletions(-) create mode 100644 arch/um/include/linux/virtio-uml.h (limited to 'arch') diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index dd85f36197aa..0b802834f40a 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -134,6 +135,9 @@ static int um_pci_send_cmd(struct um_pci_device *dev, if (completed == HANDLE_NO_FREE(cmd)) break; + if (completed && !HANDLE_IS_NO_FREE(completed)) + kfree(completed); + if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || ++delay_count > UM_VIRT_PCI_MAXDELAY, "um virt-pci delay: %d", delay_count)) { @@ -550,6 +554,12 @@ static int um_pci_virtio_probe(struct virtio_device *vdev) device_set_wakeup_enable(&vdev->dev, true); + /* + * In order to do suspend-resume properly, don't allow VQs + * to be suspended. + */ + virtio_uml_set_no_vq_suspend(vdev, true); + um_pci_rescan(); return 0; error: diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c index 91ddf74ca888..4412d6febade 100644 --- a/arch/um/drivers/virtio_uml.c +++ b/arch/um/drivers/virtio_uml.c @@ -56,6 +56,7 @@ struct virtio_uml_device { u8 status; u8 registered:1; u8 suspended:1; + u8 no_vq_suspend:1; u8 config_changed_irq:1; uint64_t vq_irq_vq_map; @@ -1098,6 +1099,19 @@ static void virtio_uml_release_dev(struct device *d) kfree(vu_dev); } +void virtio_uml_set_no_vq_suspend(struct virtio_device *vdev, + bool no_vq_suspend) +{ + struct virtio_uml_device *vu_dev = to_virtio_uml_device(vdev); + + if (WARN_ON(vdev->config != &virtio_uml_config_ops)) + return; + + vu_dev->no_vq_suspend = no_vq_suspend; + dev_info(&vdev->dev, "%sabled VQ suspend\n", + no_vq_suspend ? "dis" : "en"); +} + /* Platform device */ static int virtio_uml_probe(struct platform_device *pdev) @@ -1302,13 +1316,16 @@ MODULE_DEVICE_TABLE(of, virtio_uml_match); static int virtio_uml_suspend(struct platform_device *pdev, pm_message_t state) { struct virtio_uml_device *vu_dev = platform_get_drvdata(pdev); - struct virtqueue *vq; - virtio_device_for_each_vq((&vu_dev->vdev), vq) { - struct virtio_uml_vq_info *info = vq->priv; + if (!vu_dev->no_vq_suspend) { + struct virtqueue *vq; - info->suspended = true; - vhost_user_set_vring_enable(vu_dev, vq->index, false); + virtio_device_for_each_vq((&vu_dev->vdev), vq) { + struct virtio_uml_vq_info *info = vq->priv; + + info->suspended = true; + vhost_user_set_vring_enable(vu_dev, vq->index, false); + } } if (!device_may_wakeup(&vu_dev->vdev.dev)) { @@ -1322,13 +1339,16 @@ static int virtio_uml_suspend(struct platform_device *pdev, pm_message_t state) static int virtio_uml_resume(struct platform_device *pdev) { struct virtio_uml_device *vu_dev = platform_get_drvdata(pdev); - struct virtqueue *vq; - virtio_device_for_each_vq((&vu_dev->vdev), vq) { - struct virtio_uml_vq_info *info = vq->priv; + if (!vu_dev->no_vq_suspend) { + struct virtqueue *vq; + + virtio_device_for_each_vq((&vu_dev->vdev), vq) { + struct virtio_uml_vq_info *info = vq->priv; - info->suspended = false; - vhost_user_set_vring_enable(vu_dev, vq->index, true); + info->suspended = false; + vhost_user_set_vring_enable(vu_dev, vq->index, true); + } } vu_dev->suspended = false; diff --git a/arch/um/include/linux/virtio-uml.h b/arch/um/include/linux/virtio-uml.h new file mode 100644 index 000000000000..2f652fa90f04 --- /dev/null +++ b/arch/um/include/linux/virtio-uml.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021 Intel Corporation + * Author: Johannes Berg + */ + +#ifndef __VIRTIO_UML_H__ +#define __VIRTIO_UML_H__ + +void virtio_uml_set_no_vq_suspend(struct virtio_device *vdev, + bool no_vq_suspend); + +#endif /* __VIRTIO_UML_H__ */ -- cgit v1.2.3-58-ga151 From 386093c68ba3e8bcfe7f46deba901e0e80713c29 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 8 Mar 2021 14:02:37 +0100 Subject: um: allow not setting extra rpaths in the linux binary There doesn't seem to be any reason for the rpath being set in the binaries, at on systems that I tested on. On the other hand, setting rpath is actually harming binaries in some cases, e.g. if using nix-based compilation environments where /lib & /lib64 are not part of the actual environment. Add a new Kconfig option (under EXPERT, for less user confusion) that allows disabling the rpath additions. Signed-off-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 13 +++++++++++++ arch/um/Makefile | 3 ++- arch/x86/Makefile.um | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 328ba973c07d..fe6d898aeb85 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -106,6 +106,19 @@ config LD_SCRIPT_DYN depends on !LD_SCRIPT_STATIC select MODULE_REL_CRCS if MODVERSIONS +config LD_SCRIPT_DYN_RPATH + bool "set rpath in the binary" if EXPERT + default y + depends on LD_SCRIPT_DYN + help + Add /lib (and /lib64 for 64-bit) to the linux binary's rpath + explicitly. + + You may need to turn this off if compiling for nix systems + that have their libraries in random /nix directories and + might otherwise unexpected use libraries from /lib or /lib64 + instead of the desired ones. + config HOSTFS tristate "Host filesystem" help diff --git a/arch/um/Makefile b/arch/um/Makefile index 1cea46ff9bb7..12a7acef0357 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -118,7 +118,8 @@ archprepare: $(Q)$(MAKE) $(build)=$(HOST_DIR)/um include/generated/user_constants.h LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static -LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib $(call cc-option, -no-pie) +LINK-$(CONFIG_LD_SCRIPT_DYN) += $(call cc-option, -no-pie) +LINK-$(CONFIG_LD_SCRIPT_DYN_RPATH) += -Wl,-rpath,/lib CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) \ -fno-stack-protector $(call cc-option, -fno-stack-protector-all) diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 1db7913795f5..b3c1ae084180 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -44,7 +44,7 @@ ELF_FORMAT := elf64-x86-64 # Not on all 64-bit distros /lib is a symlink to /lib64. PLD is an example. -LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib64 +LINK-$(CONFIG_LD_SCRIPT_DYN_RPATH) += -Wl,-rpath,/lib64 LINK-y += -m64 endif -- cgit v1.2.3-58-ga151 From d8fb32f4790f2a286e58db8548016378ac35fc6f Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Fri, 12 Mar 2021 15:16:07 +0000 Subject: um: Add support for host CPU flags and alignment 1. Reflect host cpu flags into the UML instance so they can be used to select the correct implementations for xor, crypto, etc. 2. Reflect host cache alignment into UML instance. This is important when running 32 bit on a 64 bit host as 32 bit by default aligns to 32 while the actual alignment should be 64. Ditto for some Xeons which align at 128. Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/Kconfig | 3 + arch/um/include/asm/cpufeature.h | 157 ++++++++++++++++++++++++++++++++ arch/um/include/asm/processor-generic.h | 8 ++ arch/um/include/shared/os.h | 3 + arch/um/kernel/Makefile | 13 ++- arch/um/kernel/um_arch.c | 48 +++++++++- arch/um/os-Linux/start_up.c | 32 +++++++ 7 files changed, 258 insertions(+), 6 deletions(-) create mode 100644 arch/um/include/asm/cpufeature.h (limited to 'arch') diff --git a/arch/um/Kconfig b/arch/um/Kconfig index fe6d898aeb85..0561b73cfd9a 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -76,6 +76,9 @@ config NR_CPUS range 1 1 default 1 +config ARCH_HAS_CACHE_LINE_SIZE + def_bool y + source "arch/$(HEADER_ARCH)/um/Kconfig" config MAY_HAVE_RUNTIME_DEPS diff --git a/arch/um/include/asm/cpufeature.h b/arch/um/include/asm/cpufeature.h new file mode 100644 index 000000000000..19cd7ed6ec3c --- /dev/null +++ b/arch/um/include/asm/cpufeature.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_UM_CPUFEATURE_H +#define _ASM_UM_CPUFEATURE_H + +#include + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) + +#include +#include + +extern const char * const x86_cap_flags[NCAPINTS*32]; +extern const char * const x86_power_flags[32]; +#define X86_CAP_FMT "%s" +#define x86_cap_flag(flag) x86_cap_flags[flag] + +/* + * In order to save room, we index into this array by doing + * X86_BUG_ - NCAPINTS*32. + */ +extern const char * const x86_bug_flags[NBUGINTS*32]; + +#define test_cpu_cap(c, bit) \ + test_bit(bit, (unsigned long *)((c)->x86_capability)) + +/* + * There are 32 bits/features in each mask word. The high bits + * (selected with (bit>>5) give us the word number and the low 5 + * bits give us the bit/feature number inside the word. + * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can + * see if it is set in the mask word. + */ +#define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ + (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) + +#define cpu_has(c, bit) \ + test_cpu_cap(c, bit) + +#define this_cpu_has(bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ + x86_this_cpu_test_bit(bit, \ + (unsigned long __percpu *)&cpu_info.x86_capability)) + +/* + * This macro is for detection of features which need kernel + * infrastructure to be used. It may *not* directly test the CPU + * itself. Use the cpu_has() family if you want true runtime + * testing of CPU features, like in hypervisor code where you are + * supporting a possible guest feature where host support for it + * is not relevant. + */ +#define cpu_feature_enabled(bit) \ + (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) + +#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) + +#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) + +extern void setup_clear_cpu_cap(unsigned int bit); + +#define setup_force_cpu_cap(bit) do { \ + set_cpu_cap(&boot_cpu_data, bit); \ + set_bit(bit, (unsigned long *)cpu_caps_set); \ +} while (0) + +#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) + +#if defined(__clang__) && !defined(CONFIG_CC_HAS_ASM_GOTO) + +/* + * Workaround for the sake of BPF compilation which utilizes kernel + * headers, but clang does not support ASM GOTO and fails the build. + */ +#ifndef __BPF_TRACING__ +#warning "Compiler lacks ASM_GOTO support. Add -D __BPF_TRACING__ to your compiler arguments" +#endif + +#define static_cpu_has(bit) boot_cpu_has(bit) + +#else + +/* + * Static testing of CPU features. Used the same as boot_cpu_has(). It + * statically patches the target code for additional performance. Use + * static_cpu_has() only in fast paths, where every cycle counts. Which + * means that the boot_cpu_has() variant is already fast enough for the + * majority of cases and you should stick to using it as it is generally + * only two instructions: a RIP-relative MOV and a TEST. + */ +static __always_inline bool _static_cpu_has(u16 bit) +{ + asm_volatile_goto("1: jmp 6f\n" + "2:\n" + ".skip -(((5f-4f) - (2b-1b)) > 0) * " + "((5f-4f) - (2b-1b)),0x90\n" + "3:\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 4f - .\n" /* repl offset */ + " .word %P[always]\n" /* always replace */ + " .byte 3b - 1b\n" /* src len */ + " .byte 5f - 4f\n" /* repl len */ + " .byte 3b - 2b\n" /* pad len */ + ".previous\n" + ".section .altinstr_replacement,\"ax\"\n" + "4: jmp %l[t_no]\n" + "5:\n" + ".previous\n" + ".section .altinstructions,\"a\"\n" + " .long 1b - .\n" /* src offset */ + " .long 0\n" /* no replacement */ + " .word %P[feature]\n" /* feature bit */ + " .byte 3b - 1b\n" /* src len */ + " .byte 0\n" /* repl len */ + " .byte 0\n" /* pad len */ + ".previous\n" + ".section .altinstr_aux,\"ax\"\n" + "6:\n" + " testb %[bitnum],%[cap_byte]\n" + " jnz %l[t_yes]\n" + " jmp %l[t_no]\n" + ".previous\n" + : : [feature] "i" (bit), + [always] "i" (X86_FEATURE_ALWAYS), + [bitnum] "i" (1 << (bit & 7)), + [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3]) + : : t_yes, t_no); +t_yes: + return true; +t_no: + return false; +} + +#define static_cpu_has(bit) \ +( \ + __builtin_constant_p(boot_cpu_has(bit)) ? \ + boot_cpu_has(bit) : \ + _static_cpu_has(bit) \ +) +#endif + +#define cpu_has_bug(c, bit) cpu_has(c, (bit)) +#define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) + +#define static_cpu_has_bug(bit) static_cpu_has((bit)) +#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) +#define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) + +#define MAX_CPU_FEATURES (NCAPINTS * 32) +#define cpu_have_feature boot_cpu_has + +#define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" +#define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ + boot_cpu_data.x86_model + +#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ +#endif /* _ASM_UM_CPUFEATURE_H */ diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index afd9b267cf81..b5cf0ed116d9 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -16,6 +16,8 @@ struct task_struct; #include +#include + struct mm_struct; struct thread_struct { @@ -90,12 +92,18 @@ extern void start_thread(struct pt_regs *regs, unsigned long entry, struct cpuinfo_um { unsigned long loops_per_jiffy; int ipi_pipe[2]; + int cache_alignment; + union { + __u32 x86_capability[NCAPINTS + NBUGINTS]; + unsigned long x86_capability_alignment; + }; }; extern struct cpuinfo_um boot_cpu_data; #define cpu_data (&boot_cpu_data) #define current_cpu_data boot_cpu_data +#define cache_line_size() (boot_cpu_data.cache_alignment) #define KSTK_REG(tsk, reg) get_thread_reg(reg, &tsk->thread.switch_buf) extern unsigned long get_wchan(struct task_struct *p); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 453b13369533..8e9e05d3b419 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -187,6 +187,9 @@ int os_poll(unsigned int n, const int *fds); extern void os_early_checks(void); extern void os_check_bugs(void); extern void check_host_supports_tls(int *supports_tls, int *tls_min); +extern void get_host_cpu_features( + void (*flags_helper_func)(char *line), + void (*cache_helper_func)(char *line)); /* mem.c */ extern int create_mem_file(unsigned long long len); diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 2fdd00de5a72..1d18e4e46989 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -17,7 +17,7 @@ extra-y := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ signal.o syscall.o sysrq.o time.o tlb.o trap.o \ - um_arch.o umid.o maccess.o kmsg_dump.o skas/ + um_arch.o umid.o maccess.o kmsg_dump.o capflags.o skas/ obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o obj-$(CONFIG_GPROF) += gprof_syms.o @@ -29,7 +29,7 @@ USER_OBJS := config.o include arch/um/scripts/Makefile.rules -targets := config.c config.tmp +targets := config.c config.tmp capflags.c # Be careful with the below Sed code - sed is pitfall-rich! # We use sed to lower build requirements, for "embedded" builders for instance. @@ -44,6 +44,15 @@ quiet_cmd_quote1 = QUOTE $@ $(obj)/config.c: $(src)/config.c.in $(obj)/config.tmp FORCE $(call if_changed,quote2) +quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/../../x86/kernel/cpu/mkcapflags.sh $@ $^ + +cpufeature = $(src)/../../x86/include/asm/cpufeatures.h +vmxfeature = $(src)/../../x86/include/asm/vmxfeatures.h + +$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/../../x86/kernel/cpu/mkcapflags.sh FORCE + $(call if_changed,mkcapflags) + quiet_cmd_quote2 = QUOTE $@ cmd_quote2 = sed -e '/CONFIG/{' \ -e 's/"CONFIG"//' \ diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 74e07e748a9b..20cce10bde51 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include +#include #include #include #include @@ -50,9 +52,13 @@ static void __init add_arg(char *arg) */ struct cpuinfo_um boot_cpu_data = { .loops_per_jiffy = 0, - .ipi_pipe = { -1, -1 } + .ipi_pipe = { -1, -1 }, + .cache_alignment = L1_CACHE_BYTES, + .x86_capability = { 0 } }; +EXPORT_SYMBOL(boot_cpu_data); + union thread_union cpu0_irqstack __section(".data..init_irqstack") = { .thread_info = INIT_THREAD_INFO(init_task) }; @@ -62,17 +68,25 @@ static char host_info[(__NEW_UTS_LEN + 1) * 5]; static int show_cpuinfo(struct seq_file *m, void *v) { - int index = 0; + int i = 0; - seq_printf(m, "processor\t: %d\n", index); + seq_printf(m, "processor\t: %d\n", i); seq_printf(m, "vendor_id\t: User Mode Linux\n"); seq_printf(m, "model name\t: UML\n"); seq_printf(m, "mode\t\t: skas\n"); seq_printf(m, "host\t\t: %s\n", host_info); - seq_printf(m, "bogomips\t: %lu.%02lu\n\n", + seq_printf(m, "fpu\t\t: %s\n", cpu_has(&boot_cpu_data, X86_FEATURE_FPU) ? "yes" : "no"); + seq_printf(m, "flags\t\t:"); + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL)) + seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\n"); + seq_printf(m, "cache_alignment\t: %d\n", boot_cpu_data.cache_alignment); + seq_printf(m, "bogomips\t: %lu.%02lu\n", loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100); + return 0; } @@ -261,6 +275,30 @@ EXPORT_SYMBOL(end_iomem); #define MIN_VMALLOC (32 * 1024 * 1024) +static void parse_host_cpu_flags(char *line) +{ + int i; + for (i = 0; i < 32*NCAPINTS; i++) { + if ((x86_cap_flags[i] != NULL) && strstr(line, x86_cap_flags[i])) + set_cpu_cap(&boot_cpu_data, i);; + } +} +static void parse_cache_line(char *line) +{ + long res; + char *to_parse = strstr(line, ":"); + if (to_parse) { + to_parse++; + while (*to_parse != 0 && isspace(*to_parse)) { + to_parse++; + } + if (kstrtoul(to_parse, 10, &res) == 0 && is_power_of_2(res)) + boot_cpu_data.cache_alignment = res; + else + boot_cpu_data.cache_alignment = L1_CACHE_BYTES; + } +} + int __init linux_main(int argc, char **argv) { unsigned long avail, diff; @@ -297,6 +335,8 @@ int __init linux_main(int argc, char **argv) /* OS sanity checks that need to happen before the kernel runs */ os_early_checks(); + get_host_cpu_features(parse_host_cpu_flags, parse_cache_line); + brk_start = (unsigned long) sbrk(0); /* diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index f79dc338279e..8a72c99994eb 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -321,6 +321,38 @@ static void __init check_coredump_limit(void) os_info("%llu\n", (unsigned long long)lim.rlim_max); } +void __init get_host_cpu_features( + void (*flags_helper_func)(char *line), + void (*cache_helper_func)(char *line)) +{ + FILE *cpuinfo; + char *line = NULL; + size_t len = 0; + int done_parsing = 0; + + cpuinfo = fopen("/proc/cpuinfo", "r"); + if (cpuinfo == NULL) { + os_info("Failed to get host CPU features\n"); + } else { + while ((getline(&line, &len, cpuinfo)) != -1) { + if (strstr(line, "flags")) { + flags_helper_func(line); + done_parsing++; + } + if (strstr(line, "cache_alignment")) { + cache_helper_func(line); + done_parsing++; + } + free(line); + line = NULL; + if (done_parsing > 1) + break; + } + fclose(cpuinfo); + } +} + + void __init os_early_checks(void) { int pid; -- cgit v1.2.3-58-ga151 From c0ecca6604b80e438b032578634c6e133c7028f6 Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Fri, 12 Mar 2021 15:16:08 +0000 Subject: um: enable the use of optimized xor routines in UML This patch enables the use of optimized xor routines from the x86 tree as well as the necessary fpu api shims so they can work on UML. Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/asm/fpu/api.h | 20 ++++++++++++++++++++ arch/um/include/asm/xor.h | 17 ++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 arch/um/include/asm/fpu/api.h (limited to 'arch') diff --git a/arch/um/include/asm/fpu/api.h b/arch/um/include/asm/fpu/api.h new file mode 100644 index 000000000000..71bfd9ef3938 --- /dev/null +++ b/arch/um/include/asm/fpu/api.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _ASM_UM_FPU_API_H +#define _ASM_UM_FPU_API_H + +/* Copyright (c) 2020 Cambridge Greys Ltd + * Copyright (c) 2020 Red Hat Inc. + * A set of "dummy" defines to allow the direct inclusion + * of x86 optimized copy, xor, etc routines into the + * UML code tree. */ + +#define kernel_fpu_begin() (void)0 +#define kernel_fpu_end() (void)0 + +static inline bool irq_fpu_usable(void) +{ + return true; +} + + +#endif diff --git a/arch/um/include/asm/xor.h b/arch/um/include/asm/xor.h index 36b33d62a35d..f512704a9ec7 100644 --- a/arch/um/include/asm/xor.h +++ b/arch/um/include/asm/xor.h @@ -1,7 +1,22 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#include +#ifndef _ASM_UM_XOR_H +#define _ASM_UM_XOR_H + +#ifdef CONFIG_64BIT +#undef CONFIG_X86_32 +#else +#define CONFIG_X86_32 1 +#endif + +#include +#include <../../x86/include/asm/xor.h> #include +#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT +#undef XOR_SELECT_TEMPLATE /* pick an arbitrary one - measuring isn't possible with inf-cpu */ #define XOR_SELECT_TEMPLATE(x) \ (time_travel_mode == TT_MODE_INFCPU ? &xor_block_8regs : NULL) +#endif + +#endif -- cgit v1.2.3-58-ga151 From dd3035a21ba7ccaa883d7107d357ad06320d78fc Mon Sep 17 00:00:00 2001 From: Anton Ivanov Date: Fri, 12 Mar 2021 15:16:09 +0000 Subject: um: add a UML specific futex implementation The generic asm futex implementation emulates atomic access to memory by doing a get_user followed by put_user. These translate to two mapping operations on UML with paging enabled in the meantime. This, in turn may end up changing interrupts, invoking the signal loop, etc. This replaces the generic implementation by a mapping followed by an operation on the mapped segment. Signed-off-by: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/asm/Kbuild | 1 - arch/um/include/asm/futex.h | 14 +++++ arch/um/kernel/skas/uaccess.c | 136 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 arch/um/include/asm/futex.h (limited to 'arch') diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 0c31d19a7a9c..e5a7b552bb38 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -9,7 +9,6 @@ generic-y += exec.h generic-y += extable.h generic-y += fb.h generic-y += ftrace.h -generic-y += futex.h generic-y += hw_irq.h generic-y += irq_regs.h generic-y += irq_work.h diff --git a/arch/um/include/asm/futex.h b/arch/um/include/asm/futex.h new file mode 100644 index 000000000000..780aa6bfc050 --- /dev/null +++ b/arch/um/include/asm/futex.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_UM_FUTEX_H +#define _ASM_UM_FUTEX_H + +#include +#include +#include + + +int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr); +int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval); + +#endif diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c index 2dec915abe6f..6c76df96e858 100644 --- a/arch/um/kernel/skas/uaccess.c +++ b/arch/um/kernel/skas/uaccess.c @@ -11,6 +11,7 @@ #include #include #include +#include #include pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr) @@ -248,3 +249,138 @@ long __strnlen_user(const void __user *str, long len) return 0; } EXPORT_SYMBOL(__strnlen_user); + +/** + * arch_futex_atomic_op_inuser() - Atomic arithmetic operation with constant + * argument and comparison of the previous + * futex value with another constant. + * + * @encoded_op: encoded operation to execute + * @uaddr: pointer to user space address + * + * Return: + * 0 - On success + * -EFAULT - User access resulted in a page fault + * -EAGAIN - Atomic operation was unable to complete due to contention + * -ENOSYS - Operation not supported + */ + +int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) +{ + int oldval, ret; + struct page *page; + unsigned long addr = (unsigned long) uaddr; + pte_t *pte; + + ret = -EFAULT; + if (!access_ok(uaddr, sizeof(*uaddr))) + return -EFAULT; + preempt_disable(); + pte = maybe_map(addr, 1); + if (pte == NULL) + goto out_inuser; + + page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + addr = (unsigned long) page_address(page) + + (((unsigned long) addr) & ~PAGE_MASK); +#else + addr = (unsigned long) kmap_atomic(page) + + ((unsigned long) addr & ~PAGE_MASK); +#endif + uaddr = (u32 *) addr; + oldval = *uaddr; + + ret = 0; + + switch (op) { + case FUTEX_OP_SET: + *uaddr = oparg; + break; + case FUTEX_OP_ADD: + *uaddr += oparg; + break; + case FUTEX_OP_OR: + *uaddr |= oparg; + break; + case FUTEX_OP_ANDN: + *uaddr &= ~oparg; + break; + case FUTEX_OP_XOR: + *uaddr ^= oparg; + break; + default: + ret = -ENOSYS; + } +#ifdef CONFIG_64BIT + pagefault_enable(); +#else + kunmap_atomic((void *)addr); +#endif + +out_inuser: + preempt_enable(); + + if (ret == 0) + *oval = oldval; + + return ret; +} +EXPORT_SYMBOL(arch_futex_atomic_op_inuser); + +/** + * futex_atomic_cmpxchg_inatomic() - Compare and exchange the content of the + * uaddr with newval if the current value is + * oldval. + * @uval: pointer to store content of @uaddr + * @uaddr: pointer to user space address + * @oldval: old value + * @newval: new value to store to @uaddr + * + * Return: + * 0 - On success + * -EFAULT - User access resulted in a page fault + * -EAGAIN - Atomic operation was unable to complete due to contention + * -ENOSYS - Function not implemented (only if !HAVE_FUTEX_CMPXCHG) + */ + +int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + u32 oldval, u32 newval) +{ + struct page *page; + pte_t *pte; + int ret = -EFAULT; + + if (!access_ok(uaddr, sizeof(*uaddr))) + return -EFAULT; + + preempt_disable(); + pte = maybe_map((unsigned long) uaddr, 1); + if (pte == NULL) + goto out_inatomic; + + page = pte_page(*pte); +#ifdef CONFIG_64BIT + pagefault_disable(); + uaddr = page_address(page) + (((unsigned long) uaddr) & ~PAGE_MASK); +#else + uaddr = kmap_atomic(page) + ((unsigned long) uaddr & ~PAGE_MASK); +#endif + + *uval = *uaddr; + + ret = cmpxchg(uaddr, oldval, newval); + +#ifdef CONFIG_64BIT + pagefault_enable(); +#else + kunmap_atomic(uaddr); +#endif + ret = 0; + +out_inatomic: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(futex_atomic_cmpxchg_inatomic); -- cgit v1.2.3-58-ga151 From 80f849bf541ef9b633a9c08ac208f9c9afd14eb9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 15 Mar 2021 23:38:04 +0100 Subject: um: implement flush_cache_vmap/flush_cache_vunmap vmalloc() heavy workloads in UML are extremely slow, due to flushing the entire kernel VM space (flush_tlb_kernel_vm()) on the first segfault. Implement flush_cache_vmap() to avoid that, and while at it also add flush_cache_vunmap() since it's trivial. This speeds up my vmalloc() heavy test of copying files out from /sys/kernel/debug/gcov/ by 30x (from 30s to 1s.) Signed-off-by: Johannes Berg Acked-By: Anton Ivanov Signed-off-by: Richard Weinberger --- arch/um/include/asm/cacheflush.h | 9 +++++++++ arch/um/include/asm/tlb.h | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 arch/um/include/asm/cacheflush.h (limited to 'arch') diff --git a/arch/um/include/asm/cacheflush.h b/arch/um/include/asm/cacheflush.h new file mode 100644 index 000000000000..4c9858cd36ec --- /dev/null +++ b/arch/um/include/asm/cacheflush.h @@ -0,0 +1,9 @@ +#ifndef __UM_ASM_CACHEFLUSH_H +#define __UM_ASM_CACHEFLUSH_H + +#include +#define flush_cache_vmap flush_tlb_kernel_range +#define flush_cache_vunmap flush_tlb_kernel_range + +#include +#endif /* __UM_ASM_CACHEFLUSH_H */ diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index ff9c62828962..0422467bda5b 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -5,7 +5,7 @@ #include #include -#include +#include #include #endif -- cgit v1.2.3-58-ga151 From 558f9b2f94dbd2d5c5c8292aa13e081cc11ea7d9 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 20 Apr 2021 00:56:10 -0500 Subject: um: Fix stack pointer alignment GCC assumes that stack is aligned to 16-byte on call sites [1]. Since GCC 8, GCC began using 16-byte aligned SSE instructions to implement assignments to structs on stack. When CC_OPTIMIZE_FOR_PERFORMANCE is enabled, this affects os-Linux/sigio.c, write_sigio_thread: struct pollfds *fds, tmp; tmp = current_poll; Note that struct pollfds is exactly 16 bytes in size. GCC 8+ generates assembly similar to: movdqa (%rdi),%xmm0 movaps %xmm0,-0x50(%rbp) This is an issue, because movaps will #GP if -0x50(%rbp) is not aligned to 16 bytes [2], and how rbp gets assigned to is via glibc clone thread_start, then function prologue, going though execution trace similar to (showing only relevant instructions): sub $0x10,%rsi mov %rcx,0x8(%rsi) mov %rdi,(%rsi) syscall pop %rax pop %rdi callq *%rax push %rbp mov %rsp,%rbp The stack pointer always points to the topmost element on stack, rather then the space right above the topmost. On push, the pointer decrements first before writing to the memory pointed to by it. Therefore, there is no need to have the stack pointer pointer always point to valid memory unless the stack is poped; so the `- sizeof(void *)` in the code is unnecessary. On the other hand, glibc reserves the 16 bytes it needs on stack and pops itself, so by the call instruction the stack pointer is exactly the caller-supplied sp. It then push the 16 bytes of the return address and the saved stack pointer, so the base pointer will be 16-byte aligned if and only if the caller supplied sp is 16-byte aligned. Therefore, the caller must supply a 16-byte aligned pointer, which `stack + UM_KERN_PAGE_SIZE` already satisfies. On a side note, musl is unaffected by this issue because it forces 16 byte alignment via `and $-16,%rsi` in its clone wrapper. Similarly, glibc i386 is also unaffected because it has `andl $0xfffffff0, %ecx`. To reproduce this bug, enable CONFIG_UML_RTC and CC_OPTIMIZE_FOR_PERFORMANCE. uml_rtc will call add_sigio_fd which will then cause write_sigio_thread to either go into segfault loop or panic with "Segfault with no mm". Similarly, signal stacks will be aligned by the host kernel upon signal delivery. `- sizeof(void *)` to sigaltstack is unconventional and extraneous. On a related note, initialization of longjmp buffers do require `- sizeof(void *)`. This is to account for the return address that would have been pushed to the stack at the call site. The reason for uml to respect 16-byte alignment, rather than telling GCC to assume 8-byte alignment like the host kernel since commit d9b0cde91c60 ("x86-64, gcc: Use -mpreferred-stack-boundary=3 if supported"), is because uml links against libc. There is no reason to assume libc is also compiled with that flag and assumes 8-byte alignment rather than 16-byte. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=40838 [2] https://c9x.me/x86/html/file_module_x86_id_180.html Signed-off-by: YiFei Zhu Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reviewed-by: Johannes Berg Signed-off-by: Richard Weinberger --- arch/um/drivers/ubd_kern.c | 3 +-- arch/um/kernel/skas/clone.c | 2 +- arch/um/os-Linux/helper.c | 4 ++-- arch/um/os-Linux/signal.c | 2 +- arch/um/os-Linux/skas/process.c | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 8e0b43cf089f..cbd4f00fe77e 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1242,8 +1242,7 @@ static int __init ubd_driver_init(void){ * enough. So use anyway the io thread. */ } stack = alloc_stack(0, 0); - io_pid = start_io_thread(stack + PAGE_SIZE - sizeof(void *), - &thread_fd); + io_pid = start_io_thread(stack + PAGE_SIZE, &thread_fd); if(io_pid < 0){ printk(KERN_ERR "ubd : Failed to start I/O thread (errno = %d) - " diff --git a/arch/um/kernel/skas/clone.c b/arch/um/kernel/skas/clone.c index 592cdb138441..5afac0fef24e 100644 --- a/arch/um/kernel/skas/clone.c +++ b/arch/um/kernel/skas/clone.c @@ -29,7 +29,7 @@ stub_clone_handler(void) long err; err = stub_syscall2(__NR_clone, CLONE_PARENT | CLONE_FILES | SIGCHLD, - (unsigned long)data + UM_KERN_PAGE_SIZE / 2 - sizeof(void *)); + (unsigned long)data + UM_KERN_PAGE_SIZE / 2); if (err) { data->parent_err = err; goto done; diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c index 9fa6e4187d4f..32e88baf18dd 100644 --- a/arch/um/os-Linux/helper.c +++ b/arch/um/os-Linux/helper.c @@ -64,7 +64,7 @@ int run_helper(void (*pre_exec)(void *), void *pre_data, char **argv) goto out_close; } - sp = stack + UM_KERN_PAGE_SIZE - sizeof(void *); + sp = stack + UM_KERN_PAGE_SIZE; data.pre_exec = pre_exec; data.pre_data = pre_data; data.argv = argv; @@ -120,7 +120,7 @@ int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, if (stack == 0) return -ENOMEM; - sp = stack + UM_KERN_PAGE_SIZE - sizeof(void *); + sp = stack + UM_KERN_PAGE_SIZE; pid = clone(proc, (void *) sp, flags, arg); if (pid < 0) { err = -errno; diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 1d501acb22ee..6de99bb16113 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -143,7 +143,7 @@ void set_sigstack(void *sig_stack, int size) stack_t stack = { .ss_flags = 0, .ss_sp = sig_stack, - .ss_size = size - sizeof(void *) + .ss_size = size }; if (sigaltstack(&stack, NULL) != 0) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index fba674fac8b7..87d3129e7362 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -327,7 +327,7 @@ int start_userspace(unsigned long stub_stack) } /* set stack pointer to the end of the stack page, so it can grow downwards */ - sp = (unsigned long) stack + UM_KERN_PAGE_SIZE - sizeof(void *); + sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; flags = CLONE_FILES | SIGCHLD; -- cgit v1.2.3-58-ga151 From b77e81fbe5f5fb4ad9a61ec80f6d1e30b6da093a Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 8 May 2021 11:13:54 +0800 Subject: um: fix error return code in slip_open() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: a3c77c67a443 ("[PATCH] uml: slirp and slip driver cleanups and fixes") Reported-by: Hulk Robot Signed-off-by: Zhen Lei Acked-By: anton.ivanov@cambridgegreys.com Signed-off-by: Richard Weinberger --- arch/um/drivers/slip_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c index 482a19c5105c..7334019c9e60 100644 --- a/arch/um/drivers/slip_user.c +++ b/arch/um/drivers/slip_user.c @@ -145,7 +145,8 @@ static int slip_open(void *data) } sfd = err; - if (set_up_tty(sfd)) + err = set_up_tty(sfd); + if (err) goto out_close2; pri->slave = sfd; -- cgit v1.2.3-58-ga151 From ccf1236ecac476d9d2704866d9a476c86e387971 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Sat, 8 May 2021 11:22:39 +0800 Subject: um: fix error return code in winch_tramp() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. Fixes: 89df6bfc0405 ("uml: DEBUG_SHIRQ fixes") Reported-by: Hulk Robot Signed-off-by: Zhen Lei Acked-By: anton.ivanov@cambridgegreys.com Signed-off-by: Richard Weinberger --- arch/um/drivers/chan_user.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/drivers/chan_user.c b/arch/um/drivers/chan_user.c index d8845d4aac6a..6040817c036f 100644 --- a/arch/um/drivers/chan_user.c +++ b/arch/um/drivers/chan_user.c @@ -256,7 +256,8 @@ static int winch_tramp(int fd, struct tty_port *port, int *fd_out, goto out_close; } - if (os_set_fd_block(*fd_out, 0)) { + err = os_set_fd_block(*fd_out, 0); + if (err) { printk(UM_KERN_ERR "winch_tramp: failed to set thread_fd " "non-blocking.\n"); goto out_close; -- cgit v1.2.3-58-ga151 From 80f9733114e8f925b88d8f4e65ee827640ce4253 Mon Sep 17 00:00:00 2001 From: Shaokun Zhang Date: Mon, 31 May 2021 11:17:07 +0800 Subject: um: Remove the repeated declaration Function 'os_flush_stdout' is declared twice, so remove the repeated declaration. Cc: Jeff Dike Cc: Richard Weinberger Signed-off-by: Shaokun Zhang Signed-off-by: Richard Weinberger --- arch/um/include/shared/os.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 8e9e05d3b419..60b84edc8a68 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -214,7 +214,6 @@ extern int os_protect_memory(void *addr, unsigned long len, extern int os_unmap_memory(void *addr, int len); extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); -extern void os_flush_stdout(void); extern int os_mincore(void *addr, unsigned long len); /* execvp.c */ -- cgit v1.2.3-58-ga151 From 1aee020155f364ef538370d3392969f1077b9bae Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Sat, 19 Jun 2021 10:02:59 +0800 Subject: um: remove unneeded semicolon in um_arch.c Fix following coccicheck warning: ./arch/um/kernel/um_arch.c:284:34-35: Unneeded semicolon Signed-off-by: Wan Jiabing Signed-off-by: Richard Weinberger --- arch/um/kernel/um_arch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 20cce10bde51..0a7eb104850a 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -280,7 +280,7 @@ static void parse_host_cpu_flags(char *line) int i; for (i = 0; i < 32*NCAPINTS; i++) { if ((x86_cap_flags[i] != NULL) && strstr(line, x86_cap_flags[i])) - set_cpu_cap(&boot_cpu_data, i);; + set_cpu_cap(&boot_cpu_data, i); } } static void parse_cache_line(char *line) -- cgit v1.2.3-58-ga151