From 0e84ec96b561d298bc9a22d58e7a4730bc408aca Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 30 May 2019 14:10:14 -0300 Subject: KVM: PPC: Remove leftover comment from emulate_loadstore.c Commit 4eeb85568e56 ("KVM: PPC: Remove mmio_vsx_tx_sx_enabled in KVM MMIO emulation") removed the mmio_vsx_tx_sx_enabled field, but its documentation was left behind. Remove the superfluous comment. Signed-off-by: Fabiano Rosas Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/emulate_loadstore.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index f91b1309a0a8..806dbc439131 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -100,12 +100,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) rs = get_rs(inst); rt = get_rt(inst); - /* - * if mmio_vsx_tx_sx_enabled == 0, copy data between - * VSR[0..31] and memory - * if mmio_vsx_tx_sx_enabled == 1, copy data between - * VSR[32..63] and memory - */ vcpu->arch.mmio_vsx_copy_nums = 0; vcpu->arch.mmio_vsx_offset = 0; vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE; -- cgit v1.2.3-58-ga151 From 3bda7f0ae0f786b202c0c5124c208be067acca30 Mon Sep 17 00:00:00 2001 From: Mark Cave-Ayland Date: Fri, 14 Jun 2019 19:57:45 +0100 Subject: KVM: PPC: Book3S PR: Fix software breakpoints QEMU's kvm_handle_debug() function identifies software breakpoints by checking for a value of 0 in kvm_debug_exit_arch's status field. Since this field isn't explicitly set to 0 when the software breakpoint instruction is detected, any previous non-zero value present causes a hang in QEMU as it tries to process the breakpoint instruction incorrectly as a hardware breakpoint. Ensure that the kvm_debug_exit_arch status field is set to 0 when the software breakpoint instruction is detected (similar to the existing logic in booke.c and e500_emulate.c) to restore software breakpoint functionality under Book3S PR. Signed-off-by: Mark Cave-Ayland Reviewed-by: Fabiano Rosas Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/emulate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 9f5b8c01c4e1..e77becaad5dd 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -282,6 +282,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) */ if (inst == KVMPPC_INST_SW_BREAKPOINT) { run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.status = 0; run->debug.arch.address = kvmppc_get_pc(vcpu); emulated = EMULATE_EXIT_USER; advance = 0; -- cgit v1.2.3-58-ga151 From 237aed48c642328ff0ab19b63423634340224a06 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 6 Aug 2019 19:25:38 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Free escalation interrupts before disabling the VP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a vCPU is brought done, the XIVE VP (Virtual Processor) is first disabled and then the event notification queues are freed. When freeing the queues, we check for possible escalation interrupts and free them also. But when a XIVE VP is disabled, the underlying XIVE ENDs also are disabled in OPAL. When an END (Event Notification Descriptor) is disabled, its ESB pages (ESn and ESe) are disabled and loads return all 1s. Which means that any access on the ESB page of the escalation interrupt will return invalid values. When an interrupt is freed, the shutdown handler computes a 'saved_p' field from the value returned by a load in xive_do_source_set_mask(). This value is incorrect for escalation interrupts for the reason described above. This has no impact on Linux/KVM today because we don't make use of it but we will introduce in future changes a xive_get_irqchip_state() handler. This handler will use the 'saved_p' field to return the state of an interrupt and 'saved_p' being incorrect, softlockup will occur. Fix the vCPU cleanup sequence by first freeing the escalation interrupts if any, then disable the XIVE VP and last free the queues. Fixes: 90c73795afa2 ("KVM: PPC: Book3S HV: Add a new KVM device for the XIVE native exploitation mode") Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190806172538.5087-1-clg@kaod.org --- arch/powerpc/kvm/book3s_xive.c | 18 ++++++++++-------- arch/powerpc/kvm/book3s_xive_native.c | 12 +++++++----- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index e3ba67095895..09f838aa3138 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1134,20 +1134,22 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Mask the VP IPI */ xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { - struct xive_q *q = &xc->queues[i]; - - /* Free the escalation irq */ if (xc->esc_virq[i]) { free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); } - /* Free the queue */ + } + + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + xive_native_disable_queue(xc->vp_id, q, i); if (q->qpage) { free_pages((unsigned long)q->qpage, diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index a998823f68a3..368427fcad20 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -67,10 +67,7 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) xc->valid = false; kvmppc_xive_disable_vcpu_interrupts(vcpu); - /* Disable the VP */ - xive_native_disable_vp(xc->vp_id); - - /* Free the queues & associated interrupts */ + /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { /* Free the escalation irq */ if (xc->esc_virq[i]) { @@ -79,8 +76,13 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) kfree(xc->esc_virq_names[i]); xc->esc_virq[i] = 0; } + } - /* Free the queue */ + /* Disable the VP */ + xive_native_disable_vp(xc->vp_id); + + /* Free the queues */ + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { kvmppc_xive_native_cleanup_queue(vcpu, i); } -- cgit v1.2.3-58-ga151 From 959c5d5134786b4988b6fdd08e444aa67d1667ed Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 13 Aug 2019 20:03:49 +1000 Subject: KVM: PPC: Book3S HV: Fix race in re-enabling XIVE escalation interrupts Escalation interrupts are interrupts sent to the host by the XIVE hardware when it has an interrupt to deliver to a guest VCPU but that VCPU is not running anywhere in the system. Hence we disable the escalation interrupt for the VCPU being run when we enter the guest and re-enable it when the guest does an H_CEDE hypercall indicating it is idle. It is possible that an escalation interrupt gets generated just as we are entering the guest. In that case the escalation interrupt may be using a queue entry in one of the interrupt queues, and that queue entry may not have been processed when the guest exits with an H_CEDE. The existing entry code detects this situation and does not clear the vcpu->arch.xive_esc_on flag as an indication that there is a pending queue entry (if the queue entry gets processed, xive_esc_irq() will clear the flag). There is a comment in the code saying that if the flag is still set on H_CEDE, we have to abort the cede rather than re-enabling the escalation interrupt, lest we end up with two occurrences of the escalation interrupt in the interrupt queue. However, the exit code doesn't do that; it aborts the cede in the sense that vcpu->arch.ceded gets cleared, but it still enables the escalation interrupt by setting the source's PQ bits to 00. Instead we need to set the PQ bits to 10, indicating that an interrupt has been triggered. We also need to avoid setting vcpu->arch.xive_esc_on in this case (i.e. vcpu->arch.xive_esc_on seen to be set on H_CEDE) because xive_esc_irq() will run at some point and clear it, and if we race with that we may end up with an incorrect result (i.e. xive_esc_on set when the escalation interrupt has just been handled). It is extremely unlikely that having two queue entries would cause observable problems; theoretically it could cause queue overflow, but the CPU would have to have thousands of interrupts targetted to it for that to be possible. However, this fix will also make it possible to determine accurately whether there is an unhandled escalation interrupt in the queue, which will be needed by the following patch. Fixes: 9b9b13a6d153 ("KVM: PPC: Book3S HV: Keep XIVE escalation interrupt masked unless ceded") Cc: stable@vger.kernel.org # v4.16+ Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190813100349.GD9567@blackberry --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 36 +++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 337e64468d78..2e7e788eb0cf 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -2831,29 +2831,39 @@ kvm_cede_prodded: kvm_cede_exit: ld r9, HSTATE_KVM_VCPU(r13) #ifdef CONFIG_KVM_XICS - /* Abort if we still have a pending escalation */ + /* are we using XIVE with single escalation? */ + ld r10, VCPU_XIVE_ESC_VADDR(r9) + cmpdi r10, 0 + beq 3f + li r6, XIVE_ESB_SET_PQ_00 + /* + * If we still have a pending escalation, abort the cede, + * and we must set PQ to 10 rather than 00 so that we don't + * potentially end up with two entries for the escalation + * interrupt in the XIVE interrupt queue. In that case + * we also don't want to set xive_esc_on to 1 here in + * case we race with xive_esc_irq(). + */ lbz r5, VCPU_XIVE_ESC_ON(r9) cmpwi r5, 0 - beq 1f + beq 4f li r0, 0 stb r0, VCPU_CEDED(r9) -1: /* Enable XIVE escalation */ - li r5, XIVE_ESB_SET_PQ_00 + li r6, XIVE_ESB_SET_PQ_10 + b 5f +4: li r0, 1 + stb r0, VCPU_XIVE_ESC_ON(r9) + /* make sure store to xive_esc_on is seen before xive_esc_irq runs */ + sync +5: /* Enable XIVE escalation */ mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ beq 1f - ld r10, VCPU_XIVE_ESC_VADDR(r9) - cmpdi r10, 0 - beq 3f - ldx r0, r10, r5 + ldx r0, r10, r6 b 2f 1: ld r10, VCPU_XIVE_ESC_RADDR(r9) - cmpdi r10, 0 - beq 3f - ldcix r0, r10, r5 + ldcix r0, r10, r6 2: sync - li r0, 1 - stb r0, VCPU_XIVE_ESC_ON(r9) #endif /* CONFIG_KVM_XICS */ 3: b guest_exit_cont -- cgit v1.2.3-58-ga151 From 8d4ba9c931bc384bcc6889a43915aaaf19d3e499 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 13 Aug 2019 20:01:00 +1000 Subject: KVM: PPC: Book3S HV: Don't push XIVE context when not using XIVE device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At present, when running a guest on POWER9 using HV KVM but not using an in-kernel interrupt controller (XICS or XIVE), for example if QEMU is run with the kernel_irqchip=off option, the guest entry code goes ahead and tries to load the guest context into the XIVE hardware, even though no context has been set up. To fix this, we check that the "CAM word" is non-zero before pushing it to the hardware. The CAM word is initialized to a non-zero value in kvmppc_xive_connect_vcpu() and kvmppc_xive_native_connect_vcpu(), and is now cleared in kvmppc_xive_{,native_}cleanup_vcpu. Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.12+ Reported-by: Cédric Le Goater Signed-off-by: Paul Mackerras Reviewed-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190813100100.GC9567@blackberry --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 ++ arch/powerpc/kvm/book3s_xive.c | 11 ++++++++++- arch/powerpc/kvm/book3s_xive_native.c | 3 +++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2e7e788eb0cf..07181d0dfcb7 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -942,6 +942,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ld r11, VCPU_XIVE_SAVED_STATE(r4) li r9, TM_QW1_OS lwz r8, VCPU_XIVE_CAM_WORD(r4) + cmpwi r8, 0 + beq no_xive li r7, TM_QW1_OS + TM_WORD2 mfmsr r0 andi. r0, r0, MSR_DR /* in real mode? */ diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 09f838aa3138..586867e46e51 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -67,8 +67,14 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; u64 pq; - if (!tima) + /* + * Nothing to do if the platform doesn't have a XIVE + * or this vCPU doesn't have its own XIVE context + * (e.g. because it's not using an in-kernel interrupt controller). + */ + if (!tima || !vcpu->arch.xive_cam_word) return; + eieio(); __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); @@ -1146,6 +1152,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Disable the VP */ xive_native_disable_vp(xc->vp_id); + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; + /* Free the queues */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { struct xive_q *q = &xc->queues[i]; diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 368427fcad20..11b91b46fc39 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -81,6 +81,9 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Disable the VP */ xive_native_disable_vp(xc->vp_id); + /* Clear the cam word so guest entry won't try to push context */ + vcpu->arch.xive_cam_word = 0; + /* Free the queues */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { kvmppc_xive_native_cleanup_queue(vcpu, i); -- cgit v1.2.3-58-ga151 From da15c03b047dca891d37b9f4ef9ca14d84a6484f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 13 Aug 2019 20:06:48 +1000 Subject: powerpc/xive: Implement get_irqchip_state method for XIVE to fix shutdown race Testing has revealed the existence of a race condition where a XIVE interrupt being shut down can be in one of the XIVE interrupt queues (of which there are up to 8 per CPU, one for each priority) at the point where free_irq() is called. If this happens, can return an interrupt number which has been shut down. This can lead to various symptoms: - irq_to_desc(irq) can be NULL. In this case, no end-of-interrupt function gets called, resulting in the CPU's elevated interrupt priority (numerically lowered CPPR) never gets reset. That then means that the CPU stops processing interrupts, causing device timeouts and other errors in various device drivers. - The irq descriptor or related data structures can be in the process of being freed as the interrupt code is using them. This typically leads to crashes due to bad pointer dereferences. This race is basically what commit 62e0468650c3 ("genirq: Add optional hardware synchronization for shutdown", 2019-06-28) is intended to fix, given a get_irqchip_state() method for the interrupt controller being used. It works by polling the interrupt controller when an interrupt is being freed until the controller says it is not pending. With XIVE, the PQ bits of the interrupt source indicate the state of the interrupt source, and in particular the P bit goes from 0 to 1 at the point where the hardware writes an entry into the interrupt queue that this interrupt is directed towards. Normally, the code will then process the interrupt and do an end-of-interrupt (EOI) operation which will reset PQ to 00 (assuming another interrupt hasn't been generated in the meantime). However, there are situations where the code resets P even though a queue entry exists (for example, by setting PQ to 01, which disables the interrupt source), and also situations where the code leaves P at 1 after removing the queue entry (for example, this is done for escalation interrupts so they cannot fire again until they are explicitly re-enabled). The code already has a 'saved_p' flag for the interrupt source which indicates that a queue entry exists, although it isn't maintained consistently. This patch adds a 'stale_p' flag to indicate that P has been left at 1 after processing a queue entry, and adds code to set and clear saved_p and stale_p as necessary to maintain a consistent indication of whether a queue entry may or may not exist. With this, we can implement xive_get_irqchip_state() by looking at stale_p, saved_p and the ESB PQ bits for the interrupt. There is some additional code to handle escalation interrupts properly; because they are enabled and disabled in KVM assembly code, which does not have access to the xive_irq_data struct for the escalation interrupt. Hence, stale_p may be incorrect when the escalation interrupt is freed in kvmppc_xive_{,native_}cleanup_vcpu(). Fortunately, we can fix it up by looking at vcpu->arch.xive_esc_on, with some careful attention to barriers in order to ensure the correct result if xive_esc_irq() races with kvmppc_xive_cleanup_vcpu(). Finally, this adds code to make noise on the console (pr_crit and WARN_ON(1)) if we find an interrupt queue entry for an interrupt which does not have a descriptor. While this won't catch the race reliably, if it does get triggered it will be an indication that the race is occurring and needs to be debugged. Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20190813100648.GE9567@blackberry --- arch/powerpc/include/asm/xive.h | 8 ++++ arch/powerpc/kvm/book3s_xive.c | 31 +++++++++++++ arch/powerpc/kvm/book3s_xive.h | 2 + arch/powerpc/kvm/book3s_xive_native.c | 3 ++ arch/powerpc/sysdev/xive/common.c | 87 ++++++++++++++++++++++++++--------- 5 files changed, 108 insertions(+), 23 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index e4016985764e..efb0e597b272 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -46,7 +46,15 @@ struct xive_irq_data { /* Setup/used by frontend */ int target; + /* + * saved_p means that there is a queue entry for this interrupt + * in some CPU's queue (not including guest vcpu queues), even + * if P is not set in the source ESB. + * stale_p means that there is no queue entry for this interrupt + * in some CPU's queue, even if P is set in the source ESB. + */ bool saved_p; + bool stale_p; }; #define XIVE_IRQ_FLAG_STORE_EOI 0x01 #define XIVE_IRQ_FLAG_LSI 0x02 diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 586867e46e51..591bfb4bfd0f 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -166,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data) */ vcpu->arch.xive_esc_on = false; + /* This orders xive_esc_on = false vs. subsequent stale_p = true */ + smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */ + return IRQ_HANDLED; } @@ -1119,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu) vcpu->arch.xive_esc_raddr = 0; } +/* + * In single escalation mode, the escalation interrupt is marked so + * that EOI doesn't re-enable it, but just sets the stale_p flag to + * indicate that the P bit has already been dealt with. However, the + * assembly code that enters the guest sets PQ to 00 without clearing + * stale_p (because it has no easy way to address it). Hence we have + * to adjust stale_p before shutting down the interrupt. + */ +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq) +{ + struct irq_data *d = irq_get_irq_data(irq); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* + * This slightly odd sequence gives the right result + * (i.e. stale_p set if xive_esc_on is false) even if + * we race with xive_esc_irq() and xive_irq_eoi(). + */ + xd->stale_p = false; + smp_mb(); /* paired with smb_wmb in xive_esc_irq */ + if (!vcpu->arch.xive_esc_on) + xd->stale_p = true; +} + void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; @@ -1143,6 +1171,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 50494d0ee375..955b820ffd6d 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -282,6 +282,8 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio); int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, bool single_escalation); struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); +void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, + struct kvmppc_xive_vcpu *xc, int irq); #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 11b91b46fc39..f0cab43e6f4b 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -71,6 +71,9 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { /* Free the escalation irq */ if (xc->esc_virq[i]) { + if (xc->xive->single_escalation) + xive_cleanup_single_escalation(vcpu, xc, + xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); irq_dispose_mapping(xc->esc_virq[i]); kfree(xc->esc_virq_names[i]); diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 1cdb39575eae..be86fce1a84e 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -135,7 +135,7 @@ static u32 xive_read_eq(struct xive_q *q, bool just_peek) static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) { u32 irq = 0; - u8 prio; + u8 prio = 0; /* Find highest pending priority */ while (xc->pending_prio != 0) { @@ -148,8 +148,19 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) irq = xive_read_eq(&xc->queue[prio], just_peek); /* Found something ? That's it */ - if (irq) - break; + if (irq) { + if (just_peek || irq_to_desc(irq)) + break; + /* + * We should never get here; if we do then we must + * have failed to synchronize the interrupt properly + * when shutting it down. + */ + pr_crit("xive: got interrupt %d without descriptor, dropping\n", + irq); + WARN_ON(1); + continue; + } /* Clear pending bits */ xc->pending_prio &= ~(1 << prio); @@ -307,6 +318,7 @@ static void xive_do_queue_eoi(struct xive_cpu *xc) */ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) { + xd->stale_p = false; /* If the XIVE supports the new "store EOI facility, use it */ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0); @@ -350,7 +362,7 @@ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) } } -/* irq_chip eoi callback */ +/* irq_chip eoi callback, called with irq descriptor lock held */ static void xive_irq_eoi(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -366,6 +378,8 @@ static void xive_irq_eoi(struct irq_data *d) if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) && !(xd->flags & XIVE_IRQ_NO_EOI)) xive_do_source_eoi(irqd_to_hwirq(d), xd); + else + xd->stale_p = true; /* * Clear saved_p to indicate that it's no longer occupying @@ -397,11 +411,16 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd, */ if (mask) { val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01); - xd->saved_p = !!(val & XIVE_ESB_VAL_P); - } else if (xd->saved_p) + if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P)) + xd->saved_p = true; + xd->stale_p = false; + } else if (xd->saved_p) { xive_esb_read(xd, XIVE_ESB_SET_PQ_10); - else + xd->saved_p = false; + } else { xive_esb_read(xd, XIVE_ESB_SET_PQ_00); + xd->stale_p = false; + } } /* @@ -541,6 +560,8 @@ static unsigned int xive_irq_startup(struct irq_data *d) unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); int target, rc; + xd->saved_p = false; + xd->stale_p = false; pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", d->irq, hw_irq, d); @@ -587,6 +608,7 @@ static unsigned int xive_irq_startup(struct irq_data *d) return 0; } +/* called with irq descriptor lock held */ static void xive_irq_shutdown(struct irq_data *d) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -601,16 +623,6 @@ static void xive_irq_shutdown(struct irq_data *d) /* Mask the interrupt at the source */ xive_do_source_set_mask(xd, true); - /* - * The above may have set saved_p. We clear it otherwise it - * will prevent re-enabling later on. It is ok to forget the - * fact that the interrupt might be in a queue because we are - * accounting that already in xive_dec_target_count() and will - * be re-routing it to a new queue with proper accounting when - * it's started up again - */ - xd->saved_p = false; - /* * Mask the interrupt in HW in the IVT/EAS and set the number * to be the "bad" IRQ number @@ -797,6 +809,10 @@ static int xive_irq_retrigger(struct irq_data *d) return 1; } +/* + * Caller holds the irq descriptor lock, so this won't be called + * concurrently with xive_get_irqchip_state on the same interrupt. + */ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) { struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); @@ -820,6 +836,10 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) /* Set it to PQ=10 state to prevent further sends */ pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10); + if (!xd->stale_p) { + xd->saved_p = !!(pq & XIVE_ESB_VAL_P); + xd->stale_p = !xd->saved_p; + } /* No target ? nothing to do */ if (xd->target == XIVE_INVALID_TARGET) { @@ -827,7 +847,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * An untargetted interrupt should have been * also masked at the source */ - WARN_ON(pq & 2); + WARN_ON(xd->saved_p); return 0; } @@ -847,9 +867,8 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) * This saved_p is cleared by the host EOI, when we know * for sure the queue slot is no longer in use. */ - if (pq & 2) { - pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11); - xd->saved_p = true; + if (xd->saved_p) { + xive_esb_read(xd, XIVE_ESB_SET_PQ_11); /* * Sync the XIVE source HW to ensure the interrupt @@ -862,8 +881,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) */ if (xive_ops->sync_source) xive_ops->sync_source(hw_irq); - } else - xd->saved_p = false; + } } else { irqd_clr_forwarded_to_vcpu(d); @@ -914,6 +932,23 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state) return 0; } +/* Called with irq descriptor lock held. */ +static int xive_get_irqchip_state(struct irq_data *data, + enum irqchip_irq_state which, bool *state) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(data); + + switch (which) { + case IRQCHIP_STATE_ACTIVE: + *state = !xd->stale_p && + (xd->saved_p || + !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P)); + return 0; + default: + return -EINVAL; + } +} + static struct irq_chip xive_irq_chip = { .name = "XIVE-IRQ", .irq_startup = xive_irq_startup, @@ -925,6 +960,7 @@ static struct irq_chip xive_irq_chip = { .irq_set_type = xive_irq_set_type, .irq_retrigger = xive_irq_retrigger, .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity, + .irq_get_irqchip_state = xive_get_irqchip_state, }; bool is_xive_irq(struct irq_chip *chip) @@ -1337,6 +1373,11 @@ static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) raw_spin_lock(&desc->lock); xd = irq_desc_get_handler_data(desc); + /* + * Clear saved_p to indicate that it's no longer pending + */ + xd->saved_p = false; + /* * For LSIs, we EOI, this will cause a resend if it's * still asserted. Otherwise do an MSI retrigger. -- cgit v1.2.3-58-ga151 From 12b58f4ed2a1b93f0eae84f308f6e0a143a533d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 15 Aug 2019 10:22:37 -0700 Subject: KVM: Assert that struct kvm_vcpu is always as offset zero KVM implementations that wrap struct kvm_vcpu with a vendor specific struct, e.g. struct vcpu_vmx, must place the vcpu member at offset 0, otherwise the usercopy region intended to encompass struct kvm_vcpu_arch will instead overlap random chunks of the vendor specific struct. E.g. padding a large number of bytes before struct kvm_vcpu triggers a usercopy warn when running with CONFIG_HARDENED_USERCOPY=y. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/e500.c | 3 +++ arch/x86/kvm/svm.c | 3 +++ arch/x86/kvm/vmx/vmx.c | 3 +++ 3 files changed, 9 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index b5a848a55504..00649ca5fa9a 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -440,6 +440,9 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm, struct kvm_vcpu *vcpu; int err; + BUILD_BUG_ON_MSG(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu_e500) { err = -ENOMEM; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6708bb82d8a8..1f220a85514f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2133,6 +2133,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) struct page *nested_msrpm_pages; int err; + BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!svm) { err = -ENOMEM; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bf70c0292e70..570a233e272b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6625,6 +6625,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) unsigned long *msr_bitmap; int cpu; + BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, + "struct kvm_vcpu must be at offset 0 for arch usercopy region"); + vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT); if (!vmx) return ERR_PTR(-ENOMEM); -- cgit v1.2.3-58-ga151 From ff7240ccf0cdab86762635840c6d62eec7147d18 Mon Sep 17 00:00:00 2001 From: Paul Menzel Date: Tue, 30 Jul 2019 10:53:10 +0200 Subject: KVM: PPC: Book3S: Mark expected switch fall-through MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the error below triggered by `-Wimplicit-fallthrough`, by tagging it as an expected fall-through. arch/powerpc/kvm/book3s_32_mmu.c: In function ‘kvmppc_mmu_book3s_32_xlate_pte’: arch/powerpc/kvm/book3s_32_mmu.c:241:21: error: this statement may fall through [-Werror=implicit-fallthrough=] pte->may_write = true; ~~~~~~~~~~~~~~~^~~~~~ arch/powerpc/kvm/book3s_32_mmu.c:242:5: note: here case 3: ^~~~ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_32_mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index 653936177857..18f244aad7aa 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -239,6 +239,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, case 2: case 6: pte->may_write = true; + /* fall through */ case 3: case 5: case 7: -- cgit v1.2.3-58-ga151 From d22deab6960a6cb015a36e74a2dcbab6ca9f5544 Mon Sep 17 00:00:00 2001 From: Suraj Jitindar Singh Date: Tue, 20 Aug 2019 16:13:49 +1000 Subject: KVM: PPC: Book3S HV: Define usage types for rmap array in guest memslot The rmap array in the guest memslot is an array of size number of guest pages, allocated at memslot creation time. Each rmap entry in this array is used to store information about the guest page to which it corresponds. For example for a hpt guest it is used to store a lock bit, rc bits, a present bit and the index of a hpt entry in the guest hpt which maps this page. For a radix guest which is running nested guests it is used to store a pointer to a linked list of nested rmap entries which store the nested guest physical address which maps this guest address and for which there is a pte in the shadow page table. As there are currently two uses for the rmap array, and the potential for this to expand to more in the future, define a type field (being the top 8 bits of the rmap entry) to be used to define the type of the rmap entry which is currently present and define two values for this field for the two current uses of the rmap array. Since the nested case uses the rmap entry to store a pointer, define this type as having the two high bits set as is expected for a pointer. Define the hpt entry type as having bit 56 set (bit 7 IBM bit ordering). Signed-off-by: Suraj Jitindar Singh Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 22 ++++++++++++++++++---- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e6e5f59aaa97..6fb5fb4779e0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -232,11 +232,25 @@ struct revmap_entry { }; /* - * We use the top bit of each memslot->arch.rmap entry as a lock bit, - * and bit 32 as a present flag. The bottom 32 bits are the - * index in the guest HPT of a HPTE that points to the page. + * The rmap array of size number of guest pages is allocated for each memslot. + * This array is used to store usage specific information about the guest page. + * Below are the encodings of the various possible usage types. */ -#define KVMPPC_RMAP_LOCK_BIT 63 +/* Free bits which can be used to define a new usage */ +#define KVMPPC_RMAP_TYPE_MASK 0xff00000000000000 +#define KVMPPC_RMAP_NESTED 0xc000000000000000 /* Nested rmap array */ +#define KVMPPC_RMAP_HPT 0x0100000000000000 /* HPT guest */ + +/* + * rmap usage definition for a hash page table (hpt) guest: + * 0x0000080000000000 Lock bit + * 0x0000018000000000 RC bits + * 0x0000000100000000 Present bit + * 0x00000000ffffffff HPT index bits + * The bottom 32 bits are the index in the guest HPT of a HPTE that points to + * the page. + */ +#define KVMPPC_RMAP_LOCK_BIT 43 #define KVMPPC_RMAP_RC_SHIFT 32 #define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) #define KVMPPC_RMAP_PRESENT 0x100000000ul diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 63e0ce91e29d..7186c65c61c9 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -99,7 +99,7 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } else { rev->forw = rev->back = pte_index; *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | - pte_index | KVMPPC_RMAP_PRESENT; + pte_index | KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_HPT; } unlock_rmap(rmap); } -- cgit v1.2.3-58-ga151 From 2ad7a27deaf6d78545d97ab80874584f6990360e Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 26 Aug 2019 16:21:21 +1000 Subject: KVM: PPC: Book3S: Enable XIVE native capability only if OPAL has required functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are some POWER9 machines where the OPAL firmware does not support the OPAL_XIVE_GET_QUEUE_STATE and OPAL_XIVE_SET_QUEUE_STATE calls. The impact of this is that a guest using XIVE natively will not be able to be migrated successfully. On the source side, the get_attr operation on the KVM native device for the KVM_DEV_XIVE_GRP_EQ_CONFIG attribute will fail; on the destination side, the set_attr operation for the same attribute will fail. This adds tests for the existence of the OPAL get/set queue state functions, and if they are not supported, the XIVE-native KVM device is not created and the KVM_CAP_PPC_IRQ_XIVE capability returns false. Userspace can then either provide a software emulation of XIVE, or else tell the guest that it does not have a XIVE controller available to it. Cc: stable@vger.kernel.org # v5.2+ Fixes: 3fab2d10588e ("KVM: PPC: Book3S HV: XIVE: Activate XIVE exploitation mode") Reviewed-by: David Gibson Reviewed-by: Cédric Le Goater Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 1 + arch/powerpc/include/asm/xive.h | 1 + arch/powerpc/kvm/book3s.c | 8 +++++--- arch/powerpc/kvm/book3s_xive_native.c | 5 +++++ arch/powerpc/kvm/powerpc.c | 3 ++- arch/powerpc/sysdev/xive/native.c | 7 +++++++ 6 files changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2484e6a8f5ca..8e8514efb124 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -598,6 +598,7 @@ extern int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val); extern int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val); +extern bool kvmppc_xive_native_supported(void); #else static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server, diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index efb0e597b272..818989e11678 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -135,6 +135,7 @@ extern int xive_native_get_queue_state(u32 vp_id, uint32_t prio, u32 *qtoggle, extern int xive_native_set_queue_state(u32 vp_id, uint32_t prio, u32 qtoggle, u32 qindex); extern int xive_native_get_vp_state(u32 vp_id, u64 *out_state); +extern bool xive_native_has_queue_state_support(void); #else diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 9524d92bc45d..d7fcdfa7fee4 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1083,9 +1083,11 @@ static int kvmppc_book3s_init(void) if (xics_on_xive()) { kvmppc_xive_init_module(); kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS); - kvmppc_xive_native_init_module(); - kvm_register_device_ops(&kvm_xive_native_ops, - KVM_DEV_TYPE_XIVE); + if (kvmppc_xive_native_supported()) { + kvmppc_xive_native_init_module(); + kvm_register_device_ops(&kvm_xive_native_ops, + KVM_DEV_TYPE_XIVE); + } } else #endif kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index f0cab43e6f4b..248c1ea9e788 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1179,6 +1179,11 @@ int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val) return 0; } +bool kvmppc_xive_native_supported(void) +{ + return xive_native_has_queue_state_support(); +} + static int xive_native_debug_show(struct seq_file *m, void *private) { struct kvmppc_xive *xive = m->private; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0dba7eb24f92..7012dd709787 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -566,7 +566,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * a POWER9 processor) and the PowerNV platform, as * nested is not yet supported. */ - r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE); + r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE) && + kvmppc_xive_native_supported(); break; #endif diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 2f26b74f6cfa..37987c815913 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -800,6 +800,13 @@ int xive_native_set_queue_state(u32 vp_id, u32 prio, u32 qtoggle, u32 qindex) } EXPORT_SYMBOL_GPL(xive_native_set_queue_state); +bool xive_native_has_queue_state_support(void) +{ + return opal_check_token(OPAL_XIVE_GET_QUEUE_STATE) && + opal_check_token(OPAL_XIVE_SET_QUEUE_STATE); +} +EXPORT_SYMBOL_GPL(xive_native_has_queue_state_support); + int xive_native_get_vp_state(u32 vp_id, u64 *out_state) { __be64 state; -- cgit v1.2.3-58-ga151 From d28eafc5a64045c78136162af9d4ba42f8230080 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 27 Aug 2019 11:31:37 +1000 Subject: KVM: PPC: Book3S HV: Check for MMU ready on piggybacked virtual cores When we are running multiple vcores on the same physical core, they could be from different VMs and so it is possible that one of the VMs could have its arch.mmu_ready flag cleared (for example by a concurrent HPT resize) when we go to run it on a physical core. We currently check the arch.mmu_ready flag for the primary vcore but not the flags for the other vcores that will be run alongside it. This adds that check, and also a check when we select the secondary vcores from the preempted vcores list. Cc: stable@vger.kernel.org # v4.14+ Fixes: 38c53af85306 ("KVM: PPC: Book3S HV: Fix exclusion between HPT resizing and other HPT updates") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index cde3f5a4b3e4..36d72e9faddf 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2860,7 +2860,7 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) if (!spin_trylock(&pvc->lock)) continue; prepare_threads(pvc); - if (!pvc->n_runnable) { + if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) { list_del_init(&pvc->preempt_list); if (pvc->runner == NULL) { pvc->vcore_state = VCORE_INACTIVE; @@ -2881,15 +2881,20 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) spin_unlock(&lp->lock); } -static bool recheck_signals(struct core_info *cip) +static bool recheck_signals_and_mmu(struct core_info *cip) { int sub, i; struct kvm_vcpu *vcpu; + struct kvmppc_vcore *vc; - for (sub = 0; sub < cip->n_subcores; ++sub) - for_each_runnable_thread(i, vcpu, cip->vc[sub]) + for (sub = 0; sub < cip->n_subcores; ++sub) { + vc = cip->vc[sub]; + if (!vc->kvm->arch.mmu_ready) + return true; + for_each_runnable_thread(i, vcpu, vc) if (signal_pending(vcpu->arch.run_task)) return true; + } return false; } @@ -3119,7 +3124,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) local_irq_disable(); hard_irq_disable(); if (lazy_irq_pending() || need_resched() || - recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) { + recheck_signals_and_mmu(&core_info)) { local_irq_enable(); vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ -- cgit v1.2.3-58-ga151 From ff42df49e75f053a8a6b4c2533100cdcc23afe69 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 27 Aug 2019 11:35:40 +1000 Subject: KVM: PPC: Book3S HV: Don't lose pending doorbell request on migration on P9 On POWER9, when userspace reads the value of the DPDES register on a vCPU, it is possible for 0 to be returned although there is a doorbell interrupt pending for the vCPU. This can lead to a doorbell interrupt being lost across migration. If the guest kernel uses doorbell interrupts for IPIs, then it could malfunction because of the lost interrupt. This happens because a newly-generated doorbell interrupt is signalled by setting vcpu->arch.doorbell_request to 1; the DPDES value in vcpu->arch.vcore->dpdes is not updated, because it can only be updated when holding the vcpu mutex, in order to avoid races. To fix this, we OR in vcpu->arch.doorbell_request when reading the DPDES value. Cc: stable@vger.kernel.org # v4.13+ Fixes: 579006944e0d ("KVM: PPC: Book3S HV: Virtualize doorbell facility on POWER9") Signed-off-by: Paul Mackerras Tested-by: Alexey Kardashevskiy --- arch/powerpc/kvm/book3s_hv.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 36d72e9faddf..f8975c620f41 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1678,7 +1678,14 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, *val = get_reg_val(id, vcpu->arch.pspb); break; case KVM_REG_PPC_DPDES: - *val = get_reg_val(id, vcpu->arch.vcore->dpdes); + /* + * On POWER9, where we are emulating msgsndp etc., + * we return 1 bit for each vcpu, which can come from + * either vcore->dpdes or doorbell_request. + * On POWER8, doorbell_request is 0. + */ + *val = get_reg_val(id, vcpu->arch.vcore->dpdes | + vcpu->arch.doorbell_request); break; case KVM_REG_PPC_VTB: *val = get_reg_val(id, vcpu->arch.vcore->vtb); -- cgit v1.2.3-58-ga151