From cda2eaa35948893d70145490d5d6ded546fc3bc6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Nov 2017 16:40:24 +1100 Subject: KVM: PPC: Book3S HV: Avoid shifts by negative amounts The kvmppc_hpte_page_shifts function decodes the actual and base page sizes for a HPTE, returning -1 if it doesn't recognize the page size encoding. This then gets used as a shift amount in various places, which is undefined behaviour. This was reported by Coverity. In fact this should never occur, since we should only get HPTEs in the HPT which have a recognized page size encoding. The only place where this might not be true is in the call to kvmppc_actual_pgsz() near the beginning of kvmppc_do_h_enter(), where we are validating the HPTE value passed in from the guest. So to fix this and eliminate the undefined behaviour, we make kvmppc_hpte_page_shifts return 0 for unrecognized page size encodings, and make kvmppc_actual_pgsz() detect that case and return 0 for the page size, which will then cause kvmppc_do_h_enter() to return an error and refuse to insert any HPTE with an unrecognized page size encoding. To ensure that we don't get undefined behaviour in compute_tlbie_rb(), we take the 4k page size path for any unrecognized page size encoding. This should never be hit in practice because it is only used on HPTE values which have previously been checked for having a recognized page size encoding. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_64.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 735cfa35298a..998f7b7aaa9e 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -122,13 +122,13 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) lphi = (l >> 16) & 0xf; switch ((l >> 12) & 0xf) { case 0: - return !lphi ? 24 : -1; /* 16MB */ + return !lphi ? 24 : 0; /* 16MB */ break; case 1: return 16; /* 64kB */ break; case 3: - return !lphi ? 34 : -1; /* 16GB */ + return !lphi ? 34 : 0; /* 16GB */ break; case 7: return (16 << 8) + 12; /* 64kB in 4kB */ @@ -140,7 +140,7 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) return (24 << 8) + 12; /* 16MB in 4kB */ break; } - return -1; + return 0; } static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l) @@ -159,7 +159,11 @@ static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r) { - return 1ul << kvmppc_hpte_actual_page_shift(v, r); + int shift = kvmppc_hpte_actual_page_shift(v, r); + + if (shift) + return 1ul << shift; + return 0; } static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift) @@ -232,7 +236,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, va_low ^= v >> (SID_SHIFT_1T - 16); va_low &= 0x7ff; - if (b_pgshift == 12) { + if (b_pgshift <= 12) { if (a_pgshift > 12) { sllp = (a_pgshift == 16) ? 5 : 4; rb |= sllp << 5; /* AP field */ -- cgit From 117647ff936e2d9684cc881d87c0291f46669c20 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Nov 2017 16:43:35 +1100 Subject: KVM: PPC: Book3S HV: Fix typo in kvmppc_hv_get_dirty_log_radix() This fixes a typo where the intent was to assign to 'j' in order to skip some number of bits in the dirty bitmap for a guest. 
The effect of the typo is benign since it means we just iterate through all the bits rather than skipping bits which we know will be zero. This issue was found by Coverity. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 58618f644c56..0c854816e653 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -573,7 +573,7 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, j = i + 1; if (npages) { set_dirty_bits(map, i, npages); - i = j + npages; + j = i + npages; } } return 0; -- cgit From 4fcf361dbdbdb43038bb173e2391c4073e713745 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Nov 2017 14:17:53 +1100 Subject: KVM: PPC: Book3S HV: Remove useless statement This removes a statement that has no effect. It should have been removed in commit 898b25b202f3 ("KVM: PPC: Book3S HV: Simplify dynamic micro-threading code", 2017-06-22) along with the loop over the piggy-backed virtual cores. This issue was reported by Coverity. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2d46037ce936..597498d6db2e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2831,7 +2831,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ if (!thr0_done) kvmppc_start_thread(NULL, pvc); - thr += pvc->num_threads; } /* -- cgit From c0093f1a38a0fd6c32a2269f0533bb13fb95143d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Nov 2017 16:12:25 +1100 Subject: KVM: PPC: Book3S HV: Fix conditions for starting vcpu This corrects the test that determines whether a vcpu that has just become able to run in the guest (e.g. it has just finished handling a hypercall or hypervisor page fault) and whose virtual core is already running somewhere as a "piggybacked" vcore can start immediately or not. (A piggybacked vcore is one which is executing along with another vcore as a result of dynamic micro-threading.) Previously the test tried to lock the piggybacked vcore using spin_trylock, which would always fail because the vcore was already locked, and so the vcpu would have to wait until its vcore exited the guest before it could enter. In fact the vcpu can enter if its vcore is in VCORE_PIGGYBACK state and not already exiting (or exited) the guest, so the test in VCORE_PIGGYBACK state is basically the same as for VCORE_RUNNING state. Coverity detected this as a double unlock issue, which it isn't because the spin_trylock would always fail. This will fix the apparent double unlock as well. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 597498d6db2e..c4f0bebfc5ba 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3175,17 +3175,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * this thread straight away and have it join in. 
*/ if (!signal_pending(current)) { - if (vc->vcore_state == VCORE_PIGGYBACK) { - if (spin_trylock(&vc->lock)) { - if (vc->vcore_state == VCORE_RUNNING && - !VCORE_IS_EXITING(vc)) { - kvmppc_create_dtl_entry(vcpu, vc); - kvmppc_start_thread(vcpu, vc); - trace_kvm_guest_enter(vcpu); - } - spin_unlock(&vc->lock); - } - } else if (vc->vcore_state == VCORE_RUNNING && + if ((vc->vcore_state == VCORE_PIGGYBACK || + vc->vcore_state == VCORE_RUNNING) && !VCORE_IS_EXITING(vc)) { kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu, vc); -- cgit From 9aa6825bbb7526a7fdec137b7cc3b042581cd2fc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Nov 2017 19:56:27 +1100 Subject: KVM: PPC: Book3S: Eliminate some unnecessary checks In an excess of caution, commit 6f63e81bda98 ("KVM: PPC: Book3S: Add MMIO emulation for FP and VSX instructions", 2017-02-21) included checks for the case that vcpu->arch.mmio_vsx_copy_nums is less than zero, even though its type is u8. This causes a Coverity warning, so we remove the check for < 0. We also adjust the associated comment to be more accurate ("4 or less" rather than "less than 4"). Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6b6c53c42ac9..c2c7ef330553 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1101,11 +1101,9 @@ int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, { enum emulation_result emulated = EMULATE_DONE; - /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ - if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || - (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */ + if (vcpu->arch.mmio_vsx_copy_nums > 4) return EMULATE_FAIL; - } while (vcpu->arch.mmio_vsx_copy_nums) { emulated = __kvmppc_handle_load(run, vcpu, rt, bytes, @@ -1247,11 +1245,9 @@ int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.io_gpr = rs; - /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ - if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || - (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */ + if (vcpu->arch.mmio_vsx_copy_nums > 4) return EMULATE_FAIL; - } while (vcpu->arch.mmio_vsx_copy_nums) { if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1) -- cgit From 81ceca05a42e24768c73d4d6dbd3701a60f1ed85 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 19 Dec 2017 15:56:24 +0100 Subject: KVM: PPC: Book3S HV: Remove vcpu->arch.dec usage On Book3S in HV mode, we don't use the vcpu->arch.dec field at all. Instead, all logic is built around vcpu->arch.dec_expires. So let's remove the one remaining piece of code that was setting it. 
Signed-off-by: Alexander Graf Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2659844784b8..c8ffd69adfec 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -957,7 +957,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) mftb r7 subf r3,r7,r8 mtspr SPRN_DEC,r3 - std r3,VCPU_DEC(r4) ld r5, VCPU_SPRG0(r4) ld r6, VCPU_SPRG1(r4) -- cgit From 1627301020cb460f5a74e13c291f8db3b2a8062e Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 7 Jan 2018 10:07:36 +0100 Subject: KVM: PPC: Use seq_puts() in kvmppc_exit_timing_show() A headline should be quickly put into a sequence. Thus use the function "seq_puts" instead of "seq_printf" for this purpose. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/timing.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index e44d2b2ea97e..1c03c978eb18 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -143,8 +143,7 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private) int i; u64 min, max, sum, sum_quad; - seq_printf(m, "%s", "type count min max sum sum_squared\n"); - + seq_puts(m, "type count min max sum sum_squared\n"); for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { -- cgit From 12c1f339cd49119e39063ae67f02d936f988c079 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:39:27 +1100 Subject: powerpc/xive: Move definition of ESB bits From xive.h to xive-regs.h since it's a HW register definition and it can be used from assembly Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/xive-regs.h | 35 +++++++++++++++++++++++++++++++++++ arch/powerpc/include/asm/xive.h | 35 ----------------------------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h index 1d3f2be5ae39..fa4288822b68 100644 --- a/arch/powerpc/include/asm/xive-regs.h +++ b/arch/powerpc/include/asm/xive-regs.h @@ -9,6 +9,41 @@ #ifndef _ASM_POWERPC_XIVE_REGS_H #define _ASM_POWERPC_XIVE_REGS_H +/* + * "magic" Event State Buffer (ESB) MMIO offsets. + * + * Each interrupt source has a 2-bit state machine called ESB + * which can be controlled by MMIO. It's made of 2 bits, P and + * Q. P indicates that an interrupt is pending (has been sent + * to a queue and is waiting for an EOI). Q indicates that the + * interrupt has been triggered while pending. + * + * This acts as a coalescing mechanism in order to guarantee + * that a given interrupt only occurs at most once in a queue. + * + * When doing an EOI, the Q bit will indicate if the interrupt + * needs to be re-triggered. + * + * The following offsets into the ESB MMIO allow to read or + * manipulate the PQ bits. They must be used with an 8-bytes + * load instruction. They all return the previous state of the + * interrupt (atomically). + * + * Additionally, some ESB pages support doing an EOI via a + * store at 0 and some ESBs support doing a trigger via a + * separate trigger page. 
+ */ +#define XIVE_ESB_STORE_EOI 0x400 /* Store */ +#define XIVE_ESB_LOAD_EOI 0x000 /* Load */ +#define XIVE_ESB_GET 0x800 /* Load */ +#define XIVE_ESB_SET_PQ_00 0xc00 /* Load */ +#define XIVE_ESB_SET_PQ_01 0xd00 /* Load */ +#define XIVE_ESB_SET_PQ_10 0xe00 /* Load */ +#define XIVE_ESB_SET_PQ_11 0xf00 /* Load */ + +#define XIVE_ESB_VAL_P 0x2 +#define XIVE_ESB_VAL_Q 0x1 + /* * Thread Management (aka "TM") registers */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 371fbebf1ec9..0e77005cf021 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -72,41 +72,6 @@ struct xive_q { atomic_t pending_count; }; -/* - * "magic" Event State Buffer (ESB) MMIO offsets. - * - * Each interrupt source has a 2-bit state machine called ESB - * which can be controlled by MMIO. It's made of 2 bits, P and - * Q. P indicates that an interrupt is pending (has been sent - * to a queue and is waiting for an EOI). Q indicates that the - * interrupt has been triggered while pending. - * - * This acts as a coalescing mechanism in order to guarantee - * that a given interrupt only occurs at most once in a queue. - * - * When doing an EOI, the Q bit will indicate if the interrupt - * needs to be re-triggered. - * - * The following offsets into the ESB MMIO allow to read or - * manipulate the PQ bits. They must be used with an 8-bytes - * load instruction. They all return the previous state of the - * interrupt (atomically). - * - * Additionally, some ESB pages support doing an EOI via a - * store at 0 and some ESBs support doing a trigger via a - * separate trigger page. - */ -#define XIVE_ESB_STORE_EOI 0x400 /* Store */ -#define XIVE_ESB_LOAD_EOI 0x000 /* Load */ -#define XIVE_ESB_GET 0x800 /* Load */ -#define XIVE_ESB_SET_PQ_00 0xc00 /* Load */ -#define XIVE_ESB_SET_PQ_01 0xd00 /* Load */ -#define XIVE_ESB_SET_PQ_10 0xe00 /* Load */ -#define XIVE_ESB_SET_PQ_11 0xf00 /* Load */ - -#define XIVE_ESB_VAL_P 0x2 -#define XIVE_ESB_VAL_Q 0x1 - /* Global enable flags for the XIVE support */ extern bool __xive_enabled; -- cgit From 7f1c410da59090f9bb2300efebbc3b717594d64c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:39:28 +1100 Subject: powerpc/xive: Add interrupt flag to disable automatic EOI This will be used by KVM in order to keep escalation interrupts in the non-EOI (masked) state after they fire. They will be re-enabled directly in HW by KVM when needed. 
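Re-enabling them "directly in HW" amounts to flipping the source's ESB PQ bits through the MMIO offsets defined above. Purely as an illustration (not part of this patch; example_mask_source and esb_mmio are hypothetical names, and the kernel itself goes through helpers such as xive_esb_read() rather than open-coded loads), a minimal kernel-context sketch of masking a source and checking whether it was already pending:

  /* Sketch only: mask a source by setting its ESB to PQ = 01. */
  static u64 example_mask_source(void __iomem *esb_mmio)
  {
          /*
           * An 8-byte load at SET_PQ_01 atomically sets PQ = 01
           * (masked) and returns the previous PQ state in the low bits.
           */
          u64 prev = in_be64(esb_mmio + XIVE_ESB_SET_PQ_01);

          /*
           * If P was already set, the interrupt has been sent to a
           * queue and is still awaiting an EOI.
           */
          if (prev & XIVE_ESB_VAL_P)
                  pr_debug("source was pending when masked\n");

          return prev;
  }

A later load at XIVE_ESB_SET_PQ_00 undoes the masking in the same way.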
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/xive.h | 3 +++ arch/powerpc/sysdev/xive/common.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 0e77005cf021..b619a5585cd6 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -58,6 +58,9 @@ struct xive_irq_data { #define XIVE_IRQ_FLAG_EOI_FW 0x10 #define XIVE_IRQ_FLAG_H_INT_ESB 0x20 +/* Special flag set by KVM for escalation interrupts */ +#define XIVE_IRQ_NO_EOI 0x80 + #define XIVE_INVALID_CHIP_ID -1 /* A queue tracking structure in a CPU */ diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index a3b8d7d1316e..2547b6021e6a 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -367,7 +367,8 @@ static void xive_irq_eoi(struct irq_data *d) * EOI the source if it hasn't been disabled and hasn't * been passed-through to a KVM guest */ - if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d)) + if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) && + !(xd->flags & XIVE_IRQ_NO_EOI)) xive_do_source_eoi(irqd_to_hwirq(d), xd); /* -- cgit From 5855564c8ab2d9cefca7b2933bd19818eb795e40 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 12 Jan 2018 20:55:20 +1100 Subject: KVM: PPC: Book3S HV: Enable migration of decrementer register This adds a register identifier for use with the one_reg interface to allow the decrementer expiry time to be read and written by userspace. The decrementer expiry time is in guest timebase units and is equal to the sum of the decrementer and the guest timebase. (The expiry time is used rather than the decrementer value itself because the expiry time is not constantly changing, though the decrementer value is, while the guest vcpu is not running.) Without this, a guest vcpu migrated to a new host will see its decrementer set to some random value. On POWER8 and earlier, the decrementer is 32 bits wide and counts down at 512MHz, so the guest vcpu will potentially see no decrementer interrupts for up to about 4 seconds, which will lead to a stall. With POWER9, the decrementer is now 56 bits wide, so the stall can be much longer (up to 2.23 years) and more noticeable. To help work around the problem in cases where userspace has not been updated to migrate the decrementer expiry time, we now set the default decrementer expiry at vcpu creation time to the current time rather than the maximum possible value. This should mean an immediate decrementer interrupt when a migrated vcpu starts running. In cases where the decrementer is 32 bits wide and more than 4 seconds elapse between the creation of the vcpu and when it first runs, the decrementer would have wrapped around to positive values and there may still be a stall - but this is no worse than the current situation. In the large-decrementer case, we are sure to get an immediate decrementer interrupt (assuming the time from vcpu creation to first run is less than 2.23 years) and we thus avoid a very long stall.
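From userspace the new register is accessed like any other one_reg. A minimal, hypothetical sketch (error handling omitted; vcpu_fd is assumed to be an open vcpu file descriptor, and the headers are assumed to carry the new KVM_REG_PPC_DEC_EXPIRY define):

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Source host: read the expiry time, in guest timebase units. */
  static uint64_t get_dec_expiry(int vcpu_fd)
  {
          uint64_t val;
          struct kvm_one_reg reg = {
                  .id = KVM_REG_PPC_DEC_EXPIRY,
                  .addr = (uintptr_t)&val,
          };

          ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
          return val;
  }

  /* Destination host: restore it before the vcpu first runs. */
  static void set_dec_expiry(int vcpu_fd, uint64_t val)
  {
          struct kvm_one_reg reg = {
                  .id = KVM_REG_PPC_DEC_EXPIRY,
                  .addr = (uintptr_t)&val,
          };

          ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
  }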
Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 1 + arch/powerpc/include/uapi/asm/kvm.h | 2 ++ arch/powerpc/kvm/book3s_hv.c | 8 ++++++++ arch/powerpc/kvm/powerpc.c | 2 +- 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f670e4b9e7f3..c6f9eebb79f2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1841,6 +1841,7 @@ registers, find a list below: PPC | KVM_REG_PPC_DBSR | 32 PPC | KVM_REG_PPC_TIDR | 64 PPC | KVM_REG_PPC_PSSCR | 64 + PPC | KVM_REG_PPC_DEC_EXPIRY | 64 PPC | KVM_REG_PPC_TM_GPR0 | 64 ... PPC | KVM_REG_PPC_TM_GPR31 | 64 diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 61d6049f4c1e..8aaec831053a 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -607,6 +607,8 @@ struct kvm_ppc_rmmu_info { #define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc) #define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd) +#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) + /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c4f0bebfc5ba..b2d448c75008 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1497,6 +1497,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_ARCH_COMPAT: *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); break; + case KVM_REG_PPC_DEC_EXPIRY: + *val = get_reg_val(id, vcpu->arch.dec_expires + + vcpu->arch.vcore->tb_offset); + break; default: r = -EINVAL; break; @@ -1724,6 +1728,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_ARCH_COMPAT: r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); break; + case KVM_REG_PPC_DEC_EXPIRY: + vcpu->arch.dec_expires = set_reg_val(id, *val) - + vcpu->arch.vcore->tb_offset; + break; default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c2c7ef330553..7c9e45f54186 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -758,7 +758,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; - vcpu->arch.dec_expires = ~(u64)0; + vcpu->arch.dec_expires = get_tb(); #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); -- cgit From 43ff3f65234061e08d234bdef5a9aadc19832b74 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jan 2018 14:31:43 +1100 Subject: KVM: PPC: Book3S HV: Make sure we don't re-enter guest without XIVE loaded This fixes a bug where it is possible to enter a guest on a POWER9 system without having the XIVE (interrupt controller) context loaded. This can happen because we unload the XIVE context from the CPU before doing the real-mode handling for machine checks. After the real-mode handler runs, it is possible that we re-enter the guest via a fast path which does not load the XIVE context. To fix this, we move the unloading of the XIVE context to come after the real-mode machine check handler is called. 
Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c8ffd69adfec..76332a3f6c0d 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1423,6 +1423,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) blt deliver_guest_interrupt guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ + /* Save more register state */ + mfdar r6 + mfdsisr r7 + std r6, VCPU_DAR(r9) + stw r7, VCPU_DSISR(r9) + /* don't overwrite fault_dar/fault_dsisr if HDSI */ + cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE + beq mc_cont + std r6, VCPU_FAULT_DAR(r9) + stw r7, VCPU_FAULT_DSISR(r9) + + /* See if it is a machine check */ + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK + beq machine_check_realmode +mc_cont: +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r9, VCPU_TB_RMEXIT + mr r4, r9 + bl kvmhv_accumulate_time +#endif #ifdef CONFIG_KVM_XICS /* We are exiting, pull the VP from the XIVE */ lwz r0, VCPU_XIVE_PUSHED(r9) @@ -1460,26 +1480,6 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ eieio 1: #endif /* CONFIG_KVM_XICS */ - /* Save more register state */ - mfdar r6 - mfdsisr r7 - std r6, VCPU_DAR(r9) - stw r7, VCPU_DSISR(r9) - /* don't overwrite fault_dar/fault_dsisr if HDSI */ - cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE - beq mc_cont - std r6, VCPU_FAULT_DAR(r9) - stw r7, VCPU_FAULT_DSISR(r9) - - /* See if it is a machine check */ - cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK - beq machine_check_realmode -mc_cont: -#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING - addi r3, r9, VCPU_TB_RMEXIT - mr r4, r9 - bl kvmhv_accumulate_time -#endif mr r3, r12 /* Increment exit count, poke other threads to exit */ -- cgit From 6964e6a4e4894c707e42d51d9d30683c57f43201 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jan 2018 14:51:02 +1100 Subject: KVM: PPC: Book3S HV: Do SLB load/unload with guest LPCR value loaded This moves the code that loads and unloads the guest SLB values so that it is done while the guest LPCR value is loaded in the LPCR register. The reason for doing this is that on POWER9, the behaviour of the slbmte instruction depends on the LPCR[UPRT] bit. If UPRT is 1, as it is for a radix host (or guest), the SLB index is truncated to 2 bits. This means that for a HPT guest on a radix host, the SLB was not being loaded correctly, causing the guest to crash. The SLB is now loaded much later in the guest entry path, after the LPCR is loaded, which for a secondary thread is after it sees that the primary thread has switched the MMU to the guest. The loop that waits for the primary thread has a branch out to the exit code that is taken if it sees that other threads have commenced exiting the guest. Since we have now not loaded the SLB at this point, we make this path branch to a new label 'guest_bypass' and we move the SLB unload code to before this label. 
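(For concreteness: with LPCR[UPRT] = 1, an slbmte specifying SLB index 5 actually writes entry 1, the index truncated to 2 bits, so loading a guest SLB with more than four entries while the host's radix LPCR value is still in force silently aliases entries onto slots 0-3.)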
Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 109 ++++++++++++++++---------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 76332a3f6c0d..30ece4cebaf5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -617,13 +617,6 @@ kvmppc_hv_entry: lbz r0, KVM_RADIX(r9) cmpwi cr7, r0, 0 - /* Clear out SLB if hash */ - bne cr7, 2f - li r6,0 - slbmte r6,r6 - slbia - ptesync -2: /* * POWER7/POWER8 host -> guest partition switch code. * We don't have to lock against concurrent tlbies, @@ -738,19 +731,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 10: cmpdi r4, 0 beq kvmppc_primary_no_guest kvmppc_got_guest: - - /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ - lwz r5,VCPU_SLB_MAX(r4) - cmpwi r5,0 - beq 9f - mtctr r5 - addi r6,r4,VCPU_SLB -1: ld r8,VCPU_SLB_E(r6) - ld r9,VCPU_SLB_V(r6) - slbmte r9,r8 - addi r6,r6,VCPU_SLB_SIZE - bdnz 1b -9: /* Increment yield count if they have a VPA */ ld r3, VCPU_VPA(r4) cmpdi r3, 0 @@ -1017,6 +997,29 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) cmpdi r3, 512 /* 1 microsecond */ blt hdec_soon + /* For hash guest, clear out and reload the SLB */ + ld r6, VCPU_KVM(r4) + lbz r0, KVM_RADIX(r6) + cmpwi r0, 0 + bne 9f + li r6, 0 + slbmte r6, r6 + slbia + ptesync + + /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ + lwz r5,VCPU_SLB_MAX(r4) + cmpwi r5,0 + beq 9f + mtctr r5 + addi r6,r4,VCPU_SLB +1: ld r8,VCPU_SLB_E(r6) + ld r9,VCPU_SLB_V(r6) + slbmte r9,r8 + addi r6,r6,VCPU_SLB_SIZE + bdnz 1b +9: + #ifdef CONFIG_KVM_XICS /* We are entering the guest on that thread, push VCPU to XIVE */ ld r10, HSTATE_XIVE_TIMA_PHYS(r13) @@ -1193,7 +1196,7 @@ hdec_soon: addi r3, r4, VCPU_TB_RMEXIT bl kvmhv_accumulate_time #endif - b guest_exit_cont + b guest_bypass /****************************************************************************** * * @@ -1481,34 +1484,12 @@ mc_cont: 1: #endif /* CONFIG_KVM_XICS */ - mr r3, r12 - /* Increment exit count, poke other threads to exit */ - bl kvmhv_commence_exit - nop - ld r9, HSTATE_KVM_VCPU(r13) - lwz r12, VCPU_TRAP(r9) - - /* Stop others sending VCPU interrupts to this physical CPU */ - li r0, -1 - stw r0, VCPU_CPU(r9) - stw r0, VCPU_THREAD_CPU(r9) - - /* Save guest CTRL register, set runlatch to 1 */ - mfspr r6,SPRN_CTRLF - stw r6,VCPU_CTRL(r9) - andi. r0,r6,1 - bne 4f - ori r6,r6,1 - mtspr SPRN_CTRLT,r6 -4: - /* Check if we are running hash or radix and store it in cr2 */ + /* For hash guest, read the guest SLB and save it away */ ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) - cmpwi cr2,r0,0 - - /* Read the guest SLB and save it away */ li r5, 0 - bne cr2, 3f /* for radix, save 0 entries */ + cmpwi r0, 0 + bne 3f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 @@ -1524,8 +1505,34 @@ mc_cont: addi r5,r5,1 2: addi r6,r6,1 bdnz 1b + /* Finally clear out the SLB */ + li r0,0 + slbmte r0,r0 + slbia + ptesync 3: stw r5,VCPU_SLB_MAX(r9) +guest_bypass: + mr r3, r12 + /* Increment exit count, poke other threads to exit */ + bl kvmhv_commence_exit + nop + ld r9, HSTATE_KVM_VCPU(r13) + lwz r12, VCPU_TRAP(r9) + + /* Stop others sending VCPU interrupts to this physical CPU */ + li r0, -1 + stw r0, VCPU_CPU(r9) + stw r0, VCPU_THREAD_CPU(r9) + + /* Save guest CTRL register, set runlatch to 1 */ + mfspr r6,SPRN_CTRLF + stw r6,VCPU_CTRL(r9) + andi. 
r0,r6,1 + bne 4f + ori r6,r6,1 + mtspr SPRN_CTRLT,r6 +4: /* * Save the guest PURR/SPURR */ @@ -1803,7 +1810,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) cmpwi cr2, r0, 0 - beq cr2, 3f + beq cr2, 4f /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) @@ -1839,15 +1846,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) - b 4f +4: #endif /* CONFIG_PPC_RADIX_MMU */ - /* Hash: clear out SLB */ -3: li r5,0 - slbmte r5,r5 - slbia - ptesync -4: /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do -- cgit From 00608e1f007e4cf6031485c5630e0e504bceef9b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jan 2018 16:54:26 +1100 Subject: KVM: PPC: Book3S HV: Allow HPT and radix on the same core for POWER9 v2.2 POWER9 chip versions starting with "Nimbus" v2.2 can support running with some threads of a core in HPT mode and others in radix mode. This means that we don't have to prohibit independent-threads mode when running a HPT guest on a radix host, and we don't have to do any of the synchronization between threads that was introduced in commit c01015091a77 ("KVM: PPC: Book3S HV: Run HPT guests on POWER9 radix hosts", 2017-10-19). Rather than using up another CPU feature bit, we just do an explicit test on the PVR (processor version register) at module startup time to determine whether we have to take steps to avoid having some threads in HPT mode and some in radix mode (so-called "mixed mode"). We test for "Nimbus" (indicated by 0 or 1 in the top nibble of the lower 16 bits) v2.2 or later, or "Cumulus" (indicated by 2 or 3 in that nibble) v1.1 or later. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index b2d448c75008..76cf48051eb3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -118,6 +118,9 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif +/* If set, the threads on each CPU core have to be in the same MMU mode */ +static bool no_mixing_hpt_and_radix; + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -2386,8 +2389,8 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) static bool subcore_config_ok(int n_subcores, int n_threads) { /* - * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core - * mode, with one thread per subcore. + * POWER9 "SMT4" cores are permanently in what is effectively a 4-way + * split-core mode, with one thread per subcore. 
*/ if (cpu_has_feature(CPU_FTR_ARCH_300)) return n_subcores <= 4 && n_threads == 1; @@ -2423,8 +2426,8 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return false; - /* POWER9 currently requires all threads to be in the same MMU mode */ - if (cpu_has_feature(CPU_FTR_ARCH_300) && + /* Some POWER9 chips require all threads to be in the same MMU mode */ + if (no_mixing_hpt_and_radix && kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) return false; @@ -2687,9 +2690,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. * On POWER9, we need to be not in independent-threads mode if - * this is a HPT guest on a radix host. + * this is a HPT guest on a radix host machine where the + * CPU threads may not be in different MMU modes. */ - hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm); + hpt_on_radix = no_mixing_hpt_and_radix && radix_enabled() && + !kvm_is_radix(vc->kvm); if (((controlled_threads > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || (hpt_on_radix && vc->kvm->arch.threads_indep)) { @@ -4446,6 +4451,19 @@ static int kvmppc_book3s_init_hv(void) if (kvmppc_radix_possible()) r = kvmppc_radix_init(); + + /* + * POWER9 chips before version 2.02 can't have some threads in + * HPT mode and some in radix mode on the same core. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + unsigned int pvr = mfspr(SPRN_PVR); + if ((pvr >> 16) == PVR_POWER9 && + (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) || + ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101))) + no_mixing_hpt_and_radix = true; + } + return r; } -- cgit From d075745d893c78730e4a3b7a60fca23c2f764081 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 17 Jan 2018 20:51:13 +1100 Subject: KVM: PPC: Book3S HV: Improve handling of debug-trigger HMIs on POWER9 Hypervisor maintenance interrupts (HMIs) are generated by various causes, signalled by bits in the hypervisor maintenance exception register (HMER). In most cases calling OPAL to handle the interrupt is the correct thing to do, but the "debug trigger" HMIs signalled by PPC bit 17 (bit 46) of HMER are used to invoke software workarounds for hardware bugs, and OPAL does not have any code to handle this cause. The debug trigger HMI is used in POWER9 DD2.0 and DD2.1 chips to work around a hardware bug in executing vector load instructions to cache inhibited memory. In POWER9 DD2.2 chips, it is generated when conditions are detected relating to threads being in TM (transactional memory) suspended mode when the core SMT configuration needs to be reconfigured. The kernel currently has code to detect the vector CI load condition, but only when the HMI occurs in the host, not when it occurs in a guest. If a HMI occurs in the guest, it is always passed to OPAL, and then we always re-sync the timebase, because the HMI cause might have been a timebase error, for which OPAL would re-sync the timebase, thus removing the timebase offset which KVM applied for the guest. Since we don't know what OPAL did, we don't know whether to subtract the timebase offset from the timebase, so instead we re-sync the timebase. This adds code to determine explicitly what the cause of a debug trigger HMI will be. 
This is based on a new device-tree property under the CPU nodes called ibm,hmi-special-triggers, if it is present, or otherwise based on the PVR (processor version register). The handling of debug trigger HMIs is pulled out into a separate function which can be called from the KVM guest exit code. If this function handles and clears the HMI, and no other HMI causes remain, then we skip calling OPAL and we proceed to subtract the guest timebase offset from the timebase. The overall handling for HMIs that occur in the host (i.e. not in a KVM guest) is largely unchanged, except that we now don't set the flag for the vector CI load workaround on DD2.2 processors. This also removes a BUG_ON in the KVM code. BUG_ON is generally not useful in KVM guest entry/exit code since it is difficult to handle the resulting trap gracefully. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hmi.h | 4 + arch/powerpc/include/asm/reg.h | 5 +- arch/powerpc/kernel/mce.c | 142 +++++++++++++++++++++++++------- arch/powerpc/kvm/book3s_hv_ras.c | 8 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 +- 5 files changed, 131 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h index 85b7a1a21e22..9c14f7b5c46c 100644 --- a/arch/powerpc/include/asm/hmi.h +++ b/arch/powerpc/include/asm/hmi.h @@ -42,4 +42,8 @@ extern void wait_for_tb_resync(void); static inline void wait_for_subcore_guest_exit(void) { } static inline void wait_for_tb_resync(void) { } #endif + +struct pt_regs; +extern long hmi_handle_debugtrig(struct pt_regs *regs); + #endif /* __ASM_PPC64_HMI_H__ */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index b779f3ccd412..14e41b843952 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -432,8 +432,9 @@ #define SPRN_LPID 0x13F /* Logical Partition Identifier */ #endif #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ -#define SPRN_HMER 0x150 /* Hardware m? error recovery */ -#define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ +#define SPRN_HMER 0x150 /* Hypervisor maintenance exception reg */ +#define HMER_DEBUG_TRIG (1ul << (63 - 17)) /* Debug trigger */ +#define SPRN_HMEER 0x151 /* Hyp maintenance exception enable reg */ #define SPRN_PCR 0x152 /* Processor compatibility register */ #define PCR_VEC_DIS (1ul << (63-0)) /* Vec. 
disable (bit NA since POWER8) */ #define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 742e4658c5dc..d2fecaec4fec 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -495,37 +495,123 @@ long machine_check_early(struct pt_regs *regs) return handled; } -long hmi_exception_realmode(struct pt_regs *regs) +/* Possible meanings for HMER_DEBUG_TRIG bit being set on POWER9 */ +static enum { + DTRIG_UNKNOWN, + DTRIG_VECTOR_CI, /* need to emulate vector CI load instr */ + DTRIG_SUSPEND_ESCAPE, /* need to escape from TM suspend mode */ +} hmer_debug_trig_function; + +static int init_debug_trig_function(void) { - __this_cpu_inc(irq_stat.hmi_exceptions); - -#ifdef CONFIG_PPC_BOOK3S_64 - /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ - if (pvr_version_is(PVR_POWER9)) { - unsigned long hmer = mfspr(SPRN_HMER); - - /* Do we have the debug bit set */ - if (hmer & PPC_BIT(17)) { - hmer &= ~PPC_BIT(17); - mtspr(SPRN_HMER, hmer); - - /* - * Now to avoid problems with soft-disable we - * only do the emulation if we are coming from - * user space - */ - if (user_mode(regs)) - local_paca->hmi_p9_special_emu = 1; - - /* - * Don't bother going to OPAL if that's the - * only relevant bit. - */ - if (!(hmer & mfspr(SPRN_HMEER))) - return local_paca->hmi_p9_special_emu; + int pvr; + struct device_node *cpun; + struct property *prop = NULL; + const char *str; + + /* First look in the device tree */ + preempt_disable(); + cpun = of_get_cpu_node(smp_processor_id(), NULL); + if (cpun) { + of_property_for_each_string(cpun, "ibm,hmi-special-triggers", + prop, str) { + if (strcmp(str, "bit17-vector-ci-load") == 0) + hmer_debug_trig_function = DTRIG_VECTOR_CI; + else if (strcmp(str, "bit17-tm-suspend-escape") == 0) + hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE; } + of_node_put(cpun); + } + preempt_enable(); + + /* If we found the property, don't look at PVR */ + if (prop) + goto out; + + pvr = mfspr(SPRN_PVR); + /* Check for POWER9 Nimbus (scale-out) */ + if ((PVR_VER(pvr) == PVR_POWER9) && (pvr & 0xe000) == 0) { + /* DD2.2 and later */ + if ((pvr & 0xfff) >= 0x202) + hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE; + /* DD2.0 and DD2.1 - used for vector CI load emulation */ + else if ((pvr & 0xfff) >= 0x200) + hmer_debug_trig_function = DTRIG_VECTOR_CI; + } + + out: + switch (hmer_debug_trig_function) { + case DTRIG_VECTOR_CI: + pr_debug("HMI debug trigger used for vector CI load\n"); + break; + case DTRIG_SUSPEND_ESCAPE: + pr_debug("HMI debug trigger used for TM suspend escape\n"); + break; + default: + break; } -#endif /* CONFIG_PPC_BOOK3S_64 */ + return 0; +} +__initcall(init_debug_trig_function); + +/* + * Handle HMIs that occur as a result of a debug trigger. 
+ * Return values: + * -1 means this is not a HMI cause that we know about + * 0 means no further handling is required + * 1 means further handling is required + */ +long hmi_handle_debugtrig(struct pt_regs *regs) +{ + unsigned long hmer = mfspr(SPRN_HMER); + long ret = 0; + + /* HMER_DEBUG_TRIG bit is used for various workarounds on P9 */ + if (!((hmer & HMER_DEBUG_TRIG) + && hmer_debug_trig_function != DTRIG_UNKNOWN)) + return -1; + + hmer &= ~HMER_DEBUG_TRIG; + /* HMER is a write-AND register */ + mtspr(SPRN_HMER, ~HMER_DEBUG_TRIG); + + switch (hmer_debug_trig_function) { + case DTRIG_VECTOR_CI: + /* + * Now to avoid problems with soft-disable we + * only do the emulation if we are coming from + * host user space + */ + if (regs && user_mode(regs)) + ret = local_paca->hmi_p9_special_emu = 1; + + break; + + default: + break; + } + + /* + * See if any other HMI causes remain to be handled + */ + if (hmer & mfspr(SPRN_HMEER)) + return -1; + + return ret; +} + +/* + * Return values: + */ +long hmi_exception_realmode(struct pt_regs *regs) +{ + int ret; + + __this_cpu_inc(irq_stat.hmi_exceptions); + + ret = hmi_handle_debugtrig(regs); + if (ret >= 0) + return ret; wait_for_subcore_guest_exit(); diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index c356f9a40b24..c296343d0dcc 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -268,17 +268,19 @@ static void kvmppc_tb_resync_done(void) * secondary threads to proceed. * - All secondary threads will eventually call opal hmi handler on * their exit path. + * + * Returns 1 if the timebase offset should be applied, 0 if not. */ long kvmppc_realmode_hmi_handler(void) { - int ptid = local_paca->kvm_hstate.ptid; bool resync_req; - /* This is only called on primary thread. */ - BUG_ON(ptid != 0); __this_cpu_inc(irq_stat.hmi_exceptions); + if (hmi_handle_debugtrig(NULL) >= 0) + return 1; + /* * By now primary thread has already completed guest->host * partition switch but haven't signaled secondaries yet. diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2659844784b8..bd0b623335af 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1909,16 +1909,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) bne 27f bl kvmppc_realmode_hmi_handler nop + cmpdi r3, 0 li r12, BOOK3S_INTERRUPT_HMI /* - * At this point kvmppc_realmode_hmi_handler would have resync-ed - * the TB. Hence it is not required to subtract guest timebase - * offset from timebase. So, skip it. + * At this point kvmppc_realmode_hmi_handler may have resync-ed + * the TB, and if it has, we must not subtract the guest timebase + * offset from the timebase. So, skip it. * * Also, do not call kvmppc_subcore_exit_guest() because it has * been invoked as part of kvmppc_realmode_hmi_handler(). */ - b 30f + beq 30f 27: /* Subtract timebase offset from timebase */ -- cgit From c424c108233dc422a9a29ee833154006a5bdf9fc Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:11 +1100 Subject: KVM: PPC: Book3S HV: Add more info about XIVE queues in debugfs Add details about enabled queues and escalation interrupts. 
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index bf457843e032..6cff5bdfd6b7 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1794,6 +1794,7 @@ static int xive_debug_show(struct seq_file *m, void *private) kvm_for_each_vcpu(i, vcpu, kvm) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + unsigned int i; if (!xc) continue; @@ -1803,6 +1804,33 @@ static int xive_debug_show(struct seq_file *m, void *private) xc->server_num, xc->cppr, xc->hw_cppr, xc->mfrr, xc->pending, xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + u32 i0, i1, idx; + + if (!q->qpage && !xc->esc_virq[i]) + continue; + + seq_printf(m, " [q%d]: ", i); + + if (q->qpage) { + idx = q->idx; + i0 = be32_to_cpup(q->qpage + idx); + idx = (idx + 1) & q->msk; + i1 = be32_to_cpup(q->qpage + idx); + seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1); + } + if (xc->esc_virq[i]) { + struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET); + seq_printf(m, "E:%c%c I(%d:%llx:%llx)", + (pq & XIVE_ESB_VAL_P) ? 'P' : 'p', + (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q', + xc->esc_virq[i], pq, xd->eoi_page); + seq_printf(m, "\n"); + } + } t_rm_h_xirr += xc->stat_rm_h_xirr; t_rm_h_ipoll += xc->stat_rm_h_ipoll; -- cgit From bf4159da4751ab8eea43ca6e7c49193dbce8398c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:12 +1100 Subject: KVM: PPC: Book3S HV: Enable use of the new XIVE "single escalation" feature That feature, provided by Power9 DD2.0 and later, when supported by newer OPAL versions, allows us to sacrifice a queue (priority 7) in favor of merging all the escalation interrupts of the queues of a single VP into a single interrupt. This reduces the number of host interrupts used up by KVM guests especially when those guests use multiple priorities. It will also enable a future change to control the masking of the escalation interrupts more precisely to avoid spurious ones. 
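(As a rough illustration: a 16-vCPU guest with all eight priority queues provisioned could otherwise consume up to 16 * 8 = 128 host escalation interrupts, whereas with single escalation it needs only one per vCPU, i.e. 16.)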
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/opal-api.h | 1 + arch/powerpc/include/asm/xive.h | 3 ++- arch/powerpc/kvm/book3s_xive.c | 48 ++++++++++++++++++++++++------------- arch/powerpc/kvm/book3s_xive.h | 15 +++++------- arch/powerpc/sysdev/xive/native.c | 18 ++++++++++++-- 5 files changed, 57 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 233c7504b1f2..fc926743647e 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1073,6 +1073,7 @@ enum { /* Flags for OPAL_XIVE_GET/SET_VP_INFO */ enum { OPAL_XIVE_VP_ENABLED = 0x00000001, + OPAL_XIVE_VP_SINGLE_ESCALATION = 0x00000002, }; /* "Any chip" replacement for chip ID for allocation functions */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index b619a5585cd6..e602903c3029 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -111,9 +111,10 @@ extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); extern void xive_native_sync_source(u32 hw_irq); extern bool is_xive_irq(struct irq_chip *chip); -extern int xive_native_enable_vp(u32 vp_id); +extern int xive_native_enable_vp(u32 vp_id, bool single_escalation); extern int xive_native_disable_vp(u32 vp_id); extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); +extern bool xive_native_has_single_escalation(void); #else diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 6cff5bdfd6b7..a102efeabf05 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -112,19 +112,21 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) return -EIO; } - /* - * Future improvement: start with them disabled - * and handle DD2 and later scheme of merged escalation - * interrupts - */ - name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", - vcpu->kvm->arch.lpid, xc->server_num, prio); + if (xc->xive->single_escalation) + name = kasprintf(GFP_KERNEL, "kvm-%d-%d", + vcpu->kvm->arch.lpid, xc->server_num); + else + name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", + vcpu->kvm->arch.lpid, xc->server_num, prio); if (!name) { pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n", prio, xc->server_num); rc = -ENOMEM; goto error; } + + pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio); + rc = request_irq(xc->esc_virq[prio], xive_esc_irq, IRQF_NO_THREAD, name, vcpu); if (rc) { @@ -191,12 +193,12 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) pr_devel("Provisioning prio... 
%d\n", prio); - /* Provision each VCPU and enable escalations */ + /* Provision each VCPU and enable escalations if needed */ kvm_for_each_vcpu(i, vcpu, kvm) { if (!vcpu->arch.xive_vcpu) continue; rc = xive_provision_queue(vcpu, prio); - if (rc == 0) + if (rc == 0 && !xive->single_escalation) xive_attach_escalation(vcpu, prio); if (rc) return rc; @@ -1081,6 +1083,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, /* Allocate IPI */ xc->vp_ipi = xive_native_alloc_irq(); if (!xc->vp_ipi) { + pr_err("Failed to allocate xive irq for VCPU IPI\n"); r = -EIO; goto bail; } @@ -1090,19 +1093,34 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (r) goto bail; + /* + * Enable the VP first as the single escalation mode will + * affect escalation interrupts numbering + */ + r = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + if (r) { + pr_err("Failed to enable VP in OPAL, err %d\n", r); + goto bail; + } + /* * Initialize queues. Initially we set them all for no queueing * and we enable escalation for queue 0 only which we'll use for * our mfrr change notifications. If the VCPU is hot-plugged, we - * do handle provisioning however. + * do handle provisioning however based on the existing "map" + * of enabled queues. */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { struct xive_q *q = &xc->queues[i]; + /* Single escalation, no queue 7 */ + if (i == 7 && xive->single_escalation) + break; + /* Is queue already enabled ? Provision it */ if (xive->qmap & (1 << i)) { r = xive_provision_queue(vcpu, i); - if (r == 0) + if (r == 0 && !xive->single_escalation) xive_attach_escalation(vcpu, i); if (r) goto bail; @@ -1122,11 +1140,6 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (r) goto bail; - /* Enable the VP */ - r = xive_native_enable_vp(xc->vp_id); - if (r) - goto bail; - /* Route the IPI */ r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); if (!r) @@ -1473,6 +1486,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) pr_devel(" val=0x016%llx (server=0x%x, guest_prio=%d)\n", val, server, guest_prio); + /* * If the source doesn't already have an IPI, allocate * one and get the corresponding data @@ -1761,6 +1775,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) if (xive->vp_base == XIVE_INVALID_VP) ret = -ENOMEM; + xive->single_escalation = xive_native_has_single_escalation(); + if (ret) { kfree(xive); return ret; diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 6ba63f8e8a61..a08ae6fd4c51 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -120,6 +120,8 @@ struct kvmppc_xive { u32 q_order; u32 q_page_order; + /* Flags */ + u8 single_escalation; }; #define KVMPPC_XIVE_Q_COUNT 8 @@ -201,25 +203,20 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp * is as follow. * * Guest request for 0...6 are honored. Guest request for anything - * higher results in a priority of 7 being applied. - * - * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb - * in order to match AIX expectations + * higher results in a priority of 6 being applied. 
* * Similar mapping is done for CPPR values */ static inline u8 xive_prio_from_guest(u8 prio) { - if (prio == 0xff || prio < 8) + if (prio == 0xff || prio < 6) return prio; - return 7; + return 6; } static inline u8 xive_prio_to_guest(u8 prio) { - if (prio == 0xff || prio < 7) - return prio; - return 0xb; + return prio; } static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index ebc244b08d67..d22aeb0b69e1 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -42,6 +42,7 @@ static u32 xive_provision_chip_count; static u32 xive_queue_shift; static u32 xive_pool_vps = XIVE_INVALID_VP; static struct kmem_cache *xive_provision_cache; +static bool xive_has_single_esc; int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) { @@ -571,6 +572,10 @@ bool __init xive_native_init(void) break; } + /* Do we support single escalation */ + if (of_get_property(np, "single-escalation-support", NULL) != NULL) + xive_has_single_esc = true; + /* Configure Thread Management areas for KVM */ for_each_possible_cpu(cpu) kvmppc_set_xive_tima(cpu, r.start, tima); @@ -667,12 +672,15 @@ void xive_native_free_vp_block(u32 vp_base) } EXPORT_SYMBOL_GPL(xive_native_free_vp_block); -int xive_native_enable_vp(u32 vp_id) +int xive_native_enable_vp(u32 vp_id, bool single_escalation) { s64 rc; + u64 flags = OPAL_XIVE_VP_ENABLED; + if (single_escalation) + flags |= OPAL_XIVE_VP_SINGLE_ESCALATION; for (;;) { - rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0); + rc = opal_xive_set_vp_info(vp_id, flags, 0); if (rc != OPAL_BUSY) break; msleep(1); @@ -710,3 +718,9 @@ int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) return 0; } EXPORT_SYMBOL_GPL(xive_native_get_vp_info); + +bool xive_native_has_single_escalation(void) +{ + return xive_has_single_esc; +} +EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); -- cgit From 2267ea7661798a42f0da648a2970e2a03f4bc370 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:13 +1100 Subject: KVM: PPC: Book3S HV: Don't use existing "prodded" flag for XIVE escalations The prodded flag is only cleared at the beginning of H_CEDE, so every time we have an escalation, we will cause the *next* H_CEDE to return immediately. Instead use a dedicated "irq_pending" flag to indicate that a guest interrupt is pending for the VCPU. We don't reuse the existing exception bitmap so as to avoid expensive atomic ops. 
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 ++++++++++ arch/powerpc/kvm/book3s_xive.c | 3 +-- 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3aa5b577cd60..bfe51356af5e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -709,6 +709,7 @@ struct kvm_vcpu_arch { u8 ceded; u8 prodded; u8 doorbell_request; + u8 irq_pending; /* Used by XIVE to signal pending guest irqs */ u32 last_inst; struct swait_queue_head *wqp; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6b958414b4e0..825089cf3e23 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -514,6 +514,7 @@ int main(void) OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); + OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending); OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 76cf48051eb3..e5f81fc108e0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2999,7 +2999,7 @@ static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) { if (!xive_enabled()) return false; - return vcpu->arch.xive_saved_state.pipr < + return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr < vcpu->arch.xive_saved_state.cppr; } #else diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7daf21be33d0..34dbab7deb39 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1035,6 +1035,16 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) li r9, 1 stw r9, VCPU_XIVE_PUSHED(r4) eieio + + /* + * We clear the irq_pending flag. There is a small chance of a + * race vs. the escalation interrupt happening on another + * processor setting it again, but the only consequence is to + * cause a spurrious wakeup on the next H_CEDE which is not an + * issue. 
+ */ + li r0,0 + stb r0, VCPU_IRQ_PENDING(r4) no_xive: #endif /* CONFIG_KVM_XICS */ diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index a102efeabf05..eef9ccafdc09 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -84,8 +84,7 @@ static irqreturn_t xive_esc_irq(int irq, void *data) { struct kvm_vcpu *vcpu = data; - /* We use the existing H_PROD mechanism to wake up the target */ - vcpu->arch.prodded = 1; + vcpu->arch.irq_pending = 1; smp_mb(); if (vcpu->arch.ceded) kvmppc_fast_vcpu_kick(vcpu); -- cgit From 2662efd050953824de5c9b24449d6b5b342db10b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:14 +1100 Subject: KVM: PPC: Book3S HV: Check DR not IR to choose real vs virt mode MMIOs Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 34dbab7deb39..948f21cf84d5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1464,7 +1464,7 @@ mc_cont: li r7, TM_SPC_PULL_OS_CTX li r6, TM_QW1_OS mfmsr r0 - andi. r0, r0, MSR_IR /* in real mode? */ + andi. r0, r0, MSR_DR /* in real mode? */ beq 2f ld r10, HSTATE_XIVE_TIMA_VIRT(r13) cmpldi cr0, r10, 0 -- cgit From 35c2405efc0142860c4b698f4c6331567c4ca1ef Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:15 +1100 Subject: KVM: PPC: Book3S HV: Make xive_pushed a byte, not a word Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index bfe51356af5e..0c44fa67608d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -739,7 +739,7 @@ struct kvm_vcpu_arch { struct kvmppc_icp *icp; /* XICS presentation controller */ struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ - u32 xive_pushed; /* Is the VP pushed on the physical CPU ?
*/ union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */ #endif diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 948f21cf84d5..a7f429bc6de0 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1033,7 +1033,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) li r9, TM_QW1_OS + TM_WORD2 stwcix r11,r9,r10 li r9, 1 - stw r9, VCPU_XIVE_PUSHED(r4) + stb r9, VCPU_XIVE_PUSHED(r4) eieio /* @@ -1458,7 +1458,7 @@ mc_cont: #endif #ifdef CONFIG_KVM_XICS /* We are exiting, pull the VP from the XIVE */ - lwz r0, VCPU_XIVE_PUSHED(r9) + lbz r0, VCPU_XIVE_PUSHED(r9) cmpwi cr0, r0, 0 beq 1f li r7, TM_SPC_PULL_OS_CTX @@ -1487,7 +1487,7 @@ mc_cont: /* Fixup some of the state for the next load */ li r10, 0 li r0, 0xff - stw r10, VCPU_XIVE_PUSHED(r9) + stb r10, VCPU_XIVE_PUSHED(r9) stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9) stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9) eieio -- cgit From 9b9b13a6d1537ddc4caccd6f1c41b78edbc08437 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:16 +1100 Subject: KVM: PPC: Book3S HV: Keep XIVE escalation interrupt masked unless ceded This works on top of the single escalation support. When in single escalation, with this change, we will keep the escalation interrupt disabled unless the VCPU is in H_CEDE (idle). In any other case, we know the VCPU will be rescheduled and thus there is no need to take escalation interrupts in the host whenever a guest interrupt fires. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 3 ++ arch/powerpc/kernel/asm-offsets.c | 3 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 62 ++++++++++++++++++++++++++++++++- arch/powerpc/kvm/book3s_xive.c | 30 ++++++++++++++++ 4 files changed, 97 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0c44fa67608d..fef8133becc8 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -740,7 +740,10 @@ struct kvm_vcpu_arch { struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ u8 xive_pushed; /* Is the VP pushed on the physical CPU ? */ + u8 xive_esc_on; /* Is the escalation irq enabled ? 
*/ union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */ + u64 xive_esc_raddr; /* Escalation interrupt ESB real addr */ + u64 xive_esc_vaddr; /* Escalation interrupt ESB virt addr */ #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 825089cf3e23..1672dffd94e2 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -734,6 +734,9 @@ int main(void) DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu, arch.xive_cam_word)); DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed)); + DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on)); + DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr)); + DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr)); #endif #ifdef CONFIG_KVM_EXIT_TIMING diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index a7f429bc6de0..a7a20b85d8eb 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1045,6 +1045,41 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) */ li r0,0 stb r0, VCPU_IRQ_PENDING(r4) + + /* + * In single escalation mode, if the escalation interrupt is + * on, we mask it. + */ + lbz r0, VCPU_XIVE_ESC_ON(r4) + cmpwi r0,0 + beq 1f + ld r10, VCPU_XIVE_ESC_RADDR(r4) + li r9, XIVE_ESB_SET_PQ_01 + ldcix r0, r10, r9 + sync + + /* We have a possible subtle race here: The escalation interrupt might + * have fired and be on its way to the host queue while we mask it, + * and if we unmask it early enough (re-cede right away), there is + * a theoretical possibility that it fires again, thus landing in the + * target queue more than once which is a big no-no. + * + * Fortunately, solving this is rather easy. If the above load setting + * PQ to 01 returns a previous value where P is set, then we know the + * escalation interrupt is somewhere on its way to the host. In that + * case we simply don't clear the xive_esc_on flag below. It will be + * eventually cleared by the handler for the escalation interrupt. + * + * Then, when doing a cede, we check that flag again before re-enabling + * the escalation interrupt, and if set, we abort the cede. + */ + andi. r0, r0, XIVE_ESB_VAL_P + bne- 1f + + /* Now P is 0, we can clear the flag */ + li r0, 0 + stb r0, VCPU_XIVE_ESC_ON(r4) +1: no_xive: #endif /* CONFIG_KVM_XICS */ @@ -2756,7 +2791,32 @@ kvm_cede_prodded: /* we've ceded but we want to give control to the host */ kvm_cede_exit: ld r9, HSTATE_KVM_VCPU(r13) - b guest_exit_cont +#ifdef CONFIG_KVM_XICS + /* Abort if we still have a pending escalation */ + lbz r5, VCPU_XIVE_ESC_ON(r9) + cmpwi r5, 0 + beq 1f + li r0, 0 + stb r0, VCPU_CEDED(r9) +1: /* Enable XIVE escalation */ + li r5, XIVE_ESB_SET_PQ_00 + mfmsr r0 + andi. r0, r0, MSR_DR /* in real mode?
*/ + beq 1f + ld r10, VCPU_XIVE_ESC_VADDR(r9) + cmpdi r10, 0 + beq 3f + ldx r0, r10, r5 + b 2f +1: ld r10, VCPU_XIVE_ESC_RADDR(r9) + cmpdi r10, 0 + beq 3f + ldcix r0, r10, r5 +2: sync + li r0, 1 + stb r0, VCPU_XIVE_ESC_ON(r9) +#endif /* CONFIG_KVM_XICS */ +3: b guest_exit_cont /* Try to handle a machine check in real mode */ machine_check_realmode: diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index eef9ccafdc09..7a047bc88f11 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -89,6 +89,17 @@ static irqreturn_t xive_esc_irq(int irq, void *data) if (vcpu->arch.ceded) kvmppc_fast_vcpu_kick(vcpu); + /* Since we have the no-EOI flag, the interrupt is effectively + * disabled now. Clearing xive_esc_on means we won't bother + * doing so on the next entry. + * + * This also allows the entry code to know that if a PQ combination + * of 10 is observed while xive_esc_on is true, it means the queue + * contains an unprocessed escalation interrupt. We don't make use of + * that knowledge today but might (see comment in book3s_hv_rmhandlers.S) + */ + vcpu->arch.xive_esc_on = false; + return IRQ_HANDLED; } @@ -134,6 +145,25 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) goto error; } xc->esc_virq_names[prio] = name; + + /* In single escalation mode, we grab the ESB MMIO of the + * interrupt and mask it. Also populate the VCPU v/raddr + * of the ESB page for use by asm entry/exit code. Finally + * set the XIVE_IRQ_NO_EOI flag which will prevent the + * core code from performing an EOI on the escalation + * interrupt, thus leaving it effectively masked after + * it fires once. + */ + if (xc->xive->single_escalation) { + struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); + vcpu->arch.xive_esc_raddr = xd->eoi_page; + vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio; + xd->flags |= XIVE_IRQ_NO_EOI; + } + return 0; error: irq_dispose_mapping(xc->esc_virq[prio]); -- cgit
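Editorial note: the PQ-bit handling added by the last patch above is easiest to follow as a small state machine, so here is a standalone C sketch of it. This is illustrative only and not kernel code: struct vcpu_model, esb_load(), guest_entry(), cede_exit() and the simplified ESB_P/ESB_Q encodings are invented stand-ins for the rmhandlers.S assembly and the XIVE ESB MMIO loads, and the host-side escalation handler (xive_esc_irq) is deliberately left out, which is exactly the "escalation on its way to the host" window the comments above describe.

#include <stdbool.h>
#include <stdio.h>

/* Simplified PQ encodings; stand-ins, not the kernel's xive-regs.h values. */
#define ESB_P 0x2
#define ESB_Q 0x1

struct vcpu_model {
	unsigned int esb_pq;	/* PQ state of the escalation interrupt's ESB */
	bool xive_esc_on;	/* mirror of vcpu->arch.xive_esc_on */
	bool ceded;
};

/* Model of an ESB "special" load: install new_pq, return the previous PQ. */
static unsigned int esb_load(struct vcpu_model *v, unsigned int new_pq)
{
	unsigned int old = v->esb_pq;

	v->esb_pq = new_pq;
	return old;
}

/* Guest entry: mask the escalation (PQ <- 01) if it is currently enabled. */
static void guest_entry(struct vcpu_model *v)
{
	unsigned int old;

	if (!v->xive_esc_on)
		return;
	old = esb_load(v, ESB_Q);	/* PQ <- 01 */
	/*
	 * If P was already set, an escalation is on its way to the host;
	 * leave xive_esc_on set so the cede path knows about it.
	 */
	if (!(old & ESB_P))
		v->xive_esc_on = false;
}

/*
 * H_CEDE exit path: always re-enable the escalation (PQ <- 00), but abort
 * the cede (clear "ceded") if an escalation was still marked pending.
 */
static bool cede_exit(struct vcpu_model *v)
{
	bool abort = v->xive_esc_on;

	v->ceded = !abort;
	esb_load(v, 0);			/* PQ <- 00, re-enable */
	v->xive_esc_on = true;
	return !abort;
}

int main(void)
{
	struct vcpu_model v = { .esb_pq = 0, .xive_esc_on = true };

	guest_entry(&v);	/* no escalation in flight: flag is cleared */
	printf("cede proceeds: %d\n", cede_exit(&v));	/* 1 */

	v.esb_pq |= ESB_P;	/* escalation fires while in the guest */
	guest_entry(&v);	/* P observed: flag stays set */
	printf("cede proceeds: %d\n", cede_exit(&v));	/* 0: cede aborted */
	return 0;
}

As the comments in the patch explain, keeping the xive_esc_on flag in the vcpu (rather than re-reading PQ at cede time) lets the entry path record that an escalation is already in flight when the masking load returns P set, and lets the cede path abort cheaply on that flag; only the escalation handler clears it once the interrupt has actually been processed.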