From cda2eaa35948893d70145490d5d6ded546fc3bc6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Nov 2017 16:40:24 +1100 Subject: KVM: PPC: Book3S HV: Avoid shifts by negative amounts The kvmppc_hpte_page_shifts function decodes the actual and base page sizes for a HPTE, returning -1 if it doesn't recognize the page size encoding. This then gets used as a shift amount in various places, which is undefined behaviour. This was reported by Coverity. In fact this should never occur, since we should only get HPTEs in the HPT which have a recognized page size encoding. The only place where this might not be true is in the call to kvmppc_actual_pgsz() near the beginning of kvmppc_do_h_enter(), where we are validating the HPTE value passed in from the guest. So to fix this and eliminate the undefined behaviour, we make kvmppc_hpte_page_shifts return 0 for unrecognized page size encodings, and make kvmppc_actual_pgsz() detect that case and return 0 for the page size, which will then cause kvmppc_do_h_enter() to return an error and refuse to insert any HPTE with an unrecognized page size encoding. To ensure that we don't get undefined behaviour in compute_tlbie_rb(), we take the 4k page size path for any unrecognized page size encoding. This should never be hit in practice because it is only used on HPTE values which have previously been checked for having a recognized page size encoding. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s_64.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 735cfa35298a..998f7b7aaa9e 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -122,13 +122,13 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) lphi = (l >> 16) & 0xf; switch ((l >> 12) & 0xf) { case 0: - return !lphi ? 24 : -1; /* 16MB */ + return !lphi ? 24 : 0; /* 16MB */ break; case 1: return 16; /* 64kB */ break; case 3: - return !lphi ? 34 : -1; /* 16GB */ + return !lphi ? 34 : 0; /* 16GB */ break; case 7: return (16 << 8) + 12; /* 64kB in 4kB */ @@ -140,7 +140,7 @@ static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l) return (24 << 8) + 12; /* 16MB in 4kB */ break; } - return -1; + return 0; } static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l) @@ -159,7 +159,11 @@ static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r) { - return 1ul << kvmppc_hpte_actual_page_shift(v, r); + int shift = kvmppc_hpte_actual_page_shift(v, r); + + if (shift) + return 1ul << shift; + return 0; } static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift) @@ -232,7 +236,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, va_low ^= v >> (SID_SHIFT_1T - 16); va_low &= 0x7ff; - if (b_pgshift == 12) { + if (b_pgshift <= 12) { if (a_pgshift > 12) { sllp = (a_pgshift == 16) ? 5 : 4; rb |= sllp << 5; /* AP field */ -- cgit From 117647ff936e2d9684cc881d87c0291f46669c20 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 10 Nov 2017 16:43:35 +1100 Subject: KVM: PPC: Book3S HV: Fix typo in kvmppc_hv_get_dirty_log_radix() This fixes a typo where the intent was to assign to 'j' in order to skip some number of bits in the dirty bitmap for a guest. 
The effect of the typo is benign since it means we just iterate through all the bits rather than skipping bits which we know will be zero. This issue was found by Coverity. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 58618f644c56..0c854816e653 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -573,7 +573,7 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, j = i + 1; if (npages) { set_dirty_bits(map, i, npages); - i = j + npages; + j = i + npages; } } return 0; -- cgit From 4fcf361dbdbdb43038bb173e2391c4073e713745 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Nov 2017 14:17:53 +1100 Subject: KVM: PPC: Book3S HV: Remove useless statement This removes a statement that has no effect. It should have been removed in commit 898b25b202f3 ("KVM: PPC: Book3S HV: Simplify dynamic micro-threading code", 2017-06-22) along with the loop over the piggy-backed virtual cores. This issue was reported by Coverity. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2d46037ce936..597498d6db2e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2831,7 +2831,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ if (!thr0_done) kvmppc_start_thread(NULL, pvc); - thr += pvc->num_threads; } /* -- cgit From c0093f1a38a0fd6c32a2269f0533bb13fb95143d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Nov 2017 16:12:25 +1100 Subject: KVM: PPC: Book3S HV: Fix conditions for starting vcpu This corrects the test that determines whether a vcpu that has just become able to run in the guest (e.g. it has just finished handling a hypercall or hypervisor page fault) and whose virtual core is already running somewhere as a "piggybacked" vcore can start immediately or not. (A piggybacked vcore is one which is executing along with another vcore as a result of dynamic micro-threading.) Previously the test tried to lock the piggybacked vcore using spin_trylock, which would always fail because the vcore was already locked, and so the vcpu would have to wait until its vcore exited the guest before it could enter. In fact the vcpu can enter if its vcore is in VCORE_PIGGYBACK state and not already exiting (or exited) the guest, so the test in VCORE_PIGGYBACK state is basically the same as for VCORE_RUNNING state. Coverity detected this as a double unlock issue, which it isn't because the spin_trylock would always fail. This will fix the apparent double unlock as well. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 597498d6db2e..c4f0bebfc5ba 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3175,17 +3175,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * this thread straight away and have it join in. 
*/ if (!signal_pending(current)) { - if (vc->vcore_state == VCORE_PIGGYBACK) { - if (spin_trylock(&vc->lock)) { - if (vc->vcore_state == VCORE_RUNNING && - !VCORE_IS_EXITING(vc)) { - kvmppc_create_dtl_entry(vcpu, vc); - kvmppc_start_thread(vcpu, vc); - trace_kvm_guest_enter(vcpu); - } - spin_unlock(&vc->lock); - } - } else if (vc->vcore_state == VCORE_RUNNING && + if ((vc->vcore_state == VCORE_PIGGYBACK || + vc->vcore_state == VCORE_RUNNING) && !VCORE_IS_EXITING(vc)) { kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu, vc); -- cgit From 9aa6825bbb7526a7fdec137b7cc3b042581cd2fc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 20 Nov 2017 19:56:27 +1100 Subject: KVM: PPC: Book3S: Eliminate some unnecessary checks In an excess of caution, commit 6f63e81bda98 ("KVM: PPC: Book3S: Add MMIO emulation for FP and VSX instructions", 2017-02-21) included checks for the case that vcpu->arch.mmio_vsx_copy_nums is less than zero, even though its type is u8. This causes a Coverity warning, so we remove the check for < 0. We also adjust the associated comment to be more accurate ("4 or less" rather than "less than 4"). Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/powerpc.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6b6c53c42ac9..c2c7ef330553 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1101,11 +1101,9 @@ int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, { enum emulation_result emulated = EMULATE_DONE; - /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ - if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || - (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */ + if (vcpu->arch.mmio_vsx_copy_nums > 4) return EMULATE_FAIL; - } while (vcpu->arch.mmio_vsx_copy_nums) { emulated = __kvmppc_handle_load(run, vcpu, rt, bytes, @@ -1247,11 +1245,9 @@ int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->arch.io_gpr = rs; - /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ - if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || - (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + /* Currently, mmio_vsx_copy_nums only allowed to be 4 or less */ + if (vcpu->arch.mmio_vsx_copy_nums > 4) return EMULATE_FAIL; - } while (vcpu->arch.mmio_vsx_copy_nums) { if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1) -- cgit From 81ceca05a42e24768c73d4d6dbd3701a60f1ed85 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 19 Dec 2017 15:56:24 +0100 Subject: KVM: PPC: Book3S HV: Remove vcpu->arch.dec usage On Book3S in HV mode, we don't use the vcpu->arch.dec field at all. Instead, all logic is built around vcpu->arch.dec_expires. So let's remove the one remaining piece of code that was setting it. 
Signed-off-by: Alexander Graf Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2659844784b8..c8ffd69adfec 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -957,7 +957,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) mftb r7 subf r3,r7,r8 mtspr SPRN_DEC,r3 - std r3,VCPU_DEC(r4) ld r5, VCPU_SPRG0(r4) ld r6, VCPU_SPRG1(r4) -- cgit From 1627301020cb460f5a74e13c291f8db3b2a8062e Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 7 Jan 2018 10:07:36 +0100 Subject: KVM: PPC: Use seq_puts() in kvmppc_exit_timing_show() A headline should be quickly put into a sequence. Thus use the function "seq_puts" instead of "seq_printf" for this purpose. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/timing.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index e44d2b2ea97e..1c03c978eb18 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -143,8 +143,7 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private) int i; u64 min, max, sum, sum_quad; - seq_printf(m, "%s", "type count min max sum sum_squared\n"); - + seq_puts(m, "type count min max sum sum_squared\n"); for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { -- cgit From 12c1f339cd49119e39063ae67f02d936f988c079 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:39:27 +1100 Subject: powerpc/xive: Move definition of ESB bits From xive.h to xive-regs.h since it's a HW register definition and it can be used from assembly Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/xive-regs.h | 35 +++++++++++++++++++++++++++++++++++ arch/powerpc/include/asm/xive.h | 35 ----------------------------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h index 1d3f2be5ae39..fa4288822b68 100644 --- a/arch/powerpc/include/asm/xive-regs.h +++ b/arch/powerpc/include/asm/xive-regs.h @@ -9,6 +9,41 @@ #ifndef _ASM_POWERPC_XIVE_REGS_H #define _ASM_POWERPC_XIVE_REGS_H +/* + * "magic" Event State Buffer (ESB) MMIO offsets. + * + * Each interrupt source has a 2-bit state machine called ESB + * which can be controlled by MMIO. It's made of 2 bits, P and + * Q. P indicates that an interrupt is pending (has been sent + * to a queue and is waiting for an EOI). Q indicates that the + * interrupt has been triggered while pending. + * + * This acts as a coalescing mechanism in order to guarantee + * that a given interrupt only occurs at most once in a queue. + * + * When doing an EOI, the Q bit will indicate if the interrupt + * needs to be re-triggered. + * + * The following offsets into the ESB MMIO allow to read or + * manipulate the PQ bits. They must be used with an 8-bytes + * load instruction. They all return the previous state of the + * interrupt (atomically). + * + * Additionally, some ESB pages support doing an EOI via a + * store at 0 and some ESBs support doing a trigger via a + * separate trigger page. 
+ */ +#define XIVE_ESB_STORE_EOI 0x400 /* Store */ +#define XIVE_ESB_LOAD_EOI 0x000 /* Load */ +#define XIVE_ESB_GET 0x800 /* Load */ +#define XIVE_ESB_SET_PQ_00 0xc00 /* Load */ +#define XIVE_ESB_SET_PQ_01 0xd00 /* Load */ +#define XIVE_ESB_SET_PQ_10 0xe00 /* Load */ +#define XIVE_ESB_SET_PQ_11 0xf00 /* Load */ + +#define XIVE_ESB_VAL_P 0x2 +#define XIVE_ESB_VAL_Q 0x1 + /* * Thread Management (aka "TM") registers */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 371fbebf1ec9..0e77005cf021 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -72,41 +72,6 @@ struct xive_q { atomic_t pending_count; }; -/* - * "magic" Event State Buffer (ESB) MMIO offsets. - * - * Each interrupt source has a 2-bit state machine called ESB - * which can be controlled by MMIO. It's made of 2 bits, P and - * Q. P indicates that an interrupt is pending (has been sent - * to a queue and is waiting for an EOI). Q indicates that the - * interrupt has been triggered while pending. - * - * This acts as a coalescing mechanism in order to guarantee - * that a given interrupt only occurs at most once in a queue. - * - * When doing an EOI, the Q bit will indicate if the interrupt - * needs to be re-triggered. - * - * The following offsets into the ESB MMIO allow to read or - * manipulate the PQ bits. They must be used with an 8-bytes - * load instruction. They all return the previous state of the - * interrupt (atomically). - * - * Additionally, some ESB pages support doing an EOI via a - * store at 0 and some ESBs support doing a trigger via a - * separate trigger page. - */ -#define XIVE_ESB_STORE_EOI 0x400 /* Store */ -#define XIVE_ESB_LOAD_EOI 0x000 /* Load */ -#define XIVE_ESB_GET 0x800 /* Load */ -#define XIVE_ESB_SET_PQ_00 0xc00 /* Load */ -#define XIVE_ESB_SET_PQ_01 0xd00 /* Load */ -#define XIVE_ESB_SET_PQ_10 0xe00 /* Load */ -#define XIVE_ESB_SET_PQ_11 0xf00 /* Load */ - -#define XIVE_ESB_VAL_P 0x2 -#define XIVE_ESB_VAL_Q 0x1 - /* Global enable flags for the XIVE support */ extern bool __xive_enabled; -- cgit From 7f1c410da59090f9bb2300efebbc3b717594d64c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:39:28 +1100 Subject: powerpc/xive: Add interrupt flag to disable automatic EOI This will be used by KVM in order to keep escalation interrupts in the non-EOI (masked) state after they fire. They will be re-enabled directly in HW by KVM when needed. 
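Re-enabling them "directly in HW" amounts to flipping the source's ESB PQ bits through the MMIO offsets defined above. Purely as an illustration (not part of this patch; example_mask_source and esb_mmio are hypothetical names, and the kernel itself goes through helpers such as xive_esb_read() rather than open-coded loads), a minimal kernel-context sketch of masking a source and checking whether it was already pending:

  /* Sketch only: mask a source by setting its ESB to PQ = 01. */
  static u64 example_mask_source(void __iomem *esb_mmio)
  {
          /*
           * An 8-byte load at SET_PQ_01 atomically sets PQ = 01
           * (masked) and returns the previous PQ state in the low bits.
           */
          u64 prev = in_be64(esb_mmio + XIVE_ESB_SET_PQ_01);

          /*
           * If P was already set, the interrupt has been sent to a
           * queue and is still awaiting an EOI.
           */
          if (prev & XIVE_ESB_VAL_P)
                  pr_debug("source was pending when masked\n");

          return prev;
  }

A later load at XIVE_ESB_SET_PQ_00 undoes the masking in the same way.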
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/xive.h | 3 +++ arch/powerpc/sysdev/xive/common.c | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 0e77005cf021..b619a5585cd6 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -58,6 +58,9 @@ struct xive_irq_data { #define XIVE_IRQ_FLAG_EOI_FW 0x10 #define XIVE_IRQ_FLAG_H_INT_ESB 0x20 +/* Special flag set by KVM for escalation interrupts */ +#define XIVE_IRQ_NO_EOI 0x80 + #define XIVE_INVALID_CHIP_ID -1 /* A queue tracking structure in a CPU */ diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index a3b8d7d1316e..2547b6021e6a 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -367,7 +367,8 @@ static void xive_irq_eoi(struct irq_data *d) * EOI the source if it hasn't been disabled and hasn't * been passed-through to a KVM guest */ - if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d)) + if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) && + !(xd->flags & XIVE_IRQ_NO_EOI)) xive_do_source_eoi(irqd_to_hwirq(d), xd); /* -- cgit From 5855564c8ab2d9cefca7b2933bd19818eb795e40 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 12 Jan 2018 20:55:20 +1100 Subject: KVM: PPC: Book3S HV: Enable migration of decrementer register This adds a register identifier for use with the one_reg interface to allow the decrementer expiry time to be read and written by userspace. The decrementer expiry time is in guest timebase units and is equal to the sum of the decrementer and the guest timebase. (The expiry time is used rather than the decrementer value itself because the expiry time is not constantly changing, though the decrementer value is, while the guest vcpu is not running.) Without this, a guest vcpu migrated to a new host will see its decrementer set to some random value. On POWER8 and earlier, the decrementer is 32 bits wide and counts down at 512MHz, so the guest vcpu will potentially see no decrementer interrupts for up to about 4 seconds, which will lead to a stall. With POWER9, the decrementer is now 56 bits wide, so the stall can be much longer (up to 2.23 years) and more noticeable. To help work around the problem in cases where userspace has not been updated to migrate the decrementer expiry time, we now set the default decrementer expiry at vcpu creation time to the current time rather than the maximum possible value. This should mean an immediate decrementer interrupt when a migrated vcpu starts running. In cases where the decrementer is 32 bits wide and more than 4 seconds elapse between the creation of the vcpu and when it first runs, the decrementer would have wrapped around to positive values and there may still be a stall - but this is no worse than the current situation. In the large-decrementer case, we are sure to get an immediate decrementer interrupt (assuming the time from vcpu creation to first run is less than 2.23 years) and we thus avoid a very long stall.
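From userspace the new register is accessed like any other one_reg. A minimal, hypothetical sketch (error handling omitted; vcpu_fd is assumed to be an open vcpu file descriptor, and the headers are assumed to carry the new KVM_REG_PPC_DEC_EXPIRY define):

  #include <stdint.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  /* Source host: read the expiry time, in guest timebase units. */
  static uint64_t get_dec_expiry(int vcpu_fd)
  {
          uint64_t val;
          struct kvm_one_reg reg = {
                  .id = KVM_REG_PPC_DEC_EXPIRY,
                  .addr = (uintptr_t)&val,
          };

          ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
          return val;
  }

  /* Destination host: restore it before the vcpu first runs. */
  static void set_dec_expiry(int vcpu_fd, uint64_t val)
  {
          struct kvm_one_reg reg = {
                  .id = KVM_REG_PPC_DEC_EXPIRY,
                  .addr = (uintptr_t)&val,
          };

          ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
  }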
Signed-off-by: Paul Mackerras --- Documentation/virtual/kvm/api.txt | 1 + arch/powerpc/include/uapi/asm/kvm.h | 2 ++ arch/powerpc/kvm/book3s_hv.c | 8 ++++++++ arch/powerpc/kvm/powerpc.c | 2 +- 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f670e4b9e7f3..c6f9eebb79f2 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1841,6 +1841,7 @@ registers, find a list below: PPC | KVM_REG_PPC_DBSR | 32 PPC | KVM_REG_PPC_TIDR | 64 PPC | KVM_REG_PPC_PSSCR | 64 + PPC | KVM_REG_PPC_DEC_EXPIRY | 64 PPC | KVM_REG_PPC_TM_GPR0 | 64 ... PPC | KVM_REG_PPC_TM_GPR31 | 64 diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 61d6049f4c1e..8aaec831053a 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -607,6 +607,8 @@ struct kvm_ppc_rmmu_info { #define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc) #define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd) +#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) + /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c4f0bebfc5ba..b2d448c75008 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1497,6 +1497,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_ARCH_COMPAT: *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); break; + case KVM_REG_PPC_DEC_EXPIRY: + *val = get_reg_val(id, vcpu->arch.dec_expires + + vcpu->arch.vcore->tb_offset); + break; default: r = -EINVAL; break; @@ -1724,6 +1728,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_ARCH_COMPAT: r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); break; + case KVM_REG_PPC_DEC_EXPIRY: + vcpu->arch.dec_expires = set_reg_val(id, *val) - + vcpu->arch.vcore->tb_offset; + break; default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c2c7ef330553..7c9e45f54186 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -758,7 +758,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; - vcpu->arch.dec_expires = ~(u64)0; + vcpu->arch.dec_expires = get_tb(); #ifdef CONFIG_KVM_EXIT_TIMING mutex_init(&vcpu->arch.exit_timing_lock); -- cgit From 43ff3f65234061e08d234bdef5a9aadc19832b74 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jan 2018 14:31:43 +1100 Subject: KVM: PPC: Book3S HV: Make sure we don't re-enter guest without XIVE loaded This fixes a bug where it is possible to enter a guest on a POWER9 system without having the XIVE (interrupt controller) context loaded. This can happen because we unload the XIVE context from the CPU before doing the real-mode handling for machine checks. After the real-mode handler runs, it is possible that we re-enter the guest via a fast path which does not load the XIVE context. To fix this, we move the unloading of the XIVE context to come after the real-mode machine check handler is called. 
Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.11+ Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c8ffd69adfec..76332a3f6c0d 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1423,6 +1423,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) blt deliver_guest_interrupt guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ + /* Save more register state */ + mfdar r6 + mfdsisr r7 + std r6, VCPU_DAR(r9) + stw r7, VCPU_DSISR(r9) + /* don't overwrite fault_dar/fault_dsisr if HDSI */ + cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE + beq mc_cont + std r6, VCPU_FAULT_DAR(r9) + stw r7, VCPU_FAULT_DSISR(r9) + + /* See if it is a machine check */ + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK + beq machine_check_realmode +mc_cont: +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r9, VCPU_TB_RMEXIT + mr r4, r9 + bl kvmhv_accumulate_time +#endif #ifdef CONFIG_KVM_XICS /* We are exiting, pull the VP from the XIVE */ lwz r0, VCPU_XIVE_PUSHED(r9) @@ -1460,26 +1480,6 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ eieio 1: #endif /* CONFIG_KVM_XICS */ - /* Save more register state */ - mfdar r6 - mfdsisr r7 - std r6, VCPU_DAR(r9) - stw r7, VCPU_DSISR(r9) - /* don't overwrite fault_dar/fault_dsisr if HDSI */ - cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE - beq mc_cont - std r6, VCPU_FAULT_DAR(r9) - stw r7, VCPU_FAULT_DSISR(r9) - - /* See if it is a machine check */ - cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK - beq machine_check_realmode -mc_cont: -#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING - addi r3, r9, VCPU_TB_RMEXIT - mr r4, r9 - bl kvmhv_accumulate_time -#endif mr r3, r12 /* Increment exit count, poke other threads to exit */ -- cgit From 6964e6a4e4894c707e42d51d9d30683c57f43201 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jan 2018 14:51:02 +1100 Subject: KVM: PPC: Book3S HV: Do SLB load/unload with guest LPCR value loaded This moves the code that loads and unloads the guest SLB values so that it is done while the guest LPCR value is loaded in the LPCR register. The reason for doing this is that on POWER9, the behaviour of the slbmte instruction depends on the LPCR[UPRT] bit. If UPRT is 1, as it is for a radix host (or guest), the SLB index is truncated to 2 bits. This means that for a HPT guest on a radix host, the SLB was not being loaded correctly, causing the guest to crash. The SLB is now loaded much later in the guest entry path, after the LPCR is loaded, which for a secondary thread is after it sees that the primary thread has switched the MMU to the guest. The loop that waits for the primary thread has a branch out to the exit code that is taken if it sees that other threads have commenced exiting the guest. Since we have now not loaded the SLB at this point, we make this path branch to a new label 'guest_bypass' and we move the SLB unload code to before this label. 
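(For concreteness: with LPCR[UPRT] = 1, an slbmte specifying SLB index 5 actually writes entry 1, the index truncated to 2 bits, so loading a guest SLB with more than four entries while the host's radix LPCR value is still in force silently aliases entries onto slots 0-3.)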
Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 109 ++++++++++++++++---------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 76332a3f6c0d..30ece4cebaf5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -617,13 +617,6 @@ kvmppc_hv_entry: lbz r0, KVM_RADIX(r9) cmpwi cr7, r0, 0 - /* Clear out SLB if hash */ - bne cr7, 2f - li r6,0 - slbmte r6,r6 - slbia - ptesync -2: /* * POWER7/POWER8 host -> guest partition switch code. * We don't have to lock against concurrent tlbies, @@ -738,19 +731,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 10: cmpdi r4, 0 beq kvmppc_primary_no_guest kvmppc_got_guest: - - /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ - lwz r5,VCPU_SLB_MAX(r4) - cmpwi r5,0 - beq 9f - mtctr r5 - addi r6,r4,VCPU_SLB -1: ld r8,VCPU_SLB_E(r6) - ld r9,VCPU_SLB_V(r6) - slbmte r9,r8 - addi r6,r6,VCPU_SLB_SIZE - bdnz 1b -9: /* Increment yield count if they have a VPA */ ld r3, VCPU_VPA(r4) cmpdi r3, 0 @@ -1017,6 +997,29 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) cmpdi r3, 512 /* 1 microsecond */ blt hdec_soon + /* For hash guest, clear out and reload the SLB */ + ld r6, VCPU_KVM(r4) + lbz r0, KVM_RADIX(r6) + cmpwi r0, 0 + bne 9f + li r6, 0 + slbmte r6, r6 + slbia + ptesync + + /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ + lwz r5,VCPU_SLB_MAX(r4) + cmpwi r5,0 + beq 9f + mtctr r5 + addi r6,r4,VCPU_SLB +1: ld r8,VCPU_SLB_E(r6) + ld r9,VCPU_SLB_V(r6) + slbmte r9,r8 + addi r6,r6,VCPU_SLB_SIZE + bdnz 1b +9: + #ifdef CONFIG_KVM_XICS /* We are entering the guest on that thread, push VCPU to XIVE */ ld r10, HSTATE_XIVE_TIMA_PHYS(r13) @@ -1193,7 +1196,7 @@ hdec_soon: addi r3, r4, VCPU_TB_RMEXIT bl kvmhv_accumulate_time #endif - b guest_exit_cont + b guest_bypass /****************************************************************************** * * @@ -1481,34 +1484,12 @@ mc_cont: 1: #endif /* CONFIG_KVM_XICS */ - mr r3, r12 - /* Increment exit count, poke other threads to exit */ - bl kvmhv_commence_exit - nop - ld r9, HSTATE_KVM_VCPU(r13) - lwz r12, VCPU_TRAP(r9) - - /* Stop others sending VCPU interrupts to this physical CPU */ - li r0, -1 - stw r0, VCPU_CPU(r9) - stw r0, VCPU_THREAD_CPU(r9) - - /* Save guest CTRL register, set runlatch to 1 */ - mfspr r6,SPRN_CTRLF - stw r6,VCPU_CTRL(r9) - andi. r0,r6,1 - bne 4f - ori r6,r6,1 - mtspr SPRN_CTRLT,r6 -4: - /* Check if we are running hash or radix and store it in cr2 */ + /* For hash guest, read the guest SLB and save it away */ ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) - cmpwi cr2,r0,0 - - /* Read the guest SLB and save it away */ li r5, 0 - bne cr2, 3f /* for radix, save 0 entries */ + cmpwi r0, 0 + bne 3f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 @@ -1524,8 +1505,34 @@ mc_cont: addi r5,r5,1 2: addi r6,r6,1 bdnz 1b + /* Finally clear out the SLB */ + li r0,0 + slbmte r0,r0 + slbia + ptesync 3: stw r5,VCPU_SLB_MAX(r9) +guest_bypass: + mr r3, r12 + /* Increment exit count, poke other threads to exit */ + bl kvmhv_commence_exit + nop + ld r9, HSTATE_KVM_VCPU(r13) + lwz r12, VCPU_TRAP(r9) + + /* Stop others sending VCPU interrupts to this physical CPU */ + li r0, -1 + stw r0, VCPU_CPU(r9) + stw r0, VCPU_THREAD_CPU(r9) + + /* Save guest CTRL register, set runlatch to 1 */ + mfspr r6,SPRN_CTRLF + stw r6,VCPU_CTRL(r9) + andi. 
r0,r6,1 + bne 4f + ori r6,r6,1 + mtspr SPRN_CTRLT,r6 +4: /* * Save the guest PURR/SPURR */ @@ -1803,7 +1810,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) cmpwi cr2, r0, 0 - beq cr2, 3f + beq cr2, 4f /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) @@ -1839,15 +1846,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) - b 4f +4: #endif /* CONFIG_PPC_RADIX_MMU */ - /* Hash: clear out SLB */ -3: li r5,0 - slbmte r5,r5 - slbia - ptesync -4: /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do -- cgit From 00608e1f007e4cf6031485c5630e0e504bceef9b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 11 Jan 2018 16:54:26 +1100 Subject: KVM: PPC: Book3S HV: Allow HPT and radix on the same core for POWER9 v2.2 POWER9 chip versions starting with "Nimbus" v2.2 can support running with some threads of a core in HPT mode and others in radix mode. This means that we don't have to prohibit independent-threads mode when running a HPT guest on a radix host, and we don't have to do any of the synchronization between threads that was introduced in commit c01015091a77 ("KVM: PPC: Book3S HV: Run HPT guests on POWER9 radix hosts", 2017-10-19). Rather than using up another CPU feature bit, we just do an explicit test on the PVR (processor version register) at module startup time to determine whether we have to take steps to avoid having some threads in HPT mode and some in radix mode (so-called "mixed mode"). We test for "Nimbus" (indicated by 0 or 1 in the top nibble of the lower 16 bits) v2.2 or later, or "Cumulus" (indicated by 2 or 3 in that nibble) v1.1 or later. Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index b2d448c75008..76cf48051eb3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -118,6 +118,9 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif +/* If set, the threads on each CPU core have to be in the same MMU mode */ +static bool no_mixing_hpt_and_radix; + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -2386,8 +2389,8 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) static bool subcore_config_ok(int n_subcores, int n_threads) { /* - * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core - * mode, with one thread per subcore. + * POWER9 "SMT4" cores are permanently in what is effectively a 4-way + * split-core mode, with one thread per subcore. 
*/ if (cpu_has_feature(CPU_FTR_ARCH_300)) return n_subcores <= 4 && n_threads == 1; @@ -2423,8 +2426,8 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) if (!cpu_has_feature(CPU_FTR_ARCH_207S)) return false; - /* POWER9 currently requires all threads to be in the same MMU mode */ - if (cpu_has_feature(CPU_FTR_ARCH_300) && + /* Some POWER9 chips require all threads to be in the same MMU mode */ + if (no_mixing_hpt_and_radix && kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) return false; @@ -2687,9 +2690,11 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. * On POWER9, we need to be not in independent-threads mode if - * this is a HPT guest on a radix host. + * this is a HPT guest on a radix host machine where the + * CPU threads may not be in different MMU modes. */ - hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm); + hpt_on_radix = no_mixing_hpt_and_radix && radix_enabled() && + !kvm_is_radix(vc->kvm); if (((controlled_threads > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) || (hpt_on_radix && vc->kvm->arch.threads_indep)) { @@ -4446,6 +4451,19 @@ static int kvmppc_book3s_init_hv(void) if (kvmppc_radix_possible()) r = kvmppc_radix_init(); + + /* + * POWER9 chips before version 2.02 can't have some threads in + * HPT mode and some in radix mode on the same core. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + unsigned int pvr = mfspr(SPRN_PVR); + if ((pvr >> 16) == PVR_POWER9 && + (((pvr & 0xe000) == 0 && (pvr & 0xfff) < 0x202) || + ((pvr & 0xe000) == 0x2000 && (pvr & 0xfff) < 0x101))) + no_mixing_hpt_and_radix = true; + } + return r; } -- cgit From d075745d893c78730e4a3b7a60fca23c2f764081 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 17 Jan 2018 20:51:13 +1100 Subject: KVM: PPC: Book3S HV: Improve handling of debug-trigger HMIs on POWER9 Hypervisor maintenance interrupts (HMIs) are generated by various causes, signalled by bits in the hypervisor maintenance exception register (HMER). In most cases calling OPAL to handle the interrupt is the correct thing to do, but the "debug trigger" HMIs signalled by PPC bit 17 (bit 46) of HMER are used to invoke software workarounds for hardware bugs, and OPAL does not have any code to handle this cause. The debug trigger HMI is used in POWER9 DD2.0 and DD2.1 chips to work around a hardware bug in executing vector load instructions to cache inhibited memory. In POWER9 DD2.2 chips, it is generated when conditions are detected relating to threads being in TM (transactional memory) suspended mode when the core SMT configuration needs to be reconfigured. The kernel currently has code to detect the vector CI load condition, but only when the HMI occurs in the host, not when it occurs in a guest. If a HMI occurs in the guest, it is always passed to OPAL, and then we always re-sync the timebase, because the HMI cause might have been a timebase error, for which OPAL would re-sync the timebase, thus removing the timebase offset which KVM applied for the guest. Since we don't know what OPAL did, we don't know whether to subtract the timebase offset from the timebase, so instead we re-sync the timebase. This adds code to determine explicitly what the cause of a debug trigger HMI will be. 
This is based on a new device-tree property under the CPU nodes called ibm,hmi-special-triggers, if it is present, or otherwise based on the PVR (processor version register). The handling of debug trigger HMIs is pulled out into a separate function which can be called from the KVM guest exit code. If this function handles and clears the HMI, and no other HMI causes remain, then we skip calling OPAL and we proceed to subtract the guest timebase offset from the timebase. The overall handling for HMIs that occur in the host (i.e. not in a KVM guest) is largely unchanged, except that we now don't set the flag for the vector CI load workaround on DD2.2 processors. This also removes a BUG_ON in the KVM code. BUG_ON is generally not useful in KVM guest entry/exit code since it is difficult to handle the resulting trap gracefully. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hmi.h | 4 + arch/powerpc/include/asm/reg.h | 5 +- arch/powerpc/kernel/mce.c | 142 +++++++++++++++++++++++++------- arch/powerpc/kvm/book3s_hv_ras.c | 8 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 +- 5 files changed, 131 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h index 85b7a1a21e22..9c14f7b5c46c 100644 --- a/arch/powerpc/include/asm/hmi.h +++ b/arch/powerpc/include/asm/hmi.h @@ -42,4 +42,8 @@ extern void wait_for_tb_resync(void); static inline void wait_for_subcore_guest_exit(void) { } static inline void wait_for_tb_resync(void) { } #endif + +struct pt_regs; +extern long hmi_handle_debugtrig(struct pt_regs *regs); + #endif /* __ASM_PPC64_HMI_H__ */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index b779f3ccd412..14e41b843952 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -432,8 +432,9 @@ #define SPRN_LPID 0x13F /* Logical Partition Identifier */ #endif #define LPID_RSVD 0x3ff /* Reserved LPID for partn switching */ -#define SPRN_HMER 0x150 /* Hardware m? error recovery */ -#define SPRN_HMEER 0x151 /* Hardware m? enable error recovery */ +#define SPRN_HMER 0x150 /* Hypervisor maintenance exception reg */ +#define HMER_DEBUG_TRIG (1ul << (63 - 17)) /* Debug trigger */ +#define SPRN_HMEER 0x151 /* Hyp maintenance exception enable reg */ #define SPRN_PCR 0x152 /* Processor compatibility register */ #define PCR_VEC_DIS (1ul << (63-0)) /* Vec. 
disable (bit NA since POWER8) */ #define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 742e4658c5dc..d2fecaec4fec 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -495,37 +495,123 @@ long machine_check_early(struct pt_regs *regs) return handled; } -long hmi_exception_realmode(struct pt_regs *regs) +/* Possible meanings for HMER_DEBUG_TRIG bit being set on POWER9 */ +static enum { + DTRIG_UNKNOWN, + DTRIG_VECTOR_CI, /* need to emulate vector CI load instr */ + DTRIG_SUSPEND_ESCAPE, /* need to escape from TM suspend mode */ +} hmer_debug_trig_function; + +static int init_debug_trig_function(void) { - __this_cpu_inc(irq_stat.hmi_exceptions); - -#ifdef CONFIG_PPC_BOOK3S_64 - /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ - if (pvr_version_is(PVR_POWER9)) { - unsigned long hmer = mfspr(SPRN_HMER); - - /* Do we have the debug bit set */ - if (hmer & PPC_BIT(17)) { - hmer &= ~PPC_BIT(17); - mtspr(SPRN_HMER, hmer); - - /* - * Now to avoid problems with soft-disable we - * only do the emulation if we are coming from - * user space - */ - if (user_mode(regs)) - local_paca->hmi_p9_special_emu = 1; - - /* - * Don't bother going to OPAL if that's the - * only relevant bit. - */ - if (!(hmer & mfspr(SPRN_HMEER))) - return local_paca->hmi_p9_special_emu; + int pvr; + struct device_node *cpun; + struct property *prop = NULL; + const char *str; + + /* First look in the device tree */ + preempt_disable(); + cpun = of_get_cpu_node(smp_processor_id(), NULL); + if (cpun) { + of_property_for_each_string(cpun, "ibm,hmi-special-triggers", + prop, str) { + if (strcmp(str, "bit17-vector-ci-load") == 0) + hmer_debug_trig_function = DTRIG_VECTOR_CI; + else if (strcmp(str, "bit17-tm-suspend-escape") == 0) + hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE; } + of_node_put(cpun); + } + preempt_enable(); + + /* If we found the property, don't look at PVR */ + if (prop) + goto out; + + pvr = mfspr(SPRN_PVR); + /* Check for POWER9 Nimbus (scale-out) */ + if ((PVR_VER(pvr) == PVR_POWER9) && (pvr & 0xe000) == 0) { + /* DD2.2 and later */ + if ((pvr & 0xfff) >= 0x202) + hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE; + /* DD2.0 and DD2.1 - used for vector CI load emulation */ + else if ((pvr & 0xfff) >= 0x200) + hmer_debug_trig_function = DTRIG_VECTOR_CI; + } + + out: + switch (hmer_debug_trig_function) { + case DTRIG_VECTOR_CI: + pr_debug("HMI debug trigger used for vector CI load\n"); + break; + case DTRIG_SUSPEND_ESCAPE: + pr_debug("HMI debug trigger used for TM suspend escape\n"); + break; + default: + break; } -#endif /* CONFIG_PPC_BOOK3S_64 */ + return 0; +} +__initcall(init_debug_trig_function); + +/* + * Handle HMIs that occur as a result of a debug trigger. 
+ * Return values: + * -1 means this is not a HMI cause that we know about + * 0 means no further handling is required + * 1 means further handling is required + */ +long hmi_handle_debugtrig(struct pt_regs *regs) +{ + unsigned long hmer = mfspr(SPRN_HMER); + long ret = 0; + + /* HMER_DEBUG_TRIG bit is used for various workarounds on P9 */ + if (!((hmer & HMER_DEBUG_TRIG) + && hmer_debug_trig_function != DTRIG_UNKNOWN)) + return -1; + + hmer &= ~HMER_DEBUG_TRIG; + /* HMER is a write-AND register */ + mtspr(SPRN_HMER, ~HMER_DEBUG_TRIG); + + switch (hmer_debug_trig_function) { + case DTRIG_VECTOR_CI: + /* + * Now to avoid problems with soft-disable we + * only do the emulation if we are coming from + * host user space + */ + if (regs && user_mode(regs)) + ret = local_paca->hmi_p9_special_emu = 1; + + break; + + default: + break; + } + + /* + * See if any other HMI causes remain to be handled + */ + if (hmer & mfspr(SPRN_HMEER)) + return -1; + + return ret; +} + +/* + * Return values: + */ +long hmi_exception_realmode(struct pt_regs *regs) +{ + int ret; + + __this_cpu_inc(irq_stat.hmi_exceptions); + + ret = hmi_handle_debugtrig(regs); + if (ret >= 0) + return ret; wait_for_subcore_guest_exit(); diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index c356f9a40b24..c296343d0dcc 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -268,17 +268,19 @@ static void kvmppc_tb_resync_done(void) * secondary threads to proceed. * - All secondary threads will eventually call opal hmi handler on * their exit path. + * + * Returns 1 if the timebase offset should be applied, 0 if not. */ long kvmppc_realmode_hmi_handler(void) { - int ptid = local_paca->kvm_hstate.ptid; bool resync_req; - /* This is only called on primary thread. */ - BUG_ON(ptid != 0); __this_cpu_inc(irq_stat.hmi_exceptions); + if (hmi_handle_debugtrig(NULL) >= 0) + return 1; + /* * By now primary thread has already completed guest->host * partition switch but haven't signaled secondaries yet. diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 2659844784b8..bd0b623335af 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1909,16 +1909,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) bne 27f bl kvmppc_realmode_hmi_handler nop + cmpdi r3, 0 li r12, BOOK3S_INTERRUPT_HMI /* - * At this point kvmppc_realmode_hmi_handler would have resync-ed - * the TB. Hence it is not required to subtract guest timebase - * offset from timebase. So, skip it. + * At this point kvmppc_realmode_hmi_handler may have resync-ed + * the TB, and if it has, we must not subtract the guest timebase + * offset from the timebase. So, skip it. * * Also, do not call kvmppc_subcore_exit_guest() because it has * been invoked as part of kvmppc_realmode_hmi_handler(). */ - b 30f + beq 30f 27: /* Subtract timebase offset from timebase */ -- cgit From c424c108233dc422a9a29ee833154006a5bdf9fc Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:11 +1100 Subject: KVM: PPC: Book3S HV: Add more info about XIVE queues in debugfs Add details about enabled queues and escalation interrupts. 
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xive.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index bf457843e032..6cff5bdfd6b7 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1794,6 +1794,7 @@ static int xive_debug_show(struct seq_file *m, void *private) kvm_for_each_vcpu(i, vcpu, kvm) { struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + unsigned int i; if (!xc) continue; @@ -1803,6 +1804,33 @@ static int xive_debug_show(struct seq_file *m, void *private) xc->server_num, xc->cppr, xc->hw_cppr, xc->mfrr, xc->pending, xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { + struct xive_q *q = &xc->queues[i]; + u32 i0, i1, idx; + + if (!q->qpage && !xc->esc_virq[i]) + continue; + + seq_printf(m, " [q%d]: ", i); + + if (q->qpage) { + idx = q->idx; + i0 = be32_to_cpup(q->qpage + idx); + idx = (idx + 1) & q->msk; + i1 = be32_to_cpup(q->qpage + idx); + seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1); + } + if (xc->esc_virq[i]) { + struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET); + seq_printf(m, "E:%c%c I(%d:%llx:%llx)", + (pq & XIVE_ESB_VAL_P) ? 'P' : 'p', + (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q', + xc->esc_virq[i], pq, xd->eoi_page); + seq_printf(m, "\n"); + } + } t_rm_h_xirr += xc->stat_rm_h_xirr; t_rm_h_ipoll += xc->stat_rm_h_ipoll; -- cgit From bf4159da4751ab8eea43ca6e7c49193dbce8398c Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:12 +1100 Subject: KVM: PPC: Book3S HV: Enable use of the new XIVE "single escalation" feature That feature, provided by Power9 DD2.0 and later, when supported by newer OPAL versions, allows us to sacrifice a queue (priority 7) in favor of merging all the escalation interrupts of the queues of a single VP into a single interrupt. This reduces the number of host interrupts used up by KVM guests especially when those guests use multiple priorities. It will also enable a future change to control the masking of the escalation interrupts more precisely to avoid spurious ones. 
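(As a rough illustration: a 16-vCPU guest with all eight priority queues provisioned could otherwise consume up to 16 * 8 = 128 host escalation interrupts, whereas with single escalation it needs only one per vCPU, i.e. 16.)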
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/opal-api.h | 1 + arch/powerpc/include/asm/xive.h | 3 ++- arch/powerpc/kvm/book3s_xive.c | 48 ++++++++++++++++++++++++------------- arch/powerpc/kvm/book3s_xive.h | 15 +++++------- arch/powerpc/sysdev/xive/native.c | 18 ++++++++++++-- 5 files changed, 57 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 233c7504b1f2..fc926743647e 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -1073,6 +1073,7 @@ enum { /* Flags for OPAL_XIVE_GET/SET_VP_INFO */ enum { OPAL_XIVE_VP_ENABLED = 0x00000001, + OPAL_XIVE_VP_SINGLE_ESCALATION = 0x00000002, }; /* "Any chip" replacement for chip ID for allocation functions */ diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index b619a5585cd6..e602903c3029 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -111,9 +111,10 @@ extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); extern void xive_native_sync_source(u32 hw_irq); extern bool is_xive_irq(struct irq_chip *chip); -extern int xive_native_enable_vp(u32 vp_id); +extern int xive_native_enable_vp(u32 vp_id, bool single_escalation); extern int xive_native_disable_vp(u32 vp_id); extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); +extern bool xive_native_has_single_escalation(void); #else diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 6cff5bdfd6b7..a102efeabf05 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -112,19 +112,21 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) return -EIO; } - /* - * Future improvement: start with them disabled - * and handle DD2 and later scheme of merged escalation - * interrupts - */ - name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", - vcpu->kvm->arch.lpid, xc->server_num, prio); + if (xc->xive->single_escalation) + name = kasprintf(GFP_KERNEL, "kvm-%d-%d", + vcpu->kvm->arch.lpid, xc->server_num); + else + name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d", + vcpu->kvm->arch.lpid, xc->server_num, prio); if (!name) { pr_err("Failed to allocate escalation irq name for queue %d of VCPU %d\n", prio, xc->server_num); rc = -ENOMEM; goto error; } + + pr_devel("Escalation %s irq %d (prio %d)\n", name, xc->esc_virq[prio], prio); + rc = request_irq(xc->esc_virq[prio], xive_esc_irq, IRQF_NO_THREAD, name, vcpu); if (rc) { @@ -191,12 +193,12 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) pr_devel("Provisioning prio... 
%d\n", prio); - /* Provision each VCPU and enable escalations */ + /* Provision each VCPU and enable escalations if needed */ kvm_for_each_vcpu(i, vcpu, kvm) { if (!vcpu->arch.xive_vcpu) continue; rc = xive_provision_queue(vcpu, prio); - if (rc == 0) + if (rc == 0 && !xive->single_escalation) xive_attach_escalation(vcpu, prio); if (rc) return rc; @@ -1081,6 +1083,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, /* Allocate IPI */ xc->vp_ipi = xive_native_alloc_irq(); if (!xc->vp_ipi) { + pr_err("Failed to allocate xive irq for VCPU IPI\n"); r = -EIO; goto bail; } @@ -1090,19 +1093,34 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (r) goto bail; + /* + * Enable the VP first as the single escalation mode will + * affect escalation interrupts numbering + */ + r = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + if (r) { + pr_err("Failed to enable VP in OPAL, err %d\n", r); + goto bail; + } + /* * Initialize queues. Initially we set them all for no queueing * and we enable escalation for queue 0 only which we'll use for * our mfrr change notifications. If the VCPU is hot-plugged, we - * do handle provisioning however. + * do handle provisioning however based on the existing "map" + * of enabled queues. */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { struct xive_q *q = &xc->queues[i]; + /* Single escalation, no queue 7 */ + if (i == 7 && xive->single_escalation) + break; + /* Is queue already enabled ? Provision it */ if (xive->qmap & (1 << i)) { r = xive_provision_queue(vcpu, i); - if (r == 0) + if (r == 0 && !xive->single_escalation) xive_attach_escalation(vcpu, i); if (r) goto bail; @@ -1122,11 +1140,6 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (r) goto bail; - /* Enable the VP */ - r = xive_native_enable_vp(xc->vp_id); - if (r) - goto bail; - /* Route the IPI */ r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI); if (!r) @@ -1473,6 +1486,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) pr_devel(" val=0x016%llx (server=0x%x, guest_prio=%d)\n", val, server, guest_prio); + /* * If the source doesn't already have an IPI, allocate * one and get the corresponding data @@ -1761,6 +1775,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) if (xive->vp_base == XIVE_INVALID_VP) ret = -ENOMEM; + xive->single_escalation = xive_native_has_single_escalation(); + if (ret) { kfree(xive); return ret; diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 6ba63f8e8a61..a08ae6fd4c51 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -120,6 +120,8 @@ struct kvmppc_xive { u32 q_order; u32 q_page_order; + /* Flags */ + u8 single_escalation; }; #define KVMPPC_XIVE_Q_COUNT 8 @@ -201,25 +203,20 @@ static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmpp * is as follow. * * Guest request for 0...6 are honored. Guest request for anything - * higher results in a priority of 7 being applied. - * - * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb - * in order to match AIX expectations + * higher results in a priority of 6 being applied. 
* * Similar mapping is done for CPPR values */ static inline u8 xive_prio_from_guest(u8 prio) { - if (prio == 0xff || prio < 8) + if (prio == 0xff || prio < 6) return prio; - return 7; + return 6; } static inline u8 xive_prio_to_guest(u8 prio) { - if (prio == 0xff || prio < 7) - return prio; - return 0xb; + return prio; } static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index ebc244b08d67..d22aeb0b69e1 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -42,6 +42,7 @@ static u32 xive_provision_chip_count; static u32 xive_queue_shift; static u32 xive_pool_vps = XIVE_INVALID_VP; static struct kmem_cache *xive_provision_cache; +static bool xive_has_single_esc; int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) { @@ -571,6 +572,10 @@ bool __init xive_native_init(void) break; } + /* Do we support single escalation */ + if (of_get_property(np, "single-escalation-support", NULL) != NULL) + xive_has_single_esc = true; + /* Configure Thread Management areas for KVM */ for_each_possible_cpu(cpu) kvmppc_set_xive_tima(cpu, r.start, tima); @@ -667,12 +672,15 @@ void xive_native_free_vp_block(u32 vp_base) } EXPORT_SYMBOL_GPL(xive_native_free_vp_block); -int xive_native_enable_vp(u32 vp_id) +int xive_native_enable_vp(u32 vp_id, bool single_escalation) { s64 rc; + u64 flags = OPAL_XIVE_VP_ENABLED; + if (single_escalation) + flags |= OPAL_XIVE_VP_SINGLE_ESCALATION; for (;;) { - rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0); + rc = opal_xive_set_vp_info(vp_id, flags, 0); if (rc != OPAL_BUSY) break; msleep(1); @@ -710,3 +718,9 @@ int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id) return 0; } EXPORT_SYMBOL_GPL(xive_native_get_vp_info); + +bool xive_native_has_single_escalation(void) +{ + return xive_has_single_esc; +} +EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); -- cgit From 2267ea7661798a42f0da648a2970e2a03f4bc370 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:13 +1100 Subject: KVM: PPC: Book3S HV: Don't use existing "prodded" flag for XIVE escalations The prodded flag is only cleared at the beginning of H_CEDE, so every time we have an escalation, we will cause the *next* H_CEDE to return immediately. Instead use a dedicated "irq_pending" flag to indicate that a guest interrupt is pending for the VCPU. We don't reuse the existing exception bitmap so as to avoid expensive atomic ops. 
Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 10 ++++++++++ arch/powerpc/kvm/book3s_xive.c | 3 +-- 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3aa5b577cd60..bfe51356af5e 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -709,6 +709,7 @@ struct kvm_vcpu_arch { u8 ceded; u8 prodded; u8 doorbell_request; + u8 irq_pending; /* Used by XIVE to signal pending guest irqs */ u32 last_inst; struct swait_queue_head *wqp; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6b958414b4e0..825089cf3e23 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -514,6 +514,7 @@ int main(void) OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); + OFFSET(VCPU_IRQ_PENDING, kvm_vcpu, arch.irq_pending); OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 76cf48051eb3..e5f81fc108e0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2999,7 +2999,7 @@ static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu) { if (!xive_enabled()) return false; - return vcpu->arch.xive_saved_state.pipr < + return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr < vcpu->arch.xive_saved_state.cppr; } #else diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7daf21be33d0..34dbab7deb39 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1035,6 +1035,16 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) li r9, 1 stw r9, VCPU_XIVE_PUSHED(r4) eieio + + /* + * We clear the irq_pending flag. There is a small chance of a + * race vs. the escalation interrupt happening on another + * processor setting it again, but the only consequence is to + * cause a spurrious wakeup on the next H_CEDE which is not an + * issue. 
+ */ + li r0,0 + stb r0, VCPU_IRQ_PENDING(r4) no_xive: #endif /* CONFIG_KVM_XICS */ diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index a102efeabf05..eef9ccafdc09 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -84,8 +84,7 @@ static irqreturn_t xive_esc_irq(int irq, void *data) { struct kvm_vcpu *vcpu = data; - /* We use the existing H_PROD mechanism to wake up the target */ - vcpu->arch.prodded = 1; + vcpu->arch.irq_pending = 1; smp_mb(); if (vcpu->arch.ceded) kvmppc_fast_vcpu_kick(vcpu); -- cgit From 2662efd050953824de5c9b24449d6b5b342db10b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:14 +1100 Subject: KVM: PPC: Book3S HV: Check DR not IR to choose real vs virt mode MMIOs Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 34dbab7deb39..948f21cf84d5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1464,7 +1464,7 @@ mc_cont: li r7, TM_SPC_PULL_OS_CTX li r6, TM_QW1_OS mfmsr r0 - andi. r0, r0, MSR_IR /* in real mode? */ + andi. r0, r0, MSR_DR /* in real mode? */ beq 2f ld r10, HSTATE_XIVE_TIMA_VIRT(r13) cmpldi cr0, r10, 0 -- cgit From 35c2405efc0142860c4b698f4c6331567c4ca1ef Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:15 +1100 Subject: KVM: PPC: Book3S HV: Make xive_pushed a byte, not a word Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index bfe51356af5e..0c44fa67608d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -739,7 +739,7 @@ struct kvm_vcpu_arch { struct kvmppc_icp *icp; /* XICS presentation controller */ struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ - u32 xive_pushed; /* Is the VP pushed on the physical CPU ?
*/ union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */ #endif diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 948f21cf84d5..a7f429bc6de0 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1033,7 +1033,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) li r9, TM_QW1_OS + TM_WORD2 stwcix r11,r9,r10 li r9, 1 - stw r9, VCPU_XIVE_PUSHED(r4) + stb r9, VCPU_XIVE_PUSHED(r4) eieio /* @@ -1458,7 +1458,7 @@ mc_cont: #endif #ifdef CONFIG_KVM_XICS /* We are exiting, pull the VP from the XIVE */ - lwz r0, VCPU_XIVE_PUSHED(r9) + lbz r0, VCPU_XIVE_PUSHED(r9) cmpwi cr0, r0, 0 beq 1f li r7, TM_SPC_PULL_OS_CTX @@ -1487,7 +1487,7 @@ mc_cont: /* Fixup some of the state for the next load */ li r10, 0 li r0, 0xff - stw r10, VCPU_XIVE_PUSHED(r9) + stb r10, VCPU_XIVE_PUSHED(r9) stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9) stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9) eieio -- cgit From 9b9b13a6d1537ddc4caccd6f1c41b78edbc08437 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 12 Jan 2018 13:37:16 +1100 Subject: KVM: PPC: Book3S HV: Keep XIVE escalation interrupt masked unless ceded This works on top of the single escalation support. When in single escalation, with this change, we will keep the escalation interrupt disabled unless the VCPU is in H_CEDE (idle). In any other case, we know the VCPU will be rescheduled and thus there is no need to take escalation interrupts in the host whenever a guest interrupt fires. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 3 ++ arch/powerpc/kernel/asm-offsets.c | 3 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 62 ++++++++++++++++++++++++++++++++- arch/powerpc/kvm/book3s_xive.c | 30 ++++++++++++++++ 4 files changed, 97 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0c44fa67608d..fef8133becc8 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -740,7 +740,10 @@ struct kvm_vcpu_arch { struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */ __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */ u8 xive_pushed; /* Is the VP pushed on the physical CPU ? */ + u8 xive_esc_on; /* Is the escalation irq enabled ? 
*/ union xive_tma_w01 xive_saved_state; /* W0..1 of XIVE thread state */ + u64 xive_esc_raddr; /* Escalation interrupt ESB real addr */ + u64 xive_esc_vaddr; /* Escalation interrupt ESB virt addr */ #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 825089cf3e23..1672dffd94e2 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -734,6 +734,9 @@ int main(void) DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu, arch.xive_cam_word)); DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed)); + DEFINE(VCPU_XIVE_ESC_ON, offsetof(struct kvm_vcpu, arch.xive_esc_on)); + DEFINE(VCPU_XIVE_ESC_RADDR, offsetof(struct kvm_vcpu, arch.xive_esc_raddr)); + DEFINE(VCPU_XIVE_ESC_VADDR, offsetof(struct kvm_vcpu, arch.xive_esc_vaddr)); #endif #ifdef CONFIG_KVM_EXIT_TIMING diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index a7f429bc6de0..a7a20b85d8eb 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1045,6 +1045,41 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) */ li r0,0 stb r0, VCPU_IRQ_PENDING(r4) + + /* + * In single escalation mode, if the escalation interrupt is + * on, we mask it. + */ + lbz r0, VCPU_XIVE_ESC_ON(r4) + cmpwi r0,0 + beq 1f + ld r10, VCPU_XIVE_ESC_RADDR(r4) + li r9, XIVE_ESB_SET_PQ_01 + ldcix r0, r10, r9 + sync + + /* We have a possible subtle race here: The escalation interrupt might + * have fired and be on its way to the host queue while we mask it, + * and if we unmask it early enough (re-cede right away), there is + * a theoretical possibility that it fires again, thus landing in the + * target queue more than once which is a big no-no. + * + * Fortunately, solving this is rather easy. If the above load setting + * PQ to 01 returns a previous value where P is set, then we know the + * escalation interrupt is somewhere on its way to the host. In that + * case we simply don't clear the xive_esc_on flag below. It will be + * eventually cleared by the handler for the escalation interrupt. + * + * Then, when doing a cede, we check that flag again before re-enabling + * the escalation interrupt, and if set, we abort the cede. + */ + andi. r0, r0, XIVE_ESB_VAL_P + bne- 1f + + /* Now P is 0, we can clear the flag */ + li r0, 0 + stb r0, VCPU_XIVE_ESC_ON(r4) +1: no_xive: #endif /* CONFIG_KVM_XICS */ @@ -2756,7 +2791,32 @@ kvm_cede_prodded: /* we've ceded but we want to give control to the host */ kvm_cede_exit: ld r9, HSTATE_KVM_VCPU(r13) - b guest_exit_cont +#ifdef CONFIG_KVM_XICS + /* Abort if we still have a pending escalation */ + lbz r5, VCPU_XIVE_ESC_ON(r9) + cmpwi r5, 0 + beq 1f + li r0, 0 + stb r0, VCPU_CEDED(r9) +1: /* Enable XIVE escalation */ + li r5, XIVE_ESB_SET_PQ_00 + mfmsr r0 + andi. r0, r0, MSR_DR /* in real mode?
*/ + beq 1f + ld r10, VCPU_XIVE_ESC_VADDR(r9) + cmpdi r10, 0 + beq 3f + ldx r0, r10, r5 + b 2f +1: ld r10, VCPU_XIVE_ESC_RADDR(r9) + cmpdi r10, 0 + beq 3f + ldcix r0, r10, r5 +2: sync + li r0, 1 + stb r0, VCPU_XIVE_ESC_ON(r9) +#endif /* CONFIG_KVM_XICS */ +3: b guest_exit_cont /* Try to handle a machine check in real mode */ machine_check_realmode: diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index eef9ccafdc09..7a047bc88f11 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -89,6 +89,17 @@ static irqreturn_t xive_esc_irq(int irq, void *data) if (vcpu->arch.ceded) kvmppc_fast_vcpu_kick(vcpu); + /* Since we have the no-EOI flag, the interrupt is effectively + * disabled now. Clearing xive_esc_on means we won't bother + * doing so on the next entry. + * + * This also allows the entry code to know that if a PQ combination + * of 10 is observed while xive_esc_on is true, it means the queue + * contains an unprocessed escalation interrupt. We don't make use of + * that knowledge today but might (see comment in book3s_hv_rmhandlers.S) + */ + vcpu->arch.xive_esc_on = false; + return IRQ_HANDLED; } @@ -134,6 +145,25 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio) goto error; } xc->esc_virq_names[prio] = name; + + /* In single escalation mode, we grab the ESB MMIO of the + * interrupt and mask it. Also populate the VCPU v/raddr + * of the ESB page for use by asm entry/exit code. Finally + * set the XIVE_IRQ_NO_EOI flag which will prevent the + * core code from performing an EOI on the escalation + * interrupt, thus leaving it effectively masked after + * it fires once. + */ + if (xc->xive->single_escalation) { + struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]); + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01); + vcpu->arch.xive_esc_raddr = xd->eoi_page; + vcpu->arch.xive_esc_vaddr = (__force u64)xd->eoi_mmio; + xd->flags |= XIVE_IRQ_NO_EOI; + } + return 0; error: irq_dispose_mapping(xc->esc_virq[prio]); -- cgit
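Editorial note: the PQ-bit handling added by the last patch above is easiest to follow as a small state machine, so here is a standalone C sketch of it. This is illustrative only and not kernel code: struct vcpu_model, esb_load(), guest_entry(), cede_exit() and the simplified ESB_P/ESB_Q encodings are invented stand-ins for the rmhandlers.S assembly and the XIVE ESB MMIO loads, and the host-side escalation handler (xive_esc_irq) is deliberately left out, which is exactly the "escalation on its way to the host" window the comments above describe.

#include <stdbool.h>
#include <stdio.h>

/* Simplified PQ encodings; stand-ins, not the kernel's xive-regs.h values. */
#define ESB_P 0x2
#define ESB_Q 0x1

struct vcpu_model {
	unsigned int esb_pq;	/* PQ state of the escalation interrupt's ESB */
	bool xive_esc_on;	/* mirror of vcpu->arch.xive_esc_on */
	bool ceded;
};

/* Model of an ESB "special" load: install new_pq, return the previous PQ. */
static unsigned int esb_load(struct vcpu_model *v, unsigned int new_pq)
{
	unsigned int old = v->esb_pq;

	v->esb_pq = new_pq;
	return old;
}

/* Guest entry: mask the escalation (PQ <- 01) if it is currently enabled. */
static void guest_entry(struct vcpu_model *v)
{
	unsigned int old;

	if (!v->xive_esc_on)
		return;
	old = esb_load(v, ESB_Q);	/* PQ <- 01 */
	/*
	 * If P was already set, an escalation is on its way to the host;
	 * leave xive_esc_on set so the cede path knows about it.
	 */
	if (!(old & ESB_P))
		v->xive_esc_on = false;
}

/*
 * H_CEDE exit path: always re-enable the escalation (PQ <- 00), but abort
 * the cede (clear "ceded") if an escalation was still marked pending.
 */
static bool cede_exit(struct vcpu_model *v)
{
	bool abort = v->xive_esc_on;

	v->ceded = !abort;
	esb_load(v, 0);			/* PQ <- 00, re-enable */
	v->xive_esc_on = true;
	return !abort;
}

int main(void)
{
	struct vcpu_model v = { .esb_pq = 0, .xive_esc_on = true };

	guest_entry(&v);	/* no escalation in flight: flag is cleared */
	printf("cede proceeds: %d\n", cede_exit(&v));	/* 1 */

	v.esb_pq |= ESB_P;	/* escalation fires while in the guest */
	guest_entry(&v);	/* P observed: flag stays set */
	printf("cede proceeds: %d\n", cede_exit(&v));	/* 0: cede aborted */
	return 0;
}

As the comments in the patch explain, keeping the xive_esc_on flag in the vcpu (rather than re-reading PQ at cede time) lets the entry path record that an escalation is already in flight when the masking load returns P set, and lets the cede path abort cheaply on that flag; only the escalation handler clears it once the interrupt has actually been processed.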