From 857b99e1405e219e741c494753fc78871d740d2b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 1 Sep 2017 16:17:27 +1000
Subject: KVM: PPC: Book3S HV: Handle unexpected interrupts better

At present, if an interrupt (i.e. an exception or trap) occurs in the
code where KVM is switching the MMU to or from guest context, we jump
to kvmppc_bad_host_intr, where we simply spin with interrupts disabled.
In this situation, it is hard to debug what happened because we get no
indication as to which interrupt occurred or where.  Typically we get
a cascade of stall and soft lockup warnings from other CPUs.

In order to get more information for debugging, this adds code to
create a stack frame on the emergency stack and save register values
to it.  We start half-way down the emergency stack in order to give
ourselves some chance of being able to do a stack trace on secondary
threads that are already on the emergency stack.

On POWER7 or POWER8, we then just spin, as before, because we don't
know what state the MMU context is in or what other threads are doing,
and we can't switch back to host context without coordinating with
other threads.  On POWER9 we can do better; there we load up the host
MMU context and jump to C code, which prints an oops message to the
console and panics.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_builtin.c    |   6 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 132 +++++++++++++++++++++++++++++++-
 2 files changed, 137 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 90644db9d38e..2791922e97e2 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -601,3 +601,9 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 		return xics_rm_h_eoi(vcpu, xirr);
 }
 #endif /* CONFIG_KVM_XICS */
+
+void kvmppc_bad_interrupt(struct pt_regs *regs)
+{
+	die("Bad interrupt in KVM entry/exit code", regs, SIGABRT);
+	panic("Bad KVM trap");
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index ec69fa45d5a2..985bdc0ba1c1 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -31,6 +31,7 @@
 #include <asm/tm.h>
 #include <asm/opal.h>
 #include <asm/xive-regs.h>
+#include <asm/thread_info.h>
 
 /* Sign-extend HDEC if not on POWER9 */
 #define EXTEND_HDEC(reg)			\
@@ -3134,10 +3135,139 @@ kvmppc_restore_tm:
 /*
  * We come here if we get any exception or interrupt while we are
  * executing host real mode code while in guest MMU context.
- * For now just spin, but we should do something better.
+ * r12 is (CR << 32) | vector
+ * r13 points to our PACA
+ * r12 is saved in HSTATE_SCRATCH0(r13)
+ * ctr is saved in HSTATE_SCRATCH1(r13) if RELOCATABLE
+ * r9 is saved in HSTATE_SCRATCH2(r13)
+ * r13 is saved in HSPRG1
+ * cfar is saved in HSTATE_CFAR(r13)
+ * ppr is saved in HSTATE_PPR(r13)
  */
 kvmppc_bad_host_intr:
+	/*
+	 * Switch to the emergency stack, but start half-way down in
+	 * case we were already on it.
+	 */
+	mr	r9, r1
+	std	r1, PACAR1(r13)
+	ld	r1, PACAEMERGSP(r13)
+	subi	r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE
+	std	r9, 0(r1)
+	std	r0, GPR0(r1)
+	std	r9, GPR1(r1)
+	std	r2, GPR2(r1)
+	SAVE_4GPRS(3, r1)
+	SAVE_2GPRS(7, r1)
+	srdi	r0, r12, 32
+	clrldi	r12, r12, 32
+	std	r0, _CCR(r1)
+	std	r12, _TRAP(r1)
+	andi.	r0, r12, 2
+	beq	1f
+	mfspr	r3, SPRN_HSRR0
+	mfspr	r4, SPRN_HSRR1
+	mfspr	r5, SPRN_HDAR
+	mfspr	r6, SPRN_HDSISR
+	b	2f
+1:	mfspr	r3, SPRN_SRR0
+	mfspr	r4, SPRN_SRR1
+	mfspr	r5, SPRN_DAR
+	mfspr	r6, SPRN_DSISR
+2:	std	r3, _NIP(r1)
+	std	r4, _MSR(r1)
+	std	r5, _DAR(r1)
+	std	r6, _DSISR(r1)
+	ld	r9, HSTATE_SCRATCH2(r13)
+	ld	r12, HSTATE_SCRATCH0(r13)
+	GET_SCRATCH0(r0)
+	SAVE_4GPRS(9, r1)
+	std	r0, GPR13(r1)
+	SAVE_NVGPRS(r1)
+	ld	r5, HSTATE_CFAR(r13)
+	std	r5, ORIG_GPR3(r1)
+	mflr	r3
+#ifdef CONFIG_RELOCATABLE
+	ld	r4, HSTATE_SCRATCH1(r13)
+#else
+	mfctr	r4
+#endif
+	mfxer	r5
+	lbz	r6, PACASOFTIRQEN(r13)
+	std	r3, _LINK(r1)
+	std	r4, _CTR(r1)
+	std	r5, _XER(r1)
+	std	r6, SOFTE(r1)
+	ld	r2, PACATOC(r13)
+	LOAD_REG_IMMEDIATE(3, 0x7265677368657265)
+	std	r3, STACK_FRAME_OVERHEAD-16(r1)
+
+	/*
+	 * On POWER9 do a minimal restore of the MMU and call C code,
+	 * which will print a message and panic.
+	 * XXX On POWER7 and POWER8, we just spin here since we don't
+	 * know what the other threads are doing (and we don't want to
+	 * coordinate with them) - but at least we now have register state
+	 * in memory that we might be able to look at from another CPU.
+	 */
+BEGIN_FTR_SECTION
 	b	.
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	ld	r10, VCPU_KVM(r9)
+
+	li	r0, 0
+	mtspr	SPRN_AMR, r0
+	mtspr	SPRN_IAMR, r0
+	mtspr	SPRN_CIABR, r0
+	mtspr	SPRN_DAWRX, r0
+
+	/* Flush the ERAT on radix P9 DD1 guest exit */
+BEGIN_FTR_SECTION
+	PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
+
+BEGIN_MMU_FTR_SECTION
+	b	4f
+END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
+
+	slbmte	r0, r0
+	slbia
+	ptesync
+	ld	r8, PACA_SLBSHADOWPTR(r13)
+	.rept	SLB_NUM_BOLTED
+	li	r3, SLBSHADOW_SAVEAREA
+	LDX_BE	r5, r8, r3
+	addi	r3, r3, 8
+	LDX_BE	r6, r8, r3
+	andis.	r7, r5, SLB_ESID_V@h
+	beq	3f
+	slbmte	r6, r5
+3:	addi	r8, r8, 16
+	.endr
+
+4:	lwz	r7, KVM_HOST_LPID(r10)
+	mtspr	SPRN_LPID, r7
+	mtspr	SPRN_PID, r0
+	ld	r8, KVM_HOST_LPCR(r10)
+	mtspr	SPRN_LPCR, r8
+	isync
+	li	r0, KVM_GUEST_MODE_NONE
+	stb	r0, HSTATE_IN_GUEST(r13)
+
+	/*
+	 * Turn on the MMU and jump to C code
+	 */
+	bcl	20, 31, .+4
+5:	mflr	r3
+	addi	r3, r3, 9f - 5b
+	ld	r4, PACAKMSR(r13)
+	mtspr	SPRN_SRR0, r3
+	mtspr	SPRN_SRR1, r4
+	rfid
+9:	addi	r3, r1, STACK_FRAME_OVERHEAD
+	bl	kvmppc_bad_interrupt
+	b	9b
 
 /*
  * This mimics the MSR transition on IRQ delivery.  The new guest MSR is taken
-- 
cgit 


From 4bb817ed833c070e5b5ee0482f74e2a5e1a12c4c Mon Sep 17 00:00:00 2001
From: Thomas Meyer <thomas@m3y3r.de>
Date: Sun, 3 Sep 2017 14:19:31 +0200
Subject: KVM: PPC: Book3S HV: Use ARRAY_SIZE macro

Use ARRAY_SIZE macro, rather than explicitly coding some variant of it
yourself.
Found with: find -type f -name "*.c" -o -name "*.h" | xargs perl -p -i -e
's/\bsizeof\s*\(\s*(\w+)\s*\)\s*\ /\s*sizeof\s*\(\s*\1\s*\[\s*0\s*\]\s*\)
/ARRAY_SIZE(\1)/g' and manual check/verification.

Signed-off-by: Thomas Meyer <thomas@m3y3r.de>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 73bf1ebfa78f..362fbd972f76 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/kernel.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/preempt.h>
@@ -1772,7 +1773,7 @@ static struct debugfs_timings_element {
 	{"cede",	offsetof(struct kvm_vcpu, arch.cede_time)},
 };
 
-#define N_TIMINGS	(sizeof(timings) / sizeof(timings[0]))
+#define N_TIMINGS	(ARRAY_SIZE(timings))
 
 struct debugfs_timings_state {
 	struct kvm_vcpu	*vcpu;
-- 
cgit 


From 4bdcb7016f95dd7668cd2f55f494ba449443e8d7 Mon Sep 17 00:00:00 2001
From: Thomas Meyer <thomas@m3y3r.de>
Date: Thu, 21 Sep 2017 00:29:36 +0200
Subject: KVM: PPC: BookE: Use vma_pages function

Use vma_pages function on vma object instead of explicit computation.
Found by coccinelle spatch "api/vma_pages.cocci"

Signed-off-by: Thomas Meyer <thomas@m3y3r.de>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/e500_mmu_host.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index c6c734424c70..423b21393bc9 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -377,7 +377,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 			start = vma->vm_pgoff;
 			end = start +
-			      ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
+			      vma_pages(vma);
 
 			pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
 
-- 
cgit 


From 9c7e53dc00a543478b46c6a90694a41be69a089d Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Thu, 5 Oct 2017 13:16:51 +0200
Subject: KVM: PPC: Book3S HV: Delete an error message for a failed memory
 allocation in kvmppc_allocate_hpt()

Omit an extra message for a memory allocation failure in this function.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 7c62967d672c..7df89d58ee91 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -106,7 +106,6 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
 	/* Allocate reverse map array */
 	rev = vmalloc(sizeof(struct revmap_entry) * npte);
 	if (!rev) {
-		pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
 		if (cma)
 			kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
 		else
-- 
cgit 


From 3f2bb76433bb4348cd9076e4d305a040ff2012a9 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Wed, 11 Oct 2017 16:01:08 +1100
Subject: KVM: PPC: Book3S PR: Enable in-kernel TCE handlers for PR KVM

The handlers support PR KVM from the day one; however the PR KVM's
enable/disable hcalls handler missed these ones.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_pr_papr.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index 8a4205fa774f..dae3be5ff42b 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -419,6 +419,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
 	case H_PROTECT:
 	case H_BULK_REMOVE:
 	case H_PUT_TCE:
+	case H_PUT_TCE_INDIRECT:
+	case H_STUFF_TCE:
 	case H_CEDE:
 	case H_LOGICAL_CI_LOAD:
 	case H_LOGICAL_CI_STORE:
-- 
cgit 


From 891f1ebf659edd24bba93182a8de0ba082dcff72 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 13 Sep 2017 16:29:22 +1000
Subject: KVM: PPC: Book3S HV: Explicitly disable HPT operations on radix
 guests

This adds code to make sure that we don't try to access the
non-existent HPT for a radix guest using the htab file for the VM
in debugfs, a file descriptor obtained using the KVM_PPC_GET_HTAB_FD
ioctl, or via the KVM_PPC_RESIZE_HPT_{PREPARE,COMMIT} ioctls.

At present nothing bad happens if userspace does access these
interfaces on a radix guest, mostly because kvmppc_hpt_npte()
gives 0 for a radix guest, which in turn is because 1 << -4
comes out as 0 on POWER processors.  However, that relies on
undefined behaviour, so it is better to be explicit about not
accessing the HPT for a radix guest.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 7df89d58ee91..624b01175b79 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1454,7 +1454,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
 	struct kvm_resize_hpt *resize;
 	int ret;
 
-	if (flags != 0)
+	if (flags != 0 || kvm_is_radix(kvm))
 		return -EINVAL;
 
 	if (shift && ((shift < 18) || (shift > 46)))
@@ -1520,7 +1520,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
 	struct kvm_resize_hpt *resize;
 	long ret;
 
-	if (flags != 0)
+	if (flags != 0 || kvm_is_radix(kvm))
 		return -EINVAL;
 
 	if (shift && ((shift < 18) || (shift > 46)))
@@ -1706,6 +1706,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
 
 	if (!access_ok(VERIFY_WRITE, buf, count))
 		return -EFAULT;
+	if (kvm_is_radix(kvm))
+		return 0;
 
 	first_pass = ctx->first_pass;
 	flags = ctx->flags;
@@ -1803,6 +1805,8 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 
 	if (!access_ok(VERIFY_READ, buf, count))
 		return -EFAULT;
+	if (kvm_is_radix(kvm))
+		return -EINVAL;
 
 	/* lock out vcpus from running while we're doing this */
 	mutex_lock(&kvm->lock);
@@ -2001,6 +2005,10 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
 	struct kvm *kvm;
 	__be64 *hptp;
 
+	kvm = p->kvm;
+	if (kvm_is_radix(kvm))
+		return 0;
+
 	ret = mutex_lock_interruptible(&p->mutex);
 	if (ret)
 		return ret;
@@ -2023,7 +2031,6 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
 		}
 	}
 
-	kvm = p->kvm;
 	i = p->hpt_index;
 	hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
 	for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
-- 
cgit 


From 00bb6ae5006205e041ce9784c819460562351d47 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 26 Oct 2017 17:00:22 +1100
Subject: KVM: PPC: Book3S HV: Don't call real-mode XICS hypercall handlers if
 not enabled

When running a guest on a POWER9 system with the in-kernel XICS
emulation disabled (for example by running QEMU with the parameter
"-machine pseries,kernel_irqchip=off"), the kernel does not pass
the XICS-related hypercalls such as H_CPPR up to userspace for
emulation there as it should.

The reason for this is that the real-mode handlers for these
hypercalls don't check whether a XICS device has been instantiated
before calling the xics-on-xive code.  That code doesn't check
either, leading to potential NULL pointer dereferences because
vcpu->arch.xive_vcpu is NULL.  Those dereferences won't cause an
exception in real mode but will lead to kernel memory corruption.

This fixes it by adding kvmppc_xics_enabled() checks before calling
the XICS functions.

Cc: stable@vger.kernel.org # v4.11+
Fixes: 5af50993850a ("KVM: PPC: Book3S HV: Native usage of the XIVE interrupt controller")
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_builtin.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 2791922e97e2..e38cc2df6d2a 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -529,6 +529,8 @@ static inline bool is_rm(void)
 
 unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_xirr(vcpu);
@@ -541,6 +543,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
 
 unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	vcpu->arch.gpr[5] = get_tb();
 	if (xive_enabled()) {
 		if (is_rm())
@@ -554,6 +558,8 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
 
 unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_ipoll(vcpu, server);
@@ -567,6 +573,8 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
 int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 		    unsigned long mfrr)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_ipi(vcpu, server, mfrr);
@@ -579,6 +587,8 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 
 int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_cppr(vcpu, cppr);
@@ -591,6 +601,8 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 
 int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 {
+	if (!kvmppc_xics_enabled(vcpu))
+		return H_TOO_HARD;
 	if (xive_enabled()) {
 		if (is_rm())
 			return xive_rm_h_eoi(vcpu, xirr);
-- 
cgit 


From f4093ee9d0bd8299ccdb26120a35bfe293fe95e7 Mon Sep 17 00:00:00 2001
From: Greg Kurz <groug@kaod.org>
Date: Mon, 16 Oct 2017 12:29:44 +0200
Subject: KVM: PPC: Book3S PR: Only install valid SLBs during KVM_SET_SREGS

Userland passes an array of 64 SLB descriptors to KVM_SET_SREGS,
some of which are valid (ie, SLB_ESID_V is set) and the rest are
likely all-zeroes (with QEMU at least).

Each of them is then passed to kvmppc_mmu_book3s_64_slbmte(), which
assumes to find the SLB index in the 3 lower bits of its rb argument.
When passed zeroed arguments, it happily overwrites the 0th SLB entry
with zeroes. This is exactly what happens while doing live migration
with QEMU when the destination pushes the incoming SLB descriptors to
KVM PR. When reloading the SLBs at the next synchronization, QEMU first
clears its SLB array and only restore valid ones, but the 0th one is
now gone and we cannot access the corresponding memory anymore:

(qemu) x/x $pc
c0000000000b742c: Cannot access memory

To avoid this, let's filter out non-valid SLB entries. While here, we
also force a full SLB flush before installing new entries. Since SLB
is for 64-bit only, we now build this path conditionally to avoid a
build break on 32-bit, which doesn't define SLB_ESID_V.

Signed-off-by: Greg Kurz <groug@kaod.org>
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_pr.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 69a09444d46e..d0dc8624198f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1326,12 +1326,22 @@ static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu,
 	kvmppc_set_pvr_pr(vcpu, sregs->pvr);
 
 	vcpu3s->sdr1 = sregs->u.s.sdr1;
+#ifdef CONFIG_PPC_BOOK3S_64
 	if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
+		/* Flush all SLB entries */
+		vcpu->arch.mmu.slbmte(vcpu, 0, 0);
+		vcpu->arch.mmu.slbia(vcpu);
+
 		for (i = 0; i < 64; i++) {
-			vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
-						    sregs->u.s.ppc64.slb[i].slbe);
+			u64 rb = sregs->u.s.ppc64.slb[i].slbe;
+			u64 rs = sregs->u.s.ppc64.slb[i].slbv;
+
+			if (rb & SLB_ESID_V)
+				vcpu->arch.mmu.slbmte(vcpu, rs, rb);
 		}
-	} else {
+	} else
+#endif
+	{
 		for (i = 0; i < 16; i++) {
 			vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
 		}
-- 
cgit 


From 93897a1f4b1a96171b4a445207a371df1b0d4601 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Sat, 21 Oct 2017 14:32:30 +1000
Subject: KVM: PPC: Book3S: Fix gas warning due to using r0 as immediate 0

This fixes the message:

arch/powerpc/kvm/book3s_segment.S: Assembler messages:
arch/powerpc/kvm/book3s_segment.S:330: Warning: invalid register expression

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_slb.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 3589c4e3d49b..688722acd692 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -113,7 +113,7 @@ slb_do_enter:
 
 	/* Remove all SLB entries that are in use. */
 
-	li	r0, r0
+	li	r0, 0
 	slbmte	r0, r0
 	slbia
 
-- 
cgit 


From 8dc6cca556e4126f77b71cf8e0c45ccc78d1d213 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 11 Sep 2017 15:29:45 +1000
Subject: KVM: PPC: Book3S HV: Don't rely on host's page size information

This removes the dependence of KVM on the mmu_psize_defs array (which
stores information about hardware support for various page sizes) and
the things derived from it, chiefly hpte_page_sizes[], hpte_page_size(),
hpte_actual_page_size() and get_sllp_encoding().  We also no longer
rely on the mmu_slb_size variable or the MMU_FTR_1T_SEGMENTS feature
bit.

The reason for doing this is so we can support a HPT guest on a radix
host.  In a radix host, the mmu_psize_defs array contains information
about page sizes supported by the MMU in radix mode rather than the
page sizes supported by the MMU in HPT mode.  Similarly, mmu_slb_size
and the MMU_FTR_1T_SEGMENTS bit are not set.

Instead we hard-code knowledge of the behaviour of the HPT MMU in the
POWER7, POWER8 and POWER9 processors (which are the only processors
supported by HV KVM) - specifically the encoding of the LP fields in
the HPT and SLB entries, and the fact that they have 32 SLB entries
and support 1TB segments.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_book3s_64.h | 116 +++++++++++++++++++++++++------
 arch/powerpc/kvm/book3s_64_mmu_hv.c      |  13 ++--
 arch/powerpc/kvm/book3s_hv.c             |  39 +++++------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      |  11 +--
 4 files changed, 126 insertions(+), 53 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d55c7f881ce7..b21936c7b190 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -107,18 +107,96 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
 	hpte[0] = cpu_to_be64(hpte_v);
 }
 
+/*
+ * These functions encode knowledge of the POWER7/8/9 hardware
+ * interpretations of the HPTE LP (large page size) field.
+ */
+static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l)
+{
+	unsigned int lphi;
+
+	if (!(h & HPTE_V_LARGE))
+		return 12;	/* 4kB */
+	lphi = (l >> 16) & 0xf;
+	switch ((l >> 12) & 0xf) {
+	case 0:
+		return !lphi ? 24 : -1;		/* 16MB */
+		break;
+	case 1:
+		return 16;			/* 64kB */
+		break;
+	case 3:
+		return !lphi ? 34 : -1;		/* 16GB */
+		break;
+	case 7:
+		return (16 << 8) + 12;		/* 64kB in 4kB */
+		break;
+	case 8:
+		if (!lphi)
+			return (24 << 8) + 16;	/* 16MB in 64kkB */
+		if (lphi == 3)
+			return (24 << 8) + 12;	/* 16MB in 4kB */
+		break;
+	}
+	return -1;
+}
+
+static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l)
+{
+	return kvmppc_hpte_page_shifts(h, l) & 0xff;
+}
+
+static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l)
+{
+	int tmp = kvmppc_hpte_page_shifts(h, l);
+
+	if (tmp >= 0x100)
+		tmp >>= 8;
+	return tmp;
+}
+
+static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r)
+{
+	return 1ul << kvmppc_hpte_actual_page_shift(v, r);
+}
+
+static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift)
+{
+	switch (base_shift) {
+	case 12:
+		switch (actual_shift) {
+		case 12:
+			return 0;
+		case 16:
+			return 7;
+		case 24:
+			return 0x38;
+		}
+		break;
+	case 16:
+		switch (actual_shift) {
+		case 16:
+			return 1;
+		case 24:
+			return 8;
+		}
+		break;
+	case 24:
+		return 0;
+	}
+	return -1;
+}
+
 static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 					     unsigned long pte_index)
 {
-	int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
-	unsigned int penc;
+	int a_pgshift, b_pgshift;
 	unsigned long rb = 0, va_low, sllp;
-	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
 
-	if (v & HPTE_V_LARGE) {
-		i = hpte_page_sizes[lp];
-		b_psize = i & 0xf;
-		a_psize = i >> 4;
+	b_pgshift = a_pgshift = kvmppc_hpte_page_shifts(v, r);
+	if (a_pgshift >= 0x100) {
+		b_pgshift &= 0xff;
+		a_pgshift >>= 8;
 	}
 
 	/*
@@ -152,37 +230,33 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 		va_low ^= v >> (SID_SHIFT_1T - 16);
 	va_low &= 0x7ff;
 
-	switch (b_psize) {
-	case MMU_PAGE_4K:
-		sllp = get_sllp_encoding(a_psize);
-		rb |= sllp << 5;	/*  AP field */
+	if (b_pgshift == 12) {
+		if (a_pgshift > 12) {
+			sllp = (a_pgshift == 16) ? 5 : 4;
+			rb |= sllp << 5;	/*  AP field */
+		}
 		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
-		break;
-	default:
-	{
+	} else {
 		int aval_shift;
 		/*
 		 * remaining bits of AVA/LP fields
 		 * Also contain the rr bits of LP
 		 */
-		rb |= (va_low << mmu_psize_defs[b_psize].shift) & 0x7ff000;
+		rb |= (va_low << b_pgshift) & 0x7ff000;
 		/*
 		 * Now clear not needed LP bits based on actual psize
 		 */
-		rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1);
+		rb &= ~((1ul << a_pgshift) - 1);
 		/*
 		 * AVAL field 58..77 - base_page_shift bits of va
 		 * we have space for 58..64 bits, Missing bits should
 		 * be zero filled. +1 is to take care of L bit shift
 		 */
-		aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1;
+		aval_shift = 64 - (77 - b_pgshift) + 1;
 		rb |= ((va_low << aval_shift) & 0xfe);
 
 		rb |= 1;		/* L field */
-		penc = mmu_psize_defs[b_psize].penc[a_psize];
-		rb |= penc << 12;	/* LP field */
-		break;
-	}
+		rb |= r & 0xff000 & ((1ul << a_pgshift) - 1); /* LP field */
 	}
 	rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8;	/* B field */
 	return rb;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 624b01175b79..cc21d3c71a8d 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -333,7 +333,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
 {
 	unsigned long ra_mask;
 
-	ra_mask = hpte_page_size(v, r) - 1;
+	ra_mask = kvmppc_actual_pgsz(v, r) - 1;
 	return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
 }
 
@@ -504,7 +504,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		mmio_update = atomic64_read(&kvm->arch.mmio_update);
 		if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
 			r = vcpu->arch.pgfault_cache->rpte;
-			psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
+			psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
+						   r);
 			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
 			gfn_base = gpa_base >> PAGE_SHIFT;
 			gpa = gpa_base | (ea & (psize - 1));
@@ -533,7 +534,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		return RESUME_GUEST;
 
 	/* Translate the logical address and get the page */
-	psize = hpte_page_size(hpte[0], r);
+	psize = kvmppc_actual_pgsz(hpte[0], r);
 	gpa_base = r & HPTE_R_RPN & ~(psize - 1);
 	gfn_base = gpa_base >> PAGE_SHIFT;
 	gpa = gpa_base | (ea & (psize - 1));
@@ -797,7 +798,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
 
 	/* Now check and modify the HPTE */
 	ptel = rev[i].guest_rpte;
-	psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
+	psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel);
 	if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
 	    hpte_rpn(ptel, psize) == gfn) {
 		hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
@@ -1091,7 +1092,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 				rev[i].guest_rpte |= HPTE_R_C;
 				note_hpte_modification(kvm, &rev[i]);
 			}
-			n = hpte_page_size(v, r);
+			n = kvmppc_actual_pgsz(v, r);
 			n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
 			if (n > npages_dirty)
 				npages_dirty = n;
@@ -1266,7 +1267,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
 	guest_rpte = rev->guest_rpte;
 
 	ret = -EIO;
-	apsize = hpte_page_size(vpte, guest_rpte);
+	apsize = kvmppc_actual_pgsz(vpte, guest_rpte);
 	if (!apsize)
 		goto out;
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 9634425f0f39..b3817df58e0e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3300,22 +3300,21 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 }
 
 static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
-				     int linux_psize)
+				     int shift, int sllp)
 {
-	struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
-
-	if (!def->shift)
-		return;
-	(*sps)->page_shift = def->shift;
-	(*sps)->slb_enc = def->sllp;
-	(*sps)->enc[0].page_shift = def->shift;
-	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
+	(*sps)->page_shift = shift;
+	(*sps)->slb_enc = sllp;
+	(*sps)->enc[0].page_shift = shift;
+	(*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
 	/*
-	 * Add 16MB MPSS support if host supports it
+	 * Add 16MB MPSS support (may get filtered out by userspace)
 	 */
-	if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
-		(*sps)->enc[1].page_shift = 24;
-		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
+	if (shift != 24) {
+		int penc = kvmppc_pgsize_lp_encoding(shift, 24);
+		if (penc != -1) {
+			(*sps)->enc[1].page_shift = 24;
+			(*sps)->enc[1].pte_enc = penc;
+		}
 	}
 	(*sps)++;
 }
@@ -3340,16 +3339,15 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
 	info->data_keys = 32;
 	info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
 
-	info->flags = KVM_PPC_PAGE_SIZES_REAL;
-	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
-		info->flags |= KVM_PPC_1T_SEGMENTS;
-	info->slb_size = mmu_slb_size;
+	/* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
+	info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
+	info->slb_size = 32;
 
 	/* We only support these sizes for now, and no muti-size segments */
 	sps = &info->sps[0];
-	kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
-	kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
-	kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
+	kvmppc_add_seg_page_size(&sps, 12, 0);
+	kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
+	kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
 
 	return 0;
 }
@@ -4352,4 +4350,3 @@ module_exit(kvmppc_book3s_exit_hv);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(KVM_MINOR);
 MODULE_ALIAS("devname:kvm");
-
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 4efe364f1188..cf98f17c1aa6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -129,7 +129,7 @@ static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
 	unsigned long *rmap;
 	unsigned long gfn;
 
-	gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
+	gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr));
 	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
 	if (!memslot)
 		return NULL;
@@ -169,7 +169,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 	}
 	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
 	if (rcbits & HPTE_R_C)
-		kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r));
+		kvmppc_update_rmap_change(rmap,
+					  kvmppc_actual_pgsz(hpte_v, hpte_r));
 	unlock_rmap(rmap);
 }
 
@@ -193,7 +194,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	if (kvm_is_radix(kvm))
 		return H_FUNCTION;
-	psize = hpte_page_size(pteh, ptel);
+	psize = kvmppc_actual_pgsz(pteh, ptel);
 	if (!psize)
 		return H_PARAMETER;
 	writing = hpte_is_writable(ptel);
@@ -848,7 +849,7 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
 		r = be64_to_cpu(hpte[1]);
 		gr |= r & (HPTE_R_R | HPTE_R_C);
 		if (r & HPTE_R_C) {
-			unsigned long psize = hpte_page_size(v, r);
+			unsigned long psize = kvmppc_actual_pgsz(v, r);
 			hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
 			eieio();
 			rmap = revmap_for_hpte(kvm, v, gr);
@@ -1014,7 +1015,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 			 * Check the HPTE again, including base page size
 			 */
 			if ((v & valid) && (v & mask) == val &&
-			    hpte_base_page_size(v, r) == (1ul << pshift))
+			    kvmppc_hpte_base_page_shift(v, r) == pshift)
 				/* Return with the HPTE still locked */
 				return (hash << 3) + (i >> 1);
 
-- 
cgit 


From 1b151ce466175746b1b1a87d42ba5f5a050a5aba Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 13 Sep 2017 15:53:48 +1000
Subject: KVM: PPC: Book3S HV: Rename hpte_setup_done to mmu_ready

This renames the kvm->arch.hpte_setup_done field to mmu_ready because
we will want to use it for radix guests too -- both for setting things
up before vcpu execution, and for excluding vcpus from executing while
MMU-related things get changed, such as in future switching the MMU
from radix to HPT mode or vice-versa.

This also moves the call to kvmppc_setup_partition_table() that was
done in kvmppc_hv_setup_htab_rma() for HPT guests, and the setting
of mmu_ready, into the caller in kvmppc_vcpu_run_hv().

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_host.h |  2 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c | 36 +++++++++++++++++-----------------
 arch/powerpc/kvm/book3s_hv.c        | 39 ++++++++++++++++++++++++-------------
 3 files changed, 44 insertions(+), 33 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e372ed871c51..49493ea5520b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -276,7 +276,7 @@ struct kvm_arch {
 	int tlbie_lock;
 	unsigned long lpcr;
 	unsigned long vrma_slb_v;
-	int hpte_setup_done;
+	int mmu_ready;
 	atomic_t vcpus_running;
 	u32 online_vcores;
 	atomic_t hpte_mod_interest;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index cc21d3c71a8d..84728038e943 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -140,12 +140,12 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 		return -EINVAL;
 
 	mutex_lock(&kvm->lock);
-	if (kvm->arch.hpte_setup_done) {
-		kvm->arch.hpte_setup_done = 0;
-		/* order hpte_setup_done vs. vcpus_running */
+	if (kvm->arch.mmu_ready) {
+		kvm->arch.mmu_ready = 0;
+		/* order mmu_ready vs. vcpus_running */
 		smp_mb();
 		if (atomic_read(&kvm->arch.vcpus_running)) {
-			kvm->arch.hpte_setup_done = 1;
+			kvm->arch.mmu_ready = 1;
 			goto out;
 		}
 	}
@@ -1533,15 +1533,15 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
 
 	/* This shouldn't be possible */
 	ret = -EIO;
-	if (WARN_ON(!kvm->arch.hpte_setup_done))
+	if (WARN_ON(!kvm->arch.mmu_ready))
 		goto out_no_hpt;
 
 	/* Stop VCPUs from running while we mess with the HPT */
-	kvm->arch.hpte_setup_done = 0;
+	kvm->arch.mmu_ready = 0;
 	smp_mb();
 
 	/* Boot all CPUs out of the guest so they re-read
-	 * hpte_setup_done */
+	 * mmu_ready */
 	on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
 
 	ret = -ENXIO;
@@ -1564,7 +1564,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
 
 out:
 	/* Let VCPUs run again */
-	kvm->arch.hpte_setup_done = 1;
+	kvm->arch.mmu_ready = 1;
 	smp_mb();
 out_no_hpt:
 	resize_hpt_release(kvm, resize);
@@ -1802,7 +1802,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 	unsigned long tmp[2];
 	ssize_t nb;
 	long int err, ret;
-	int hpte_setup;
+	int mmu_ready;
 
 	if (!access_ok(VERIFY_READ, buf, count))
 		return -EFAULT;
@@ -1811,13 +1811,13 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 
 	/* lock out vcpus from running while we're doing this */
 	mutex_lock(&kvm->lock);
-	hpte_setup = kvm->arch.hpte_setup_done;
-	if (hpte_setup) {
-		kvm->arch.hpte_setup_done = 0;	/* temporarily */
-		/* order hpte_setup_done vs. vcpus_running */
+	mmu_ready = kvm->arch.mmu_ready;
+	if (mmu_ready) {
+		kvm->arch.mmu_ready = 0;	/* temporarily */
+		/* order mmu_ready vs. vcpus_running */
 		smp_mb();
 		if (atomic_read(&kvm->arch.vcpus_running)) {
-			kvm->arch.hpte_setup_done = 1;
+			kvm->arch.mmu_ready = 1;
 			mutex_unlock(&kvm->lock);
 			return -EBUSY;
 		}
@@ -1870,7 +1870,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 				       "r=%lx\n", ret, i, v, r);
 				goto out;
 			}
-			if (!hpte_setup && is_vrma_hpte(v)) {
+			if (!mmu_ready && is_vrma_hpte(v)) {
 				unsigned long psize = hpte_base_page_size(v, r);
 				unsigned long senc = slb_pgsize_encoding(psize);
 				unsigned long lpcr;
@@ -1879,7 +1879,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 					(VRMA_VSID << SLB_VSID_SHIFT_1T);
 				lpcr = senc << (LPCR_VRMASD_SH - 4);
 				kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
-				hpte_setup = 1;
+				mmu_ready = 1;
 			}
 			++i;
 			hptp += 2;
@@ -1895,9 +1895,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 	}
 
  out:
-	/* Order HPTE updates vs. hpte_setup_done */
+	/* Order HPTE updates vs. mmu_ready */
 	smp_wmb();
-	kvm->arch.hpte_setup_done = hpte_setup;
+	kvm->arch.mmu_ready = mmu_ready;
 	mutex_unlock(&kvm->lock);
 
 	if (err)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b3817df58e0e..f6522075edbe 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -115,6 +115,7 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+static void kvmppc_setup_partition_table(struct kvm *kvm);
 
 static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
 		int *ip)
@@ -3198,6 +3199,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	unsigned long ebb_regs[3] = {};	/* shut up GCC */
 	unsigned long user_tar = 0;
 	unsigned int user_vrsave;
+	struct kvm *kvm;
 
 	if (!vcpu->arch.sane) {
 		run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -3235,13 +3237,25 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		return -EINTR;
 	}
 
-	atomic_inc(&vcpu->kvm->arch.vcpus_running);
-	/* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
+	kvm = vcpu->kvm;
+	atomic_inc(&kvm->arch.vcpus_running);
+	/* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
 	smp_mb();
 
-	/* On the first time here, set up HTAB and VRMA */
-	if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) {
-		r = kvmppc_hv_setup_htab_rma(vcpu);
+	/* On the first time here, set up MMU if necessary */
+	if (!vcpu->kvm->arch.mmu_ready) {
+		mutex_lock(&kvm->lock);
+		r = 0;
+		if (!kvm->arch.mmu_ready) {
+			if (!kvm_is_radix(vcpu->kvm))
+				r = kvmppc_hv_setup_htab_rma(vcpu);
+			if (!r) {
+				if (cpu_has_feature(CPU_FTR_ARCH_300))
+					kvmppc_setup_partition_table(kvm);
+				kvm->arch.mmu_ready = 1;
+			}
+		}
+		mutex_unlock(&kvm->lock);
 		if (r)
 			goto out;
 	}
@@ -3530,6 +3544,10 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
 	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
 }
 
+/*
+ * Set up HPT (hashed page table) and RMA (real-mode area).
+ * Must be called with kvm->lock held.
+ */
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
 	int err = 0;
@@ -3541,10 +3559,6 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	unsigned long psize, porder;
 	int srcu_idx;
 
-	mutex_lock(&kvm->lock);
-	if (kvm->arch.hpte_setup_done)
-		goto out;	/* another vcpu beat us to it */
-
 	/* Allocate hashed page table (if not done already) and reset it */
 	if (!kvm->arch.hpt.virt) {
 		int order = KVM_DEFAULT_HPT_ORDER;
@@ -3603,18 +3617,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 		/* the -4 is to account for senc values starting at 0x10 */
 		lpcr = senc << (LPCR_VRMASD_SH - 4);
 		kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
-	} else {
-		kvmppc_setup_partition_table(kvm);
 	}
 
-	/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
+	/* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
 	smp_wmb();
-	kvm->arch.hpte_setup_done = 1;
 	err = 0;
  out_srcu:
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
  out:
-	mutex_unlock(&kvm->lock);
 	return err;
 
  up_out:
@@ -3769,6 +3779,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	 */
 	if (radix_enabled()) {
 		kvm->arch.radix = 1;
+		kvm->arch.mmu_ready = 1;
 		lpcr &= ~LPCR_VPM1;
 		lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
 		ret = kvmppc_init_vm_radix(kvm);
-- 
cgit 


From e641a317830b6bd26e6dc2ef5fe2c1c181dd5cc2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 26 Oct 2017 16:39:19 +1100
Subject: KVM: PPC: Book3S HV: Unify dirty page map between HPT and radix

Currently, the HPT code in HV KVM maintains a dirty bit per guest page
in the rmap array, whether or not dirty page tracking has been enabled
for the memory slot.  In contrast, the radix code maintains a dirty
bit per guest page in memslot->dirty_bitmap, and only does so when
dirty page tracking has been enabled.

This changes the HPT code to maintain the dirty bits in the memslot
dirty_bitmap like radix does.  This results in slightly less code
overall, and will mean that we do not lose the dirty bits when
transitioning between HPT and radix mode in future.

There is one minor change to behaviour as a result.  With HPT, when
dirty tracking was enabled for a memslot, we would previously clear
all the dirty bits at that point (both in the HPT entries and in the
rmap arrays), meaning that a KVM_GET_DIRTY_LOG ioctl immediately
following would show no pages as dirty (assuming no vcpus have run
in the meantime).  With this change, the dirty bits on HPT entries
are not cleared at the point where dirty tracking is enabled, so
KVM_GET_DIRTY_LOG would show as dirty any guest pages that are
resident in the HPT and dirty.  This is consistent with what happens
on radix.

This also fixes a bug in the mark_pages_dirty() function for radix
(in the sense that the function no longer exists).  In the case where
a large page of 64 normal pages or more is marked dirty, the
addressing of the dirty bitmap was incorrect and could write past
the end of the bitmap.  Fortunately this case was never hit in
practice because a 2MB large page is only 32 x 64kB pages, and we
don't support backing the guest with 1GB huge pages at this point.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_book3s.h    |  3 +-
 arch/powerpc/include/asm/kvm_book3s_64.h | 24 +++++++++++++
 arch/powerpc/include/asm/kvm_host.h      |  3 --
 arch/powerpc/kvm/book3s_64_mmu_hv.c      | 39 ++++++---------------
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 50 +++++---------------------
 arch/powerpc/kvm/book3s_hv.c             | 30 +++++++---------
 arch/powerpc/kvm/book3s_hv_rm_mmu.c      | 60 ++++++++++++++++++++------------
 7 files changed, 96 insertions(+), 113 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index b8d5b8e35244..9a667007bff8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -216,7 +216,8 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
 			bool writing, bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 			unsigned long *rmap, long pte_index, int realmode);
-extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
+extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
+			unsigned long gfn, unsigned long psize);
 extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
 			unsigned long pte_index);
 void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index b21936c7b190..735cfa35298a 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -20,6 +20,8 @@
 #ifndef __ASM_KVM_BOOK3S_64_H__
 #define __ASM_KVM_BOOK3S_64_H__
 
+#include <linux/string.h>
+#include <asm/bitops.h>
 #include <asm/book3s/64/mmu-hash.h>
 
 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
@@ -444,6 +446,28 @@ static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt)
 	return (1UL << (hpt->order - 7)) - 1;
 }
 
+/* Set bits in a dirty bitmap, which is in LE format */
+static inline void set_dirty_bits(unsigned long *map, unsigned long i,
+				  unsigned long npages)
+{
+
+	if (npages >= 8)
+		memset((char *)map + i / 8, 0xff, npages / 8);
+	else
+		for (; npages; ++i, --npages)
+			__set_bit_le(i, map);
+}
+
+static inline void set_dirty_bits_atomic(unsigned long *map, unsigned long i,
+					 unsigned long npages)
+{
+	if (npages >= 8)
+		memset((char *)map + i / 8, 0xff, npages / 8);
+	else
+		for (; npages; ++i, --npages)
+			set_bit_le(i, map);
+}
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 49493ea5520b..d831a3883175 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -235,10 +235,7 @@ struct revmap_entry {
  */
 #define KVMPPC_RMAP_LOCK_BIT	63
 #define KVMPPC_RMAP_RC_SHIFT	32
-#define KVMPPC_RMAP_CHG_SHIFT	48
 #define KVMPPC_RMAP_REFERENCED	(HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
-#define KVMPPC_RMAP_CHANGED	(HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
-#define KVMPPC_RMAP_CHG_ORDER	(0x3ful << KVMPPC_RMAP_CHG_SHIFT)
 #define KVMPPC_RMAP_PRESENT	0x100000000ul
 #define KVMPPC_RMAP_INDEX	0xfffffffful
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 84728038e943..944f7a532879 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -776,6 +776,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 
 /* Must be called with both HPTE and rmap locked */
 static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
+			      struct kvm_memory_slot *memslot,
 			      unsigned long *rmapp, unsigned long gfn)
 {
 	__be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
@@ -807,8 +808,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
 		/* Harvest R and C */
 		rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
 		*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
-		if (rcbits & HPTE_R_C)
-			kvmppc_update_rmap_change(rmapp, psize);
+		if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap)
+			kvmppc_update_dirty_map(memslot, gfn, psize);
 		if (rcbits & ~rev[i].guest_rpte) {
 			rev[i].guest_rpte = ptel | rcbits;
 			note_hpte_modification(kvm, &rev[i]);
@@ -846,7 +847,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 			continue;
 		}
 
-		kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
+		kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn);
 		unlock_rmap(rmapp);
 		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 	}
@@ -1029,14 +1030,6 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 
  retry:
 	lock_rmap(rmapp);
-	if (*rmapp & KVMPPC_RMAP_CHANGED) {
-		long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
-			>> KVMPPC_RMAP_CHG_SHIFT;
-		*rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
-		npages_dirty = 1;
-		if (change_order > PAGE_SHIFT)
-			npages_dirty = 1ul << (change_order - PAGE_SHIFT);
-	}
 	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 		unlock_rmap(rmapp);
 		return npages_dirty;
@@ -1128,7 +1121,7 @@ void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
 long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
 			struct kvm_memory_slot *memslot, unsigned long *map)
 {
-	unsigned long i, j;
+	unsigned long i;
 	unsigned long *rmapp;
 
 	preempt_disable();
@@ -1140,9 +1133,8 @@ long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
 		 * since we always put huge-page HPTEs in the rmap chain
 		 * corresponding to their page base address.
 		 */
-		if (npages && map)
-			for (j = i; npages; ++j, --npages)
-				__set_bit_le(j, map);
+		if (npages)
+			set_dirty_bits(map, i, npages);
 		++rmapp;
 	}
 	preempt_enable();
@@ -1186,7 +1178,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
 	struct page *page = virt_to_page(va);
 	struct kvm_memory_slot *memslot;
 	unsigned long gfn;
-	unsigned long *rmap;
 	int srcu_idx;
 
 	put_page(page);
@@ -1194,20 +1185,12 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
 	if (!dirty)
 		return;
 
-	/* We need to mark this page dirty in the rmap chain */
+	/* We need to mark this page dirty in the memslot dirty_bitmap, if any */
 	gfn = gpa >> PAGE_SHIFT;
 	srcu_idx = srcu_read_lock(&kvm->srcu);
 	memslot = gfn_to_memslot(kvm, gfn);
-	if (memslot) {
-		if (!kvm_is_radix(kvm)) {
-			rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
-			lock_rmap(rmap);
-			*rmap |= KVMPPC_RMAP_CHANGED;
-			unlock_rmap(rmap);
-		} else if (memslot->dirty_bitmap) {
-			mark_page_dirty(kvm, gfn);
-		}
-	}
+	if (memslot && memslot->dirty_bitmap)
+		set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap);
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
 
@@ -1282,7 +1265,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
 			rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 
 			lock_rmap(rmapp);
-			kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
+			kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn);
 			unlock_rmap(rmapp);
 		}
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index c5d7435455f1..6336b13ed233 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -474,26 +474,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return ret;
 }
 
-static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			     unsigned long gfn, unsigned int order)
-{
-	unsigned long i, limit;
-	unsigned long *dp;
-
-	if (!memslot->dirty_bitmap)
-		return;
-	limit = 1ul << order;
-	if (limit < BITS_PER_LONG) {
-		for (i = 0; i < limit; ++i)
-			mark_page_dirty(kvm, gfn + i);
-		return;
-	}
-	dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn);
-	limit /= BITS_PER_LONG;
-	for (i = 0; i < limit; ++i)
-		*dp++ = ~0ul;
-}
-
 /* Called with kvm->lock held */
 int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		    unsigned long gfn)
@@ -508,12 +488,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
 					      gpa, shift);
 		kvmppc_radix_tlbie_page(kvm, gpa, shift);
-		if (old & _PAGE_DIRTY) {
-			if (!shift)
-				mark_page_dirty(kvm, gfn);
-			else
-				mark_pages_dirty(kvm, memslot,
-						 gfn, shift - PAGE_SHIFT);
+		if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
+			unsigned long npages = 1;
+			if (shift)
+				npages = 1ul << (shift - PAGE_SHIFT);
+			kvmppc_update_dirty_map(memslot, gfn, npages);
 		}
 	}
 	return 0;				
@@ -579,20 +558,8 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 			struct kvm_memory_slot *memslot, unsigned long *map)
 {
 	unsigned long i, j;
-	unsigned long n, *p;
 	int npages;
 
-	/*
-	 * Radix accumulates dirty bits in the first half of the
-	 * memslot's dirty_bitmap area, for when pages are paged
-	 * out or modified by the host directly.  Pick up these
-	 * bits and add them to the map.
-	 */
-	n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
-	p = memslot->dirty_bitmap;
-	for (i = 0; i < n; ++i)
-		map[i] |= xchg(&p[i], 0);
-
 	for (i = 0; i < memslot->npages; i = j) {
 		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
 
@@ -604,9 +571,10 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
 		 * real address, if npages > 1 we can skip to i + npages.
 		 */
 		j = i + 1;
-		if (npages)
-			for (j = i; npages; ++j, --npages)
-				__set_bit_le(j, map);
+		if (npages) {
+			set_dirty_bits(map, i, npages);
+			i = j + npages;
+		}
 	}
 	return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f6522075edbe..96b92d4adb87 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3376,7 +3376,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
 	struct kvm_memory_slot *memslot;
 	int i, r;
 	unsigned long n;
-	unsigned long *buf;
+	unsigned long *buf, *p;
 	struct kvm_vcpu *vcpu;
 
 	mutex_lock(&kvm->slots_lock);
@@ -3392,8 +3392,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
 		goto out;
 
 	/*
-	 * Use second half of bitmap area because radix accumulates
-	 * bits in the first half.
+	 * Use second half of bitmap area because both HPT and radix
+	 * accumulate bits in the first half.
 	 */
 	n = kvm_dirty_bitmap_bytes(memslot);
 	buf = memslot->dirty_bitmap + n / sizeof(long);
@@ -3406,6 +3406,16 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
 	if (r)
 		goto out;
 
+	/*
+	 * We accumulate dirty bits in the first half of the
+	 * memslot's dirty_bitmap area, for when pages are paged
+	 * out or modified by the host directly.  Pick up these
+	 * bits and add them to the map.
+	 */
+	p = memslot->dirty_bitmap;
+	for (i = 0; i < n / sizeof(long); ++i)
+		buf[i] |= xchg(&p[i], 0);
+
 	/* Harvest dirty bits from VPA and DTL updates */
 	/* Note: we never modify the SLB shadow buffer areas */
 	kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -3466,8 +3476,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 				const struct kvm_memory_slot *new)
 {
 	unsigned long npages = mem->memory_size >> PAGE_SHIFT;
-	struct kvm_memslots *slots;
-	struct kvm_memory_slot *memslot;
 
 	/*
 	 * If we are making a new memslot, it might make
@@ -3477,18 +3485,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 	 */
 	if (npages)
 		atomic64_inc(&kvm->arch.mmio_update);
-
-	if (npages && old->npages && !kvm_is_radix(kvm)) {
-		/*
-		 * If modifying a memslot, reset all the rmap dirty bits.
-		 * If this is a new memslot, we don't need to do anything
-		 * since the rmap array starts out as all zeroes,
-		 * i.e. no pages are dirty.
-		 */
-		slots = kvm_memslots(kvm);
-		memslot = id_to_memslot(slots, mem->slot);
-		kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
-	}
 }
 
 /*
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index cf98f17c1aa6..26c11f678fbf 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -107,23 +107,39 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
-/* Update the changed page order field of an rmap entry */
-void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize)
+/* Update the dirty bitmap of a memslot */
+void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
+			     unsigned long gfn, unsigned long psize)
 {
-	unsigned long order;
+	unsigned long npages;
 
-	if (!psize)
+	if (!psize || !memslot->dirty_bitmap)
 		return;
-	order = ilog2(psize);
-	order <<= KVMPPC_RMAP_CHG_SHIFT;
-	if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER))
-		*rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order;
+	npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE;
+	gfn -= memslot->base_gfn;
+	set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages);
+}
+EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map);
+
+static void kvmppc_set_dirty_from_hpte(struct kvm *kvm,
+				unsigned long hpte_v, unsigned long hpte_gr)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;
+	unsigned long psize;
+
+	psize = kvmppc_actual_pgsz(hpte_v, hpte_gr);
+	gfn = hpte_rpn(hpte_gr, psize);
+	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	if (memslot && memslot->dirty_bitmap)
+		kvmppc_update_dirty_map(memslot, gfn, psize);
 }
-EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
 
 /* Returns a pointer to the revmap entry for the page mapped by a HPTE */
 static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
-				      unsigned long hpte_gr)
+				      unsigned long hpte_gr,
+				      struct kvm_memory_slot **memslotp,
+				      unsigned long *gfnp)
 {
 	struct kvm_memory_slot *memslot;
 	unsigned long *rmap;
@@ -131,6 +147,10 @@ static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
 
 	gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr));
 	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	if (memslotp)
+		*memslotp = memslot;
+	if (gfnp)
+		*gfnp = gfn;
 	if (!memslot)
 		return NULL;
 
@@ -147,10 +167,12 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 	unsigned long ptel, head;
 	unsigned long *rmap;
 	unsigned long rcbits;
+	struct kvm_memory_slot *memslot;
+	unsigned long gfn;
 
 	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
 	ptel = rev->guest_rpte |= rcbits;
-	rmap = revmap_for_hpte(kvm, hpte_v, ptel);
+	rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn);
 	if (!rmap)
 		return;
 	lock_rmap(rmap);
@@ -169,8 +191,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 	}
 	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
 	if (rcbits & HPTE_R_C)
-		kvmppc_update_rmap_change(rmap,
-					  kvmppc_actual_pgsz(hpte_v, hpte_r));
+		kvmppc_update_dirty_map(memslot, gfn,
+					kvmppc_actual_pgsz(hpte_v, hpte_r));
 	unlock_rmap(rmap);
 }
 
@@ -798,7 +820,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
 		gr |= r & (HPTE_R_R | HPTE_R_C);
 		if (r & HPTE_R_R) {
 			kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
-			rmap = revmap_for_hpte(kvm, v, gr);
+			rmap = revmap_for_hpte(kvm, v, gr, NULL, NULL);
 			if (rmap) {
 				lock_rmap(rmap);
 				*rmap |= KVMPPC_RMAP_REFERENCED;
@@ -820,7 +842,6 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
 	__be64 *hpte;
 	unsigned long v, r, gr;
 	struct revmap_entry *rev;
-	unsigned long *rmap;
 	long ret = H_NOT_FOUND;
 
 	if (kvm_is_radix(kvm))
@@ -849,16 +870,9 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
 		r = be64_to_cpu(hpte[1]);
 		gr |= r & (HPTE_R_R | HPTE_R_C);
 		if (r & HPTE_R_C) {
-			unsigned long psize = kvmppc_actual_pgsz(v, r);
 			hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
 			eieio();
-			rmap = revmap_for_hpte(kvm, v, gr);
-			if (rmap) {
-				lock_rmap(rmap);
-				*rmap |= KVMPPC_RMAP_CHANGED;
-				kvmppc_update_rmap_change(rmap, psize);
-				unlock_rmap(rmap);
-			}
+			kvmppc_set_dirty_from_hpte(kvm, v, gr);
 		}
 	}
 	vcpu->arch.gpr[4] = gr;
-- 
cgit 


From 18c3640cefc7f1c6986b7be48f5013a8d5e394cb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Wed, 13 Sep 2017 16:00:10 +1000
Subject: KVM: PPC: Book3S HV: Add infrastructure for running HPT guests on
 radix host

This sets up the machinery for switching a guest between HPT (hashed
page table) and radix MMU modes, so that in future we can run a HPT
guest on a radix host on POWER9 machines.

* The KVM_PPC_CONFIGURE_V3_MMU ioctl can now specify either HPT or
  radix mode, on a radix host.

* The KVM_CAP_PPC_MMU_HASH_V3 capability now returns 1 on POWER9
  with HV KVM on a radix host.

* The KVM_PPC_GET_SMMU_INFO returns information about the HPT MMU on a
  radix host.

* The KVM_PPC_ALLOCATE_HTAB ioctl on a radix host will switch the
  guest to HPT mode and allocate a HPT.

* For simplicity, we now allocate the rmap array for each memslot,
  even on a radix host, since it will be needed if the guest switches
  to HPT mode.

* Since we cannot yet run a HPT guest on a radix host, the KVM_RUN
  ioctl will return an EINVAL error in that case.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_ppc.h     |  3 ++
 arch/powerpc/kvm/book3s_64_mmu_hv.c    | 22 +++++----
 arch/powerpc/kvm/book3s_64_mmu_radix.c |  1 +
 arch/powerpc/kvm/book3s_hv.c           | 87 ++++++++++++++++++++++++----------
 arch/powerpc/kvm/powerpc.c             |  3 +-
 5 files changed, 79 insertions(+), 37 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index ba5fadd6f3c9..96753f3aac6d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -168,6 +168,7 @@ extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order);
 extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info);
 extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order);
 extern void kvmppc_free_hpt(struct kvm_hpt_info *info);
+extern void kvmppc_rmap_reset(struct kvm *kvm);
 extern long kvmppc_prepare_vrma(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
@@ -177,6 +178,8 @@ extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
 		struct iommu_group *grp);
 extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
 		struct iommu_group *grp);
+extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm);
+extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce_64 *args);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 944f7a532879..6aec8a22aeff 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -73,8 +73,6 @@ struct kvm_resize_hpt {
 	struct kvm_hpt_info hpt;
 };
 
-static void kvmppc_rmap_reset(struct kvm *kvm);
-
 int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
 {
 	unsigned long hpt = 0;
@@ -136,9 +134,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 	long err = -EBUSY;
 	struct kvm_hpt_info info;
 
-	if (kvm_is_radix(kvm))
-		return -EINVAL;
-
 	mutex_lock(&kvm->lock);
 	if (kvm->arch.mmu_ready) {
 		kvm->arch.mmu_ready = 0;
@@ -149,6 +144,12 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 			goto out;
 		}
 	}
+	if (kvm_is_radix(kvm)) {
+		err = kvmppc_switch_mmu_to_hpt(kvm);
+		if (err)
+			goto out;
+	}
+
 	if (kvm->arch.hpt.order == order) {
 		/* We already have a suitable HPT */
 
@@ -182,6 +183,7 @@ out:
 void kvmppc_free_hpt(struct kvm_hpt_info *info)
 {
 	vfree(info->rev);
+	info->rev = NULL;
 	if (info->cma)
 		kvm_free_hpt_cma(virt_to_page(info->virt),
 				 1 << (info->order - PAGE_SHIFT));
@@ -349,6 +351,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	int index;
 	int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
 
+	if (kvm_is_radix(vcpu->kvm))
+		return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite);
+
 	/* Get SLB entry */
 	if (virtmode) {
 		slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
@@ -710,7 +715,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	goto out_put;
 }
 
-static void kvmppc_rmap_reset(struct kvm *kvm)
+void kvmppc_rmap_reset(struct kvm *kvm)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
@@ -2089,10 +2094,7 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */
 
-	if (kvm_is_radix(vcpu->kvm))
-		mmu->xlate = kvmppc_mmu_radix_xlate;
-	else
-		mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
 	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
 
 	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 6336b13ed233..58618f644c56 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -662,6 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm)
 		pgd_clear(pgd);
 	}
 	pgd_free(kvm->mm, kvm->arch.pgtable);
+	kvm->arch.pgtable = NULL;
 }
 
 static void pte_ctor(void *addr)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 96b92d4adb87..040e102de4ef 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2442,6 +2442,9 @@ static void prepare_threads(struct kvmppc_vcore *vc)
 	for_each_runnable_thread(i, vcpu, vc) {
 		if (signal_pending(vcpu->arch.run_task))
 			vcpu->arch.ret = -EINTR;
+		else if (kvm_is_radix(vc->kvm) != radix_enabled())
+			/* can't actually run HPT guest on radix host yet... */
+			vcpu->arch.ret = -EINVAL;
 		else if (vcpu->arch.vpa.update_pending ||
 			 vcpu->arch.slb_shadow.update_pending ||
 			 vcpu->arch.dtl.update_pending)
@@ -3338,13 +3341,6 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
 {
 	struct kvm_ppc_one_seg_page_size *sps;
 
-	/*
-	 * Since we don't yet support HPT guests on a radix host,
-	 * return an error if the host uses radix.
-	 */
-	if (radix_enabled())
-		return -EINVAL;
-
 	/*
 	 * POWER7, POWER8 and POWER9 all support 32 storage keys for data.
 	 * POWER7 doesn't support keys for instruction accesses,
@@ -3447,15 +3443,6 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
 static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
 					 unsigned long npages)
 {
-	/*
-	 * For now, if radix_enabled() then we only support radix guests,
-	 * and in that case we don't need the rmap array.
-	 */
-	if (radix_enabled()) {
-		slot->arch.rmap = NULL;
-		return 0;
-	}
-
 	slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
 	if (!slot->arch.rmap)
 		return -ENOMEM;
@@ -3628,6 +3615,34 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	goto out_srcu;
 }
 
+/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
+{
+	kvmppc_free_radix(kvm);
+	kvmppc_update_lpcr(kvm, LPCR_VPM1,
+			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+	kvmppc_rmap_reset(kvm);
+	kvm->arch.radix = 0;
+	kvm->arch.process_table = 0;
+	return 0;
+}
+
+/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
+{
+	int err;
+
+	err = kvmppc_init_vm_radix(kvm);
+	if (err)
+		return err;
+
+	kvmppc_free_hpt(&kvm->arch.hpt);
+	kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
+			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+	kvm->arch.radix = 1;
+	return 0;
+}
+
 #ifdef CONFIG_KVM_XICS
 /*
  * Allocate a per-core structure for managing state about which cores are
@@ -3771,7 +3786,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	}
 
 	/*
-	 * For now, if the host uses radix, the guest must be radix.
+	 * If the host uses radix, the guest starts out as radix.
 	 */
 	if (radix_enabled()) {
 		kvm->arch.radix = 1;
@@ -3795,7 +3810,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	 * Work out how many sets the TLB has, for the use of
 	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
 	 */
-	if (kvm_is_radix(kvm))
+	if (radix_enabled())
 		kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX;	/* 128 */
 	else if (cpu_has_feature(CPU_FTR_ARCH_300))
 		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
@@ -4185,6 +4200,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 {
 	unsigned long lpcr;
 	int radix;
+	int err;
 
 	/* If not on a POWER9, reject it */
 	if (!cpu_has_feature(CPU_FTR_ARCH_300))
@@ -4194,12 +4210,8 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 	if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
 		return -EINVAL;
 
-	/* We can't change a guest to/from radix yet */
-	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
-	if (radix != kvm_is_radix(kvm))
-		return -EINVAL;
-
 	/* GR (guest radix) bit in process_table field must match */
+	radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
 	if (!!(cfg->process_table & PATB_GR) != radix)
 		return -EINVAL;
 
@@ -4207,15 +4219,40 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 	if ((cfg->process_table & PRTS_MASK) > 24)
 		return -EINVAL;
 
+	/* We can change a guest to/from radix now, if the host is radix */
+	if (radix && !radix_enabled())
+		return -EINVAL;
+
 	mutex_lock(&kvm->lock);
+	if (radix != kvm_is_radix(kvm)) {
+		if (kvm->arch.mmu_ready) {
+			kvm->arch.mmu_ready = 0;
+			/* order mmu_ready vs. vcpus_running */
+			smp_mb();
+			if (atomic_read(&kvm->arch.vcpus_running)) {
+				kvm->arch.mmu_ready = 1;
+				err = -EBUSY;
+				goto out_unlock;
+			}
+		}
+		if (radix)
+			err = kvmppc_switch_mmu_to_radix(kvm);
+		else
+			err = kvmppc_switch_mmu_to_hpt(kvm);
+		if (err)
+			goto out_unlock;
+	}
+
 	kvm->arch.process_table = cfg->process_table;
 	kvmppc_setup_partition_table(kvm);
 
 	lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
 	kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
-	mutex_unlock(&kvm->lock);
+	err = 0;
 
-	return 0;
+ out_unlock:
+	mutex_unlock(&kvm->lock);
+	return err;
 }
 
 static struct kvmppc_ops kvm_ops_hv = {
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a3746b98ec11..a0b7f094de78 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -590,8 +590,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = !!(hv_enabled && radix_enabled());
 		break;
 	case KVM_CAP_PPC_MMU_HASH_V3:
-		r = !!(hv_enabled && !radix_enabled() &&
-		       cpu_has_feature(CPU_FTR_ARCH_300));
+		r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
 		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
-- 
cgit 


From 516f7898ae20d9dd902a85522676055a4de9dc9b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 16 Oct 2017 16:11:57 +1100
Subject: KVM: PPC: Book3S HV: Allow for running POWER9 host in single-threaded
 mode

This patch allows for a mode on POWER9 hosts where we control all the
threads of a core, much as we do on POWER8.  The mode is controlled by
a module parameter on the kvm_hv module, called "indep_threads_mode".
The normal mode on POWER9 is the "independent threads" mode, with
indep_threads_mode=Y, where the host is in SMT4 mode (or in fact any
desired SMT mode) and each thread independently enters and exits from
KVM guests without reference to what other threads in the core are
doing.

If indep_threads_mode is set to N at the point when a VM is started,
KVM will expect every core that the guest runs on to be in single
threaded mode (that is, threads 1, 2 and 3 offline), and will set the
flag that prevents secondary threads from coming online.  We can still
use all four threads; the code that implements dynamic micro-threading
on POWER8 will become active in over-commit situations and will allow
up to three other VCPUs to be run on the secondary threads of the core
whenever a VCPU is run.

The reason for wanting this mode is that this will allow us to run HPT
guests on a radix host on a POWER9 machine that does not support
"mixed mode", that is, having some threads in a core be in HPT mode
while other threads are in radix mode.  It will also make it possible
to implement a "strict threads" mode in future, if desired.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_host.h     |  1 +
 arch/powerpc/kvm/book3s_hv.c            | 87 ++++++++++++++++++++-------------
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  2 +
 3 files changed, 57 insertions(+), 33 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d831a3883175..3aa5b577cd60 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -281,6 +281,7 @@ struct kvm_arch {
 	cpumask_t cpu_in_guest;
 	u8 radix;
 	u8 fwnmi_enabled;
+	bool threads_indep;
 	pgd_t *pgtable;
 	u64 process_table;
 	struct dentry *debugfs_dir;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 040e102de4ef..b5fbf7617952 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -98,6 +98,10 @@ static int target_smt_mode;
 module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
+static bool indep_threads_mode = true;
+module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
+
 #ifdef CONFIG_KVM_XICS
 static struct kernel_param_ops module_param_ops = {
 	.set = param_set_int,
@@ -1734,9 +1738,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
  * MMU mode (radix or HPT), unfortunately, but since we only support
  * HPT guests on a HPT host so far, that isn't an impediment yet.
  */
-static int threads_per_vcore(void)
+static int threads_per_vcore(struct kvm *kvm)
 {
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
+	if (kvm->arch.threads_indep)
 		return 1;
 	return threads_per_subcore;
 }
@@ -2228,11 +2232,10 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 		kvmppc_ipi_thread(cpu);
 }
 
-static void kvmppc_wait_for_nap(void)
+static void kvmppc_wait_for_nap(int n_threads)
 {
 	int cpu = smp_processor_id();
 	int i, loops;
-	int n_threads = threads_per_vcore();
 
 	if (n_threads <= 1)
 		return;
@@ -2319,7 +2322,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
 
 	vc->vcore_state = VCORE_PREEMPT;
 	vc->pcpu = smp_processor_id();
-	if (vc->num_threads < threads_per_vcore()) {
+	if (vc->num_threads < threads_per_vcore(vc->kvm)) {
 		spin_lock(&lp->lock);
 		list_add_tail(&vc->preempt_list, &lp->list);
 		spin_unlock(&lp->lock);
@@ -2357,7 +2360,7 @@ struct core_info {
 
 /*
  * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
- * respectively in 2-way micro-threading (split-core) mode.
+ * respectively in 2-way micro-threading (split-core) mode on POWER8.
  */
 static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
 
@@ -2373,7 +2376,14 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
 
 static bool subcore_config_ok(int n_subcores, int n_threads)
 {
-	/* Can only dynamically split if unsplit to begin with */
+	/*
+	 * POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core
+	 * mode, with one thread per subcore.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return n_subcores <= 4 && n_threads == 1;
+
+	/* On POWER8, can only dynamically split if unsplit to begin with */
 	if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
 		return false;
 	if (n_subcores > MAX_SUBCORES)
@@ -2632,6 +2642,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	int target_threads;
 	int controlled_threads;
 	int trap;
+	bool is_power8;
 
 	/*
 	 * Remove from the list any threads that have a signal pending
@@ -2654,7 +2665,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 * the number of threads per subcore, except on POWER9,
 	 * where it's 1 because the threads are (mostly) independent.
 	 */
-	controlled_threads = threads_per_vcore();
+	controlled_threads = threads_per_vcore(vc->kvm);
 
 	/*
 	 * Make sure we are running on primary threads, and that secondary
@@ -2725,32 +2736,40 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	cmd_bit = stat_bit = 0;
 	split = core_info.n_subcores;
 	sip = NULL;
+	is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
+		&& !cpu_has_feature(CPU_FTR_ARCH_300);
+
 	if (split > 1) {
-		/* threads_per_subcore must be MAX_SMT_THREADS (8) here */
-		if (split == 2 && (dynamic_mt_modes & 2)) {
-			cmd_bit = HID0_POWER8_1TO2LPAR;
-			stat_bit = HID0_POWER8_2LPARMODE;
-		} else {
-			split = 4;
-			cmd_bit = HID0_POWER8_1TO4LPAR;
-			stat_bit = HID0_POWER8_4LPARMODE;
-		}
-		subcore_size = MAX_SMT_THREADS / split;
 		sip = &split_info;
 		memset(&split_info, 0, sizeof(split_info));
-		split_info.rpr = mfspr(SPRN_RPR);
-		split_info.pmmar = mfspr(SPRN_PMMAR);
-		split_info.ldbar = mfspr(SPRN_LDBAR);
-		split_info.subcore_size = subcore_size;
 		for (sub = 0; sub < core_info.n_subcores; ++sub)
 			split_info.vc[sub] = core_info.vc[sub];
+
+		if (is_power8) {
+			if (split == 2 && (dynamic_mt_modes & 2)) {
+				cmd_bit = HID0_POWER8_1TO2LPAR;
+				stat_bit = HID0_POWER8_2LPARMODE;
+			} else {
+				split = 4;
+				cmd_bit = HID0_POWER8_1TO4LPAR;
+				stat_bit = HID0_POWER8_4LPARMODE;
+			}
+			subcore_size = MAX_SMT_THREADS / split;
+			split_info.rpr = mfspr(SPRN_RPR);
+			split_info.pmmar = mfspr(SPRN_PMMAR);
+			split_info.ldbar = mfspr(SPRN_LDBAR);
+			split_info.subcore_size = subcore_size;
+		} else {
+			split_info.subcore_size = 1;
+		}
+
 		/* order writes to split_info before kvm_split_mode pointer */
 		smp_wmb();
 	}
 	for (thr = 0; thr < controlled_threads; ++thr)
 		paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
 
-	/* Initiate micro-threading (split-core) if required */
+	/* Initiate micro-threading (split-core) on POWER8 if required */
 	if (cmd_bit) {
 		unsigned long hid0 = mfspr(SPRN_HID0);
 
@@ -2769,7 +2788,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	/* Start all the threads */
 	active = 0;
 	for (sub = 0; sub < core_info.n_subcores; ++sub) {
-		thr = subcore_thread_map[sub];
+		thr = is_power8 ? subcore_thread_map[sub] : sub;
 		thr0_done = false;
 		active |= 1 << thr;
 		pvc = core_info.vc[sub];
@@ -2796,18 +2815,18 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 * the vcore pointer in the PACA of the secondaries.
 	 */
 	smp_mb();
-	if (cmd_bit)
-		split_info.do_nap = 1;	/* ask secondaries to nap when done */
 
 	/*
 	 * When doing micro-threading, poke the inactive threads as well.
 	 * This gets them to the nap instruction after kvm_do_nap,
 	 * which reduces the time taken to unsplit later.
 	 */
-	if (split > 1)
+	if (cmd_bit) {
+		split_info.do_nap = 1;	/* ask secondaries to nap when done */
 		for (thr = 1; thr < threads_per_subcore; ++thr)
 			if (!(active & (1 << thr)))
 				kvmppc_ipi_thread(pcpu + thr);
+	}
 
 	vc->vcore_state = VCORE_RUNNING;
 	preempt_disable();
@@ -2841,10 +2860,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->vcore_state = VCORE_EXITING;
 
 	/* wait for secondary threads to finish writing their state to memory */
-	kvmppc_wait_for_nap();
+	kvmppc_wait_for_nap(controlled_threads);
 
 	/* Return to whole-core mode if we split the core earlier */
-	if (split > 1) {
+	if (cmd_bit) {
 		unsigned long hid0 = mfspr(SPRN_HID0);
 		unsigned long loops = 0;
 
@@ -3822,10 +3841,12 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	/*
 	 * Track that we now have a HV mode VM active. This blocks secondary
 	 * CPU threads from coming online.
-	 * On POWER9, we only need to do this for HPT guests on a radix
-	 * host, which is not yet supported.
+	 * On POWER9, we only need to do this if the "indep_threads_mode"
+	 * module parameter has been set to N.
 	 */
-	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm->arch.threads_indep = indep_threads_mode;
+	if (!kvm->arch.threads_indep)
 		kvm_hv_vm_activated();
 
 	/*
@@ -3865,7 +3886,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
 	debugfs_remove_recursive(kvm->arch.debugfs_dir);
 
-	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+	if (!kvm->arch.threads_indep)
 		kvm_hv_vm_deactivated();
 
 	kvmppc_free_vcores(kvm);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index fd2583ddc9a9..ae6c61641ade 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -385,6 +385,7 @@ kvm_secondary_got_guest:
 	ld	r6, 0(r6)
 	mtspr	SPRN_HDEC, r6
 	/* and set per-LPAR registers, if doing dynamic micro-threading */
+BEGIN_FTR_SECTION
 	ld	r6, HSTATE_SPLIT_MODE(r13)
 	cmpdi	r6, 0
 	beq	63f
@@ -395,6 +396,7 @@ kvm_secondary_got_guest:
 	ld	r0, KVM_SPLIT_LDBAR(r6)
 	mtspr	SPRN_LDBAR, r0
 	isync
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 63:
 	/* Order load of vcpu after load of vcore */
 	lwsync
-- 
cgit 


From c01015091a77035de1939ef106bfbcaf9a21395f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 19 Oct 2017 14:11:23 +1100
Subject: KVM: PPC: Book3S HV: Run HPT guests on POWER9 radix hosts

This patch removes the restriction that a radix host can only run
radix guests, allowing us to run HPT (hashed page table) guests as
well.  This is useful because it provides a way to run old guest
kernels that know about POWER8 but not POWER9.

Unfortunately, POWER9 currently has a restriction that all threads
in a given code must either all be in HPT mode, or all in radix mode.
This means that when entering a HPT guest, we have to obtain control
of all 4 threads in the core and get them to switch their LPIDR and
LPCR registers, even if they are not going to run a guest.  On guest
exit we also have to get all threads to switch LPIDR and LPCR back
to host values.

To make this feasible, we require that KVM not be in the "independent
threads" mode, and that the CPU cores be in single-threaded mode from
the host kernel's perspective (only thread 0 online; threads 1, 2 and
3 offline).  That allows us to use the same code as on POWER8 for
obtaining control of the secondary threads.

To manage the LPCR/LPIDR changes required, we extend the kvm_split_info
struct to contain the information needed by the secondary threads.
All threads perform a barrier synchronization (where all threads wait
for every other thread to reach the synchronization point) on guest
entry, both before and after loading LPCR and LPIDR.  On guest exit,
they all once again perform a barrier synchronization both before
and after loading host values into LPCR and LPIDR.

Finally, it is also currently necessary to flush the entire TLB every
time we enter a HPT guest on a radix host.  We do this on thread 0
with a loop of tlbiel instructions.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/kvm_book3s_asm.h | 13 +++-
 arch/powerpc/kernel/asm-offsets.c         |  3 +
 arch/powerpc/kvm/book3s_hv.c              | 47 ++++++++++++---
 arch/powerpc/kvm/book3s_hv_builtin.c      | 99 ++++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 67 +++++++++++++++++++--
 5 files changed, 212 insertions(+), 17 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 7cea76f11c26..ab386af2904f 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -82,6 +82,16 @@ struct kvm_split_mode {
 	u8		do_nap;
 	u8		napped[MAX_SMT_THREADS];
 	struct kvmppc_vcore *vc[MAX_SUBCORES];
+	/* Bits for changing lpcr on P9 */
+	unsigned long	lpcr_req;
+	unsigned long	lpidr_req;
+	unsigned long	host_lpcr;
+	u32		do_set;
+	u32		do_restore;
+	union {
+		u32	allphases;
+		u8	phase[4];
+	} lpcr_sync;
 };
 
 /*
@@ -107,7 +117,8 @@ struct kvmppc_host_state {
 	u8 hwthread_req;
 	u8 hwthread_state;
 	u8 host_ipi;
-	u8 ptid;
+	u8 ptid;		/* thread number within subcore when split */
+	u8 tid;			/* thread number within whole core */
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
 	void __iomem *xics_phys;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8cfb20e38cfe..519fad556113 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -642,6 +642,7 @@ int main(void)
 	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
 	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
 	HSTATE_FIELD(HSTATE_PTID, ptid);
+	HSTATE_FIELD(HSTATE_TID, tid);
 	HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
 	HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
 	HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
@@ -667,6 +668,8 @@ int main(void)
 	OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
 	OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
 	OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
+	OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
+	OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b5fbf7617952..fff62fdf1464 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2414,6 +2414,11 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
 		return false;
 
+	/* POWER9 currently requires all threads to be in the same MMU mode */
+	if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+	    kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
+		return false;
+
 	if (n_threads < cip->max_subcore_threads)
 		n_threads = cip->max_subcore_threads;
 	if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
@@ -2452,9 +2457,6 @@ static void prepare_threads(struct kvmppc_vcore *vc)
 	for_each_runnable_thread(i, vcpu, vc) {
 		if (signal_pending(vcpu->arch.run_task))
 			vcpu->arch.ret = -EINTR;
-		else if (kvm_is_radix(vc->kvm) != radix_enabled())
-			/* can't actually run HPT guest on radix host yet... */
-			vcpu->arch.ret = -EINVAL;
 		else if (vcpu->arch.vpa.update_pending ||
 			 vcpu->arch.slb_shadow.update_pending ||
 			 vcpu->arch.dtl.update_pending)
@@ -2643,6 +2645,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	int controlled_threads;
 	int trap;
 	bool is_power8;
+	bool hpt_on_radix;
 
 	/*
 	 * Remove from the list any threads that have a signal pending
@@ -2671,9 +2674,13 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 * Make sure we are running on primary threads, and that secondary
 	 * threads are offline.  Also check if the number of threads in this
 	 * guest are greater than the current system threads per guest.
+	 * On POWER9, we need to be not in independent-threads mode if
+	 * this is a HPT guest on a radix host.
 	 */
-	if ((controlled_threads > 1) &&
-	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
+	hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
+	if (((controlled_threads > 1) &&
+	     ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
+	    (hpt_on_radix && vc->kvm->arch.threads_indep)) {
 		for_each_runnable_thread(i, vcpu, vc) {
 			vcpu->arch.ret = -EBUSY;
 			kvmppc_remove_runnable(vc, vcpu);
@@ -2739,7 +2746,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
 		&& !cpu_has_feature(CPU_FTR_ARCH_300);
 
-	if (split > 1) {
+	if (split > 1 || hpt_on_radix) {
 		sip = &split_info;
 		memset(&split_info, 0, sizeof(split_info));
 		for (sub = 0; sub < core_info.n_subcores; ++sub)
@@ -2761,13 +2768,24 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 			split_info.subcore_size = subcore_size;
 		} else {
 			split_info.subcore_size = 1;
+			if (hpt_on_radix) {
+				/* Use the split_info for LPCR/LPIDR changes */
+				split_info.lpcr_req = vc->lpcr;
+				split_info.lpidr_req = vc->kvm->arch.lpid;
+				split_info.host_lpcr = vc->kvm->arch.host_lpcr;
+				split_info.do_set = 1;
+			}
 		}
 
 		/* order writes to split_info before kvm_split_mode pointer */
 		smp_wmb();
 	}
-	for (thr = 0; thr < controlled_threads; ++thr)
+
+	for (thr = 0; thr < controlled_threads; ++thr) {
+		paca[pcpu + thr].kvm_hstate.tid = thr;
+		paca[pcpu + thr].kvm_hstate.napping = 0;
 		paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+	}
 
 	/* Initiate micro-threading (split-core) on POWER8 if required */
 	if (cmd_bit) {
@@ -2820,8 +2838,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 * When doing micro-threading, poke the inactive threads as well.
 	 * This gets them to the nap instruction after kvm_do_nap,
 	 * which reduces the time taken to unsplit later.
+	 * For POWER9 HPT guest on radix host, we need all the secondary
+	 * threads woken up so they can do the LPCR/LPIDR change.
 	 */
-	if (cmd_bit) {
+	if (cmd_bit || hpt_on_radix) {
 		split_info.do_nap = 1;	/* ask secondaries to nap when done */
 		for (thr = 1; thr < threads_per_subcore; ++thr)
 			if (!(active & (1 << thr)))
@@ -2879,8 +2899,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 			cpu_relax();
 			++loops;
 		}
-		split_info.do_nap = 0;
+	} else if (hpt_on_radix) {
+		/* Wait for all threads to have seen final sync */
+		for (thr = 1; thr < controlled_threads; ++thr) {
+			while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
+				HMT_low();
+				barrier();
+			}
+			HMT_medium();
+		}
 	}
+	split_info.do_nap = 0;
 
 	kvmppc_set_host_core(pcpu);
 
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index e38cc2df6d2a..49a2c7825e04 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -278,7 +278,8 @@ void kvmhv_commence_exit(int trap)
 	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
 	int ptid = local_paca->kvm_hstate.ptid;
 	struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
-	int me, ee, i;
+	int me, ee, i, t;
+	int cpu0;
 
 	/* Set our bit in the threads-exiting-guest map in the 0xff00
 	   bits of vcore->entry_exit_map */
@@ -320,6 +321,22 @@ void kvmhv_commence_exit(int trap)
 		if ((ee >> 8) == 0)
 			kvmhv_interrupt_vcore(vc, ee);
 	}
+
+	/*
+	 * On POWER9 when running a HPT guest on a radix host (sip != NULL),
+	 * we have to interrupt inactive CPU threads to get them to
+	 * restore the host LPCR value.
+	 */
+	if (sip->lpcr_req) {
+		if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
+			vc = local_paca->kvm_hstate.kvm_vcore;
+			cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
+			for (t = 1; t < threads_per_core; ++t) {
+				if (sip->napped[t])
+					kvmhv_rm_send_ipi(cpu0 + t);
+			}
+		}
+	}
 }
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
@@ -619,3 +636,83 @@ void kvmppc_bad_interrupt(struct pt_regs *regs)
 	die("Bad interrupt in KVM entry/exit code", regs, SIGABRT);
 	panic("Bad KVM trap");
 }
+
+/*
+ * Functions used to switch LPCR HR and UPRT bits on all threads
+ * when entering and exiting HPT guests on a radix host.
+ */
+
+#define PHASE_REALMODE		1	/* in real mode */
+#define PHASE_SET_LPCR		2	/* have set LPCR */
+#define PHASE_OUT_OF_GUEST	4	/* have finished executing in guest */
+#define PHASE_RESET_LPCR	8	/* have reset LPCR to host value */
+
+#define ALL(p)		(((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
+
+static void wait_for_sync(struct kvm_split_mode *sip, int phase)
+{
+	int thr = local_paca->kvm_hstate.tid;
+
+	sip->lpcr_sync.phase[thr] |= phase;
+	phase = ALL(phase);
+	while ((sip->lpcr_sync.allphases & phase) != phase) {
+		HMT_low();
+		barrier();
+	}
+	HMT_medium();
+}
+
+void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
+{
+	unsigned long rb, set;
+
+	/* wait for every other thread to get to real mode */
+	wait_for_sync(sip, PHASE_REALMODE);
+
+	/* Set LPCR and LPIDR */
+	mtspr(SPRN_LPCR, sip->lpcr_req);
+	mtspr(SPRN_LPID, sip->lpidr_req);
+	isync();
+
+	/* Invalidate the TLB on thread 0 */
+	if (local_paca->kvm_hstate.tid == 0) {
+		sip->do_set = 0;
+		asm volatile("ptesync" : : : "memory");
+		for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) {
+			rb = TLBIEL_INVAL_SET_LPID +
+				(set << TLBIEL_INVAL_SET_SHIFT);
+			asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
+				     "r" (rb), "r" (0));
+		}
+		asm volatile("ptesync" : : : "memory");
+	}
+
+	/* indicate that we have done so and wait for others */
+	wait_for_sync(sip, PHASE_SET_LPCR);
+	/* order read of sip->lpcr_sync.allphases vs. sip->do_set */
+	smp_rmb();
+}
+
+/*
+ * Called when a thread that has been in the guest needs
+ * to reload the host LPCR value - but only on POWER9 when
+ * running a HPT guest on a radix host.
+ */
+void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
+{
+	/* we're out of the guest... */
+	wait_for_sync(sip, PHASE_OUT_OF_GUEST);
+
+	mtspr(SPRN_LPID, 0);
+	mtspr(SPRN_LPCR, sip->host_lpcr);
+	isync();
+
+	if (local_paca->kvm_hstate.tid == 0) {
+		sip->do_restore = 0;
+		smp_wmb();	/* order store of do_restore vs. phase */
+	}
+
+	wait_for_sync(sip, PHASE_RESET_LPCR);
+	smp_mb();
+	local_paca->kvm_hstate.kvm_split_mode = NULL;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index ae6c61641ade..7add18930e6d 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -82,6 +82,19 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
 	RFI
 
 kvmppc_call_hv_entry:
+BEGIN_FTR_SECTION
+	/* On P9, do LPCR setting, if necessary */
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r3, 0
+	beq	46f
+	lwz	r4, KVM_SPLIT_DO_SET(r3)
+	cmpwi	r4, 0
+	beq	46f
+	bl	kvmhv_p9_set_lpcr
+	nop
+46:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
 	ld	r4, HSTATE_KVM_VCPU(r13)
 	bl	kvmppc_hv_entry
 
@@ -385,10 +398,10 @@ kvm_secondary_got_guest:
 	ld	r6, 0(r6)
 	mtspr	SPRN_HDEC, r6
 	/* and set per-LPAR registers, if doing dynamic micro-threading */
-BEGIN_FTR_SECTION
 	ld	r6, HSTATE_SPLIT_MODE(r13)
 	cmpdi	r6, 0
 	beq	63f
+BEGIN_FTR_SECTION
 	ld	r0, KVM_SPLIT_RPR(r6)
 	mtspr	SPRN_RPR, r0
 	ld	r0, KVM_SPLIT_PMMAR(r6)
@@ -396,7 +409,15 @@ BEGIN_FTR_SECTION
 	ld	r0, KVM_SPLIT_LDBAR(r6)
 	mtspr	SPRN_LDBAR, r0
 	isync
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+FTR_SECTION_ELSE
+	/* On P9 we use the split_info for coordinating LPCR changes */
+	lwz	r4, KVM_SPLIT_DO_SET(r6)
+	cmpwi	r4, 0
+	beq	63f
+	mr	r3, r6
+	bl	kvmhv_p9_set_lpcr
+	nop
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 63:
 	/* Order load of vcpu after load of vcore */
 	lwsync
@@ -467,6 +488,12 @@ kvm_no_guest:
 	ld	r3, HSTATE_SPLIT_MODE(r13)
 	cmpdi	r3, 0
 	beq	kvm_no_guest
+	lwz	r0, KVM_SPLIT_DO_SET(r3)
+	cmpwi	r0, 0
+	bne	kvmhv_do_set
+	lwz	r0, KVM_SPLIT_DO_RESTORE(r3)
+	cmpwi	r0, 0
+	bne	kvmhv_do_restore
 	lbz	r0, KVM_SPLIT_DO_NAP(r3)
 	cmpwi	r0, 0
 	beq	kvm_no_guest
@@ -479,6 +506,19 @@ kvm_no_guest:
 	stb	r0, HSTATE_HWTHREAD_STATE(r13)
 	b	kvm_no_guest
 
+kvmhv_do_set:
+	/* Set LPCR, LPIDR etc. on P9 */
+	HMT_MEDIUM
+	bl	kvmhv_p9_set_lpcr
+	nop
+	b	kvm_no_guest
+
+kvmhv_do_restore:
+	HMT_MEDIUM
+	bl	kvmhv_p9_restore_lpcr
+	nop
+	b	kvm_no_guest
+
 /*
  * Here the primary thread is trying to return the core to
  * whole-core mode, so we need to nap.
@@ -516,8 +556,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	/* Set kvm_split_mode.napped[tid] = 1 */
 	ld	r3, HSTATE_SPLIT_MODE(r13)
 	li	r0, 1
-	lhz	r4, PACAPACAINDEX(r13)
-	clrldi	r4, r4, 61	/* micro-threading => P8 => 8 threads/core */
+	lbz	r4, HSTATE_TID(r13)
 	addi	r4, r4, KVM_SPLIT_NAPPED
 	stbx	r0, r3, r4
 	/* Check the do_nap flag again after setting napped[] */
@@ -1911,10 +1950,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 19:	lis	r8,0x7fff		/* MAX_INT@h */
 	mtspr	SPRN_HDEC,r8
 
-16:	ld	r8,KVM_HOST_LPCR(r4)
+16:
+BEGIN_FTR_SECTION
+	/* On POWER9 with HPT-on-radix we need to wait for all other threads */
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r3, 0
+	beq	47f
+	lwz	r8, KVM_SPLIT_DO_RESTORE(r3)
+	cmpwi	r8, 0
+	beq	47f
+	stw	r12, STACK_SLOT_TRAP(r1)
+	bl	kvmhv_p9_restore_lpcr
+	nop
+	lwz	r12, STACK_SLOT_TRAP(r1)
+	b	48f
+47:
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+	ld	r8,KVM_HOST_LPCR(r4)
 	mtspr	SPRN_LPCR,r8
 	isync
-
+48:
 	/* load host SLB entries */
 BEGIN_MMU_FTR_SECTION
 	b	0f
-- 
cgit 


From 432953b4455d8a48af0500a77826c71871671161 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 9 Nov 2017 15:37:10 +1100
Subject: KVM: PPC: Book3S HV: Cosmetic post-merge cleanups

This rearranges the code in kvmppc_run_vcpu() and kvmppc_run_vcpu_hv()
to be neater and clearer.  Deeply indented code in kvmppc_run_vcpu()
is moved out to a helper function, kvmhv_setup_mmu().  In
kvmppc_vcpu_run_hv(), make use of the existing variable 'kvm' in
place of 'vcpu->kvm'.

No functional change.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv.c | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index ca0d4d938d6a..18b16c3957fc 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3120,6 +3120,25 @@ out:
 	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
 }
 
+static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
+{
+	int r = 0;
+	struct kvm *kvm = vcpu->kvm;
+
+	mutex_lock(&kvm->lock);
+	if (!kvm->arch.mmu_ready) {
+		if (!kvm_is_radix(kvm))
+			r = kvmppc_hv_setup_htab_rma(vcpu);
+		if (!r) {
+			if (cpu_has_feature(CPU_FTR_ARCH_300))
+				kvmppc_setup_partition_table(kvm);
+			kvm->arch.mmu_ready = 1;
+		}
+	}
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	int n_ceded, i, r;
@@ -3179,22 +3198,12 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		/* See if the MMU is ready to go */
 		if (!vcpu->kvm->arch.mmu_ready) {
 			spin_unlock(&vc->lock);
-			mutex_lock(&vcpu->kvm->lock);
-			r = 0;
-			if (!vcpu->kvm->arch.mmu_ready) {
-				if (!kvm_is_radix(vcpu->kvm))
-					r = kvmppc_hv_setup_htab_rma(vcpu);
-				if (!r) {
-					if (cpu_has_feature(CPU_FTR_ARCH_300))
-						kvmppc_setup_partition_table(vcpu->kvm);
-					vcpu->kvm->arch.mmu_ready = 1;
-				}
-			}
-			mutex_unlock(&vcpu->kvm->lock);
+			r = kvmhv_setup_mmu(vcpu);
 			spin_lock(&vc->lock);
 			if (r) {
 				kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-				kvm_run->fail_entry.hardware_entry_failure_reason = 0;
+				kvm_run->fail_entry.
+					hardware_entry_failure_reason = 0;
 				vcpu->arch.ret = r;
 				break;
 			}
@@ -3344,10 +3353,10 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			trace_kvm_hcall_exit(vcpu, r);
 			kvmppc_core_prepare_to_enter(vcpu);
 		} else if (r == RESUME_PAGE_FAULT) {
-			srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+			srcu_idx = srcu_read_lock(&kvm->srcu);
 			r = kvmppc_book3s_hv_page_fault(run, vcpu,
 				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
-			srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+			srcu_read_unlock(&kvm->srcu, srcu_idx);
 		} else if (r == RESUME_PASSTHROUGH) {
 			if (WARN_ON(xive_enabled()))
 				r = H_SUCCESS;
@@ -3367,7 +3376,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	mtspr(SPRN_VRSAVE, user_vrsave);
 
 	vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
-	atomic_dec(&vcpu->kvm->arch.vcpus_running);
+	atomic_dec(&kvm->arch.vcpus_running);
 	return r;
 }
 
-- 
cgit