From b9fde58db7e5738cacb740b0ec547933fe314fbe Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Thu, 7 Sep 2017 16:35:44 +1000
Subject: powerpc/powernv: Rework EEH initialization on powernv

Remove the post_init callback which is only used
by powernv, we can just call it explicitly from
the powernv code.

This partially kills the ability to "disable" eeh at
runtime via debugfs as this was calling that same
callback again, but this is both unused and broken
in several ways. If we want to revive it, we need
to create a dedicated enable/disable callback on the
backend that does the right thing.

Let the bulk of eeh initialize normally at
core_initcall() like it does on pseries by removing
the hack in eeh_init() that delays it.

Instead we make sure our eeh->probe cleanly bails
out of the PEs haven't been created yet and we force
a re-probe where we used to call eeh_init() again.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Russell Currey <ruscur@russell.cc>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/eeh.h               |  8 ++---
 arch/powerpc/kernel/eeh.c                    | 46 +++++++++-------------------
 arch/powerpc/platforms/powernv/eeh-powernv.c | 42 +++++++++++++------------
 arch/powerpc/platforms/powernv/pci-ioda.c    |  3 +-
 arch/powerpc/platforms/powernv/pci.h         |  1 +
 5 files changed, 40 insertions(+), 60 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 9847ae3a12d1..cd1df96335e5 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -200,7 +200,6 @@ enum {
 struct eeh_ops {
 	char *name;
 	int (*init)(void);
-	int (*post_init)(void);
 	void* (*probe)(struct pci_dn *pdn, void *data);
 	int (*set_option)(struct eeh_pe *pe, int option);
 	int (*get_pe_addr)(struct eeh_pe *pe);
@@ -275,7 +274,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
 void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
-int eeh_init(void);
+void eeh_probe_devices(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 int eeh_check_failure(const volatile void __iomem *token);
@@ -321,10 +320,7 @@ static inline bool eeh_enabled(void)
         return false;
 }
 
-static inline int eeh_init(void)
-{
-	return 0;
-}
+static inline void eeh_probe_devices(void) { }
 
 static inline void *eeh_dev_init(struct pci_dn *pdn, void *data)
 {
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 116000b45531..cbca0a667682 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -972,6 +972,18 @@ static struct notifier_block eeh_reboot_nb = {
 	.notifier_call = eeh_reboot_notifier,
 };
 
+void eeh_probe_devices(void)
+{
+	struct pci_controller *hose, *tmp;
+	struct pci_dn *pdn;
+
+	/* Enable EEH for all adapters */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		pdn = hose->pci_data;
+		traverse_pci_dn(pdn, eeh_ops->probe, NULL);
+	}
+}
+
 /**
  * eeh_init - EEH initialization
  *
@@ -987,22 +999,11 @@ static struct notifier_block eeh_reboot_nb = {
  * Even if force-off is set, the EEH hardware is still enabled, so that
  * newer systems can boot.
  */
-int eeh_init(void)
+static int eeh_init(void)
 {
 	struct pci_controller *hose, *tmp;
-	struct pci_dn *pdn;
-	static int cnt = 0;
 	int ret = 0;
 
-	/*
-	 * We have to delay the initialization on PowerNV after
-	 * the PCI hierarchy tree has been built because the PEs
-	 * are figured out based on PCI devices instead of device
-	 * tree nodes
-	 */
-	if (machine_is(powernv) && cnt++ <= 0)
-		return ret;
-
 	/* Register reboot notifier */
 	ret = register_reboot_notifier(&eeh_reboot_nb);
 	if (ret) {
@@ -1028,22 +1029,7 @@ int eeh_init(void)
 	if (ret)
 		return ret;
 
-	/* Enable EEH for all adapters */
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-		pdn = hose->pci_data;
-		traverse_pci_dn(pdn, eeh_ops->probe, NULL);
-	}
-
-	/*
-	 * Call platform post-initialization. Actually, It's good chance
-	 * to inform platform that EEH is ready to supply service if the
-	 * I/O cache stuff has been built up.
-	 */
-	if (eeh_ops->post_init) {
-		ret = eeh_ops->post_init();
-		if (ret)
-			return ret;
-	}
+	eeh_probe_devices();
 
 	if (eeh_enabled())
 		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
@@ -1757,10 +1743,6 @@ static int eeh_enable_dbgfs_set(void *data, u64 val)
 	else
 		eeh_add_flag(EEH_FORCE_DISABLED);
 
-	/* Notify the backend */
-	if (eeh_ops->post_init)
-		eeh_ops->post_init();
-
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 8864065eba22..4650fb294e7a 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -41,7 +41,6 @@
 #include "powernv.h"
 #include "pci.h"
 
-static bool pnv_eeh_nb_init = false;
 static int eeh_event_irq = -EINVAL;
 
 static int pnv_eeh_init(void)
@@ -197,31 +196,31 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
  * been built. If the I/O cache staff has been built, EEH is
  * ready to supply service.
  */
-static int pnv_eeh_post_init(void)
+int pnv_eeh_post_init(void)
 {
 	struct pci_controller *hose;
 	struct pnv_phb *phb;
 	int ret = 0;
 
-	/* Register OPAL event notifier */
-	if (!pnv_eeh_nb_init) {
-		eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
-		if (eeh_event_irq < 0) {
-			pr_err("%s: Can't register OPAL event interrupt (%d)\n",
-			       __func__, eeh_event_irq);
-			return eeh_event_irq;
-		}
+	/* Probe devices & build address cache */
+	eeh_probe_devices();
+	eeh_addr_cache_build();
 
-		ret = request_irq(eeh_event_irq, pnv_eeh_event,
-				IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL);
-		if (ret < 0) {
-			irq_dispose_mapping(eeh_event_irq);
-			pr_err("%s: Can't request OPAL event interrupt (%d)\n",
-			       __func__, eeh_event_irq);
-			return ret;
-		}
+	/* Register OPAL event notifier */
+	eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
+	if (eeh_event_irq < 0) {
+		pr_err("%s: Can't register OPAL event interrupt (%d)\n",
+		       __func__, eeh_event_irq);
+		return eeh_event_irq;
+	}
 
-		pnv_eeh_nb_init = true;
+	ret = request_irq(eeh_event_irq, pnv_eeh_event,
+			  IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL);
+	if (ret < 0) {
+		irq_dispose_mapping(eeh_event_irq);
+		pr_err("%s: Can't request OPAL event interrupt (%d)\n",
+		       __func__, eeh_event_irq);
+		return ret;
 	}
 
 	if (!eeh_enabled())
@@ -367,6 +366,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
 		return NULL;
 
+	/* Skip if we haven't probed yet */
+	if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE)
+		return NULL;
+
 	/* Initialize eeh device */
 	edev->class_code = pdn->class_code;
 	edev->mode	&= 0xFFFFFF00;
@@ -1731,7 +1734,6 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
 static struct eeh_ops pnv_eeh_ops = {
 	.name                   = "powernv",
 	.init                   = pnv_eeh_init,
-	.post_init              = pnv_eeh_post_init,
 	.probe			= pnv_eeh_probe,
 	.set_option             = pnv_eeh_set_option,
 	.get_pe_addr            = pnv_eeh_get_pe_addr,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 57f9e55f4352..fb5cd7511189 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3293,8 +3293,7 @@ static void pnv_pci_ioda_fixup(void)
 	pnv_pci_ioda_create_dbgfs();
 
 #ifdef CONFIG_EEH
-	eeh_init();
-	eeh_addr_cache_build();
+	pnv_eeh_post_init();
 #endif
 }
 
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index a95273c524f6..56d1f272d4ad 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -234,6 +234,7 @@ extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
 extern bool pnv_pci_enable_device_hook(struct pci_dev *dev);
 extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+extern int pnv_eeh_post_init(void);
 
 extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 			    const char *fmt, ...);
-- 
cgit 


From 5080332c2c893118dbc18755f35c8b0131cf0fc4 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Fri, 15 Sep 2017 15:25:48 +1000
Subject: powerpc/64s: Add workaround for P9 vector CI load issue

POWER9 DD2.1 and earlier has an issue where some cache inhibited
vector load will return bad data. The workaround is two part, one
firmware/microcode part triggers HMI interrupts when hitting such
loads, the other part is this patch which then emulates the
instructions in Linux.

The affected instructions are limited to lxvd2x, lxvw4x, lxvb16x and
lxvh8x.

When an instruction triggers the HMI, all threads in the core will be
sent to the HMI handler, not just the one running the vector load.

In general, these spurious HMIs are detected by the emulation code and
we just return back to the running process. Unfortunately, if a
spurious interrupt occurs on a vector load that's to normal memory we
have no way to detect that it's spurious (unless we walk the page
tables, which is very expensive). In this case we emulate the load but
we need do so using a vector load itself to ensure 128bit atomicity is
preserved.

Some additional debugfs emulated instruction counters are added also.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
[mpe: Switch CONFIG_PPC_BOOK3S_64 to CONFIG_VSX to unbreak the build]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/emulated_ops.h |   4 +
 arch/powerpc/include/asm/paca.h         |   1 +
 arch/powerpc/include/asm/uaccess.h      |  17 +++
 arch/powerpc/kernel/exceptions-64s.S    |  16 ++-
 arch/powerpc/kernel/mce.c               |  30 ++++-
 arch/powerpc/kernel/traps.c             | 201 ++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/smp.c    |   7 ++
 7 files changed, 271 insertions(+), 5 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h
index f00e10e2a335..651e1354498e 100644
--- a/arch/powerpc/include/asm/emulated_ops.h
+++ b/arch/powerpc/include/asm/emulated_ops.h
@@ -55,6 +55,10 @@ extern struct ppc_emulated {
 	struct ppc_emulated_entry mfdscr;
 	struct ppc_emulated_entry mtdscr;
 	struct ppc_emulated_entry lq_stq;
+	struct ppc_emulated_entry lxvw4x;
+	struct ppc_emulated_entry lxvh8x;
+	struct ppc_emulated_entry lxvd2x;
+	struct ppc_emulated_entry lxvb16x;
 #endif
 } ppc_emulated;
 
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 04b60af027ae..1e06310ccc09 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -210,6 +210,7 @@ struct paca_struct {
 	 */
 	u16 in_mce;
 	u8 hmi_event_available;		/* HMI event is available */
+	u8 hmi_p9_special_emu;		/* HMI P9 special emulation */
 #endif
 
 	/* Stuff for accurate time accounting */
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index 9c0e60ca1666..e34f15e727d9 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -173,6 +173,23 @@ do {								\
 
 extern long __get_user_bad(void);
 
+/*
+ * This does an atomic 128 byte aligned load from userspace.
+ * Upto caller to do enable_kernel_vmx() before calling!
+ */
+#define __get_user_atomic_128_aligned(kaddr, uaddr, err)		\
+	__asm__ __volatile__(				\
+		"1:	lvx  0,0,%1	# get user\n"	\
+		" 	stvx 0,0,%2	# put kernel\n"	\
+		"2:\n"					\
+		".section .fixup,\"ax\"\n"		\
+		"3:	li %0,%3\n"			\
+		"	b 2b\n"				\
+		".previous\n"				\
+		EX_TABLE(1b, 3b)			\
+		: "=r" (err)			\
+		: "b" (uaddr), "b" (kaddr), "i" (-EFAULT), "0" (err))
+
 #define __get_user_asm(x, addr, err, op)		\
 	__asm__ __volatile__(				\
 		"1:	"op" %1,0(%2)	# get_user\n"	\
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 48da0f5d2f7f..8c32dd4cac20 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1010,6 +1010,8 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
 	EXCEPTION_PROLOG_COMMON_3(0xe60)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */
+	cmpdi	cr0,r3,0
+
 	/* Windup the stack. */
 	/* Move original HSRR0 and HSRR1 into the respective regs */
 	ld	r9,_MSR(r1)
@@ -1026,10 +1028,15 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
 	REST_8GPRS(2, r1)
 	REST_GPR(10, r1)
 	ld	r11,_CCR(r1)
+	REST_2GPRS(12, r1)
+	bne	1f
 	mtcr	r11
 	REST_GPR(11, r1)
-	REST_2GPRS(12, r1)
-	/* restore original r1. */
+	ld	r1,GPR1(r1)
+	hrfid
+
+1:	mtcr	r11
+	REST_GPR(11, r1)
 	ld	r1,GPR1(r1)
 
 	/*
@@ -1042,8 +1049,9 @@ hmi_exception_after_realmode:
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
 	b	tramp_real_hmi_exception
 
-EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception)
-
+EXC_COMMON_BEGIN(hmi_exception_common)
+EXCEPTION_COMMON(PACA_EXGEN, 0xe60, hmi_exception_common, handle_hmi_exception,
+        ret_from_except, FINISH_NAP;ADD_NVGPRS;ADD_RECONCILE;RUNLATCH_ON)
 
 EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20)
 EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80)
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9b2ea7e71c06..f588951b171d 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -470,6 +470,34 @@ long hmi_exception_realmode(struct pt_regs *regs)
 {
 	__this_cpu_inc(irq_stat.hmi_exceptions);
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */
+	if (pvr_version_is(PVR_POWER9)) {
+		unsigned long hmer = mfspr(SPRN_HMER);
+
+		/* Do we have the debug bit set */
+		if (hmer & PPC_BIT(17)) {
+			hmer &= ~PPC_BIT(17);
+			mtspr(SPRN_HMER, hmer);
+
+			/*
+			 * Now to avoid problems with soft-disable we
+			 * only do the emulation if we are coming from
+			 * user space
+			 */
+			if (user_mode(regs))
+				local_paca->hmi_p9_special_emu = 1;
+
+			/*
+			 * Don't bother going to OPAL if that's the
+			 * only relevant bit.
+			 */
+			if (!(hmer & mfspr(SPRN_HMEER)))
+				return local_paca->hmi_p9_special_emu;
+		}
+	}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 	wait_for_subcore_guest_exit();
 
 	if (ppc_md.hmi_exception_early)
@@ -477,5 +505,5 @@ long hmi_exception_realmode(struct pt_regs *regs)
 
 	wait_for_tb_resync();
 
-	return 0;
+	return 1;
 }
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 13c9dcdcba69..9ae1924c7d1a 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -37,6 +37,7 @@
 #include <linux/kdebug.h>
 #include <linux/ratelimit.h>
 #include <linux/context_tracking.h>
+#include <linux/smp.h>
 
 #include <asm/emulated_ops.h>
 #include <asm/pgtable.h>
@@ -699,6 +700,187 @@ void SMIException(struct pt_regs *regs)
 	die("System Management Interrupt", regs, SIGABRT);
 }
 
+#ifdef CONFIG_VSX
+static void p9_hmi_special_emu(struct pt_regs *regs)
+{
+	unsigned int ra, rb, t, i, sel, instr, rc;
+	const void __user *addr;
+	u8 vbuf[16], *vdst;
+	unsigned long ea, msr, msr_mask;
+	bool swap;
+
+	if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip))
+		return;
+
+	/*
+	 * lxvb16x	opcode: 0x7c0006d8
+	 * lxvd2x	opcode: 0x7c000698
+	 * lxvh8x	opcode: 0x7c000658
+	 * lxvw4x	opcode: 0x7c000618
+	 */
+	if ((instr & 0xfc00073e) != 0x7c000618) {
+		pr_devel("HMI vec emu: not vector CI %i:%s[%d] nip=%016lx"
+			 " instr=%08x\n",
+			 smp_processor_id(), current->comm, current->pid,
+			 regs->nip, instr);
+		return;
+	}
+
+	/* Grab vector registers into the task struct */
+	msr = regs->msr; /* Grab msr before we flush the bits */
+	flush_vsx_to_thread(current);
+	enable_kernel_altivec();
+
+	/*
+	 * Is userspace running with a different endian (this is rare but
+	 * not impossible)
+	 */
+	swap = (msr & MSR_LE) != (MSR_KERNEL & MSR_LE);
+
+	/* Decode the instruction */
+	ra = (instr >> 16) & 0x1f;
+	rb = (instr >> 11) & 0x1f;
+	t = (instr >> 21) & 0x1f;
+	if (instr & 1)
+		vdst = (u8 *)&current->thread.vr_state.vr[t];
+	else
+		vdst = (u8 *)&current->thread.fp_state.fpr[t][0];
+
+	/* Grab the vector address */
+	ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0);
+	if (is_32bit_task())
+		ea &= 0xfffffffful;
+	addr = (__force const void __user *)ea;
+
+	/* Check it */
+	if (!access_ok(VERIFY_READ, addr, 16)) {
+		pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx"
+			 " instr=%08x addr=%016lx\n",
+			 smp_processor_id(), current->comm, current->pid,
+			 regs->nip, instr, (unsigned long)addr);
+		return;
+	}
+
+	/* Read the vector */
+	rc = 0;
+	if ((unsigned long)addr & 0xfUL)
+		/* unaligned case */
+		rc = __copy_from_user_inatomic(vbuf, addr, 16);
+	else
+		__get_user_atomic_128_aligned(vbuf, addr, rc);
+	if (rc) {
+		pr_devel("HMI vec emu: page fault %i:%s[%d] nip=%016lx"
+			 " instr=%08x addr=%016lx\n",
+			 smp_processor_id(), current->comm, current->pid,
+			 regs->nip, instr, (unsigned long)addr);
+		return;
+	}
+
+	pr_devel("HMI vec emu: emulated vector CI %i:%s[%d] nip=%016lx"
+		 " instr=%08x addr=%016lx\n",
+		 smp_processor_id(), current->comm, current->pid, regs->nip,
+		 instr, (unsigned long) addr);
+
+	/* Grab instruction "selector" */
+	sel = (instr >> 6) & 3;
+
+	/*
+	 * Check to make sure the facility is actually enabled. This
+	 * could happen if we get a false positive hit.
+	 *
+	 * lxvd2x/lxvw4x always check MSR VSX sel = 0,2
+	 * lxvh8x/lxvb16x check MSR VSX or VEC depending on VSR used sel = 1,3
+	 */
+	msr_mask = MSR_VSX;
+	if ((sel & 1) && (instr & 1)) /* lxvh8x & lxvb16x + VSR >= 32 */
+		msr_mask = MSR_VEC;
+	if (!(msr & msr_mask)) {
+		pr_devel("HMI vec emu: MSR fac clear %i:%s[%d] nip=%016lx"
+			 " instr=%08x msr:%016lx\n",
+			 smp_processor_id(), current->comm, current->pid,
+			 regs->nip, instr, msr);
+		return;
+	}
+
+	/* Do logging here before we modify sel based on endian */
+	switch (sel) {
+	case 0:	/* lxvw4x */
+		PPC_WARN_EMULATED(lxvw4x, regs);
+		break;
+	case 1: /* lxvh8x */
+		PPC_WARN_EMULATED(lxvh8x, regs);
+		break;
+	case 2: /* lxvd2x */
+		PPC_WARN_EMULATED(lxvd2x, regs);
+		break;
+	case 3: /* lxvb16x */
+		PPC_WARN_EMULATED(lxvb16x, regs);
+		break;
+	}
+
+#ifdef __LITTLE_ENDIAN__
+	/*
+	 * An LE kernel stores the vector in the task struct as an LE
+	 * byte array (effectively swapping both the components and
+	 * the content of the components). Those instructions expect
+	 * the components to remain in ascending address order, so we
+	 * swap them back.
+	 *
+	 * If we are running a BE user space, the expectation is that
+	 * of a simple memcpy, so forcing the emulation to look like
+	 * a lxvb16x should do the trick.
+	 */
+	if (swap)
+		sel = 3;
+
+	switch (sel) {
+	case 0:	/* lxvw4x */
+		for (i = 0; i < 4; i++)
+			((u32 *)vdst)[i] = ((u32 *)vbuf)[3-i];
+		break;
+	case 1: /* lxvh8x */
+		for (i = 0; i < 8; i++)
+			((u16 *)vdst)[i] = ((u16 *)vbuf)[7-i];
+		break;
+	case 2: /* lxvd2x */
+		for (i = 0; i < 2; i++)
+			((u64 *)vdst)[i] = ((u64 *)vbuf)[1-i];
+		break;
+	case 3: /* lxvb16x */
+		for (i = 0; i < 16; i++)
+			vdst[i] = vbuf[15-i];
+		break;
+	}
+#else /* __LITTLE_ENDIAN__ */
+	/* On a big endian kernel, a BE userspace only needs a memcpy */
+	if (!swap)
+		sel = 3;
+
+	/* Otherwise, we need to swap the content of the components */
+	switch (sel) {
+	case 0:	/* lxvw4x */
+		for (i = 0; i < 4; i++)
+			((u32 *)vdst)[i] = cpu_to_le32(((u32 *)vbuf)[i]);
+		break;
+	case 1: /* lxvh8x */
+		for (i = 0; i < 8; i++)
+			((u16 *)vdst)[i] = cpu_to_le16(((u16 *)vbuf)[i]);
+		break;
+	case 2: /* lxvd2x */
+		for (i = 0; i < 2; i++)
+			((u64 *)vdst)[i] = cpu_to_le64(((u64 *)vbuf)[i]);
+		break;
+	case 3: /* lxvb16x */
+		memcpy(vdst, vbuf, 16);
+		break;
+	}
+#endif /* !__LITTLE_ENDIAN__ */
+
+	/* Go to next instruction */
+	regs->nip += 4;
+}
+#endif /* CONFIG_VSX */
+
 void handle_hmi_exception(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs;
@@ -706,6 +888,21 @@ void handle_hmi_exception(struct pt_regs *regs)
 	old_regs = set_irq_regs(regs);
 	irq_enter();
 
+#ifdef CONFIG_VSX
+	/* Real mode flagged P9 special emu is needed */
+	if (local_paca->hmi_p9_special_emu) {
+		local_paca->hmi_p9_special_emu = 0;
+
+		/*
+		 * We don't want to take page faults while doing the
+		 * emulation, we just replay the instruction if necessary.
+		 */
+		pagefault_disable();
+		p9_hmi_special_emu(regs);
+		pagefault_enable();
+	}
+#endif /* CONFIG_VSX */
+
 	if (ppc_md.handle_hmi_exception)
 		ppc_md.handle_hmi_exception(regs);
 
@@ -1924,6 +2121,10 @@ struct ppc_emulated ppc_emulated = {
 	WARN_EMULATED_SETUP(mfdscr),
 	WARN_EMULATED_SETUP(mtdscr),
 	WARN_EMULATED_SETUP(lq_stq),
+	WARN_EMULATED_SETUP(lxvw4x),
+	WARN_EMULATED_SETUP(lxvh8x),
+	WARN_EMULATED_SETUP(lxvd2x),
+	WARN_EMULATED_SETUP(lxvb16x),
 #endif
 };
 
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index c17f81e433f7..355d3f99cafb 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -49,6 +49,13 @@
 
 static void pnv_smp_setup_cpu(int cpu)
 {
+	/*
+	 * P9 workaround for CI vector load (see traps.c),
+	 * enable the corresponding HMI interrupt
+	 */
+	if (pvr_version_is(PVR_POWER9))
+		mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17));
+
 	if (xive_enabled())
 		xive_smp_setup_cpu();
 	else if (cpu != boot_cpuid)
-- 
cgit 


From 6110236b9bbd177debc045c5fc29224444686ece Mon Sep 17 00:00:00 2001
From: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Date: Sun, 3 Sep 2017 20:15:12 +0200
Subject: powerpc/mm: Export flush_all_mm()

With the optimizations introduced by commit a46cc7a90fd8
("powerpc/mm/radix: Improve TLB/PWC flushes"), flush_tlb_mm() no
longer flushes the page walk cache (PWC) with radix. This patch
introduces flush_all_mm(), which flushes everything, TLB and PWC, for
a given mm.

Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Reviewed-By: Alistair Popple <alistair@popple.id.au>
[mpe: Add a WARN_ON_ONCE() in the empty hash routines]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 22 ++++++++++++++++++++++
 .../powerpc/include/asm/book3s/64/tlbflush-radix.h |  3 +++
 arch/powerpc/include/asm/book3s/64/tlbflush.h      | 15 +++++++++++++++
 arch/powerpc/mm/tlb-radix.c                        |  6 ++++--
 4 files changed, 44 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
index 2f6373144e2c..99c99bb04353 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -65,6 +65,28 @@ static inline void hash__flush_tlb_mm(struct mm_struct *mm)
 {
 }
 
+static inline void hash__local_flush_all_mm(struct mm_struct *mm)
+{
+	/*
+	 * There's no Page Walk Cache for hash, so what is needed is
+	 * the same as flush_tlb_mm(), which doesn't really make sense
+	 * with hash. So the only thing we could do is flush the
+	 * entire LPID! Punt for now, as it's not being used.
+	 */
+	WARN_ON_ONCE(1);
+}
+
+static inline void hash__flush_all_mm(struct mm_struct *mm)
+{
+	/*
+	 * There's no Page Walk Cache for hash, so what is needed is
+	 * the same as flush_tlb_mm(), which doesn't really make sense
+	 * with hash. So the only thing we could do is flush the
+	 * entire LPID! Punt for now, as it's not being used.
+	 */
+	WARN_ON_ONCE(1);
+}
+
 static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma,
 					  unsigned long vmaddr)
 {
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 9b433a624bf3..af06c6fe8a9f 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -21,17 +21,20 @@ extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long sta
 extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
+extern void radix__local_flush_all_mm(struct mm_struct *mm);
 extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 					      int psize);
 extern void radix__tlb_flush(struct mmu_gather *tlb);
 #ifdef CONFIG_SMP
 extern void radix__flush_tlb_mm(struct mm_struct *mm);
+extern void radix__flush_all_mm(struct mm_struct *mm);
 extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
 extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 					int psize);
 #else
 #define radix__flush_tlb_mm(mm)		radix__local_flush_tlb_mm(mm)
+#define radix__flush_all_mm(mm)		radix__local_flush_all_mm(mm)
 #define radix__flush_tlb_page(vma,addr)	radix__local_flush_tlb_page(vma,addr)
 #define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p)
 #endif
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 72b925f97bab..70760d018bcd 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -57,6 +57,13 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma,
 	return hash__local_flush_tlb_page(vma, vmaddr);
 }
 
+static inline void local_flush_all_mm(struct mm_struct *mm)
+{
+	if (radix_enabled())
+		return radix__local_flush_all_mm(mm);
+	return hash__local_flush_all_mm(mm);
+}
+
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
 	if (radix_enabled())
@@ -79,9 +86,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 		return radix__flush_tlb_page(vma, vmaddr);
 	return hash__flush_tlb_page(vma, vmaddr);
 }
+
+static inline void flush_all_mm(struct mm_struct *mm)
+{
+	if (radix_enabled())
+		return radix__flush_all_mm(mm);
+	return hash__flush_all_mm(mm);
+}
 #else
 #define flush_tlb_mm(mm)		local_flush_tlb_mm(mm)
 #define flush_tlb_page(vma, addr)	local_flush_tlb_page(vma, addr)
+#define flush_all_mm(mm)		local_flush_all_mm(mm)
 #endif /* CONFIG_SMP */
 /*
  * flush the page walk cache for the address
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index b3e849c4886e..5a1f46eff3a2 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -144,7 +144,7 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm)
 EXPORT_SYMBOL(radix__local_flush_tlb_mm);
 
 #ifndef CONFIG_SMP
-static void radix__local_flush_all_mm(struct mm_struct *mm)
+void radix__local_flush_all_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 
@@ -154,6 +154,7 @@ static void radix__local_flush_all_mm(struct mm_struct *mm)
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
 	preempt_enable();
 }
+EXPORT_SYMBOL(radix__local_flush_all_mm);
 #endif /* CONFIG_SMP */
 
 void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
@@ -200,7 +201,7 @@ no_context:
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
 
-static void radix__flush_all_mm(struct mm_struct *mm)
+void radix__flush_all_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 
@@ -216,6 +217,7 @@ static void radix__flush_all_mm(struct mm_struct *mm)
 no_context:
 	preempt_enable();
 }
+EXPORT_SYMBOL(radix__flush_all_mm);
 
 void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
 {
-- 
cgit 


From 03b8abedf4f4965e7e9e0d4f92877c42c07ce19f Mon Sep 17 00:00:00 2001
From: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Date: Sun, 3 Sep 2017 20:15:13 +0200
Subject: cxl: Enable global TLBIs for cxl contexts

The PSL and nMMU need to see all TLB invalidations for the memory
contexts used on the adapter. For the hash memory model, it is done by
making all TLBIs global as soon as the cxl driver is in use. For
radix, we need something similar, but we can refine and only convert
to global the invalidations for contexts actually used by the device.

The new mm_context_add_copro() API increments the 'active_cpus' count
for the contexts attached to the cxl adapter. As soon as there's more
than 1 active cpu, the TLBIs for the context become global. Active cpu
count must be decremented when detaching to restore locality if
possible and to avoid overflowing the counter.

The hash memory model support is somewhat limited, as we can't
decrement the active cpus count when mm_context_remove_copro() is
called, because we can't flush the TLB for a mm on hash. So TLBIs
remain global on hash.

Signed-off-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Fixes: f24be42aab37 ("cxl: Add psl9 specific code")
Tested-by: Alistair Popple <alistair@popple.id.au>
[mpe: Fold in updated comment on the barrier from Fred]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mmu_context.h | 46 ++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/mmu_context.c          |  9 -------
 2 files changed, 46 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 309592589e30..a0d7145d6cd2 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -77,6 +77,52 @@ extern void switch_cop(struct mm_struct *next);
 extern int use_cop(unsigned long acop, struct mm_struct *mm);
 extern void drop_cop(unsigned long acop, struct mm_struct *mm);
 
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline void inc_mm_active_cpus(struct mm_struct *mm)
+{
+	atomic_inc(&mm->context.active_cpus);
+}
+
+static inline void dec_mm_active_cpus(struct mm_struct *mm)
+{
+	atomic_dec(&mm->context.active_cpus);
+}
+
+static inline void mm_context_add_copro(struct mm_struct *mm)
+{
+	/*
+	 * On hash, should only be called once over the lifetime of
+	 * the context, as we can't decrement the active cpus count
+	 * and flush properly for the time being.
+	 */
+	inc_mm_active_cpus(mm);
+}
+
+static inline void mm_context_remove_copro(struct mm_struct *mm)
+{
+	/*
+	 * Need to broadcast a global flush of the full mm before
+	 * decrementing active_cpus count, as the next TLBI may be
+	 * local and the nMMU and/or PSL need to be cleaned up.
+	 * Should be rare enough so that it's acceptable.
+	 *
+	 * Skip on hash, as we don't know how to do the proper flush
+	 * for the time being. Invalidations will remain global if
+	 * used on hash.
+	 */
+	if (radix_enabled()) {
+		flush_all_mm(mm);
+		dec_mm_active_cpus(mm);
+	}
+}
+#else
+static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
+static inline void dec_mm_active_cpus(struct mm_struct *mm) { }
+static inline void mm_context_add_copro(struct mm_struct *mm) { }
+static inline void mm_context_remove_copro(struct mm_struct *mm) { }
+#endif
+
+
 extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			       struct task_struct *tsk);
 
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index 0f613bc63c50..d60a62bf4fc7 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -34,15 +34,6 @@ static inline void switch_mm_pgdir(struct task_struct *tsk,
 				   struct mm_struct *mm) { }
 #endif
 
-#ifdef CONFIG_PPC_BOOK3S_64
-static inline void inc_mm_active_cpus(struct mm_struct *mm)
-{
-	atomic_inc(&mm->context.active_cpus);
-}
-#else
-static inline void inc_mm_active_cpus(struct mm_struct *mm) { }
-#endif
-
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
-- 
cgit 


From 842dc1dbabb5e874550b52d896851e5e53e28a5f Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 29 Sep 2017 13:29:37 +1000
Subject: powerpc/watchdog: Do not panic from locked CPU's IPI handler

The SMP watchdog will detect locked CPUs and IPI them to print a
backtrace and registers. If panic on hard lockup is enabled, do not
panic from this handler, because that can cause recursion into the IPI
layer during the panic.

The caller already panics in this case.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/watchdog.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 2f6eadd9408d..532a1adbe89b 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -97,8 +97,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 	else
 		dump_stack();
 
-	if (hardlockup_panic)
-		nmi_panic(regs, "Hard LOCKUP");
+	/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
 static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
-- 
cgit 


From d58fdd9d7f220f34830838af4e7b21854469ae8a Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 29 Sep 2017 13:29:38 +1000
Subject: powerpc/watchdog: Do not backtrace locked CPUs twice if allcpus
 backtrace is enabled

If sysctl_hardlockup_all_cpu_backtrace is enabled, there is no need to
IPI stuck CPUs for backtrace before trigger_allbutself_cpu_backtrace(),
which does the same thing again.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/watchdog.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 532a1adbe89b..920e61c79f47 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -133,15 +133,18 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 	pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n",
 			cpu, cpumask_pr_args(&wd_smp_cpus_pending));
 
-	/*
-	 * Try to trigger the stuck CPUs.
-	 */
-	for_each_cpu(c, &wd_smp_cpus_pending) {
-		if (c == cpu)
-			continue;
-		smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+	if (!sysctl_hardlockup_all_cpu_backtrace) {
+		/*
+		 * Try to trigger the stuck CPUs, unless we are going to
+		 * get a backtrace on all of them anyway.
+		 */
+		for_each_cpu(c, &wd_smp_cpus_pending) {
+			if (c == cpu)
+				continue;
+			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+		}
+		smp_flush_nmi_ipi(1000000);
 	}
-	smp_flush_nmi_ipi(1000000);
 
 	/* Take the stuck CPUs out of the watch group */
 	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
-- 
cgit 


From 80e4d70b06863e0104e5a0dc78aa3710297fbd4b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 29 Sep 2017 13:29:39 +1000
Subject: powerpc/watchdog: Do not trigger SMP crash from touch_nmi_watchdog

In xmon, touch_nmi_watchdog() is not expected to be checking that
other CPUs have not touched the watchdog, so the code will just call
touch_nmi_watchdog() once before re-enabling hard interrupts.

Just update our CPU's state, and ignore apparently stuck SMP threads.

Arguably touch_nmi_watchdog should check for SMP lockups, and callers
should be fixed, but that's not trivial for the input code of xmon.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/watchdog.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 920e61c79f47..1fb9379dc683 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -277,9 +277,12 @@ void arch_touch_nmi_watchdog(void)
 {
 	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
 	int cpu = smp_processor_id();
+	u64 tb = get_tb();
 
-	if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
-		watchdog_timer_interrupt(cpu);
+	if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
+		per_cpu(wd_timer_tb, cpu) = tb;
+		wd_smp_clear_cpu_pending(cpu, tb);
+	}
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
-- 
cgit 


From 064996d62a33ffe10264b5af5dca92d54f60f806 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 29 Sep 2017 13:29:40 +1000
Subject: powerpc/xmon: Avoid tripping SMP hardlockup watchdog

The SMP hardlockup watchdog cross-checks other CPUs for lockups, which
causes xmon headaches because it's assuming interrupts hard disabled
means no watchdog troubles. Try to improve that by calling
touch_nmi_watchdog() in obvious places where secondaries are spinning.

Also annotate these spin loops with spin_begin/end calls.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 33351c6704b1..d9a12102b111 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -530,14 +530,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 
  waiting:
 	secondary = 1;
+	spin_begin();
 	while (secondary && !xmon_gate) {
 		if (in_xmon == 0) {
-			if (fromipi)
+			if (fromipi) {
+				spin_end();
 				goto leave;
+			}
 			secondary = test_and_set_bit(0, &in_xmon);
 		}
-		barrier();
+		spin_cpu_relax();
+		touch_nmi_watchdog();
 	}
+	spin_end();
 
 	if (!secondary && !xmon_gate) {
 		/* we are the first cpu to come in */
@@ -568,21 +573,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
 		mb();
 		xmon_gate = 1;
 		barrier();
+		touch_nmi_watchdog();
 	}
 
  cmdloop:
 	while (in_xmon) {
 		if (secondary) {
+			spin_begin();
 			if (cpu == xmon_owner) {
 				if (!test_and_set_bit(0, &xmon_taken)) {
 					secondary = 0;
+					spin_end();
 					continue;
 				}
 				/* missed it */
 				while (cpu == xmon_owner)
-					barrier();
+					spin_cpu_relax();
 			}
-			barrier();
+			spin_cpu_relax();
+			touch_nmi_watchdog();
 		} else {
 			cmd = cmds(regs);
 			if (cmd != 0) {
-- 
cgit 


From 78adf6c214f04fef01c714d78441653dc838a3ba Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 29 Sep 2017 13:29:41 +1000
Subject: powerpc/64s: Implement system reset idle wakeup reason

It is possible to wake from idle due to a system reset exception, in
which case the CPU takes a system reset interrupt to wake from idle,
with system reset as the wakeup reason.

The regular (not idle wakeup) system reset interrupt handler must be
invoked in this case, otherwise the system reset interrupt is lost.

Handle the system reset interrupt immediately after CPU state has been
restored.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/irq.c | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4e65bf82f5e0..4813b83b22aa 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -394,11 +394,19 @@ bool prep_irq_for_idle_irqsoff(void)
 /*
  * Take the SRR1 wakeup reason, index into this table to find the
  * appropriate irq_happened bit.
+ *
+ * Sytem reset exceptions taken in idle state also come through here,
+ * but they are NMI interrupts so do not need to wait for IRQs to be
+ * restored, and should be taken as early as practical. These are marked
+ * with 0xff in the table. The Power ISA specifies 0100b as the system
+ * reset interrupt reason.
  */
+#define IRQ_SYSTEM_RESET	0xff
+
 static const u8 srr1_to_lazyirq[0x10] = {
 	0, 0, 0,
 	PACA_IRQ_DBELL,
-	0,
+	IRQ_SYSTEM_RESET,
 	PACA_IRQ_DBELL,
 	PACA_IRQ_DEC,
 	0,
@@ -407,15 +415,42 @@ static const u8 srr1_to_lazyirq[0x10] = {
 	PACA_IRQ_HMI,
 	0, 0, 0, 0, 0 };
 
+static noinline void replay_system_reset(void)
+{
+	struct pt_regs regs;
+
+	ppc_save_regs(&regs);
+	regs.trap = 0x100;
+	get_paca()->in_nmi = 1;
+	system_reset_exception(&regs);
+	get_paca()->in_nmi = 0;
+}
+
 void irq_set_pending_from_srr1(unsigned long srr1)
 {
 	unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
+	u8 reason = srr1_to_lazyirq[idx];
+
+	/*
+	 * Take the system reset now, which is immediately after registers
+	 * are restored from idle. It's an NMI, so interrupts need not be
+	 * re-enabled before it is taken.
+	 */
+	if (unlikely(reason == IRQ_SYSTEM_RESET)) {
+		replay_system_reset();
+		return;
+	}
 
 	/*
 	 * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
-	 * so this can be called unconditionally with srr1 wake reason.
+	 * so this can be called unconditionally with the SRR1 wake
+	 * reason as returned by the idle code, which uses 0 to mean no
+	 * interrupt.
+	 *
+	 * If a future CPU was to designate this as an interrupt reason,
+	 * then a new index for no interrupt must be assigned.
 	 */
-	local_paca->irq_happened |= srr1_to_lazyirq[idx];
+	local_paca->irq_happened |= reason;
 }
 #endif /* CONFIG_PPC_BOOK3S */
 
-- 
cgit 


From e36d0a2ed5019184bb9b94ff1138c87c05905789 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 29 Sep 2017 13:29:42 +1000
Subject: powerpc/powernv: Implement NMI IPI with OPAL_SIGNAL_SYSTEM_RESET

This allows MSR[EE]=0 lockups to be detected on an OPAL (bare metal)
system similarly to the hcall NMI IPI on pseries guests, when the
platform/firmware supports it.

This is an example of CPU10 spinning with interrupts hard disabled:

  Watchdog CPU:32 detected Hard LOCKUP other CPUS:10
  Watchdog CPU:10 Hard LOCKUP
  CPU: 10 PID: 4410 Comm: bash Not tainted 4.13.0-rc7-00074-ge89ce1f89f62-dirty #34
  task: c0000003a82b4400 task.stack: c0000003af55c000
  NIP: c0000000000a7b38 LR: c000000000659044 CTR: c0000000000a7b00
  REGS: c00000000fd23d80 TRAP: 0100   Not tainted  (4.13.0-rc7-00074-ge89ce1f89f62-dirty)
  MSR: 90000000000c1033 <SF,HV,ME,IR,DR,RI,LE>
  CR: 28422222  XER: 20000000
  CFAR: c0000000000a7b38 SOFTE: 0
  GPR00: c000000000659044 c0000003af55fbb0 c000000001072a00 0000000000000078
  GPR04: c0000003c81b5c80 c0000003c81cc7e8 9000000000009033 0000000000000000
  GPR08: 0000000000000000 c0000000000a7b00 0000000000000001 9000000000001003
  GPR12: c0000000000a7b00 c00000000fd83200 0000000010180df8 0000000010189e60
  GPR16: 0000000010189ed8 0000000010151270 000000001018bd88 000000001018de78
  GPR20: 00000000370a0668 0000000000000001 00000000101645e0 0000000010163c10
  GPR24: 00007fffd14d6294 00007fffd14d6290 c000000000fba6f0 0000000000000004
  GPR28: c000000000f351d8 0000000000000078 c000000000f4095c 0000000000000000
  NIP [c0000000000a7b38] sysrq_handle_xmon+0x38/0x40
  LR [c000000000659044] __handle_sysrq+0xe4/0x270
  Call Trace:
  [c0000003af55fbd0] [c000000000659044] __handle_sysrq+0xe4/0x270
  [c0000003af55fc70] [c000000000659810] write_sysrq_trigger+0x70/0xa0
  [c0000003af55fca0] [c0000000003da650] proc_reg_write+0xb0/0x110
  [c0000003af55fcf0] [c0000000003423bc] __vfs_write+0x6c/0x1b0
  [c0000003af55fd90] [c000000000344398] vfs_write+0xd8/0x240
  [c0000003af55fde0] [c00000000034632c] SyS_write+0x6c/0x110
  [c0000003af55fe30] [c00000000000b220] system_call+0x58/0x6c

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Use kernel types for opal_signal_system_reset()]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/opal-api.h            |  1 +
 arch/powerpc/include/asm/opal.h                |  2 +
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 arch/powerpc/platforms/powernv/setup.c         |  1 +
 arch/powerpc/platforms/powernv/smp.c           | 52 ++++++++++++++++++++++++++
 5 files changed, 57 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 450a60b81d2a..9d191ebea706 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -188,6 +188,7 @@
 #define OPAL_XIVE_DUMP				142
 #define OPAL_XIVE_RESERVED3			143
 #define OPAL_XIVE_RESERVED4			144
+#define OPAL_SIGNAL_SYSTEM_RESET		145
 #define OPAL_NPU_INIT_CONTEXT			146
 #define OPAL_NPU_DESTROY_CONTEXT		147
 #define OPAL_NPU_MAP_LPAR			148
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 726c23304a57..04c32b08ffa1 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr);
 int opal_set_power_shift_ratio(u32 handle, int token, u32 psr);
 int opal_sensor_group_clear(u32 group_hndl, int token);
 
+s64 opal_signal_system_reset(s32 cpu);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 8c1ede2d3f7e..37cd170201a2 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info,		OPAL_XIVE_GET_VP_INFO);
 OPAL_CALL(opal_xive_set_vp_info,		OPAL_XIVE_SET_VP_INFO);
 OPAL_CALL(opal_xive_sync,			OPAL_XIVE_SYNC);
 OPAL_CALL(opal_xive_dump,			OPAL_XIVE_DUMP);
+OPAL_CALL(opal_signal_system_reset,		OPAL_SIGNAL_SYSTEM_RESET);
 OPAL_CALL(opal_npu_init_context,		OPAL_NPU_INIT_CONTEXT);
 OPAL_CALL(opal_npu_destroy_context,		OPAL_NPU_DESTROY_CONTEXT);
 OPAL_CALL(opal_npu_map_lpar,			OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 897aa1400eb8..cf52d53da460 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -282,6 +282,7 @@ static void __init pnv_setup_machdep_opal(void)
 	ppc_md.restart = pnv_restart;
 	pm_power_off = pnv_power_off;
 	ppc_md.halt = pnv_halt;
+	/* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */
 	ppc_md.machine_check_exception = opal_machine_check;
 	ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
 	ppc_md.hmi_exception_early = opal_hmi_exception_early;
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 355d3f99cafb..ba030669eca1 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -297,6 +297,54 @@ static void __init pnv_smp_probe(void)
 	}
 }
 
+static int pnv_system_reset_exception(struct pt_regs *regs)
+{
+	if (smp_handle_nmi_ipi(regs))
+		return 1;
+	return 0;
+}
+
+static int pnv_cause_nmi_ipi(int cpu)
+{
+	int64_t rc;
+
+	if (cpu >= 0) {
+		rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu));
+		if (rc != OPAL_SUCCESS)
+			return 0;
+		return 1;
+
+	} else if (cpu == NMI_IPI_ALL_OTHERS) {
+		bool success = true;
+		int c;
+
+
+		/*
+		 * We do not use broadcasts (yet), because it's not clear
+		 * exactly what semantics Linux wants or the firmware should
+		 * provide.
+		 */
+		for_each_online_cpu(c) {
+			if (c == smp_processor_id())
+				continue;
+
+			rc = opal_signal_system_reset(
+						get_hard_smp_processor_id(c));
+			if (rc != OPAL_SUCCESS)
+				success = false;
+		}
+		if (success)
+			return 1;
+
+		/*
+		 * Caller will fall back to doorbells, which may pick
+		 * up the remainders.
+		 */
+	}
+
+	return 0;
+}
+
 static struct smp_ops_t pnv_smp_ops = {
 	.message_pass	= NULL, /* Use smp_muxed_ipi_message_pass */
 	.cause_ipi	= NULL,	/* Filled at runtime by pnv_smp_probe() */
@@ -315,6 +363,10 @@ static struct smp_ops_t pnv_smp_ops = {
 /* This is called very early during platform setup_arch */
 void __init pnv_smp_init(void)
 {
+	if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) {
+		ppc_md.system_reset_exception = pnv_system_reset_exception;
+		pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi;
+	}
 	smp_ops = &pnv_smp_ops;
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
cgit 


From 969a86a2855d484a00205a424df1c693acee15d9 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 27 Sep 2017 15:45:58 +1000
Subject: powerpc/powernv: Use early_radix_enabled in POWER9 tlb flush

This code is used at boot and machine checks, so it should be using
early_radix_enabled() (which is usable any time).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce_power.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index b76ca198e09c..d37e612050b5 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -128,7 +128,7 @@ void __flush_tlb_power9(unsigned int action)
 {
 	unsigned int num_sets;
 
-	if (radix_enabled())
+	if (early_radix_enabled())
 		num_sets = POWER9_TLB_SETS_RADIX;
 	else
 		num_sets = POWER9_TLB_SETS_HASH;
-- 
cgit 


From 83ad1e6a1dc049dd04be4564125a7fd49e022183 Mon Sep 17 00:00:00 2001
From: Allen Pais <allen.lkml@gmail.com>
Date: Fri, 22 Sep 2017 17:04:58 +0530
Subject: powerpc/oprofile: Use setup_timer() helper

Use setup_timer function instead of initializing timer with the
function and data fields.

Signed-off-by: Allen Pais <allen.lkml@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/oprofile/op_model_cell.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index c82497a31c54..264b6ab11978 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -555,9 +555,7 @@ static void cell_virtual_cntr(unsigned long data)
 
 static void start_virt_cntrs(void)
 {
-	init_timer(&timer_virt_cntr);
-	timer_virt_cntr.function = cell_virtual_cntr;
-	timer_virt_cntr.data = 0UL;
+	setup_timer(&timer_virt_cntr, cell_virtual_cntr, 0UL);
 	timer_virt_cntr.expires = jiffies + HZ / 10;
 	add_timer(&timer_virt_cntr);
 }
@@ -679,9 +677,7 @@ static void spu_evnt_swap(unsigned long data)
 
 static void start_spu_event_swap(void)
 {
-	init_timer(&timer_spu_event_swap);
-	timer_spu_event_swap.function = spu_evnt_swap;
-	timer_spu_event_swap.data = 0UL;
+	setup_timer(&timer_spu_event_swap, spu_evnt_swap, 0UL);
 	timer_spu_event_swap.expires = jiffies + HZ / 25;
 	add_timer(&timer_spu_event_swap);
 }
-- 
cgit 


From 8d6b1bf20f61cd5358fdc5fa83d7fc11c8393d8a Mon Sep 17 00:00:00 2001
From: Allen Pais <allen.lkml@gmail.com>
Date: Fri, 22 Sep 2017 17:04:59 +0530
Subject: powerpc/6xx: Use setup_timer() helper

Use setup_timer function instead of initializing timer with the
function and data fields.

Signed-off-by: Allen Pais <allen.lkml@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/tau_6xx.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index a753b72efbc0..726c2d683f93 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -229,8 +229,7 @@ int __init TAU_init(void)
 
 
 	/* first, set up the window shrinking timer */
-	init_timer(&tau_timer);
-	tau_timer.function = tau_timeout_smp;
+	setup_timer(&tau_timer, tau_timeout_smp, 0UL);
 	tau_timer.expires = jiffies + shrink_timer;
 	add_timer(&tau_timer);
 
-- 
cgit 


From 01451ad47e2724eb123e87a5bae04e943046b87a Mon Sep 17 00:00:00 2001
From: Allen Pais <allen.lkml@gmail.com>
Date: Fri, 22 Sep 2017 17:05:00 +0530
Subject: powerpc/powermac: Use setup_timer() helper

Use setup_timer function instead of initializing timer with the
function and data fields.

Signed-off-by: Allen Pais <allen.lkml@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powermac/low_i2c.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c
index 70183eb3d5c8..39a1d4225e0f 100644
--- a/arch/powerpc/platforms/powermac/low_i2c.c
+++ b/arch/powerpc/platforms/powermac/low_i2c.c
@@ -513,9 +513,7 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np)
 	mutex_init(&host->mutex);
 	init_completion(&host->complete);
 	spin_lock_init(&host->lock);
-	init_timer(&host->timeout_timer);
-	host->timeout_timer.function = kw_i2c_timeout;
-	host->timeout_timer.data = (unsigned long)host;
+	setup_timer(&host->timeout_timer, kw_i2c_timeout, (unsigned long)host);
 
 	psteps = of_get_property(np, "AAPL,address-step", NULL);
 	steps = psteps ? (*psteps) : 0x10;
-- 
cgit 


From 924c8feb041c3ef22d46ac2e7468161aad85e979 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Fri, 29 Sep 2017 11:14:08 +0530
Subject: powerpc/lib/sstep: Add XER bits introduced in POWER ISA v3.0

This adds definitions for the OV32 and CA32 bits of XER that
were introduced in POWER ISA v3.0. There are some existing
instructions that currently set the OV and CA bits based on
certain conditions.

The emulation behaviour of all these instructions needs to
be updated to set these new bits accordingly.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Acked-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/lib/sstep.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 5e8418c28bd8..16814bfc01da 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -31,6 +31,8 @@ extern char system_call_common[];
 #define XER_SO		0x80000000U
 #define XER_OV		0x40000000U
 #define XER_CA		0x20000000U
+#define XER_OV32	0x00080000U
+#define XER_CA32	0x00040000U
 
 #ifdef CONFIG_PPC_FPU
 /*
-- 
cgit 


From dc39c1d68f256990d4bb0033a219e8ca15ba4d7f Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Fri, 29 Sep 2017 11:14:09 +0530
Subject: powerpc/lib/sstep: Fix fixed-point arithmetic instructions that set
 CA32

There are existing fixed-point arithmetic instructions that always set the
CA bit of XER to reflect the carry out of bit 0 in 64-bit mode and out of
bit 32 in 32-bit mode. In ISA v3.0, these instructions also always set the
CA32 bit of XER to reflect the carry out of bit 32.

This fixes the emulated behaviour of such instructions when running on a
system that is compliant with POWER ISA v3.0. The following instructions
are affected:
  * Add Immediate Carrying (addic)
  * Add Immediate Carrying and Record (addic.)
  * Subtract From Immediate Carrying (subfic)
  * Add Carrying (addc[.])
  * Subtract From Carrying (subfc[.])
  * Add Extended (adde[.])
  * Subtract From Extended (subfe[.])
  * Add to Minus One Extended (addme[.])
  * Subtract From Minus One Extended (subfme[.])
  * Add to Zero Extended (addze[.])
  * Subtract From Zero Extended (subfze[.])

Fixes: 0016a4cf5582 ("powerpc: Emulate most Book I instructions in emulate_step()")
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/lib/sstep.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 16814bfc01da..fe1910733e55 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -964,6 +964,16 @@ static nokprobe_inline void set_cr0(const struct pt_regs *regs,
 		op->ccval |= 0x20000000;
 }
 
+static nokprobe_inline void set_ca32(struct instruction_op *op, bool val)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (val)
+			op->xerval |= XER_CA32;
+		else
+			op->xerval &= ~XER_CA32;
+	}
+}
+
 static nokprobe_inline void add_with_carry(const struct pt_regs *regs,
 				     struct instruction_op *op, int rd,
 				     unsigned long val1, unsigned long val2,
@@ -987,6 +997,9 @@ static nokprobe_inline void add_with_carry(const struct pt_regs *regs,
 		op->xerval |= XER_CA;
 	else
 		op->xerval &= ~XER_CA;
+
+	set_ca32(op, (unsigned int)val < (unsigned int)val1 ||
+			(carry_in && (unsigned int)val == (unsigned int)val1));
 }
 
 static nokprobe_inline void do_cmp_signed(const struct pt_regs *regs,
-- 
cgit 


From 0a75aff178df005b0619703cf7886d5c8e28e486 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Fri, 29 Sep 2017 11:14:10 +0530
Subject: powerpc/lib/sstep: Fix fixed-point shift instructions that set CA32

This fixes the emulated behaviour of existing fixed-point shift right
algebraic instructions that are supposed to set both the CA and CA32
bits of XER when running on a system that is compliant with POWER ISA
v3.0 independent of whether the system is executing in 32-bit mode or
64-bit mode. The following instructions are affected:
  * Shift Right Algebraic Word Immediate (srawi[.])
  * Shift Right Algebraic Word (sraw[.])
  * Shift Right Algebraic Doubleword Immediate (sradi[.])
  * Shift Right Algebraic Doubleword (srad[.])

Fixes: 0016a4cf5582 ("powerpc: Emulate most Book I instructions in emulate_step()")
Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/lib/sstep.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index fe1910733e55..5118110c3983 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1804,6 +1804,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 				op->xerval |= XER_CA;
 			else
 				op->xerval &= ~XER_CA;
+			set_ca32(op, op->xerval & XER_CA);
 			goto logical_done;
 
 		case 824:	/* srawi */
@@ -1816,6 +1817,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 				op->xerval |= XER_CA;
 			else
 				op->xerval &= ~XER_CA;
+			set_ca32(op, op->xerval & XER_CA);
 			goto logical_done;
 
 #ifdef __powerpc64__
@@ -1845,6 +1847,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 				op->xerval |= XER_CA;
 			else
 				op->xerval &= ~XER_CA;
+			set_ca32(op, op->xerval & XER_CA);
 			goto logical_done;
 
 		case 826:	/* sradi with sh_5 = 0 */
@@ -1858,6 +1861,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 				op->xerval |= XER_CA;
 			else
 				op->xerval &= ~XER_CA;
+			set_ca32(op, op->xerval & XER_CA);
 			goto logical_done;
 #endif /* __powerpc64__ */
 
-- 
cgit 


From c3dda4b0db9cf5044010e4de88e5e5b94edf916c Mon Sep 17 00:00:00 2001
From: Joel Stanley <joel@jms.id.au>
Date: Wed, 4 Oct 2017 14:53:24 +1030
Subject: powerpc/configs: Add Skiroot defconfig

This configuration is used by the OpenPower firmware for it's
Linux-as-bootloader implementation. Also known as the Petitboot
kernel, this configuration broke in 4.12 (CPU_HOTPLUG=n), so add it to
the upstream tree in order to get better coverage.

Signed-off-by: Joel Stanley <joel@jms.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/skiroot_defconfig | 232 +++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 arch/powerpc/configs/skiroot_defconfig

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig
new file mode 100644
index 000000000000..6bd5e7261335
--- /dev/null
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -0,0 +1,232 @@
+CONFIG_PPC64=y
+CONFIG_ALTIVEC=y
+CONFIG_VSX=y
+CONFIG_NR_CPUS=2048
+CONFIG_CPU_LITTLE_ENDIAN=y
+# CONFIG_SWAP is not set
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+# CONFIG_CROSS_MEMORY_ATTACH is not set
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=20
+CONFIG_RELAY=y
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_RD_GZIP is not set
+# CONFIG_RD_BZIP2 is not set
+# CONFIG_RD_LZMA is not set
+# CONFIG_RD_LZO is not set
+# CONFIG_RD_LZ4 is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_PERF_EVENTS=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_JUMP_LABEL=y
+CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODULE_SIG=y
+CONFIG_MODULE_SIG_FORCE=y
+CONFIG_MODULE_SIG_SHA512=y
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_PPC_PSERIES is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_IDLE=y
+CONFIG_HZ_100=y
+CONFIG_KEXEC=y
+CONFIG_IRQ_ALL_CPUS=y
+CONFIG_NUMA=y
+# CONFIG_COMPACTION is not set
+# CONFIG_MIGRATION is not set
+# CONFIG_BOUNCE is not set
+CONFIG_PPC_64K_PAGES=y
+CONFIG_SCHED_SMT=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=tty0 console=hvc0 powersave=off"
+# CONFIG_SECCOMP is not set
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_NET_IPIP=y
+CONFIG_SYN_COOKIES=y
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_IPV6 is not set
+CONFIG_DNS_RESOLVER=y
+# CONFIG_WIRELESS is not set
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_MTD=m
+CONFIG_MTD_POWERNV_FLASH=m
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=65536
+CONFIG_VIRTIO_BLK=m
+CONFIG_BLK_DEV_NVME=m
+CONFIG_EEPROM_AT24=y
+# CONFIG_CXL is not set
+CONFIG_BLK_DEV_SD=m
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SCAN_ASYNC=y
+CONFIG_SCSI_FC_ATTRS=y
+CONFIG_SCSI_CXGB3_ISCSI=m
+CONFIG_SCSI_CXGB4_ISCSI=m
+CONFIG_SCSI_BNX2_ISCSI=m
+CONFIG_BE2ISCSI=m
+CONFIG_SCSI_AACRAID=m
+CONFIG_MEGARAID_NEWGEN=y
+CONFIG_MEGARAID_MM=m
+CONFIG_MEGARAID_MAILBOX=m
+CONFIG_MEGARAID_SAS=m
+CONFIG_SCSI_MPT2SAS=m
+CONFIG_SCSI_IPR=m
+# CONFIG_SCSI_IPR_TRACE is not set
+# CONFIG_SCSI_IPR_DUMP is not set
+CONFIG_SCSI_QLA_FC=m
+CONFIG_SCSI_QLA_ISCSI=m
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_VIRTIO=m
+CONFIG_SCSI_DH=y
+CONFIG_SCSI_DH_ALUA=m
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+# CONFIG_ATA_SFF is not set
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=m
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID10=m
+CONFIG_MD_RAID456=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_MD_FAULTY=m
+CONFIG_BLK_DEV_DM=m
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+CONFIG_TIGON3=y
+CONFIG_BNX2X=m
+CONFIG_CHELSIO_T1=y
+CONFIG_BE2NET=m
+CONFIG_S2IO=m
+CONFIG_E100=m
+CONFIG_E1000=m
+CONFIG_E1000E=m
+CONFIG_IXGB=m
+CONFIG_IXGBE=m
+CONFIG_MLX4_EN=m
+CONFIG_MLX5_CORE=m
+CONFIG_MLX5_CORE_EN=y
+CONFIG_MYRI10GE=m
+CONFIG_QLGE=m
+CONFIG_NETXEN_NIC=m
+CONFIG_SFC=m
+# CONFIG_USB_NET_DRIVERS is not set
+# CONFIG_WLAN is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_MISC=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_DEVMEM is not set
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_IPMI_HANDLER=y
+CONFIG_IPMI_DEVICE_INTERFACE=y
+CONFIG_IPMI_POWERNV=y
+CONFIG_HW_RANDOM=y
+CONFIG_TCG_TIS_I2C_NUVOTON=y
+# CONFIG_I2C_COMPAT is not set
+CONFIG_I2C_CHARDEV=y
+# CONFIG_I2C_HELPER_AUTO is not set
+CONFIG_DRM=y
+CONFIG_DRM_RADEON=y
+CONFIG_DRM_AST=m
+CONFIG_FIRMWARE_EDID=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_OF=y
+CONFIG_FB_MATROX=y
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G=y
+# CONFIG_LCD_CLASS_DEVICE is not set
+# CONFIG_BACKLIGHT_GENERIC is not set
+# CONFIG_VGA_CONSOLE is not set
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_USB_HIDDEV=y
+CONFIG_USB=y
+CONFIG_USB_MON=y
+CONFIG_USB_XHCI_HCD=y
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_HCD_PPC_OF is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_STORAGE=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_DRV_GENERIC=m
+CONFIG_VIRT_DRIVERS=y
+CONFIG_VIRTIO_PCI=y
+# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_EXT4_FS=m
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_POSIX_ACL=y
+CONFIG_ISO9660_FS=m
+CONFIG_UDF_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_VFAT_FS=m
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+# CONFIG_MISC_FILESYSTEMS is not set
+# CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_CRC16=y
+CONFIG_CRC_ITU_T=y
+CONFIG_LIBCRC32C=y
+CONFIG_PRINTK_TIME=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
+CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_WQ_WATCHDOG=y
+CONFIG_SCHEDSTATS=y
+# CONFIG_FTRACE is not set
+CONFIG_XMON=y
+CONFIG_XMON_DEFAULT=y
+CONFIG_SECURITY=y
+CONFIG_IMA=y
+CONFIG_EVM=y
+# CONFIG_CRYPTO_ECHAINIV is not set
+CONFIG_CRYPTO_ECB=y
+CONFIG_CRYPTO_CMAC=y
+CONFIG_CRYPTO_MD4=y
+CONFIG_CRYPTO_ARC4=y
+CONFIG_CRYPTO_DES=y
+# CONFIG_CRYPTO_HW is not set
-- 
cgit 


From 22085337f5b9d7a7adf5c6cc4e007c28e5216a7f Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 22 Sep 2017 14:40:43 +0530
Subject: powerpc/kprobes: Some cosmetic updates to try_to_emulate()

1. This is only used in kprobes.c, so make it static.
2. Remove the un-necessary (ret == 0) comparison in the else clause.

Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Reviewed-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 367494dc67d9..c2a6ab38a67f 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -239,7 +239,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(arch_prepare_kretprobe);
 
-int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
+static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
 {
 	int ret;
 	unsigned int insn = *p->ainsn.insn;
@@ -261,7 +261,7 @@ int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
 		 */
 		printk("Can't step on instruction %x\n", insn);
 		BUG();
-	} else if (ret == 0)
+	} else
 		/* This instruction can't be boosted */
 		p->ainsn.boostable = -1;
 
-- 
cgit 


From a7b440383f9b078254d38ab6a3672810f1eca74f Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 22 Sep 2017 14:40:44 +0530
Subject: powerpc/kprobes: Do not suppress instruction emulation if a single
 run failed

Currently, we disable instruction emulation if emulate_step() fails for
any reason. However, such failures could be transient and specific to a
particular run. Instead, only disable instruction emulation if we have
never been able to emulate this. If we had emulated this instruction
successfully at least once, then we single step only this probe hit and
continue to try emulating the instruction in subsequent probe hits.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index c2a6ab38a67f..4c1702423676 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -261,9 +261,20 @@ static int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
 		 */
 		printk("Can't step on instruction %x\n", insn);
 		BUG();
-	} else
-		/* This instruction can't be boosted */
-		p->ainsn.boostable = -1;
+	} else {
+		/*
+		 * If we haven't previously emulated this instruction, then it
+		 * can't be boosted. Note it down so we don't try to do so again.
+		 *
+		 * If, however, we had emulated this instruction in the past,
+		 * then this is just an error with the current run (for
+		 * instance, exceptions due to a load/store). We return 0 so
+		 * that this is now single-stepped, but continue to try
+		 * emulating it in subsequent probe hits.
+		 */
+		if (unlikely(p->ainsn.boostable != 1))
+			p->ainsn.boostable = -1;
+	}
 
 	return ret;
 }
-- 
cgit 


From bf3a91251735863fcb230c96e984ebcacee820d9 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 22 Sep 2017 14:40:45 +0530
Subject: powerpc/kprobes: Clean up jprobe detection in livepatch handler

In commit c05b8c4474c03 ("powerpc/kprobes: Skip livepatch_handler() for
jprobes"), we added a helper is_current_kprobe_addr() to help detect if
the modified regs->nip was due to a jprobe or livepatch. Masami felt
that the function name was not quite clear. To that end, this patch
renames is_current_kprobe_addr() to __is_active_jprobe() and adds a
comment to (hopefully) better clarify the purpose of this helper. The
helper has also now been moved to kprobes-ftrace.c so that it is only
available for KPROBES_ON_FTRACE.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kprobes.h             |  2 +-
 arch/powerpc/kernel/kprobes-ftrace.c           | 11 +++++++++++
 arch/powerpc/kernel/kprobes.c                  |  6 ------
 arch/powerpc/kernel/trace/ftrace_64_mprofile.S |  4 ++--
 4 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 8814a7249ceb..9f3be5c8a4a3 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -103,8 +103,8 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
 extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 extern int kprobe_handler(struct pt_regs *regs);
 extern int kprobe_post_handler(struct pt_regs *regs);
-extern int is_current_kprobe_addr(unsigned long addr);
 #ifdef CONFIG_KPROBES_ON_FTRACE
+extern int __is_active_jprobe(unsigned long addr);
 extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
 			   struct kprobe_ctlblk *kcb);
 #else
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
index 6c089d9757c9..48f675a73cff 100644
--- a/arch/powerpc/kernel/kprobes-ftrace.c
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -25,6 +25,17 @@
 #include <linux/preempt.h>
 #include <linux/ftrace.h>
 
+/*
+ * This is called from ftrace code after invoking registered handlers to
+ * disambiguate regs->nip changes done by jprobes and livepatch. We check if
+ * there is an active jprobe at the provided address (mcount location).
+ */
+int __is_active_jprobe(unsigned long addr)
+{
+	struct kprobe *p = kprobe_running();
+	return (p && (unsigned long)p->addr == addr) ? 1 : 0;
+}
+
 static nokprobe_inline
 int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
 		      struct kprobe_ctlblk *kcb, unsigned long orig_nip)
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 4c1702423676..48a81614f629 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -43,12 +43,6 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 
 struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
 
-int is_current_kprobe_addr(unsigned long addr)
-{
-	struct kprobe *p = kprobe_running();
-	return (p && (unsigned long)p->addr == addr) ? 1 : 0;
-}
-
 bool arch_within_kprobe_blacklist(unsigned long addr)
 {
 	return  (addr >= (unsigned long)__kprobes_text_start &&
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
index c98e90b4ea7b..c1cfcce0551d 100644
--- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -110,9 +110,9 @@ ftrace_call:
 	/* NIP has not been altered, skip over further checks */
 	beq	1f
 
-	/* Check if there is an active kprobe on us */
+	/* Check if there is an active jprobe on us */
 	subi	r3, r14, 4
-	bl	is_current_kprobe_addr
+	bl	__is_active_jprobe
 	nop
 
 	/*
-- 
cgit 


From c179ea270100488c416890beef6424ce390ceb38 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 22 Sep 2017 14:40:46 +0530
Subject: powerpc/kprobes: Fix warnings from __this_cpu_read() on preempt
 kernels

Kamalesh pointed out that we are getting the below call traces with
livepatched functions when we enable CONFIG_PREEMPT:

[  495.470721] BUG: using __this_cpu_read() in preemptible [00000000] code: cat/8394
[  495.471167] caller is is_current_kprobe_addr+0x30/0x90
[  495.471171] CPU: 4 PID: 8394 Comm: cat Tainted: G              K 4.13.0-rc7-nnr+ #95
[  495.471173] Call Trace:
[  495.471178] [c00000008fd9b960] [c0000000009f039c] dump_stack+0xec/0x160 (unreliable)
[  495.471184] [c00000008fd9b9a0] [c00000000059169c] check_preemption_disabled+0x15c/0x170
[  495.471187] [c00000008fd9ba30] [c000000000046460] is_current_kprobe_addr+0x30/0x90
[  495.471191] [c00000008fd9ba60] [c00000000004e9a0] ftrace_call+0x1c/0xb8
[  495.471195] [c00000008fd9bc30] [c000000000376fd8] seq_read+0x238/0x5c0
[  495.471199] [c00000008fd9bcd0] [c0000000003cfd78] proc_reg_read+0x88/0xd0
[  495.471203] [c00000008fd9bd00] [c00000000033e5d4] __vfs_read+0x44/0x1b0
[  495.471206] [c00000008fd9bd90] [c0000000003402ec] vfs_read+0xbc/0x1b0
[  495.471210] [c00000008fd9bde0] [c000000000342138] SyS_read+0x68/0x110
[  495.471214] [c00000008fd9be30] [c00000000000bc6c] system_call+0x58/0x6c

Commit c05b8c4474c030 ("powerpc/kprobes: Skip livepatch_handler() for
jprobes") introduced a helper is_current_kprobe_addr() to help determine
if the current function has been livepatched or if it has a jprobe
installed, both of which modify the NIP. This was subsequently renamed
to __is_active_jprobe().

In the case of a jprobe, kprobe_ftrace_handler() disables pre-emption
before calling into setjmp_pre_handler() which returns without disabling
pre-emption. This is done to ensure that the jprobe handler won't
disappear beneath us if the jprobe is unregistered between the
setjmp_pre_handler() and the subsequent longjmp_break_handler() called
from the jprobe handler. Due to this, we can use __this_cpu_read() in
__is_active_jprobe() with the pre-emption check as we know that
pre-emption will be disabled.

However, if this function has been livepatched, we are still doing this
check and when we do so, pre-emption won't necessarily be disabled. This
results in the call trace shown above.

Fix this by only invoking __is_active_jprobe() when pre-emption is
disabled. And since we now guard this within a pre-emption check, we can
instead use raw_cpu_read() to get the current_kprobe value skipping the
check done by __this_cpu_read().

Fixes: c05b8c4474c030 ("powerpc/kprobes: Skip livepatch_handler() for jprobes")
Reported-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Tested-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes-ftrace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
index 48f675a73cff..1e54ec8ad85f 100644
--- a/arch/powerpc/kernel/kprobes-ftrace.c
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -32,8 +32,12 @@
  */
 int __is_active_jprobe(unsigned long addr)
 {
-	struct kprobe *p = kprobe_running();
-	return (p && (unsigned long)p->addr == addr) ? 1 : 0;
+	if (!preemptible()) {
+		struct kprobe *p = raw_cpu_read(current_kprobe);
+		return (p && (unsigned long)p->addr == addr) ? 1 : 0;
+	}
+
+	return 0;
 }
 
 static nokprobe_inline
-- 
cgit 


From 6baea433bc84cd148af1c524389a8d756f67412e Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 22 Sep 2017 14:40:47 +0530
Subject: powerpc/jprobes: Disable preemption when triggered through ftrace

KPROBES_SANITY_TEST throws the below splat when CONFIG_PREEMPT is
enabled:

  Kprobe smoke test: started
  DEBUG_LOCKS_WARN_ON(val > preempt_count())
  ------------[ cut here ]------------
  WARNING: CPU: 19 PID: 1 at kernel/sched/core.c:3094 preempt_count_sub+0xcc/0x140
  Modules linked in:
  CPU: 19 PID: 1 Comm: swapper/0 Not tainted 4.13.0-rc7-nnr+ #97
  task: c0000000fea80000 task.stack: c0000000feb00000
  NIP:  c00000000011d3dc LR: c00000000011d3d8 CTR: c000000000a090d0
  REGS: c0000000feb03400 TRAP: 0700   Not tainted  (4.13.0-rc7-nnr+)
  MSR:  8000000000021033 <SF,ME,IR,DR,RI,LE>  CR: 28000282  XER: 00000000
  CFAR: c00000000015aa18 SOFTE: 0
  <snip>
  NIP preempt_count_sub+0xcc/0x140
  LR  preempt_count_sub+0xc8/0x140
  Call Trace:
    preempt_count_sub+0xc8/0x140 (unreliable)
    kprobe_handler+0x228/0x4b0
    program_check_exception+0x58/0x3b0
    program_check_common+0x16c/0x170
    --- interrupt: 0 at kprobe_target+0x8/0x20
                     LR = init_test_probes+0x248/0x7d0
    kp+0x0/0x80 (unreliable)
    livepatch_handler+0x38/0x74
    init_kprobes+0x1d8/0x208
    do_one_initcall+0x68/0x1d0
    kernel_init_freeable+0x298/0x374
    kernel_init+0x24/0x160
    ret_from_kernel_thread+0x5c/0x70
  Instruction dump:
  419effdc 3d22001b 39299240 81290000 2f890000 409effc8 3c82ffcb 3c62ffcb
  3884bc68 3863bc18 4803d5fd 60000000 <0fe00000> 4bffffa8 60000000 60000000
  ---[ end trace 432dd46b4ce3d29f ]---
  Kprobe smoke test: passed successfully

The issue is that we aren't disabling preemption in
kprobe_ftrace_handler(). Disable it.

Fixes: ead514d5fb30a0 ("powerpc/kprobes: Add support for KPROBES_ON_FTRACE")
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
[mpe: Trim oops a little for formatting]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes-ftrace.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
index 1e54ec8ad85f..4b1f34f685b1 100644
--- a/arch/powerpc/kernel/kprobes-ftrace.c
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -80,6 +80,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
 	/* Disable irq for emulating a breakpoint and avoiding preempt */
 	local_irq_save(flags);
 	hard_irq_disable();
+	preempt_disable();
 
 	p = get_kprobe((kprobe_opcode_t *)nip);
 	if (unlikely(!p) || kprobe_disabled(p))
@@ -101,12 +102,18 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
 		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 		if (!p->pre_handler || !p->pre_handler(p, regs))
 			__skip_singlestep(p, regs, kcb, orig_nip);
-		/*
-		 * If pre_handler returns !0, it sets regs->nip and
-		 * resets current kprobe.
-		 */
+		else {
+			/*
+			 * If pre_handler returns !0, it sets regs->nip and
+			 * resets current kprobe. In this case, we still need
+			 * to restore irq, but not preemption.
+			 */
+			local_irq_restore(flags);
+			return;
+		}
 	}
 end:
+	preempt_enable_no_resched();
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(kprobe_ftrace_handler);
-- 
cgit 


From 3368f5699a9b0e281e14f9f93244da0a922f6df0 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Fri, 22 Sep 2017 14:40:48 +0530
Subject: powerpc/jprobes: Validate break handler invocation as being due to a
 jprobe_return()

Fix a circa 2005 FIXME by implementing a check to ensure that we
actually got into the jprobe break handler() due to the trap in
jprobe_return().

Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 48a81614f629..a14c61855705 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -639,24 +639,22 @@ NOKPROBE_SYMBOL(setjmp_pre_handler);
 
 void __used jprobe_return(void)
 {
-	asm volatile("trap" ::: "memory");
+	asm volatile("jprobe_return_trap:\n"
+		     "trap\n"
+		     ::: "memory");
 }
 NOKPROBE_SYMBOL(jprobe_return);
 
-static void __used jprobe_return_end(void)
-{
-}
-NOKPROBE_SYMBOL(jprobe_return_end);
-
 int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
-	/*
-	 * FIXME - we should ideally be validating that we got here 'cos
-	 * of the "trap" in jprobe_return() above, before restoring the
-	 * saved regs...
-	 */
+	if (regs->nip != ppc_kallsyms_lookup_name("jprobe_return_trap")) {
+		pr_debug("longjmp_break_handler NIP (0x%lx) does not match jprobe_return_trap (0x%lx)\n",
+				regs->nip, ppc_kallsyms_lookup_name("jprobe_return_trap"));
+		return 0;
+	}
+
 	memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs));
 	/* It's OK to start function graph tracing again */
 	unpause_graph_tracing();
-- 
cgit 


From c6baa077b784c3b37391a8c11f433e3f881a80df Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Fri, 29 Sep 2017 13:58:02 +1000
Subject: powerpc/powernv: Make opal_event_shutdown() callable from IRQ context

In opal_event_shutdown() we free all the IRQs hanging off the
opal_event_irqchip. However it's not safe to do so if we're called
from IRQ context, because free_irq() wants to synchronise versus IRQ
context. This can lead to warnings and a stuck system.

For example from sysrq-b:

  Trying to free IRQ 17 from IRQ context!
  ------------[ cut here ]------------
  WARNING: CPU: 0 PID: 0 at kernel/irq/manage.c:1461 __free_irq+0x398/0x8d0
  ...
  NIP __free_irq+0x398/0x8d0
  LR __free_irq+0x394/0x8d0
  Call Trace:
    __free_irq+0x394/0x8d0 (unreliable)
    free_irq+0xa4/0x140
    opal_event_shutdown+0x128/0x180
    opal_shutdown+0x1c/0xb0
    pnv_shutdown+0x20/0x40
    machine_restart+0x38/0x90
    emergency_restart+0x28/0x40
    sysrq_handle_reboot+0x24/0x40
    __handle_sysrq+0x198/0x590
    hvc_poll+0x48c/0x8c0
    hvc_handle_interrupt+0x1c/0x50
    __handle_irq_event_percpu+0xe8/0x6e0
    handle_irq_event_percpu+0x34/0xe0
    handle_irq_event+0xc4/0x210
    handle_level_irq+0x250/0x770
    generic_handle_irq+0x5c/0xa0
    opal_handle_events+0x11c/0x240
    opal_interrupt+0x38/0x50
    __handle_irq_event_percpu+0xe8/0x6e0
    handle_irq_event_percpu+0x34/0xe0
    handle_irq_event+0xc4/0x210
    handle_fasteoi_irq+0x174/0xa10
    generic_handle_irq+0x5c/0xa0
    __do_irq+0xbc/0x4e0
    call_do_irq+0x14/0x24
    do_IRQ+0x18c/0x540
    hardware_interrupt_common+0x158/0x180

We can avoid that by using disable_irq_nosync() rather than
free_irq(). Although it doesn't fully free the IRQ, it should be
sufficient when we're shutting down, particularly in an emergency.

Add an in_interrupt() check and use free_irq() when we're shutting
down normally. It's probably OK to use disable_irq_nosync() in that
case too, but for now it's safer to leave that behaviour as-is.

Fixes: 9f0fd0499d30 ("powerpc/powernv: Add a virtual irqchip for opal events")
Reported-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-irqchip.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
index ecdcba9d1220..9d1b8c0aaf93 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -174,8 +174,14 @@ void opal_event_shutdown(void)
 
 	/* First free interrupts, which will also mask them */
 	for (i = 0; i < opal_irq_count; i++) {
-		if (opal_irqs[i])
+		if (!opal_irqs[i])
+			continue;
+
+		if (in_interrupt())
+			disable_irq_nosync(opal_irqs[i]);
+		else
 			free_irq(opal_irqs[i], NULL);
+
 		opal_irqs[i] = 0;
 	}
 }
-- 
cgit 


From 59d3391e8cf274097c42369866070c565891bae5 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@linux.vnet.ibm.com>
Date: Mon, 18 Sep 2017 11:16:58 -0300
Subject: powerpc/xmon: Add option to show uptime information

It might be useful to quickly get the uptime of a running system on
xmon, without needing to grab data from memory and doing math on
struct addresses.

For example, it'd be useful to check for how long after a crash a
system is on xmon shell or if some test was started after the first
test crashed (and this 2nd test crashed too into xmon).

This small patch adds the 'U' command, to accomplish this.

Suggested-by: Murilo Fossa Vicentini <muvic@linux.vnet.ibm.com>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
[mpe: Display units (seconds), add sync()/__delay() sequence]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index d9a12102b111..4679aeb84767 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -278,6 +278,7 @@ Commands:\n\
 #elif defined(CONFIG_44x) || defined(CONFIG_PPC_BOOK3E)
 "  u	dump TLB\n"
 #endif
+"  U	show uptime information\n"
 "  ?	help\n"
 "  # n	limit output to n lines per page (for dp, dpa, dl)\n"
 "  zr	reboot\n\
@@ -905,6 +906,26 @@ static void remove_cpu_bpts(void)
 	write_ciabr(0);
 }
 
+/* Based on uptime_proc_show(). */
+static void
+show_uptime(void)
+{
+	struct timespec uptime;
+
+	if (setjmp(bus_error_jmp) == 0) {
+		catch_memory_errors = 1;
+		sync();
+
+		get_monotonic_boottime(&uptime);
+		printf("Uptime: %lu.%.2lu seconds\n", (unsigned long)uptime.tv_sec,
+			((unsigned long)uptime.tv_nsec / (NSEC_PER_SEC/100)));
+
+		sync();
+		__delay(200);						\
+	}
+	catch_memory_errors = 0;
+}
+
 static void set_lpp_cmd(void)
 {
 	unsigned long lpp;
@@ -1040,6 +1061,9 @@ cmds(struct pt_regs *excp)
 			dump_tlb_book3e();
 			break;
 #endif
+		case 'U':
+			show_uptime();
+			break;
 		default:
 			printf("Unrecognized command: ");
 			do {
-- 
cgit 


From e366b921c7e32c9b92b4c524696fa81ab508dd23 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Thu, 21 Sep 2017 15:24:49 +1000
Subject: powerpc: Add PPC_EMULATED_STATS to powernv_defconfig

This is useful, especially for developers.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/powernv_defconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index caee834760d2..3fb2a7ab54f1 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -295,6 +295,7 @@ CONFIG_FUNCTION_GRAPH_TRACER=y
 CONFIG_SCHED_TRACER=y
 CONFIG_FTRACE_SYSCALLS=y
 CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_PPC_EMULATED_STATS=y
 CONFIG_CODE_PATCHING_SELFTEST=y
 CONFIG_FTR_FIXUP_SELFTEST=y
 CONFIG_MSI_BITMAP_SELFTEST=y
-- 
cgit 


From 186b8f1587c79c2fa04bfa392fdf084443e398c1 Mon Sep 17 00:00:00 2001
From: Seth Forshee <seth.forshee@canonical.com>
Date: Thu, 28 Sep 2017 09:33:39 -0400
Subject: powerpc: Always initialize input array when calling epapr_hypercall()

Several callers to epapr_hypercall() pass an uninitialized stack
allocated array for the input arguments, presumably because they
have no input arguments. However this can produce errors like
this one

 arch/powerpc/include/asm/epapr_hcalls.h:470:42: error: 'in' may be used uninitialized in this function [-Werror=maybe-uninitialized]
  unsigned long register r3 asm("r3") = in[0];
                                        ~~^~~

Fix callers to this function to always zero-initialize the input
arguments array to prevent this.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/epapr_hcalls.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index 334459ad145b..90863245df53 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -508,7 +508,7 @@ static unsigned long epapr_hypercall(unsigned long *in,
 
 static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2)
 {
-	unsigned long in[8];
+	unsigned long in[8] = {0};
 	unsigned long out[8];
 	unsigned long r;
 
@@ -520,7 +520,7 @@ static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2)
 
 static inline long epapr_hypercall0(unsigned int nr)
 {
-	unsigned long in[8];
+	unsigned long in[8] = {0};
 	unsigned long out[8];
 
 	return epapr_hypercall(in, out, nr);
@@ -528,7 +528,7 @@ static inline long epapr_hypercall0(unsigned int nr)
 
 static inline long epapr_hypercall1(unsigned int nr, unsigned long p1)
 {
-	unsigned long in[8];
+	unsigned long in[8] = {0};
 	unsigned long out[8];
 
 	in[0] = p1;
@@ -538,7 +538,7 @@ static inline long epapr_hypercall1(unsigned int nr, unsigned long p1)
 static inline long epapr_hypercall2(unsigned int nr, unsigned long p1,
 				    unsigned long p2)
 {
-	unsigned long in[8];
+	unsigned long in[8] = {0};
 	unsigned long out[8];
 
 	in[0] = p1;
@@ -549,7 +549,7 @@ static inline long epapr_hypercall2(unsigned int nr, unsigned long p1,
 static inline long epapr_hypercall3(unsigned int nr, unsigned long p1,
 				    unsigned long p2, unsigned long p3)
 {
-	unsigned long in[8];
+	unsigned long in[8] = {0};
 	unsigned long out[8];
 
 	in[0] = p1;
@@ -562,7 +562,7 @@ static inline long epapr_hypercall4(unsigned int nr, unsigned long p1,
 				    unsigned long p2, unsigned long p3,
 				    unsigned long p4)
 {
-	unsigned long in[8];
+	unsigned long in[8] = {0};
 	unsigned long out[8];
 
 	in[0] = p1;
-- 
cgit 


From 4ca360f3dbf2036d964cdf3a6c4a45a81fdf8e18 Mon Sep 17 00:00:00 2001
From: Kautuk Consul <kautuk.consul.1980@gmail.com>
Date: Tue, 19 Apr 2016 15:48:21 +0530
Subject: powerpc: get_wchan(): solve possible race scenario due to parallel
 wakeup

Add a check for p->state == TASK_RUNNING so that any wake-ups on
task_struct p in the interim lead to 0 being returned by get_wchan().

Signed-off-by: Kautuk Consul <kautuk.consul.1980@gmail.com>
[mpe: Confirmed other architectures do similar]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a0c74bbf3454..166145b18728 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1898,7 +1898,8 @@ unsigned long get_wchan(struct task_struct *p)
 
 	do {
 		sp = *(unsigned long *)sp;
-		if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD))
+		if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) ||
+		    p->state == TASK_RUNNING)
 			return 0;
 		if (count > 0) {
 			ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE];
-- 
cgit 


From 1c0437af9fca8de6e4ba179d18cf13154da25695 Mon Sep 17 00:00:00 2001
From: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Date: Tue, 10 Oct 2017 20:17:32 +0530
Subject: powerpc/modules: Use WARN_ON() in stub_for_addr()

Use WARN_ON(), while running out of stubs in stub_for_addr()
and abort loading of the module instead of BUG_ON().

Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/module_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 0b0f89685b67..759104b99f9f 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -429,7 +429,8 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs,
 	/* Find this stub, or if that fails, the next avail. entry */
 	stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr;
 	for (i = 0; stub_func_addr(stubs[i].funcdata); i++) {
-		BUG_ON(i >= num_stubs);
+		if (WARN_ON(i >= num_stubs))
+			return 0;
 
 		if (stub_func_addr(stubs[i].funcdata) == func_addr(addr))
 			return (unsigned long)&stubs[i];
-- 
cgit 


From 73e341eb6bea01fde706d10d7edba97f154d9c5f Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Fri, 29 Sep 2017 14:26:51 +1000
Subject: powerpc/mce: Remove unused function get_mce_fault_addr()

There are no users of get_mce_fault_addr() since commit
1363875bdb63 ("powerpc/64s: fix handling of non-synchronous machine
checks") removed the last usage.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mce.h |  2 --
 arch/powerpc/kernel/mce.c      | 39 ---------------------------------------
 2 files changed, 41 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 190d69a7f701..75292c7eee48 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -210,6 +210,4 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
 					   bool user_mode);
-extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
-
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index f588951b171d..6757720b1979 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -411,45 +411,6 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 }
 EXPORT_SYMBOL_GPL(machine_check_print_event_info);
 
-uint64_t get_mce_fault_addr(struct machine_check_event *evt)
-{
-	switch (evt->error_type) {
-	case MCE_ERROR_TYPE_UE:
-		if (evt->u.ue_error.effective_address_provided)
-			return evt->u.ue_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_SLB:
-		if (evt->u.slb_error.effective_address_provided)
-			return evt->u.slb_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_ERAT:
-		if (evt->u.erat_error.effective_address_provided)
-			return evt->u.erat_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_TLB:
-		if (evt->u.tlb_error.effective_address_provided)
-			return evt->u.tlb_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_USER:
-		if (evt->u.user_error.effective_address_provided)
-			return evt->u.user_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_RA:
-		if (evt->u.ra_error.effective_address_provided)
-			return evt->u.ra_error.effective_address;
-		break;
-	case MCE_ERROR_TYPE_LINK:
-		if (evt->u.link_error.effective_address_provided)
-			return evt->u.link_error.effective_address;
-		break;
-	default:
-	case MCE_ERROR_TYPE_UNKNOWN:
-		break;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(get_mce_fault_addr);
-
 /*
  * This function is called in real mode. Strictly no printk's please.
  *
-- 
cgit 


From 81b61fa7a065e639d097b09c303e3212e4264fac Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Fri, 29 Sep 2017 14:26:52 +1000
Subject: powerpc/mce: Align the print of physical address better

Use the same alignment as Effective address.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 6757720b1979..d1775bdade72 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -340,7 +340,7 @@ void machine_check_print_event_info(struct machine_check_event *evt,
 			printk("%s    Effective address: %016llx\n",
 			       level, evt->u.ue_error.effective_address);
 		if (evt->u.ue_error.physical_address_provided)
-			printk("%s      Physical address: %016llx\n",
+			printk("%s    Physical address:  %016llx\n",
 			       level, evt->u.ue_error.physical_address);
 		break;
 	case MCE_ERROR_TYPE_SLB:
-- 
cgit 


From ba41e1e1ccb9771ce41a3b8e2121f95486e76ac9 Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Fri, 29 Sep 2017 14:26:53 +1000
Subject: powerpc/mce: Hookup derror (load/store) UE errors

Extract physical_address for UE errors by walking the page
tables for the mm and address at the NIP, to extract the
instruction. Then use the instruction to find the effective
address via analyse_instr().

We might have page table walking races, but we expect them to
be rare, the physical address extraction is best effort. The idea
is to then hook up this infrastructure to memory failure eventually.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/exception-64s.h |  5 ++
 arch/powerpc/include/asm/mce.h           |  2 +-
 arch/powerpc/kernel/exceptions-64s.S     |  2 +-
 arch/powerpc/kernel/mce.c                |  6 ++-
 arch/powerpc/kernel/mce_power.c          | 87 ++++++++++++++++++++++++++++++--
 5 files changed, 94 insertions(+), 8 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 9a318973af05..b27205297e1d 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -54,6 +54,11 @@
 #define EX_SIZE		9	/* size in u64 units */
 #endif
 
+/*
+ * maximum recursive depth of MCE exceptions
+ */
+#define MAX_MCE_DEPTH	4
+
 /*
  * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR
  * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 75292c7eee48..3a1226e9b465 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -204,7 +204,7 @@ struct mce_error_info {
 
 extern void save_mce_event(struct pt_regs *regs, long handled,
 			   struct mce_error_info *mce_err, uint64_t nip,
-			   uint64_t addr);
+			   uint64_t addr, uint64_t phys_addr);
 extern int get_mce_event(struct machine_check_event *mce, bool release);
 extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 8c32dd4cac20..ce04b1c2ac8e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -232,7 +232,7 @@ BEGIN_FTR_SECTION
 	addi	r10,r10,1		/* increment paca->in_mce */
 	sth	r10,PACA_IN_MCE(r13)
 	/* Limit nested MCE to level 4 to avoid stack overflow */
-	cmpwi	r10,4
+	cmpwi	r10,MAX_MCE_DEPTH
 	bgt	2f			/* Check if we hit limit of 4 */
 	std	r11,GPR1(r1)		/* Save r1 on the stack. */
 	std	r11,0(r1)		/* make stack chain pointer */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index d1775bdade72..1af7c559edbc 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -82,7 +82,7 @@ static void mce_set_error_info(struct machine_check_event *mce,
  */
 void save_mce_event(struct pt_regs *regs, long handled,
 		    struct mce_error_info *mce_err,
-		    uint64_t nip, uint64_t addr)
+		    uint64_t nip, uint64_t addr, uint64_t phys_addr)
 {
 	int index = __this_cpu_inc_return(mce_nest_count) - 1;
 	struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
@@ -140,6 +140,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
 	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
 		mce->u.ue_error.effective_address_provided = true;
 		mce->u.ue_error.effective_address = addr;
+		if (phys_addr != ULONG_MAX) {
+			mce->u.ue_error.physical_address_provided = true;
+			mce->u.ue_error.physical_address = phys_addr;
+		}
 	}
 	return;
 }
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index d37e612050b5..e61b80e37e1f 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -27,6 +27,36 @@
 #include <asm/mmu.h>
 #include <asm/mce.h>
 #include <asm/machdep.h>
+#include <asm/pgtable.h>
+#include <asm/pte-walk.h>
+#include <asm/sstep.h>
+#include <asm/exception-64s.h>
+
+/*
+ * Convert an address related to an mm to a PFN. NOTE: we are in real
+ * mode, we could potentially race with page table updates.
+ */
+static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+{
+	pte_t *ptep;
+	unsigned long flags;
+	struct mm_struct *mm;
+
+	if (user_mode(regs))
+		mm = current->mm;
+	else
+		mm = &init_mm;
+
+	local_irq_save(flags);
+	if (mm == current->mm)
+		ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
+	else
+		ptep = find_init_mm_pte(addr, NULL);
+	local_irq_restore(flags);
+	if (!ptep || pte_special(*ptep))
+		return ULONG_MAX;
+	return pte_pfn(*ptep);
+}
 
 static void flush_tlb_206(unsigned int num_sets, unsigned int action)
 {
@@ -421,6 +451,41 @@ static const struct mce_derror_table mce_p9_derror_table[] = {
   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
 { 0, false, 0, 0, 0, 0 } };
 
+static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr,
+					uint64_t *phys_addr)
+{
+	/*
+	 * Carefully look at the NIP to determine
+	 * the instruction to analyse. Reading the NIP
+	 * in real-mode is tricky and can lead to recursive
+	 * faults
+	 */
+	int instr;
+	unsigned long pfn, instr_addr;
+	struct instruction_op op;
+	struct pt_regs tmp = *regs;
+
+	pfn = addr_to_pfn(regs, regs->nip);
+	if (pfn != ULONG_MAX) {
+		instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+		instr = *(unsigned int *)(instr_addr);
+		if (!analyse_instr(&op, &tmp, instr)) {
+			pfn = addr_to_pfn(regs, op.ea);
+			*addr = op.ea;
+			*phys_addr = (pfn << PAGE_SHIFT);
+			return 0;
+		}
+		/*
+		 * analyse_instr() might fail if the instruction
+		 * is not a load/store, although this is unexpected
+		 * for load/store errors or if we got the NIP
+		 * wrong
+		 */
+	}
+	*addr = 0;
+	return -1;
+}
+
 static int mce_handle_ierror(struct pt_regs *regs,
 		const struct mce_ierror_table table[],
 		struct mce_error_info *mce_err, uint64_t *addr)
@@ -489,7 +554,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
 
 static int mce_handle_derror(struct pt_regs *regs,
 		const struct mce_derror_table table[],
-		struct mce_error_info *mce_err, uint64_t *addr)
+		struct mce_error_info *mce_err, uint64_t *addr,
+		uint64_t *phys_addr)
 {
 	uint64_t dsisr = regs->dsisr;
 	int handled = 0;
@@ -555,7 +621,17 @@ static int mce_handle_derror(struct pt_regs *regs,
 		mce_err->initiator = table[i].initiator;
 		if (table[i].dar_valid)
 			*addr = regs->dar;
-
+		else if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
+				table[i].error_type == MCE_ERROR_TYPE_UE) {
+			/*
+			 * We do a maximum of 4 nested MCE calls, see
+			 * kernel/exception-64s.h
+			 */
+			if (get_paca()->in_mce < MAX_MCE_DEPTH)
+				if (!mce_find_instr_ea_and_pfn(regs, addr,
+								phys_addr))
+					handled = 1;
+		}
 		found = 1;
 	}
 
@@ -592,19 +668,20 @@ static long mce_handle_error(struct pt_regs *regs,
 		const struct mce_ierror_table itable[])
 {
 	struct mce_error_info mce_err = { 0 };
-	uint64_t addr;
+	uint64_t addr, phys_addr;
 	uint64_t srr1 = regs->msr;
 	long handled;
 
 	if (SRR1_MC_LOADSTORE(srr1))
-		handled = mce_handle_derror(regs, dtable, &mce_err, &addr);
+		handled = mce_handle_derror(regs, dtable, &mce_err, &addr,
+				&phys_addr);
 	else
 		handled = mce_handle_ierror(regs, itable, &mce_err, &addr);
 
 	if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
 		handled = mce_handle_ue_error(regs);
 
-	save_mce_event(regs, handled, &mce_err, regs->nip, addr);
+	save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
 
 	return handled;
 }
-- 
cgit 


From 01eaac2b0591f7114d9de5cc65c88352de0daca6 Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Fri, 29 Sep 2017 14:26:54 +1000
Subject: powerpc/mce: Hookup ierror (instruction) UE errors

Hookup instruction errors (UE) for memory offling via memory_failure()
in a manner similar to load/store errors (derror). Since we have access
to the NIP, the conversion is a one step process in this case.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce_power.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index e61b80e37e1f..82cabfc7c2f6 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -488,7 +488,8 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr,
 
 static int mce_handle_ierror(struct pt_regs *regs,
 		const struct mce_ierror_table table[],
-		struct mce_error_info *mce_err, uint64_t *addr)
+		struct mce_error_info *mce_err, uint64_t *addr,
+		uint64_t *phys_addr)
 {
 	uint64_t srr1 = regs->msr;
 	int handled = 0;
@@ -540,8 +541,22 @@ static int mce_handle_ierror(struct pt_regs *regs,
 		}
 		mce_err->severity = table[i].severity;
 		mce_err->initiator = table[i].initiator;
-		if (table[i].nip_valid)
+		if (table[i].nip_valid) {
 			*addr = regs->nip;
+			if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
+				table[i].error_type == MCE_ERROR_TYPE_UE) {
+				unsigned long pfn;
+
+				if (get_paca()->in_mce < MAX_MCE_DEPTH) {
+					pfn = addr_to_pfn(regs, regs->nip);
+					if (pfn != ULONG_MAX) {
+						*phys_addr =
+							(pfn << PAGE_SHIFT);
+						handled = 1;
+					}
+				}
+			}
+		}
 		return handled;
 	}
 
@@ -676,7 +691,8 @@ static long mce_handle_error(struct pt_regs *regs,
 		handled = mce_handle_derror(regs, dtable, &mce_err, &addr,
 				&phys_addr);
 	else
-		handled = mce_handle_ierror(regs, itable, &mce_err, &addr);
+		handled = mce_handle_ierror(regs, itable, &mce_err, &addr,
+				&phys_addr);
 
 	if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
 		handled = mce_handle_ue_error(regs);
-- 
cgit 


From 733e4a4c4467033918f4e64f797b03efe209d82c Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Fri, 29 Sep 2017 14:26:55 +1000
Subject: powerpc/mce: hookup memory_failure for UE errors

If we are in user space and hit a UE error, we now have the
basic infrastructure to walk the page tables and find out
the effective address that was accessed, since the DAR
is not valid.

We use a work_queue content to hookup the bad pfn, any
other context causes problems, since memory_failure itself
can call into schedule() via lru_drain_ bits.

We could probably poison the struct page to avoid a race
between detection and taking corrective action.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Reviewed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/mce.c | 70 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 67 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 1af7c559edbc..742e4658c5dc 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -39,11 +39,21 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
 static DEFINE_PER_CPU(int, mce_queue_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
 
+/* Queue for delayed MCE UE events. */
+static DEFINE_PER_CPU(int, mce_ue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
+					mce_ue_event_queue);
+
 static void machine_check_process_queued_event(struct irq_work *work);
+void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_process_ue_event(struct work_struct *work);
+
 static struct irq_work mce_event_process_work = {
         .func = machine_check_process_queued_event,
 };
 
+DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
+
 static void mce_set_error_info(struct machine_check_event *mce,
 			       struct mce_error_info *mce_err)
 {
@@ -143,6 +153,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
 		if (phys_addr != ULONG_MAX) {
 			mce->u.ue_error.physical_address_provided = true;
 			mce->u.ue_error.physical_address = phys_addr;
+			machine_check_ue_event(mce);
 		}
 	}
 	return;
@@ -197,6 +208,26 @@ void release_mce_event(void)
 	get_mce_event(NULL, true);
 }
 
+
+/*
+ * Queue up the MCE event which then can be handled later.
+ */
+void machine_check_ue_event(struct machine_check_event *evt)
+{
+	int index;
+
+	index = __this_cpu_inc_return(mce_ue_count) - 1;
+	/* If queue is full, just return for now. */
+	if (index >= MAX_MC_EVT) {
+		__this_cpu_dec(mce_ue_count);
+		return;
+	}
+	memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
+
+	/* Queue work to process this event later. */
+	schedule_work(&mce_ue_event_work);
+}
+
 /*
  * Queue up the MCE event which then can be handled later.
  */
@@ -219,7 +250,39 @@ void machine_check_queue_event(void)
 	/* Queue irq work to process this event later. */
 	irq_work_queue(&mce_event_process_work);
 }
-
+/*
+ * process pending MCE event from the mce event queue. This function will be
+ * called during syscall exit.
+ */
+static void machine_process_ue_event(struct work_struct *work)
+{
+	int index;
+	struct machine_check_event *evt;
+
+	while (__this_cpu_read(mce_ue_count) > 0) {
+		index = __this_cpu_read(mce_ue_count) - 1;
+		evt = this_cpu_ptr(&mce_ue_event_queue[index]);
+#ifdef CONFIG_MEMORY_FAILURE
+		/*
+		 * This should probably queued elsewhere, but
+		 * oh! well
+		 */
+		if (evt->error_type == MCE_ERROR_TYPE_UE) {
+			if (evt->u.ue_error.physical_address_provided) {
+				unsigned long pfn;
+
+				pfn = evt->u.ue_error.physical_address >>
+					PAGE_SHIFT;
+				memory_failure(pfn, SIGBUS, 0);
+			} else
+				pr_warn("Failed to identify bad address from "
+					"where the uncorrectable error (UE) "
+					"was generated\n");
+		}
+#endif
+		__this_cpu_dec(mce_ue_count);
+	}
+}
 /*
  * process pending MCE event from the mce event queue. This function will be
  * called during syscall exit.
@@ -227,6 +290,7 @@ void machine_check_queue_event(void)
 static void machine_check_process_queued_event(struct irq_work *work)
 {
 	int index;
+	struct machine_check_event *evt;
 
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
@@ -236,8 +300,8 @@ static void machine_check_process_queued_event(struct irq_work *work)
 	 */
 	while (__this_cpu_read(mce_queue_count) > 0) {
 		index = __this_cpu_read(mce_queue_count) - 1;
-		machine_check_print_event_info(
-				this_cpu_ptr(&mce_event_queue[index]), false);
+		evt = this_cpu_ptr(&mce_event_queue[index]);
+		machine_check_print_event_info(evt, false);
 		__this_cpu_dec(mce_queue_count);
 	}
 }
-- 
cgit 


From 17f444c0549f2ce037646871e748cf2dc375bce9 Mon Sep 17 00:00:00 2001
From: Michael Bringmann <mwb@linux.vnet.ibm.com>
Date: Fri, 8 Sep 2017 15:47:27 -0500
Subject: powerpc/vphn: Update CPU topology when VPHN enabled

powerpc/vphn: On Power systems with shared configurations of CPUs
and memory, there are some issues with the association of additional
CPUs and memory to nodes when hot-adding resources.  This patch
corrects the currently broken capability to set the topology for
shared CPUs in LPARs.  At boot time for shared CPU lpars, the
topology for each CPU was being set to node zero.  Now when
numa_update_cpu_topology() is called appropriately, the Virtual
Processor Home Node (VPHN) capabilities information provided by the
pHyp allows the appropriate node in the shared configuration to be
selected for the CPU.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/numa.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index b95c584ce19d..3ae031d437a6 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1153,6 +1153,8 @@ static cpumask_t cpu_associativity_changes_mask;
 static int vphn_enabled;
 static int prrn_enabled;
 static void reset_topology_timer(void);
+static int topology_inited;
+static int topology_update_needed;
 
 /*
  * Store the current values of the associativity change counters in the
@@ -1246,6 +1248,10 @@ static long vphn_get_associativity(unsigned long cpu,
 			"hcall_vphn() experienced a hardware fault "
 			"preventing VPHN. Disabling polling...\n");
 		stop_topology_update();
+		break;
+	case H_SUCCESS:
+		dbg("VPHN hcall succeeded. Reset polling...\n");
+		break;
 	}
 
 	return rc;
@@ -1323,8 +1329,11 @@ int numa_update_cpu_topology(bool cpus_locked)
 	struct device *dev;
 	int weight, new_nid, i = 0;
 
-	if (!prrn_enabled && !vphn_enabled)
+	if (!prrn_enabled && !vphn_enabled) {
+		if (!topology_inited)
+			topology_update_needed = 1;
 		return 0;
+	}
 
 	weight = cpumask_weight(&cpu_associativity_changes_mask);
 	if (!weight)
@@ -1363,6 +1372,8 @@ int numa_update_cpu_topology(bool cpus_locked)
 			cpumask_andnot(&cpu_associativity_changes_mask,
 					&cpu_associativity_changes_mask,
 					cpu_sibling_mask(cpu));
+			dbg("Assoc chg gives same node %d for cpu%d\n",
+					new_nid, cpu);
 			cpu = cpu_last_thread_sibling(cpu);
 			continue;
 		}
@@ -1433,6 +1444,7 @@ int numa_update_cpu_topology(bool cpus_locked)
 
 out:
 	kfree(updates);
+	topology_update_needed = 0;
 	return changed;
 }
 
@@ -1613,9 +1625,17 @@ static int topology_update_init(void)
 	if (topology_updates_enabled)
 		start_topology_update();
 
+	if (vphn_enabled)
+		topology_schedule_update();
+
 	if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops))
 		return -ENOMEM;
 
+	topology_inited = 1;
+	if (topology_update_needed)
+		bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask),
+					nr_cpumask_bits);
+
 	return 0;
 }
 device_initcall(topology_update_init);
-- 
cgit 


From a3496e9137f6f58439da29a761725e083229c4a8 Mon Sep 17 00:00:00 2001
From: Michael Bringmann <mwb@linux.vnet.ibm.com>
Date: Fri, 8 Sep 2017 15:47:36 -0500
Subject: powerpc/vphn: Improve recognition of PRRN/VPHN

powerpc/vphn: On Power systems with shared configurations of CPUs
and memory, there are some issues with the association of additional
CPUs and memory to nodes when hot-adding resources.  This patch
updates the initialization checks to independently recognize PRRN
or VPHN support.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/numa.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 3ae031d437a6..5f5ff46ae76d 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1531,15 +1531,14 @@ int start_topology_update(void)
 	if (firmware_has_feature(FW_FEATURE_PRRN)) {
 		if (!prrn_enabled) {
 			prrn_enabled = 1;
-			vphn_enabled = 0;
 #ifdef CONFIG_SMP
 			rc = of_reconfig_notifier_register(&dt_update_nb);
 #endif
 		}
-	} else if (firmware_has_feature(FW_FEATURE_VPHN) &&
+	}
+	if (firmware_has_feature(FW_FEATURE_VPHN) &&
 		   lppaca_shared_proc(get_lppaca())) {
 		if (!vphn_enabled) {
-			prrn_enabled = 0;
 			vphn_enabled = 1;
 			setup_cpu_associativity_change_counters();
 			init_timer_deferrable(&topology_timer);
@@ -1562,7 +1561,8 @@ int stop_topology_update(void)
 #ifdef CONFIG_SMP
 		rc = of_reconfig_notifier_unregister(&dt_update_nb);
 #endif
-	} else if (vphn_enabled) {
+	}
+	if (vphn_enabled) {
 		vphn_enabled = 0;
 		rc = del_timer_sync(&topology_timer);
 	}
-- 
cgit 


From cee5405da4020b0b0233bc8fb7c8da7322d2c52e Mon Sep 17 00:00:00 2001
From: Michael Bringmann <mwb@linux.vnet.ibm.com>
Date: Fri, 8 Sep 2017 15:47:47 -0500
Subject: powerpc/hotplug: Improve responsiveness of hotplug change

powerpc/hotplug: On Power systems with shared configurations of CPUs
and memory, there are some issues with the association of additional
CPUs and memory to nodes when hot-adding resources.  During hotplug
CPU operations, this patch resets the timer on topology update work
function to a small value to better ensure that the CPU topology is
detected and configured sooner.

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/topology.h          |  8 ++++++++
 arch/powerpc/mm/numa.c                       | 23 ++++++++++++++++++++++-
 arch/powerpc/platforms/pseries/hotplug-cpu.c |  2 ++
 3 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 2d84bca8d053..34da2c5fd1df 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -96,6 +96,14 @@ static inline int prrn_is_enabled(void)
 }
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES)
+#if defined(CONFIG_PPC_SPLPAR)
+extern int timed_topology_update(int nsecs);
+#else
+#define	timed_topology_update(nsecs)
+#endif /* CONFIG_PPC_SPLPAR */
+#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */
+
 #include <asm-generic/topology.h>
 
 #ifdef CONFIG_SMP
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 5f5ff46ae76d..32f5f8db46ec 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1148,14 +1148,34 @@ struct topology_update_data {
 	int new_nid;
 };
 
+#define TOPOLOGY_DEF_TIMER_SECS	60
+
 static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
 static cpumask_t cpu_associativity_changes_mask;
 static int vphn_enabled;
 static int prrn_enabled;
 static void reset_topology_timer(void);
+static int topology_timer_secs = 1;
 static int topology_inited;
 static int topology_update_needed;
 
+/*
+ * Change polling interval for associativity changes.
+ */
+int timed_topology_update(int nsecs)
+{
+	if (vphn_enabled) {
+		if (nsecs > 0)
+			topology_timer_secs = nsecs;
+		else
+			topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS;
+
+		reset_topology_timer();
+	}
+
+	return 0;
+}
+
 /*
  * Store the current values of the associativity change counters in the
  * hypervisor.
@@ -1251,6 +1271,7 @@ static long vphn_get_associativity(unsigned long cpu,
 		break;
 	case H_SUCCESS:
 		dbg("VPHN hcall succeeded. Reset polling...\n");
+		timed_topology_update(0);
 		break;
 	}
 
@@ -1481,7 +1502,7 @@ static struct timer_list topology_timer =
 static void reset_topology_timer(void)
 {
 	topology_timer.data = 0;
-	topology_timer.expires = jiffies + 60 * HZ;
+	topology_timer.expires = jiffies + topology_timer_secs * HZ;
 	mod_timer(&topology_timer, topology_timer.expires);
 }
 
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index fadb95efbb9e..a7d14aa7bb7c 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -363,6 +363,7 @@ static int dlpar_online_cpu(struct device_node *dn)
 			BUG_ON(get_cpu_current_state(cpu)
 					!= CPU_STATE_OFFLINE);
 			cpu_maps_update_done();
+			timed_topology_update(1);
 			rc = device_online(get_cpu_device(cpu));
 			if (rc)
 				goto out;
@@ -533,6 +534,7 @@ static int dlpar_offline_cpu(struct device_node *dn)
 				set_preferred_offline_state(cpu,
 							    CPU_STATE_OFFLINE);
 				cpu_maps_update_done();
+				timed_topology_update(1);
 				rc = device_offline(get_cpu_device(cpu));
 				if (rc)
 					goto out;
-- 
cgit 


From 8bc931495d4bfcde6bb242a478ddd1186ae29301 Mon Sep 17 00:00:00 2001
From: Michael Bringmann <mwb@linux.vnet.ibm.com>
Date: Fri, 8 Sep 2017 15:47:56 -0500
Subject: powerpc/vphn: Fix numa update end-loop bug

powerpc/vphn: On Power systems with shared configurations of CPUs
and memory, there are some issues with the association of additional
CPUs and memory to nodes when hot-adding resources.  This patch
fixes an end-of-updates processing problem observed occasionally
in numa_update_cpu_topology().

Signed-off-by: Michael Bringmann <mwb@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/numa.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 32f5f8db46ec..ec098b3f7332 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1401,16 +1401,22 @@ int numa_update_cpu_topology(bool cpus_locked)
 
 		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
 			ud = &updates[i++];
+			ud->next = &updates[i];
 			ud->cpu = sibling;
 			ud->new_nid = new_nid;
 			ud->old_nid = numa_cpu_lookup_table[sibling];
 			cpumask_set_cpu(sibling, &updated_cpus);
-			if (i < weight)
-				ud->next = &updates[i];
 		}
 		cpu = cpu_last_thread_sibling(cpu);
 	}
 
+	/*
+	 * Prevent processing of 'updates' from overflowing array
+	 * where last entry filled in a 'next' pointer.
+	 */
+	if (i)
+		updates[i-1].next = NULL;
+
 	pr_debug("Topology update for the following CPUs:\n");
 	if (cpumask_weight(&updated_cpus)) {
 		for (ud = &updates[0]; ud; ud = ud->next) {
-- 
cgit 


From 31a4d4480c616bf0fa475f1869e5df852d3544bd Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 19 Oct 2017 15:14:20 +1100
Subject: Revert "KVM: PPC: Book3S HV: POWER9 does not require secondary thread
 management"

This reverts commit 94a04bc25a2c6296bd0c5e82c10e8231c2b11f77.

In order to run HPT guests on a radix POWER9 host, we will have to run
the host in single-threaded mode, because POWER9 processors do not
currently support running some threads of a core in HPT mode while
others are in radix mode ("mixed mode").

That means that we will need the same mechanisms that are used on
POWER8 to make the secondary threads available to KVM, which were
disabled on POWER9 by commit 94a04bc25a2c.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/kvm_book3s_asm.h |  4 ----
 arch/powerpc/kernel/idle_book3s.S         | 35 ++++++++++---------------------
 arch/powerpc/kvm/book3s_hv.c              | 13 +-----------
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  8 -------
 4 files changed, 12 insertions(+), 48 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 83596f32f50b..7cea76f11c26 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -104,10 +104,6 @@ struct kvmppc_host_state {
 	u8 napping;
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	/*
-	 * hwthread_req/hwthread_state pair is used to pull sibling threads
-	 * out of guest on pre-ISAv3.0B CPUs where threads share MMU.
-	 */
 	u8 hwthread_req;
 	u8 hwthread_state;
 	u8 host_ipi;
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 1125c9be9e06..175d49f468af 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -319,20 +319,13 @@ enter_winkle:
 /*
  * r3 - PSSCR value corresponding to the requested stop state.
  */
+power_enter_stop:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-power_enter_stop_kvm_rm:
-	/*
-	 * This is currently unused because POWER9 KVM does not have to
-	 * gather secondary threads into sibling mode, but the code is
-	 * here in case that function is required.
-	 *
-	 * Tell KVM we're entering idle.
-	 */
+	/* Tell KVM we're entering idle */
 	li	r4,KVM_HWTHREAD_IN_IDLE
 	/* DO THIS IN REAL MODE!  See comment above. */
 	stb	r4,HSTATE_HWTHREAD_STATE(r13)
 #endif
-power_enter_stop:
 /*
  * Check if we are executing the lite variant with ESL=EC=0
  */
@@ -496,18 +489,6 @@ pnv_powersave_wakeup_mce:
 
 	b	pnv_powersave_wakeup
 
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-kvm_start_guest_check:
-	li	r0,KVM_HWTHREAD_IN_KERNEL
-	stb	r0,HSTATE_HWTHREAD_STATE(r13)
-	/* Order setting hwthread_state vs. testing hwthread_req */
-	sync
-	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
-	cmpwi	r0,0
-	beqlr
-	b	kvm_start_guest
-#endif
-
 /*
  * Called from reset vector for powersave wakeups.
  * cr3 - set to gt if waking up with partial/complete hypervisor state loss
@@ -532,9 +513,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
 	mr	r3,r12
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-BEGIN_FTR_SECTION
-	bl	kvm_start_guest_check
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+	li	r0,KVM_HWTHREAD_IN_KERNEL
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	sync
+	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r0,0
+	beq	1f
+	b	kvm_start_guest
+1:
 #endif
 
 	/* Return SRR1 from power7_nap() */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 73bf1ebfa78f..8f34715cfbff 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2117,15 +2117,6 @@ static int kvmppc_grab_hwthread(int cpu)
 	struct paca_struct *tpaca;
 	long timeout = 10000;
 
-	/*
-	 * ISA v3.0 idle routines do not set hwthread_state or test
-	 * hwthread_req, so they can not grab idle threads.
-	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-		WARN(1, "KVM: can not control sibling threads\n");
-		return -EBUSY;
-	}
-
 	tpaca = &paca[cpu];
 
 	/* Ensure the thread won't go into the kernel if it wakes */
@@ -2160,12 +2151,10 @@ static void kvmppc_release_hwthread(int cpu)
 	struct paca_struct *tpaca;
 
 	tpaca = &paca[cpu];
+	tpaca->kvm_hstate.hwthread_req = 0;
 	tpaca->kvm_hstate.kvm_vcpu = NULL;
 	tpaca->kvm_hstate.kvm_vcore = NULL;
 	tpaca->kvm_hstate.kvm_split_mode = NULL;
-	if (!cpu_has_feature(CPU_FTR_ARCH_300))
-		tpaca->kvm_hstate.hwthread_req = 0;
-
 }
 
 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 17936f82d3c7..fb02209ff782 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -149,11 +149,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	subf	r4, r4, r3
 	mtspr	SPRN_DEC, r4
 
-BEGIN_FTR_SECTION
 	/* hwthread_req may have got set by cede or no vcpu, so clear it */
 	li	r0, 0
 	stb	r0, HSTATE_HWTHREAD_REQ(r13)
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
 	/*
 	 * For external interrupts we need to call the Linux
@@ -316,7 +314,6 @@ kvm_novcpu_exit:
  * Relocation is off and most register values are lost.
  * r13 points to the PACA.
  * r3 contains the SRR1 wakeup value, SRR1 is trashed.
- * This is not used by ISAv3.0B processors.
  */
 	.globl	kvm_start_guest
 kvm_start_guest:
@@ -435,9 +432,6 @@ kvm_secondary_got_guest:
  * While waiting we also need to check if we get given a vcpu to run.
  */
 kvm_no_guest:
-BEGIN_FTR_SECTION
-	twi	31,0,0
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	lbz	r3, HSTATE_HWTHREAD_REQ(r13)
 	cmpwi	r3, 0
 	bne	53f
@@ -2531,10 +2525,8 @@ kvm_do_nap:
 	clrrdi	r0, r0, 1
 	mtspr	SPRN_CTRLT, r0
 
-BEGIN_FTR_SECTION
 	li	r0,1
 	stb	r0,HSTATE_HWTHREAD_REQ(r13)
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	mfspr	r5,SPRN_LPCR
 	ori	r5,r5,LPCR_PECE0 | LPCR_PECE1
 BEGIN_FTR_SECTION
-- 
cgit 


From 2a3d6553cbd791da4fb624c2500b4580b3f9d72f Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 12 Oct 2017 22:58:54 +1100
Subject: KVM: PPC: Tie KVM_CAP_PPC_HTM to the user-visible TM feature

Currently we use CPU_FTR_TM to decide if the CPU/kernel can support
TM (Transactional Memory), and if it's true we advertise that to
Qemu (or similar) via KVM_CAP_PPC_HTM.

PPC_FEATURE2_HTM is the user-visible feature bit, which indicates that
the CPU and kernel can support TM. Currently CPU_FTR_TM and
PPC_FEATURE2_HTM always have the same value, either true or false, so
using the former for KVM_CAP_PPC_HTM is correct.

However some Power9 CPUs can operate in a mode where TM is enabled but
TM suspended state is disabled. In this mode CPU_FTR_TM is true, but
PPC_FEATURE2_HTM is false. Instead a different PPC_FEATURE2 bit is
set, to indicate that this different mode of TM is available.

It is not safe to let guests use TM as-is, when the CPU is in this
mode. So to prevent that from happening, use PPC_FEATURE2_HTM to
determine the value of KVM_CAP_PPC_HTM.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kvm/powerpc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 3480faaf1ef8..a3746b98ec11 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -644,8 +644,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		break;
 #endif
 	case KVM_CAP_PPC_HTM:
-		r = cpu_has_feature(CPU_FTR_TM_COMP) &&
-		    is_kvmppc_hv_enabled(kvm);
+		r = is_kvmppc_hv_enabled(kvm) &&
+		    (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM_COMP);
 		break;
 	default:
 		r = 0;
-- 
cgit 


From 07fd1761e1cd2b5ecdb470fc228d52ccdf75e05b Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Thu, 12 Oct 2017 21:17:16 +1100
Subject: powerpc/tm: Add commandline option to disable hardware transactional
 memory

Currently the kernel relies on firmware to inform it whether or not the
CPU supports HTM and as long as the kernel was built with
CONFIG_PPC_TRANSACTIONAL_MEM=y then it will allow userspace to make
use of the facility.

There may be situations where it would be advantageous for the kernel
to not allow userspace to use HTM, currently the only way to achieve
this is to recompile the kernel with CONFIG_PPC_TRANSACTIONAL_MEM=n.

This patch adds a simple commandline option so that HTM can be
disabled at boot time.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
[mpe: Simplify to a bool, move to prom.c, put doco in the right place.
 Always disable, regardless of initial state, to avoid user confusion.]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/prom.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f83056297441..d9bd6555f980 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -658,6 +658,35 @@ static void __init early_reserve_mem(void)
 #endif
 }
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static bool tm_disabled __initdata;
+
+static int __init parse_ppc_tm(char *str)
+{
+	bool res;
+
+	if (kstrtobool(str, &res))
+		return -EINVAL;
+
+	tm_disabled = !res;
+
+	return 0;
+}
+early_param("ppc_tm", parse_ppc_tm);
+
+static void __init tm_init(void)
+{
+	if (tm_disabled) {
+		pr_info("Disabling hardware transactional memory (HTM)\n");
+		cur_cpu_spec->cpu_user_features2 &=
+			~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM);
+		cur_cpu_spec->cpu_features &= ~CPU_FTR_TM;
+	}
+}
+#else
+static void tm_init(void) { }
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
 void __init early_init_devtree(void *params)
 {
 	phys_addr_t limit;
@@ -767,6 +796,8 @@ void __init early_init_devtree(void *params)
 		powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE;
 #endif
 
+	tm_init();
+
 	DBG(" <- early_init_devtree()\n");
 }
 
-- 
cgit 


From cba6ac4869e45cc93ac5497024d1d49576e82666 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 12 Oct 2017 21:17:17 +1100
Subject: powerpc: Add PPC_FEATURE2_HTM_NO_SUSPEND

Some CPUs can operate in a mode where TM (Transactional Memory) is
enabled but the suspended state of TM is disabled. In this mode
tsuspend does not enter suspended state, instead the transaction is
aborted. Similarly any other event that would lead to suspended state
instead aborts the transaction.

There is also an ABI change, in that in this mode processes are not
allowed to sigreturn with an MSR that would lead to suspended state,
Linux will instead return an error to the sigreturn syscall.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/uapi/asm/cputable.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h
index 4d877144f377..b3b64cba71ec 100644
--- a/arch/powerpc/include/uapi/asm/cputable.h
+++ b/arch/powerpc/include/uapi/asm/cputable.h
@@ -48,6 +48,7 @@
 #define PPC_FEATURE2_HAS_IEEE128	0x00400000 /* VSX IEEE Binary Float 128-bit */
 #define PPC_FEATURE2_DARN		0x00200000 /* darn random number insn */
 #define PPC_FEATURE2_SCV		0x00100000 /* scv syscall */
+#define PPC_FEATURE2_HTM_NO_SUSPEND	0x00080000 /* TM w/out suspended state */
 
 /*
  * IMPORTANT!
-- 
cgit 


From 54820530c5faa9fd78e1c08cb6449100b1a19157 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 12 Oct 2017 21:17:18 +1100
Subject: powerpc/powernv: Enable TM without suspend if possible

Some Power9 revisions can run in a mode where TM operates without
suspended state. If we find ourself on a CPU that might be in this
mode, we query OPAL to check, and if so we reenable TM in CPU
features, and enable a new user feature to signal to userspace that we
are in this mode.

We do not enable the "normal" user feature, PPC_FEATURE2_HTM, but we
do enable PPC_FEATURE2_HTM_NOSC because that indicates to userspace
that the kernel will abort transactions on syscall entry, which is
true regardless of the suspend mode.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/opal-api.h    |  2 ++
 arch/powerpc/include/asm/powernv.h     |  4 ++++
 arch/powerpc/include/asm/tm.h          |  2 ++
 arch/powerpc/kernel/process.c          |  7 +++++++
 arch/powerpc/kernel/prom.c             |  4 ++++
 arch/powerpc/platforms/powernv/setup.c | 23 +++++++++++++++++++++++
 6 files changed, 42 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index 9d191ebea706..233c7504b1f2 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -896,6 +896,8 @@ enum {
 	 */
 	OPAL_REINIT_CPUS_MMU_HASH	= (1 << 2),
 	OPAL_REINIT_CPUS_MMU_RADIX	= (1 << 3),
+
+	OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED = (1 << 4),
 };
 
 typedef struct oppanel_line {
diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h
index f62797702300..dc5f6a5d4575 100644
--- a/arch/powerpc/include/asm/powernv.h
+++ b/arch/powerpc/include/asm/powernv.h
@@ -22,6 +22,8 @@ extern void pnv_npu2_destroy_context(struct npu_context *context,
 extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
 				unsigned long *flags, unsigned long *status,
 				int count);
+
+void pnv_tm_init(void);
 #else
 static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { }
 static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
@@ -36,6 +38,8 @@ static inline int pnv_npu2_handle_fault(struct npu_context *context,
 					unsigned long *status, int count) {
 	return -ENODEV;
 }
+
+static inline void pnv_tm_init(void) { }
 #endif
 
 #endif /* _ASM_POWERNV_H */
diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index 82e06ca3a49b..ad19fe41931b 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -19,4 +19,6 @@ extern void tm_abort(uint8_t cause);
 extern void tm_save_sprs(struct thread_struct *thread);
 extern void tm_restore_sprs(struct thread_struct *thread);
 
+extern bool tm_suspend_disabled;
+
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 166145b18728..b02807ea54dc 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -77,6 +77,13 @@
 extern unsigned long _get_SP(void);
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Are we running in "Suspend disabled" mode? If so we have to block any
+ * sigreturn that would get us into suspended state, and we also warn in some
+ * other paths that we should never reach with suspend disabled.
+ */
+bool tm_suspend_disabled __ro_after_init = false;
+
 static void check_if_tm_restore_required(struct task_struct *tsk)
 {
 	/*
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index d9bd6555f980..101822be525a 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -47,6 +47,7 @@
 #include <asm/mmu.h>
 #include <asm/paca.h>
 #include <asm/pgtable.h>
+#include <asm/powernv.h>
 #include <asm/iommu.h>
 #include <asm/btext.h>
 #include <asm/sections.h>
@@ -681,7 +682,10 @@ static void __init tm_init(void)
 		cur_cpu_spec->cpu_user_features2 &=
 			~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM);
 		cur_cpu_spec->cpu_features &= ~CPU_FTR_TM;
+		return;
 	}
+
+	pnv_tm_init();
 }
 #else
 static void tm_init(void) { }
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index cf52d53da460..d23f148a11f0 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -36,6 +36,7 @@
 #include <asm/opal.h>
 #include <asm/kexec.h>
 #include <asm/smp.h>
+#include <asm/tm.h>
 
 #include "powernv.h"
 
@@ -304,6 +305,28 @@ static int __init pnv_probe(void)
 	return 1;
 }
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void __init pnv_tm_init(void)
+{
+	if (!firmware_has_feature(FW_FEATURE_OPAL) ||
+	    !pvr_version_is(PVR_POWER9) ||
+	    early_cpu_has_feature(CPU_FTR_TM))
+		return;
+
+	if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS)
+		return;
+
+	pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n");
+	cur_cpu_spec->cpu_features |= CPU_FTR_TM;
+	/* Make sure "normal" HTM is off (it should be) */
+	cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM;
+	/* Turn on no suspend mode, and HTM no SC */
+	cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \
+					    PPC_FEATURE2_HTM_NOSC;
+	tm_suspend_disabled = true;
+}
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
 /*
  * Returns the cpu frequency for 'cpu' in Hz. This is used by
  * /proc/cpuinfo
-- 
cgit 


From 92fb8690bd04cb421d987d246deac60eef85d272 Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Thu, 12 Oct 2017 21:17:19 +1100
Subject: powerpc/tm: P9 disable transactionally suspended sigcontexts

Unfortunately userspace can construct a sigcontext which enables
suspend. Thus userspace can force Linux into a path where trechkpt is
executed.

This patch blocks this from happening on POWER9 by sanity checking
sigcontexts passed in.

ptrace doesn't have this problem as only MSR SE and BE can be changed
via ptrace.

This patch also adds a number of WARN_ON()s in case we ever enter
suspend when we shouldn't. This should not happen, but if it does the
symptoms are soft lockup warnings which are not obviously TM related,
so the WARN_ON()s should make it obvious what's happening.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c   | 2 ++
 arch/powerpc/kernel/signal_32.c | 4 ++++
 arch/powerpc/kernel/signal_64.c | 5 +++++
 3 files changed, 11 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index b02807ea54dc..c051dc2b42ad 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -910,6 +910,8 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
 	if (!MSR_TM_ACTIVE(thr->regs->msr))
 		goto out_and_saveregs;
 
+	WARN_ON(tm_suspend_disabled);
+
 	TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
 		 "ccr=%lx, msr=%lx, trap=%lx)\n",
 		 tsk->pid, thr->regs->nip,
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 92fb1c8dbbd8..1dd5fa0f65fd 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -519,6 +519,8 @@ static int save_tm_user_regs(struct pt_regs *regs,
 {
 	unsigned long msr = regs->msr;
 
+	WARN_ON(tm_suspend_disabled);
+
 	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
 	 * just indicates to userland that we were doing a transaction, but we
 	 * don't want to return in transactional state.  This also ensures
@@ -769,6 +771,8 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 	int i;
 #endif
 
+	if (tm_suspend_disabled)
+		return 1;
 	/*
 	 * restore general registers but not including MSR or SOFTE. Also
 	 * take care of keeping r2 (TLS) intact if not a signal.
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index c83c115858c1..4dfdd8e56836 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -214,6 +214,8 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 
 	BUG_ON(!MSR_TM_ACTIVE(regs->msr));
 
+	WARN_ON(tm_suspend_disabled);
+
 	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
 	 * just indicates to userland that we were doing a transaction, but we
 	 * don't want to return in transactional state.  This also ensures
@@ -430,6 +432,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
 
 	BUG_ON(tsk != current);
 
+	if (tm_suspend_disabled)
+		return -EINVAL;
+
 	/* copy the GPRs */
 	err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr));
 	err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs,
-- 
cgit 


From 402e172a2ce76210f2fe921cf419d12103851344 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@debian.org>
Date: Tue, 17 Oct 2017 16:20:18 -0200
Subject: powerpc/xmon: Check before calling xive functions

Currently xmon could call XIVE functions from OPAL even if the XIVE is
disabled or does not exist in the system, as in POWER8 machines. This
causes the following exception:

 1:mon> dx
 cpu 0x1: Vector: 700 (Program Check) at [c000000423c93450]
     pc: c00000000009cfa4: opal_xive_dump+0x50/0x68
     lr: c0000000000997b8: opal_return+0x0/0x50

This patch simply checks if XIVE is enabled before calling XIVE
functions.

Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller")
Suggested-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Signed-off-by: Breno Leitao <leitao@debian.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 4679aeb84767..2e2320edb96b 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2508,6 +2508,11 @@ static void dump_xives(void)
 	unsigned long num;
 	int c;
 
+	if (!xive_enabled()) {
+		printf("Xive disabled on this system\n");
+		return;
+	}
+
 	c = inchar();
 	if (c == 'a') {
 		dump_all_xives();
-- 
cgit 


From 6773027205ea4ccf17055d7f0a0cd813f22fe127 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 16 Oct 2017 12:41:00 +0530
Subject: powerpc/mm/radix: Drop unneeded NULL check

We call these functions with non-NULL mm or vma. Hence we can skip the
NULL check in these functions. We also remove now unused function
__local_flush_hugetlb_page().

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
[mpe: Drop the checks with is_vm_hugetlb_page() as noticed by Nick]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/hugetlb.h |  6 ------
 arch/powerpc/mm/tlb-radix.c        | 22 ++++++++++------------
 2 files changed, 10 insertions(+), 18 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index b8a0fb442c64..795d825c2edd 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -40,12 +40,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma,
 		return radix__flush_hugetlb_page(vma, vmaddr);
 }
 
-static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma,
-					      unsigned long vmaddr)
-{
-	if (radix_enabled())
-		return radix__local_flush_hugetlb_page(vma, vmaddr);
-}
 #else
 
 static inline pte_t *hugepd_page(hugepd_t hpd)
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 5a1f46eff3a2..3a07d7a5e2fe 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -164,7 +164,7 @@ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmadd
 	unsigned long ap = mmu_get_ap(psize);
 
 	preempt_disable();
-	pid = mm ? mm->context.id : 0;
+	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
 		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
 	preempt_enable();
@@ -174,11 +174,10 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
 {
 #ifdef CONFIG_HUGETLB_PAGE
 	/* need the return fix for nohash.c */
-	if (vma && is_vm_hugetlb_page(vma))
-		return __local_flush_hugetlb_page(vma, vmaddr);
+	if (is_vm_hugetlb_page(vma))
+		return radix__local_flush_hugetlb_page(vma, vmaddr);
 #endif
-	radix__local_flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr,
-					  mmu_virtual_psize);
+	radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
 }
 EXPORT_SYMBOL(radix__local_flush_tlb_page);
 
@@ -232,7 +231,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 	unsigned long ap = mmu_get_ap(psize);
 
 	preempt_disable();
-	pid = mm ? mm->context.id : 0;
+	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto bail;
 	if (!mm_is_thread_local(mm))
@@ -246,11 +245,10 @@ bail:
 void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 {
 #ifdef CONFIG_HUGETLB_PAGE
-	if (vma && is_vm_hugetlb_page(vma))
-		return flush_hugetlb_page(vma, vmaddr);
+	if (is_vm_hugetlb_page(vma))
+		return radix__flush_hugetlb_page(vma, vmaddr);
 #endif
-	radix__flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr,
-				    mmu_virtual_psize);
+	radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
 }
 EXPORT_SYMBOL(radix__flush_tlb_page);
 
@@ -330,7 +328,7 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 
 
 	preempt_disable();
-	pid = mm ? mm->context.id : 0;
+	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto err_out;
 
@@ -361,7 +359,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 	unsigned long pid, end;
 
 
-	pid = mm ? mm->context.id : 0;
+	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
 
-- 
cgit 


From deed2ec5a786f037b00136792e08404c84ec557a Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Thu, 12 Oct 2017 13:16:59 +1100
Subject: powerpc/configs: Enable I2C_CHARDEV for pseries and powernv

i2c-dev provides an interface for userspace programs to interact with I2C
devices, and is very helpful for I2C-related debugging.

Enable it in pseries_defconfig and powernv_defconfig. It's already enabled
in many other powerpc defconfigs.

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/powernv_defconfig | 1 +
 arch/powerpc/configs/pseries_defconfig | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 3fb2a7ab54f1..4891bbed6258 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -192,6 +192,7 @@ CONFIG_IPMI_DEVICE_INTERFACE=y
 CONFIG_IPMI_POWERNV=y
 CONFIG_RAW_DRIVER=y
 CONFIG_MAX_RAW_DEVS=1024
+CONFIG_I2C_CHARDEV=y
 CONFIG_DRM=y
 CONFIG_DRM_AST=y
 CONFIG_FIRMWARE_EDID=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 3d935969e5a2..bde2cd1005a2 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -193,6 +193,7 @@ CONFIG_VIRTIO_CONSOLE=m
 CONFIG_IBM_BSR=m
 CONFIG_RAW_DRIVER=y
 CONFIG_MAX_RAW_DEVS=1024
+CONFIG_I2C_CHARDEV=y
 CONFIG_FB=y
 CONFIG_FIRMWARE_EDID=y
 CONFIG_FB_OF=y
-- 
cgit 


From 90d6473747b289739b435d5fa6ae8e38ae2921d1 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 9 Oct 2017 21:59:32 +1100
Subject: powerpc/xmon: Add kstack base to paca dump

When dumping the paca in xmon we currently show kstack. Although it's
not hard it's a bit fiddly to work out what the bounds of the kernel
stack should be based on the kstack value.

To make life easier and "kstack_base" which is the base (lowest
address) of the kernel stack, eg:

 kstack               = 0xc0000000f1a7be30      (0x258)
 kstack_base          = 0xc0000000f1a78000

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 2e2320edb96b..83e7faf5b3d3 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2384,6 +2384,7 @@ static void dump_one_paca(int cpu)
 #endif
 	DUMP(p, __current, "p");
 	DUMP(p, kstack, "lx");
+	printf(" kstack_base          = 0x%016lx\n", p->kstack & ~(THREAD_SIZE - 1));
 	DUMP(p, stab_rr, "lx");
 	DUMP(p, saved_r1, "lx");
 	DUMP(p, trap_save, "x");
-- 
cgit 


From 5c2511bff47ba7fb09268ee8ebbf5237da5c0b84 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 9 Oct 2017 21:54:04 +1100
Subject: powerpc/64s: Move the two FAST_ENDIAN macros next to each other

So we can #ifdef them in the next patch.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/exceptions-64s.S | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index ce04b1c2ac8e..acddfe79395d 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -865,12 +865,6 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 #define LOAD_SYSCALL_HANDLER(reg)					\
 	__LOAD_HANDLER(reg, system_call_common)
 
-#define SYSCALL_FASTENDIAN_TEST					\
-BEGIN_FTR_SECTION						\
-	cmpdi	r0,0x1ebe ; 					\
-	beq-	1f ;						\
-END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
-
 /*
  * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9,
  * and HMT_MEDIUM.
@@ -885,6 +879,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 	rfid ; 							\
 	b	. ;	/* prevent speculative execution */
 
+#define SYSCALL_FASTENDIAN_TEST					\
+BEGIN_FTR_SECTION						\
+	cmpdi	r0,0x1ebe ; 					\
+	beq-	1f ;						\
+END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
+
 #define SYSCALL_FASTENDIAN					\
 	/* Fast LE/BE switch system call */			\
 1:	mfspr	r12,SPRN_SRR1 ;					\
-- 
cgit 


From 727f13616c45caa72e4325c5027c1a8f41e745a9 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 9 Oct 2017 21:54:05 +1100
Subject: powerpc: Disable the fast-endian switch syscall by default

Back in 2008 we added support for "fast little-endian switch" in the
syscall path. This added a special case syscall number 0x1ebe, which
is caught very early in the system call exception and switches endian
with as little overhead as possible. See commit 745a14cc264b
("[POWERPC] Add fast little-endian switch system call") for full
details.

Although it is fast, it's also completely non standard. The "syscall
number" is out of the range of normal syscalls, it can't be traced or
audited, and it's a bit of a wart. To the best of our knowledge it was
only used by one program, now long since discontinued.

So in an effort to shake out any current users, put it behind a config
option, and make it default n. If anyone *is* using it they can
quickly reinstate it with a rebuild, and we can flip it to default y.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig.debug           | 6 ++++++
 arch/powerpc/kernel/exceptions-64s.S | 5 +++++
 2 files changed, 11 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index c86df246339e..b8a1eb42a38d 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -369,4 +369,10 @@ config PPC_HTDUMP
 	def_bool y
 	depends on PPC_PTDUMP && PPC_BOOK3S
 
+config PPC_FAST_ENDIAN_SWITCH
+	bool "Deprecated fast endian-switch syscall"
+        depends on DEBUG_KERNEL && PPC_BOOK3S_64
+        help
+	  If you're unsure what this is, say N.
+
 endmenu
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index acddfe79395d..d001aa4c69a1 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -879,6 +879,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 	rfid ; 							\
 	b	. ;	/* prevent speculative execution */
 
+#ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH
 #define SYSCALL_FASTENDIAN_TEST					\
 BEGIN_FTR_SECTION						\
 	cmpdi	r0,0x1ebe ; 					\
@@ -893,6 +894,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 	mr	r13,r9 ;					\
 	rfid ;		/* return to userspace */		\
 	b	. ;	/* prevent speculative execution */
+#else
+#define SYSCALL_FASTENDIAN_TEST
+#define SYSCALL_FASTENDIAN
+#endif /* CONFIG_PPC_FAST_ENDIAN_SWITCH */
 
 #if defined(CONFIG_RELOCATABLE)
 	/*
-- 
cgit 


From 05c14c03138532a3cb2aa29c2960445c8753343b Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 9 Oct 2017 21:52:44 +1100
Subject: powerpc/perf/hv-24x7: Fix incorrect comparison in memord

In the hv-24x7 code there is a function memord() which tries to
implement a sort function return -1, 0, 1. However one of the
conditions is incorrect, such that it can never be true, because we
will have already returned.

I don't believe there is a bug in practice though, because the
comparisons are an optimisation prior to calling memcmp().

Fix it by swapping the second comparision, so it can be true.

Reported-by: David Binderman <dcb314@hotmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/perf/hv-24x7.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 9c88b82f6229..72238eedc360 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -540,7 +540,7 @@ static int memord(const void *d1, size_t s1, const void *d2, size_t s2)
 {
 	if (s1 < s2)
 		return 1;
-	if (s2 > s1)
+	if (s1 > s2)
 		return -1;
 
 	return memcmp(d1, d2, s1);
-- 
cgit 


From 3d2d4339cc326c427638daa67e264dd455ee1899 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 2 Jun 2017 14:38:47 +0200
Subject: powerpc: dts: acadia: DT fix s/#interrupts-parent/#interrupt-parent/

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/boot/dts/acadia.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/boot/dts/acadia.dts b/arch/powerpc/boot/dts/acadia.dts
index 57291f61ffe7..86266159521e 100644
--- a/arch/powerpc/boot/dts/acadia.dts
+++ b/arch/powerpc/boot/dts/acadia.dts
@@ -183,7 +183,7 @@
 			usb@ef603000 {
 				compatible = "ohci-be";
 				reg = <0xef603000 0x80>;
-				interrupts-parent = <&UIC0>;
+				interrupt-parent = <&UIC0>;
 				interrupts = <0xd 0x4 0xe 0x4>;
 			};
 
-- 
cgit 


From a7cd4586210c69c19aef26f4f83d3e492040de1c Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 5 Sep 2017 18:47:02 +0200
Subject: powerpc/axonram: Drop unnecessary variable initialisation

The local variable "rc" will eventually be set only to an error code.
Thus omit the explicit initialisation at the beginning.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/sysdev/axonram.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index c60e84e4558d..1b307c80b401 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -184,7 +184,7 @@ static int axon_ram_probe(struct platform_device *device)
 	static int axon_ram_bank_id = -1;
 	struct axon_ram_bank *bank;
 	struct resource resource;
-	int rc = 0;
+	int rc;
 
 	axon_ram_bank_id++;
 
-- 
cgit 


From c28237f1d4ed2c9022f9eed656ecf36999e34f47 Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Tue, 17 Oct 2017 13:31:42 +0200
Subject: powerpc-opal: Fix a typo in a comment line of two file headers

Fix a word in these descriptions.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-hmi.c           | 2 +-
 arch/powerpc/platforms/powernv/opal-memory-errors.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index d78fed728cdf..c9e1a4ff295c 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -1,5 +1,5 @@
 /*
- * OPAL hypervisor Maintenance interrupt handling support in PowreNV.
+ * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
index 4495f428b500..d9916ea62305 100644
--- a/arch/powerpc/platforms/powernv/opal-memory-errors.c
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -1,5 +1,5 @@
 /*
- * OPAL asynchronus Memory error handling support in PowreNV.
+ * OPAL asynchronus Memory error handling support in PowerNV.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
-- 
cgit 


From 4dd9eab39c71628d113168a01473ee17b5f61eac Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Wed, 18 Oct 2017 20:48:52 +0200
Subject: powerpc/pseries: Cleanup error handling in
 iommu_pseries_alloc_group()

Although kfree(NULL) is legal, it's a bit lazy to rely on that to
implement the error handling. So do it the normal Linux way using
labels for each failure path.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
[mpe: Squash a few patches and rewrite change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/iommu.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 7c181467d0ad..69921f72e2da 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -55,23 +55,23 @@
 
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
-	struct iommu_table_group *table_group = NULL;
-	struct iommu_table *tbl = NULL;
-	struct iommu_table_group_link *tgl = NULL;
+	struct iommu_table_group *table_group;
+	struct iommu_table *tbl;
+	struct iommu_table_group_link *tgl;
 
 	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
 			   node);
 	if (!table_group)
-		goto fail_exit;
+		return NULL;
 
 	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
 	if (!tbl)
-		goto fail_exit;
+		goto free_group;
 
 	tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL,
 			node);
 	if (!tgl)
-		goto fail_exit;
+		goto free_table;
 
 	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
 	kref_init(&tbl->it_kref);
@@ -82,11 +82,10 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 
 	return table_group;
 
-fail_exit:
-	kfree(tgl);
-	kfree(table_group);
+free_table:
 	kfree(tbl);
-
+free_group:
+	kfree(table_group);
 	return NULL;
 }
 
-- 
cgit 


From 6de6638b35daa0dfb7daefd78d55015655b4d8f3 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Sun, 5 Nov 2017 23:33:55 +1100
Subject: KVM: PPC: Book3S HV: Handle host system reset in guest mode

If the host takes a system reset interrupt while a guest is running,
the CPU must exit the guest before processing the host exception
handler.

After this patch, taking a sysrq+x with a CPU running in a guest
gives a trace like this:

   cpu 0x27: Vector: 100 (System Reset) at [c000000fdf5776f0]
       pc: c008000010158b80: kvmppc_run_core+0x16b8/0x1ad0 [kvm_hv]
       lr: c008000010158b80: kvmppc_run_core+0x16b8/0x1ad0 [kvm_hv]
       sp: c000000fdf577850
      msr: 9000000002803033
     current = 0xc000000fdf4b1e00
     paca    = 0xc00000000fd4d680	 softe: 3	 irq_happened: 0x01
       pid   = 6608, comm = qemu-system-ppc
   Linux version 4.14.0-rc7-01489-g47e1893a404a-dirty #26 SMP
   [c000000fdf577a00] c008000010159dd4 kvmppc_vcpu_run_hv+0x3dc/0x12d0 [kvm_hv]
   [c000000fdf577b30] c0080000100a537c kvmppc_vcpu_run+0x44/0x60 [kvm]
   [c000000fdf577b60] c0080000100a1ae0 kvm_arch_vcpu_ioctl_run+0x118/0x310 [kvm]
   [c000000fdf577c00] c008000010093e98 kvm_vcpu_ioctl+0x530/0x7c0 [kvm]
   [c000000fdf577d50] c000000000357bf8 do_vfs_ioctl+0xd8/0x8c0
   [c000000fdf577df0] c000000000358448 SyS_ioctl+0x68/0x100
   [c000000fdf577e30] c00000000000b220 system_call+0x58/0x6c
   --- Exception: c01 (System Call) at 00007fff76868df0
   SP (7fff7069baf0) is in userspace

Fixes: e36d0a2ed5 ("powerpc/powernv: Implement NMI IPI with OPAL_SIGNAL_SYSTEM_RESET")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/hw_irq.h    | 1 +
 arch/powerpc/kernel/exceptions-64s.S | 2 ++
 arch/powerpc/kernel/irq.c            | 3 ++-
 arch/powerpc/kvm/book3s_hv.c         | 7 ++++++-
 4 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index c1dd1929342d..d2ef36111518 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -31,6 +31,7 @@
 
 #ifndef __ASSEMBLY__
 
+extern void replay_system_reset(void);
 extern void __replay_interrupt(unsigned int vector);
 
 extern void timer_interrupt(struct pt_regs *);
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d001aa4c69a1..64c2bc35e4ff 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -113,6 +113,7 @@ EXC_VIRT_NONE(0x4000, 0x100)
 	cmpwi	cr3,r10,2 ;						\
 	BRANCH_TO_C000(r10, system_reset_idle_common) ;			\
 1:									\
+	KVMTEST_PR(n) ;							\
 	END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #else
 #define IDLETEST NOTEST
@@ -129,6 +130,7 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
 
 EXC_REAL_END(system_reset, 0x100, 0x100)
 EXC_VIRT_NONE(0x4100, 0x100)
+TRAMP_KVM(PACA_EXNMI, 0x100)
 
 #ifdef CONFIG_PPC_P7_NAP
 EXC_COMMON_BEGIN(system_reset_idle_common)
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 4813b83b22aa..6a6995173f51 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -415,7 +415,7 @@ static const u8 srr1_to_lazyirq[0x10] = {
 	PACA_IRQ_HMI,
 	0, 0, 0, 0, 0 };
 
-static noinline void replay_system_reset(void)
+void replay_system_reset(void)
 {
 	struct pt_regs regs;
 
@@ -425,6 +425,7 @@ static noinline void replay_system_reset(void)
 	system_reset_exception(&regs);
 	get_paca()->in_nmi = 0;
 }
+EXPORT_SYMBOL_GPL(replay_system_reset);
 
 void irq_set_pending_from_srr1(unsigned long srr1)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8f34715cfbff..31a362669fea 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -47,6 +47,7 @@
 
 #include <asm/reg.h>
 #include <asm/ppc-opcode.h>
+#include <asm/asm-prototypes.h>
 #include <asm/disassemble.h>
 #include <asm/cputable.h>
 #include <asm/cacheflush.h>
@@ -1089,9 +1090,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		vcpu->stat.ext_intr_exits++;
 		r = RESUME_GUEST;
 		break;
-	/* HMI is hypervisor interrupt and host has handled it. Resume guest.*/
+	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
 	case BOOK3S_INTERRUPT_HMI:
 	case BOOK3S_INTERRUPT_PERFMON:
+	case BOOK3S_INTERRUPT_SYSTEM_RESET:
 		r = RESUME_GUEST;
 		break;
 	case BOOK3S_INTERRUPT_MACHINE_CHECK:
@@ -2604,6 +2606,9 @@ static void set_irq_happened(int trap)
 	case BOOK3S_INTERRUPT_HMI:
 		local_paca->irq_happened |= PACA_IRQ_HMI;
 		break;
+	case BOOK3S_INTERRUPT_SYSTEM_RESET:
+		replay_system_reset();
+		break;
 	}
 }
 
-- 
cgit 


From ff967900c9d4740f6337c6456f111104267633cb Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Sat, 21 Oct 2017 17:56:06 +1000
Subject: powerpc/64: Fix latency tracing for lazy irq replay

When returning from an exception to a soft-enabled context, pending
IRQs are replayed but IRQ tracing is not reset, so a number of them
can get chained together into the same IRQ-disabled trace.

Fix this by having __check_irq_replay re-set IRQ trace. This is
conceptually where we respond to the next interrupt, so it fits the
semantics of the IRQ tracer.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/irq.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 6a6995173f51..b7a84522e652 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -143,6 +143,13 @@ notrace unsigned int __check_irq_replay(void)
 	 */
 	unsigned char happened = local_paca->irq_happened;
 
+	/*
+	 * We are responding to the next interrupt, so interrupt-off
+	 * latencies should be reset here.
+	 */
+	trace_hardirqs_on();
+	trace_hardirqs_off();
+
 	if (happened & PACA_IRQ_HARD_DIS) {
 		/* Clear bit 0 which we wouldn't clear otherwise */
 		local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
@@ -270,6 +277,7 @@ notrace void arch_local_irq_restore(unsigned long en)
 #endif /* CONFIG_TRACE_IRQFLAGS */
 
 	set_soft_enabled(0);
+	trace_hardirqs_off();
 
 	/*
 	 * Check if anything needs to be re-emitted. We haven't
@@ -279,6 +287,7 @@ notrace void arch_local_irq_restore(unsigned long en)
 	replay = __check_irq_replay();
 
 	/* We can soft-enable now */
+	trace_hardirqs_on();
 	set_soft_enabled(1);
 
 	/*
-- 
cgit 


From f848ea7f5960ec2684c3bd1c0692e6f5ad47a27d Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 23 Oct 2017 17:08:13 +1000
Subject: powerpc/book3s: Use label for FIXUP_ENDIAN macro branch

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/ppc_asm.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 36f3e41c9fbe..d6b56aebe602 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -776,7 +776,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
 #else
 #define FIXUP_ENDIAN						   \
 	tdi   0,0,0x48;	  /* Reverse endian of b . + 8		*/ \
-	b     $+44;	  /* Skip trampoline if endian is good	*/ \
+	b     191f;	  /* Skip trampoline if endian is good	*/ \
 	.long 0xa600607d; /* mfmsr r11				*/ \
 	.long 0x01006b69; /* xori r11,r11,1			*/ \
 	.long 0x00004039; /* li r10,0				*/ \
@@ -786,7 +786,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
 	.long 0x14004a39; /* addi r10,r10,20			*/ \
 	.long 0xa6035a7d; /* mtsrr0 r10				*/ \
 	.long 0xa6037b7d; /* mtsrr1 r11				*/ \
-	.long 0x2400004c  /* rfid				*/
+	.long 0x2400004c; /* rfid				*/ \
+191:
 
 #endif /* !CONFIG_PPC_BOOK3E */
 
-- 
cgit 


From 8ca9c08d0c8f6d68db9b592dc149cb8284173ca8 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 23 Oct 2017 17:08:14 +1000
Subject: powerpc/book3s: Add an HV variant of FIXUP_ENDIAN that is recoverable

Add an HV variant of FIXUP_ENDIAN which uses HSRR[01] and does not
clear MSR[RI], which improves recoverability.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/ppc_asm.h | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index d6b56aebe602..ae94b3626b6c 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -774,6 +774,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
 #ifdef CONFIG_PPC_BOOK3E
 #define FIXUP_ENDIAN
 #else
+/*
+ * This version may be used in in HV or non-HV context.
+ * MSR[EE] must be disabled.
+ */
 #define FIXUP_ENDIAN						   \
 	tdi   0,0,0x48;	  /* Reverse endian of b . + 8		*/ \
 	b     191f;	  /* Skip trampoline if endian is good	*/ \
@@ -789,6 +793,24 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
 	.long 0x2400004c; /* rfid				*/ \
 191:
 
+/*
+ * This version that may only be used with MSR[HV]=1
+ * - Does not clear MSR[RI], so more robust.
+ * - Slightly smaller and faster.
+ */
+#define FIXUP_ENDIAN_HV						   \
+	tdi   0,0,0x48;	  /* Reverse endian of b . + 8		*/ \
+	b     191f;	  /* Skip trampoline if endian is good	*/ \
+	.long 0xa600607d; /* mfmsr r11				*/ \
+	.long 0x01006b69; /* xori r11,r11,1			*/ \
+	.long 0x05009f42; /* bcl 20,31,$+4			*/ \
+	.long 0xa602487d; /* mflr r10				*/ \
+	.long 0x14004a39; /* addi r10,r10,20			*/ \
+	.long 0xa64b5a7d; /* mthsrr0 r10			*/ \
+	.long 0xa64b7b7d; /* mthsrr1 r11			*/ \
+	.long 0x2402004c; /* hrfid				*/ \
+191:
+
 #endif /* !CONFIG_PPC_BOOK3E */
 
 #endif /*  __ASSEMBLY__ */
-- 
cgit 


From 63c9d8a4b394f9d8e995292a7c74648760235b44 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 23 Oct 2017 17:08:15 +1000
Subject: powerpc/powernv: Use FIXUP_ENDIAN_HV in OPAL return

Close the recoverability gap for OPAL calls by using FIXUP_ENDIAN_HV
in the return path.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-wrappers.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 37cd170201a2..6f4b00a2ac46 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -94,7 +94,7 @@ opal_return:
 	 * bytes (always BE) since MSR:LE will end up fixed up as a side
 	 * effect of the rfid.
 	 */
-	FIXUP_ENDIAN
+	FIXUP_ENDIAN_HV
 	ld	r2,PACATOC(r13);
 	lwz	r4,8(r1);
 	ld	r5,PPC_LR_STKOFF(r1);
@@ -120,7 +120,7 @@ opal_real_call:
 	hrfid
 
 opal_return_realmode:
-	FIXUP_ENDIAN
+	FIXUP_ENDIAN_HV
 	ld	r2,PACATOC(r13);
 	lwz	r11,8(r1);
 	ld	r12,PPC_LR_STKOFF(r1)
-- 
cgit 


From dffe8449c5dd63ff18b47709de75553586582cd8 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 24 Oct 2017 23:06:53 +1000
Subject: powerpc/64s/radix: Improve preempt handling in TLB code

Preempt should be consistently disabled for mm_is_thread_local tests,
so bring the rest of these under preempt_disable().

Preempt does not need to be disabled for the mm->context.id tests,
which allows simplification and removal of gotos.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 48 ++++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 3a07d7a5e2fe..67191fe63283 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -186,16 +186,15 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 
-	preempt_disable();
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
+		return;
 
+	preempt_disable();
 	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_TLB);
 	else
 		_tlbiel_pid(pid, RIC_FLUSH_TLB);
-no_context:
 	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -204,16 +203,15 @@ void radix__flush_all_mm(struct mm_struct *mm)
 {
 	unsigned long pid;
 
-	preempt_disable();
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
+		return;
 
+	preempt_disable();
 	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
 	else
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
-no_context:
 	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_all_mm);
@@ -230,15 +228,15 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 	unsigned long pid;
 	unsigned long ap = mmu_get_ap(psize);
 
-	preempt_disable();
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto bail;
+		return;
+
+	preempt_disable();
 	if (!mm_is_thread_local(mm))
 		_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
 	else
 		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-bail:
 	preempt_enable();
 }
 
@@ -322,46 +320,44 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 {
 	unsigned long pid;
 	unsigned long addr;
-	int local = mm_is_thread_local(mm);
+	bool local;
 	unsigned long ap = mmu_get_ap(psize);
 	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
-
-	preempt_disable();
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto err_out;
+		return;
 
+	preempt_disable();
+	local = mm_is_thread_local(mm);
 	if (end == TLB_FLUSH_ALL ||
 	    (end - start) > tlb_single_page_flush_ceiling * page_size) {
 		if (local)
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		else
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
-		goto err_out;
-	}
-	for (addr = start; addr < end; addr += page_size) {
+	} else {
+		for (addr = start; addr < end; addr += page_size) {
 
-		if (local)
-			_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-		else
-			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+			if (local)
+				_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+			else
+				_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+		}
 	}
-err_out:
 	preempt_enable();
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
-	int local = mm_is_thread_local(mm);
 	unsigned long ap = mmu_get_ap(mmu_virtual_psize);
 	unsigned long pid, end;
-
+	bool local;
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
-		goto no_context;
+		return;
 
 	/* 4k page size, just blow the world */
 	if (PAGE_SIZE == 0x1000) {
@@ -369,6 +365,8 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		return;
 	}
 
+	preempt_disable();
+	local = mm_is_thread_local(mm);
 	/* Otherwise first do the PWC */
 	if (local)
 		_tlbiel_pid(pid, RIC_FLUSH_PWC);
@@ -383,7 +381,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		else
 			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
 	}
-no_context:
+
 	preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
cgit 


From 30b49ec798f0984b905fd94d1957d62530f08578 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 24 Oct 2017 23:06:54 +1000
Subject: powerpc/64s/radix: Fix process table entry cache invalidation

According to the architecture, the process table entry cache must be
flushed with tlbie RIC=2.

Currently the process table entry is set to invalid right before the
PID is returned to the allocator, with no invalidation. This works on
existing implementations that are known to not cache the process table
entry for any except the current PIDR.

It is architecturally correct and cleaner to invalidate with RIC=2
after clearing the process table entry and before the PID is returned
to the allocator. This can be done in arch_exit_mmap that runs before
the final flush, and to ensure the final flush (fullmm) is always a
RIC=2 variant.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/mmu_context.h |  4 ++++
 arch/powerpc/mm/mmu_context_book3s64.c | 25 ++++++++++++++++++++-----
 arch/powerpc/mm/tlb-radix.c            |  6 +++++-
 3 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index a0d7145d6cd2..20eae6f76247 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -164,9 +164,13 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
 {
 }
 
+#ifndef CONFIG_PPC_BOOK3S_64
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 }
+#else
+extern void arch_exit_mmap(struct mm_struct *mm);
+#endif
 
 static inline void arch_unmap(struct mm_struct *mm,
 			      struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 05e15386d4cb..6d724dab27c2 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -216,19 +216,34 @@ void destroy_context(struct mm_struct *mm)
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 	WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
 #endif
+	if (radix_enabled())
+		WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+	else
+		subpage_prot_free(mm);
+	destroy_pagetable_page(mm);
+	__destroy_context(mm->context.id);
+	mm->context.id = MMU_NO_CONTEXT;
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
 	if (radix_enabled()) {
 		/*
 		 * Radix doesn't have a valid bit in the process table
 		 * entries. However we know that at least P9 implementation
 		 * will avoid caching an entry with an invalid RTS field,
 		 * and 0 is invalid. So this will do.
+		 *
+		 * This runs before the "fullmm" tlb flush in exit_mmap,
+		 * which does a RIC=2 tlbie to clear the process table
+		 * entry. See the "fullmm" comments in tlb-radix.c.
+		 *
+		 * No barrier required here after the store because
+		 * this process will do the invalidate, which starts with
+		 * ptesync.
 		 */
 		process_tb[mm->context.id].prtb0 = 0;
-	} else
-		subpage_prot_free(mm);
-	destroy_pagetable_page(mm);
-	__destroy_context(mm->context.id);
-	mm->context.id = MMU_NO_CONTEXT;
+	}
 }
 
 #ifdef CONFIG_PPC_RADIX_MMU
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 67191fe63283..63e277b6e60c 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -298,10 +298,14 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	psize = radix_get_mmu_psize(page_size);
 	/*
 	 * if page size is not something we understand, do a full mm flush
+	 *
+	 * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+	 * that flushes the process table entry cache upon process teardown.
+	 * See the comment for radix in arch_exit_mmap().
 	 */
 	if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
 		radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
-	else if (tlb->need_flush_all) {
+	else if (tlb->fullmm || tlb->need_flush_all) {
 		tlb->need_flush_all = 0;
 		radix__flush_all_mm(mm);
 	} else
-- 
cgit 


From b8f89fea599d91e674497aad572613eb63181f31 Mon Sep 17 00:00:00 2001
From: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Date: Thu, 28 Sep 2017 20:19:20 -0400
Subject: powerpc/pseries/vio: Dispose of virq mapping on vdevice unregister

When a vdevice is DLPAR removed from the system the vio subsystem
doesn't bother unmapping the virq from the irq_domain. As a result we
have a virq mapped to a hardware irq that is no longer valid for the
irq_domain. A side effect is that we are left with /proc/irq/<irq#>
affinity entries, and attempts to modify the smp_affinity of the irq
will fail.

In the following observed example the kernel log is spammed by
ics_rtas_set_affinity errors after the removal of a VSCSI adapter.
This is a result of irqbalance trying to adjust the affinity every 10
seconds.

  rpadlpar_io: slot U8408.E8E.10A7ACV-V5-C25 removed
  ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3
  ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3

This patch fixes the issue by calling irq_dispose_mapping() on the
virq of the viodev on unregister.

Fixes: f2ab6219969f ("powerpc/pseries: Add PFO support to the VIO bus")
Signed-off-by: Tyrel Datwyler <tyreld@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/pseries/vio.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 12277bc9fd9e..d86938260a86 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1592,6 +1592,8 @@ ATTRIBUTE_GROUPS(vio_dev);
 void vio_unregister_device(struct vio_dev *viodev)
 {
 	device_unregister(&viodev->dev);
+	if (viodev->family == VDEVICE)
+		irq_dispose_mapping(viodev->irq);
 }
 EXPORT_SYMBOL(vio_unregister_device);
 
-- 
cgit 


From d6f934fd48803d9e58040e2cbab2feafe9bb9f01 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Wed, 27 Sep 2017 16:52:31 +1000
Subject: powerpc/powernv: Reserve a hole which appears after enabling IOV

In order to make generic IOV code work, the physical function IOV BAR
should start from offset of the first VF. Since M64 segments share
PE number space across PHB, and some PEs may be in use at the time
when IOV is enabled, the existing code shifts the IOV BAR to the index
of the first PE/VF. This creates a hole in IOMEM space which can be
potentially taken by some other device.

This reserves a temporary hole on a parent and releases it when IOV is
disabled; the temporary resources are stored in pci_dn to avoid
kmalloc/free.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/pci-bridge.h     |  1 +
 arch/powerpc/platforms/powernv/pci-ioda.c | 24 +++++++++++++++++++++---
 2 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 0b8aa1fe2d5f..62ed83db04ae 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -218,6 +218,7 @@ struct pci_dn {
 #endif
 	struct list_head child_list;
 	struct list_head list;
+	struct resource holes[PCI_SRIOV_NUM_BARS];
 };
 
 /* Get the pointer to a device_node's pci_dn */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index fb5cd7511189..7e87867984e7 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1002,9 +1002,12 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 	}
 
 	/*
-	 * After doing so, there would be a "hole" in the /proc/iomem when
-	 * offset is a positive value. It looks like the device return some
-	 * mmio back to the system, which actually no one could use it.
+	 * Since M64 BAR shares segments among all possible 256 PEs,
+	 * we have to shift the beginning of PF IOV BAR to make it start from
+	 * the segment which belongs to the PE number assigned to the first VF.
+	 * This creates a "hole" in the /proc/iomem which could be used for
+	 * allocating other resources so we reserve this area below and
+	 * release when IOV is released.
 	 */
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &dev->resource[i + PCI_IOV_RESOURCES];
@@ -1018,7 +1021,22 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 		dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
 			 i, &res2, res, (offset > 0) ? "En" : "Dis",
 			 num_vfs, offset);
+
+		if (offset < 0) {
+			devm_release_resource(&dev->dev, &pdn->holes[i]);
+			memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
+		}
+
 		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+
+		if (offset > 0) {
+			pdn->holes[i].start = res2.start;
+			pdn->holes[i].end = res2.start + size * offset - 1;
+			pdn->holes[i].flags = IORESOURCE_BUS;
+			pdn->holes[i].name = "pnv_iov_reserved";
+			devm_request_resource(&dev->dev, res->parent,
+					&pdn->holes[i]);
+		}
 	}
 	return 0;
 }
-- 
cgit 


From 6b148a7ce72a7f87c81cbcde48af014abc0516a9 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@c-s.fr>
Date: Wed, 18 Oct 2017 11:16:47 +0200
Subject: powerpc/ipic: Fix status get and status clear

IPIC Status is provided by register IPIC_SERSR and not by IPIC_SERMR
which is the mask register.

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/sysdev/ipic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c
index 16f1edd78c40..535cf1f6941c 100644
--- a/arch/powerpc/sysdev/ipic.c
+++ b/arch/powerpc/sysdev/ipic.c
@@ -846,12 +846,12 @@ void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq)
 
 u32 ipic_get_mcp_status(void)
 {
-	return ipic_read(primary_ipic->regs, IPIC_SERMR);
+	return ipic_read(primary_ipic->regs, IPIC_SERSR);
 }
 
 void ipic_clear_mcp_status(u32 mask)
 {
-	ipic_write(primary_ipic->regs, IPIC_SERMR, mask);
+	ipic_write(primary_ipic->regs, IPIC_SERSR, mask);
 }
 
 /* Return an interrupt vector or 0 if no interrupt is pending. */
-- 
cgit 


From 7f142661d41e5243c99c6ad2ff72aa8647f5eb87 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 16 Oct 2017 12:31:40 +0530
Subject: powerpc/mm/hash: Add pr_fmt() to hash_utils64.c

Make the printks look a bit nicer by adding a prefix.

Radix config now do
 radix-mmu: Page sizes from device-tree:
 radix-mmu: Page size shift = 12 AP=0x0
 radix-mmu: Page size shift = 16 AP=0x5
 radix-mmu: Page size shift = 21 AP=0x1
 radix-mmu: Page size shift = 30 AP=0x2

This patch update hash config to do similar dmesg output. With the patch we have

 hash-mmu: Page sizes from device-tree:
 hash-mmu: base_shift=12: shift=12, sllp=0x0000, avpnm=0x00000000, tlbiel=1, penc=0
 hash-mmu: base_shift=12: shift=16, sllp=0x0000, avpnm=0x00000000, tlbiel=1, penc=7
 hash-mmu: base_shift=12: shift=24, sllp=0x0000, avpnm=0x00000000, tlbiel=1, penc=56
 hash-mmu: base_shift=16: shift=16, sllp=0x0110, avpnm=0x00000000, tlbiel=1, penc=1
 hash-mmu: base_shift=16: shift=24, sllp=0x0110, avpnm=0x00000000, tlbiel=1, penc=8
 hash-mmu: base_shift=20: shift=20, sllp=0x0111, avpnm=0x00000000, tlbiel=0, penc=2
 hash-mmu: base_shift=24: shift=24, sllp=0x0100, avpnm=0x00000001, tlbiel=0, penc=0
 hash-mmu: base_shift=34: shift=34, sllp=0x0120, avpnm=0x000007ff, tlbiel=0, penc=3

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hash_utils_64.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 67ec2e927253..655a5a9a183d 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -21,6 +21,7 @@
 #undef DEBUG
 #undef DEBUG_LOW
 
+#define pr_fmt(fmt) "hash-mmu: " fmt
 #include <linux/spinlock.h>
 #include <linux/errno.h>
 #include <linux/sched/mm.h>
-- 
cgit 


From c1807e3f84668c4c1e838386fdc3f6e63df1156c Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 19 Oct 2017 15:08:19 +1100
Subject: powerpc/64: Free up CPU_FTR_ICSWX

The last user of CPU_FTR_ICSWX was removed in commit
6ff4d3e96652 ("powerpc: Remove old unused icswx based coprocessor
support"), so free the bit up for future use.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cputable.h | 6 +++---
 arch/powerpc/kernel/dt_cpu_ftrs.c   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index a9bf921f4efc..e188dd57bf51 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -206,7 +206,7 @@ enum {
 #define CPU_FTR_STCX_CHECKS_ADDRESS	LONG_ASM_CONST(0x0004000000000000)
 #define CPU_FTR_POPCNTB			LONG_ASM_CONST(0x0008000000000000)
 #define CPU_FTR_POPCNTD			LONG_ASM_CONST(0x0010000000000000)
-#define CPU_FTR_ICSWX			LONG_ASM_CONST(0x0020000000000000)
+/* Free					LONG_ASM_CONST(0x0020000000000000) */
 #define CPU_FTR_VMX_COPY		LONG_ASM_CONST(0x0040000000000000)
 #define CPU_FTR_TM			LONG_ASM_CONST(0x0080000000000000)
 #define CPU_FTR_CFAR			LONG_ASM_CONST(0x0100000000000000)
@@ -451,7 +451,7 @@ enum {
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
 	    CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | \
+	    CPU_FTR_CFAR | CPU_FTR_HVMODE | \
 	    CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR | CPU_FTR_DABRX)
 #define CPU_FTRS_POWER8 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
@@ -460,7 +460,7 @@ enum {
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
 	    CPU_FTR_DSCR | CPU_FTR_SAO  | \
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
+	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
 	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP)
 #define CPU_FTRS_POWER8E (CPU_FTRS_POWER8 | CPU_FTR_PMAO_BUG)
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 1df770e8cbe0..c7bfcacc2cb6 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -634,7 +634,7 @@ static struct dt_cpu_feature_match __initdata
 	{"no-execute", feat_enable, 0},
 	{"strong-access-ordering", feat_enable, CPU_FTR_SAO},
 	{"cache-inhibited-large-page", feat_enable_large_ci, 0},
-	{"coprocessor-icswx", feat_enable, CPU_FTR_ICSWX},
+	{"coprocessor-icswx", feat_enable, 0},
 	{"hypervisor-virtualization-interrupt", feat_enable_hvi, 0},
 	{"program-priority-register", feat_enable, CPU_FTR_HAS_PPR},
 	{"wait", feat_enable, 0},
-- 
cgit 


From 4e003747043d57aa75c9762fa148ef38afe68dd8 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 19 Oct 2017 15:08:43 +1100
Subject: powerpc/64s: Replace CONFIG_PPC_STD_MMU_64 with CONFIG_PPC_BOOK3S_64

CONFIG_PPC_STD_MMU_64 indicates support for the "standard" powerpc MMU
on 64-bit CPUs. The "standard" MMU refers to the hash page table MMU
found in "server" processors, from IBM mainly.

Currently CONFIG_PPC_STD_MMU_64 is == CONFIG_PPC_BOOK3S_64. While it's
annoying to have two symbols that always have the same value, it's not
quite annoying enough to bother removing one.

However with the arrival of Power9, we now have the situation where
CONFIG_PPC_STD_MMU_64 is enabled, but the kernel is running using the
Radix MMU - *not* the "standard" MMU. So it is now actively confusing
to use it, because it implies that code is disabled or inactive when
the Radix MMU is in use, however that is not necessarily true.

So s/CONFIG_PPC_STD_MMU_64/CONFIG_PPC_BOOK3S_64/, and do some minor
formatting updates of some of the affected lines.

This will be a pain for backports, but c'est la vie.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig                         |  6 +++---
 arch/powerpc/include/asm/nohash/64/pgtable.h |  2 +-
 arch/powerpc/include/asm/paca.h              | 10 +++++-----
 arch/powerpc/include/asm/page_64.h           |  6 +++---
 arch/powerpc/include/asm/pgtable-be-types.h  |  2 +-
 arch/powerpc/include/asm/pgtable-types.h     |  4 ++--
 arch/powerpc/include/asm/tlbflush.h          |  2 +-
 arch/powerpc/kernel/asm-offsets.c            |  4 ++--
 arch/powerpc/kernel/entry_64.S               |  4 ++--
 arch/powerpc/kernel/exceptions-64s.S         |  8 ++++----
 arch/powerpc/kernel/machine_kexec_64.c       |  4 ++--
 arch/powerpc/kernel/mce_power.c              |  4 ++--
 arch/powerpc/kernel/paca.c                   | 12 ++++++------
 arch/powerpc/kernel/pci_64.c                 |  4 ++--
 arch/powerpc/kernel/process.c                | 12 ++++++------
 arch/powerpc/kernel/prom.c                   |  2 +-
 arch/powerpc/kernel/setup-common.c           |  4 ++--
 arch/powerpc/mm/Makefile                     |  6 +++---
 arch/powerpc/mm/dump_hashpagetable.c         |  2 +-
 arch/powerpc/mm/dump_linuxpagetables.c       | 10 +++++-----
 arch/powerpc/mm/init_64.c                    |  8 ++++----
 arch/powerpc/mm/pgtable_64.c                 |  2 +-
 arch/powerpc/platforms/Kconfig.cputype       |  6 +-----
 arch/powerpc/platforms/pseries/lpar.c        |  8 ++++----
 arch/powerpc/platforms/pseries/lparcfg.c     |  2 +-
 arch/powerpc/xmon/xmon.c                     |  6 +++---
 26 files changed, 68 insertions(+), 72 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 809c468edab1..b8b75b423316 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -334,7 +334,7 @@ config PPC_OF_PLATFORM_PCI
 	default n
 
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
-	depends on PPC32 || PPC_STD_MMU_64
+	depends on PPC32 || PPC_BOOK3S_64
 	def_bool y
 
 config ARCH_SUPPORTS_UPROBES
@@ -721,7 +721,7 @@ config PPC_16K_PAGES
 
 config PPC_64K_PAGES
 	bool "64k page size"
-	depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64)
+	depends on !PPC_FSL_BOOK3E && (44x || PPC_BOOK3S_64 || PPC_BOOK3E_64)
 	select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
 
 config PPC_256K_PAGES
@@ -780,7 +780,7 @@ config FORCE_MAX_ZONEORDER
 
 config PPC_SUBPAGE_PROT
 	bool "Support setting protections for 4k subpages"
-	depends on PPC_STD_MMU_64 && PPC_64K_PAGES
+	depends on PPC_BOOK3S_64 && PPC_64K_PAGES
 	help
 	  This option adds support for a system call to allow user programs
 	  to set access permissions (read/write, readonly, or no access)
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index f0ff384d4ca5..b1c42ac54d96 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -203,7 +203,7 @@ static inline unsigned long pte_update(struct mm_struct *mm,
 	if (!huge)
 		assert_pte_locked(mm, addr);
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	if (old & _PAGE_HASHPTE)
 		hpte_need_flush(mm, addr, ptep, old, huge);
 #endif
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 1e06310ccc09..c907ae23c956 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -91,14 +91,14 @@ struct paca_struct {
 	u8 cpu_start;			/* At startup, processor spins until */
 					/* this becomes non-zero. */
 	u8 kexec_state;		/* set when kexec down has irqs off */
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	struct slb_shadow *slb_shadow_ptr;
 	struct dtl_entry *dispatch_log;
 	struct dtl_entry *dispatch_log_end;
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif
 	u64 dscr_default;		/* per-CPU default DSCR */
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	/*
 	 * Now, starting in cacheline 2, the exception save areas
 	 */
@@ -110,7 +110,7 @@ struct paca_struct {
 	u16 vmalloc_sllp;
 	u16 slb_cache_ptr;
 	u32 slb_cache[SLB_CACHE_ENTRIES];
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 #ifdef CONFIG_PPC_BOOK3E
 	u64 exgen[8] __aligned(0x40);
@@ -192,7 +192,7 @@ struct paca_struct {
 	struct stop_sprs stop_sprs;
 #endif
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	/* Non-maskable exceptions that are not performance critical */
 	u64 exnmi[EX_SIZE];	/* used for system reset (nmi) */
 	u64 exmc[EX_SIZE];	/* used for machine checks */
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index c4d9654bd637..56234c6fcd61 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -117,21 +117,21 @@ extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
 #endif /* __ASSEMBLY__ */
 #else
 #define slice_init()
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 #define get_slice_psize(mm, addr)	((mm)->context.user_psize)
 #define slice_set_user_psize(mm, psize)		\
 do {						\
 	(mm)->context.user_psize = (psize);	\
 	(mm)->context.sllp = SLB_VSID_USER | mmu_psize_defs[(psize)].sllp; \
 } while (0)
-#else /* CONFIG_PPC_STD_MMU_64 */
+#else /* !CONFIG_PPC_BOOK3S_64 */
 #ifdef CONFIG_PPC_64K_PAGES
 #define get_slice_psize(mm, addr)	MMU_PAGE_64K
 #else /* CONFIG_PPC_64K_PAGES */
 #define get_slice_psize(mm, addr)	MMU_PAGE_4K
 #endif /* !CONFIG_PPC_64K_PAGES */
 #define slice_set_user_psize(mm, psize)	do { BUG(); } while(0)
-#endif /* !CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 #define slice_set_range_psize(mm, start, len, psize)	\
 	slice_set_user_psize((mm), (psize))
diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h
index 67e7e3d990f4..03ab638593e1 100644
--- a/arch/powerpc/include/asm/pgtable-be-types.h
+++ b/arch/powerpc/include/asm/pgtable-be-types.h
@@ -76,7 +76,7 @@ typedef struct { unsigned long pgprot; } pgprot_t;
  * With hash config 64k pages additionally define a bigger "real PTE" type that
  * gathers the "second half" part of the PTE for pseudo 64k pages
  */
-#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
+#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64)
 typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
 #else
 typedef struct { pte_t pte; } real_pte_t;
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
index 369a164b545c..5d2c38d26396 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -49,13 +49,13 @@ typedef struct { unsigned long pgprot; } pgprot_t;
  * With hash config 64k pages additionally define a bigger "real PTE" type that
  * gathers the "second half" part of the PTE for pseudo 64k pages
  */
-#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
+#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64)
 typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
 #else
 typedef struct { pte_t pte; } real_pte_t;
 #endif
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 #include <asm/cmpxchg.h>
 
 static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new)
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
index 13dbcd41885e..7d5a157c7832 100644
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -77,7 +77,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
 	flush_tlb_mm(mm);
 }
 
-#elif defined(CONFIG_PPC_STD_MMU_64)
+#elif defined(CONFIG_PPC_BOOK3S_64)
 #include <asm/book3s/64/tlbflush.h>
 #else
 #error Unsupported MMU type
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8cfb20e38cfe..200623e71474 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -208,7 +208,7 @@ int main(void)
 	OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first);
 #endif /* CONFIG_PPC_BOOK3E */
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	OFFSET(PACASLBCACHE, paca_struct, slb_cache);
 	OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr);
 	OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp);
@@ -230,7 +230,7 @@ int main(void)
 	OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx);
 	OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count);
 	OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx);
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 	OFFSET(PACAEMERGSP, paca_struct, emergency_sp);
 #ifdef CONFIG_PPC_BOOK3S_64
 	OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 4a0fd4f40245..3320bcac7192 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -539,7 +539,7 @@ _GLOBAL(_switch)
 	std	r6,PACACURRENT(r13)	/* Set new 'current' */
 
 	ld	r8,KSP(r4)	/* new stack pointer */
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 BEGIN_MMU_FTR_SECTION
 	b	2f
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
@@ -588,7 +588,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	slbmte	r7,r0
 	isync
 2:
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 	CURRENT_THREAD_INFO(r7, r8)  /* base of new stack */
 	/* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 64c2bc35e4ff..6879aed47377 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -607,7 +607,7 @@ EXC_COMMON_BEGIN(slb_miss_common)
 	cmpdi	cr5,r11,MSR_RI
 
 	crset	4*cr0+eq
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 BEGIN_MMU_FTR_SECTION
 	bl	slb_allocate
 END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
@@ -1497,7 +1497,7 @@ USE_TEXT_SECTION()
  */
 	.balign	IFETCH_ALIGN_BYTES
 do_hash_page:
-	#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	lis	r0,DSISR_BAD_FAULT_64S@h
 	ori	r0,r0,DSISR_BAD_FAULT_64S@l
 	and.	r0,r4,r0		/* weird error? */
@@ -1528,7 +1528,7 @@ do_hash_page:
 
 	/* Reload DSISR into r4 for the DABR check below */
 	ld      r4,_DSISR(r1)
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 /* Here we have a page fault that hash_page can't handle. */
 handle_page_fault:
@@ -1557,7 +1557,7 @@ handle_dabr_fault:
 12:	b       ret_from_except_lite
 
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 /* We have a page fault that hash_page could handle but HV refused
  * the PTE insertion
  */
diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c
index 5c12e21d0d1a..49d34d7271e7 100644
--- a/arch/powerpc/kernel/machine_kexec_64.c
+++ b/arch/powerpc/kernel/machine_kexec_64.c
@@ -360,7 +360,7 @@ void default_machine_kexec(struct kimage *image)
 	/* NOTREACHED */
 }
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 /* Values we need to export to the second kernel via the device tree. */
 static unsigned long htab_base;
 static unsigned long htab_size;
@@ -402,4 +402,4 @@ static int __init export_htab_values(void)
 	return 0;
 }
 late_initcall(export_htab_values);
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 82cabfc7c2f6..1600ab194ff9 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -168,7 +168,7 @@ void __flush_tlb_power9(unsigned int action)
 
 
 /* flush SLBs and reload */
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 static void flush_and_reload_slb(void)
 {
 	struct slb_shadow *slb;
@@ -215,7 +215,7 @@ static void flush_erat(void)
 
 static int mce_flush(int what)
 {
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	if (what == MCE_FLUSH_SLB) {
 		flush_and_reload_slb();
 		return 1;
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 2ff2b8a19f71..5d38d5ea9a24 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -90,7 +90,7 @@ static inline void free_lppacas(void) { }
 
 #endif /* CONFIG_PPC_BOOK3S */
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 
 /*
  * 3 persistent SLBs are registered here.  The buffer will be zero
@@ -135,11 +135,11 @@ static struct slb_shadow * __init init_slb_shadow(int cpu)
 	return s;
 }
 
-#else /* CONFIG_PPC_STD_MMU_64 */
+#else /* !CONFIG_PPC_BOOK3S_64 */
 
 static void __init allocate_slb_shadows(int nr_cpus, int limit) { }
 
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 /* The Paca is an array with one entry per processor.  Each contains an
  * lppaca, which contains the information shared between the
@@ -170,9 +170,9 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
 	new_paca->kexec_state = KEXEC_STATE_NONE;
 	new_paca->__current = &init_task;
 	new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL;
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	new_paca->slb_shadow_ptr = init_slb_shadow(cpu);
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif
 
 #ifdef CONFIG_PPC_BOOK3E
 	/* For now -- if we have threads this will be adjusted later */
@@ -271,7 +271,7 @@ void copy_mm_to_paca(struct mm_struct *mm)
 	get_paca()->mm_ctx_user_psize = context->user_psize;
 	get_paca()->mm_ctx_sllp = context->sllp;
 #endif
-#else /* CONFIG_PPC_BOOK3S */
+#else /* !CONFIG_PPC_BOOK3S */
 	return;
 #endif
 }
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 932b9741aa8f..15ce0306b092 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -90,14 +90,14 @@ int pcibios_unmap_io_space(struct pci_bus *bus)
 	 * to do an appropriate TLB flush here too
 	 */
 	if (bus->self) {
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 		struct resource *res = bus->resource[0];
 #endif
 
 		pr_debug("IO unmapping for PCI-PCI bridge %s\n",
 			 pci_name(bus->self));
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 		__flush_hash_table_range(&init_mm, res->start + _IO_BASE,
 					 res->end + _IO_BASE + 1);
 #endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index c051dc2b42ad..2b602d60a9c7 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1164,7 +1164,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	}
 #endif /* CONFIG_PPC64 */
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	batch = this_cpu_ptr(&ppc64_tlb_batch);
 	if (batch->active) {
 		current_thread_info()->local_flags |= _TLF_LAZY_MMU;
@@ -1172,7 +1172,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 			__flush_tlb_pending(batch);
 		batch->active = 0;
 	}
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 	switch_booke_debug_regs(&new->thread.debug);
@@ -1218,7 +1218,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 
 	last = _switch(old_thread, new_thread);
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
 		current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
 		batch = this_cpu_ptr(&ppc64_tlb_batch);
@@ -1247,7 +1247,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
 					: : "r"(dummy_copy_buffer), "r"(0));
 		}
 	}
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 	return last;
 }
@@ -1476,7 +1476,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 
 static void setup_ksp_vsid(struct task_struct *p, unsigned long sp)
 {
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	unsigned long sp_vsid;
 	unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
 
@@ -2056,7 +2056,7 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
 	unsigned long base = mm->brk;
 	unsigned long ret;
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	/*
 	 * If we are using 1TB segments and we are allowed to randomise
 	 * the heap, we can put it above 1TB so it is backed by a 1TB
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 101822be525a..b15bae265c90 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -229,7 +229,7 @@ static void __init check_cpu_pa_features(unsigned long node)
 		      ibm_pa_features, ARRAY_SIZE(ibm_pa_features));
 }
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 static void __init init_mmu_slb_size(unsigned long node)
 {
 	const __be32 *slb_size_ptr;
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 0ac741fae90e..da4fd54e6914 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -773,7 +773,7 @@ void arch_setup_pdev_archdata(struct platform_device *pdev)
 static __init void print_system_info(void)
 {
 	pr_info("-----------------------------------------------------\n");
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	pr_info("ppc64_pft_size    = 0x%llx\n", ppc64_pft_size);
 #endif
 #ifdef CONFIG_PPC_STD_MMU_32
@@ -800,7 +800,7 @@ static __init void print_system_info(void)
 	pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features);
 #endif
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	if (htab_address)
 		pr_info("htab_address      = 0x%p\n", htab_address);
 	if (htab_hash_mask)
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index fb844d2f266e..915178423c3a 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -14,11 +14,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(BITS)e.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)   += pgtable-book3e.o
-obj-$(CONFIG_PPC_STD_MMU_64)	+= pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
 obj-$(CONFIG_PPC_RADIX_MMU)	+= pgtable-radix.o tlb-radix.o
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
 obj-$(CONFIG_PPC_STD_MMU)	+= tlb_hash$(BITS).o
-ifeq ($(CONFIG_PPC_STD_MMU_64),y)
+ifeq ($(CONFIG_PPC_BOOK3S_64),y)
 obj-$(CONFIG_PPC_4K_PAGES)	+= hash64_4k.o
 obj-$(CONFIG_PPC_64K_PAGES)	+= hash64_64k.o
 endif
@@ -31,7 +31,7 @@ obj-$(CONFIG_PPC_SPLPAR)	+= vphn.o
 obj-$(CONFIG_PPC_MM_SLICES)	+= slice.o
 obj-y				+= hugetlbpage.o
 ifeq ($(CONFIG_HUGETLB_PAGE),y)
-obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= hugetlbpage-hash64.o
 obj-$(CONFIG_PPC_RADIX_MMU)	+= hugetlbpage-radix.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index 5c4c93dcff19..14cfb11b09d0 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -500,7 +500,7 @@ static void populate_markers(void)
 	address_markers[6].start_address = PHB_IO_END;
 	address_markers[7].start_address = IOREMAP_BASE;
 	address_markers[8].start_address = IOREMAP_END;
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	address_markers[9].start_address =  H_VMEMMAP_BASE;
 #else
 	address_markers[9].start_address =  VMEMMAP_BASE;
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index c9282d27b203..c2e7dea59490 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -112,7 +112,7 @@ struct flag_info {
 
 static const struct flag_info flag_array[] = {
 	{
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 		.mask	= _PAGE_PRIVILEGED,
 		.val	= 0,
 #else
@@ -147,7 +147,7 @@ static const struct flag_info flag_array[] = {
 		.set	= "present",
 		.clear	= "       ",
 	}, {
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 		.mask	= H_PAGE_HASHPTE,
 		.val	= H_PAGE_HASHPTE,
 #else
@@ -157,7 +157,7 @@ static const struct flag_info flag_array[] = {
 		.set	= "hpte",
 		.clear	= "    ",
 	}, {
-#ifndef CONFIG_PPC_STD_MMU_64
+#ifndef CONFIG_PPC_BOOK3S_64
 		.mask	= _PAGE_GUARDED,
 		.val	= _PAGE_GUARDED,
 		.set	= "guarded",
@@ -174,7 +174,7 @@ static const struct flag_info flag_array[] = {
 		.set	= "accessed",
 		.clear	= "        ",
 	}, {
-#ifndef CONFIG_PPC_STD_MMU_64
+#ifndef CONFIG_PPC_BOOK3S_64
 		.mask	= _PAGE_WRITETHRU,
 		.val	= _PAGE_WRITETHRU,
 		.set	= "write through",
@@ -450,7 +450,7 @@ static void populate_markers(void)
 	address_markers[i++].start_address = PHB_IO_END;
 	address_markers[i++].start_address = IOREMAP_BASE;
 	address_markers[i++].start_address = IOREMAP_END;
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	address_markers[i++].start_address =  H_VMEMMAP_BASE;
 #else
 	address_markers[i++].start_address =  VMEMMAP_BASE;
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 588a521966ec..12b3025fadac 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -68,11 +68,11 @@
 
 #include "mmu_decl.h"
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 #if H_PGTABLE_RANGE > USER_VSID_RANGE
 #warning Limited user VSID range means pagetable space is wasted
 #endif
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 phys_addr_t memstart_addr = ~0;
 EXPORT_SYMBOL_GPL(memstart_addr);
@@ -367,7 +367,7 @@ EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
 
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 static bool disable_radix;
 static int __init parse_disable_radix(char *p)
 {
@@ -444,4 +444,4 @@ void __init mmu_early_init_devtree(void)
 	else
 		hash__early_init_devtree();
 }
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index ac0717a90ca6..2851282edcd4 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -57,7 +57,7 @@
 
 #include "mmu_decl.h"
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
 #error TASK_SIZE_USER64 exceeds user VSID range
 #endif
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 13663efc1d31..f8928ee85f6b 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -294,10 +294,6 @@ config PPC_STD_MMU_32
 	def_bool y
 	depends on PPC_STD_MMU && PPC32
 
-config PPC_STD_MMU_64
-	def_bool y
-	depends on PPC_STD_MMU && PPC64
-
 config PPC_RADIX_MMU
 	bool "Radix MMU Support"
 	depends on PPC_BOOK3S_64
@@ -323,7 +319,7 @@ config PPC_BOOK3E_MMU
 
 config PPC_MM_SLICES
 	bool
-	default y if PPC_STD_MMU_64
+	default y if PPC_BOOK3S_64
 	default n
 
 config PPC_HAVE_PMU_SUPPORT
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 495ba4e7336d..0ee4a469a4ae 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -93,7 +93,7 @@ void vpa_init(int cpu)
 		return;
 	}
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	/*
 	 * PAPR says this feature is SLB-Buffer but firmware never
 	 * reports that.  All SPLPAR support SLB shadow buffer.
@@ -106,7 +106,7 @@ void vpa_init(int cpu)
 			       "cpu %d (hw %d) of area %lx failed with %ld\n",
 			       cpu, hwcpu, addr, ret);
 	}
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 	/*
 	 * Register dispatch trace log, if one has been allocated.
@@ -129,7 +129,7 @@ void vpa_init(int cpu)
 	}
 }
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 
 static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 				     unsigned long vpn, unsigned long pa,
@@ -824,7 +824,7 @@ void arch_free_page(struct page *page, int order)
 EXPORT_SYMBOL(arch_free_page);
 
 #endif /* CONFIG_PPC_SMLPAR */
-#endif /* CONFIG_PPC_STD_MMU_64 */
+#endif /* CONFIG_PPC_BOOK3S_64 */
 
 #ifdef CONFIG_TRACEPOINTS
 #ifdef HAVE_JUMP_LABEL
diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c
index 779fc2a1c8f7..b2706c483067 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -485,7 +485,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
 	seq_printf(m, "shared_processor_mode=%d\n",
 		   lppaca_shared_proc(get_lppaca()));
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	seq_printf(m, "slb_size=%d\n", mmu_slb_size);
 #endif
 	parse_em_data(m);
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 83e7faf5b3d3..6b1236b85622 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2312,7 +2312,7 @@ static void dump_tracing(void)
 static void dump_one_paca(int cpu)
 {
 	struct paca_struct *p;
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	int i = 0;
 #endif
 
@@ -2353,7 +2353,7 @@ static void dump_one_paca(int cpu)
 	DUMP(p, hw_cpu_id, "x");
 	DUMP(p, cpu_start, "x");
 	DUMP(p, kexec_state, "x");
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 	for (i = 0; i < SLB_NUM_BOLTED; i++) {
 		u64 esid, vsid;
 
@@ -3263,7 +3263,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid,
 	printf("%s", after);
 }
 
-#ifdef CONFIG_PPC_STD_MMU_64
+#ifdef CONFIG_PPC_BOOK3S_64
 void dump_segments(void)
 {
 	int i;
-- 
cgit 


From 1fd6c02207107c8892219dacef01de7ced3d4ce7 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 24 Oct 2017 17:48:49 +0200
Subject: powerpc/mm: Add a CONFIG option to choose if radix is used by default

Currently if the hardware supports the radix MMU we will use
it, *unless* "disable_radix" is passed on the kernel command line.

However some users would like the reverse semantics. ie. The kernel
uses the hash MMU by default, unless radix is explicitly requested on
the command line.

So add a CONFIG option to choose whether we use radix by default or
not, and expand the disable_radix command line option to allow
"disable_radix=no" which *enables* radix.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/init_64.c              | 13 +++++++++++--
 arch/powerpc/platforms/Kconfig.cputype | 13 +++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 12b3025fadac..a07722531b32 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -368,10 +368,19 @@ EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
 #ifdef CONFIG_PPC_BOOK3S_64
-static bool disable_radix;
+static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
+
 static int __init parse_disable_radix(char *p)
 {
-	disable_radix = true;
+	bool val;
+
+	if (strlen(p) == 0)
+		val = true;
+	else if (kstrtobool(p, &val))
+		return -EINVAL;
+
+	disable_radix = val;
+
 	return 0;
 }
 early_param("disable_radix", parse_disable_radix);
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index f8928ee85f6b..596bd9091478 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -304,6 +304,19 @@ config PPC_RADIX_MMU
 	  is only implemented by IBM Power9 CPUs, if you don't have one of them
 	  you can probably disable this.
 
+config PPC_RADIX_MMU_DEFAULT
+	bool "Default to using the Radix MMU when possible"
+	depends on PPC_RADIX_MMU
+	default y
+	help
+	  When the hardware supports the Radix MMU, default to using it unless
+	  "disable_radix[=yes]" is specified on the kernel command line.
+
+	  If this option is disabled, the Hash MMU will be used by default,
+	  unless "disable_radix=no" is specified on the kernel command line.
+
+	  If you're unsure, say Y.
+
 config ARCH_ENABLE_HUGEPAGE_MIGRATION
 	def_bool y
 	depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION
-- 
cgit 


From 632f0574167ad3f5d646dad6af87d976a703d93e Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Thu, 12 Oct 2017 15:45:25 +1100
Subject: powerpc/tm: Don't check for WARN in TM Bad Thing handling

Currently when we take a TM Bad Thing program check exception, we
search the bug table to see if the program check was generated by a
WARN/WARN_ON etc.

That makes no sense, the WARN macros use trap instructions, which
should never generate a TM Bad Thing exception. If they ever did that
would be a bug and we should oops.

We do have some hand-coded bugs in tm.S, using EMIT_BUG_ENTRY, but
those are all BUGs not WARNs, and they all use trap instructions
anyway. Almost certainly this check was incorrectly copied from the
REASON_TRAP handling in the same function.

Remove it.

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Acked-By: Michael Neuling <mikey@neuling.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/traps.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 9ae1924c7d1a..0e4099fef198 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1337,13 +1337,8 @@ void program_check_exception(struct pt_regs *regs)
 		 * -  A treclaim is attempted when non transactional.
 		 * -  A tend is illegally attempted.
 		 * -  writing a TM SPR when transactional.
-		 */
-		if (!user_mode(regs) &&
-		    report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
-			regs->nip += 4;
-			goto bail;
-		}
-		/* If usermode caused this, it's done something illegal and
+		 *
+		 * If usermode caused this, it's done something illegal and
 		 * gets a SIGILL slap on the wrist.  We call it an illegal
 		 * operand to distinguish from the instruction just being bad
 		 * (e.g. executing a 'tend' on a CPU without TM!); it's an
-- 
cgit 


From ac0761ebcb08830d8f64b9181f6736b1a00bf746 Mon Sep 17 00:00:00 2001
From: Sandipan Das <sandipan@linux.vnet.ibm.com>
Date: Sat, 2 Sep 2017 00:23:01 +0530
Subject: bpf: take advantage of stack_depth tracking in powerpc JIT

Take advantage of stack_depth tracking, originally introduced for
x64, in powerpc JIT as well. Round up allocated stack by 16 bytes
to make sure it stays aligned for functions called from JITed bpf
program.

Signed-off-by: Sandipan Das <sandipan@linux.vnet.ibm.com>
Reviewed-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/net/bpf_jit64.h      |  7 ++++---
 arch/powerpc/net/bpf_jit_comp64.c | 16 ++++++++++------
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h
index 62fa7589db2b..8bdef7ed28a8 100644
--- a/arch/powerpc/net/bpf_jit64.h
+++ b/arch/powerpc/net/bpf_jit64.h
@@ -23,7 +23,7 @@
  *		[   nv gpr save area	] 8*8		|
  *		[    tail_call_cnt	] 8		|
  *		[    local_tmp_var	] 8		|
- * fp (r31) -->	[   ebpf stack space	] 512		|
+ * fp (r31) -->	[   ebpf stack space	] upto 512	|
  *		[     frame header	] 32/112	|
  * sp (r1) --->	[    stack pointer	] --------------
  */
@@ -32,8 +32,8 @@
 #define BPF_PPC_STACK_SAVE	(8*8)
 /* for bpf JIT code internal usage */
 #define BPF_PPC_STACK_LOCALS	16
-/* Ensure this is quadword aligned */
-#define BPF_PPC_STACKFRAME	(STACK_FRAME_MIN_SIZE + MAX_BPF_STACK + \
+/* stack frame excluding BPF stack, ensure this is quadword aligned */
+#define BPF_PPC_STACKFRAME	(STACK_FRAME_MIN_SIZE + \
 				 BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE)
 
 #ifndef __ASSEMBLY__
@@ -103,6 +103,7 @@ struct codegen_context {
 	 */
 	unsigned int seen;
 	unsigned int idx;
+	unsigned int stack_size;
 };
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index a66e64b0b251..46d74e81aff1 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -69,7 +69,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx)
 static int bpf_jit_stack_local(struct codegen_context *ctx)
 {
 	if (bpf_has_stack_frame(ctx))
-		return STACK_FRAME_MIN_SIZE + MAX_BPF_STACK;
+		return STACK_FRAME_MIN_SIZE + ctx->stack_size;
 	else
 		return -(BPF_PPC_STACK_SAVE + 16);
 }
@@ -82,8 +82,9 @@ static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx)
 static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg)
 {
 	if (reg >= BPF_PPC_NVR_MIN && reg < 32)
-		return (bpf_has_stack_frame(ctx) ? BPF_PPC_STACKFRAME : 0)
-							- (8 * (32 - reg));
+		return (bpf_has_stack_frame(ctx) ?
+			(BPF_PPC_STACKFRAME + ctx->stack_size) : 0)
+				- (8 * (32 - reg));
 
 	pr_err("BPF JIT is asking about unknown registers");
 	BUG();
@@ -134,7 +135,7 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
 			PPC_BPF_STL(0, 1, PPC_LR_STKOFF);
 		}
 
-		PPC_BPF_STLU(1, 1, -BPF_PPC_STACKFRAME);
+		PPC_BPF_STLU(1, 1, -(BPF_PPC_STACKFRAME + ctx->stack_size));
 	}
 
 	/*
@@ -161,7 +162,7 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx)
 	/* Setup frame pointer to point to the bpf stack area */
 	if (bpf_is_seen_register(ctx, BPF_REG_FP))
 		PPC_ADDI(b2p[BPF_REG_FP], 1,
-				STACK_FRAME_MIN_SIZE + MAX_BPF_STACK);
+				STACK_FRAME_MIN_SIZE + ctx->stack_size);
 }
 
 static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx)
@@ -183,7 +184,7 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx
 
 	/* Tear down our stack frame */
 	if (bpf_has_stack_frame(ctx)) {
-		PPC_ADDI(1, 1, BPF_PPC_STACKFRAME);
+		PPC_ADDI(1, 1, BPF_PPC_STACKFRAME + ctx->stack_size);
 		if (ctx->seen & SEEN_FUNC) {
 			PPC_BPF_LL(0, 1, PPC_LR_STKOFF);
 			PPC_MTLR(0);
@@ -1013,6 +1014,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 
 	memset(&cgctx, 0, sizeof(struct codegen_context));
 
+	/* Make sure that the stack is quadword aligned. */
+	cgctx.stack_size = round_up(fp->aux->stack_depth, 16);
+
 	/* Scouting faux-generate pass 0 */
 	if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) {
 		/* We hit something illegal or unsupported. */
-- 
cgit 


From edfd17ff39bce59886e8249e645d6ecfc3e3829c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Sat, 4 Nov 2017 22:26:52 +0100
Subject: powerpc/eeh: Stop using do_gettimeofday()

This interface is inefficient and deprecated because of the y2038
overflow.

ktime_get_seconds() is an appropriate replacement here, since it
has sufficient granularity but is more efficient and uses monotonic
time.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Acked-by: Russell Currey <ruscur@russell.cc>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/eeh.h   | 2 +-
 arch/powerpc/kernel/eeh_driver.c | 2 +-
 arch/powerpc/kernel/eeh_pe.c     | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index cd1df96335e5..5161c37dd039 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -93,7 +93,7 @@ struct eeh_pe {
 	struct pci_bus *bus;		/* Top PCI bus for bus PE	*/
 	int check_count;		/* Times of ignored error	*/
 	int freeze_count;		/* Times of froze up		*/
-	struct timeval tstamp;		/* Time on first-time freeze	*/
+	time64_t tstamp;		/* Time on first-time freeze	*/
 	int false_positives;		/* Times of reported #ff's	*/
 	atomic_t pass_dev_cnt;		/* Count of passed through devs	*/
 	struct eeh_pe *parent;		/* Parent PE			*/
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 8b840191df59..57d54163bf94 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -623,7 +623,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
 				struct eeh_rmv_data *rmv_data)
 {
 	struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
-	struct timeval tstamp;
+	time64_t tstamp;
 	int cnt, rc;
 	struct eeh_dev *edev;
 
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 2e8d1b2b5af4..2d4956e97aa9 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -526,16 +526,16 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
  */
 void eeh_pe_update_time_stamp(struct eeh_pe *pe)
 {
-	struct timeval tstamp;
+	time64_t tstamp;
 
 	if (!pe) return;
 
 	if (pe->freeze_count <= 0) {
 		pe->freeze_count = 0;
-		do_gettimeofday(&pe->tstamp);
+		pe->tstamp = ktime_get_seconds();
 	} else {
-		do_gettimeofday(&tstamp);
-		if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) {
+		tstamp = ktime_get_seconds();
+		if (tstamp - pe->tstamp > 3600) {
 			pe->tstamp = tstamp;
 			pe->freeze_count = 0;
 		}
-- 
cgit 


From 71e24d7731a2903b1ae2bba2b2971c654d9c2aa6 Mon Sep 17 00:00:00 2001
From: "William A. Kennington III" <wak@google.com>
Date: Fri, 22 Sep 2017 16:58:00 -0700
Subject: powerpc/opal: Fix EBUSY bug in acquiring tokens

The current code checks the completion map to look for the first token
that is complete. In some cases, a completion can come in but the
token can still be on lease to the caller processing the completion.
If this completed but unreleased token is the first token found in the
bitmap by another tasks trying to acquire a token, then the
__test_and_set_bit call will fail since the token will still be on
lease. The acquisition will then fail with an EBUSY.

This patch reorganizes the acquisition code to look at the
opal_async_token_map for an unleased token. If the token has no lease
it must have no outstanding completions so we should never see an
EBUSY, unless we have leased out too many tokens. Since
opal_async_get_token_inrerruptible is protected by a semaphore, we
will practically never see EBUSY anymore.

Fixes: 8d7248232208 ("powerpc/powernv: Infrastructure to support OPAL async completion")
Signed-off-by: William A. Kennington III <wak@google.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-async.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index cf33769a7b72..45b3feb8aa2f 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -39,18 +39,18 @@ int __opal_async_get_token(void)
 	int token;
 
 	spin_lock_irqsave(&opal_async_comp_lock, flags);
-	token = find_first_bit(opal_async_complete_map, opal_max_async_tokens);
+	token = find_first_zero_bit(opal_async_token_map, opal_max_async_tokens);
 	if (token >= opal_max_async_tokens) {
 		token = -EBUSY;
 		goto out;
 	}
 
-	if (__test_and_set_bit(token, opal_async_token_map)) {
+	if (!__test_and_clear_bit(token, opal_async_complete_map)) {
 		token = -EBUSY;
 		goto out;
 	}
 
-	__clear_bit(token, opal_async_complete_map);
+	__set_bit(token, opal_async_token_map);
 
 out:
 	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
-- 
cgit 


From 59cf9a1cfcd9de6392d218fcd69413f2e77babbe Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Fri, 3 Nov 2017 13:41:41 +1100
Subject: powerpc/opal: Make __opal_async_{get, release}_token() static

There are no callers of both __opal_async_get_token() and
__opal_async_release_token().

This patch also removes the possibility of "emergency through
synchronous call to __opal_async_get_token()" as such it makes more
sense to initialise opal_sync_sem for the maximum number of async
tokens.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/opal.h             |  2 --
 arch/powerpc/platforms/powernv/opal-async.c | 10 +++-------
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 04c32b08ffa1..00aae3189311 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -306,9 +306,7 @@ extern void opal_notifier_enable(void);
 extern void opal_notifier_disable(void);
 extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val);
 
-extern int __opal_async_get_token(void);
 extern int opal_async_get_token_interruptible(void);
-extern int __opal_async_release_token(int token);
 extern int opal_async_release_token(int token);
 extern int opal_async_wait_response(uint64_t token, struct opal_msg *msg);
 extern int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data);
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index 45b3feb8aa2f..64255d3ee14a 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -33,7 +33,7 @@ static struct semaphore opal_async_sem;
 static struct opal_msg *opal_async_responses;
 static unsigned int opal_max_async_tokens;
 
-int __opal_async_get_token(void)
+static int __opal_async_get_token(void)
 {
 	unsigned long flags;
 	int token;
@@ -73,7 +73,7 @@ int opal_async_get_token_interruptible(void)
 }
 EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible);
 
-int __opal_async_release_token(int token)
+static int __opal_async_release_token(int token)
 {
 	unsigned long flags;
 
@@ -199,11 +199,7 @@ int __init opal_async_comp_init(void)
 		goto out_opal_node;
 	}
 
-	/* Initialize to 1 less than the maximum tokens available, as we may
-	 * require to pop one during emergency through synchronous call to
-	 * __opal_async_get_token()
-	 */
-	sema_init(&opal_async_sem, opal_max_async_tokens - 1);
+	sema_init(&opal_async_sem, opal_max_async_tokens);
 
 out_opal_node:
 	of_node_put(opal_node);
-- 
cgit 


From 86cd6d98020924f65a6773784c66c5b842e3e320 Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Fri, 3 Nov 2017 13:41:42 +1100
Subject: powerpc/opal: Rework the opal-async interface

Future work will add an opal_async_wait_response_interruptible()
which will call wait_event_interruptible(). This work requires extra
token state to be tracked as wait_event_interruptible() can return and
the caller could release the token before OPAL responds.

Currently token state is tracked with two bitfields which are 64 bits
big but may not need to be as OPAL informs Linux how many async tokens
there are. It also uses an array indexed by token to store response
messages for each token.

The bitfields make it difficult to add more state and also provide a
hard maximum as to how many tokens there can be - it is possible that
OPAL will inform Linux that there are more than 64 tokens.

Rather than add a bitfield to track the extra state, rework the
internals slightly.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
[mpe: Fix __opal_async_get_token() when no tokens are free]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-async.c | 92 ++++++++++++++++-------------
 1 file changed, 51 insertions(+), 41 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index 64255d3ee14a..a8a57310759a 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -1,7 +1,7 @@
 /*
  * PowerNV OPAL asynchronous completion interfaces
  *
- * Copyright 2013 IBM Corp.
+ * Copyright 2013-2017 IBM Corp.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -23,40 +23,47 @@
 #include <asm/machdep.h>
 #include <asm/opal.h>
 
-#define N_ASYNC_COMPLETIONS	64
+enum opal_async_token_state {
+	ASYNC_TOKEN_UNALLOCATED = 0,
+	ASYNC_TOKEN_ALLOCATED,
+	ASYNC_TOKEN_COMPLETED
+};
+
+struct opal_async_token {
+	enum opal_async_token_state state;
+	struct opal_msg response;
+};
 
-static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL};
-static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS);
 static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait);
 static DEFINE_SPINLOCK(opal_async_comp_lock);
 static struct semaphore opal_async_sem;
-static struct opal_msg *opal_async_responses;
 static unsigned int opal_max_async_tokens;
+static struct opal_async_token *opal_async_tokens;
 
 static int __opal_async_get_token(void)
 {
 	unsigned long flags;
-	int token;
+	int i, token = -EBUSY;
 
 	spin_lock_irqsave(&opal_async_comp_lock, flags);
-	token = find_first_zero_bit(opal_async_token_map, opal_max_async_tokens);
-	if (token >= opal_max_async_tokens) {
-		token = -EBUSY;
-		goto out;
-	}
 
-	if (!__test_and_clear_bit(token, opal_async_complete_map)) {
-		token = -EBUSY;
-		goto out;
+	for (i = 0; i < opal_max_async_tokens; i++) {
+		if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) {
+			opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED;
+			token = i;
+			break;
+		}
 	}
 
-	__set_bit(token, opal_async_token_map);
-
-out:
 	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
 	return token;
 }
 
+/*
+ * Note: If the returned token is used in an opal call and opal returns
+ * OPAL_ASYNC_COMPLETION you MUST call opal_async_wait_response() before
+ * calling another other opal_async_* function
+ */
 int opal_async_get_token_interruptible(void)
 {
 	int token;
@@ -76,6 +83,7 @@ EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible);
 static int __opal_async_release_token(int token)
 {
 	unsigned long flags;
+	int rc;
 
 	if (token < 0 || token >= opal_max_async_tokens) {
 		pr_err("%s: Passed token is out of range, token %d\n",
@@ -84,11 +92,18 @@ static int __opal_async_release_token(int token)
 	}
 
 	spin_lock_irqsave(&opal_async_comp_lock, flags);
-	__set_bit(token, opal_async_complete_map);
-	__clear_bit(token, opal_async_token_map);
+	switch (opal_async_tokens[token].state) {
+	case ASYNC_TOKEN_COMPLETED:
+	case ASYNC_TOKEN_ALLOCATED:
+		opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED;
+		rc = 0;
+		break;
+	default:
+		rc = 1;
+	}
 	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
 
-	return 0;
+	return rc;
 }
 
 int opal_async_release_token(int token)
@@ -96,12 +111,10 @@ int opal_async_release_token(int token)
 	int ret;
 
 	ret = __opal_async_release_token(token);
-	if (ret)
-		return ret;
-
-	up(&opal_async_sem);
+	if (!ret)
+		up(&opal_async_sem);
 
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL_GPL(opal_async_release_token);
 
@@ -122,13 +135,15 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg)
 	 * functional.
 	 */
 	opal_wake_poller();
-	wait_event(opal_async_wait, test_bit(token, opal_async_complete_map));
-	memcpy(msg, &opal_async_responses[token], sizeof(*msg));
+	wait_event(opal_async_wait, opal_async_tokens[token].state
+			== ASYNC_TOKEN_COMPLETED);
+	memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg));
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(opal_async_wait_response);
 
+/* Called from interrupt context */
 static int opal_async_comp_event(struct notifier_block *nb,
 		unsigned long msg_type, void *msg)
 {
@@ -140,9 +155,9 @@ static int opal_async_comp_event(struct notifier_block *nb,
 		return 0;
 
 	token = be64_to_cpu(comp_msg->params[0]);
-	memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg));
+	memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg));
 	spin_lock_irqsave(&opal_async_comp_lock, flags);
-	__set_bit(token, opal_async_complete_map);
+	opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED;
 	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
 
 	wake_up(&opal_async_wait);
@@ -178,24 +193,19 @@ int __init opal_async_comp_init(void)
 	}
 
 	opal_max_async_tokens = be32_to_cpup(async);
-	if (opal_max_async_tokens > N_ASYNC_COMPLETIONS)
-		opal_max_async_tokens = N_ASYNC_COMPLETIONS;
+	opal_async_tokens = kcalloc(opal_max_async_tokens,
+			sizeof(*opal_async_tokens), GFP_KERNEL);
+	if (!opal_async_tokens) {
+		err = -ENOMEM;
+		goto out_opal_node;
+	}
 
 	err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP,
 			&opal_async_comp_nb);
 	if (err) {
 		pr_err("%s: Can't register OPAL event notifier (%d)\n",
 				__func__, err);
-		goto out_opal_node;
-	}
-
-	opal_async_responses = kzalloc(
-			sizeof(*opal_async_responses) * opal_max_async_tokens,
-			GFP_KERNEL);
-	if (!opal_async_responses) {
-		pr_err("%s: Out of memory, failed to do asynchronous "
-				"completion init\n", __func__);
-		err = -ENOMEM;
+		kfree(opal_async_tokens);
 		goto out_opal_node;
 	}
 
-- 
cgit 


From 95e1bc1daaeee4d598b235dc85b64b7a0bcc3060 Mon Sep 17 00:00:00 2001
From: Stewart Smith <stewart@linux.vnet.ibm.com>
Date: Fri, 3 Nov 2017 13:41:43 +1100
Subject: powernv/opal-sensor: remove not needed lock

Parallel sensor reads could run out of async tokens due to
opal_get_sensor_data grabbing tokens but then doing the sensor
read behind a mutex, essentially serializing the (possibly
asynchronous and relatively slow) sensor read.

It turns out that the mutex isn't needed at all, not only
should the OPAL interface allow concurrent reads, the implementation
is certainly safe for that, and if any sensor we were reading
from somewhere isn't, doing the mutual exclusion in the kernel
is the wrong place to do it, OPAL should be doing it for the kernel.

So, remove the mutex.

Additionally, we shouldn't be printing out an error when we don't
get a token as the only way this should happen is if we've been
interrupted in down_interruptible() on the semaphore.

Reported-by: Robert Lippert <rlippert@google.com>
Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal-sensor.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index aa267f120033..0a7074bb91dc 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -19,13 +19,10 @@
  */
 
 #include <linux/delay.h>
-#include <linux/mutex.h>
 #include <linux/of_platform.h>
 #include <asm/opal.h>
 #include <asm/machdep.h>
 
-static DEFINE_MUTEX(opal_sensor_mutex);
-
 /*
  * This will return sensor information to driver based on the requested sensor
  * handle. A handle is an opaque id for the powernv, read by the driver from the
@@ -38,13 +35,9 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
 	__be32 data;
 
 	token = opal_async_get_token_interruptible();
-	if (token < 0) {
-		pr_err("%s: Couldn't get the token, returning\n", __func__);
-		ret = token;
-		goto out;
-	}
+	if (token < 0)
+		return token;
 
-	mutex_lock(&opal_sensor_mutex);
 	ret = opal_sensor_read(sensor_hndl, token, &data);
 	switch (ret) {
 	case OPAL_ASYNC_COMPLETION:
@@ -52,7 +45,7 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
 		if (ret) {
 			pr_err("%s: Failed to wait for the async response, %d\n",
 			       __func__, ret);
-			goto out_token;
+			goto out;
 		}
 
 		ret = opal_error_code(opal_get_async_rc(msg));
@@ -73,10 +66,8 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
 		break;
 	}
 
-out_token:
-	mutex_unlock(&opal_sensor_mutex);
-	opal_async_release_token(token);
 out:
+	opal_async_release_token(token);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(opal_get_sensor_data);
-- 
cgit 


From 9aab24495c5644b25ced0d11816cc3c061bf74fc Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Fri, 3 Nov 2017 13:41:44 +1100
Subject: powerpc/opal: Add opal_async_wait_response_interruptible() to
 opal-async

This patch adds an _interruptible version of opal_async_wait_response().
This is useful when a long running OPAL call is performed on behalf of
a userspace thread, for example, the opal_flash_{read,write,erase}
functions performed by the powernv-flash MTD driver.

It is foreseeable that these functions would take upwards of two
minutes causing the wait_event() to block long enough to cause hung
task warnings. Furthermore, wait_event_interruptible() is preferable
as otherwise there is no way for signals to stop the process which is
going to be confusing in userspace.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/opal.h             |  2 +
 arch/powerpc/platforms/powernv/opal-async.c | 84 +++++++++++++++++++++++++++--
 2 files changed, 82 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 00aae3189311..0c545f7fc77b 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -309,6 +309,8 @@ extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val);
 extern int opal_async_get_token_interruptible(void);
 extern int opal_async_release_token(int token);
 extern int opal_async_wait_response(uint64_t token, struct opal_msg *msg);
+extern int opal_async_wait_response_interruptible(uint64_t token,
+		struct opal_msg *msg);
 extern int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data);
 
 struct rtc_time;
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index a8a57310759a..18a355fa15e8 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -26,6 +26,8 @@
 enum opal_async_token_state {
 	ASYNC_TOKEN_UNALLOCATED = 0,
 	ASYNC_TOKEN_ALLOCATED,
+	ASYNC_TOKEN_DISPATCHED,
+	ASYNC_TOKEN_ABANDONED,
 	ASYNC_TOKEN_COMPLETED
 };
 
@@ -61,8 +63,9 @@ static int __opal_async_get_token(void)
 
 /*
  * Note: If the returned token is used in an opal call and opal returns
- * OPAL_ASYNC_COMPLETION you MUST call opal_async_wait_response() before
- * calling another other opal_async_* function
+ * OPAL_ASYNC_COMPLETION you MUST call one of opal_async_wait_response() or
+ * opal_async_wait_response_interruptible() at least once before calling another
+ * opal_async_* function
  */
 int opal_async_get_token_interruptible(void)
 {
@@ -98,6 +101,14 @@ static int __opal_async_release_token(int token)
 		opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED;
 		rc = 0;
 		break;
+	/*
+	 * DISPATCHED and ABANDONED tokens must wait for OPAL to respond.
+	 * Mark a DISPATCHED token as ABANDONED so that the response handling
+	 * code knows no one cares and that it can free it then.
+	 */
+	case ASYNC_TOKEN_DISPATCHED:
+		opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED;
+		/* Fall through */
 	default:
 		rc = 1;
 	}
@@ -130,7 +141,11 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg)
 		return -EINVAL;
 	}
 
-	/* Wakeup the poller before we wait for events to speed things
+	/*
+	 * There is no need to mark the token as dispatched, wait_event()
+	 * will block until the token completes.
+	 *
+	 * Wakeup the poller before we wait for events to speed things
 	 * up on platforms or simulators where the interrupts aren't
 	 * functional.
 	 */
@@ -143,11 +158,66 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg)
 }
 EXPORT_SYMBOL_GPL(opal_async_wait_response);
 
+int opal_async_wait_response_interruptible(uint64_t token, struct opal_msg *msg)
+{
+	unsigned long flags;
+	int ret;
+
+	if (token >= opal_max_async_tokens) {
+		pr_err("%s: Invalid token passed\n", __func__);
+		return -EINVAL;
+	}
+
+	if (!msg) {
+		pr_err("%s: Invalid message pointer passed\n", __func__);
+		return -EINVAL;
+	}
+
+	/*
+	 * The first time this gets called we mark the token as DISPATCHED
+	 * so that if wait_event_interruptible() returns not zero and the
+	 * caller frees the token, we know not to actually free the token
+	 * until the response comes.
+	 *
+	 * Only change if the token is ALLOCATED - it may have been
+	 * completed even before the caller gets around to calling this
+	 * the first time.
+	 *
+	 * There is also a dirty great comment at the token allocation
+	 * function that if the opal call returns OPAL_ASYNC_COMPLETION to
+	 * the caller then the caller *must* call this or the not
+	 * interruptible version before doing anything else with the
+	 * token.
+	 */
+	if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) {
+		spin_lock_irqsave(&opal_async_comp_lock, flags);
+		if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED)
+			opal_async_tokens[token].state = ASYNC_TOKEN_DISPATCHED;
+		spin_unlock_irqrestore(&opal_async_comp_lock, flags);
+	}
+
+	/*
+	 * Wakeup the poller before we wait for events to speed things
+	 * up on platforms or simulators where the interrupts aren't
+	 * functional.
+	 */
+	opal_wake_poller();
+	ret = wait_event_interruptible(opal_async_wait,
+			opal_async_tokens[token].state ==
+			ASYNC_TOKEN_COMPLETED);
+	if (!ret)
+		memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg));
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(opal_async_wait_response_interruptible);
+
 /* Called from interrupt context */
 static int opal_async_comp_event(struct notifier_block *nb,
 		unsigned long msg_type, void *msg)
 {
 	struct opal_msg *comp_msg = msg;
+	enum opal_async_token_state state;
 	unsigned long flags;
 	uint64_t token;
 
@@ -155,11 +225,17 @@ static int opal_async_comp_event(struct notifier_block *nb,
 		return 0;
 
 	token = be64_to_cpu(comp_msg->params[0]);
-	memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg));
 	spin_lock_irqsave(&opal_async_comp_lock, flags);
+	state = opal_async_tokens[token].state;
 	opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED;
 	spin_unlock_irqrestore(&opal_async_comp_lock, flags);
 
+	if (state == ASYNC_TOKEN_ABANDONED) {
+		/* Free the token, no one else will */
+		opal_async_release_token(token);
+		return 0;
+	}
+	memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg));
 	wake_up(&opal_async_wait);
 
 	return 0;
-- 
cgit 


From 77adbd2207e858f5923aa94e4a7d2f29f09217ed Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Fri, 3 Nov 2017 13:41:45 +1100
Subject: powerpc/powernv: Add OPAL_BUSY to opal_error_code()

Also export opal_error_code() so that it can be used in modules

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/opal.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 65c79ecf5a4d..041ddbd1fc57 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -998,6 +998,7 @@ int opal_error_code(int rc)
 
 	case OPAL_PARAMETER:		return -EINVAL;
 	case OPAL_ASYNC_COMPLETION:	return -EINPROGRESS;
+	case OPAL_BUSY:
 	case OPAL_BUSY_EVENT:		return -EBUSY;
 	case OPAL_NO_MEM:		return -ENOMEM;
 	case OPAL_PERMISSION:		return -EPERM;
@@ -1037,3 +1038,4 @@ EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
 /* Export this for KVM */
 EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
 EXPORT_SYMBOL_GPL(opal_int_eoi);
+EXPORT_SYMBOL_GPL(opal_error_code);
-- 
cgit 


From a7771176b4392fbc3a17399c51a8c11f2f681afe Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Thu, 2 Nov 2017 14:09:03 +1100
Subject: powerpc: Don't enable FP/Altivec if not checkpointed

Lazy save and restore of FP/Altivec means that a userspace process can
be sent to userspace with FP or Altivec disabled and loaded only as
required (by way of an FP/Altivec unavailable exception). Transactional
Memory complicates this situation as a transaction could be started
without FP/Altivec being loaded up. This causes the hardware to
checkpoint incorrect registers. Handling FP/Altivec unavailable
exceptions while a thread is transactional requires a reclaim and
recheckpoint to ensure the CPU has correct state for both sets of
registers.

Lazy save and restore of FP/Altivec cannot be done if a process is
transactional. If a facility was enabled it must remain enabled whenever
a thread is transactional.

Commit dc16b553c949 ("powerpc: Always restore FPU/VEC/VSX if hardware
transactional memory in use") ensures that the facilities are always
enabled if a thread is transactional. A bug in the introduced code may
cause it to inadvertently enable a facility that was (and should remain)
disabled. The problem with this extraneous enablement is that the
registers for the erroneously enabled facility have not been correctly
recheckpointed - the recheckpointing code assumed the facility would
remain disabled.

Further compounding the issue, the transactional {fp,altivec,vsx}
unavailable code has been incorrectly using the MSR to enable
facilities. The presence of the {FP,VEC,VSX} bit in the regs->msr simply
means if the registers are live on the CPU, not if the kernel should
load them before returning to userspace. This has worked due to the bug
mentioned above.

This causes transactional threads which return to their failure handler
to observe incorrect checkpointed registers. Perhaps an example will
help illustrate the problem:

A userspace process is running and uses both FP and Altivec registers.
This process then continues to run for some time without touching
either sets of registers. The kernel subsequently disables the
facilities as part of lazy save and restore. The userspace process then
performs a tbegin and the CPU checkpoints 'junk' FP and Altivec
registers. The process then performs a floating point instruction
triggering a fp unavailable exception in the kernel.

The kernel then loads the FP registers - and only the FP registers.
Since the thread is transactional it must perform a reclaim and
recheckpoint to ensure both the checkpointed registers and the
transactional registers are correct. It then (correctly) enables
MSR[FP] for the process. Later (on exception exist) the kernel also
(inadvertently) enables MSR[VEC]. The process is then returned to
userspace.

Since the act of loading the FP registers doomed the transaction we know
CPU will fail the transaction, restore its checkpointed registers, and
return the process to its failure handler. The problem is that we're
now running with Altivec enabled and the 'junk' checkpointed registers
are restored. The kernel had only recheckpointed FP.

This patch solves this by only activating FP/Altivec if userspace was
using them when it entered the kernel and not simply if the process is
transactional.

Fixes: dc16b553c949 ("powerpc: Always restore FPU/VEC/VSX if hardware
transactional memory in use")

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 18 ++++++++++++++++--
 arch/powerpc/kernel/traps.c   |  8 ++++----
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 2b602d60a9c7..e2980a22c487 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -104,9 +104,23 @@ static inline bool msr_tm_active(unsigned long msr)
 {
 	return MSR_TM_ACTIVE(msr);
 }
+
+static bool tm_active_with_fp(struct task_struct *tsk)
+{
+	return msr_tm_active(tsk->thread.regs->msr) &&
+		(tsk->thread.ckpt_regs.msr & MSR_FP);
+}
+
+static bool tm_active_with_altivec(struct task_struct *tsk)
+{
+	return msr_tm_active(tsk->thread.regs->msr) &&
+		(tsk->thread.ckpt_regs.msr & MSR_VEC);
+}
 #else
 static inline bool msr_tm_active(unsigned long msr) { return false; }
 static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
+static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; }
+static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
 bool strict_msr_control;
@@ -239,7 +253,7 @@ EXPORT_SYMBOL(enable_kernel_fp);
 
 static int restore_fp(struct task_struct *tsk)
 {
-	if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) {
+	if (tsk->thread.load_fp || tm_active_with_fp(tsk)) {
 		load_fp_state(&current->thread.fp_state);
 		current->thread.load_fp++;
 		return 1;
@@ -321,7 +335,7 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 static int restore_altivec(struct task_struct *tsk)
 {
 	if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
-		(tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) {
+		(tsk->thread.load_vec || tm_active_with_altivec(tsk))) {
 		load_vr_state(&tsk->thread.vr_state);
 		tsk->thread.used_vr = 1;
 		tsk->thread.load_vec++;
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 0e4099fef198..863c6858ed0f 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1679,7 +1679,7 @@ void fp_unavailable_tm(struct pt_regs *regs)
 	/* Reclaim didn't save out any FPRs to transact_fprs. */
 
 	/* Enable FP for the task: */
-	regs->msr |= (MSR_FP | current->thread.fpexc_mode);
+	current->thread.load_fp = 1;
 
 	/* This loads and recheckpoints the FP registers from
 	 * thread.fpr[].  They will remain in registers after the
@@ -1708,7 +1708,7 @@ void altivec_unavailable_tm(struct pt_regs *regs)
 		 "MSR=%lx\n",
 		 regs->nip, regs->msr);
 	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
-	regs->msr |= MSR_VEC;
+	current->thread.load_vec = 1;
 	tm_recheckpoint(&current->thread, MSR_VEC);
 	current->thread.used_vr = 1;
 
@@ -1745,8 +1745,8 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 	/* This reclaims FP and/or VR regs if they're already enabled */
 	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 
-	regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode |
-		MSR_VSX;
+	current->thread.load_vec = 1;
+	current->thread.load_fp = 1;
 
 	/* This loads & recheckpoints FP and VRs; but we have
 	 * to be sure not to overwrite previously-valid state.
-- 
cgit 


From 91381b9cb1c3afc162f830ebf698721402c7d577 Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Thu, 2 Nov 2017 14:09:04 +1100
Subject: powerpc: Force reload for recheckpoint during tm {fp, vec, vsx}
 unavailable exception

Lazy save and restore of FP/Altivec means that a userspace process can
be sent to userspace with FP or Altivec disabled and loaded only as
required (by way of an FP/Altivec unavailable exception). Transactional
Memory complicates this situation as a transaction could be started
without FP/Altivec being loaded up. This causes the hardware to
checkpoint incorrect registers. Handling FP/Altivec unavailable
exceptions while a thread is transactional requires a reclaim and
recheckpoint to ensure the CPU has correct state for both sets of
registers.

tm_reclaim() has optimisations to not always save the FP/Altivec
registers to the checkpointed save area. This was originally done
because the caller might have information that the checkpointed
registers aren't valid due to lazy save and restore. We've also been a
little vague as to how tm_reclaim() leaves the FP/Altivec state since it
doesn't necessarily always save it to the thread struct. This has lead
to an (incorrect) assumption that it leaves the checkpointed state on
the CPU.

tm_recheckpoint() has similar optimisations in reverse. It may not
always reload the checkpointed FP/Altivec registers from the thread
struct before the trecheckpoint. It is therefore quite unclear where it
expects to get the state from. This didn't help with the assumption
made about tm_reclaim().

This patch is a minimal fix for ease of backporting. A more correct fix
which removes the msr parameter to tm_reclaim() and tm_recheckpoint()
altogether has been upstreamed to apply on top of this patch.

Fixes: dc3106690b20 ("powerpc: tm: Always use fp_state and vr_state to
store live registers")

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c |  4 ++--
 arch/powerpc/kernel/traps.c   | 22 +++++++++++++++++-----
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e2980a22c487..ca8c33b41989 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -874,6 +874,8 @@ static void tm_reclaim_thread(struct thread_struct *thr,
 	if (!MSR_TM_SUSPENDED(mfmsr()))
 		return;
 
+	giveup_all(container_of(thr, struct task_struct, thread));
+
 	/*
 	 * If we are in a transaction and FP is off then we can't have
 	 * used FP inside that transaction. Hence the checkpointed
@@ -893,8 +895,6 @@ static void tm_reclaim_thread(struct thread_struct *thr,
 		memcpy(&thr->ckvr_state, &thr->vr_state,
 		       sizeof(struct thread_vr_state));
 
-	giveup_all(container_of(thr, struct task_struct, thread));
-
 	tm_reclaim(thr, thr->ckpt_regs.msr, cause);
 }
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 863c6858ed0f..e099e61c702b 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1663,6 +1663,12 @@ out:
 
 void fp_unavailable_tm(struct pt_regs *regs)
 {
+	/*
+	 * Save the MSR now because tm_reclaim_current() is likely to
+	 * change it
+	 */
+	unsigned long orig_msr = regs->msr;
+
 	/* Note:  This does not handle any kind of FP laziness. */
 
 	TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n",
@@ -1687,10 +1693,10 @@ void fp_unavailable_tm(struct pt_regs *regs)
 	 * If VMX is in use, the VRs now hold checkpointed values,
 	 * so we don't want to load the VRs from the thread_struct.
 	 */
-	tm_recheckpoint(&current->thread, MSR_FP);
+	tm_recheckpoint(&current->thread, orig_msr | MSR_FP);
 
 	/* If VMX is in use, get the transactional values back */
-	if (regs->msr & MSR_VEC) {
+	if (orig_msr & MSR_VEC) {
 		msr_check_and_set(MSR_VEC);
 		load_vr_state(&current->thread.vr_state);
 		/* At this point all the VSX state is loaded, so enable it */
@@ -1700,6 +1706,12 @@ void fp_unavailable_tm(struct pt_regs *regs)
 
 void altivec_unavailable_tm(struct pt_regs *regs)
 {
+	/*
+	 * Save the MSR now because tm_reclaim_current() is likely to
+	 * change it
+	 */
+	unsigned long orig_msr = regs->msr;
+
 	/* See the comments in fp_unavailable_tm().  This function operates
 	 * the same way.
 	 */
@@ -1709,10 +1721,10 @@ void altivec_unavailable_tm(struct pt_regs *regs)
 		 regs->nip, regs->msr);
 	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 	current->thread.load_vec = 1;
-	tm_recheckpoint(&current->thread, MSR_VEC);
+	tm_recheckpoint(&current->thread, orig_msr | MSR_VEC);
 	current->thread.used_vr = 1;
 
-	if (regs->msr & MSR_FP) {
+	if (orig_msr & MSR_FP) {
 		msr_check_and_set(MSR_FP);
 		load_fp_state(&current->thread.fp_state);
 		regs->msr |= MSR_VSX;
@@ -1751,7 +1763,7 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 	/* This loads & recheckpoints FP and VRs; but we have
 	 * to be sure not to overwrite previously-valid state.
 	 */
-	tm_recheckpoint(&current->thread, regs->msr & ~orig_msr);
+	tm_recheckpoint(&current->thread, orig_msr | MSR_FP | MSR_VEC);
 
 	msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC));
 
-- 
cgit 


From eb5c3f1c86470fc1a57ab28cce15c12e4d6cdf8b Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Thu, 2 Nov 2017 14:09:05 +1100
Subject: powerpc: Always save/restore checkpointed regs during
 treclaim/trecheckpoint

Lazy save and restore of FP/Altivec means that a userspace process can
be sent to userspace with FP or Altivec disabled and loaded only as
required (by way of an FP/Altivec unavailable exception). Transactional
Memory complicates this situation as a transaction could be started
without FP/Altivec being loaded up. This causes the hardware to
checkpoint incorrect registers. Handling FP/Altivec unavailable
exceptions while a thread is transactional requires a reclaim and
recheckpoint to ensure the CPU has correct state for both sets of
registers.

tm_reclaim() has optimisations to not always save the FP/Altivec
registers to the checkpointed save area. This was originally done
because the caller might have information that the checkpointed
registers aren't valid due to lazy save and restore. We've also been a
little vague as to how tm_reclaim() leaves the FP/Altivec state since it
doesn't necessarily always save it to the thread struct. This has lead
to an (incorrect) assumption that it leaves the checkpointed state on
the CPU.

tm_recheckpoint() has similar optimisations in reverse. It may not
always reload the checkpointed FP/Altivec registers from the thread
struct before the trecheckpoint. It is therefore quite unclear where it
expects to get the state from. This didn't help with the assumption
made about tm_reclaim().

These optimisations sit in what is by definition a slow path. If a
process has to go through a reclaim/recheckpoint then its transaction
will be doomed on returning to userspace. This mean that the process
will be unable to complete its transaction and be forced to its failure
handler. This is already an out if line case for userspace. Furthermore,
the cost of copying 64 times 128 bits from registers isn't very long[0]
(at all) on modern processors. As such it appears these optimisations
have only served to increase code complexity and are unlikely to have
had a measurable performance impact.

Our transactional memory handling has been riddled with bugs. A cause
of this has been difficulty in following the code flow, code complexity
has not been our friend here. It makes sense to remove these
optimisations in favour of a (hopefully) more stable implementation.

This patch does mean that some times the assembly will needlessly save
'junk' registers which will subsequently get overwritten with the
correct value by the C code which calls the assembly function. This
small inefficiency is far outweighed by the reduction in complexity for
general TM code, context switching paths, and transactional facility
unavailable exception handler.

0: I tried to measure it once for other work and found that it was
hiding in the noise of everything else I was working with. I find it
exceedingly likely this will be the case here.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/tm.h   |  5 ++--
 arch/powerpc/kernel/process.c   | 22 ++++++---------
 arch/powerpc/kernel/signal_32.c |  2 +-
 arch/powerpc/kernel/signal_64.c |  2 +-
 arch/powerpc/kernel/tm.S        | 59 ++++++++++++-----------------------------
 arch/powerpc/kernel/traps.c     | 26 +++++-------------
 6 files changed, 35 insertions(+), 81 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index ad19fe41931b..79d4156554b4 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -11,10 +11,9 @@
 
 extern void tm_enable(void);
 extern void tm_reclaim(struct thread_struct *thread,
-		       unsigned long orig_msr, uint8_t cause);
+		       uint8_t cause);
 extern void tm_reclaim_current(uint8_t cause);
-extern void tm_recheckpoint(struct thread_struct *thread,
-			    unsigned long orig_msr);
+extern void tm_recheckpoint(struct thread_struct *thread);
 extern void tm_abort(uint8_t cause);
 extern void tm_save_sprs(struct thread_struct *thread);
 extern void tm_restore_sprs(struct thread_struct *thread);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ca8c33b41989..4083b737decb 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -876,6 +876,8 @@ static void tm_reclaim_thread(struct thread_struct *thr,
 
 	giveup_all(container_of(thr, struct task_struct, thread));
 
+	tm_reclaim(thr, cause);
+
 	/*
 	 * If we are in a transaction and FP is off then we can't have
 	 * used FP inside that transaction. Hence the checkpointed
@@ -894,8 +896,6 @@ static void tm_reclaim_thread(struct thread_struct *thr,
 	if ((thr->ckpt_regs.msr & MSR_VEC) == 0)
 		memcpy(&thr->ckvr_state, &thr->vr_state,
 		       sizeof(struct thread_vr_state));
-
-	tm_reclaim(thr, thr->ckpt_regs.msr, cause);
 }
 
 void tm_reclaim_current(uint8_t cause)
@@ -946,11 +946,9 @@ out_and_saveregs:
 	tm_save_sprs(thr);
 }
 
-extern void __tm_recheckpoint(struct thread_struct *thread,
-			      unsigned long orig_msr);
+extern void __tm_recheckpoint(struct thread_struct *thread);
 
-void tm_recheckpoint(struct thread_struct *thread,
-		     unsigned long orig_msr)
+void tm_recheckpoint(struct thread_struct *thread)
 {
 	unsigned long flags;
 
@@ -969,15 +967,13 @@ void tm_recheckpoint(struct thread_struct *thread,
 	 */
 	tm_restore_sprs(thread);
 
-	__tm_recheckpoint(thread, orig_msr);
+	__tm_recheckpoint(thread);
 
 	local_irq_restore(flags);
 }
 
 static inline void tm_recheckpoint_new_task(struct task_struct *new)
 {
-	unsigned long msr;
-
 	if (!cpu_has_feature(CPU_FTR_TM))
 		return;
 
@@ -996,13 +992,11 @@ static inline void tm_recheckpoint_new_task(struct task_struct *new)
 		tm_restore_sprs(&new->thread);
 		return;
 	}
-	msr = new->thread.ckpt_regs.msr;
 	/* Recheckpoint to restore original checkpointed register state. */
-	TM_DEBUG("*** tm_recheckpoint of pid %d "
-		 "(new->msr 0x%lx, new->origmsr 0x%lx)\n",
-		 new->pid, new->thread.regs->msr, msr);
+	TM_DEBUG("*** tm_recheckpoint of pid %d (new->msr 0x%lx)\n",
+		 new->pid, new->thread.regs->msr);
 
-	tm_recheckpoint(&new->thread, msr);
+	tm_recheckpoint(&new->thread);
 
 	/*
 	 * The checkpointed state has been restored but the live state has
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 1dd5fa0f65fd..16d16583cf11 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -880,7 +880,7 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 	/* Make sure the transaction is marked as failed */
 	current->thread.tm_texasr |= TEXASR_FS;
 	/* This loads the checkpointed FP/VEC state, if used */
-	tm_recheckpoint(&current->thread, msr);
+	tm_recheckpoint(&current->thread);
 
 	/* This loads the speculative FP/VEC state, if used */
 	msr_check_and_set(msr & (MSR_FP | MSR_VEC));
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 4dfdd8e56836..528ea744911c 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -552,7 +552,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk,
 	/* Make sure the transaction is marked as failed */
 	tsk->thread.tm_texasr |= TEXASR_FS;
 	/* This loads the checkpointed FP/VEC state, if used */
-	tm_recheckpoint(&tsk->thread, msr);
+	tm_recheckpoint(&tsk->thread);
 
 	msr_check_and_set(msr & (MSR_FP | MSR_VEC));
 	if (msr & MSR_FP) {
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index c4ba37822ba0..d89fb0e6f9ed 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -79,15 +79,12 @@ _GLOBAL(tm_abort)
 	blr
 
 /* void tm_reclaim(struct thread_struct *thread,
- *                 unsigned long orig_msr,
  *		   uint8_t cause)
  *
  *	- Performs a full reclaim.  This destroys outstanding
  *	  transactions and updates thread->regs.tm_ckpt_* with the
  *	  original checkpointed state.  Note that thread->regs is
  *	  unchanged.
- *	- FP regs are written back to thread->transact_fpr before
- *	  reclaiming.  These are the transactional (current) versions.
  *
  * Purpose is to both abort transactions of, and preserve the state of,
  * a transactions at a context switch. We preserve/restore both sets of process
@@ -98,9 +95,9 @@ _GLOBAL(tm_abort)
  * Call with IRQs off, stacks get all out of sync for some periods in here!
  */
 _GLOBAL(tm_reclaim)
-	mfcr	r6
+	mfcr	r5
 	mflr	r0
-	stw	r6, 8(r1)
+	stw	r5, 8(r1)
 	std	r0, 16(r1)
 	std	r2, STK_GOT(r1)
 	stdu	r1, -TM_FRAME_SIZE(r1)
@@ -108,7 +105,6 @@ _GLOBAL(tm_reclaim)
 	/* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */
 
 	std	r3, STK_PARAM(R3)(r1)
-	std	r4, STK_PARAM(R4)(r1)
 	SAVE_NVGPRS(r1)
 
 	/* We need to setup MSR for VSX register save instructions. */
@@ -138,8 +134,8 @@ _GLOBAL(tm_reclaim)
 	std	r1, PACAR1(r13)
 
 	/* Clear MSR RI since we are about to change r1, EE is already off. */
-	li	r4, 0
-	mtmsrd	r4, 1
+	li	r5, 0
+	mtmsrd	r5, 1
 
 	/*
 	 * BE CAREFUL HERE:
@@ -151,7 +147,7 @@ _GLOBAL(tm_reclaim)
 	 * to user register state.  (FPRs, CCR etc. also!)
 	 * Use an sprg and a tm_scratch in the PACA to shuffle.
 	 */
-	TRECLAIM(R5)				/* Cause in r5 */
+	TRECLAIM(R4)				/* Cause in r4 */
 
 	/* ******************** GPRs ******************** */
 	/* Stash the checkpointed r13 away in the scratch SPR and get the real
@@ -242,40 +238,30 @@ _GLOBAL(tm_reclaim)
 
 
 	/* ******************** FPR/VR/VSRs ************
-	 * After reclaiming, capture the checkpointed FPRs/VRs /if used/.
-	 *
-	 * (If VSX used, FP and VMX are implied.  Or, we don't need to look
-	 * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.)
-	 *
-	 * We're passed the thread's MSR as the second parameter
+	 * After reclaiming, capture the checkpointed FPRs/VRs.
 	 *
 	 * We enabled VEC/FP/VSX in the msr above, so we can execute these
 	 * instructions!
 	 */
-	ld	r4, STK_PARAM(R4)(r1)		/* Second parameter, MSR * */
 	mr	r3, r12
-	andis.		r0, r4, MSR_VEC@h
-	beq	dont_backup_vec
 
+	/* Altivec (VEC/VMX/VR)*/
 	addi	r7, r3, THREAD_CKVRSTATE
 	SAVE_32VRS(0, r6, r7)	/* r6 scratch, r7 transact vr state */
 	mfvscr	v0
 	li	r6, VRSTATE_VSCR
 	stvx	v0, r7, r6
-dont_backup_vec:
+
+	/* VRSAVE */
 	mfspr	r0, SPRN_VRSAVE
 	std	r0, THREAD_CKVRSAVE(r3)
 
-	andi.	r0, r4, MSR_FP
-	beq	dont_backup_fp
-
+	/* Floating Point (FP) */
 	addi	r7, r3, THREAD_CKFPSTATE
 	SAVE_32FPRS_VSRS(0, R6, R7)	/* r6 scratch, r7 transact fp state */
-
 	mffs    fr0
 	stfd    fr0,FPSTATE_FPSCR(r7)
 
-dont_backup_fp:
 
 	/* TM regs, incl TEXASR -- these live in thread_struct.  Note they've
 	 * been updated by the treclaim, to explain to userland the failure
@@ -343,22 +329,19 @@ _GLOBAL(__tm_recheckpoint)
 	 */
 	subi	r7, r7, STACK_FRAME_OVERHEAD
 
+	/* We need to setup MSR for FP/VMX/VSX register save instructions. */
 	mfmsr	r6
-	/* R4 = original MSR to indicate whether thread used FP/Vector etc. */
-
-	/* Enable FP/vec in MSR if necessary! */
-	lis	r5, MSR_VEC@h
+	mr	r5, r6
 	ori	r5, r5, MSR_FP
-	and.	r5, r4, r5
-	beq	restore_gprs			/* if neither, skip both */
-
+#ifdef CONFIG_ALTIVEC
+	oris	r5, r5, MSR_VEC@h
+#endif
 #ifdef CONFIG_VSX
 	BEGIN_FTR_SECTION
-	oris	r5, r5, MSR_VSX@h
+	oris	r5,r5, MSR_VSX@h
 	END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 #endif
-	or	r5, r6, r5			/* Set MSR.FP+.VSX/.VEC */
-	mtmsr	r5
+	mtmsrd	r5
 
 #ifdef CONFIG_ALTIVEC
 	/*
@@ -367,28 +350,20 @@ _GLOBAL(__tm_recheckpoint)
 	 * thread.fp_state[] version holds the 'live' (transactional)
 	 * and will be loaded subsequently by any FPUnavailable trap.
 	 */
-	andis.	r0, r4, MSR_VEC@h
-	beq	dont_restore_vec
-
 	addi	r8, r3, THREAD_CKVRSTATE
 	li	r5, VRSTATE_VSCR
 	lvx	v0, r8, r5
 	mtvscr	v0
 	REST_32VRS(0, r5, r8)			/* r5 scratch, r8 ptr */
-dont_restore_vec:
 	ld	r5, THREAD_CKVRSAVE(r3)
 	mtspr	SPRN_VRSAVE, r5
 #endif
 
-	andi.	r0, r4, MSR_FP
-	beq	dont_restore_fp
-
 	addi	r8, r3, THREAD_CKFPSTATE
 	lfd	fr0, FPSTATE_FPSCR(r8)
 	MTFSF_L(fr0)
 	REST_32FPRS_VSRS(0, R4, R8)
 
-dont_restore_fp:
 	mtmsr	r6				/* FP/Vec off again! */
 
 restore_gprs:
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index e099e61c702b..4e5a543d0b74 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1693,7 +1693,7 @@ void fp_unavailable_tm(struct pt_regs *regs)
 	 * If VMX is in use, the VRs now hold checkpointed values,
 	 * so we don't want to load the VRs from the thread_struct.
 	 */
-	tm_recheckpoint(&current->thread, orig_msr | MSR_FP);
+	tm_recheckpoint(&current->thread);
 
 	/* If VMX is in use, get the transactional values back */
 	if (orig_msr & MSR_VEC) {
@@ -1721,7 +1721,7 @@ void altivec_unavailable_tm(struct pt_regs *regs)
 		 regs->nip, regs->msr);
 	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 	current->thread.load_vec = 1;
-	tm_recheckpoint(&current->thread, orig_msr | MSR_VEC);
+	tm_recheckpoint(&current->thread);
 	current->thread.used_vr = 1;
 
 	if (orig_msr & MSR_FP) {
@@ -1733,8 +1733,6 @@ void altivec_unavailable_tm(struct pt_regs *regs)
 
 void vsx_unavailable_tm(struct pt_regs *regs)
 {
-	unsigned long orig_msr = regs->msr;
-
 	/* See the comments in fp_unavailable_tm().  This works similarly,
 	 * though we're loading both FP and VEC registers in here.
 	 *
@@ -1748,29 +1746,17 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 
 	current->thread.used_vsr = 1;
 
-	/* If FP and VMX are already loaded, we have all the state we need */
-	if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) {
-		regs->msr |= MSR_VSX;
-		return;
-	}
-
 	/* This reclaims FP and/or VR regs if they're already enabled */
 	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 
 	current->thread.load_vec = 1;
 	current->thread.load_fp = 1;
 
-	/* This loads & recheckpoints FP and VRs; but we have
-	 * to be sure not to overwrite previously-valid state.
-	 */
-	tm_recheckpoint(&current->thread, orig_msr | MSR_FP | MSR_VEC);
-
-	msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC));
+	tm_recheckpoint(&current->thread);
 
-	if (orig_msr & MSR_FP)
-		load_fp_state(&current->thread.fp_state);
-	if (orig_msr & MSR_VEC)
-		load_vr_state(&current->thread.vr_state);
+	msr_check_and_set(MSR_FP | MSR_VEC);
+	load_fp_state(&current->thread.fp_state);
+	load_vr_state(&current->thread.vr_state);
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
-- 
cgit 


From 6f700d38a8c9dc4d786ac4a831073194f17c5076 Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@gmail.com>
Date: Thu, 2 Nov 2017 14:09:06 +1100
Subject: powerpc: Remove facility loadups on transactional {fp, vec, vsx}
 unavailable

After handling a transactional FP, Altivec or VSX unavailable exception.
The return to userspace code will detect that the TIF_RESTORE_TM bit is
set and call restore_tm_state(). restore_tm_state() will call
restore_math() to ensure that the correct facilities are loaded.

This means that all the loadup code in {fp,altivec,vsx}_unavailable_tm()
is doing pointless work and can simply be removed.

Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/traps.c | 30 ------------------------------
 1 file changed, 30 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 4e5a543d0b74..f3eb61be0d30 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1663,12 +1663,6 @@ out:
 
 void fp_unavailable_tm(struct pt_regs *regs)
 {
-	/*
-	 * Save the MSR now because tm_reclaim_current() is likely to
-	 * change it
-	 */
-	unsigned long orig_msr = regs->msr;
-
 	/* Note:  This does not handle any kind of FP laziness. */
 
 	TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n",
@@ -1694,24 +1688,10 @@ void fp_unavailable_tm(struct pt_regs *regs)
 	 * so we don't want to load the VRs from the thread_struct.
 	 */
 	tm_recheckpoint(&current->thread);
-
-	/* If VMX is in use, get the transactional values back */
-	if (orig_msr & MSR_VEC) {
-		msr_check_and_set(MSR_VEC);
-		load_vr_state(&current->thread.vr_state);
-		/* At this point all the VSX state is loaded, so enable it */
-		regs->msr |= MSR_VSX;
-	}
 }
 
 void altivec_unavailable_tm(struct pt_regs *regs)
 {
-	/*
-	 * Save the MSR now because tm_reclaim_current() is likely to
-	 * change it
-	 */
-	unsigned long orig_msr = regs->msr;
-
 	/* See the comments in fp_unavailable_tm().  This function operates
 	 * the same way.
 	 */
@@ -1723,12 +1703,6 @@ void altivec_unavailable_tm(struct pt_regs *regs)
 	current->thread.load_vec = 1;
 	tm_recheckpoint(&current->thread);
 	current->thread.used_vr = 1;
-
-	if (orig_msr & MSR_FP) {
-		msr_check_and_set(MSR_FP);
-		load_fp_state(&current->thread.fp_state);
-		regs->msr |= MSR_VSX;
-	}
 }
 
 void vsx_unavailable_tm(struct pt_regs *regs)
@@ -1753,10 +1727,6 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 	current->thread.load_fp = 1;
 
 	tm_recheckpoint(&current->thread);
-
-	msr_check_and_set(MSR_FP | MSR_VEC);
-	load_fp_state(&current->thread.fp_state);
-	load_vr_state(&current->thread.vr_state);
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
-- 
cgit 


From b6b3755e9bec9c686a34ec81eacced0075370cbc Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 3 Nov 2017 15:13:19 +1100
Subject: powerpc: add POWER9_DD20 feature

Cc: Michael Neuling <mikey@neuling.org>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cputable.h |  5 ++++-
 arch/powerpc/kernel/cputable.c      | 20 ++++++++++++++++++++
 arch/powerpc/kernel/dt_cpu_ftrs.c   |  2 ++
 3 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index e188dd57bf51..1ca26ca85ec6 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -215,6 +215,7 @@ enum {
 #define CPU_FTR_DABRX			LONG_ASM_CONST(0x0800000000000000)
 #define CPU_FTR_PMAO_BUG		LONG_ASM_CONST(0x1000000000000000)
 #define CPU_FTR_POWER9_DD1		LONG_ASM_CONST(0x4000000000000000)
+#define CPU_FTR_POWER9_DD20		LONG_ASM_CONST(0x8000000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -477,6 +478,7 @@ enum {
 	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
 #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
 			     (~CPU_FTR_SAO))
+#define CPU_FTRS_POWER9_DD20 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD20)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -495,7 +497,8 @@ enum {
 	    (CPU_FTRS_POWER4 | CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
 	     CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
 	     CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \
-	     CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD1)
+	     CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | \
+	     CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD20)
 #endif
 #else
 enum {
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 760872916013..171820190de7 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -547,6 +547,26 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check_early	= __machine_check_early_realmode_p9,
 		.platform		= "power9",
 	},
+	{	/* Power9 DD2.0 */
+		.pvr_mask		= 0xffffefff,
+		.pvr_value		= 0x004e0200,
+		.cpu_name		= "POWER9 (raw)",
+		.cpu_features		= CPU_FTRS_POWER9_DD20,
+		.cpu_user_features	= COMMON_USER_POWER9,
+		.cpu_user_features2	= COMMON_USER2_POWER9,
+		.mmu_features		= MMU_FTRS_POWER9,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power9",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power9,
+		.cpu_restore		= __restore_cpu_power9,
+		.flush_tlb		= __flush_tlb_power9,
+		.machine_check_early	= __machine_check_early_realmode_p9,
+		.platform		= "power9",
+	},
 	{	/* Power9 */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x004e0000,
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index c7bfcacc2cb6..a7ad181d0deb 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -735,6 +735,8 @@ static __init void cpufeatures_cpu_quirks(void)
 	 */
 	if ((version & 0xffffff00) == 0x004e0100)
 		cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1;
+	else if ((version & 0xffffefff) == 0x004e0200)
+		cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD20;
 }
 
 static void __init cpufeatures_setup_finished(void)
-- 
cgit 


From 9d2f510a66ec60411536b83dab831ee091c84307 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 3 Nov 2017 15:13:20 +1100
Subject: powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 ERAT workaround on DD2.1

DD2.1 does not have to flush the ERAT after a state-loss idle.

Performance testing was done on a DD2.1 using only the stop0 idle state
(the shallowest state which supports state loss), using context_switch
selftest configured to ping-poing between two threads on the same core
and two different cores.

Performance improvement for same core is 7.0%, different cores is 14.8%.

Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/idle_book3s.S | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 175d49f468af..d77766814090 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -542,7 +542,9 @@ pnv_restore_hyp_resource_arch300:
 	 * then clear bit 60 in MMCRA to ensure the PMU starts running.
 	 */
 	blt	cr3,1f
+BEGIN_FTR_SECTION
 	PPC_INVALIDATE_ERAT
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20)
 	ld	r1,PACAR1(r13)
 	mfspr	r4,SPRN_MMCRA
 	ori	r4,r4,(1 << (63-60))
-- 
cgit 


From e3646330cf6667d5cced780ade80f67850c81872 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 3 Nov 2017 15:13:21 +1100
Subject: powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 PMU workaround on DD2.1

DD2.1 does not have to save MMCR0 for all state-loss idle states,
only after deep idle states (like other PMU registers).

Reviewed-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/idle_book3s.S | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index d77766814090..59657783d1d5 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -112,12 +112,14 @@ power9_save_additional_sprs:
 	std	r4, STOP_HFSCR(r13)
 
 	mfspr	r3, SPRN_MMCRA
-	mfspr	r4, SPRN_MMCR1
+	mfspr	r4, SPRN_MMCR0
 	std	r3, STOP_MMCRA(r13)
-	std	r4, STOP_MMCR1(r13)
+	std	r4, _MMCR0(r1)
 
-	mfspr	r3, SPRN_MMCR2
-	std	r3, STOP_MMCR2(r13)
+	mfspr	r3, SPRN_MMCR1
+	mfspr	r4, SPRN_MMCR2
+	std	r3, STOP_MMCR1(r13)
+	std	r4, STOP_MMCR2(r13)
 	blr
 
 power9_restore_additional_sprs:
@@ -135,11 +137,14 @@ power9_restore_additional_sprs:
 	ld	r4, STOP_MMCRA(r13)
 	mtspr	SPRN_HFSCR, r3
 	mtspr	SPRN_MMCRA, r4
-	/* We have already restored PACA_MMCR0 */
-	ld	r3, STOP_MMCR1(r13)
-	ld	r4, STOP_MMCR2(r13)
-	mtspr	SPRN_MMCR1, r3
-	mtspr	SPRN_MMCR2, r4
+
+	ld	r3, _MMCR0(r1)
+	ld	r4, STOP_MMCR1(r13)
+	mtspr	SPRN_MMCR0, r3
+	mtspr	SPRN_MMCR1, r4
+
+	ld	r3, STOP_MMCR2(r13)
+	mtspr	SPRN_MMCR2, r3
 	blr
 
 /*
@@ -350,6 +355,7 @@ power_enter_stop:
 	b 	pnv_wakeup_noloss
 
 .Lhandle_esl_ec_set:
+BEGIN_FTR_SECTION
 	/*
 	 * POWER9 DD2 can incorrectly set PMAO when waking up after a
 	 * state-loss idle. Saving and restoring MMCR0 over idle is a
@@ -357,6 +363,7 @@ power_enter_stop:
 	 */
 	mfspr	r4,SPRN_MMCR0
 	std	r4,_MMCR0(r1)
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20)
 
 /*
  * Check if the requested state is a deep idle state.
@@ -544,15 +551,15 @@ pnv_restore_hyp_resource_arch300:
 	blt	cr3,1f
 BEGIN_FTR_SECTION
 	PPC_INVALIDATE_ERAT
-END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20)
 	ld	r1,PACAR1(r13)
+	ld	r4,_MMCR0(r1)
+	mtspr	SPRN_MMCR0,r4
+END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20)
 	mfspr	r4,SPRN_MMCRA
 	ori	r4,r4,(1 << (63-60))
 	mtspr	SPRN_MMCRA,r4
 	xori	r4,r4,(1 << (63-60))
 	mtspr	SPRN_MMCRA,r4
-	ld	r4,_MMCR0(r1)
-	mtspr	SPRN_MMCR0,r4
 1:
 	/*
 	 * POWER ISA 3. Use PSSCR to determine if we
-- 
cgit 


From cd77b5ce208c153260ed7882d8910f2395bfaabd Mon Sep 17 00:00:00 2001
From: Shriya <shriyak@linux.vnet.ibm.com>
Date: Fri, 13 Oct 2017 10:06:41 +0530
Subject: powerpc/powernv/cpufreq: Fix the frequency read by /proc/cpuinfo

The call to /proc/cpuinfo in turn calls cpufreq_quick_get() which
returns the last frequency requested by the kernel, but may not
reflect the actual frequency the processor is running at. This patch
makes a call to cpufreq_get() instead which returns the current
frequency reported by the hardware.

Fixes: fb5153d05a7d ("powerpc: powernv: Implement ppc_md.get_proc_freq()")
Signed-off-by: Shriya <shriyak@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index d23f148a11f0..62f4a5ad8594 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -335,7 +335,7 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu)
 {
 	unsigned long ret_freq;
 
-	ret_freq = cpufreq_quick_get(cpu) * 1000ul;
+	ret_freq = cpufreq_get(cpu) * 1000ul;
 
 	/*
 	 * If the backend cpufreq driver does not exist,
-- 
cgit 


From 9003a249815a15704f415954039d1c7ea27da9ad Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 7 Nov 2017 14:43:01 +1100
Subject: powerpc/powernv/ioda: Remove explicit max window size check

DMA windows can only have a size of power of two on IODA2 hardware and
using memory_hotplug_max() to determine the upper limit won't work
correcly if it returns not power of two value.

This removes the check as the platform code does this check in
pnv_pci_ioda2_setup_default_config() anyway; the other client is VFIO
and that thing checks against locked_vm limit which prevents the userspace
from locking too much memory.

It is expected to impact DPDK on machines with non-power-of-two RAM size,
mostly. KVM guests are less likely to be affected as usually guests get
less than half of hosts RAM.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7e87867984e7..749055553064 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2797,7 +2797,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
 	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
 		return -EINVAL;
 
-	if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+	if (!is_power_of_2(window_size))
 		return -EINVAL;
 
 	/* Adjust direct table size from window_size and levels */
-- 
cgit 


From bf751e30b41cdf2568cccaf5de4156a3d78683df Mon Sep 17 00:00:00 2001
From: Michal Suchanek <msuchanek@suse.de>
Date: Tue, 11 Jul 2017 13:26:26 +0200
Subject: powerpc/mm/hash: Remove stale comment.

In commit e6f81a92015b ("powerpc/mm/hash: Support 68 bit VA") the
masking is folded into ASM_VSID_SCRAMBLE but the comment about masking
is removed only from the firt use of ASM_VSID_SCRAMBLE.

Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/slb_low.S | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 906a86fe457b..ed60ad861dfa 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -309,10 +309,6 @@ slb_compare_rr_to_size:
 	srdi	r10,r10,(SID_SHIFT_1T - SID_SHIFT)	/* get 1T ESID */
 	rldimi  r10,r9,ESID_BITS_1T,0
 	ASM_VSID_SCRAMBLE(r10,r9,r11,1T)
-	/*
-	 * bits above VSID_BITS_1T need to be ignored from r10
-	 * also combine VSID and flags
-	 */
 
 	li	r10,MMU_SEGSIZE_1T
 	rldimi	r11,r10,SLB_VSID_SSIZE_SHIFT,0	/* insert segment size */
-- 
cgit 


From 80eff6c484799722736471d15ff9cc86b64cae7a Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Mon, 30 Oct 2017 22:01:12 +1100
Subject: powerpc/xmon: Support dumping software pagetables

It would be nice to be able to dump page tables in a particular
context.

eg: dumping vmalloc space:

  0:mon> dv 0xd00037fffff00000
  pgd  @ 0xc0000000017c0000
  pgdp @ 0xc0000000017c00d8 = 0x00000000f10b1000
  pudp @ 0xc0000000f10b13f8 = 0x00000000f10d0000
  pmdp @ 0xc0000000f10d1ff8 = 0x00000000f1102000
  ptep @ 0xc0000000f1102780 = 0xc0000000f1ba018e
  Maps physical address = 0x00000000f1ba0000
  Flags = Accessed Dirty Read Write

This patch does not replicate the complex code of dump_pagetable and
has no support for bolted linear mapping, thats why I've it's called
dump virtual page table support. The format of the PTE can be expanded
even further to add more useful information about the flags in the PTE
if required.

Signed-off-by: Balbir Singh <bsingharora@gmail.com>
[mpe: Bike shed the output format, show the pgdir, fix build failures]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/xmon/xmon.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 6b1236b85622..1b2d8cb49abb 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -28,6 +28,7 @@
 #include <linux/bug.h>
 #include <linux/nmi.h>
 #include <linux/ctype.h>
+#include <linux/highmem.h>
 
 #include <asm/debugfs.h>
 #include <asm/ptrace.h>
@@ -127,6 +128,7 @@ static void byterev(unsigned char *, int);
 static void memex(void);
 static int bsesc(void);
 static void dump(void);
+static void show_pte(unsigned long);
 static void prdump(unsigned long, long);
 static int ppc_inst_dump(unsigned long, long, int);
 static void dump_log_buf(void);
@@ -234,6 +236,7 @@ Commands:\n\
 #endif
   "\
   dr	dump stream of raw bytes\n\
+  dv	dump virtual address translation \n\
   dt	dump the tracing buffers (uses printk)\n\
   dtc	dump the tracing buffers for current CPU (uses printk)\n\
 "
@@ -2613,6 +2616,9 @@ dump(void)
 		dump_log_buf();
 	} else if (c == 'o') {
 		dump_opal_msglog();
+	} else if (c == 'v') {
+		/* dump virtual to physical translation */
+		show_pte(adrs);
 	} else if (c == 'r') {
 		scanhex(&ndump);
 		if (ndump == 0)
@@ -2946,6 +2952,116 @@ static void show_task(struct task_struct *tsk)
 		tsk->comm);
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+void format_pte(void *ptep, unsigned long pte)
+{
+	printf("ptep @ 0x%016lx = 0x%016lx\n", (unsigned long)ptep, pte);
+	printf("Maps physical address = 0x%016lx\n", pte & PTE_RPN_MASK);
+
+	printf("Flags = %s%s%s%s%s\n",
+	       (pte & _PAGE_ACCESSED) ? "Accessed " : "",
+	       (pte & _PAGE_DIRTY)    ? "Dirty " : "",
+	       (pte & _PAGE_READ)     ? "Read " : "",
+	       (pte & _PAGE_WRITE)    ? "Write " : "",
+	       (pte & _PAGE_EXEC)     ? "Exec " : "");
+}
+
+static void show_pte(unsigned long addr)
+{
+	unsigned long tskv = 0;
+	struct task_struct *tsk = NULL;
+	struct mm_struct *mm;
+	pgd_t *pgdp, *pgdir;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+
+	if (!scanhex(&tskv))
+		mm = &init_mm;
+	else
+		tsk = (struct task_struct *)tskv;
+
+	if (tsk == NULL)
+		mm = &init_mm;
+	else
+		mm = tsk->active_mm;
+
+	if (setjmp(bus_error_jmp) != 0) {
+		catch_memory_errors = 0;
+		printf("*** Error dumping pte for task %p\n", tsk);
+		return;
+	}
+
+	catch_memory_errors = 1;
+	sync();
+
+	if (mm == &init_mm) {
+		pgdp = pgd_offset_k(addr);
+		pgdir = pgd_offset_k(0);
+	} else {
+		pgdp = pgd_offset(mm, addr);
+		pgdir = pgd_offset(mm, 0);
+	}
+
+	if (pgd_none(*pgdp)) {
+		printf("no linux page table for address\n");
+		return;
+	}
+
+	printf("pgd  @ 0x%016lx\n", pgdir);
+
+	if (pgd_huge(*pgdp)) {
+		format_pte(pgdp, pgd_val(*pgdp));
+		return;
+	}
+	printf("pgdp @ 0x%016lx = 0x%016lx\n", pgdp, pgd_val(*pgdp));
+
+	pudp = pud_offset(pgdp, addr);
+
+	if (pud_none(*pudp)) {
+		printf("No valid PUD\n");
+		return;
+	}
+
+	if (pud_huge(*pudp)) {
+		format_pte(pudp, pud_val(*pudp));
+		return;
+	}
+
+	printf("pudp @ 0x%016lx = 0x%016lx\n", pudp, pud_val(*pudp));
+
+	pmdp = pmd_offset(pudp, addr);
+
+	if (pmd_none(*pmdp)) {
+		printf("No valid PMD\n");
+		return;
+	}
+
+	if (pmd_huge(*pmdp)) {
+		format_pte(pmdp, pmd_val(*pmdp));
+		return;
+	}
+	printf("pmdp @ 0x%016lx = 0x%016lx\n", pmdp, pmd_val(*pmdp));
+
+	ptep = pte_offset_map(pmdp, addr);
+	if (pte_none(*ptep)) {
+		printf("no valid PTE\n");
+		return;
+	}
+
+	format_pte(ptep, pte_val(*ptep));
+
+	sync();
+	__delay(200);
+	catch_memory_errors = 0;
+}
+#else
+static void show_pte(unsigned long addr)
+{
+	printf("show_pte not yet implemented\n");
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 static void show_tasks(void)
 {
 	unsigned long tskv;
-- 
cgit 


From 14001c60939a754717893672209160cec1e2ac18 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 7 Nov 2017 18:53:05 +1100
Subject: powerpc/64s/radix: Optimize TLB range flush barriers

Short range flushes issue a sequences of tlbie(l) instructions for
individual effective addresses. These do not all require individual
barrier sequences, only one covering all tlbie(l) instructions.

Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync")
made a similar optimization for tlbiel for PID flushing.

For tlbie, the ISA says:

    The tlbsync instruction provides an ordering function for the
    effects of all tlbie instructions executed by the thread executing
    the tlbsync instruction, with respect to the memory barrier
    created by a subsequent ptesync instruction executed by the same
    thread.

Time to munmap 30 pages of memory (after mmap, touch):
         local   global
vanilla  10.9us  22.3us
patched   3.4us  14.4us

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 63e277b6e60c..22b657e4b01a 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -84,7 +84,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
-static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
 			      unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
@@ -95,14 +95,20 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid,
 	prs = 1; /* process scoped */
 	r = 1;   /* raidx format */
 
-	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	asm volatile("ptesync": : :"memory");
 	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
+static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+			      unsigned long ap, unsigned long ric)
+{
+	asm volatile("ptesync": : :"memory");
+	__tlbiel_va(va, pid, ap, ric);
+	asm volatile("ptesync": : :"memory");
+}
+
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
 			     unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
@@ -113,13 +119,20 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
 	prs = 1; /* process scoped */
 	r = 1;   /* raidx format */
 
-	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+			     unsigned long ap, unsigned long ric)
+{
+	asm volatile("ptesync": : :"memory");
+	__tlbie_va(va, pid, ap, ric);
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+
 /*
  * Base TLB flushing operations:
  *
@@ -341,13 +354,17 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 		else
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
 	} else {
+		asm volatile("ptesync": : :"memory");
 		for (addr = start; addr < end; addr += page_size) {
-
 			if (local)
-				_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+				__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
 			else
-				_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+				__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
 		}
+		if (local)
+			asm volatile("ptesync": : :"memory");
+		else
+			asm volatile("eieio; tlbsync; ptesync": : :"memory");
 	}
 	preempt_enable();
 }
@@ -378,6 +395,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		_tlbie_pid(pid, RIC_FLUSH_PWC);
 
 	/* Then iterate the pages */
+	asm volatile("ptesync": : :"memory");
 	end = addr + HPAGE_PMD_SIZE;
 	for (; addr < end; addr += PAGE_SIZE) {
 		if (local)
@@ -386,6 +404,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
 	}
 
+	if (local)
+		asm volatile("ptesync": : :"memory");
+	else
+		asm volatile("eieio; tlbsync; ptesync": : :"memory");
+
 	preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
cgit 


From d665767e39fa4a9e725f92d77ba2060c5ce273dc Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 7 Nov 2017 18:53:06 +1100
Subject: powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions

Move the barriers and range iteration down into the _tlbie* level,
which improves readability.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 69 +++++++++++++++++++++++++--------------------
 1 file changed, 39 insertions(+), 30 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 22b657e4b01a..9916ea2fff43 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -85,7 +85,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 }
 
 static inline void __tlbiel_va(unsigned long va, unsigned long pid,
-			      unsigned long ap, unsigned long ric)
+			       unsigned long ap, unsigned long ric)
 {
 	unsigned long rb,rs,prs,r;
 
@@ -101,13 +101,28 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid,
 }
 
 static inline void _tlbiel_va(unsigned long va, unsigned long pid,
-			      unsigned long ap, unsigned long ric)
+			      unsigned long psize, unsigned long ric)
 {
+	unsigned long ap = mmu_get_ap(psize);
+
 	asm volatile("ptesync": : :"memory");
 	__tlbiel_va(va, pid, ap, ric);
 	asm volatile("ptesync": : :"memory");
 }
 
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	asm volatile("ptesync": : :"memory");
+	for (addr = start; addr < end; addr += page_size)
+		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+	asm volatile("ptesync": : :"memory");
+}
+
 static inline void __tlbie_va(unsigned long va, unsigned long pid,
 			     unsigned long ap, unsigned long ric)
 {
@@ -125,13 +140,27 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
 }
 
 static inline void _tlbie_va(unsigned long va, unsigned long pid,
-			     unsigned long ap, unsigned long ric)
+			      unsigned long psize, unsigned long ric)
 {
+	unsigned long ap = mmu_get_ap(psize);
+
 	asm volatile("ptesync": : :"memory");
 	__tlbie_va(va, pid, ap, ric);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	asm volatile("ptesync": : :"memory");
+	for (addr = start; addr < end; addr += page_size)
+		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
 
 /*
  * Base TLB flushing operations:
@@ -174,12 +203,11 @@ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmadd
 				       int psize)
 {
 	unsigned long pid;
-	unsigned long ap = mmu_get_ap(psize);
 
 	preempt_disable();
 	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
-		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	preempt_enable();
 }
 
@@ -239,7 +267,6 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 				 int psize)
 {
 	unsigned long pid;
-	unsigned long ap = mmu_get_ap(psize);
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
@@ -247,9 +274,9 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 
 	preempt_disable();
 	if (!mm_is_thread_local(mm))
-		_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+		_tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	else
-		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+		_tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
 	preempt_enable();
 }
 
@@ -336,9 +363,7 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 				  unsigned long end, int psize)
 {
 	unsigned long pid;
-	unsigned long addr;
 	bool local;
-	unsigned long ap = mmu_get_ap(psize);
 	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
 	pid = mm->context.id;
@@ -354,17 +379,10 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 		else
 			_tlbie_pid(pid, RIC_FLUSH_TLB);
 	} else {
-		asm volatile("ptesync": : :"memory");
-		for (addr = start; addr < end; addr += page_size) {
-			if (local)
-				__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-			else
-				__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-		}
 		if (local)
-			asm volatile("ptesync": : :"memory");
+			_tlbiel_va_range(start, end, pid, page_size, psize);
 		else
-			asm volatile("eieio; tlbsync; ptesync": : :"memory");
+			_tlbie_va_range(start, end, pid, page_size, psize);
 	}
 	preempt_enable();
 }
@@ -372,7 +390,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
-	unsigned long ap = mmu_get_ap(mmu_virtual_psize);
 	unsigned long pid, end;
 	bool local;
 
@@ -395,19 +412,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		_tlbie_pid(pid, RIC_FLUSH_PWC);
 
 	/* Then iterate the pages */
-	asm volatile("ptesync": : :"memory");
 	end = addr + HPAGE_PMD_SIZE;
-	for (; addr < end; addr += PAGE_SIZE) {
-		if (local)
-			_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-		else
-			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-	}
-
 	if (local)
-		asm volatile("ptesync": : :"memory");
+		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
 	else
-		asm volatile("eieio; tlbsync; ptesync": : :"memory");
+		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
 
 	preempt_enable();
 }
-- 
cgit 


From cbf09c837720f72f5e63ab7a2d331ec6cc9a3417 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 7 Nov 2017 18:53:07 +1100
Subject: powerpc/64s/radix: Optimize flush_tlb_range

Currently for radix, flush_tlb_range flushes the entire PID, because
the Linux mm code does not tell us about page size here for THP vs
regular pages. This is quite sub-optimal for small mremap / mprotect
/ change_protection.

So implement va range flushes with two flush passes, one for each
page size (regular and THP). The second flush has an order of matnitude
fewer tlbie instructions than the first, so it is a relatively small
additional cost.

There is still room for improvement here with some changes to generic
APIs, particularly if there are mostly THP pages to be invalidated,
the small page flushes could be reduced.

Time to mprotect 1 page of memory (after mmap, touch):
vanilla 2.9us   1.8us
patched 1.2us   1.6us

Time to mprotect 30 pages of memory (after mmap, touch):
vanilla 8.2us   7.2us
patched 6.9us   17.9us

Time to mprotect 34 pages of memory (after mmap, touch):
vanilla 9.1us   8.0us
patched 9.0us   8.0us

34 pages is the point at which the invalidation switches from va
to entire PID, which tlbie can do in a single instruction. This is
why in the case of 30 pages, the new code runs slower for this test.
This is a deliberate tradeoff already present in the unmap and THP
promotion code, the idea is that the benefit from avoiding flushing
entire TLB for this PID on all threads in the system.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 138 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 103 insertions(+), 35 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 9916ea2fff43..b4b49de551a9 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -100,6 +100,17 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid,
 	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbiel_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	for (addr = start; addr < end; addr += page_size)
+		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
 static inline void _tlbiel_va(unsigned long va, unsigned long pid,
 			      unsigned long psize, unsigned long ric)
 {
@@ -114,12 +125,8 @@ static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
 				    unsigned long psize)
 {
-	unsigned long addr;
-	unsigned long ap = mmu_get_ap(psize);
-
 	asm volatile("ptesync": : :"memory");
-	for (addr = start; addr < end; addr += page_size)
-		__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+	__tlbiel_va_range(start, end, pid, page_size, psize);
 	asm volatile("ptesync": : :"memory");
 }
 
@@ -139,6 +146,17 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid,
 	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbie_va_range(unsigned long start, unsigned long end,
+				    unsigned long pid, unsigned long page_size,
+				    unsigned long psize)
+{
+	unsigned long addr;
+	unsigned long ap = mmu_get_ap(psize);
+
+	for (addr = start; addr < end; addr += page_size)
+		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+}
+
 static inline void _tlbie_va(unsigned long va, unsigned long pid,
 			      unsigned long psize, unsigned long ric)
 {
@@ -153,12 +171,8 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
 				    unsigned long psize)
 {
-	unsigned long addr;
-	unsigned long ap = mmu_get_ap(psize);
-
 	asm volatile("ptesync": : :"memory");
-	for (addr = start; addr < end; addr += page_size)
-		__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+	__tlbie_va_range(start, end, pid, page_size, psize);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
@@ -300,17 +314,78 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
 }
 EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 
+#define TLB_FLUSH_ALL -1UL
+
 /*
- * Currently, for range flushing, we just do a full mm flush. Because
- * we use this in code path where we don' track the page size.
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
  */
+static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+
 void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		     unsigned long end)
 
 {
 	struct mm_struct *mm = vma->vm_mm;
+	unsigned long pid;
+	unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
+	unsigned long page_size = 1UL << page_shift;
+	unsigned long nr_pages = (end - start) >> page_shift;
+	bool local, full;
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (is_vm_hugetlb_page(vma))
+		return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
 
-	radix__flush_tlb_mm(mm);
+	pid = mm->context.id;
+	if (unlikely(pid == MMU_NO_CONTEXT))
+		return;
+
+	preempt_disable();
+	local = mm_is_thread_local(mm);
+	full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+
+	if (full) {
+		if (local)
+			_tlbiel_pid(pid, RIC_FLUSH_TLB);
+		else
+			_tlbie_pid(pid, RIC_FLUSH_TLB);
+	} else {
+		bool hflush = false;
+		unsigned long hstart, hend;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+		hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
+		hend = end >> HPAGE_PMD_SHIFT;
+		if (hstart < hend) {
+			hstart <<= HPAGE_PMD_SHIFT;
+			hend <<= HPAGE_PMD_SHIFT;
+			hflush = true;
+		}
+#endif
+
+		asm volatile("ptesync": : :"memory");
+		if (local) {
+			__tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize);
+			if (hflush)
+				__tlbiel_va_range(hstart, hend, pid,
+						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+			asm volatile("ptesync": : :"memory");
+		} else {
+			__tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize);
+			if (hflush)
+				__tlbie_va_range(hstart, hend, pid,
+						HPAGE_PMD_SIZE, MMU_PAGE_2M);
+			asm volatile("eieio; tlbsync; ptesync": : :"memory");
+		}
+	}
+	preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_range);
 
@@ -352,19 +427,14 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 		radix__flush_tlb_mm(mm);
 }
 
-#define TLB_FLUSH_ALL -1UL
-/*
- * Number of pages above which we will do a bcast tlbie. Just a
- * number at this point copied from x86
- */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
-
 void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 				  unsigned long end, int psize)
 {
 	unsigned long pid;
-	bool local;
-	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
+	unsigned int page_shift = mmu_psize_defs[psize].shift;
+	unsigned long page_size = 1UL << page_shift;
+	unsigned long nr_pages = (end - start) >> page_shift;
+	bool local, full;
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
@@ -372,8 +442,9 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 
 	preempt_disable();
 	local = mm_is_thread_local(mm);
-	if (end == TLB_FLUSH_ALL ||
-	    (end - start) > tlb_single_page_flush_ceiling * page_size) {
+	full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+
+	if (full) {
 		if (local)
 			_tlbiel_pid(pid, RIC_FLUSH_TLB);
 		else
@@ -391,7 +462,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	unsigned long pid, end;
-	bool local;
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
@@ -403,20 +473,18 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 		return;
 	}
 
+	end = addr + HPAGE_PMD_SIZE;
+
+	/* Otherwise first do the PWC, then iterate the pages. */
 	preempt_disable();
-	local = mm_is_thread_local(mm);
-	/* Otherwise first do the PWC */
-	if (local)
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
-	else
-		_tlbie_pid(pid, RIC_FLUSH_PWC);
 
-	/* Then iterate the pages */
-	end = addr + HPAGE_PMD_SIZE;
-	if (local)
+	if (mm_is_thread_local(mm)) {
+		_tlbiel_pid(pid, RIC_FLUSH_PWC);
 		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
-	else
+	} else {
+		_tlbie_pid(pid, RIC_FLUSH_PWC);
 		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+	}
 
 	preempt_enable();
 }
-- 
cgit 


From f6f27951fdf84a6edca3ea14077268ad629b57ac Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 7 Nov 2017 18:53:08 +1100
Subject: powerpc/64s/radix: Introduce local single page ceiling for TLB range
 flush

The single page flush ceiling is the cut-off point at which we switch
from invalidating individual pages, to invalidating the entire process
address space in response to a range flush.

Introduce a local variant of this heuristic because local and global
tlbie have significantly different properties:
- Local tlbiel requires 128 instructions to invalidate a PID, global
  tlbie only 1 instruction.
- Global tlbie instructions are expensive broadcast operations.

The local ceiling has been made much higher, 2x the number of
instructions required to invalidate the entire PID (i.e., 256 pages).

     Time to mprotect N pages of memory (after mmap, touch), local invalidate:
     N           32     34      64     128     256     512
     vanilla  7.4us  9.0us  14.6us  26.4us  50.2us  98.3us
     patched  7.4us  7.8us  13.8us  26.4us  51.9us  98.3us

The behaviour of both is identical at N=32 and N=512. Between there,
the vanilla kernel does a PID invalidate and the patched kernel does
a va range invalidate.

At N=128, these require the same number of tlbiel instructions, so
the patched version can be sen to be cheaper when < 128, and more
expensive when > 128. However this does not well capture the cost
of invalidated TLB.

The additional cost at 256 pages does not seem prohibitive. It may
be the case that increasing the limit further would continue to be
beneficial to avoid invalidating all of the process's TLB entries.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index b4b49de551a9..cfa08da534a7 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -326,6 +326,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
  * individual page flushes to full-pid flushes.
  */
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2;
 
 void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		     unsigned long end)
@@ -348,8 +349,15 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		return;
 
 	preempt_disable();
-	local = mm_is_thread_local(mm);
-	full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+	if (mm_is_thread_local(mm)) {
+		local = true;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_local_single_page_flush_ceiling);
+	} else {
+		local = false;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_single_page_flush_ceiling);
+	}
 
 	if (full) {
 		if (local)
@@ -441,8 +449,15 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 		return;
 
 	preempt_disable();
-	local = mm_is_thread_local(mm);
-	full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling);
+	if (mm_is_thread_local(mm)) {
+		local = true;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_local_single_page_flush_ceiling);
+	} else {
+		local = false;
+		full = (end == TLB_FLUSH_ALL ||
+				nr_pages > tlb_single_page_flush_ceiling);
+	}
 
 	if (full) {
 		if (local)
-- 
cgit 


From 0b2f5a8a792755c88bd786f89712a9fac9967b2b Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 7 Nov 2017 18:53:09 +1100
Subject: powerpc/64s/radix: Improve TLB flushing for page table freeing

Unmaps that free page tables always flush the entire PID, which is
sub-optimal. Provide TLB range flushing with an additional PWC flush
that can be use for va range invalidations with PWC flush.

     Time to munmap N pages of memory including last level page table
     teardown (after mmap, touch), local invalidate:
     N           1       2      4      8     16     32     64
     vanilla  3.2us  3.3us  3.4us  3.6us  4.1us  5.2us  7.2us
     patched  1.4us  1.5us  1.7us  1.9us  2.6us  3.7us  6.2us

     Global invalidate:
     N           1       2      4      8     16      32     64
     vanilla  2.2us  2.3us  2.4us  2.6us  3.2us   4.1us  6.2us
     patched  2.1us  2.5us  3.4us  5.2us  8.7us  15.7us  6.2us

Local invalidates get much better across the board. Global ones have
the same issue where multiple tlbies for va flush do get slower than
the single tlbie to invalidate the PID. None of this test captures
the TLB benefits of avoiding killing everything.

Global gets worse, but it is brought in to line with global invalidate
for munmap()s that do not free page tables.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/tlb-radix.c | 90 ++++++++++++++++++++++++++++++---------------
 1 file changed, 61 insertions(+), 29 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index cfa08da534a7..884f4b705b57 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -39,6 +39,20 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
 	trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
+static inline void __tlbie_pid(unsigned long pid, unsigned long ric)
+{
+	unsigned long rb,rs,prs,r;
+
+	rb = PPC_BIT(53); /* IS = 1 */
+	rs = pid << PPC_BITLSHIFT(31);
+	prs = 1; /* process scoped */
+	r = 1;   /* raidx format */
+
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	trace_tlbie(0, 0, rb, rs, ric, prs, r);
+}
+
 /*
  * We use 128 set in radix mode and 256 set in hpt mode.
  */
@@ -70,18 +84,9 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 
 static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
 {
-	unsigned long rb,rs,prs,r;
-
-	rb = PPC_BIT(53); /* IS = 1 */
-	rs = pid << PPC_BITLSHIFT(31);
-	prs = 1; /* process scoped */
-	r = 1;   /* raidx format */
-
 	asm volatile("ptesync": : :"memory");
-	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
-		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
+	__tlbie_pid(pid, ric);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
-	trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
 static inline void __tlbiel_va(unsigned long va, unsigned long pid,
@@ -123,9 +128,11 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid,
 
 static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
-				    unsigned long psize)
+				    unsigned long psize, bool also_pwc)
 {
 	asm volatile("ptesync": : :"memory");
+	if (also_pwc)
+		__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
 	__tlbiel_va_range(start, end, pid, page_size, psize);
 	asm volatile("ptesync": : :"memory");
 }
@@ -169,9 +176,11 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
 
 static inline void _tlbie_va_range(unsigned long start, unsigned long end,
 				    unsigned long pid, unsigned long page_size,
-				    unsigned long psize)
+				    unsigned long psize, bool also_pwc)
 {
 	asm volatile("ptesync": : :"memory");
+	if (also_pwc)
+		__tlbie_pid(pid, RIC_FLUSH_PWC);
 	__tlbie_va_range(start, end, pid, page_size, psize);
 	asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
@@ -412,13 +421,15 @@ static int radix_get_mmu_psize(int page_size)
 	return psize;
 }
 
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize);
+
 void radix__tlb_flush(struct mmu_gather *tlb)
 {
 	int psize = 0;
 	struct mm_struct *mm = tlb->mm;
 	int page_size = tlb->page_size;
 
-	psize = radix_get_mmu_psize(page_size);
 	/*
 	 * if page size is not something we understand, do a full mm flush
 	 *
@@ -426,17 +437,28 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 	 * that flushes the process table entry cache upon process teardown.
 	 * See the comment for radix in arch_exit_mmap().
 	 */
-	if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
-		radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
-	else if (tlb->fullmm || tlb->need_flush_all) {
-		tlb->need_flush_all = 0;
+	if (tlb->fullmm) {
 		radix__flush_all_mm(mm);
-	} else
-		radix__flush_tlb_mm(mm);
+	} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
+		if (!tlb->need_flush_all)
+			radix__flush_tlb_mm(mm);
+		else
+			radix__flush_all_mm(mm);
+	} else {
+		unsigned long start = tlb->start;
+		unsigned long end = tlb->end;
+
+		if (!tlb->need_flush_all)
+			radix__flush_tlb_range_psize(mm, start, end, psize);
+		else
+			radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+	}
+	tlb->need_flush_all = 0;
 }
 
-void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
-				  unsigned long end, int psize)
+static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm,
+				unsigned long start, unsigned long end,
+				int psize, bool also_pwc)
 {
 	unsigned long pid;
 	unsigned int page_shift = mmu_psize_defs[psize].shift;
@@ -461,18 +483,30 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
 
 	if (full) {
 		if (local)
-			_tlbiel_pid(pid, RIC_FLUSH_TLB);
+			_tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB);
 		else
-			_tlbie_pid(pid, RIC_FLUSH_TLB);
+			_tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB);
 	} else {
 		if (local)
-			_tlbiel_va_range(start, end, pid, page_size, psize);
+			_tlbiel_va_range(start, end, pid, page_size, psize, also_pwc);
 		else
-			_tlbie_va_range(start, end, pid, page_size, psize);
+			_tlbie_va_range(start, end, pid, page_size, psize, also_pwc);
 	}
 	preempt_enable();
 }
 
+void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize)
+{
+	return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start,
+				  unsigned long end, int psize)
+{
+	__radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
@@ -494,11 +528,9 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 	preempt_disable();
 
 	if (mm_is_thread_local(mm)) {
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
-		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+		_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	} else {
-		_tlbie_pid(pid, RIC_FLUSH_PWC);
-		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+		_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true);
 	}
 
 	preempt_enable();
-- 
cgit 


From 339a3293f4e493a6c40f71e4faab0c8389174313 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 23 Oct 2017 18:05:07 +1000
Subject: powerpc/powernv: Avoid waiting for secondary hold spinloop with OPAL

OPAL boot does not insert secondaries at 0x60 to wait at the secondary
hold spinloop. Instead they are started later, and inserted at
generic_secondary_smp_init(), which is after the secondary hold
spinloop.

Avoid waiting on this spinloop when booting with OPAL firmware. This
wait always times out that case.

This saves 100ms boot time on powernv, and 10s of seconds of real time
when booting on the simulator in SMP.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/head_64.S  | 16 +++++++++++-----
 arch/powerpc/kernel/setup_64.c | 10 +++++++++-
 2 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index ff8511d6d8ea..aa71a90f5222 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -55,12 +55,18 @@
  *
  *  For pSeries or server processors:
  *   1. The MMU is off & open firmware is running in real mode.
- *   2. The kernel is entered at __start
+ *   2. The primary CPU enters at __start.
+ *   3. If the RTAS supports "query-cpu-stopped-state", then secondary
+ *      CPUs will enter as directed by "start-cpu" RTAS call, which is
+ *      generic_secondary_smp_init, with PIR in r3.
+ *   4. Else the secondary CPUs will enter at secondary_hold (0x60) as
+ *      directed by the "start-cpu" RTS call, with PIR in r3.
  * -or- For OPAL entry:
- *   1. The MMU is off, processor in HV mode, primary CPU enters at 0
- *      with device-tree in gpr3. We also get OPAL base in r8 and
- *	entry in r9 for debugging purposes
- *   2. Secondary processors enter at 0x60 with PIR in gpr3
+ *   1. The MMU is off, processor in HV mode.
+ *   2. The primary CPU enters at 0 with device-tree in r3, OPAL base
+ *      in r8, and entry in r9 for debugging purposes.
+ *   3. Secondary CPUs enter as directed by OPAL_START_CPU call, which
+ *      is at generic_secondary_smp_init, with PIR in r3.
  *
  *  For Book3E processors:
  *   1. The MMU is on running in AS0 in a state defined in ePAPR
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index b89c6aac48c9..87b4fe75fd45 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -360,8 +360,16 @@ void early_setup_secondary(void)
 #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
 static bool use_spinloop(void)
 {
-	if (!IS_ENABLED(CONFIG_PPC_BOOK3E))
+	if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
+		/*
+		 * See comments in head_64.S -- not all platforms insert
+		 * secondaries at __secondary_hold and wait at the spin
+		 * loop.
+		 */
+		if (firmware_has_feature(FW_FEATURE_OPAL))
+			return false;
 		return true;
+	}
 
 	/*
 	 * When book3e boots from kexec, the ePAPR spin table does
-- 
cgit 


From 1696d0fb7fcd18160c9cc92a3f2b2d68e6923dd8 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Tue, 24 Oct 2017 21:44:44 +1000
Subject: powerpc/64: Set DSCR default initially from SPR

Take the DSCR value set by firmware as the dscr_default value,
rather than zero.

POWER9 recommends DSCR default to a non-zero value.

Signed-off-by: From: Nicholas Piggin <npiggin@gmail.com>
[mpe: Make record_spr_defaults() __init]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/setup.h    |  6 ++++++
 arch/powerpc/kernel/setup_64.c |  9 +++++++++
 arch/powerpc/kernel/sysfs.c    | 11 +++++++++++
 3 files changed, 26 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h
index cfba134b3024..21c18071d9d5 100644
--- a/arch/powerpc/kernel/setup.h
+++ b/arch/powerpc/kernel/setup.h
@@ -45,6 +45,12 @@ void emergency_stack_init(void);
 static inline void emergency_stack_init(void) { };
 #endif
 
+#ifdef CONFIG_PPC64
+void record_spr_defaults(void);
+#else
+static inline void record_spr_defaults(void) { };
+#endif
+
 /*
  * Having this in kvm_ppc.h makes include dependencies too
  * tricky to solve for setup-common.c so have it here.
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 87b4fe75fd45..8956a9856604 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -69,6 +69,8 @@
 #include <asm/opal.h>
 #include <asm/cputhreads.h>
 
+#include "setup.h"
+
 #ifdef DEBUG
 #define DBG(fmt...) udbg_printf(fmt)
 #else
@@ -316,6 +318,13 @@ void __init early_setup(unsigned long dt_ptr)
 	/* Initialize the hash table or TLB handling */
 	early_init_mmu();
 
+	/*
+	 * After firmware and early platform setup code has set things up,
+	 * we note the SPR values for configurable control/performance
+	 * registers, and use those as initial defaults.
+	 */
+	record_spr_defaults();
+
 	/*
 	 * At this point, we can let interrupts switch to virtual mode
 	 * (the MMU has been setup), so adjust the MSR in the PACA to
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 4437c70c7c2b..b8d4a1dac39f 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -590,6 +590,17 @@ static void sysfs_create_dscr_default(void)
 	if (cpu_has_feature(CPU_FTR_DSCR))
 		err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default);
 }
+
+void __init record_spr_defaults(void)
+{
+	int cpu;
+
+	if (cpu_has_feature(CPU_FTR_DSCR)) {
+		dscr_default = mfspr(SPRN_DSCR);
+		for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+			paca[cpu].dscr_default = dscr_default;
+	}
+}
 #endif /* CONFIG_PPC64 */
 
 #ifdef HAS_PPC_PMC_PA6T
-- 
cgit 


From e34917fbee1226144c94413697ddbf2d5b06d0d3 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:41 -0800
Subject: powerpc/vas: init missing fields from [rt]xattr

Initialize a few missing window context fields from the window attributes
specified by the caller. These fields are currently set to their default
values by the caller (NX-842), but would be good to apply them anyway.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 5aae845b8cd9..cec7ab7119df 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -679,10 +679,13 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
 
 	winctx->nx_win = rxattr->nx_win;
 	winctx->fault_win = rxattr->fault_win;
+	winctx->user_win = rxattr->user_win;
+	winctx->rej_no_credit = rxattr->rej_no_credit;
 	winctx->rx_word_mode = rxattr->rx_win_ord_mode;
 	winctx->tx_word_mode = rxattr->tx_win_ord_mode;
 	winctx->rx_wcred_mode = rxattr->rx_wcred_mode;
 	winctx->tx_wcred_mode = rxattr->tx_wcred_mode;
+	winctx->notify_early = rxattr->notify_early;
 
 	if (winctx->nx_win) {
 		winctx->data_stamp = true;
@@ -889,11 +892,14 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
 	winctx->user_win = txattr->user_win;
 	winctx->nx_win = txwin->rxwin->nx_win;
 	winctx->pin_win = txattr->pin_win;
+	winctx->rej_no_credit = txattr->rej_no_credit;
+	winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable;
 
 	winctx->rx_wcred_mode = txattr->rx_wcred_mode;
 	winctx->tx_wcred_mode = txattr->tx_wcred_mode;
 	winctx->rx_word_mode = txattr->rx_win_ord_mode;
 	winctx->tx_word_mode = txattr->tx_win_ord_mode;
+	winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count;
 
 	if (winctx->nx_win) {
 		winctx->data_stamp = true;
-- 
cgit 


From 51b537124fc24074aee67cae9ca94ec4d9c204fc Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:42 -0800
Subject: powerpc/vas: Validate window credits

NX-842, the only user of VAS, sets the window credits to default values
but VAS should check the credits against the possible max values.

The VAS_WCREDS_MIN is not needed and can be dropped.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c | 6 ++++++
 arch/powerpc/platforms/powernv/vas.h        | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index cec7ab7119df..a2fe120ac06d 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -738,6 +738,9 @@ static bool rx_win_args_valid(enum vas_cop_type cop,
 	if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
 		return false;
 
+	if (attr->wcreds_max > VAS_RX_WCREDS_MAX)
+		return false;
+
 	if (attr->nx_win) {
 		/* cannot be fault or user window if it is nx */
 		if (attr->fault_win || attr->user_win)
@@ -927,6 +930,9 @@ static bool tx_win_args_valid(enum vas_cop_type cop,
 	if (cop > VAS_COP_TYPE_MAX)
 		return false;
 
+	if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
+		return false;
+
 	if (attr->user_win &&
 			(cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
 		return false;
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 38dee5d50f31..fea0de44f076 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -106,8 +106,8 @@
  *
  * TODO: Needs tuning for per-process credits
  */
-#define VAS_WCREDS_MIN			16
-#define VAS_WCREDS_MAX			((64 << 10) - 1)
+#define VAS_RX_WCREDS_MAX		((64 << 10) - 1)
+#define VAS_TX_WCREDS_MAX		((4 << 10) - 1)
 #define VAS_WCREDS_DEFAULT		(1 << 10)
 
 /*
-- 
cgit 


From 0a2c2c24cf78473da785654361ec957f129f4820 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:43 -0800
Subject: powerpc/vas: Cleanup some debug code

Clean up vas.h and the debug code around ifdef vas_debug.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c |  8 +++--
 arch/powerpc/platforms/powernv/vas.h        | 54 ++++++-----------------------
 2 files changed, 17 insertions(+), 45 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index a2fe120ac06d..67ffc5d994cc 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -726,7 +726,10 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
 static bool rx_win_args_valid(enum vas_cop_type cop,
 			struct vas_rx_win_attr *attr)
 {
-	dump_rx_win_attr(attr);
+	pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n",
+			attr->fault_win, attr->notify_disable,
+			attr->intr_disable, attr->notify_early,
+			attr->rx_fifo_size);
 
 	if (cop >= VAS_COP_TYPE_MAX)
 		return false;
@@ -1050,7 +1053,8 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
 	else
 		rc = -EINVAL;
 
-	print_fifo_msg_count(txwin);
+	pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid,
+			read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
 
 	return rc;
 }
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index fea0de44f076..63e8e037eda0 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -258,6 +258,16 @@
 #define VAS_NX_UTIL_ADDER_OFFSET	0x180
 #define VAS_NX_UTIL_ADDER		PPC_BITMASK(32, 63)
 
+/*
+ * VREG(x):
+ * Expand a register's short name (eg: LPID) into two parameters:
+ *	- the register's short name in string form ("LPID"), and
+ *	- the name of the macro (eg: VAS_LPID_OFFSET), defining the
+ *	  register's offset in the window context
+ */
+#define VREG_SFX(n, s)	__stringify(n), VAS_##n##s
+#define VREG(r)		VREG_SFX(r, _OFFSET)
+
 /*
  * Local Notify Scope Control Register. (Receive windows only).
  */
@@ -385,43 +395,15 @@ struct vas_winctx {
 
 extern struct vas_instance *find_vas_instance(int vasid);
 
-/*
- * VREG(x):
- * Expand a register's short name (eg: LPID) into two parameters:
- *	- the register's short name in string form ("LPID"), and
- *	- the name of the macro (eg: VAS_LPID_OFFSET), defining the
- *	  register's offset in the window context
- */
-#define VREG_SFX(n, s)	__stringify(n), VAS_##n##s
-#define VREG(r)		VREG_SFX(r, _OFFSET)
-
-#ifdef vas_debug
-static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr)
-{
-	pr_err("fault %d, notify %d, intr %d early %d\n",
-			attr->fault_win, attr->notify_disable,
-			attr->intr_disable, attr->notify_early);
-
-	pr_err("rx_fifo_size %d, max value %d\n",
-				attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX);
-}
-
 static inline void vas_log_write(struct vas_window *win, char *name,
 			void *regptr, u64 val)
 {
 	if (val)
-		pr_err("%swin #%d: %s reg %p, val 0x%016llx\n",
+		pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n",
 				win->tx_win ? "Tx" : "Rx", win->winid, name,
 				regptr, val);
 }
 
-#else	/* vas_debug */
-
-#define vas_log_write(win, name, reg, val)
-#define dump_rx_win_attr(attr)
-
-#endif	/* vas_debug */
-
 static inline void write_uwc_reg(struct vas_window *win, char *name,
 			s32 reg, u64 val)
 {
@@ -450,18 +432,4 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
 	return in_be64(win->hvwc_map+reg);
 }
 
-#ifdef vas_debug
-
-static void print_fifo_msg_count(struct vas_window *txwin)
-{
-	uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o);
-	pr_devel("Winid %d, Msg count %llu\n", txwin->winid,
-			(uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
-}
-#else	/* vas_debug */
-
-#define print_fifo_msg_count(window)
-
-#endif	/* vas_debug */
-
 #endif /* _VAS_H */
-- 
cgit 


From 4963ac3632dda7433db5149d6abdfc644a8d8ab2 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:44 -0800
Subject: powerpc/vas: Drop poll_window_cast_out().

Polling for window cast out is listed in the spec, but turns out that
it is not strictly necessary and slows down window close. Making it a
stub for now.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c | 34 ++++++++++++++---------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 67ffc5d994cc..8ab8a8208347 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1079,25 +1079,25 @@ retry:
 	}
 }
 
+/*
+ * Have the hardware cast a window out of cache and wait for it to
+ * be completed.
+ *
+ * NOTE: It can take a relatively long time to cast the window context
+ *	out of the cache. It is not strictly necessary to cast out if:
+ *
+ *	- we clear the "Pin Window" bit (so hardware is free to evict)
+ *
+ *	- we re-initialize the window context when it is reassigned.
+ *
+ *	We do the former in vas_win_close() and latter in vas_win_open().
+ *	So, ignoring the cast-out for now. We can add it as needed. If
+ *	casting out becomes necessary we should consider offloading the
+ *	job to a worker thread, so the window close can proceed quickly.
+ */
 static void poll_window_castout(struct vas_window *window)
 {
-	int cached;
-	u64 val;
-
-	/* Cast window context out of the cache */
-retry:
-	val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL));
-	cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val);
-	if (cached) {
-		val = 0ULL;
-		val = SET_FIELD(VAS_CASTOUT_REQ, val, 1);
-		val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0);
-		write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val);
-
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(HZ);
-		goto retry;
-	}
+	/* stub for now */
 }
 
 /*
-- 
cgit 


From 36a288fe9dab9a6b0b50ffdb5c34f04c42cee2ac Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:45 -0800
Subject: powerpc/vas: Use helper to unpin/close window

Use a helper to have the hardware unpin and mark a window closed.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 8ab8a8208347..95622a984b05 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1100,6 +1100,20 @@ static void poll_window_castout(struct vas_window *window)
 	/* stub for now */
 }
 
+/*
+ * Unpin and close a window so no new requests are accepted and the
+ * hardware can evict this window from cache if necessary.
+ */
+static void unpin_close_window(struct vas_window *window)
+{
+	u64 val;
+
+	val = read_hvwc_reg(window, VREG(WINCTL));
+	val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
+	val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
+	write_hvwc_reg(window, VREG(WINCTL), val);
+}
+
 /*
  * Close a window.
  *
@@ -1114,8 +1128,6 @@ static void poll_window_castout(struct vas_window *window)
  */
 int vas_win_close(struct vas_window *window)
 {
-	u64 val;
-
 	if (!window)
 		return 0;
 
@@ -1131,11 +1143,7 @@ int vas_win_close(struct vas_window *window)
 
 	poll_window_busy_state(window);
 
-	/* Unpin window from cache and close it */
-	val = read_hvwc_reg(window, VREG(WINCTL));
-	val = SET_FIELD(VAS_WINCTL_PIN, val, 0);
-	val = SET_FIELD(VAS_WINCTL_OPEN, val, 0);
-	write_hvwc_reg(window, VREG(WINCTL), val);
+	unpin_close_window(window);
 
 	poll_window_castout(window);
 
-- 
cgit 


From dfe954e4456277effffb2c5add47fa25390f8cea Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:46 -0800
Subject: powerpc/vas: Reduce polling interval for busy state

A VAS window is normally in "busy" state for only a short duration.
Reduce the time we wait for the window to go to "not-busy" state to
speed-up vas_win_close() a bit.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 95622a984b05..1422cdd7d917 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1060,21 +1060,23 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
 }
 EXPORT_SYMBOL_GPL(vas_paste_crb);
 
+/*
+ * Wait for the window to go to "not-busy" state. It should only take a
+ * short time to queue a CRB, so window should not be busy for too long.
+ * Trying 5ms intervals.
+ */
 static void poll_window_busy_state(struct vas_window *window)
 {
 	int busy;
 	u64 val;
 
 retry:
-	/*
-	 * Poll Window Busy flag
-	 */
 	val = read_hvwc_reg(window, VREG(WIN_STATUS));
 	busy = GET_FIELD(VAS_WIN_BUSY, val);
 	if (busy) {
 		val = 0;
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(HZ);
+		schedule_timeout(msecs_to_jiffies(5));
 		goto retry;
 	}
 }
-- 
cgit 


From 62f659e08ccd657ead6901011f5e542dbdc477c5 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:47 -0800
Subject: powerpc/vas: Save configured window credits

Save the configured max window credits for a window in the vas_window
structure. We will need this when polling for return of window credits.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c | 6 ++++--
 arch/powerpc/platforms/powernv/vas.h        | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 1422cdd7d917..a59a187c0cd1 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -674,7 +674,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
 
 	winctx->rx_fifo = rxattr->rx_fifo;
 	winctx->rx_fifo_size = rxattr->rx_fifo_size;
-	winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+	winctx->wcreds_max = rxwin->wcreds_max;
 	winctx->pin_win = rxattr->pin_win;
 
 	winctx->nx_win = rxattr->nx_win;
@@ -844,6 +844,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
 	rxwin->nx_win = rxattr->nx_win;
 	rxwin->user_win = rxattr->user_win;
 	rxwin->cop = cop;
+	rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
 	if (rxattr->user_win)
 		rxwin->pid = task_pid_vnr(current);
 
@@ -893,7 +894,7 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
 	 */
 	memset(winctx, 0, sizeof(struct vas_winctx));
 
-	winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+	winctx->wcreds_max = txwin->wcreds_max;
 
 	winctx->user_win = txattr->user_win;
 	winctx->nx_win = txwin->rxwin->nx_win;
@@ -978,6 +979,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
 	txwin->nx_win = txwin->rxwin->nx_win;
 	txwin->pid = attr->pid;
 	txwin->user_win = attr->user_win;
+	txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
 
 	init_winctx_for_txwin(txwin, attr, &winctx);
 
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 63e8e037eda0..02d8a31d9051 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -332,6 +332,7 @@ struct vas_window {
 	void *hvwc_map;		/* HV window context */
 	void *uwc_map;		/* OS/User window context */
 	pid_t pid;		/* Linux process id of owner */
+	int wcreds_max;		/* Window credits */
 
 	/* Fields applicable only to send windows */
 	void *paste_kaddr;
-- 
cgit 


From 6fccac16c578c699bf0714a6c930b0ceb81305a0 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:48 -0800
Subject: powerpc/vas: poll for return of window credits

Normally, the NX driver waits for the CRBs to be processed before closing
the window. But it is better to ensure that the credits are returned before
the window gets reassigned later.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c | 45 +++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index a59a187c0cd1..23c13a7dcf89 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1062,6 +1062,49 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
 }
 EXPORT_SYMBOL_GPL(vas_paste_crb);
 
+/*
+ * If credit checking is enabled for this window, poll for the return
+ * of window credits (i.e for NX engines to process any outstanding CRBs).
+ * Since NX-842 waits for the CRBs to be processed before closing the
+ * window, we should not have to wait for too long.
+ *
+ * TODO: We retry in 10ms intervals now. We could/should probably peek at
+ *	the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending
+ *	CRBs on the FIFO and compute the delay dynamically on each retry.
+ *	But that is not really needed until we support NX-GZIP access from
+ *	user space. (NX-842 driver waits for CSB and Fast thread-wakeup
+ *	doesn't use credit checking).
+ */
+static void poll_window_credits(struct vas_window *window)
+{
+	u64 val;
+	int creds, mode;
+
+	val = read_hvwc_reg(window, VREG(WINCTL));
+	if (window->tx_win)
+		mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val);
+	else
+		mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val);
+
+	if (!mode)
+		return;
+retry:
+	if (window->tx_win) {
+		val = read_hvwc_reg(window, VREG(TX_WCRED));
+		creds = GET_FIELD(VAS_TX_WCRED, val);
+	} else {
+		val = read_hvwc_reg(window, VREG(LRX_WCRED));
+		creds = GET_FIELD(VAS_LRX_WCRED, val);
+	}
+
+	if (creds < window->wcreds_max) {
+		val = 0;
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(msecs_to_jiffies(10));
+		goto retry;
+	}
+}
+
 /*
  * Wait for the window to go to "not-busy" state. It should only take a
  * short time to queue a CRB, so window should not be busy for too long.
@@ -1149,6 +1192,8 @@ int vas_win_close(struct vas_window *window)
 
 	unpin_close_window(window);
 
+	poll_window_credits(window);
+
 	poll_window_castout(window);
 
 	/* if send window, drop reference to matching receive window */
-- 
cgit 


From ca03258b6b338b392c778bed9c7dd56e6a513012 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:49 -0800
Subject: powerpc/vas: Create cpu to vas id mapping

Create a cpu to vasid mapping so callers can specify -1 instead of
trying to find a VAS id.

Changelog[v2]
	[Michael Ellerman] Use per-cpu variables to simplify code.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index 565a4878fefa..abb7090a22b4 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -18,15 +18,18 @@
 #include <linux/of_platform.h>
 #include <linux/of_address.h>
 #include <linux/of.h>
+#include <asm/prom.h>
 
 #include "vas.h"
 
 static DEFINE_MUTEX(vas_mutex);
 static LIST_HEAD(vas_instances);
 
+static DEFINE_PER_CPU(int, cpu_vas_id);
+
 static int init_vas_instance(struct platform_device *pdev)
 {
-	int rc, vasid;
+	int rc, cpu, vasid;
 	struct resource *res;
 	struct vas_instance *vinst;
 	struct device_node *dn = pdev->dev.of_node;
@@ -74,6 +77,11 @@ static int init_vas_instance(struct platform_device *pdev)
 			"paste_win_id_shift 0x%llx\n", pdev->name, vasid,
 			vinst->paste_base_addr, vinst->paste_win_id_shift);
 
+	for_each_possible_cpu(cpu) {
+		if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
+			per_cpu(cpu_vas_id, cpu) = vasid;
+	}
+
 	mutex_lock(&vas_mutex);
 	list_add(&vinst->node, &vas_instances);
 	mutex_unlock(&vas_mutex);
@@ -98,6 +106,10 @@ struct vas_instance *find_vas_instance(int vasid)
 	struct vas_instance *vinst;
 
 	mutex_lock(&vas_mutex);
+
+	if (vasid == -1)
+		vasid = per_cpu(cpu_vas_id, smp_processor_id());
+
 	list_for_each(ent, &vas_instances) {
 		vinst = list_entry(ent, struct vas_instance, node);
 		if (vinst->vas_id == vasid) {
-- 
cgit 


From d4ef61b5e8955fb913e2e1a6c1533414859a839d Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:50 -0800
Subject: powerpc/vas, nx-842: Define and use chip_to_vas_id()

Define a helper, chip_to_vas_id() to map a given chip id to corresponding
vas id.

Normally, callers of vas_rx_win_open() and vas_tx_win_open() want the VAS
window to be on the same chip where the calling thread is executing. These
callers can pass in -1 for the VAS id.

This interface will be useful if a thread running on one chip wants to open
a window on another chip (like the NX-842 driver does during start up).

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/vas.h       |  9 +++++++++
 arch/powerpc/platforms/powernv/vas.c | 11 +++++++++++
 2 files changed, 20 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index fd5963acd658..044748f547f5 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -103,6 +103,15 @@ struct vas_tx_win_attr {
 	bool rx_win_ord_mode;
 };
 
+/*
+ * Helper to map a chip id to VAS id.
+ * For POWER9, this is a 1:1 mapping. In the future this maybe a 1:N
+ * mapping in which case, we will need to update this helper.
+ *
+ * Return the VAS id or -1 if no matching vasid is found.
+ */
+int chip_to_vas_id(int chipid);
+
 /*
  * Helper to initialize receive window attributes to defaults for an
  * NX window.
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index abb7090a22b4..cd9a733d05e2 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -123,6 +123,17 @@ struct vas_instance *find_vas_instance(int vasid)
 	return NULL;
 }
 
+int chip_to_vas_id(int chipid)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu_to_chip_id(cpu) == chipid)
+			return per_cpu(cpu_vas_id, cpu);
+	}
+	return -1;
+}
+
 static int vas_probe(struct platform_device *pdev)
 {
 	return init_vas_instance(pdev);
-- 
cgit 


From ece4e51291485bb4a71ff554964948b02ab89823 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:51 -0800
Subject: powerpc/vas: Export HVWC to debugfs

Export the VAS Window context information to debugfs.

We need to hold a mutex when closing the window to prevent a race
with the debugfs read(). Rather than introduce a per-instance mutex,
we use the global vas_mutex for now, since it is not heavily contended.

The window->cop field is only relevant to a receive window so we were
not setting it for a send window (which is is paired to a receive window
anyway). But to simplify reporting in debugfs, set the 'cop' field for the
send window also.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/Makefile     |   3 +-
 arch/powerpc/platforms/powernv/vas-debug.c  | 209 ++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/vas-window.c |  34 ++++-
 arch/powerpc/platforms/powernv/vas.c        |   6 +-
 arch/powerpc/platforms/powernv/vas.h        |  14 ++
 5 files changed, 257 insertions(+), 9 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/vas-debug.c

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 37d60f7dd86d..17921c45d10b 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -14,4 +14,5 @@ obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
 obj-$(CONFIG_OPAL_PRD)	+= opal-prd.o
 obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE)	+= memtrace.o
-obj-$(CONFIG_PPC_VAS)	+= vas.o vas-window.o
+obj-$(CONFIG_PPC_VAS)	+= vas.o vas-window.o vas-debug.o
+obj-$(CONFIG_PPC_FTW)	+= nx-ftw.o
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c
new file mode 100644
index 000000000000..ca22f1eae050
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2016-17 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include "vas.h"
+
+static struct dentry *vas_debugfs;
+
+static char *cop_to_str(int cop)
+{
+	switch (cop) {
+	case VAS_COP_TYPE_FAULT:	return "Fault";
+	case VAS_COP_TYPE_842:		return "NX-842 Normal Priority";
+	case VAS_COP_TYPE_842_HIPRI:	return "NX-842 High Priority";
+	case VAS_COP_TYPE_GZIP:		return "NX-GZIP Normal Priority";
+	case VAS_COP_TYPE_GZIP_HIPRI:	return "NX-GZIP High Priority";
+	case VAS_COP_TYPE_FTW:		return "Fast Thread-wakeup";
+	default:			return "Unknown";
+	}
+}
+
+static int info_dbg_show(struct seq_file *s, void *private)
+{
+	struct vas_window *window = s->private;
+
+	mutex_lock(&vas_mutex);
+
+	/* ensure window is not unmapped */
+	if (!window->hvwc_map)
+		goto unlock;
+
+	seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
+					window->tx_win ? "Send" : "Receive");
+	seq_printf(s, "Pid : %d\n", window->pid);
+
+unlock:
+	mutex_unlock(&vas_mutex);
+	return 0;
+}
+
+static int info_dbg_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, info_dbg_show, inode->i_private);
+}
+
+static const struct file_operations info_fops = {
+	.open		= info_dbg_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static inline void print_reg(struct seq_file *s, struct vas_window *win,
+			char *name, u32 reg)
+{
+	seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name);
+}
+
+static int hvwc_dbg_show(struct seq_file *s, void *private)
+{
+	struct vas_window *window = s->private;
+
+	mutex_lock(&vas_mutex);
+
+	/* ensure window is not unmapped */
+	if (!window->hvwc_map)
+		goto unlock;
+
+	print_reg(s, window, VREG(LPID));
+	print_reg(s, window, VREG(PID));
+	print_reg(s, window, VREG(XLATE_MSR));
+	print_reg(s, window, VREG(XLATE_LPCR));
+	print_reg(s, window, VREG(XLATE_CTL));
+	print_reg(s, window, VREG(AMR));
+	print_reg(s, window, VREG(SEIDR));
+	print_reg(s, window, VREG(FAULT_TX_WIN));
+	print_reg(s, window, VREG(OSU_INTR_SRC_RA));
+	print_reg(s, window, VREG(HV_INTR_SRC_RA));
+	print_reg(s, window, VREG(PSWID));
+	print_reg(s, window, VREG(LFIFO_BAR));
+	print_reg(s, window, VREG(LDATA_STAMP_CTL));
+	print_reg(s, window, VREG(LDMA_CACHE_CTL));
+	print_reg(s, window, VREG(LRFIFO_PUSH));
+	print_reg(s, window, VREG(CURR_MSG_COUNT));
+	print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT));
+	print_reg(s, window, VREG(LRX_WCRED));
+	print_reg(s, window, VREG(LRX_WCRED_ADDER));
+	print_reg(s, window, VREG(TX_WCRED));
+	print_reg(s, window, VREG(TX_WCRED_ADDER));
+	print_reg(s, window, VREG(LFIFO_SIZE));
+	print_reg(s, window, VREG(WINCTL));
+	print_reg(s, window, VREG(WIN_STATUS));
+	print_reg(s, window, VREG(WIN_CTX_CACHING_CTL));
+	print_reg(s, window, VREG(TX_RSVD_BUF_COUNT));
+	print_reg(s, window, VREG(LRFIFO_WIN_PTR));
+	print_reg(s, window, VREG(LNOTIFY_CTL));
+	print_reg(s, window, VREG(LNOTIFY_PID));
+	print_reg(s, window, VREG(LNOTIFY_LPID));
+	print_reg(s, window, VREG(LNOTIFY_TID));
+	print_reg(s, window, VREG(LNOTIFY_SCOPE));
+	print_reg(s, window, VREG(NX_UTIL_ADDER));
+unlock:
+	mutex_unlock(&vas_mutex);
+	return 0;
+}
+
+static int hvwc_dbg_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, hvwc_dbg_show, inode->i_private);
+}
+
+static const struct file_operations hvwc_fops = {
+	.open		= hvwc_dbg_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+void vas_window_free_dbgdir(struct vas_window *window)
+{
+	if (window->dbgdir) {
+		debugfs_remove_recursive(window->dbgdir);
+		kfree(window->dbgname);
+		window->dbgdir = NULL;
+		window->dbgname = NULL;
+	}
+}
+
+void vas_window_init_dbgdir(struct vas_window *window)
+{
+	struct dentry *f, *d;
+
+	if (!window->vinst->dbgdir)
+		return;
+
+	window->dbgname = kzalloc(16, GFP_KERNEL);
+	if (!window->dbgname)
+		return;
+
+	snprintf(window->dbgname, 16, "w%d", window->winid);
+
+	d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir);
+	if (IS_ERR(d))
+		goto free_name;
+
+	window->dbgdir = d;
+
+	f = debugfs_create_file("info", 0444, d, window, &info_fops);
+	if (IS_ERR(f))
+		goto remove_dir;
+
+	f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops);
+	if (IS_ERR(f))
+		goto remove_dir;
+
+	return;
+
+free_name:
+	kfree(window->dbgname);
+	window->dbgname = NULL;
+
+remove_dir:
+	debugfs_remove_recursive(window->dbgdir);
+	window->dbgdir = NULL;
+}
+
+void vas_instance_init_dbgdir(struct vas_instance *vinst)
+{
+	struct dentry *d;
+
+	if (!vas_debugfs)
+		return;
+
+	vinst->dbgname = kzalloc(16, GFP_KERNEL);
+	if (!vinst->dbgname)
+		return;
+
+	snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id);
+
+	d = debugfs_create_dir(vinst->dbgname, vas_debugfs);
+	if (IS_ERR(d))
+		goto free_name;
+
+	vinst->dbgdir = d;
+	return;
+
+free_name:
+	kfree(vinst->dbgname);
+	vinst->dbgname = NULL;
+	vinst->dbgdir = NULL;
+}
+
+void vas_init_dbgdir(void)
+{
+	vas_debugfs = debugfs_create_dir("vas", NULL);
+	if (IS_ERR(vas_debugfs))
+		vas_debugfs = NULL;
+}
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 23c13a7dcf89..c030d4cf982e 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -145,23 +145,37 @@ static void unmap_paste_region(struct vas_window *window)
 }
 
 /*
- * Unmap the MMIO regions for a window.
+ * Unmap the MMIO regions for a window. Hold the vas_mutex so we don't
+ * unmap when the window's debugfs dir is in use. This serializes close
+ * of a window even on another VAS instance but since its not a critical
+ * path, just minimize the time we hold the mutex for now. We can add
+ * a per-instance mutex later if necessary.
  */
 static void unmap_winctx_mmio_bars(struct vas_window *window)
 {
 	int len;
+	void *uwc_map;
+	void *hvwc_map;
 	u64 busaddr_start;
 
-	if (window->hvwc_map) {
+	mutex_lock(&vas_mutex);
+
+	hvwc_map = window->hvwc_map;
+	window->hvwc_map = NULL;
+
+	uwc_map = window->uwc_map;
+	window->uwc_map = NULL;
+
+	mutex_unlock(&vas_mutex);
+
+	if (hvwc_map) {
 		get_hvwc_mmio_bar(window, &busaddr_start, &len);
-		unmap_region(window->hvwc_map, busaddr_start, len);
-		window->hvwc_map = NULL;
+		unmap_region(hvwc_map, busaddr_start, len);
 	}
 
-	if (window->uwc_map) {
+	if (uwc_map) {
 		get_uwc_mmio_bar(window, &busaddr_start, &len);
-		unmap_region(window->uwc_map, busaddr_start, len);
-		window->uwc_map = NULL;
+		unmap_region(uwc_map, busaddr_start, len);
 	}
 }
 
@@ -528,6 +542,9 @@ static void vas_window_free(struct vas_window *window)
 	struct vas_instance *vinst = window->vinst;
 
 	unmap_winctx_mmio_bars(window);
+
+	vas_window_free_dbgdir(window);
+
 	kfree(window);
 
 	vas_release_window_id(&vinst->ida, winid);
@@ -552,6 +569,8 @@ static struct vas_window *vas_window_alloc(struct vas_instance *vinst)
 	if (map_winctx_mmio_bars(window))
 		goto out_free;
 
+	vas_window_init_dbgdir(window);
+
 	return window;
 
 out_free:
@@ -974,6 +993,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
 		goto put_rxwin;
 	}
 
+	txwin->cop = cop;
 	txwin->tx_win = 1;
 	txwin->rxwin = rxwin;
 	txwin->nx_win = txwin->rxwin->nx_win;
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index cd9a733d05e2..c488621dbec3 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -22,7 +22,7 @@
 
 #include "vas.h"
 
-static DEFINE_MUTEX(vas_mutex);
+DEFINE_MUTEX(vas_mutex);
 static LIST_HEAD(vas_instances);
 
 static DEFINE_PER_CPU(int, cpu_vas_id);
@@ -86,6 +86,8 @@ static int init_vas_instance(struct platform_device *pdev)
 	list_add(&vinst->node, &vas_instances);
 	mutex_unlock(&vas_mutex);
 
+	vas_instance_init_dbgdir(vinst);
+
 	dev_set_drvdata(&pdev->dev, vinst);
 
 	return 0;
@@ -157,6 +159,8 @@ static int __init vas_init(void)
 	int found = 0;
 	struct device_node *dn;
 
+	vas_init_dbgdir();
+
 	platform_driver_register(&vas_driver);
 
 	for_each_compatible_node(dn, NULL, "ibm,vas") {
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 02d8a31d9051..756cbc5335bc 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -13,6 +13,8 @@
 #include <linux/idr.h>
 #include <asm/vas.h>
 #include <linux/io.h>
+#include <linux/dcache.h>
+#include <linux/mutex.h>
 
 /*
  * Overview of Virtual Accelerator Switchboard (VAS).
@@ -317,6 +319,9 @@ struct vas_instance {
 	struct mutex mutex;
 	struct vas_window *rxwin[VAS_COP_TYPE_MAX];
 	struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
+
+	char *dbgname;
+	struct dentry *dbgdir;
 };
 
 /*
@@ -334,6 +339,9 @@ struct vas_window {
 	pid_t pid;		/* Linux process id of owner */
 	int wcreds_max;		/* Window credits */
 
+	char *dbgname;
+	struct dentry *dbgdir;
+
 	/* Fields applicable only to send windows */
 	void *paste_kaddr;
 	char *paste_addr_name;
@@ -394,7 +402,13 @@ struct vas_winctx {
 	enum vas_notify_after_count notify_after_count;
 };
 
+extern struct mutex vas_mutex;
+
 extern struct vas_instance *find_vas_instance(int vasid);
+extern void vas_init_dbgdir(void);
+extern void vas_instance_init_dbgdir(struct vas_instance *vinst);
+extern void vas_window_init_dbgdir(struct vas_window *win);
+extern void vas_window_free_dbgdir(struct vas_window *win);
 
 static inline void vas_log_write(struct vas_window *win, char *name,
 			void *regptr, u64 val)
-- 
cgit 


From ec233ede4c8654894610ea54f4dae7adc954ac62 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:53 -0800
Subject: powerpc: Add support for setting SPRN_TIDR

We need the SPRN_TIDR to be set for use with fast thread-wakeup (core-
to-core wakeup) and also with CAPI.

Each thread in a process needs to have a unique id within the process.
But for now, we assign globally unique thread ids to all threads in
the system.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Philippe Bergheaud <felix@linux.vnet.ibm.com>
Signed-off-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
[mpe: Simplify tidr clearing on fork() and ctx switch code]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/processor.h |   1 +
 arch/powerpc/include/asm/switch_to.h |   3 +
 arch/powerpc/kernel/process.c        | 116 +++++++++++++++++++++++++++++++++++
 3 files changed, 120 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index fab7ff877304..58cc21274423 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -329,6 +329,7 @@ struct thread_struct {
 	 */
 	int		dscr_inherit;
 	unsigned long	ppr;	/* used to save/restore SMT priority */
+	unsigned long	tidr;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
 	unsigned long	tar;
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 17c8380673a6..f5da32f43fed 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -91,4 +91,7 @@ static inline void clear_task_ebb(struct task_struct *t)
 #endif
 }
 
+extern int set_thread_tidr(struct task_struct *t);
+extern void clear_thread_tidr(struct task_struct *t);
+
 #endif /* _ASM_POWERPC_SWITCH_TO_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 4083b737decb..887ae5047288 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1136,6 +1136,10 @@ static inline void restore_sprs(struct thread_struct *old_thread,
 		if (old_thread->tar != new_thread->tar)
 			mtspr(SPRN_TAR, new_thread->tar);
 	}
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+	    old_thread->tidr != new_thread->tidr)
+		mtspr(SPRN_TIDR, new_thread->tidr);
 #endif
 }
 
@@ -1451,6 +1455,116 @@ void flush_thread(void)
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 }
 
+#ifdef CONFIG_PPC64
+static DEFINE_SPINLOCK(vas_thread_id_lock);
+static DEFINE_IDA(vas_thread_ida);
+
+/*
+ * We need to assign a unique thread id to each thread in a process.
+ *
+ * This thread id, referred to as TIDR, and separate from the Linux's tgid,
+ * is intended to be used to direct an ASB_Notify from the hardware to the
+ * thread, when a suitable event occurs in the system.
+ *
+ * One such event is a "paste" instruction in the context of Fast Thread
+ * Wakeup (aka Core-to-core wake up in the Virtual Accelerator Switchboard
+ * (VAS) in POWER9.
+ *
+ * To get a unique TIDR per process we could simply reuse task_pid_nr() but
+ * the problem is that task_pid_nr() is not yet available copy_thread() is
+ * called. Fixing that would require changing more intrusive arch-neutral
+ * code in code path in copy_process()?.
+ *
+ * Further, to assign unique TIDRs within each process, we need an atomic
+ * field (or an IDR) in task_struct, which again intrudes into the arch-
+ * neutral code. So try to assign globally unique TIDRs for now.
+ *
+ * NOTE: TIDR 0 indicates that the thread does not need a TIDR value.
+ *	 For now, only threads that expect to be notified by the VAS
+ *	 hardware need a TIDR value and we assign values > 0 for those.
+ */
+#define MAX_THREAD_CONTEXT	((1 << 16) - 1)
+static int assign_thread_tidr(void)
+{
+	int index;
+	int err;
+
+again:
+	if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL))
+		return -ENOMEM;
+
+	spin_lock(&vas_thread_id_lock);
+	err = ida_get_new_above(&vas_thread_ida, 1, &index);
+	spin_unlock(&vas_thread_id_lock);
+
+	if (err == -EAGAIN)
+		goto again;
+	else if (err)
+		return err;
+
+	if (index > MAX_THREAD_CONTEXT) {
+		spin_lock(&vas_thread_id_lock);
+		ida_remove(&vas_thread_ida, index);
+		spin_unlock(&vas_thread_id_lock);
+		return -ENOMEM;
+	}
+
+	return index;
+}
+
+static void free_thread_tidr(int id)
+{
+	spin_lock(&vas_thread_id_lock);
+	ida_remove(&vas_thread_ida, id);
+	spin_unlock(&vas_thread_id_lock);
+}
+
+/*
+ * Clear any TIDR value assigned to this thread.
+ */
+void clear_thread_tidr(struct task_struct *t)
+{
+	if (!t->thread.tidr)
+		return;
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	mtspr(SPRN_TIDR, 0);
+	free_thread_tidr(t->thread.tidr);
+	t->thread.tidr = 0;
+}
+
+void arch_release_task_struct(struct task_struct *t)
+{
+	clear_thread_tidr(t);
+}
+
+/*
+ * Assign a unique TIDR (thread id) for task @t and set it in the thread
+ * structure. For now, we only support setting TIDR for 'current' task.
+ */
+int set_thread_tidr(struct task_struct *t)
+{
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return -EINVAL;
+
+	if (t != current)
+		return -EINVAL;
+
+	t->thread.tidr = assign_thread_tidr();
+	if (t->thread.tidr < 0)
+		return t->thread.tidr;
+
+	mtspr(SPRN_TIDR, t->thread.tidr);
+
+	return 0;
+}
+
+#endif /* CONFIG_PPC64 */
+
 void
 release_thread(struct task_struct *t)
 {
@@ -1597,6 +1711,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 	}
 	if (cpu_has_feature(CPU_FTR_HAS_PPR))
 		p->thread.ppr = INIT_PPR;
+
+	p->thread.tidr = 0;
 #endif
 	kregs->nip = ppc_function_entry(f);
 	return 0;
-- 
cgit 


From 9d2a4d71332cfdf4ea90754ad9b2f05a5ee5f6c7 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:54 -0800
Subject: powerpc: Define set_thread_uses_vas()

A CP_ABORT instruction is required in processes that have mapped a VAS
"paste address" with the intention of using COPY/PASTE instructions.
But since CP_ABORT is expensive, we want to restrict it to only
processes that use/intend to use COPY/PASTE.

Define an interface, set_thread_uses_vas(), that VAS can use to
indicate that the current process opened a send window. During context
switch, issue CP_ABORT only for processes that have the flag set.

Thanks for input from Nick Piggin, Michael Ellerman.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
[mpe: Fix to not use new_thread after _switch() returns]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/processor.h |  2 ++
 arch/powerpc/include/asm/switch_to.h |  2 ++
 arch/powerpc/kernel/process.c        | 41 +++++++++++++++++++++++++++---------
 3 files changed, 35 insertions(+), 10 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 58cc21274423..bdab3b74eb98 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -341,7 +341,9 @@ struct thread_struct {
 	unsigned long	sier;
 	unsigned long	mmcr2;
 	unsigned 	mmcr0;
+
 	unsigned 	used_ebb;
+	unsigned int	used_vas;
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index f5da32f43fed..07ff06e9dae1 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -91,6 +91,8 @@ static inline void clear_task_ebb(struct task_struct *t)
 #endif
 }
 
+extern int set_thread_uses_vas(void);
+
 extern int set_thread_tidr(struct task_struct *t);
 extern void clear_thread_tidr(struct task_struct *t);
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 887ae5047288..bfdd783e3916 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1244,17 +1244,17 @@ struct task_struct *__switch_to(struct task_struct *prev,
 		 * The copy-paste buffer can only store into foreign real
 		 * addresses, so unprivileged processes can not see the
 		 * data or use it in any way unless they have foreign real
-		 * mappings. We don't have a VAS driver that allocates those
-		 * yet, so no cpabort is required.
+		 * mappings. If the new process has the foreign real address
+		 * mappings, we must issue a cp_abort to clear any state and
+		 * prevent snooping, corruption or a covert channel.
+		 *
+		 * DD1 allows paste into normal system memory so we do an
+		 * unpaired copy, rather than cp_abort, to clear the buffer,
+		 * since cp_abort is quite expensive.
 		 */
-		if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
-			/*
-			 * DD1 allows paste into normal system memory, so we
-			 * do an unpaired copy here to clear the buffer and
-			 * prevent a covert channel being set up.
-			 *
-			 * cpabort is not used because it is quite expensive.
-			 */
+		if (current_thread_info()->task->thread.used_vas) {
+			asm volatile(PPC_CP_ABORT);
+		} else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
 			asm volatile(PPC_COPY(%0, %1)
 					: : "r"(dummy_copy_buffer), "r"(0));
 		}
@@ -1455,6 +1455,27 @@ void flush_thread(void)
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 }
 
+int set_thread_uses_vas(void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		return -EINVAL;
+
+	current->thread.used_vas = 1;
+
+	/*
+	 * Even a process that has no foreign real address mapping can use
+	 * an unpaired COPY instruction (to no real effect). Issue CP_ABORT
+	 * to clear any pending COPY and prevent a covert channel.
+	 *
+	 * __switch_to() will issue CP_ABORT on future context switches.
+	 */
+	asm volatile(PPC_CP_ABORT);
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+	return 0;
+}
+
 #ifdef CONFIG_PPC64
 static DEFINE_SPINLOCK(vas_thread_id_lock);
 static DEFINE_IDA(vas_thread_ida);
-- 
cgit 


From 5676be2fb7035ac32da3a96241611e7eddff6157 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:56 -0800
Subject: powerpc/vas: Define vas_win_paste_addr()

Define an interface that the NX drivers can use to find the physical
paste address of a send window. This interface is expected to be used
with the mmap() operation of the NX driver's device. i.e the user space
process can use driver's mmap() operation to map the send window's paste
address into their address space and then use copy and paste instructions
to submit the CRBs to the NX engine.

Note that kernel drivers will use vas_paste_crb() directly and don't need
this interface.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/vas.h              |  7 +++++++
 arch/powerpc/platforms/powernv/vas-window.c | 10 ++++++++++
 2 files changed, 17 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 044748f547f5..f98ade8a2603 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -10,6 +10,8 @@
 #ifndef _ASM_POWERPC_VAS_H
 #define _ASM_POWERPC_VAS_H
 
+struct vas_window;
+
 /*
  * Min and max FIFO sizes are based on Version 1.05 Section 3.1.4.25
  * (Local FIFO Size Register) of the VAS workbook.
@@ -165,4 +167,9 @@ int vas_copy_crb(void *crb, int offset);
  */
 int vas_paste_crb(struct vas_window *win, int offset, bool re);
 
+/*
+ * Return the power bus paste address associated with @win so the caller
+ * can map that address into their address space.
+ */
+extern u64 vas_win_paste_addr(struct vas_window *win);
 #endif /* __ASM_POWERPC_VAS_H */
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index c030d4cf982e..d7d06533a1e9 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -40,6 +40,16 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len
 	pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
 }
 
+u64 vas_win_paste_addr(struct vas_window *win)
+{
+	u64 addr;
+
+	compute_paste_address(win, &addr, NULL);
+
+	return addr;
+}
+EXPORT_SYMBOL(vas_win_paste_addr);
+
 static inline void get_hvwc_mmio_bar(struct vas_window *window,
 			u64 *start, int *len)
 {
-- 
cgit 


From 61f3cca8cda979646c24accd9dbf3e2de7ea6ceb Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:57 -0800
Subject: powerpc/vas: Define vas_win_id()

Define an interface to return a system-wide unique id for a given VAS
window.

The vas_win_id() will be used in a follow-on patch to generate an unique
handle for a user space receive window. Applications can use this handle
to pair send and receive windows for fast thread-wakeup.

The hardware refers to this system-wide unique id as a Partition Send
Window ID which is expected to be used during fault handling. Hence the
"pswid" in the function names.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/vas.h              |  5 +++++
 arch/powerpc/platforms/powernv/vas-window.c |  9 +++++++++
 arch/powerpc/platforms/powernv/vas.h        | 28 ++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index f98ade8a2603..771456227496 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -167,6 +167,11 @@ int vas_copy_crb(void *crb, int offset);
  */
 int vas_paste_crb(struct vas_window *win, int offset, bool re);
 
+/*
+ * Return a system-wide unique id for the VAS window @win.
+ */
+extern u32 vas_win_id(struct vas_window *win);
+
 /*
  * Return the power bus paste address associated with @win so the caller
  * can map that address into their address space.
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index d7d06533a1e9..82754924c2bc 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1235,3 +1235,12 @@ int vas_win_close(struct vas_window *window)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(vas_win_close);
+
+/*
+ * Return a system-wide unique window id for the window @win.
+ */
+u32 vas_win_id(struct vas_window *win)
+{
+	return encode_pswid(win->vinst->vas_id, win->winid);
+}
+EXPORT_SYMBOL_GPL(vas_win_id);
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 756cbc5335bc..ae0100fd35bb 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -447,4 +447,32 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
 	return in_be64(win->hvwc_map+reg);
 }
 
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ *	Bits	Usage
+ *	0:7	VAS id (8 bits)
+ *	8:15	Unused, 0 (3 bits)
+ *	16:31	Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
+{
+	u32 pswid = 0;
+
+	pswid |= vasid << (31 - 7);
+	pswid |= winid;
+
+	return pswid;
+}
+
+static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
+{
+	if (vasid)
+		*vasid = pswid >> (31 - 7) & 0xFF;
+
+	if (winid)
+		*winid = pswid & 0xFFFF;
+}
 #endif /* _VAS_H */
-- 
cgit 


From 6c8e6bb2a52d5e7ae5bbde21c21f6d5dfd6e9ae8 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Tue, 7 Nov 2017 18:23:58 -0800
Subject: powerpc/vas: Add support for user receive window

Add support for user space receive window (for the Fast thread-wakeup
coprocessor type)

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/vas-window.c | 56 +++++++++++++++++++++++++----
 1 file changed, 49 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 82754924c2bc..2b3eb01ab110 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -16,7 +16,8 @@
 #include <linux/log2.h>
 #include <linux/rcupdate.h>
 #include <linux/cred.h>
-
+#include <asm/switch_to.h>
+#include <asm/ppc-opcode.h>
 #include "vas.h"
 #include "copy-paste.h"
 
@@ -597,6 +598,32 @@ static void put_rx_win(struct vas_window *rxwin)
 	atomic_dec(&rxwin->num_txwins);
 }
 
+/*
+ * Find the user space receive window given the @pswid.
+ *      - We must have a valid vasid and it must belong to this instance.
+ *        (so both send and receive windows are on the same VAS instance)
+ *      - The window must refer to an OPEN, FTW, RECEIVE window.
+ *
+ * NOTE: We access ->windows[] table and assume that vinst->mutex is held.
+ */
+static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
+{
+	int vasid, winid;
+	struct vas_window *rxwin;
+
+	decode_pswid(pswid, &vasid, &winid);
+
+	if (vinst->vas_id != vasid)
+		return ERR_PTR(-EINVAL);
+
+	rxwin = vinst->windows[winid];
+
+	if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW)
+		return ERR_PTR(-EINVAL);
+
+	return rxwin;
+}
+
 /*
  * Get the VAS receive window associated with NX engine identified
  * by @cop and if applicable, @pswid.
@@ -610,10 +637,10 @@ static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
 
 	mutex_lock(&vinst->mutex);
 
-	if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI)
-		rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
+	if (cop == VAS_COP_TYPE_FTW)
+		rxwin = get_user_rxwin(vinst, pswid);
 	else
-		rxwin = ERR_PTR(-EINVAL);
+		rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL);
 
 	if (!IS_ERR(rxwin))
 		atomic_inc(&rxwin->num_txwins);
@@ -937,10 +964,9 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
 	winctx->tx_word_mode = txattr->tx_win_ord_mode;
 	winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count;
 
-	if (winctx->nx_win) {
+	winctx->intr_disable = true;
+	if (winctx->nx_win)
 		winctx->data_stamp = true;
-		winctx->intr_disable = true;
-	}
 
 	winctx->lpid = txattr->lpid;
 	winctx->pidr = txattr->pidr;
@@ -985,6 +1011,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
 	if (!tx_win_args_valid(cop, attr))
 		return ERR_PTR(-EINVAL);
 
+	/*
+	 * If caller did not specify a vasid but specified the PSWID of a
+	 * receive window (applicable only to FTW windows), use the vasid
+	 * from that receive window.
+	 */
+	if (vasid == -1 && attr->pswid)
+		decode_pswid(attr->pswid, &vasid, NULL);
+
 	vinst = find_vas_instance(vasid);
 	if (!vinst) {
 		pr_devel("vasid %d not found!\n", vasid);
@@ -1031,6 +1065,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
 		}
 	}
 
+	/*
+	 * Now that we have a send window, ensure context switch issues
+	 * CP_ABORT for this thread.
+	 */
+	rc = -EINVAL;
+	if (set_thread_uses_vas() < 0)
+		goto free_window;
+
 	set_vinst_win(vinst, txwin);
 
 	return txwin;
-- 
cgit 


From f79ad50ea3c73fb1ea5b09e95c864e5bb263adfb Mon Sep 17 00:00:00 2001
From: Balbir Singh <bsingharora@gmail.com>
Date: Mon, 16 Oct 2017 16:21:35 +1100
Subject: powerpc/mm/radix: Fix crashes on Power9 DD1 with radix MMU and
 STRICT_RWX

When using the radix MMU on Power9 DD1, to work around a hardware
problem, radix__pte_update() is required to do a two stage update of
the PTE. First we write a zero value into the PTE, then we flush the
TLB, and then we write the new PTE value.

In the normal case that works OK, but it does not work if we're
updating the PTE that maps the code we're executing, because the
mapping is removed by the TLB flush and we can no longer execute from
it. Unfortunately the STRICT_RWX code needs to do exactly that.

The exact symptoms when we hit this case vary, sometimes we print an
oops and then get stuck after that, but I've also seen a machine just
get stuck continually page faulting with no oops printed. The variance
is presumably due to the exact layout of the text and the page size
used for the mappings. In all cases we are unable to boot to a shell.

There are possible solutions such as creating a second mapping of the
TLB flush code, executing from that, and then jumping back to the
original. However we don't want to add that level of complexity for a
DD1 work around.

So just detect that we're running on Power9 DD1 and refrain from
changing the permissions, effectively disabling STRICT_RWX on Power9
DD1.

Fixes: 7614ff3272a1 ("powerpc/mm/radix: Implement STRICT_RWX/mark_rodata_ro() for Radix")
Cc: stable@vger.kernel.org # v4.13+
Reported-by: Andrew Jeffery <andrew@aj.id.au>
[Changelog as suggested by Michael Ellerman <mpe@ellerman.id.au>]
Signed-off-by: Balbir Singh <bsingharora@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/pgtable-radix.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 39c252b54d16..cfbbee941a76 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -169,6 +169,16 @@ void radix__mark_rodata_ro(void)
 {
 	unsigned long start, end;
 
+	/*
+	 * mark_rodata_ro() will mark itself as !writable at some point.
+	 * Due to DD1 workaround in radix__pte_update(), we'll end up with
+	 * an invalid pte and the system will crash quite severly.
+	 */
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+		pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n");
+		return;
+	}
+
 	start = (unsigned long)_stext;
 	end = (unsigned long)__init_begin;
 
-- 
cgit 


From fc2a5a6161a26b386eb0936b74a852ff83793fb6 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 2 Nov 2017 17:49:36 +1100
Subject: powerpc/64s: ppc_save_regs is now needed for all 64s builds

Commit 78adf6c214f0 ("powerpc/64s: Implement system reset idle wakeup
reason"), added a call to ppc_save_regs() in the book3s code.

ppc_save_regs() is only built if XMON and/or KEXEC_CORE are enabled,
which is usually the case, however if they're not enabled then the
build breaks.

Fix it by making the Makefile check also build ppc_save_regs.o if
CONFIG_PPC_BOOK3S is enabled.

Fixes: 78adf6c214f0 ("powerpc/64s: Implement system reset idle wakeup reason")
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
[mpe: Write change log]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 91960f83039c..34b6e729f38c 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -128,7 +128,7 @@ obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM)	+= tm.o
 obj-$(CONFIG_PPC64)		+= $(obj64-y)
 obj-$(CONFIG_PPC32)		+= $(obj32-y)
 
-ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE),)
+ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE)(CONFIG_PPC_BOOK3S),)
 obj-y				+= ppc_save_regs.o
 endif
 
-- 
cgit 


From 8a2d71a3f2737e2448aa68de2b6052cb570d3d2a Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Mon, 23 Oct 2017 22:07:38 +0530
Subject: powerpc/kprobes: Disable preemption before invoking probe handler for
 optprobes

Per Documentation/kprobes.txt, probe handlers need to be invoked with
preemption disabled. Update optimized_callback() to do so. Also move
get_kprobe_ctlblk() invocation post preemption disable, since it
accesses pre-cpu data.

This was not an issue so far since optprobes wasn't selected if
CONFIG_PREEMPT was enabled. Commit a30b85df7d599f ("kprobes: Use
synchronize_rcu_tasks() for optprobe with CONFIG_PREEMPT=y") changes
this.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/optprobes.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 91e037ab20a1..60ba7f1370a8 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -115,7 +115,6 @@ static unsigned long can_optimize(struct kprobe *p)
 static void optimized_callback(struct optimized_kprobe *op,
 			       struct pt_regs *regs)
 {
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 	unsigned long flags;
 
 	/* This is possible if op is under delayed unoptimizing */
@@ -124,13 +123,14 @@ static void optimized_callback(struct optimized_kprobe *op,
 
 	local_irq_save(flags);
 	hard_irq_disable();
+	preempt_disable();
 
 	if (kprobe_running()) {
 		kprobes_inc_nmissed_count(&op->kp);
 	} else {
 		__this_cpu_write(current_kprobe, &op->kp);
 		regs->nip = (unsigned long)op->kp.addr;
-		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
 		opt_pre_handler(&op->kp, regs);
 		__this_cpu_write(current_kprobe, NULL);
 	}
@@ -140,6 +140,7 @@ static void optimized_callback(struct optimized_kprobe *op,
 	 * local_irq_restore() will re-enable interrupts,
 	 * if they were hard disabled.
 	 */
+	preempt_enable_no_resched();
 	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(optimized_callback);
-- 
cgit 


From f72180cc93a2c64d4efe0129fa2396ad78be80e3 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Mon, 23 Oct 2017 22:07:39 +0530
Subject: powerpc/kprobes: Do not disable interrupts for optprobes and
 kprobes_on_ftrace

Per Documentation/kprobes.txt, we don't necessarily need to disable
interrupts before invoking the kprobe handlers. Masami submitted
similar changes for x86 via commit a19b2e3d783964 ("kprobes/x86: Remove
IRQ disabling from ftrace-based/optimized kprobes"). Do the same for
powerpc.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes-ftrace.c | 10 ++--------
 arch/powerpc/kernel/optprobes.c      | 10 ----------
 2 files changed, 2 insertions(+), 18 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
index 4b1f34f685b1..7a1f99f1b47f 100644
--- a/arch/powerpc/kernel/kprobes-ftrace.c
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -75,11 +75,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
 {
 	struct kprobe *p;
 	struct kprobe_ctlblk *kcb;
-	unsigned long flags;
 
-	/* Disable irq for emulating a breakpoint and avoiding preempt */
-	local_irq_save(flags);
-	hard_irq_disable();
 	preempt_disable();
 
 	p = get_kprobe((kprobe_opcode_t *)nip);
@@ -105,16 +101,14 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
 		else {
 			/*
 			 * If pre_handler returns !0, it sets regs->nip and
-			 * resets current kprobe. In this case, we still need
-			 * to restore irq, but not preemption.
+			 * resets current kprobe. In this case, we should not
+			 * re-enable preemption.
 			 */
-			local_irq_restore(flags);
 			return;
 		}
 	}
 end:
 	preempt_enable_no_resched();
-	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(kprobe_ftrace_handler);
 
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 60ba7f1370a8..8237884ca389 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -115,14 +115,10 @@ static unsigned long can_optimize(struct kprobe *p)
 static void optimized_callback(struct optimized_kprobe *op,
 			       struct pt_regs *regs)
 {
-	unsigned long flags;
-
 	/* This is possible if op is under delayed unoptimizing */
 	if (kprobe_disabled(&op->kp))
 		return;
 
-	local_irq_save(flags);
-	hard_irq_disable();
 	preempt_disable();
 
 	if (kprobe_running()) {
@@ -135,13 +131,7 @@ static void optimized_callback(struct optimized_kprobe *op,
 		__this_cpu_write(current_kprobe, NULL);
 	}
 
-	/*
-	 * No need for an explicit __hard_irq_enable() here.
-	 * local_irq_restore() will re-enable interrupts,
-	 * if they were hard disabled.
-	 */
 	preempt_enable_no_resched();
-	local_irq_restore(flags);
 }
 NOKPROBE_SYMBOL(optimized_callback);
 
-- 
cgit 


From 67ac0bfe29e9b4359012a8961c196baa813c707a Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Mon, 23 Oct 2017 22:07:40 +0530
Subject: powerpc/kprobes: Blacklist emulate_update_regs() from kprobes

Commit 3cdfcbfd32b9d ("powerpc: Change analyse_instr so it doesn't
modify *regs") introduced emulate_update_regs() to perform part of what
emulate_step() was doing earlier. However, this function was not added
to the kprobes blacklist. Add it so as to prevent it from being probed.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/lib/sstep.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 8c3955e183d4..70274b7b4773 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -2717,6 +2717,7 @@ void emulate_update_regs(struct pt_regs *regs, struct instruction_op *op)
 	}
 	regs->nip = next_pc;
 }
+NOKPROBE_SYMBOL(emulate_update_regs);
 
 /*
  * Emulate a previously-analysed load or store instruction.
-- 
cgit 


From acdfe93101d0f5b968d1bd7ab125517bf7444047 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Mon, 23 Oct 2017 22:07:41 +0530
Subject: powerpc/kprobes: refactor kprobe_lookup_name for safer string
 operations

Use safer string manipulation functions when dealing with a
user-provided string in kprobe_lookup_name().

Reported-by: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/kprobes.c | 47 ++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index a20ce12adab1..ca5d5a081e75 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -53,7 +53,7 @@ bool arch_within_kprobe_blacklist(unsigned long addr)
 
 kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
 {
-	kprobe_opcode_t *addr;
+	kprobe_opcode_t *addr = NULL;
 
 #ifdef PPC64_ELF_ABI_v2
 	/* PPC64 ABIv2 needs local entry point */
@@ -85,36 +85,29 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
 	 * Also handle <module:symbol> format.
 	 */
 	char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];
-	const char *modsym;
 	bool dot_appended = false;
-	if ((modsym = strchr(name, ':')) != NULL) {
-		modsym++;
-		if (*modsym != '\0' && *modsym != '.') {
-			/* Convert to <module:.symbol> */
-			strncpy(dot_name, name, modsym - name);
-			dot_name[modsym - name] = '.';
-			dot_name[modsym - name + 1] = '\0';
-			strncat(dot_name, modsym,
-				sizeof(dot_name) - (modsym - name) - 2);
-			dot_appended = true;
-		} else {
-			dot_name[0] = '\0';
-			strncat(dot_name, name, sizeof(dot_name) - 1);
-		}
-	} else if (name[0] != '.') {
-		dot_name[0] = '.';
-		dot_name[1] = '\0';
-		strncat(dot_name, name, KSYM_NAME_LEN - 2);
+	const char *c;
+	ssize_t ret = 0;
+	int len = 0;
+
+	if ((c = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
+		c++;
+		len = c - name;
+		memcpy(dot_name, name, len);
+	} else
+		c = name;
+
+	if (*c != '\0' && *c != '.') {
+		dot_name[len++] = '.';
 		dot_appended = true;
-	} else {
-		dot_name[0] = '\0';
-		strncat(dot_name, name, KSYM_NAME_LEN - 1);
 	}
-	addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
-	if (!addr && dot_appended) {
-		/* Let's try the original non-dot symbol lookup	*/
+	ret = strscpy(dot_name + len, c, KSYM_NAME_LEN);
+	if (ret > 0)
+		addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
+
+	/* Fallback to the original non-dot symbol lookup */
+	if (!addr && dot_appended)
 		addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
-	}
 #else
 	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
 #endif
-- 
cgit 


From 2a31ad093bb1b8c1f1e04cbe222ad17bc09c4534 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Wed, 6 Sep 2017 11:48:59 +1000
Subject: powerpc/powernv/npu: Use flush_all_mm() instead of flush_tlb_mm()

With the optimisations introduced by commit a46cc7a908 ("powerpc/mm/radix:
Improve TLB/PWC flushes"), flush_tlb_mm() no longer flushes the page walk
cache with radix. Switch to using flush_all_mm() to ensure the pwc and tlb
are properly flushed on the nmmu.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/npu-dma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 2cb6cbea4b3b..2fff9a65975b 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -549,7 +549,7 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
 	 * Unfortunately the nest mmu does not support flushing specific
 	 * addresses so we have to flush the whole mm.
 	 */
-	flush_tlb_mm(npu_context->mm);
+	flush_all_mm(npu_context->mm);
 
 	/*
 	 * Loop over all the NPUs this process is active on and launch
-- 
cgit 


From 1b2c2b12386f9bb009a2249eca00e01a9d76d7c1 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Wed, 6 Sep 2017 11:49:00 +1000
Subject: powerpc/powernv/npu: Don't explicitly flush nmmu tlb

The nest mmu required an explicit flush as a tlbi would not flush it in the
same way as the core. However an alternate firmware fix exists which should
eliminate the need for this flush, so instead add a device-tree property
(ibm,nmmu-flush) on the NVLink2 PHB to enable it only if required.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Reviewed-by: Frederic Barrat <fbarrat@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/npu-dma.c | 28 +++++++++++++++++++++++-----
 arch/powerpc/platforms/powernv/pci.h     |  3 +++
 2 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 2fff9a65975b..f6cbc1a71472 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -395,6 +395,7 @@ struct npu_context {
 	struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
 	struct mmu_notifier mn;
 	struct kref kref;
+	bool nmmu_flush;
 
 	/* Callback to stop translation requests on a given GPU */
 	struct npu_context *(*release_cb)(struct npu_context *, void *);
@@ -545,11 +546,13 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
 	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
 	unsigned long pid = npu_context->mm->context.id;
 
-	/*
-	 * Unfortunately the nest mmu does not support flushing specific
-	 * addresses so we have to flush the whole mm.
-	 */
-	flush_all_mm(npu_context->mm);
+	if (npu_context->nmmu_flush)
+		/*
+		 * Unfortunately the nest mmu does not support flushing specific
+		 * addresses so we have to flush the whole mm once before
+		 * shooting down the GPU translation.
+		 */
+		flush_all_mm(npu_context->mm);
 
 	/*
 	 * Loop over all the NPUs this process is active on and launch
@@ -722,6 +725,16 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 		return ERR_PTR(-ENODEV);
 	npu_context->npdev[npu->index][nvlink_index] = npdev;
 
+	if (!nphb->npu.nmmu_flush) {
+		/*
+		 * If we're not explicitly flushing ourselves we need to mark
+		 * the thread for global flushes
+		 */
+		npu_context->nmmu_flush = false;
+		mm_context_add_copro(mm);
+	} else
+		npu_context->nmmu_flush = true;
+
 	return npu_context;
 }
 EXPORT_SYMBOL(pnv_npu2_init_context);
@@ -731,6 +744,9 @@ static void pnv_npu2_release_context(struct kref *kref)
 	struct npu_context *npu_context =
 		container_of(kref, struct npu_context, kref);
 
+	if (!npu_context->nmmu_flush)
+		mm_context_remove_copro(npu_context->mm);
+
 	npu_context->mm->context.npu_context = NULL;
 	mmu_notifier_unregister(&npu_context->mn,
 				npu_context->mm);
@@ -819,6 +835,8 @@ int pnv_npu2_init(struct pnv_phb *phb)
 	static int npu_index;
 	uint64_t rc = 0;
 
+	phb->npu.nmmu_flush =
+		of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
 	for_each_child_of_node(phb->hose->dn, dn) {
 		gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
 		if (gpdev) {
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 56d1f272d4ad..96151b3a2dd4 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -187,6 +187,9 @@ struct pnv_phb {
 
 		/* Bitmask for MMIO register usage */
 		unsigned long mmio_atsd_usage;
+
+		/* Do we need to explicitly flush the nest mmu? */
+		bool nmmu_flush;
 	} npu;
 
 #ifdef CONFIG_CXL_BASE
-- 
cgit 


From 32ce3862af3c42a3890e99846a8d1ad8871a49f9 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Thu, 19 Oct 2017 18:13:54 +1100
Subject: powerpc/lib: Implement PMEM API

Implement the architecture specific cache maintence functions that make
up the "PMEM API". Currently the writeback and invalidate functions
are the same since the function of the DCBST (data cache block store)
instruction is typically interpreted as "writeback to the point of
coherency" rather than to memory. As a result implementing the API
requires a full cache flush rather than just a cache write back. This
will probably change in the not-too-distant future.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig      |  1 +
 arch/powerpc/lib/Makefile |  2 +-
 arch/powerpc/lib/pmem.c   | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/pmem.c

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b8b75b423316..c844dbc04153 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,6 +138,7 @@ config PPC
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_PMEM_API                if PPC64
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 50d5bf954cff..7aa237d10546 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -23,7 +23,7 @@ endif
 
 obj64-y	+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
 	   copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \
-	   memcpy_64.o memcmp_64.o
+	   memcpy_64.o memcmp_64.o pmem.o
 
 obj64-$(CONFIG_SMP)	+= locks.o
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
new file mode 100644
index 000000000000..0fa09262ca13
--- /dev/null
+++ b/arch/powerpc/lib/pmem.c
@@ -0,0 +1,34 @@
+/*
+ * Copyright(c) 2017 IBM Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/string.h>
+#include <linux/export.h>
+
+#include <asm/cacheflush.h>
+
+/*
+ * CONFIG_ARCH_HAS_PMEM_API symbols
+ */
+void arch_wb_cache_pmem(void *addr, size_t size)
+{
+	unsigned long start = (unsigned long) addr;
+	flush_inval_dcache_range(start, start + size);
+}
+EXPORT_SYMBOL(arch_wb_cache_pmem);
+
+void arch_invalidate_pmem(void *addr, size_t size)
+{
+	unsigned long start = (unsigned long) addr;
+	flush_inval_dcache_range(start, start + size);
+}
+EXPORT_SYMBOL(arch_invalidate_pmem);
-- 
cgit 


From 6c44741d75a27985a0496cd58b123a58c7177108 Mon Sep 17 00:00:00 2001
From: Oliver O'Halloran <oohall@gmail.com>
Date: Thu, 19 Oct 2017 18:13:55 +1100
Subject: powerpc/lib: Implement UACCESS_FLUSHCACHE API

Implement the architecture specific portitions of the UACCESS_FLUSHCACHE
API. This provides functions for the copy_user_flushcache iterator that
ensure that when the copy is finished the destination buffer contains
a copy of the original and that the destination buffer is clean in the
processor caches.

Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/Kconfig               |  1 +
 arch/powerpc/include/asm/string.h  |  2 ++
 arch/powerpc/include/asm/uaccess.h |  5 +++++
 arch/powerpc/lib/pmem.c            | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 41 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c844dbc04153..bbc9336231db 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -142,6 +142,7 @@ config PPC
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
+	select ARCH_HAS_UACCESS_FLUSHCACHE	if PPC64
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAS_ZONE_DEVICE		if PPC_BOOK3S_64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index cc9addefb51c..2796eda17d92 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -11,6 +11,7 @@
 #define __HAVE_ARCH_MEMCMP
 #define __HAVE_ARCH_MEMCHR
 #define __HAVE_ARCH_MEMSET16
+#define __HAVE_ARCH_MEMCPY_FLUSHCACHE
 
 extern char * strcpy(char *,const char *);
 extern char * strncpy(char *,const char *, __kernel_size_t);
@@ -23,6 +24,7 @@ extern void * memcpy(void *,const void *,__kernel_size_t);
 extern void * memmove(void *,const void *,__kernel_size_t);
 extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
+extern void * memcpy_flushcache(void *,const void *,__kernel_size_t);
 
 #ifdef CONFIG_PPC64
 #define __HAVE_ARCH_MEMSET32
diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h
index e34f15e727d9..19a3f75ed610 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -356,4 +356,9 @@ static inline unsigned long clear_user(void __user *addr, unsigned long size)
 extern long strncpy_from_user(char *dst, const char __user *src, long count);
 extern __must_check long strnlen_user(const char __user *str, long n);
 
+extern long __copy_from_user_flushcache(void *dst, const void __user *src,
+		unsigned size);
+extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+			   size_t len);
+
 #endif	/* _ARCH_POWERPC_UACCESS_H */
diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
index 0fa09262ca13..53c018762e1c 100644
--- a/arch/powerpc/lib/pmem.c
+++ b/arch/powerpc/lib/pmem.c
@@ -13,6 +13,7 @@
 
 #include <linux/string.h>
 #include <linux/export.h>
+#include <linux/uaccess.h>
 
 #include <asm/cacheflush.h>
 
@@ -32,3 +33,35 @@ void arch_invalidate_pmem(void *addr, size_t size)
 	flush_inval_dcache_range(start, start + size);
 }
 EXPORT_SYMBOL(arch_invalidate_pmem);
+
+/*
+ * CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE symbols
+ */
+long __copy_from_user_flushcache(void *dest, const void __user *src,
+		unsigned size)
+{
+	unsigned long copied, start = (unsigned long) dest;
+
+	copied = __copy_from_user(dest, src, size);
+	flush_inval_dcache_range(start, start + size);
+
+	return copied;
+}
+
+void *memcpy_flushcache(void *dest, const void *src, size_t size)
+{
+	unsigned long start = (unsigned long) dest;
+
+	memcpy(dest, src, size);
+	flush_inval_dcache_range(start, start + size);
+
+	return dest;
+}
+EXPORT_SYMBOL(memcpy_flushcache);
+
+void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
+	size_t len)
+{
+	memcpy_flushcache(to, page_to_virt(page) + offset, len);
+}
+EXPORT_SYMBOL(memcpy_page_flushcache);
-- 
cgit 


From dcdc46794b7bb76733d9792cca2f45871d66881f Mon Sep 17 00:00:00 2001
From: Michal Suchanek <msuchanek@suse.de>
Date: Mon, 26 Jun 2017 16:06:01 +0200
Subject: powerpc/fadump: use kstrtoint to handle sysfs store

Currently sysfs store handlers in fadump use if buf[0] == 'char'.

This means input "100foo" is interpreted as '1' and "01" as '0'.

Change to kstrtoint so leading zeroes and the like is handled in
expected way.

Signed-off-by: Michal Suchanek <msuchanek@suse.de>
Acked-by: Hari Bathini <hbathini@linux.vnet.ibm.com>
Signed-off-by: Michal Suchanek <a class="moz-txt-link-rfc2396E" href="mailto:msuchanek@suse.de">&lt;msuchanek@suse.de&gt;</a></pre>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/fadump.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index e1431800bfb9..04ea5c04fd24 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1270,10 +1270,15 @@ static ssize_t fadump_release_memory_store(struct kobject *kobj,
 					struct kobj_attribute *attr,
 					const char *buf, size_t count)
 {
+	int input = -1;
+
 	if (!fw_dump.dump_active)
 		return -EPERM;
 
-	if (buf[0] == '1') {
+	if (kstrtoint(buf, 0, &input))
+		return -EINVAL;
+
+	if (input == 1) {
 		/*
 		 * Take away the '/proc/vmcore'. We are releasing the dump
 		 * memory, hence it will not be valid anymore.
@@ -1307,21 +1312,25 @@ static ssize_t fadump_register_store(struct kobject *kobj,
 					const char *buf, size_t count)
 {
 	int ret = 0;
+	int input = -1;
 
 	if (!fw_dump.fadump_enabled || fdm_active)
 		return -EPERM;
 
+	if (kstrtoint(buf, 0, &input))
+		return -EINVAL;
+
 	mutex_lock(&fadump_mutex);
 
-	switch (buf[0]) {
-	case '0':
+	switch (input) {
+	case 0:
 		if (fw_dump.dump_registered == 0) {
 			goto unlock_out;
 		}
 		/* Un-register Firmware-assisted dump */
 		fadump_unregister_dump(&fdm);
 		break;
-	case '1':
+	case 1:
 		if (fw_dump.dump_registered == 1) {
 			ret = -EEXIST;
 			goto unlock_out;
-- 
cgit 


From 46725b17f1c6c815a41429259b3f070c01e71bc1 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Thu, 31 Aug 2017 21:55:57 +0530
Subject: powerpc/signal: Properly handle return value from
 uprobe_deny_signal()

When a uprobe is installed on an instruction that we currently do not
emulate, we copy the instruction into a xol buffer and single step
that instruction. If that instruction generates a fault, we abort the
single stepping before invoking the signal handler. Once the signal
handler is done, the uprobe trap is hit again since the instruction is
retried and the process repeats.

We use uprobe_deny_signal() to detect if the xol instruction triggered
a signal. If so, we clear TIF_SIGPENDING and set TIF_UPROBE so that the
signal is not handled until after the single stepping is aborted. In
this case, uprobe_deny_signal() returns true and get_signal() ends up
returning 0. However, in do_signal(), we are not looking at the return
value, but depending on ksig.sig for further action, all with an
uninitialized ksig that is not touched in this scenario. Fix the same
by initializing ksig.sig to 0.

Fixes: 129b69df9c90 ("powerpc: Use get_signal() signal_setup_done()")
Cc: stable@vger.kernel.org # v3.17+
Reported-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index e9436c5e1e09..3d7539b90010 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -103,7 +103,7 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
 static void do_signal(struct task_struct *tsk)
 {
 	sigset_t *oldset = sigmask_to_save();
-	struct ksignal ksig;
+	struct ksignal ksig = { .sig = 0 };
 	int ret;
 	int is32 = is_32bit_task();
 
-- 
cgit 


From f23ab3efb1b30cc5c5ef5ae4ef294ed467f30675 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 10 Nov 2017 12:15:00 +1100
Subject: powerpc: Fix DABR match on hash based systems

Commit 398a719d34a1 ("powerpc/mm: Update bits used to skip hash_page")
mistakenly dropped the DSISR_DABRMATCH bit from the mask of bit tested
to skip trying to hash a page.

As a result, the DABR matches would no longer be detected.

This adds it back. We open code it in the 2 places where it matters
rather than fold it into DSISR_BAD_FAULT_32S/64S because this isn't
technically a bad fault and while we would never hit it with the
current code, I prefer if page_fault_is_bad() didn't trigger on these.

Fixes: 398a719d34a1 ("powerpc/mm: Update bits used to skip hash_page")
Cc: stable@vger.kernel.org # v4.14
Tested-by: Pedro Miraglia Franco de Carvalho <pedromfc@br.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 arch/powerpc/kernel/head_32.S        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6879aed47377..445137e2d0ca 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1498,7 +1498,7 @@ USE_TEXT_SECTION()
 	.balign	IFETCH_ALIGN_BYTES
 do_hash_page:
 #ifdef CONFIG_PPC_BOOK3S_64
-	lis	r0,DSISR_BAD_FAULT_64S@h
+	lis	r0,(DSISR_BAD_FAULT_64S|DSISR_DABRMATCH)@h
 	ori	r0,r0,DSISR_BAD_FAULT_64S@l
 	and.	r0,r4,r0		/* weird error? */
 	bne-	handle_page_fault	/* if not, try to insert a HPTE */
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 8c54166491e7..29b2fed93289 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -388,7 +388,7 @@ DataAccess:
 	EXCEPTION_PROLOG
 	mfspr	r10,SPRN_DSISR
 	stw	r10,_DSISR(r11)
-	andis.	r0,r10,DSISR_BAD_FAULT_32S@h
+	andis.	r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h
 	bne	1f			/* if not, try to put a PTE */
 	mfspr	r4,SPRN_DAR		/* into the hash table */
 	rlwinm	r3,r10,32-15,21,21	/* DSISR_STORE -> _PAGE_RW */
-- 
cgit 


From 7ece370996b694ae263025e056ad785afc1be5ab Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 13 Nov 2017 23:17:28 +1100
Subject: powerpc/64s/hash: Fix 512T hint detection to use >= 128T

Currently userspace is able to request mmap() search between 128T-512T
by specifying a hint address that is greater than 128T. But that means
a hint of 128T exactly will return an address below 128T, which is
confusing and wrong.

So fix the logic to check the hint is greater than *or equal* to 128T.

Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB")
Cc: stable@vger.kernel.org # v4.12+
Suggested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Suggested-by: Nicholas Piggin <npiggin@gmail.com>
[mpe: Split out of Nick's bigger patch]
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/slice.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 45f6740dd407..48a5312103a1 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -419,7 +419,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/*
 	 * Check if we need to expland slice area.
 	 */
-	if (unlikely(addr > mm->context.addr_limit &&
+	if (unlikely(addr >= mm->context.addr_limit &&
 		     mm->context.addr_limit != TASK_SIZE)) {
 		mm->context.addr_limit = TASK_SIZE;
 		on_each_cpu(slice_flush_segments, mm, 1);
@@ -427,7 +427,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	/*
 	 * This mmap request can allocate upt to 512TB
 	 */
-	if (addr > DEFAULT_MAP_WINDOW)
+	if (addr >= DEFAULT_MAP_WINDOW)
 		high_limit = mm->context.addr_limit;
 	else
 		high_limit = DEFAULT_MAP_WINDOW;
-- 
cgit 


From 6a72dc038b615229a1b285829d6c8378d15c2347 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 10 Nov 2017 04:27:36 +1100
Subject: powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case
 allocation

When allocating VA space with a hint that crosses 128TB, the SLB
addr_limit variable is not expanded if addr is not > 128TB, but the
slice allocation looks at task_size, which is 512TB. This results in
slice_check_fit() incorrectly succeeding because the slice_count
truncates off bit 128 of the requested mask, so the comparison to the
available mask succeeds.

Fix this by using mm->context.addr_limit instead of mm->task_size for
testing allocation limits. This causes such allocations to fail.

Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB")
Cc: stable@vger.kernel.org # v4.12+
Reported-by: Florian Weimer <fweimer@redhat.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/slice.c | 50 ++++++++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 48a5312103a1..3889201b560c 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
 {
 	struct vm_area_struct *vma;
 
-	if ((mm->task_size - len) < addr)
+	if ((mm->context.addr_limit - len) < addr)
 		return 0;
 	vma = find_vma(mm, addr);
 	return (!vma || (addr + len) <= vm_start_gap(vma));
@@ -133,7 +133,7 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
 		if (!slice_low_has_vma(mm, i))
 			ret->low_slices |= 1u << i;
 
-	if (mm->task_size <= SLICE_LOW_TOP)
+	if (mm->context.addr_limit <= SLICE_LOW_TOP)
 		return;
 
 	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
@@ -412,25 +412,31 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	struct slice_mask compat_mask;
 	int fixed = (flags & MAP_FIXED);
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
+	unsigned long page_size = 1UL << pshift;
 	struct mm_struct *mm = current->mm;
 	unsigned long newaddr;
 	unsigned long high_limit;
 
-	/*
-	 * Check if we need to expland slice area.
-	 */
-	if (unlikely(addr >= mm->context.addr_limit &&
-		     mm->context.addr_limit != TASK_SIZE)) {
-		mm->context.addr_limit = TASK_SIZE;
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit)
+		high_limit = TASK_SIZE;
+
+	if (len > high_limit)
+		return -ENOMEM;
+	if (len & (page_size - 1))
+		return -EINVAL;
+	if (fixed) {
+		if (addr & (page_size - 1))
+			return -EINVAL;
+		if (addr > high_limit - len)
+			return -ENOMEM;
+	}
+
+	if (high_limit > mm->context.addr_limit) {
+		mm->context.addr_limit = high_limit;
 		on_each_cpu(slice_flush_segments, mm, 1);
 	}
-	/*
-	 * This mmap request can allocate upt to 512TB
-	 */
-	if (addr >= DEFAULT_MAP_WINDOW)
-		high_limit = mm->context.addr_limit;
-	else
-		high_limit = DEFAULT_MAP_WINDOW;
+
 	/*
 	 * init different masks
 	 */
@@ -446,27 +452,19 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
+	BUG_ON(mm->context.addr_limit == 0);
 	VM_BUG_ON(radix_enabled());
 
 	slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
 	slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
 		  addr, len, flags, topdown);
 
-	if (len > mm->task_size)
-		return -ENOMEM;
-	if (len & ((1ul << pshift) - 1))
-		return -EINVAL;
-	if (fixed && (addr & ((1ul << pshift) - 1)))
-		return -EINVAL;
-	if (fixed && addr > (mm->task_size - len))
-		return -ENOMEM;
-
 	/* If hint, make sure it matches our alignment restrictions */
 	if (!fixed && addr) {
-		addr = _ALIGN_UP(addr, 1ul << pshift);
+		addr = _ALIGN_UP(addr, page_size);
 		slice_dbg(" aligned addr=%lx\n", addr);
 		/* Ignore hint if it's too large or overlaps a VMA */
-		if (addr > mm->task_size - len ||
+		if (addr > high_limit - len ||
 		    !slice_area_is_free(mm, addr, len))
 			addr = 0;
 	}
-- 
cgit 


From effc1b25088502fbd30305c79773de2d1f7470a6 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 10 Nov 2017 04:27:37 +1100
Subject: powerpc/64s/hash: Fix fork() with 512TB process address space

Hash unconditionally resets the addr_limit to default (128TB) when the
mm context is initialised. If a process has > 128TB mappings when it
forks, the child will not get the 512TB addr_limit, so accesses to
valid > 128TB mappings will fail in the child.

Fix this by only resetting the addr_limit to default if it was 0. Non
zero indicates it was duplicated from the parent (0 means exec()).

Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/mmu_context_book3s64.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 6d724dab27c2..846cbad45fce 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -93,11 +93,11 @@ static int hash__init_new_context(struct mm_struct *mm)
 		return index;
 
 	/*
-	 * We do switch_slb() early in fork, even before we setup the
-	 * mm->context.addr_limit. Default to max task size so that we copy the
-	 * default values to paca which will help us to handle slb miss early.
+	 * In the case of exec, use the default limit,
+	 * otherwise inherit it from the mm we are duplicating.
 	 */
-	mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
+	if (!mm->context.addr_limit)
+		mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
 
 	/*
 	 * The old code would re-promote on fork, we don't do that when using
-- 
cgit 


From 35602f82d0c765f991420e319c8d3a596c921eb8 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 10 Nov 2017 04:27:38 +1100
Subject: powerpc/64s/hash: Allow MAP_FIXED allocations to cross 128TB boundary

While mapping hints with a length that cross 128TB are disallowed,
MAP_FIXED allocations that cross 128TB are allowed. These are failing
on hash (on radix they succeed). Add an additional case for fixed
mappings to expand the addr_limit when crossing 128TB.

Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/slice.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 3889201b560c..a4f93699194b 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -418,7 +418,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 	unsigned long high_limit;
 
 	high_limit = DEFAULT_MAP_WINDOW;
-	if (addr >= high_limit)
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
 		high_limit = TASK_SIZE;
 
 	if (len > high_limit)
-- 
cgit 


From 85e3f1adcb9d49300b0a943bb93f9604be375bfb Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 10 Nov 2017 04:27:39 +1100
Subject: powerpc/64s/radix: Fix 128TB-512TB virtual address boundary case
 allocation

Radix VA space allocations test addresses against mm->task_size which
is 512TB, even in cases where the intention is to limit allocation to
below 128TB.

This results in mmap with a hint address below 128TB but address +
length above 128TB succeeding when it should fail (as hash does after
the previous patch).

Set the high address limit to be considered up front, and base
subsequent allocation checks on that consistently.

Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB")
Cc: stable@vger.kernel.org # v4.12+
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/mm/hugetlbpage-radix.c | 26 ++++++++++++------
 arch/powerpc/mm/mmap.c              | 55 ++++++++++++++++++++++---------------
 2 files changed, 50 insertions(+), 31 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index a12e86395025..0a3d71aae175 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -48,17 +48,28 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	struct hstate *h = hstate_file(file);
+	int fixed = (flags & MAP_FIXED);
+	unsigned long high_limit;
 	struct vm_unmapped_area_info info;
 
-	if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
-		mm->context.addr_limit = TASK_SIZE;
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+		high_limit = TASK_SIZE;
 
 	if (len & ~huge_page_mask(h))
 		return -EINVAL;
-	if (len > mm->task_size)
+	if (len > high_limit)
 		return -ENOMEM;
+	if (fixed) {
+		if (addr > high_limit - len)
+			return -ENOMEM;
+	}
 
-	if (flags & MAP_FIXED) {
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE))
+		mm->context.addr_limit = TASK_SIZE;
+
+	if (fixed) {
 		if (prepare_hugepage_range(file, addr, len))
 			return -EINVAL;
 		return addr;
@@ -67,7 +78,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	if (addr) {
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
-		if (mm->task_size - len >= addr &&
+		if (high_limit - len >= addr &&
 		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
@@ -78,12 +89,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
-	info.high_limit = current->mm->mmap_base;
+	info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
 	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
 	info.align_offset = 0;
 
-	if (addr > DEFAULT_MAP_WINDOW)
-		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
-
 	return vm_unmapped_area(&info);
 }
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 5d78b193fec4..6d476a7b5611 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -106,22 +106,32 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
+	int fixed = (flags & MAP_FIXED);
+	unsigned long high_limit;
 	struct vm_unmapped_area_info info;
 
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+		high_limit = TASK_SIZE;
+
+	if (len > high_limit)
+		return -ENOMEM;
+	if (fixed) {
+		if (addr > high_limit - len)
+			return -ENOMEM;
+	}
+
 	if (unlikely(addr > mm->context.addr_limit &&
 		     mm->context.addr_limit != TASK_SIZE))
 		mm->context.addr_limit = TASK_SIZE;
 
-	if (len > mm->task_size - mmap_min_addr)
-		return -ENOMEM;
-
-	if (flags & MAP_FIXED)
+	if (fixed)
 		return addr;
 
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
+		if (high_limit - len >= addr && addr >= mmap_min_addr &&
 		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
@@ -129,13 +139,9 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.flags = 0;
 	info.length = len;
 	info.low_limit = mm->mmap_base;
+	info.high_limit = high_limit;
 	info.align_mask = 0;
 
-	if (unlikely(addr > DEFAULT_MAP_WINDOW))
-		info.high_limit = mm->context.addr_limit;
-	else
-		info.high_limit = DEFAULT_MAP_WINDOW;
-
 	return vm_unmapped_area(&info);
 }
 
@@ -149,37 +155,42 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	unsigned long addr = addr0;
+	int fixed = (flags & MAP_FIXED);
+	unsigned long high_limit;
 	struct vm_unmapped_area_info info;
 
+	high_limit = DEFAULT_MAP_WINDOW;
+	if (addr >= high_limit || (fixed && (addr + len > high_limit)))
+		high_limit = TASK_SIZE;
+
+	if (len > high_limit)
+		return -ENOMEM;
+	if (fixed) {
+		if (addr > high_limit - len)
+			return -ENOMEM;
+	}
+
 	if (unlikely(addr > mm->context.addr_limit &&
 		     mm->context.addr_limit != TASK_SIZE))
 		mm->context.addr_limit = TASK_SIZE;
 
-	/* requested length too big for entire address space */
-	if (len > mm->task_size - mmap_min_addr)
-		return -ENOMEM;
-
-	if (flags & MAP_FIXED)
+	if (fixed)
 		return addr;
 
-	/* requesting a specific address */
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
-				(!vma || addr + len <= vm_start_gap(vma)))
+		if (high_limit - len >= addr && addr >= mmap_min_addr &&
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
 	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
 	info.length = len;
 	info.low_limit = max(PAGE_SIZE, mmap_min_addr);
-	info.high_limit = mm->mmap_base;
+	info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
 	info.align_mask = 0;
 
-	if (addr > DEFAULT_MAP_WINDOW)
-		info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
-
 	addr = vm_unmapped_area(&info);
 	if (!(addr & ~PAGE_MASK))
 		return addr;
-- 
cgit 


From 4722476bce283149986a3463f61009dec0e7f6a1 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 10 Nov 2017 04:27:40 +1100
Subject: powerpc/64s: mm_context.addr_limit is only used on hash

Radix keeps no meaningful state in addr_limit, so remove it from radix
code and rename to slb_addr_limit to make it clear it applies to hash
only.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  2 +-
 arch/powerpc/include/asm/book3s/64/mmu.h      |  2 +-
 arch/powerpc/include/asm/paca.h               |  2 +-
 arch/powerpc/kernel/asm-offsets.c             |  2 +-
 arch/powerpc/kernel/paca.c                    |  4 ++--
 arch/powerpc/kernel/setup-common.c            |  3 ++-
 arch/powerpc/mm/hugetlbpage-radix.c           |  8 +-------
 arch/powerpc/mm/mmap.c                        | 18 ++++--------------
 arch/powerpc/mm/mmu_context_book3s64.c        |  4 ++--
 arch/powerpc/mm/slb_low.S                     |  2 +-
 arch/powerpc/mm/slice.c                       | 22 +++++++++++-----------
 11 files changed, 27 insertions(+), 42 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 508275bb05d5..e91e115a816f 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -606,7 +606,7 @@ extern void slb_set_size(u16 size);
 
 /* 4 bits per slice and we have one slice per 1TB */
 #define SLICE_ARRAY_SIZE	(H_PGTABLE_RANGE >> 41)
-#define TASK_SLICE_ARRAY_SZ(x)	((x)->context.addr_limit >> 41)
+#define TASK_SLICE_ARRAY_SZ(x)	((x)->context.slb_addr_limit >> 41)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index c3b00e8ff791..49a07c5d9e50 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -92,7 +92,7 @@ typedef struct {
 #ifdef CONFIG_PPC_MM_SLICES
 	u64 low_slices_psize;	/* SLB page size encodings */
 	unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
-	unsigned long addr_limit;
+	unsigned long slb_addr_limit;
 #else
 	u16 sllp;		/* SLB page size encoding */
 #endif
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index c907ae23c956..3892db93b837 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -143,7 +143,7 @@ struct paca_struct {
 #ifdef CONFIG_PPC_MM_SLICES
 	u64 mm_ctx_low_slices_psize;
 	unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE];
-	unsigned long addr_limit;
+	unsigned long mm_ctx_slb_addr_limit;
 #else
 	u16 mm_ctx_user_psize;
 	u16 mm_ctx_sllp;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 200623e71474..9aace433491a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -185,7 +185,7 @@ int main(void)
 #ifdef CONFIG_PPC_MM_SLICES
 	OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
 	OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
-	DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit));
+	OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit);
 	DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
 #endif /* CONFIG_PPC_MM_SLICES */
 #endif
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 5d38d5ea9a24..d6597038931d 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -262,8 +262,8 @@ void copy_mm_to_paca(struct mm_struct *mm)
 
 	get_paca()->mm_ctx_id = context->id;
 #ifdef CONFIG_PPC_MM_SLICES
-	VM_BUG_ON(!mm->context.addr_limit);
-	get_paca()->addr_limit = mm->context.addr_limit;
+	VM_BUG_ON(!mm->context.slb_addr_limit);
+	get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit;
 	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
 	memcpy(&get_paca()->mm_ctx_high_slices_psize,
 	       &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index fa661ed616f5..2075322cd225 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -898,7 +898,8 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_PPC_MM_SLICES
 #ifdef CONFIG_PPC64
-	init_mm.context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
+	if (!radix_enabled())
+		init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
 #else
 #error	"context.addr_limit not initialized."
 #endif
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 0a3d71aae175..b54b581a2f7d 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -60,16 +60,10 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -EINVAL;
 	if (len > high_limit)
 		return -ENOMEM;
+
 	if (fixed) {
 		if (addr > high_limit - len)
 			return -ENOMEM;
-	}
-
-	if (unlikely(addr > mm->context.addr_limit &&
-		     mm->context.addr_limit != TASK_SIZE))
-		mm->context.addr_limit = TASK_SIZE;
-
-	if (fixed) {
 		if (prepare_hugepage_range(file, addr, len))
 			return -EINVAL;
 		return addr;
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 6d476a7b5611..d503f344e476 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -116,17 +116,12 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
 
 	if (len > high_limit)
 		return -ENOMEM;
+
 	if (fixed) {
 		if (addr > high_limit - len)
 			return -ENOMEM;
-	}
-
-	if (unlikely(addr > mm->context.addr_limit &&
-		     mm->context.addr_limit != TASK_SIZE))
-		mm->context.addr_limit = TASK_SIZE;
-
-	if (fixed)
 		return addr;
+	}
 
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
@@ -165,17 +160,12 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
 
 	if (len > high_limit)
 		return -ENOMEM;
+
 	if (fixed) {
 		if (addr > high_limit - len)
 			return -ENOMEM;
-	}
-
-	if (unlikely(addr > mm->context.addr_limit &&
-		     mm->context.addr_limit != TASK_SIZE))
-		mm->context.addr_limit = TASK_SIZE;
-
-	if (fixed)
 		return addr;
+	}
 
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 846cbad45fce..5e193e444ee8 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -96,8 +96,8 @@ static int hash__init_new_context(struct mm_struct *mm)
 	 * In the case of exec, use the default limit,
 	 * otherwise inherit it from the mm we are duplicating.
 	 */
-	if (!mm->context.addr_limit)
-		mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
+	if (!mm->context.slb_addr_limit)
+		mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
 
 	/*
 	 * The old code would re-promote on fork, we don't do that when using
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index ed60ad861dfa..2cf5ef3fc50d 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -167,7 +167,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
         /*
          * user space make sure we are within the allowed limit
 	 */
-	ld	r11,PACA_ADDR_LIMIT(r13)
+	ld	r11,PACA_SLB_ADDR_LIMIT(r13)
 	cmpld	r3,r11
 	bge-	8f
 
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index a4f93699194b..564fff06f5c1 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
 {
 	struct vm_area_struct *vma;
 
-	if ((mm->context.addr_limit - len) < addr)
+	if ((mm->context.slb_addr_limit - len) < addr)
 		return 0;
 	vma = find_vma(mm, addr);
 	return (!vma || (addr + len) <= vm_start_gap(vma));
@@ -133,10 +133,10 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
 		if (!slice_low_has_vma(mm, i))
 			ret->low_slices |= 1u << i;
 
-	if (mm->context.addr_limit <= SLICE_LOW_TOP)
+	if (mm->context.slb_addr_limit <= SLICE_LOW_TOP)
 		return;
 
-	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++)
 		if (!slice_high_has_vma(mm, i))
 			__set_bit(i, ret->high_slices);
 }
@@ -157,7 +157,7 @@ static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_ma
 			ret->low_slices |= 1u << i;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
@@ -169,7 +169,7 @@ static int slice_check_fit(struct mm_struct *mm,
 			   struct slice_mask mask, struct slice_mask available)
 {
 	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
-	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
+	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit);
 
 	bitmap_and(result, mask.high_slices,
 		   available.high_slices, slice_count);
@@ -219,7 +219,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
 	mm->context.low_slices_psize = lpsizes;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (test_bit(i, mask.high_slices))
@@ -329,8 +329,8 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 	 * Only for that request for which high_limit is above
 	 * DEFAULT_MAP_WINDOW we should apply this.
 	 */
-	if (high_limit  > DEFAULT_MAP_WINDOW)
-		addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+	if (high_limit > DEFAULT_MAP_WINDOW)
+		addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW;
 
 	while (addr > PAGE_SIZE) {
 		info.high_limit = addr;
@@ -432,8 +432,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 			return -ENOMEM;
 	}
 
-	if (high_limit > mm->context.addr_limit) {
-		mm->context.addr_limit = high_limit;
+	if (high_limit > mm->context.slb_addr_limit) {
+		mm->context.slb_addr_limit = high_limit;
 		on_each_cpu(slice_flush_segments, mm, 1);
 	}
 
@@ -452,7 +452,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
 
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
-	BUG_ON(mm->context.addr_limit == 0);
+	BUG_ON(mm->context.slb_addr_limit == 0);
 	VM_BUG_ON(radix_enabled());
 
 	slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
-- 
cgit 


From 475b581ff57bc01437cbc680e281869918447763 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Tue, 14 Nov 2017 15:48:47 +1100
Subject: powerpc/64s: Fix masking of SRR1 bits on instruction fault

On 64-bit Book3s, when we take an instruction fault the reason for the
fault may be reported in SRR1. For data faults the reason is reported
in DSISR (Data Storage Instruction Status Register).

The reasons reported in each do not necessarily correspond, so we mask
the SRR1 bits before copying them to the DSISR, which is then used by
the page fault code.

Prior to commit b4c001dc44f0 ("powerpc/mm: Use symbolic constants for
filtering SRR1 bits on ISIs") we used a hard-coded mask of 0x58200000,
which corresponds to:

  DSISR_NOHPTE		0x40000000 /* no translation found */
  DSISR_NOEXEC_OR_G	0x10000000 /* exec of no-exec or guarded */
  DSISR_PROTFAULT	0x08000000 /* protection fault */
  DSISR_KEYFAULT	0x00200000 /* Storage Key fault */

That commit added a #define for the mask, DSISR_SRR1_MATCH_64S, but
incorrectly used a different similarly named DSISR_BAD_FAULT_64S.

This had the effect of changing the mask to 0xa43a0000, which omits
everything but DSISR_KEYFAULT.

Luckily this had no visible effect, because in practice we hardly use
the DSISR bits. The lack of DSISR_NOHPTE means a TLB flush
optimisation was missed in the native HPTE code, and DSISR_NOEXEC_OR_G
and DSISR_PROTFAULT are both only used to trigger rare warnings.

So we got lucky, but let's fix it. The new value only has bits between
17 and 30 set, so we can continue to use andis.

Fixes: b4c001dc44f0 ("powerpc/mm: Use symbolic constants for filtering SRR1 bits on ISIs")
Cc: stable@vger.kernel.org # v4.14+
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 445137e2d0ca..4c4d20a29726 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -543,7 +543,7 @@ EXC_COMMON_BEGIN(instruction_access_common)
 	RECONCILE_IRQ_STATE(r10, r11)
 	ld	r12,_MSR(r1)
 	ld	r3,_NIP(r1)
-	andis.	r4,r12,DSISR_BAD_FAULT_64S@h
+	andis.	r4,r12,DSISR_SRR1_MATCH_64S@h
 	li	r5,0x400
 	std	r3,_DAR(r1)
 	std	r4,_DSISR(r1)
-- 
cgit 


From 3ffa9d9e2a7c10127d8cbf91ea2be15390b450ed Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 15 Nov 2017 14:25:42 +1100
Subject: powerpc/64s: Fix Power9 DD2.0 workarounds by adding DD2.1 feature

Recently we added a CPU feature for Power9 DD2.0, to capture the fact
that some workarounds are required only on Power9 DD1 and DD2.0 but
not DD2.1 or later.

Then in commit 9d2f510a66ec ("powerpc/64s/idle: avoid POWER9 DD1 and
DD2.0 ERAT workaround on DD2.1") and commit e3646330cf66
"powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 PMU workaround on
DD2.1") we changed CPU_FTR_SECTIONs to check for DD1 or DD20, eg:

  BEGIN_FTR_SECTION
          PPC_INVALIDATE_ERAT
  END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20)

Unfortunately although this reads as "if set DD1 or DD2.0", the or is
a bitwise or and actually generates a mask of both bits. The code that
does the feature patching then checks that the value of the CPU
features masked with that mask are equal to the mask.

So the end result is we're checking for DD1 and DD20 being set, which
never happens. Yes the API is terrible.

Removing the ERAT workaround on DD2.0 results in random SEGVs, the
system tends to boot, but things randomly die including sometimes
dhclient, udev etc.

To fix the problem and hopefully avoid it in future, we remove the
DD2.0 CPU feature and instead add a DD2.1 (or later) feature. This
allows us to easily express that the workarounds are required if DD2.1
is not set.

At some point we will drop the DD1 workarounds entirely and some of
this can be cleaned up.

Fixes: 9d2f510a66ec ("powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 ERAT workaround on DD2.1")
Fixes: e3646330cf66 ("powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 PMU workaround on DD2.1")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/include/asm/cputable.h | 7 ++++---
 arch/powerpc/kernel/cputable.c      | 6 +++---
 arch/powerpc/kernel/dt_cpu_ftrs.c   | 2 +-
 arch/powerpc/kernel/idle_book3s.S   | 8 ++++----
 4 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 1ca26ca85ec6..04c31db5e7c3 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -215,7 +215,7 @@ enum {
 #define CPU_FTR_DABRX			LONG_ASM_CONST(0x0800000000000000)
 #define CPU_FTR_PMAO_BUG		LONG_ASM_CONST(0x1000000000000000)
 #define CPU_FTR_POWER9_DD1		LONG_ASM_CONST(0x4000000000000000)
-#define CPU_FTR_POWER9_DD20		LONG_ASM_CONST(0x8000000000000000)
+#define CPU_FTR_POWER9_DD2_1		LONG_ASM_CONST(0x8000000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -478,7 +478,8 @@ enum {
 	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
 #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
 			     (~CPU_FTR_SAO))
-#define CPU_FTRS_POWER9_DD20 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD20)
+#define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9
+#define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -498,7 +499,7 @@ enum {
 	     CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
 	     CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \
 	     CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | \
-	     CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD20)
+	     CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD2_1)
 #endif
 #else
 enum {
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 171820190de7..1350f49d81a8 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -551,7 +551,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.pvr_mask		= 0xffffefff,
 		.pvr_value		= 0x004e0200,
 		.cpu_name		= "POWER9 (raw)",
-		.cpu_features		= CPU_FTRS_POWER9_DD20,
+		.cpu_features		= CPU_FTRS_POWER9_DD2_0,
 		.cpu_user_features	= COMMON_USER_POWER9,
 		.cpu_user_features2	= COMMON_USER2_POWER9,
 		.mmu_features		= MMU_FTRS_POWER9,
@@ -567,11 +567,11 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check_early	= __machine_check_early_realmode_p9,
 		.platform		= "power9",
 	},
-	{	/* Power9 */
+	{	/* Power9 DD 2.1 or later (see DD2.0 above) */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x004e0000,
 		.cpu_name		= "POWER9 (raw)",
-		.cpu_features		= CPU_FTRS_POWER9,
+		.cpu_features		= CPU_FTRS_POWER9_DD2_1,
 		.cpu_user_features	= COMMON_USER_POWER9,
 		.cpu_user_features2	= COMMON_USER2_POWER9,
 		.mmu_features		= MMU_FTRS_POWER9,
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 58e3b51de31c..602e0fde19b4 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -736,7 +736,7 @@ static __init void cpufeatures_cpu_quirks(void)
 	if ((version & 0xffffff00) == 0x004e0100)
 		cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1;
 	else if ((version & 0xffffefff) == 0x004e0200)
-		cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD20;
+		cur_cpu_spec->cpu_features &= ~CPU_FTR_POWER9_DD2_1;
 }
 
 static void __init cpufeatures_setup_finished(void)
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 59657783d1d5..01e1c1997893 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -357,13 +357,13 @@ power_enter_stop:
 .Lhandle_esl_ec_set:
 BEGIN_FTR_SECTION
 	/*
-	 * POWER9 DD2 can incorrectly set PMAO when waking up after a
-	 * state-loss idle. Saving and restoring MMCR0 over idle is a
+	 * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after
+	 * a state-loss idle. Saving and restoring MMCR0 over idle is a
 	 * workaround.
 	 */
 	mfspr	r4,SPRN_MMCR0
 	std	r4,_MMCR0(r1)
-END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20)
+END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
 
 /*
  * Check if the requested state is a deep idle state.
@@ -554,7 +554,7 @@ BEGIN_FTR_SECTION
 	ld	r1,PACAR1(r13)
 	ld	r4,_MMCR0(r1)
 	mtspr	SPRN_MMCR0,r4
-END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20)
+END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
 	mfspr	r4,SPRN_MMCRA
 	ori	r4,r4,(1 << (63-60))
 	mtspr	SPRN_MMCRA,r4
-- 
cgit