From b9fde58db7e5738cacb740b0ec547933fe314fbe Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 7 Sep 2017 16:35:44 +1000 Subject: powerpc/powernv: Rework EEH initialization on powernv Remove the post_init callback which is only used by powernv, we can just call it explicitly from the powernv code. This partially kills the ability to "disable" eeh at runtime via debugfs as this was calling that same callback again, but this is both unused and broken in several ways. If we want to revive it, we need to create a dedicated enable/disable callback on the backend that does the right thing. Let the bulk of eeh initialize normally at core_initcall() like it does on pseries by removing the hack in eeh_init() that delays it. Instead we make sure our eeh->probe cleanly bails out of the PEs haven't been created yet and we force a re-probe where we used to call eeh_init() again. Signed-off-by: Benjamin Herrenschmidt Acked-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 8 ++--- arch/powerpc/kernel/eeh.c | 46 +++++++++------------------- arch/powerpc/platforms/powernv/eeh-powernv.c | 42 +++++++++++++------------ arch/powerpc/platforms/powernv/pci-ioda.c | 3 +- arch/powerpc/platforms/powernv/pci.h | 1 + 5 files changed, 40 insertions(+), 60 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 9847ae3a12d1..cd1df96335e5 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -200,7 +200,6 @@ enum { struct eeh_ops { char *name; int (*init)(void); - int (*post_init)(void); void* (*probe)(struct pci_dn *pdn, void *data); int (*set_option)(struct eeh_pe *pe, int option); int (*get_pe_addr)(struct eeh_pe *pe); @@ -275,7 +274,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe); struct eeh_dev *eeh_dev_init(struct pci_dn *pdn); void eeh_dev_phb_init_dynamic(struct pci_controller *phb); -int eeh_init(void); +void eeh_probe_devices(void); int __init eeh_ops_register(struct eeh_ops *ops); int __exit eeh_ops_unregister(const char *name); int eeh_check_failure(const volatile void __iomem *token); @@ -321,10 +320,7 @@ static inline bool eeh_enabled(void) return false; } -static inline int eeh_init(void) -{ - return 0; -} +static inline void eeh_probe_devices(void) { } static inline void *eeh_dev_init(struct pci_dn *pdn, void *data) { diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 116000b45531..cbca0a667682 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -972,6 +972,18 @@ static struct notifier_block eeh_reboot_nb = { .notifier_call = eeh_reboot_notifier, }; +void eeh_probe_devices(void) +{ + struct pci_controller *hose, *tmp; + struct pci_dn *pdn; + + /* Enable EEH for all adapters */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + pdn = hose->pci_data; + traverse_pci_dn(pdn, eeh_ops->probe, NULL); + } +} + /** * eeh_init - EEH initialization * @@ -987,22 +999,11 @@ static struct notifier_block eeh_reboot_nb = { * Even if force-off is set, the EEH hardware is still enabled, so that * newer systems can boot. */ -int eeh_init(void) +static int eeh_init(void) { struct pci_controller *hose, *tmp; - struct pci_dn *pdn; - static int cnt = 0; int ret = 0; - /* - * We have to delay the initialization on PowerNV after - * the PCI hierarchy tree has been built because the PEs - * are figured out based on PCI devices instead of device - * tree nodes - */ - if (machine_is(powernv) && cnt++ <= 0) - return ret; - /* Register reboot notifier */ ret = register_reboot_notifier(&eeh_reboot_nb); if (ret) { @@ -1028,22 +1029,7 @@ int eeh_init(void) if (ret) return ret; - /* Enable EEH for all adapters */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - pdn = hose->pci_data; - traverse_pci_dn(pdn, eeh_ops->probe, NULL); - } - - /* - * Call platform post-initialization. Actually, It's good chance - * to inform platform that EEH is ready to supply service if the - * I/O cache stuff has been built up. - */ - if (eeh_ops->post_init) { - ret = eeh_ops->post_init(); - if (ret) - return ret; - } + eeh_probe_devices(); if (eeh_enabled()) pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n"); @@ -1757,10 +1743,6 @@ static int eeh_enable_dbgfs_set(void *data, u64 val) else eeh_add_flag(EEH_FORCE_DISABLED); - /* Notify the backend */ - if (eeh_ops->post_init) - eeh_ops->post_init(); - return 0; } diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 8864065eba22..4650fb294e7a 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -41,7 +41,6 @@ #include "powernv.h" #include "pci.h" -static bool pnv_eeh_nb_init = false; static int eeh_event_irq = -EINVAL; static int pnv_eeh_init(void) @@ -197,31 +196,31 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); * been built. If the I/O cache staff has been built, EEH is * ready to supply service. */ -static int pnv_eeh_post_init(void) +int pnv_eeh_post_init(void) { struct pci_controller *hose; struct pnv_phb *phb; int ret = 0; - /* Register OPAL event notifier */ - if (!pnv_eeh_nb_init) { - eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); - if (eeh_event_irq < 0) { - pr_err("%s: Can't register OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return eeh_event_irq; - } + /* Probe devices & build address cache */ + eeh_probe_devices(); + eeh_addr_cache_build(); - ret = request_irq(eeh_event_irq, pnv_eeh_event, - IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); - if (ret < 0) { - irq_dispose_mapping(eeh_event_irq); - pr_err("%s: Can't request OPAL event interrupt (%d)\n", - __func__, eeh_event_irq); - return ret; - } + /* Register OPAL event notifier */ + eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); + if (eeh_event_irq < 0) { + pr_err("%s: Can't register OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return eeh_event_irq; + } - pnv_eeh_nb_init = true; + ret = request_irq(eeh_event_irq, pnv_eeh_event, + IRQ_TYPE_LEVEL_HIGH, "opal-eeh", NULL); + if (ret < 0) { + irq_dispose_mapping(eeh_event_irq); + pr_err("%s: Can't request OPAL event interrupt (%d)\n", + __func__, eeh_event_irq); + return ret; } if (!eeh_enabled()) @@ -367,6 +366,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + /* Skip if we haven't probed yet */ + if (phb->ioda.pe_rmap[config_addr] == IODA_INVALID_PE) + return NULL; + /* Initialize eeh device */ edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; @@ -1731,7 +1734,6 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) static struct eeh_ops pnv_eeh_ops = { .name = "powernv", .init = pnv_eeh_init, - .post_init = pnv_eeh_post_init, .probe = pnv_eeh_probe, .set_option = pnv_eeh_set_option, .get_pe_addr = pnv_eeh_get_pe_addr, diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 57f9e55f4352..fb5cd7511189 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -3293,8 +3293,7 @@ static void pnv_pci_ioda_fixup(void) pnv_pci_ioda_create_dbgfs(); #ifdef CONFIG_EEH - eeh_init(); - eeh_addr_cache_build(); + pnv_eeh_post_init(); #endif } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index a95273c524f6..56d1f272d4ad 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -234,6 +234,7 @@ extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); extern bool pnv_pci_enable_device_hook(struct pci_dev *dev); extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +extern int pnv_eeh_post_init(void); extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); -- cgit From 5080332c2c893118dbc18755f35c8b0131cf0fc4 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Fri, 15 Sep 2017 15:25:48 +1000 Subject: powerpc/64s: Add workaround for P9 vector CI load issue POWER9 DD2.1 and earlier has an issue where some cache inhibited vector load will return bad data. The workaround is two part, one firmware/microcode part triggers HMI interrupts when hitting such loads, the other part is this patch which then emulates the instructions in Linux. The affected instructions are limited to lxvd2x, lxvw4x, lxvb16x and lxvh8x. When an instruction triggers the HMI, all threads in the core will be sent to the HMI handler, not just the one running the vector load. In general, these spurious HMIs are detected by the emulation code and we just return back to the running process. Unfortunately, if a spurious interrupt occurs on a vector load that's to normal memory we have no way to detect that it's spurious (unless we walk the page tables, which is very expensive). In this case we emulate the load but we need do so using a vector load itself to ensure 128bit atomicity is preserved. Some additional debugfs emulated instruction counters are added also. Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt [mpe: Switch CONFIG_PPC_BOOK3S_64 to CONFIG_VSX to unbreak the build] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/emulated_ops.h | 4 + arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/include/asm/uaccess.h | 17 +++ arch/powerpc/kernel/exceptions-64s.S | 16 ++- arch/powerpc/kernel/mce.c | 30 ++++- arch/powerpc/kernel/traps.c | 201 ++++++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/smp.c | 7 ++ 7 files changed, 271 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/emulated_ops.h b/arch/powerpc/include/asm/emulated_ops.h index f00e10e2a335..651e1354498e 100644 --- a/arch/powerpc/include/asm/emulated_ops.h +++ b/arch/powerpc/include/asm/emulated_ops.h @@ -55,6 +55,10 @@ extern struct ppc_emulated { struct ppc_emulated_entry mfdscr; struct ppc_emulated_entry mtdscr; struct ppc_emulated_entry lq_stq; + struct ppc_emulated_entry lxvw4x; + struct ppc_emulated_entry lxvh8x; + struct ppc_emulated_entry lxvd2x; + struct ppc_emulated_entry lxvb16x; #endif } ppc_emulated; diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 04b60af027ae..1e06310ccc09 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -210,6 +210,7 @@ struct paca_struct { */ u16 in_mce; u8 hmi_event_available; /* HMI event is available */ + u8 hmi_p9_special_emu; /* HMI P9 special emulation */ #endif /* Stuff for accurate time accounting */ diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 9c0e60ca1666..e34f15e727d9 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -173,6 +173,23 @@ do { \ extern long __get_user_bad(void); +/* + * This does an atomic 128 byte aligned load from userspace. + * Upto caller to do enable_kernel_vmx() before calling! + */ +#define __get_user_atomic_128_aligned(kaddr, uaddr, err) \ + __asm__ __volatile__( \ + "1: lvx 0,0,%1 # get user\n" \ + " stvx 0,0,%2 # put kernel\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + EX_TABLE(1b, 3b) \ + : "=r" (err) \ + : "b" (uaddr), "b" (kaddr), "i" (-EFAULT), "0" (err)) + #define __get_user_asm(x, addr, err, op) \ __asm__ __volatile__( \ "1: "op" %1,0(%2) # get_user\n" \ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 48da0f5d2f7f..8c32dd4cac20 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1010,6 +1010,8 @@ TRAMP_REAL_BEGIN(hmi_exception_early) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */ + cmpdi cr0,r3,0 + /* Windup the stack. */ /* Move original HSRR0 and HSRR1 into the respective regs */ ld r9,_MSR(r1) @@ -1026,10 +1028,15 @@ TRAMP_REAL_BEGIN(hmi_exception_early) REST_8GPRS(2, r1) REST_GPR(10, r1) ld r11,_CCR(r1) + REST_2GPRS(12, r1) + bne 1f mtcr r11 REST_GPR(11, r1) - REST_2GPRS(12, r1) - /* restore original r1. */ + ld r1,GPR1(r1) + hrfid + +1: mtcr r11 + REST_GPR(11, r1) ld r1,GPR1(r1) /* @@ -1042,8 +1049,9 @@ hmi_exception_after_realmode: EXCEPTION_PROLOG_0(PACA_EXGEN) b tramp_real_hmi_exception -EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception) - +EXC_COMMON_BEGIN(hmi_exception_common) +EXCEPTION_COMMON(PACA_EXGEN, 0xe60, hmi_exception_common, handle_hmi_exception, + ret_from_except, FINISH_NAP;ADD_NVGPRS;ADD_RECONCILE;RUNLATCH_ON) EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20) EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80) diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 9b2ea7e71c06..f588951b171d 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -470,6 +470,34 @@ long hmi_exception_realmode(struct pt_regs *regs) { __this_cpu_inc(irq_stat.hmi_exceptions); +#ifdef CONFIG_PPC_BOOK3S_64 + /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ + if (pvr_version_is(PVR_POWER9)) { + unsigned long hmer = mfspr(SPRN_HMER); + + /* Do we have the debug bit set */ + if (hmer & PPC_BIT(17)) { + hmer &= ~PPC_BIT(17); + mtspr(SPRN_HMER, hmer); + + /* + * Now to avoid problems with soft-disable we + * only do the emulation if we are coming from + * user space + */ + if (user_mode(regs)) + local_paca->hmi_p9_special_emu = 1; + + /* + * Don't bother going to OPAL if that's the + * only relevant bit. + */ + if (!(hmer & mfspr(SPRN_HMEER))) + return local_paca->hmi_p9_special_emu; + } + } +#endif /* CONFIG_PPC_BOOK3S_64 */ + wait_for_subcore_guest_exit(); if (ppc_md.hmi_exception_early) @@ -477,5 +505,5 @@ long hmi_exception_realmode(struct pt_regs *regs) wait_for_tb_resync(); - return 0; + return 1; } diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 13c9dcdcba69..9ae1924c7d1a 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -699,6 +700,187 @@ void SMIException(struct pt_regs *regs) die("System Management Interrupt", regs, SIGABRT); } +#ifdef CONFIG_VSX +static void p9_hmi_special_emu(struct pt_regs *regs) +{ + unsigned int ra, rb, t, i, sel, instr, rc; + const void __user *addr; + u8 vbuf[16], *vdst; + unsigned long ea, msr, msr_mask; + bool swap; + + if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip)) + return; + + /* + * lxvb16x opcode: 0x7c0006d8 + * lxvd2x opcode: 0x7c000698 + * lxvh8x opcode: 0x7c000658 + * lxvw4x opcode: 0x7c000618 + */ + if ((instr & 0xfc00073e) != 0x7c000618) { + pr_devel("HMI vec emu: not vector CI %i:%s[%d] nip=%016lx" + " instr=%08x\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr); + return; + } + + /* Grab vector registers into the task struct */ + msr = regs->msr; /* Grab msr before we flush the bits */ + flush_vsx_to_thread(current); + enable_kernel_altivec(); + + /* + * Is userspace running with a different endian (this is rare but + * not impossible) + */ + swap = (msr & MSR_LE) != (MSR_KERNEL & MSR_LE); + + /* Decode the instruction */ + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + t = (instr >> 21) & 0x1f; + if (instr & 1) + vdst = (u8 *)¤t->thread.vr_state.vr[t]; + else + vdst = (u8 *)¤t->thread.fp_state.fpr[t][0]; + + /* Grab the vector address */ + ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0); + if (is_32bit_task()) + ea &= 0xfffffffful; + addr = (__force const void __user *)ea; + + /* Check it */ + if (!access_ok(VERIFY_READ, addr, 16)) { + pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + /* Read the vector */ + rc = 0; + if ((unsigned long)addr & 0xfUL) + /* unaligned case */ + rc = __copy_from_user_inatomic(vbuf, addr, 16); + else + __get_user_atomic_128_aligned(vbuf, addr, rc); + if (rc) { + pr_devel("HMI vec emu: page fault %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + pr_devel("HMI vec emu: emulated vector CI %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, regs->nip, + instr, (unsigned long) addr); + + /* Grab instruction "selector" */ + sel = (instr >> 6) & 3; + + /* + * Check to make sure the facility is actually enabled. This + * could happen if we get a false positive hit. + * + * lxvd2x/lxvw4x always check MSR VSX sel = 0,2 + * lxvh8x/lxvb16x check MSR VSX or VEC depending on VSR used sel = 1,3 + */ + msr_mask = MSR_VSX; + if ((sel & 1) && (instr & 1)) /* lxvh8x & lxvb16x + VSR >= 32 */ + msr_mask = MSR_VEC; + if (!(msr & msr_mask)) { + pr_devel("HMI vec emu: MSR fac clear %i:%s[%d] nip=%016lx" + " instr=%08x msr:%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, msr); + return; + } + + /* Do logging here before we modify sel based on endian */ + switch (sel) { + case 0: /* lxvw4x */ + PPC_WARN_EMULATED(lxvw4x, regs); + break; + case 1: /* lxvh8x */ + PPC_WARN_EMULATED(lxvh8x, regs); + break; + case 2: /* lxvd2x */ + PPC_WARN_EMULATED(lxvd2x, regs); + break; + case 3: /* lxvb16x */ + PPC_WARN_EMULATED(lxvb16x, regs); + break; + } + +#ifdef __LITTLE_ENDIAN__ + /* + * An LE kernel stores the vector in the task struct as an LE + * byte array (effectively swapping both the components and + * the content of the components). Those instructions expect + * the components to remain in ascending address order, so we + * swap them back. + * + * If we are running a BE user space, the expectation is that + * of a simple memcpy, so forcing the emulation to look like + * a lxvb16x should do the trick. + */ + if (swap) + sel = 3; + + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = ((u32 *)vbuf)[3-i]; + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = ((u16 *)vbuf)[7-i]; + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = ((u64 *)vbuf)[1-i]; + break; + case 3: /* lxvb16x */ + for (i = 0; i < 16; i++) + vdst[i] = vbuf[15-i]; + break; + } +#else /* __LITTLE_ENDIAN__ */ + /* On a big endian kernel, a BE userspace only needs a memcpy */ + if (!swap) + sel = 3; + + /* Otherwise, we need to swap the content of the components */ + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = cpu_to_le32(((u32 *)vbuf)[i]); + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = cpu_to_le16(((u16 *)vbuf)[i]); + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = cpu_to_le64(((u64 *)vbuf)[i]); + break; + case 3: /* lxvb16x */ + memcpy(vdst, vbuf, 16); + break; + } +#endif /* !__LITTLE_ENDIAN__ */ + + /* Go to next instruction */ + regs->nip += 4; +} +#endif /* CONFIG_VSX */ + void handle_hmi_exception(struct pt_regs *regs) { struct pt_regs *old_regs; @@ -706,6 +888,21 @@ void handle_hmi_exception(struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_VSX + /* Real mode flagged P9 special emu is needed */ + if (local_paca->hmi_p9_special_emu) { + local_paca->hmi_p9_special_emu = 0; + + /* + * We don't want to take page faults while doing the + * emulation, we just replay the instruction if necessary. + */ + pagefault_disable(); + p9_hmi_special_emu(regs); + pagefault_enable(); + } +#endif /* CONFIG_VSX */ + if (ppc_md.handle_hmi_exception) ppc_md.handle_hmi_exception(regs); @@ -1924,6 +2121,10 @@ struct ppc_emulated ppc_emulated = { WARN_EMULATED_SETUP(mfdscr), WARN_EMULATED_SETUP(mtdscr), WARN_EMULATED_SETUP(lq_stq), + WARN_EMULATED_SETUP(lxvw4x), + WARN_EMULATED_SETUP(lxvh8x), + WARN_EMULATED_SETUP(lxvd2x), + WARN_EMULATED_SETUP(lxvb16x), #endif }; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index c17f81e433f7..355d3f99cafb 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -49,6 +49,13 @@ static void pnv_smp_setup_cpu(int cpu) { + /* + * P9 workaround for CI vector load (see traps.c), + * enable the corresponding HMI interrupt + */ + if (pvr_version_is(PVR_POWER9)) + mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17)); + if (xive_enabled()) xive_smp_setup_cpu(); else if (cpu != boot_cpuid) -- cgit From 6110236b9bbd177debc045c5fc29224444686ece Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Sun, 3 Sep 2017 20:15:12 +0200 Subject: powerpc/mm: Export flush_all_mm() With the optimizations introduced by commit a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes"), flush_tlb_mm() no longer flushes the page walk cache (PWC) with radix. This patch introduces flush_all_mm(), which flushes everything, TLB and PWC, for a given mm. Signed-off-by: Frederic Barrat Reviewed-By: Alistair Popple [mpe: Add a WARN_ON_ONCE() in the empty hash routines] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 22 ++++++++++++++++++++++ .../powerpc/include/asm/book3s/64/tlbflush-radix.h | 3 +++ arch/powerpc/include/asm/book3s/64/tlbflush.h | 15 +++++++++++++++ arch/powerpc/mm/tlb-radix.c | 6 ++++-- 4 files changed, 44 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index 2f6373144e2c..99c99bb04353 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -65,6 +65,28 @@ static inline void hash__flush_tlb_mm(struct mm_struct *mm) { } +static inline void hash__local_flush_all_mm(struct mm_struct *mm) +{ + /* + * There's no Page Walk Cache for hash, so what is needed is + * the same as flush_tlb_mm(), which doesn't really make sense + * with hash. So the only thing we could do is flush the + * entire LPID! Punt for now, as it's not being used. + */ + WARN_ON_ONCE(1); +} + +static inline void hash__flush_all_mm(struct mm_struct *mm) +{ + /* + * There's no Page Walk Cache for hash, so what is needed is + * the same as flush_tlb_mm(), which doesn't really make sense + * with hash. So the only thing we could do is flush the + * entire LPID! Punt for now, as it's not being used. + */ + WARN_ON_ONCE(1); +} + static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 9b433a624bf3..af06c6fe8a9f 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -21,17 +21,20 @@ extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long sta extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); extern void radix__local_flush_tlb_mm(struct mm_struct *mm); +extern void radix__local_flush_all_mm(struct mm_struct *mm); extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); extern void radix__tlb_flush(struct mmu_gather *tlb); #ifdef CONFIG_SMP extern void radix__flush_tlb_mm(struct mm_struct *mm); +extern void radix__flush_all_mm(struct mm_struct *mm); extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); #else #define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) +#define radix__flush_all_mm(mm) radix__local_flush_all_mm(mm) #define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) #define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p) #endif diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 72b925f97bab..70760d018bcd 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -57,6 +57,13 @@ static inline void local_flush_tlb_page(struct vm_area_struct *vma, return hash__local_flush_tlb_page(vma, vmaddr); } +static inline void local_flush_all_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__local_flush_all_mm(mm); + return hash__local_flush_all_mm(mm); +} + static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) @@ -79,9 +86,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, return radix__flush_tlb_page(vma, vmaddr); return hash__flush_tlb_page(vma, vmaddr); } + +static inline void flush_all_mm(struct mm_struct *mm) +{ + if (radix_enabled()) + return radix__flush_all_mm(mm); + return hash__flush_all_mm(mm); +} #else #define flush_tlb_mm(mm) local_flush_tlb_mm(mm) #define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr) +#define flush_all_mm(mm) local_flush_all_mm(mm) #endif /* CONFIG_SMP */ /* * flush the page walk cache for the address diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index b3e849c4886e..5a1f46eff3a2 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -144,7 +144,7 @@ void radix__local_flush_tlb_mm(struct mm_struct *mm) EXPORT_SYMBOL(radix__local_flush_tlb_mm); #ifndef CONFIG_SMP -static void radix__local_flush_all_mm(struct mm_struct *mm) +void radix__local_flush_all_mm(struct mm_struct *mm) { unsigned long pid; @@ -154,6 +154,7 @@ static void radix__local_flush_all_mm(struct mm_struct *mm) _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } +EXPORT_SYMBOL(radix__local_flush_all_mm); #endif /* CONFIG_SMP */ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, @@ -200,7 +201,7 @@ no_context: } EXPORT_SYMBOL(radix__flush_tlb_mm); -static void radix__flush_all_mm(struct mm_struct *mm) +void radix__flush_all_mm(struct mm_struct *mm) { unsigned long pid; @@ -216,6 +217,7 @@ static void radix__flush_all_mm(struct mm_struct *mm) no_context: preempt_enable(); } +EXPORT_SYMBOL(radix__flush_all_mm); void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) { -- cgit From 03b8abedf4f4965e7e9e0d4f92877c42c07ce19f Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Sun, 3 Sep 2017 20:15:13 +0200 Subject: cxl: Enable global TLBIs for cxl contexts The PSL and nMMU need to see all TLB invalidations for the memory contexts used on the adapter. For the hash memory model, it is done by making all TLBIs global as soon as the cxl driver is in use. For radix, we need something similar, but we can refine and only convert to global the invalidations for contexts actually used by the device. The new mm_context_add_copro() API increments the 'active_cpus' count for the contexts attached to the cxl adapter. As soon as there's more than 1 active cpu, the TLBIs for the context become global. Active cpu count must be decremented when detaching to restore locality if possible and to avoid overflowing the counter. The hash memory model support is somewhat limited, as we can't decrement the active cpus count when mm_context_remove_copro() is called, because we can't flush the TLB for a mm on hash. So TLBIs remain global on hash. Signed-off-by: Frederic Barrat Fixes: f24be42aab37 ("cxl: Add psl9 specific code") Tested-by: Alistair Popple [mpe: Fold in updated comment on the barrier from Fred] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu_context.h | 46 ++++++++++++++++++++++++++++++++++ arch/powerpc/mm/mmu_context.c | 9 ------- 2 files changed, 46 insertions(+), 9 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 309592589e30..a0d7145d6cd2 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -77,6 +77,52 @@ extern void switch_cop(struct mm_struct *next); extern int use_cop(unsigned long acop, struct mm_struct *mm); extern void drop_cop(unsigned long acop, struct mm_struct *mm); +#ifdef CONFIG_PPC_BOOK3S_64 +static inline void inc_mm_active_cpus(struct mm_struct *mm) +{ + atomic_inc(&mm->context.active_cpus); +} + +static inline void dec_mm_active_cpus(struct mm_struct *mm) +{ + atomic_dec(&mm->context.active_cpus); +} + +static inline void mm_context_add_copro(struct mm_struct *mm) +{ + /* + * On hash, should only be called once over the lifetime of + * the context, as we can't decrement the active cpus count + * and flush properly for the time being. + */ + inc_mm_active_cpus(mm); +} + +static inline void mm_context_remove_copro(struct mm_struct *mm) +{ + /* + * Need to broadcast a global flush of the full mm before + * decrementing active_cpus count, as the next TLBI may be + * local and the nMMU and/or PSL need to be cleaned up. + * Should be rare enough so that it's acceptable. + * + * Skip on hash, as we don't know how to do the proper flush + * for the time being. Invalidations will remain global if + * used on hash. + */ + if (radix_enabled()) { + flush_all_mm(mm); + dec_mm_active_cpus(mm); + } +} +#else +static inline void inc_mm_active_cpus(struct mm_struct *mm) { } +static inline void dec_mm_active_cpus(struct mm_struct *mm) { } +static inline void mm_context_add_copro(struct mm_struct *mm) { } +static inline void mm_context_remove_copro(struct mm_struct *mm) { } +#endif + + extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c index 0f613bc63c50..d60a62bf4fc7 100644 --- a/arch/powerpc/mm/mmu_context.c +++ b/arch/powerpc/mm/mmu_context.c @@ -34,15 +34,6 @@ static inline void switch_mm_pgdir(struct task_struct *tsk, struct mm_struct *mm) { } #endif -#ifdef CONFIG_PPC_BOOK3S_64 -static inline void inc_mm_active_cpus(struct mm_struct *mm) -{ - atomic_inc(&mm->context.active_cpus); -} -#else -static inline void inc_mm_active_cpus(struct mm_struct *mm) { } -#endif - void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { -- cgit From 842dc1dbabb5e874550b52d896851e5e53e28a5f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 29 Sep 2017 13:29:37 +1000 Subject: powerpc/watchdog: Do not panic from locked CPU's IPI handler The SMP watchdog will detect locked CPUs and IPI them to print a backtrace and registers. If panic on hard lockup is enabled, do not panic from this handler, because that can cause recursion into the IPI layer during the panic. The caller already panics in this case. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/watchdog.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2f6eadd9408d..532a1adbe89b 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -97,8 +97,7 @@ static void wd_lockup_ipi(struct pt_regs *regs) else dump_stack(); - if (hardlockup_panic) - nmi_panic(regs, "Hard LOCKUP"); + /* Do not panic from here because that can recurse into NMI IPI layer */ } static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) -- cgit From d58fdd9d7f220f34830838af4e7b21854469ae8a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 29 Sep 2017 13:29:38 +1000 Subject: powerpc/watchdog: Do not backtrace locked CPUs twice if allcpus backtrace is enabled If sysctl_hardlockup_all_cpu_backtrace is enabled, there is no need to IPI stuck CPUs for backtrace before trigger_allbutself_cpu_backtrace(), which does the same thing again. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/watchdog.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 532a1adbe89b..920e61c79f47 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -133,15 +133,18 @@ static void watchdog_smp_panic(int cpu, u64 tb) pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n", cpu, cpumask_pr_args(&wd_smp_cpus_pending)); - /* - * Try to trigger the stuck CPUs. - */ - for_each_cpu(c, &wd_smp_cpus_pending) { - if (c == cpu) - continue; - smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + if (!sysctl_hardlockup_all_cpu_backtrace) { + /* + * Try to trigger the stuck CPUs, unless we are going to + * get a backtrace on all of them anyway. + */ + for_each_cpu(c, &wd_smp_cpus_pending) { + if (c == cpu) + continue; + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + } + smp_flush_nmi_ipi(1000000); } - smp_flush_nmi_ipi(1000000); /* Take the stuck CPUs out of the watch group */ set_cpumask_stuck(&wd_smp_cpus_pending, tb); -- cgit From 80e4d70b06863e0104e5a0dc78aa3710297fbd4b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 29 Sep 2017 13:29:39 +1000 Subject: powerpc/watchdog: Do not trigger SMP crash from touch_nmi_watchdog In xmon, touch_nmi_watchdog() is not expected to be checking that other CPUs have not touched the watchdog, so the code will just call touch_nmi_watchdog() once before re-enabling hard interrupts. Just update our CPU's state, and ignore apparently stuck SMP threads. Arguably touch_nmi_watchdog should check for SMP lockups, and callers should be fixed, but that's not trivial for the input code of xmon. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/watchdog.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 920e61c79f47..1fb9379dc683 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -277,9 +277,12 @@ void arch_touch_nmi_watchdog(void) { unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; int cpu = smp_processor_id(); + u64 tb = get_tb(); - if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks) - watchdog_timer_interrupt(cpu); + if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) { + per_cpu(wd_timer_tb, cpu) = tb; + wd_smp_clear_cpu_pending(cpu, tb); + } } EXPORT_SYMBOL(arch_touch_nmi_watchdog); -- cgit From 064996d62a33ffe10264b5af5dca92d54f60f806 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 29 Sep 2017 13:29:40 +1000 Subject: powerpc/xmon: Avoid tripping SMP hardlockup watchdog The SMP hardlockup watchdog cross-checks other CPUs for lockups, which causes xmon headaches because it's assuming interrupts hard disabled means no watchdog troubles. Try to improve that by calling touch_nmi_watchdog() in obvious places where secondaries are spinning. Also annotate these spin loops with spin_begin/end calls. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 33351c6704b1..d9a12102b111 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -530,14 +530,19 @@ static int xmon_core(struct pt_regs *regs, int fromipi) waiting: secondary = 1; + spin_begin(); while (secondary && !xmon_gate) { if (in_xmon == 0) { - if (fromipi) + if (fromipi) { + spin_end(); goto leave; + } secondary = test_and_set_bit(0, &in_xmon); } - barrier(); + spin_cpu_relax(); + touch_nmi_watchdog(); } + spin_end(); if (!secondary && !xmon_gate) { /* we are the first cpu to come in */ @@ -568,21 +573,25 @@ static int xmon_core(struct pt_regs *regs, int fromipi) mb(); xmon_gate = 1; barrier(); + touch_nmi_watchdog(); } cmdloop: while (in_xmon) { if (secondary) { + spin_begin(); if (cpu == xmon_owner) { if (!test_and_set_bit(0, &xmon_taken)) { secondary = 0; + spin_end(); continue; } /* missed it */ while (cpu == xmon_owner) - barrier(); + spin_cpu_relax(); } - barrier(); + spin_cpu_relax(); + touch_nmi_watchdog(); } else { cmd = cmds(regs); if (cmd != 0) { -- cgit From 78adf6c214f04fef01c714d78441653dc838a3ba Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 29 Sep 2017 13:29:41 +1000 Subject: powerpc/64s: Implement system reset idle wakeup reason It is possible to wake from idle due to a system reset exception, in which case the CPU takes a system reset interrupt to wake from idle, with system reset as the wakeup reason. The regular (not idle wakeup) system reset interrupt handler must be invoked in this case, otherwise the system reset interrupt is lost. Handle the system reset interrupt immediately after CPU state has been restored. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 41 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 4e65bf82f5e0..4813b83b22aa 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -394,11 +394,19 @@ bool prep_irq_for_idle_irqsoff(void) /* * Take the SRR1 wakeup reason, index into this table to find the * appropriate irq_happened bit. + * + * Sytem reset exceptions taken in idle state also come through here, + * but they are NMI interrupts so do not need to wait for IRQs to be + * restored, and should be taken as early as practical. These are marked + * with 0xff in the table. The Power ISA specifies 0100b as the system + * reset interrupt reason. */ +#define IRQ_SYSTEM_RESET 0xff + static const u8 srr1_to_lazyirq[0x10] = { 0, 0, 0, PACA_IRQ_DBELL, - 0, + IRQ_SYSTEM_RESET, PACA_IRQ_DBELL, PACA_IRQ_DEC, 0, @@ -407,15 +415,42 @@ static const u8 srr1_to_lazyirq[0x10] = { PACA_IRQ_HMI, 0, 0, 0, 0, 0 }; +static noinline void replay_system_reset(void) +{ + struct pt_regs regs; + + ppc_save_regs(®s); + regs.trap = 0x100; + get_paca()->in_nmi = 1; + system_reset_exception(®s); + get_paca()->in_nmi = 0; +} + void irq_set_pending_from_srr1(unsigned long srr1) { unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + u8 reason = srr1_to_lazyirq[idx]; + + /* + * Take the system reset now, which is immediately after registers + * are restored from idle. It's an NMI, so interrupts need not be + * re-enabled before it is taken. + */ + if (unlikely(reason == IRQ_SYSTEM_RESET)) { + replay_system_reset(); + return; + } /* * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, - * so this can be called unconditionally with srr1 wake reason. + * so this can be called unconditionally with the SRR1 wake + * reason as returned by the idle code, which uses 0 to mean no + * interrupt. + * + * If a future CPU was to designate this as an interrupt reason, + * then a new index for no interrupt must be assigned. */ - local_paca->irq_happened |= srr1_to_lazyirq[idx]; + local_paca->irq_happened |= reason; } #endif /* CONFIG_PPC_BOOK3S */ -- cgit From e36d0a2ed5019184bb9b94ff1138c87c05905789 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 29 Sep 2017 13:29:42 +1000 Subject: powerpc/powernv: Implement NMI IPI with OPAL_SIGNAL_SYSTEM_RESET This allows MSR[EE]=0 lockups to be detected on an OPAL (bare metal) system similarly to the hcall NMI IPI on pseries guests, when the platform/firmware supports it. This is an example of CPU10 spinning with interrupts hard disabled: Watchdog CPU:32 detected Hard LOCKUP other CPUS:10 Watchdog CPU:10 Hard LOCKUP CPU: 10 PID: 4410 Comm: bash Not tainted 4.13.0-rc7-00074-ge89ce1f89f62-dirty #34 task: c0000003a82b4400 task.stack: c0000003af55c000 NIP: c0000000000a7b38 LR: c000000000659044 CTR: c0000000000a7b00 REGS: c00000000fd23d80 TRAP: 0100 Not tainted (4.13.0-rc7-00074-ge89ce1f89f62-dirty) MSR: 90000000000c1033 CR: 28422222 XER: 20000000 CFAR: c0000000000a7b38 SOFTE: 0 GPR00: c000000000659044 c0000003af55fbb0 c000000001072a00 0000000000000078 GPR04: c0000003c81b5c80 c0000003c81cc7e8 9000000000009033 0000000000000000 GPR08: 0000000000000000 c0000000000a7b00 0000000000000001 9000000000001003 GPR12: c0000000000a7b00 c00000000fd83200 0000000010180df8 0000000010189e60 GPR16: 0000000010189ed8 0000000010151270 000000001018bd88 000000001018de78 GPR20: 00000000370a0668 0000000000000001 00000000101645e0 0000000010163c10 GPR24: 00007fffd14d6294 00007fffd14d6290 c000000000fba6f0 0000000000000004 GPR28: c000000000f351d8 0000000000000078 c000000000f4095c 0000000000000000 NIP [c0000000000a7b38] sysrq_handle_xmon+0x38/0x40 LR [c000000000659044] __handle_sysrq+0xe4/0x270 Call Trace: [c0000003af55fbd0] [c000000000659044] __handle_sysrq+0xe4/0x270 [c0000003af55fc70] [c000000000659810] write_sysrq_trigger+0x70/0xa0 [c0000003af55fca0] [c0000000003da650] proc_reg_write+0xb0/0x110 [c0000003af55fcf0] [c0000000003423bc] __vfs_write+0x6c/0x1b0 [c0000003af55fd90] [c000000000344398] vfs_write+0xd8/0x240 [c0000003af55fde0] [c00000000034632c] SyS_write+0x6c/0x110 [c0000003af55fe30] [c00000000000b220] system_call+0x58/0x6c Signed-off-by: Nicholas Piggin [mpe: Use kernel types for opal_signal_system_reset()] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 1 + arch/powerpc/include/asm/opal.h | 2 + arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + arch/powerpc/platforms/powernv/setup.c | 1 + arch/powerpc/platforms/powernv/smp.c | 52 ++++++++++++++++++++++++++ 5 files changed, 57 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 450a60b81d2a..9d191ebea706 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -188,6 +188,7 @@ #define OPAL_XIVE_DUMP 142 #define OPAL_XIVE_RESERVED3 143 #define OPAL_XIVE_RESERVED4 144 +#define OPAL_SIGNAL_SYSTEM_RESET 145 #define OPAL_NPU_INIT_CONTEXT 146 #define OPAL_NPU_DESTROY_CONTEXT 147 #define OPAL_NPU_MAP_LPAR 148 diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 726c23304a57..04c32b08ffa1 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -281,6 +281,8 @@ int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr); int opal_set_power_shift_ratio(u32 handle, int token, u32 psr); int opal_sensor_group_clear(u32 group_hndl, int token); +s64 opal_signal_system_reset(s32 cpu); + /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 8c1ede2d3f7e..37cd170201a2 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -307,6 +307,7 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 897aa1400eb8..cf52d53da460 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -282,6 +282,7 @@ static void __init pnv_setup_machdep_opal(void) ppc_md.restart = pnv_restart; pm_power_off = pnv_power_off; ppc_md.halt = pnv_halt; + /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */ ppc_md.machine_check_exception = opal_machine_check; ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; ppc_md.hmi_exception_early = opal_hmi_exception_early; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 355d3f99cafb..ba030669eca1 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -297,6 +297,54 @@ static void __init pnv_smp_probe(void) } } +static int pnv_system_reset_exception(struct pt_regs *regs) +{ + if (smp_handle_nmi_ipi(regs)) + return 1; + return 0; +} + +static int pnv_cause_nmi_ipi(int cpu) +{ + int64_t rc; + + if (cpu >= 0) { + rc = opal_signal_system_reset(get_hard_smp_processor_id(cpu)); + if (rc != OPAL_SUCCESS) + return 0; + return 1; + + } else if (cpu == NMI_IPI_ALL_OTHERS) { + bool success = true; + int c; + + + /* + * We do not use broadcasts (yet), because it's not clear + * exactly what semantics Linux wants or the firmware should + * provide. + */ + for_each_online_cpu(c) { + if (c == smp_processor_id()) + continue; + + rc = opal_signal_system_reset( + get_hard_smp_processor_id(c)); + if (rc != OPAL_SUCCESS) + success = false; + } + if (success) + return 1; + + /* + * Caller will fall back to doorbells, which may pick + * up the remainders. + */ + } + + return 0; +} + static struct smp_ops_t pnv_smp_ops = { .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */ @@ -315,6 +363,10 @@ static struct smp_ops_t pnv_smp_ops = { /* This is called very early during platform setup_arch */ void __init pnv_smp_init(void) { + if (opal_check_token(OPAL_SIGNAL_SYSTEM_RESET)) { + ppc_md.system_reset_exception = pnv_system_reset_exception; + pnv_smp_ops.cause_nmi_ipi = pnv_cause_nmi_ipi; + } smp_ops = &pnv_smp_ops; #ifdef CONFIG_HOTPLUG_CPU -- cgit From 969a86a2855d484a00205a424df1c693acee15d9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 27 Sep 2017 15:45:58 +1000 Subject: powerpc/powernv: Use early_radix_enabled in POWER9 tlb flush This code is used at boot and machine checks, so it should be using early_radix_enabled() (which is usable any time). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/mce_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index b76ca198e09c..d37e612050b5 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -128,7 +128,7 @@ void __flush_tlb_power9(unsigned int action) { unsigned int num_sets; - if (radix_enabled()) + if (early_radix_enabled()) num_sets = POWER9_TLB_SETS_RADIX; else num_sets = POWER9_TLB_SETS_HASH; -- cgit From 83ad1e6a1dc049dd04be4564125a7fd49e022183 Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Fri, 22 Sep 2017 17:04:58 +0530 Subject: powerpc/oprofile: Use setup_timer() helper Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: Michael Ellerman --- arch/powerpc/oprofile/op_model_cell.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index c82497a31c54..264b6ab11978 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -555,9 +555,7 @@ static void cell_virtual_cntr(unsigned long data) static void start_virt_cntrs(void) { - init_timer(&timer_virt_cntr); - timer_virt_cntr.function = cell_virtual_cntr; - timer_virt_cntr.data = 0UL; + setup_timer(&timer_virt_cntr, cell_virtual_cntr, 0UL); timer_virt_cntr.expires = jiffies + HZ / 10; add_timer(&timer_virt_cntr); } @@ -679,9 +677,7 @@ static void spu_evnt_swap(unsigned long data) static void start_spu_event_swap(void) { - init_timer(&timer_spu_event_swap); - timer_spu_event_swap.function = spu_evnt_swap; - timer_spu_event_swap.data = 0UL; + setup_timer(&timer_spu_event_swap, spu_evnt_swap, 0UL); timer_spu_event_swap.expires = jiffies + HZ / 25; add_timer(&timer_spu_event_swap); } -- cgit From 8d6b1bf20f61cd5358fdc5fa83d7fc11c8393d8a Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Fri, 22 Sep 2017 17:04:59 +0530 Subject: powerpc/6xx: Use setup_timer() helper Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/tau_6xx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index a753b72efbc0..726c2d683f93 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -229,8 +229,7 @@ int __init TAU_init(void) /* first, set up the window shrinking timer */ - init_timer(&tau_timer); - tau_timer.function = tau_timeout_smp; + setup_timer(&tau_timer, tau_timeout_smp, 0UL); tau_timer.expires = jiffies + shrink_timer; add_timer(&tau_timer); -- cgit From 01451ad47e2724eb123e87a5bae04e943046b87a Mon Sep 17 00:00:00 2001 From: Allen Pais Date: Fri, 22 Sep 2017 17:05:00 +0530 Subject: powerpc/powermac: Use setup_timer() helper Use setup_timer function instead of initializing timer with the function and data fields. Signed-off-by: Allen Pais Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powermac/low_i2c.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index 70183eb3d5c8..39a1d4225e0f 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -513,9 +513,7 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np) mutex_init(&host->mutex); init_completion(&host->complete); spin_lock_init(&host->lock); - init_timer(&host->timeout_timer); - host->timeout_timer.function = kw_i2c_timeout; - host->timeout_timer.data = (unsigned long)host; + setup_timer(&host->timeout_timer, kw_i2c_timeout, (unsigned long)host); psteps = of_get_property(np, "AAPL,address-step", NULL); steps = psteps ? (*psteps) : 0x10; -- cgit From 924c8feb041c3ef22d46ac2e7468161aad85e979 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 29 Sep 2017 11:14:08 +0530 Subject: powerpc/lib/sstep: Add XER bits introduced in POWER ISA v3.0 This adds definitions for the OV32 and CA32 bits of XER that were introduced in POWER ISA v3.0. There are some existing instructions that currently set the OV and CA bits based on certain conditions. The emulation behaviour of all these instructions needs to be updated to set these new bits accordingly. Signed-off-by: Sandipan Das Acked-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 5e8418c28bd8..16814bfc01da 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -31,6 +31,8 @@ extern char system_call_common[]; #define XER_SO 0x80000000U #define XER_OV 0x40000000U #define XER_CA 0x20000000U +#define XER_OV32 0x00080000U +#define XER_CA32 0x00040000U #ifdef CONFIG_PPC_FPU /* -- cgit From dc39c1d68f256990d4bb0033a219e8ca15ba4d7f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 29 Sep 2017 11:14:09 +0530 Subject: powerpc/lib/sstep: Fix fixed-point arithmetic instructions that set CA32 There are existing fixed-point arithmetic instructions that always set the CA bit of XER to reflect the carry out of bit 0 in 64-bit mode and out of bit 32 in 32-bit mode. In ISA v3.0, these instructions also always set the CA32 bit of XER to reflect the carry out of bit 32. This fixes the emulated behaviour of such instructions when running on a system that is compliant with POWER ISA v3.0. The following instructions are affected: * Add Immediate Carrying (addic) * Add Immediate Carrying and Record (addic.) * Subtract From Immediate Carrying (subfic) * Add Carrying (addc[.]) * Subtract From Carrying (subfc[.]) * Add Extended (adde[.]) * Subtract From Extended (subfe[.]) * Add to Minus One Extended (addme[.]) * Subtract From Minus One Extended (subfme[.]) * Add to Zero Extended (addze[.]) * Subtract From Zero Extended (subfze[.]) Fixes: 0016a4cf5582 ("powerpc: Emulate most Book I instructions in emulate_step()") Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 16814bfc01da..fe1910733e55 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -964,6 +964,16 @@ static nokprobe_inline void set_cr0(const struct pt_regs *regs, op->ccval |= 0x20000000; } +static nokprobe_inline void set_ca32(struct instruction_op *op, bool val) +{ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (val) + op->xerval |= XER_CA32; + else + op->xerval &= ~XER_CA32; + } +} + static nokprobe_inline void add_with_carry(const struct pt_regs *regs, struct instruction_op *op, int rd, unsigned long val1, unsigned long val2, @@ -987,6 +997,9 @@ static nokprobe_inline void add_with_carry(const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + + set_ca32(op, (unsigned int)val < (unsigned int)val1 || + (carry_in && (unsigned int)val == (unsigned int)val1)); } static nokprobe_inline void do_cmp_signed(const struct pt_regs *regs, -- cgit From 0a75aff178df005b0619703cf7886d5c8e28e486 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Fri, 29 Sep 2017 11:14:10 +0530 Subject: powerpc/lib/sstep: Fix fixed-point shift instructions that set CA32 This fixes the emulated behaviour of existing fixed-point shift right algebraic instructions that are supposed to set both the CA and CA32 bits of XER when running on a system that is compliant with POWER ISA v3.0 independent of whether the system is executing in 32-bit mode or 64-bit mode. The following instructions are affected: * Shift Right Algebraic Word Immediate (srawi[.]) * Shift Right Algebraic Word (sraw[.]) * Shift Right Algebraic Doubleword Immediate (sradi[.]) * Shift Right Algebraic Doubleword (srad[.]) Fixes: 0016a4cf5582 ("powerpc: Emulate most Book I instructions in emulate_step()") Signed-off-by: Sandipan Das Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index fe1910733e55..5118110c3983 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1804,6 +1804,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; case 824: /* srawi */ @@ -1816,6 +1817,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; #ifdef __powerpc64__ @@ -1845,6 +1847,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; case 826: /* sradi with sh_5 = 0 */ @@ -1858,6 +1861,7 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs, op->xerval |= XER_CA; else op->xerval &= ~XER_CA; + set_ca32(op, op->xerval & XER_CA); goto logical_done; #endif /* __powerpc64__ */ -- cgit From c3dda4b0db9cf5044010e4de88e5e5b94edf916c Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Wed, 4 Oct 2017 14:53:24 +1030 Subject: powerpc/configs: Add Skiroot defconfig This configuration is used by the OpenPower firmware for it's Linux-as-bootloader implementation. Also known as the Petitboot kernel, this configuration broke in 4.12 (CPU_HOTPLUG=n), so add it to the upstream tree in order to get better coverage. Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman --- arch/powerpc/configs/skiroot_defconfig | 232 +++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 arch/powerpc/configs/skiroot_defconfig (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig new file mode 100644 index 000000000000..6bd5e7261335 --- /dev/null +++ b/arch/powerpc/configs/skiroot_defconfig @@ -0,0 +1,232 @@ +CONFIG_PPC64=y +CONFIG_ALTIVEC=y +CONFIG_VSX=y +CONFIG_NR_CPUS=2048 +CONFIG_CPU_LITTLE_ENDIAN=y +# CONFIG_SWAP is not set +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +# CONFIG_CROSS_MEMORY_ATTACH is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=20 +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +# CONFIG_RD_GZIP is not set +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_PERF_EVENTS=y +# CONFIG_COMPAT_BRK is not set +CONFIG_JUMP_LABEL=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_SIG=y +CONFIG_MODULE_SIG_FORCE=y +CONFIG_MODULE_SIG_SHA512=y +CONFIG_PARTITION_ADVANCED=y +# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_PPC_PSERIES is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_IDLE=y +CONFIG_HZ_100=y +CONFIG_KEXEC=y +CONFIG_IRQ_ALL_CPUS=y +CONFIG_NUMA=y +# CONFIG_COMPACTION is not set +# CONFIG_MIGRATION is not set +# CONFIG_BOUNCE is not set +CONFIG_PPC_64K_PAGES=y +CONFIG_SCHED_SMT=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="console=tty0 console=hvc0 powersave=off" +# CONFIG_SECCOMP is not set +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_NET_IPIP=y +CONFIG_SYN_COOKIES=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_IPV6 is not set +CONFIG_DNS_RESOLVER=y +# CONFIG_WIRELESS is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_MTD=m +CONFIG_MTD_POWERNV_FLASH=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=65536 +CONFIG_VIRTIO_BLK=m +CONFIG_BLK_DEV_NVME=m +CONFIG_EEPROM_AT24=y +# CONFIG_CXL is not set +CONFIG_BLK_DEV_SD=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_SCAN_ASYNC=y +CONFIG_SCSI_FC_ATTRS=y +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_BE2ISCSI=m +CONFIG_SCSI_AACRAID=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_IPR=m +# CONFIG_SCSI_IPR_TRACE is not set +# CONFIG_SCSI_IPR_DUMP is not set +CONFIG_SCSI_QLA_FC=m +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_SCSI_LPFC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_ALUA=m +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +# CONFIG_ATA_SFF is not set +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_BLK_DEV_DM=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_MIRROR=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_ACENIC=m +CONFIG_ACENIC_OMIT_TIGON_I=y +CONFIG_TIGON3=y +CONFIG_BNX2X=m +CONFIG_CHELSIO_T1=y +CONFIG_BE2NET=m +CONFIG_S2IO=m +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_MLX4_EN=m +CONFIG_MLX5_CORE=m +CONFIG_MLX5_CORE_EN=y +CONFIG_MYRI10GE=m +CONFIG_QLGE=m +CONFIG_NETXEN_NIC=m +CONFIG_SFC=m +# CONFIG_USB_NET_DRIVERS is not set +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_INPUT_MISC=y +# CONFIG_SERIO_SERPORT is not set +# CONFIG_DEVMEM is not set +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_IPMI_HANDLER=y +CONFIG_IPMI_DEVICE_INTERFACE=y +CONFIG_IPMI_POWERNV=y +CONFIG_HW_RANDOM=y +CONFIG_TCG_TIS_I2C_NUVOTON=y +# CONFIG_I2C_COMPAT is not set +CONFIG_I2C_CHARDEV=y +# CONFIG_I2C_HELPER_AUTO is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_DRM_AST=m +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_OF=y +CONFIG_FB_MATROX=y +CONFIG_FB_MATROX_MILLENIUM=y +CONFIG_FB_MATROX_MYSTIQUE=y +CONFIG_FB_MATROX_G=y +# CONFIG_LCD_CLASS_DEVICE is not set +# CONFIG_BACKLIGHT_GENERIC is not set +# CONFIG_VGA_CONSOLE is not set +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_USB_HIDDEV=y +CONFIG_USB=y +CONFIG_USB_MON=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_EHCI_HCD=y +# CONFIG_USB_EHCI_HCD_PPC_OF is not set +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_STORAGE=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_GENERIC=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VIRTIO_PCI=y +# CONFIG_IOMMU_SUPPORT is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_XFS_FS=m +CONFIG_XFS_POSIX_ACL=y +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +CONFIG_ISO9660_FS=m +CONFIG_UDF_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_PROC_KCORE=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +# CONFIG_MISC_FILESYSTEMS is not set +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ASCII=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_CRC16=y +CONFIG_CRC_ITU_T=y +CONFIG_LIBCRC32C=y +CONFIG_PRINTK_TIME=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_STACKOVERFLOW=y +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_HARDLOCKUP_DETECTOR=y +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y +CONFIG_WQ_WATCHDOG=y +CONFIG_SCHEDSTATS=y +# CONFIG_FTRACE is not set +CONFIG_XMON=y +CONFIG_XMON_DEFAULT=y +CONFIG_SECURITY=y +CONFIG_IMA=y +CONFIG_EVM=y +# CONFIG_CRYPTO_ECHAINIV is not set +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_CMAC=y +CONFIG_CRYPTO_MD4=y +CONFIG_CRYPTO_ARC4=y +CONFIG_CRYPTO_DES=y +# CONFIG_CRYPTO_HW is not set -- cgit From 22085337f5b9d7a7adf5c6cc4e007c28e5216a7f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 22 Sep 2017 14:40:43 +0530 Subject: powerpc/kprobes: Some cosmetic updates to try_to_emulate() 1. This is only used in kprobes.c, so make it static. 2. Remove the un-necessary (ret == 0) comparison in the else clause. Reviewed-by: Masami Hiramatsu Reviewed-by: Kamalesh Babulal Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 367494dc67d9..c2a6ab38a67f 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -239,7 +239,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) } NOKPROBE_SYMBOL(arch_prepare_kretprobe); -int try_to_emulate(struct kprobe *p, struct pt_regs *regs) +static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) { int ret; unsigned int insn = *p->ainsn.insn; @@ -261,7 +261,7 @@ int try_to_emulate(struct kprobe *p, struct pt_regs *regs) */ printk("Can't step on instruction %x\n", insn); BUG(); - } else if (ret == 0) + } else /* This instruction can't be boosted */ p->ainsn.boostable = -1; -- cgit From a7b440383f9b078254d38ab6a3672810f1eca74f Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 22 Sep 2017 14:40:44 +0530 Subject: powerpc/kprobes: Do not suppress instruction emulation if a single run failed Currently, we disable instruction emulation if emulate_step() fails for any reason. However, such failures could be transient and specific to a particular run. Instead, only disable instruction emulation if we have never been able to emulate this. If we had emulated this instruction successfully at least once, then we single step only this probe hit and continue to try emulating the instruction in subsequent probe hits. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index c2a6ab38a67f..4c1702423676 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -261,9 +261,20 @@ static int try_to_emulate(struct kprobe *p, struct pt_regs *regs) */ printk("Can't step on instruction %x\n", insn); BUG(); - } else - /* This instruction can't be boosted */ - p->ainsn.boostable = -1; + } else { + /* + * If we haven't previously emulated this instruction, then it + * can't be boosted. Note it down so we don't try to do so again. + * + * If, however, we had emulated this instruction in the past, + * then this is just an error with the current run (for + * instance, exceptions due to a load/store). We return 0 so + * that this is now single-stepped, but continue to try + * emulating it in subsequent probe hits. + */ + if (unlikely(p->ainsn.boostable != 1)) + p->ainsn.boostable = -1; + } return ret; } -- cgit From bf3a91251735863fcb230c96e984ebcacee820d9 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 22 Sep 2017 14:40:45 +0530 Subject: powerpc/kprobes: Clean up jprobe detection in livepatch handler In commit c05b8c4474c03 ("powerpc/kprobes: Skip livepatch_handler() for jprobes"), we added a helper is_current_kprobe_addr() to help detect if the modified regs->nip was due to a jprobe or livepatch. Masami felt that the function name was not quite clear. To that end, this patch renames is_current_kprobe_addr() to __is_active_jprobe() and adds a comment to (hopefully) better clarify the purpose of this helper. The helper has also now been moved to kprobes-ftrace.c so that it is only available for KPROBES_ON_FTRACE. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kprobes.h | 2 +- arch/powerpc/kernel/kprobes-ftrace.c | 11 +++++++++++ arch/powerpc/kernel/kprobes.c | 6 ------ arch/powerpc/kernel/trace/ftrace_64_mprofile.S | 4 ++-- 4 files changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 8814a7249ceb..9f3be5c8a4a3 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -103,8 +103,8 @@ extern int kprobe_exceptions_notify(struct notifier_block *self, extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); extern int kprobe_handler(struct pt_regs *regs); extern int kprobe_post_handler(struct pt_regs *regs); -extern int is_current_kprobe_addr(unsigned long addr); #ifdef CONFIG_KPROBES_ON_FTRACE +extern int __is_active_jprobe(unsigned long addr); extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb); #else diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 6c089d9757c9..48f675a73cff 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -25,6 +25,17 @@ #include #include +/* + * This is called from ftrace code after invoking registered handlers to + * disambiguate regs->nip changes done by jprobes and livepatch. We check if + * there is an active jprobe at the provided address (mcount location). + */ +int __is_active_jprobe(unsigned long addr) +{ + struct kprobe *p = kprobe_running(); + return (p && (unsigned long)p->addr == addr) ? 1 : 0; +} + static nokprobe_inline int __skip_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, unsigned long orig_nip) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 4c1702423676..48a81614f629 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -43,12 +43,6 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}}; -int is_current_kprobe_addr(unsigned long addr) -{ - struct kprobe *p = kprobe_running(); - return (p && (unsigned long)p->addr == addr) ? 1 : 0; -} - bool arch_within_kprobe_blacklist(unsigned long addr) { return (addr >= (unsigned long)__kprobes_text_start && diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S index c98e90b4ea7b..c1cfcce0551d 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S +++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S @@ -110,9 +110,9 @@ ftrace_call: /* NIP has not been altered, skip over further checks */ beq 1f - /* Check if there is an active kprobe on us */ + /* Check if there is an active jprobe on us */ subi r3, r14, 4 - bl is_current_kprobe_addr + bl __is_active_jprobe nop /* -- cgit From c179ea270100488c416890beef6424ce390ceb38 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 22 Sep 2017 14:40:46 +0530 Subject: powerpc/kprobes: Fix warnings from __this_cpu_read() on preempt kernels Kamalesh pointed out that we are getting the below call traces with livepatched functions when we enable CONFIG_PREEMPT: [ 495.470721] BUG: using __this_cpu_read() in preemptible [00000000] code: cat/8394 [ 495.471167] caller is is_current_kprobe_addr+0x30/0x90 [ 495.471171] CPU: 4 PID: 8394 Comm: cat Tainted: G K 4.13.0-rc7-nnr+ #95 [ 495.471173] Call Trace: [ 495.471178] [c00000008fd9b960] [c0000000009f039c] dump_stack+0xec/0x160 (unreliable) [ 495.471184] [c00000008fd9b9a0] [c00000000059169c] check_preemption_disabled+0x15c/0x170 [ 495.471187] [c00000008fd9ba30] [c000000000046460] is_current_kprobe_addr+0x30/0x90 [ 495.471191] [c00000008fd9ba60] [c00000000004e9a0] ftrace_call+0x1c/0xb8 [ 495.471195] [c00000008fd9bc30] [c000000000376fd8] seq_read+0x238/0x5c0 [ 495.471199] [c00000008fd9bcd0] [c0000000003cfd78] proc_reg_read+0x88/0xd0 [ 495.471203] [c00000008fd9bd00] [c00000000033e5d4] __vfs_read+0x44/0x1b0 [ 495.471206] [c00000008fd9bd90] [c0000000003402ec] vfs_read+0xbc/0x1b0 [ 495.471210] [c00000008fd9bde0] [c000000000342138] SyS_read+0x68/0x110 [ 495.471214] [c00000008fd9be30] [c00000000000bc6c] system_call+0x58/0x6c Commit c05b8c4474c030 ("powerpc/kprobes: Skip livepatch_handler() for jprobes") introduced a helper is_current_kprobe_addr() to help determine if the current function has been livepatched or if it has a jprobe installed, both of which modify the NIP. This was subsequently renamed to __is_active_jprobe(). In the case of a jprobe, kprobe_ftrace_handler() disables pre-emption before calling into setjmp_pre_handler() which returns without disabling pre-emption. This is done to ensure that the jprobe handler won't disappear beneath us if the jprobe is unregistered between the setjmp_pre_handler() and the subsequent longjmp_break_handler() called from the jprobe handler. Due to this, we can use __this_cpu_read() in __is_active_jprobe() with the pre-emption check as we know that pre-emption will be disabled. However, if this function has been livepatched, we are still doing this check and when we do so, pre-emption won't necessarily be disabled. This results in the call trace shown above. Fix this by only invoking __is_active_jprobe() when pre-emption is disabled. And since we now guard this within a pre-emption check, we can instead use raw_cpu_read() to get the current_kprobe value skipping the check done by __this_cpu_read(). Fixes: c05b8c4474c030 ("powerpc/kprobes: Skip livepatch_handler() for jprobes") Reported-by: Kamalesh Babulal Tested-by: Kamalesh Babulal Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes-ftrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 48f675a73cff..1e54ec8ad85f 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -32,8 +32,12 @@ */ int __is_active_jprobe(unsigned long addr) { - struct kprobe *p = kprobe_running(); - return (p && (unsigned long)p->addr == addr) ? 1 : 0; + if (!preemptible()) { + struct kprobe *p = raw_cpu_read(current_kprobe); + return (p && (unsigned long)p->addr == addr) ? 1 : 0; + } + + return 0; } static nokprobe_inline -- cgit From 6baea433bc84cd148af1c524389a8d756f67412e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 22 Sep 2017 14:40:47 +0530 Subject: powerpc/jprobes: Disable preemption when triggered through ftrace KPROBES_SANITY_TEST throws the below splat when CONFIG_PREEMPT is enabled: Kprobe smoke test: started DEBUG_LOCKS_WARN_ON(val > preempt_count()) ------------[ cut here ]------------ WARNING: CPU: 19 PID: 1 at kernel/sched/core.c:3094 preempt_count_sub+0xcc/0x140 Modules linked in: CPU: 19 PID: 1 Comm: swapper/0 Not tainted 4.13.0-rc7-nnr+ #97 task: c0000000fea80000 task.stack: c0000000feb00000 NIP: c00000000011d3dc LR: c00000000011d3d8 CTR: c000000000a090d0 REGS: c0000000feb03400 TRAP: 0700 Not tainted (4.13.0-rc7-nnr+) MSR: 8000000000021033 CR: 28000282 XER: 00000000 CFAR: c00000000015aa18 SOFTE: 0 NIP preempt_count_sub+0xcc/0x140 LR preempt_count_sub+0xc8/0x140 Call Trace: preempt_count_sub+0xc8/0x140 (unreliable) kprobe_handler+0x228/0x4b0 program_check_exception+0x58/0x3b0 program_check_common+0x16c/0x170 --- interrupt: 0 at kprobe_target+0x8/0x20 LR = init_test_probes+0x248/0x7d0 kp+0x0/0x80 (unreliable) livepatch_handler+0x38/0x74 init_kprobes+0x1d8/0x208 do_one_initcall+0x68/0x1d0 kernel_init_freeable+0x298/0x374 kernel_init+0x24/0x160 ret_from_kernel_thread+0x5c/0x70 Instruction dump: 419effdc 3d22001b 39299240 81290000 2f890000 409effc8 3c82ffcb 3c62ffcb 3884bc68 3863bc18 4803d5fd 60000000 <0fe00000> 4bffffa8 60000000 60000000 ---[ end trace 432dd46b4ce3d29f ]--- Kprobe smoke test: passed successfully The issue is that we aren't disabling preemption in kprobe_ftrace_handler(). Disable it. Fixes: ead514d5fb30a0 ("powerpc/kprobes: Add support for KPROBES_ON_FTRACE") Acked-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao [mpe: Trim oops a little for formatting] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes-ftrace.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 1e54ec8ad85f..4b1f34f685b1 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -80,6 +80,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, /* Disable irq for emulating a breakpoint and avoiding preempt */ local_irq_save(flags); hard_irq_disable(); + preempt_disable(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) @@ -101,12 +102,18 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, kcb->kprobe_status = KPROBE_HIT_ACTIVE; if (!p->pre_handler || !p->pre_handler(p, regs)) __skip_singlestep(p, regs, kcb, orig_nip); - /* - * If pre_handler returns !0, it sets regs->nip and - * resets current kprobe. - */ + else { + /* + * If pre_handler returns !0, it sets regs->nip and + * resets current kprobe. In this case, we still need + * to restore irq, but not preemption. + */ + local_irq_restore(flags); + return; + } } end: + preempt_enable_no_resched(); local_irq_restore(flags); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); -- cgit From 3368f5699a9b0e281e14f9f93244da0a922f6df0 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Fri, 22 Sep 2017 14:40:48 +0530 Subject: powerpc/jprobes: Validate break handler invocation as being due to a jprobe_return() Fix a circa 2005 FIXME by implementing a check to ensure that we actually got into the jprobe break handler() due to the trap in jprobe_return(). Acked-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 48a81614f629..a14c61855705 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -639,24 +639,22 @@ NOKPROBE_SYMBOL(setjmp_pre_handler); void __used jprobe_return(void) { - asm volatile("trap" ::: "memory"); + asm volatile("jprobe_return_trap:\n" + "trap\n" + ::: "memory"); } NOKPROBE_SYMBOL(jprobe_return); -static void __used jprobe_return_end(void) -{ -} -NOKPROBE_SYMBOL(jprobe_return_end); - int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - /* - * FIXME - we should ideally be validating that we got here 'cos - * of the "trap" in jprobe_return() above, before restoring the - * saved regs... - */ + if (regs->nip != ppc_kallsyms_lookup_name("jprobe_return_trap")) { + pr_debug("longjmp_break_handler NIP (0x%lx) does not match jprobe_return_trap (0x%lx)\n", + regs->nip, ppc_kallsyms_lookup_name("jprobe_return_trap")); + return 0; + } + memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); /* It's OK to start function graph tracing again */ unpause_graph_tracing(); -- cgit From c6baa077b784c3b37391a8c11f433e3f881a80df Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 29 Sep 2017 13:58:02 +1000 Subject: powerpc/powernv: Make opal_event_shutdown() callable from IRQ context In opal_event_shutdown() we free all the IRQs hanging off the opal_event_irqchip. However it's not safe to do so if we're called from IRQ context, because free_irq() wants to synchronise versus IRQ context. This can lead to warnings and a stuck system. For example from sysrq-b: Trying to free IRQ 17 from IRQ context! ------------[ cut here ]------------ WARNING: CPU: 0 PID: 0 at kernel/irq/manage.c:1461 __free_irq+0x398/0x8d0 ... NIP __free_irq+0x398/0x8d0 LR __free_irq+0x394/0x8d0 Call Trace: __free_irq+0x394/0x8d0 (unreliable) free_irq+0xa4/0x140 opal_event_shutdown+0x128/0x180 opal_shutdown+0x1c/0xb0 pnv_shutdown+0x20/0x40 machine_restart+0x38/0x90 emergency_restart+0x28/0x40 sysrq_handle_reboot+0x24/0x40 __handle_sysrq+0x198/0x590 hvc_poll+0x48c/0x8c0 hvc_handle_interrupt+0x1c/0x50 __handle_irq_event_percpu+0xe8/0x6e0 handle_irq_event_percpu+0x34/0xe0 handle_irq_event+0xc4/0x210 handle_level_irq+0x250/0x770 generic_handle_irq+0x5c/0xa0 opal_handle_events+0x11c/0x240 opal_interrupt+0x38/0x50 __handle_irq_event_percpu+0xe8/0x6e0 handle_irq_event_percpu+0x34/0xe0 handle_irq_event+0xc4/0x210 handle_fasteoi_irq+0x174/0xa10 generic_handle_irq+0x5c/0xa0 __do_irq+0xbc/0x4e0 call_do_irq+0x14/0x24 do_IRQ+0x18c/0x540 hardware_interrupt_common+0x158/0x180 We can avoid that by using disable_irq_nosync() rather than free_irq(). Although it doesn't fully free the IRQ, it should be sufficient when we're shutting down, particularly in an emergency. Add an in_interrupt() check and use free_irq() when we're shutting down normally. It's probably OK to use disable_irq_nosync() in that case too, but for now it's safer to leave that behaviour as-is. Fixes: 9f0fd0499d30 ("powerpc/powernv: Add a virtual irqchip for opal events") Reported-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-irqchip.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index ecdcba9d1220..9d1b8c0aaf93 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -174,8 +174,14 @@ void opal_event_shutdown(void) /* First free interrupts, which will also mask them */ for (i = 0; i < opal_irq_count; i++) { - if (opal_irqs[i]) + if (!opal_irqs[i]) + continue; + + if (in_interrupt()) + disable_irq_nosync(opal_irqs[i]); + else free_irq(opal_irqs[i], NULL); + opal_irqs[i] = 0; } } -- cgit From 59d3391e8cf274097c42369866070c565891bae5 Mon Sep 17 00:00:00 2001 From: "Guilherme G. Piccoli" Date: Mon, 18 Sep 2017 11:16:58 -0300 Subject: powerpc/xmon: Add option to show uptime information It might be useful to quickly get the uptime of a running system on xmon, without needing to grab data from memory and doing math on struct addresses. For example, it'd be useful to check for how long after a crash a system is on xmon shell or if some test was started after the first test crashed (and this 2nd test crashed too into xmon). This small patch adds the 'U' command, to accomplish this. Suggested-by: Murilo Fossa Vicentini Signed-off-by: Guilherme G. Piccoli [mpe: Display units (seconds), add sync()/__delay() sequence] Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index d9a12102b111..4679aeb84767 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -278,6 +278,7 @@ Commands:\n\ #elif defined(CONFIG_44x) || defined(CONFIG_PPC_BOOK3E) " u dump TLB\n" #endif +" U show uptime information\n" " ? help\n" " # n limit output to n lines per page (for dp, dpa, dl)\n" " zr reboot\n\ @@ -905,6 +906,26 @@ static void remove_cpu_bpts(void) write_ciabr(0); } +/* Based on uptime_proc_show(). */ +static void +show_uptime(void) +{ + struct timespec uptime; + + if (setjmp(bus_error_jmp) == 0) { + catch_memory_errors = 1; + sync(); + + get_monotonic_boottime(&uptime); + printf("Uptime: %lu.%.2lu seconds\n", (unsigned long)uptime.tv_sec, + ((unsigned long)uptime.tv_nsec / (NSEC_PER_SEC/100))); + + sync(); + __delay(200); \ + } + catch_memory_errors = 0; +} + static void set_lpp_cmd(void) { unsigned long lpp; @@ -1040,6 +1061,9 @@ cmds(struct pt_regs *excp) dump_tlb_book3e(); break; #endif + case 'U': + show_uptime(); + break; default: printf("Unrecognized command: "); do { -- cgit From e366b921c7e32c9b92b4c524696fa81ab508dd23 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 21 Sep 2017 15:24:49 +1000 Subject: powerpc: Add PPC_EMULATED_STATS to powernv_defconfig This is useful, especially for developers. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/configs/powernv_defconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index caee834760d2..3fb2a7ab54f1 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -295,6 +295,7 @@ CONFIG_FUNCTION_GRAPH_TRACER=y CONFIG_SCHED_TRACER=y CONFIG_FTRACE_SYSCALLS=y CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_PPC_EMULATED_STATS=y CONFIG_CODE_PATCHING_SELFTEST=y CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y -- cgit From 186b8f1587c79c2fa04bfa392fdf084443e398c1 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Thu, 28 Sep 2017 09:33:39 -0400 Subject: powerpc: Always initialize input array when calling epapr_hypercall() Several callers to epapr_hypercall() pass an uninitialized stack allocated array for the input arguments, presumably because they have no input arguments. However this can produce errors like this one arch/powerpc/include/asm/epapr_hcalls.h:470:42: error: 'in' may be used uninitialized in this function [-Werror=maybe-uninitialized] unsigned long register r3 asm("r3") = in[0]; ~~^~~ Fix callers to this function to always zero-initialize the input arguments array to prevent this. Signed-off-by: Seth Forshee Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/epapr_hcalls.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index 334459ad145b..90863245df53 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -508,7 +508,7 @@ static unsigned long epapr_hypercall(unsigned long *in, static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; unsigned long r; @@ -520,7 +520,7 @@ static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2) static inline long epapr_hypercall0(unsigned int nr) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; return epapr_hypercall(in, out, nr); @@ -528,7 +528,7 @@ static inline long epapr_hypercall0(unsigned int nr) static inline long epapr_hypercall1(unsigned int nr, unsigned long p1) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -538,7 +538,7 @@ static inline long epapr_hypercall1(unsigned int nr, unsigned long p1) static inline long epapr_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -549,7 +549,7 @@ static inline long epapr_hypercall2(unsigned int nr, unsigned long p1, static inline long epapr_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -562,7 +562,7 @@ static inline long epapr_hypercall4(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; -- cgit From 4ca360f3dbf2036d964cdf3a6c4a45a81fdf8e18 Mon Sep 17 00:00:00 2001 From: Kautuk Consul Date: Tue, 19 Apr 2016 15:48:21 +0530 Subject: powerpc: get_wchan(): solve possible race scenario due to parallel wakeup Add a check for p->state == TASK_RUNNING so that any wake-ups on task_struct p in the interim lead to 0 being returned by get_wchan(). Signed-off-by: Kautuk Consul [mpe: Confirmed other architectures do similar] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a0c74bbf3454..166145b18728 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1898,7 +1898,8 @@ unsigned long get_wchan(struct task_struct *p) do { sp = *(unsigned long *)sp; - if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD)) + if (!validate_sp(sp, p, STACK_FRAME_OVERHEAD) || + p->state == TASK_RUNNING) return 0; if (count > 0) { ip = ((unsigned long *)sp)[STACK_FRAME_LR_SAVE]; -- cgit From 1c0437af9fca8de6e4ba179d18cf13154da25695 Mon Sep 17 00:00:00 2001 From: Kamalesh Babulal Date: Tue, 10 Oct 2017 20:17:32 +0530 Subject: powerpc/modules: Use WARN_ON() in stub_for_addr() Use WARN_ON(), while running out of stubs in stub_for_addr() and abort loading of the module instead of BUG_ON(). Signed-off-by: Kamalesh Babulal Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/module_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 0b0f89685b67..759104b99f9f 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -429,7 +429,8 @@ static unsigned long stub_for_addr(const Elf64_Shdr *sechdrs, /* Find this stub, or if that fails, the next avail. entry */ stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr; for (i = 0; stub_func_addr(stubs[i].funcdata); i++) { - BUG_ON(i >= num_stubs); + if (WARN_ON(i >= num_stubs)) + return 0; if (stub_func_addr(stubs[i].funcdata) == func_addr(addr)) return (unsigned long)&stubs[i]; -- cgit From 73e341eb6bea01fde706d10d7edba97f154d9c5f Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 29 Sep 2017 14:26:51 +1000 Subject: powerpc/mce: Remove unused function get_mce_fault_addr() There are no users of get_mce_fault_addr() since commit 1363875bdb63 ("powerpc/64s: fix handling of non-synchronous machine checks") removed the last usage. Signed-off-by: Balbir Singh Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 2 -- arch/powerpc/kernel/mce.c | 39 --------------------------------------- 2 files changed, 41 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 190d69a7f701..75292c7eee48 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -210,6 +210,4 @@ extern void release_mce_event(void); extern void machine_check_queue_event(void); extern void machine_check_print_event_info(struct machine_check_event *evt, bool user_mode); -extern uint64_t get_mce_fault_addr(struct machine_check_event *evt); - #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index f588951b171d..6757720b1979 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -411,45 +411,6 @@ void machine_check_print_event_info(struct machine_check_event *evt, } EXPORT_SYMBOL_GPL(machine_check_print_event_info); -uint64_t get_mce_fault_addr(struct machine_check_event *evt) -{ - switch (evt->error_type) { - case MCE_ERROR_TYPE_UE: - if (evt->u.ue_error.effective_address_provided) - return evt->u.ue_error.effective_address; - break; - case MCE_ERROR_TYPE_SLB: - if (evt->u.slb_error.effective_address_provided) - return evt->u.slb_error.effective_address; - break; - case MCE_ERROR_TYPE_ERAT: - if (evt->u.erat_error.effective_address_provided) - return evt->u.erat_error.effective_address; - break; - case MCE_ERROR_TYPE_TLB: - if (evt->u.tlb_error.effective_address_provided) - return evt->u.tlb_error.effective_address; - break; - case MCE_ERROR_TYPE_USER: - if (evt->u.user_error.effective_address_provided) - return evt->u.user_error.effective_address; - break; - case MCE_ERROR_TYPE_RA: - if (evt->u.ra_error.effective_address_provided) - return evt->u.ra_error.effective_address; - break; - case MCE_ERROR_TYPE_LINK: - if (evt->u.link_error.effective_address_provided) - return evt->u.link_error.effective_address; - break; - default: - case MCE_ERROR_TYPE_UNKNOWN: - break; - } - return 0; -} -EXPORT_SYMBOL(get_mce_fault_addr); - /* * This function is called in real mode. Strictly no printk's please. * -- cgit From 81b61fa7a065e639d097b09c303e3212e4264fac Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 29 Sep 2017 14:26:52 +1000 Subject: powerpc/mce: Align the print of physical address better Use the same alignment as Effective address. Signed-off-by: Balbir Singh Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/mce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 6757720b1979..d1775bdade72 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -340,7 +340,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, printk("%s Effective address: %016llx\n", level, evt->u.ue_error.effective_address); if (evt->u.ue_error.physical_address_provided) - printk("%s Physical address: %016llx\n", + printk("%s Physical address: %016llx\n", level, evt->u.ue_error.physical_address); break; case MCE_ERROR_TYPE_SLB: -- cgit From ba41e1e1ccb9771ce41a3b8e2121f95486e76ac9 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 29 Sep 2017 14:26:53 +1000 Subject: powerpc/mce: Hookup derror (load/store) UE errors Extract physical_address for UE errors by walking the page tables for the mm and address at the NIP, to extract the instruction. Then use the instruction to find the effective address via analyse_instr(). We might have page table walking races, but we expect them to be rare, the physical address extraction is best effort. The idea is to then hook up this infrastructure to memory failure eventually. Signed-off-by: Balbir Singh Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 5 ++ arch/powerpc/include/asm/mce.h | 2 +- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/kernel/mce.c | 6 ++- arch/powerpc/kernel/mce_power.c | 87 ++++++++++++++++++++++++++++++-- 5 files changed, 94 insertions(+), 8 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a318973af05..b27205297e1d 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -54,6 +54,11 @@ #define EX_SIZE 9 /* size in u64 units */ #endif +/* + * maximum recursive depth of MCE exceptions + */ +#define MAX_MCE_DEPTH 4 + /* * EX_LR is only used in EXSLB and where it does not overlap with EX_DAR * EX_CCR similarly with DSISR, but being 4 byte registers there is a hole diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 75292c7eee48..3a1226e9b465 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -204,7 +204,7 @@ struct mce_error_info { extern void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, uint64_t nip, - uint64_t addr); + uint64_t addr, uint64_t phys_addr); extern int get_mce_event(struct machine_check_event *mce, bool release); extern void release_mce_event(void); extern void machine_check_queue_event(void); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 8c32dd4cac20..ce04b1c2ac8e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -232,7 +232,7 @@ BEGIN_FTR_SECTION addi r10,r10,1 /* increment paca->in_mce */ sth r10,PACA_IN_MCE(r13) /* Limit nested MCE to level 4 to avoid stack overflow */ - cmpwi r10,4 + cmpwi r10,MAX_MCE_DEPTH bgt 2f /* Check if we hit limit of 4 */ std r11,GPR1(r1) /* Save r1 on the stack. */ std r11,0(r1) /* make stack chain pointer */ diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index d1775bdade72..1af7c559edbc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -82,7 +82,7 @@ static void mce_set_error_info(struct machine_check_event *mce, */ void save_mce_event(struct pt_regs *regs, long handled, struct mce_error_info *mce_err, - uint64_t nip, uint64_t addr) + uint64_t nip, uint64_t addr, uint64_t phys_addr) { int index = __this_cpu_inc_return(mce_nest_count) - 1; struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]); @@ -140,6 +140,10 @@ void save_mce_event(struct pt_regs *regs, long handled, } else if (mce->error_type == MCE_ERROR_TYPE_UE) { mce->u.ue_error.effective_address_provided = true; mce->u.ue_error.effective_address = addr; + if (phys_addr != ULONG_MAX) { + mce->u.ue_error.physical_address_provided = true; + mce->u.ue_error.physical_address = phys_addr; + } } return; } diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index d37e612050b5..e61b80e37e1f 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -27,6 +27,36 @@ #include #include #include +#include +#include +#include +#include + +/* + * Convert an address related to an mm to a PFN. NOTE: we are in real + * mode, we could potentially race with page table updates. + */ +static unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) +{ + pte_t *ptep; + unsigned long flags; + struct mm_struct *mm; + + if (user_mode(regs)) + mm = current->mm; + else + mm = &init_mm; + + local_irq_save(flags); + if (mm == current->mm) + ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); + else + ptep = find_init_mm_pte(addr, NULL); + local_irq_restore(flags); + if (!ptep || pte_special(*ptep)) + return ULONG_MAX; + return pte_pfn(*ptep); +} static void flush_tlb_206(unsigned int num_sets, unsigned int action) { @@ -421,6 +451,41 @@ static const struct mce_derror_table mce_p9_derror_table[] = { MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0, false, 0, 0, 0, 0 } }; +static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, + uint64_t *phys_addr) +{ + /* + * Carefully look at the NIP to determine + * the instruction to analyse. Reading the NIP + * in real-mode is tricky and can lead to recursive + * faults + */ + int instr; + unsigned long pfn, instr_addr; + struct instruction_op op; + struct pt_regs tmp = *regs; + + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK); + instr = *(unsigned int *)(instr_addr); + if (!analyse_instr(&op, &tmp, instr)) { + pfn = addr_to_pfn(regs, op.ea); + *addr = op.ea; + *phys_addr = (pfn << PAGE_SHIFT); + return 0; + } + /* + * analyse_instr() might fail if the instruction + * is not a load/store, although this is unexpected + * for load/store errors or if we got the NIP + * wrong + */ + } + *addr = 0; + return -1; +} + static int mce_handle_ierror(struct pt_regs *regs, const struct mce_ierror_table table[], struct mce_error_info *mce_err, uint64_t *addr) @@ -489,7 +554,8 @@ static int mce_handle_ierror(struct pt_regs *regs, static int mce_handle_derror(struct pt_regs *regs, const struct mce_derror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t dsisr = regs->dsisr; int handled = 0; @@ -555,7 +621,17 @@ static int mce_handle_derror(struct pt_regs *regs, mce_err->initiator = table[i].initiator; if (table[i].dar_valid) *addr = regs->dar; - + else if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + /* + * We do a maximum of 4 nested MCE calls, see + * kernel/exception-64s.h + */ + if (get_paca()->in_mce < MAX_MCE_DEPTH) + if (!mce_find_instr_ea_and_pfn(regs, addr, + phys_addr)) + handled = 1; + } found = 1; } @@ -592,19 +668,20 @@ static long mce_handle_error(struct pt_regs *regs, const struct mce_ierror_table itable[]) { struct mce_error_info mce_err = { 0 }; - uint64_t addr; + uint64_t addr, phys_addr; uint64_t srr1 = regs->msr; long handled; if (SRR1_MC_LOADSTORE(srr1)) - handled = mce_handle_derror(regs, dtable, &mce_err, &addr); + handled = mce_handle_derror(regs, dtable, &mce_err, &addr, + &phys_addr); else handled = mce_handle_ierror(regs, itable, &mce_err, &addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) handled = mce_handle_ue_error(regs); - save_mce_event(regs, handled, &mce_err, regs->nip, addr); + save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr); return handled; } -- cgit From 01eaac2b0591f7114d9de5cc65c88352de0daca6 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 29 Sep 2017 14:26:54 +1000 Subject: powerpc/mce: Hookup ierror (instruction) UE errors Hookup instruction errors (UE) for memory offling via memory_failure() in a manner similar to load/store errors (derror). Since we have access to the NIP, the conversion is a one step process in this case. Signed-off-by: Balbir Singh Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/mce_power.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index e61b80e37e1f..82cabfc7c2f6 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -488,7 +488,8 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr, static int mce_handle_ierror(struct pt_regs *regs, const struct mce_ierror_table table[], - struct mce_error_info *mce_err, uint64_t *addr) + struct mce_error_info *mce_err, uint64_t *addr, + uint64_t *phys_addr) { uint64_t srr1 = regs->msr; int handled = 0; @@ -540,8 +541,22 @@ static int mce_handle_ierror(struct pt_regs *regs, } mce_err->severity = table[i].severity; mce_err->initiator = table[i].initiator; - if (table[i].nip_valid) + if (table[i].nip_valid) { *addr = regs->nip; + if (mce_err->severity == MCE_SEV_ERROR_SYNC && + table[i].error_type == MCE_ERROR_TYPE_UE) { + unsigned long pfn; + + if (get_paca()->in_mce < MAX_MCE_DEPTH) { + pfn = addr_to_pfn(regs, regs->nip); + if (pfn != ULONG_MAX) { + *phys_addr = + (pfn << PAGE_SHIFT); + handled = 1; + } + } + } + } return handled; } @@ -676,7 +691,8 @@ static long mce_handle_error(struct pt_regs *regs, handled = mce_handle_derror(regs, dtable, &mce_err, &addr, &phys_addr); else - handled = mce_handle_ierror(regs, itable, &mce_err, &addr); + handled = mce_handle_ierror(regs, itable, &mce_err, &addr, + &phys_addr); if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) handled = mce_handle_ue_error(regs); -- cgit From 733e4a4c4467033918f4e64f797b03efe209d82c Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 29 Sep 2017 14:26:55 +1000 Subject: powerpc/mce: hookup memory_failure for UE errors If we are in user space and hit a UE error, we now have the basic infrastructure to walk the page tables and find out the effective address that was accessed, since the DAR is not valid. We use a work_queue content to hookup the bad pfn, any other context causes problems, since memory_failure itself can call into schedule() via lru_drain_ bits. We could probably poison the struct page to avoid a race between detection and taking corrective action. Signed-off-by: Balbir Singh Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/mce.c | 70 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 1af7c559edbc..742e4658c5dc 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -39,11 +39,21 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event); static DEFINE_PER_CPU(int, mce_queue_count); static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); +/* Queue for delayed MCE UE events. */ +static DEFINE_PER_CPU(int, mce_ue_count); +static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], + mce_ue_event_queue); + static void machine_check_process_queued_event(struct irq_work *work); +void machine_check_ue_event(struct machine_check_event *evt); +static void machine_process_ue_event(struct work_struct *work); + static struct irq_work mce_event_process_work = { .func = machine_check_process_queued_event, }; +DECLARE_WORK(mce_ue_event_work, machine_process_ue_event); + static void mce_set_error_info(struct machine_check_event *mce, struct mce_error_info *mce_err) { @@ -143,6 +153,7 @@ void save_mce_event(struct pt_regs *regs, long handled, if (phys_addr != ULONG_MAX) { mce->u.ue_error.physical_address_provided = true; mce->u.ue_error.physical_address = phys_addr; + machine_check_ue_event(mce); } } return; @@ -197,6 +208,26 @@ void release_mce_event(void) get_mce_event(NULL, true); } + +/* + * Queue up the MCE event which then can be handled later. + */ +void machine_check_ue_event(struct machine_check_event *evt) +{ + int index; + + index = __this_cpu_inc_return(mce_ue_count) - 1; + /* If queue is full, just return for now. */ + if (index >= MAX_MC_EVT) { + __this_cpu_dec(mce_ue_count); + return; + } + memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt)); + + /* Queue work to process this event later. */ + schedule_work(&mce_ue_event_work); +} + /* * Queue up the MCE event which then can be handled later. */ @@ -219,7 +250,39 @@ void machine_check_queue_event(void) /* Queue irq work to process this event later. */ irq_work_queue(&mce_event_process_work); } - +/* + * process pending MCE event from the mce event queue. This function will be + * called during syscall exit. + */ +static void machine_process_ue_event(struct work_struct *work) +{ + int index; + struct machine_check_event *evt; + + while (__this_cpu_read(mce_ue_count) > 0) { + index = __this_cpu_read(mce_ue_count) - 1; + evt = this_cpu_ptr(&mce_ue_event_queue[index]); +#ifdef CONFIG_MEMORY_FAILURE + /* + * This should probably queued elsewhere, but + * oh! well + */ + if (evt->error_type == MCE_ERROR_TYPE_UE) { + if (evt->u.ue_error.physical_address_provided) { + unsigned long pfn; + + pfn = evt->u.ue_error.physical_address >> + PAGE_SHIFT; + memory_failure(pfn, SIGBUS, 0); + } else + pr_warn("Failed to identify bad address from " + "where the uncorrectable error (UE) " + "was generated\n"); + } +#endif + __this_cpu_dec(mce_ue_count); + } +} /* * process pending MCE event from the mce event queue. This function will be * called during syscall exit. @@ -227,6 +290,7 @@ void machine_check_queue_event(void) static void machine_check_process_queued_event(struct irq_work *work) { int index; + struct machine_check_event *evt; add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -236,8 +300,8 @@ static void machine_check_process_queued_event(struct irq_work *work) */ while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; - machine_check_print_event_info( - this_cpu_ptr(&mce_event_queue[index]), false); + evt = this_cpu_ptr(&mce_event_queue[index]); + machine_check_print_event_info(evt, false); __this_cpu_dec(mce_queue_count); } } -- cgit From 17f444c0549f2ce037646871e748cf2dc375bce9 Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Fri, 8 Sep 2017 15:47:27 -0500 Subject: powerpc/vphn: Update CPU topology when VPHN enabled powerpc/vphn: On Power systems with shared configurations of CPUs and memory, there are some issues with the association of additional CPUs and memory to nodes when hot-adding resources. This patch corrects the currently broken capability to set the topology for shared CPUs in LPARs. At boot time for shared CPU lpars, the topology for each CPU was being set to node zero. Now when numa_update_cpu_topology() is called appropriately, the Virtual Processor Home Node (VPHN) capabilities information provided by the pHyp allows the appropriate node in the shared configuration to be selected for the CPU. Signed-off-by: Michael Bringmann Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b95c584ce19d..3ae031d437a6 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1153,6 +1153,8 @@ static cpumask_t cpu_associativity_changes_mask; static int vphn_enabled; static int prrn_enabled; static void reset_topology_timer(void); +static int topology_inited; +static int topology_update_needed; /* * Store the current values of the associativity change counters in the @@ -1246,6 +1248,10 @@ static long vphn_get_associativity(unsigned long cpu, "hcall_vphn() experienced a hardware fault " "preventing VPHN. Disabling polling...\n"); stop_topology_update(); + break; + case H_SUCCESS: + dbg("VPHN hcall succeeded. Reset polling...\n"); + break; } return rc; @@ -1323,8 +1329,11 @@ int numa_update_cpu_topology(bool cpus_locked) struct device *dev; int weight, new_nid, i = 0; - if (!prrn_enabled && !vphn_enabled) + if (!prrn_enabled && !vphn_enabled) { + if (!topology_inited) + topology_update_needed = 1; return 0; + } weight = cpumask_weight(&cpu_associativity_changes_mask); if (!weight) @@ -1363,6 +1372,8 @@ int numa_update_cpu_topology(bool cpus_locked) cpumask_andnot(&cpu_associativity_changes_mask, &cpu_associativity_changes_mask, cpu_sibling_mask(cpu)); + dbg("Assoc chg gives same node %d for cpu%d\n", + new_nid, cpu); cpu = cpu_last_thread_sibling(cpu); continue; } @@ -1433,6 +1444,7 @@ int numa_update_cpu_topology(bool cpus_locked) out: kfree(updates); + topology_update_needed = 0; return changed; } @@ -1613,9 +1625,17 @@ static int topology_update_init(void) if (topology_updates_enabled) start_topology_update(); + if (vphn_enabled) + topology_schedule_update(); + if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) return -ENOMEM; + topology_inited = 1; + if (topology_update_needed) + bitmap_fill(cpumask_bits(&cpu_associativity_changes_mask), + nr_cpumask_bits); + return 0; } device_initcall(topology_update_init); -- cgit From a3496e9137f6f58439da29a761725e083229c4a8 Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Fri, 8 Sep 2017 15:47:36 -0500 Subject: powerpc/vphn: Improve recognition of PRRN/VPHN powerpc/vphn: On Power systems with shared configurations of CPUs and memory, there are some issues with the association of additional CPUs and memory to nodes when hot-adding resources. This patch updates the initialization checks to independently recognize PRRN or VPHN support. Signed-off-by: Michael Bringmann Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 3ae031d437a6..5f5ff46ae76d 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1531,15 +1531,14 @@ int start_topology_update(void) if (firmware_has_feature(FW_FEATURE_PRRN)) { if (!prrn_enabled) { prrn_enabled = 1; - vphn_enabled = 0; #ifdef CONFIG_SMP rc = of_reconfig_notifier_register(&dt_update_nb); #endif } - } else if (firmware_has_feature(FW_FEATURE_VPHN) && + } + if (firmware_has_feature(FW_FEATURE_VPHN) && lppaca_shared_proc(get_lppaca())) { if (!vphn_enabled) { - prrn_enabled = 0; vphn_enabled = 1; setup_cpu_associativity_change_counters(); init_timer_deferrable(&topology_timer); @@ -1562,7 +1561,8 @@ int stop_topology_update(void) #ifdef CONFIG_SMP rc = of_reconfig_notifier_unregister(&dt_update_nb); #endif - } else if (vphn_enabled) { + } + if (vphn_enabled) { vphn_enabled = 0; rc = del_timer_sync(&topology_timer); } -- cgit From cee5405da4020b0b0233bc8fb7c8da7322d2c52e Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Fri, 8 Sep 2017 15:47:47 -0500 Subject: powerpc/hotplug: Improve responsiveness of hotplug change powerpc/hotplug: On Power systems with shared configurations of CPUs and memory, there are some issues with the association of additional CPUs and memory to nodes when hot-adding resources. During hotplug CPU operations, this patch resets the timer on topology update work function to a small value to better ensure that the CPU topology is detected and configured sooner. Signed-off-by: Michael Bringmann Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/topology.h | 8 ++++++++ arch/powerpc/mm/numa.c | 23 ++++++++++++++++++++++- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 ++ 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 2d84bca8d053..34da2c5fd1df 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -96,6 +96,14 @@ static inline int prrn_is_enabled(void) } #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */ +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_NEED_MULTIPLE_NODES) +#if defined(CONFIG_PPC_SPLPAR) +extern int timed_topology_update(int nsecs); +#else +#define timed_topology_update(nsecs) +#endif /* CONFIG_PPC_SPLPAR */ +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_NEED_MULTIPLE_NODES */ + #include #ifdef CONFIG_SMP diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 5f5ff46ae76d..32f5f8db46ec 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1148,14 +1148,34 @@ struct topology_update_data { int new_nid; }; +#define TOPOLOGY_DEF_TIMER_SECS 60 + static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS]; static cpumask_t cpu_associativity_changes_mask; static int vphn_enabled; static int prrn_enabled; static void reset_topology_timer(void); +static int topology_timer_secs = 1; static int topology_inited; static int topology_update_needed; +/* + * Change polling interval for associativity changes. + */ +int timed_topology_update(int nsecs) +{ + if (vphn_enabled) { + if (nsecs > 0) + topology_timer_secs = nsecs; + else + topology_timer_secs = TOPOLOGY_DEF_TIMER_SECS; + + reset_topology_timer(); + } + + return 0; +} + /* * Store the current values of the associativity change counters in the * hypervisor. @@ -1251,6 +1271,7 @@ static long vphn_get_associativity(unsigned long cpu, break; case H_SUCCESS: dbg("VPHN hcall succeeded. Reset polling...\n"); + timed_topology_update(0); break; } @@ -1481,7 +1502,7 @@ static struct timer_list topology_timer = static void reset_topology_timer(void) { topology_timer.data = 0; - topology_timer.expires = jiffies + 60 * HZ; + topology_timer.expires = jiffies + topology_timer_secs * HZ; mod_timer(&topology_timer, topology_timer.expires); } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index fadb95efbb9e..a7d14aa7bb7c 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -363,6 +363,7 @@ static int dlpar_online_cpu(struct device_node *dn) BUG_ON(get_cpu_current_state(cpu) != CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_online(get_cpu_device(cpu)); if (rc) goto out; @@ -533,6 +534,7 @@ static int dlpar_offline_cpu(struct device_node *dn) set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); cpu_maps_update_done(); + timed_topology_update(1); rc = device_offline(get_cpu_device(cpu)); if (rc) goto out; -- cgit From 8bc931495d4bfcde6bb242a478ddd1186ae29301 Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Fri, 8 Sep 2017 15:47:56 -0500 Subject: powerpc/vphn: Fix numa update end-loop bug powerpc/vphn: On Power systems with shared configurations of CPUs and memory, there are some issues with the association of additional CPUs and memory to nodes when hot-adding resources. This patch fixes an end-of-updates processing problem observed occasionally in numa_update_cpu_topology(). Signed-off-by: Michael Bringmann Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 32f5f8db46ec..ec098b3f7332 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1401,16 +1401,22 @@ int numa_update_cpu_topology(bool cpus_locked) for_each_cpu(sibling, cpu_sibling_mask(cpu)) { ud = &updates[i++]; + ud->next = &updates[i]; ud->cpu = sibling; ud->new_nid = new_nid; ud->old_nid = numa_cpu_lookup_table[sibling]; cpumask_set_cpu(sibling, &updated_cpus); - if (i < weight) - ud->next = &updates[i]; } cpu = cpu_last_thread_sibling(cpu); } + /* + * Prevent processing of 'updates' from overflowing array + * where last entry filled in a 'next' pointer. + */ + if (i) + updates[i-1].next = NULL; + pr_debug("Topology update for the following CPUs:\n"); if (cpumask_weight(&updated_cpus)) { for (ud = &updates[0]; ud; ud = ud->next) { -- cgit From 31a4d4480c616bf0fa475f1869e5df852d3544bd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 19 Oct 2017 15:14:20 +1100 Subject: Revert "KVM: PPC: Book3S HV: POWER9 does not require secondary thread management" This reverts commit 94a04bc25a2c6296bd0c5e82c10e8231c2b11f77. In order to run HPT guests on a radix POWER9 host, we will have to run the host in single-threaded mode, because POWER9 processors do not currently support running some threads of a core in HPT mode while others are in radix mode ("mixed mode"). That means that we will need the same mechanisms that are used on POWER8 to make the secondary threads available to KVM, which were disabled on POWER9 by commit 94a04bc25a2c. Signed-off-by: Paul Mackerras Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/kvm_book3s_asm.h | 4 ---- arch/powerpc/kernel/idle_book3s.S | 35 ++++++++++--------------------- arch/powerpc/kvm/book3s_hv.c | 13 +----------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 ------- 4 files changed, 12 insertions(+), 48 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 83596f32f50b..7cea76f11c26 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -104,10 +104,6 @@ struct kvmppc_host_state { u8 napping; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* - * hwthread_req/hwthread_state pair is used to pull sibling threads - * out of guest on pre-ISAv3.0B CPUs where threads share MMU. - */ u8 hwthread_req; u8 hwthread_state; u8 host_ipi; diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 1125c9be9e06..175d49f468af 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -319,20 +319,13 @@ enter_winkle: /* * r3 - PSSCR value corresponding to the requested stop state. */ +power_enter_stop: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -power_enter_stop_kvm_rm: - /* - * This is currently unused because POWER9 KVM does not have to - * gather secondary threads into sibling mode, but the code is - * here in case that function is required. - * - * Tell KVM we're entering idle. - */ + /* Tell KVM we're entering idle */ li r4,KVM_HWTHREAD_IN_IDLE /* DO THIS IN REAL MODE! See comment above. */ stb r4,HSTATE_HWTHREAD_STATE(r13) #endif -power_enter_stop: /* * Check if we are executing the lite variant with ESL=EC=0 */ @@ -496,18 +489,6 @@ pnv_powersave_wakeup_mce: b pnv_powersave_wakeup -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -kvm_start_guest_check: - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. testing hwthread_req */ - sync - lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beqlr - b kvm_start_guest -#endif - /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss @@ -532,9 +513,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) mr r3,r12 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -BEGIN_FTR_SECTION - bl kvm_start_guest_check -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beq 1f + b kvm_start_guest +1: #endif /* Return SRR1 from power7_nap() */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 73bf1ebfa78f..8f34715cfbff 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2117,15 +2117,6 @@ static int kvmppc_grab_hwthread(int cpu) struct paca_struct *tpaca; long timeout = 10000; - /* - * ISA v3.0 idle routines do not set hwthread_state or test - * hwthread_req, so they can not grab idle threads. - */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - WARN(1, "KVM: can not control sibling threads\n"); - return -EBUSY; - } - tpaca = &paca[cpu]; /* Ensure the thread won't go into the kernel if it wakes */ @@ -2160,12 +2151,10 @@ static void kvmppc_release_hwthread(int cpu) struct paca_struct *tpaca; tpaca = &paca[cpu]; + tpaca->kvm_hstate.hwthread_req = 0; tpaca->kvm_hstate.kvm_vcpu = NULL; tpaca->kvm_hstate.kvm_vcore = NULL; tpaca->kvm_hstate.kvm_split_mode = NULL; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - tpaca->kvm_hstate.hwthread_req = 0; - } static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 17936f82d3c7..fb02209ff782 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -149,11 +149,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) subf r4, r4, r3 mtspr SPRN_DEC, r4 -BEGIN_FTR_SECTION /* hwthread_req may have got set by cede or no vcpu, so clear it */ li r0, 0 stb r0, HSTATE_HWTHREAD_REQ(r13) -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* * For external interrupts we need to call the Linux @@ -316,7 +314,6 @@ kvm_novcpu_exit: * Relocation is off and most register values are lost. * r13 points to the PACA. * r3 contains the SRR1 wakeup value, SRR1 is trashed. - * This is not used by ISAv3.0B processors. */ .globl kvm_start_guest kvm_start_guest: @@ -435,9 +432,6 @@ kvm_secondary_got_guest: * While waiting we also need to check if we get given a vcpu to run. */ kvm_no_guest: -BEGIN_FTR_SECTION - twi 31,0,0 -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r3, HSTATE_HWTHREAD_REQ(r13) cmpwi r3, 0 bne 53f @@ -2531,10 +2525,8 @@ kvm_do_nap: clrrdi r0, r0, 1 mtspr SPRN_CTRLT, r0 -BEGIN_FTR_SECTION li r0,1 stb r0,HSTATE_HWTHREAD_REQ(r13) -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mfspr r5,SPRN_LPCR ori r5,r5,LPCR_PECE0 | LPCR_PECE1 BEGIN_FTR_SECTION -- cgit From 2a3d6553cbd791da4fb624c2500b4580b3f9d72f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 12 Oct 2017 22:58:54 +1100 Subject: KVM: PPC: Tie KVM_CAP_PPC_HTM to the user-visible TM feature Currently we use CPU_FTR_TM to decide if the CPU/kernel can support TM (Transactional Memory), and if it's true we advertise that to Qemu (or similar) via KVM_CAP_PPC_HTM. PPC_FEATURE2_HTM is the user-visible feature bit, which indicates that the CPU and kernel can support TM. Currently CPU_FTR_TM and PPC_FEATURE2_HTM always have the same value, either true or false, so using the former for KVM_CAP_PPC_HTM is correct. However some Power9 CPUs can operate in a mode where TM is enabled but TM suspended state is disabled. In this mode CPU_FTR_TM is true, but PPC_FEATURE2_HTM is false. Instead a different PPC_FEATURE2 bit is set, to indicate that this different mode of TM is available. It is not safe to let guests use TM as-is, when the CPU is in this mode. So to prevent that from happening, use PPC_FEATURE2_HTM to determine the value of KVM_CAP_PPC_HTM. Signed-off-by: Michael Ellerman --- arch/powerpc/kvm/powerpc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3480faaf1ef8..a3746b98ec11 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -644,8 +644,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_PPC_HTM: - r = cpu_has_feature(CPU_FTR_TM_COMP) && - is_kvmppc_hv_enabled(kvm); + r = is_kvmppc_hv_enabled(kvm) && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM_COMP); break; default: r = 0; -- cgit From 07fd1761e1cd2b5ecdb470fc228d52ccdf75e05b Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Thu, 12 Oct 2017 21:17:16 +1100 Subject: powerpc/tm: Add commandline option to disable hardware transactional memory Currently the kernel relies on firmware to inform it whether or not the CPU supports HTM and as long as the kernel was built with CONFIG_PPC_TRANSACTIONAL_MEM=y then it will allow userspace to make use of the facility. There may be situations where it would be advantageous for the kernel to not allow userspace to use HTM, currently the only way to achieve this is to recompile the kernel with CONFIG_PPC_TRANSACTIONAL_MEM=n. This patch adds a simple commandline option so that HTM can be disabled at boot time. Signed-off-by: Cyril Bur [mpe: Simplify to a bool, move to prom.c, put doco in the right place. Always disable, regardless of initial state, to avoid user confusion.] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/prom.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f83056297441..d9bd6555f980 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -658,6 +658,35 @@ static void __init early_reserve_mem(void) #endif } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +static bool tm_disabled __initdata; + +static int __init parse_ppc_tm(char *str) +{ + bool res; + + if (kstrtobool(str, &res)) + return -EINVAL; + + tm_disabled = !res; + + return 0; +} +early_param("ppc_tm", parse_ppc_tm); + +static void __init tm_init(void) +{ + if (tm_disabled) { + pr_info("Disabling hardware transactional memory (HTM)\n"); + cur_cpu_spec->cpu_user_features2 &= + ~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM); + cur_cpu_spec->cpu_features &= ~CPU_FTR_TM; + } +} +#else +static void tm_init(void) { } +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -767,6 +796,8 @@ void __init early_init_devtree(void *params) powerpc_firmware_features |= FW_FEATURE_PS3_POSSIBLE; #endif + tm_init(); + DBG(" <- early_init_devtree()\n"); } -- cgit From cba6ac4869e45cc93ac5497024d1d49576e82666 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 12 Oct 2017 21:17:17 +1100 Subject: powerpc: Add PPC_FEATURE2_HTM_NO_SUSPEND Some CPUs can operate in a mode where TM (Transactional Memory) is enabled but the suspended state of TM is disabled. In this mode tsuspend does not enter suspended state, instead the transaction is aborted. Similarly any other event that would lead to suspended state instead aborts the transaction. There is also an ABI change, in that in this mode processes are not allowed to sigreturn with an MSR that would lead to suspended state, Linux will instead return an error to the sigreturn syscall. Signed-off-by: Michael Ellerman --- arch/powerpc/include/uapi/asm/cputable.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/uapi/asm/cputable.h b/arch/powerpc/include/uapi/asm/cputable.h index 4d877144f377..b3b64cba71ec 100644 --- a/arch/powerpc/include/uapi/asm/cputable.h +++ b/arch/powerpc/include/uapi/asm/cputable.h @@ -48,6 +48,7 @@ #define PPC_FEATURE2_HAS_IEEE128 0x00400000 /* VSX IEEE Binary Float 128-bit */ #define PPC_FEATURE2_DARN 0x00200000 /* darn random number insn */ #define PPC_FEATURE2_SCV 0x00100000 /* scv syscall */ +#define PPC_FEATURE2_HTM_NO_SUSPEND 0x00080000 /* TM w/out suspended state */ /* * IMPORTANT! -- cgit From 54820530c5faa9fd78e1c08cb6449100b1a19157 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 12 Oct 2017 21:17:18 +1100 Subject: powerpc/powernv: Enable TM without suspend if possible Some Power9 revisions can run in a mode where TM operates without suspended state. If we find ourself on a CPU that might be in this mode, we query OPAL to check, and if so we reenable TM in CPU features, and enable a new user feature to signal to userspace that we are in this mode. We do not enable the "normal" user feature, PPC_FEATURE2_HTM, but we do enable PPC_FEATURE2_HTM_NOSC because that indicates to userspace that the kernel will abort transactions on syscall entry, which is true regardless of the suspend mode. Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal-api.h | 2 ++ arch/powerpc/include/asm/powernv.h | 4 ++++ arch/powerpc/include/asm/tm.h | 2 ++ arch/powerpc/kernel/process.c | 7 +++++++ arch/powerpc/kernel/prom.c | 4 ++++ arch/powerpc/platforms/powernv/setup.c | 23 +++++++++++++++++++++++ 6 files changed, 42 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 9d191ebea706..233c7504b1f2 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -896,6 +896,8 @@ enum { */ OPAL_REINIT_CPUS_MMU_HASH = (1 << 2), OPAL_REINIT_CPUS_MMU_RADIX = (1 << 3), + + OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED = (1 << 4), }; typedef struct oppanel_line { diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h index f62797702300..dc5f6a5d4575 100644 --- a/arch/powerpc/include/asm/powernv.h +++ b/arch/powerpc/include/asm/powernv.h @@ -22,6 +22,8 @@ extern void pnv_npu2_destroy_context(struct npu_context *context, extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, unsigned long *flags, unsigned long *status, int count); + +void pnv_tm_init(void); #else static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, @@ -36,6 +38,8 @@ static inline int pnv_npu2_handle_fault(struct npu_context *context, unsigned long *status, int count) { return -ENODEV; } + +static inline void pnv_tm_init(void) { } #endif #endif /* _ASM_POWERNV_H */ diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h index 82e06ca3a49b..ad19fe41931b 100644 --- a/arch/powerpc/include/asm/tm.h +++ b/arch/powerpc/include/asm/tm.h @@ -19,4 +19,6 @@ extern void tm_abort(uint8_t cause); extern void tm_save_sprs(struct thread_struct *thread); extern void tm_restore_sprs(struct thread_struct *thread); +extern bool tm_suspend_disabled; + #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 166145b18728..b02807ea54dc 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -77,6 +77,13 @@ extern unsigned long _get_SP(void); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM +/* + * Are we running in "Suspend disabled" mode? If so we have to block any + * sigreturn that would get us into suspended state, and we also warn in some + * other paths that we should never reach with suspend disabled. + */ +bool tm_suspend_disabled __ro_after_init = false; + static void check_if_tm_restore_required(struct task_struct *tsk) { /* diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index d9bd6555f980..101822be525a 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -681,7 +682,10 @@ static void __init tm_init(void) cur_cpu_spec->cpu_user_features2 &= ~(PPC_FEATURE2_HTM_NOSC | PPC_FEATURE2_HTM); cur_cpu_spec->cpu_features &= ~CPU_FTR_TM; + return; } + + pnv_tm_init(); } #else static void tm_init(void) { } diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index cf52d53da460..d23f148a11f0 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "powernv.h" @@ -304,6 +305,28 @@ static int __init pnv_probe(void) return 1; } +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +void __init pnv_tm_init(void) +{ + if (!firmware_has_feature(FW_FEATURE_OPAL) || + !pvr_version_is(PVR_POWER9) || + early_cpu_has_feature(CPU_FTR_TM)) + return; + + if (opal_reinit_cpus(OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED) != OPAL_SUCCESS) + return; + + pr_info("Enabling TM (Transactional Memory) with Suspend Disabled\n"); + cur_cpu_spec->cpu_features |= CPU_FTR_TM; + /* Make sure "normal" HTM is off (it should be) */ + cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_HTM; + /* Turn on no suspend mode, and HTM no SC */ + cur_cpu_spec->cpu_user_features2 |= PPC_FEATURE2_HTM_NO_SUSPEND | \ + PPC_FEATURE2_HTM_NOSC; + tm_suspend_disabled = true; +} +#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ + /* * Returns the cpu frequency for 'cpu' in Hz. This is used by * /proc/cpuinfo -- cgit From 92fb8690bd04cb421d987d246deac60eef85d272 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 12 Oct 2017 21:17:19 +1100 Subject: powerpc/tm: P9 disable transactionally suspended sigcontexts Unfortunately userspace can construct a sigcontext which enables suspend. Thus userspace can force Linux into a path where trechkpt is executed. This patch blocks this from happening on POWER9 by sanity checking sigcontexts passed in. ptrace doesn't have this problem as only MSR SE and BE can be changed via ptrace. This patch also adds a number of WARN_ON()s in case we ever enter suspend when we shouldn't. This should not happen, but if it does the symptoms are soft lockup warnings which are not obviously TM related, so the WARN_ON()s should make it obvious what's happening. Signed-off-by: Michael Neuling Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 2 ++ arch/powerpc/kernel/signal_32.c | 4 ++++ arch/powerpc/kernel/signal_64.c | 5 +++++ 3 files changed, 11 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index b02807ea54dc..c051dc2b42ad 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -910,6 +910,8 @@ static inline void tm_reclaim_task(struct task_struct *tsk) if (!MSR_TM_ACTIVE(thr->regs->msr)) goto out_and_saveregs; + WARN_ON(tm_suspend_disabled); + TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, " "ccr=%lx, msr=%lx, trap=%lx)\n", tsk->pid, thr->regs->nip, diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 92fb1c8dbbd8..1dd5fa0f65fd 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -519,6 +519,8 @@ static int save_tm_user_regs(struct pt_regs *regs, { unsigned long msr = regs->msr; + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state. This also ensures @@ -769,6 +771,8 @@ static long restore_tm_user_regs(struct pt_regs *regs, int i; #endif + if (tm_suspend_disabled) + return 1; /* * restore general registers but not including MSR or SOFTE. Also * take care of keeping r2 (TLS) intact if not a signal. diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index c83c115858c1..4dfdd8e56836 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -214,6 +214,8 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc, BUG_ON(!MSR_TM_ACTIVE(regs->msr)); + WARN_ON(tm_suspend_disabled); + /* Remove TM bits from thread's MSR. The MSR in the sigcontext * just indicates to userland that we were doing a transaction, but we * don't want to return in transactional state. This also ensures @@ -430,6 +432,9 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, BUG_ON(tsk != current); + if (tm_suspend_disabled) + return -EINVAL; + /* copy the GPRs */ err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr)); err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs, -- cgit From 402e172a2ce76210f2fe921cf419d12103851344 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 17 Oct 2017 16:20:18 -0200 Subject: powerpc/xmon: Check before calling xive functions Currently xmon could call XIVE functions from OPAL even if the XIVE is disabled or does not exist in the system, as in POWER8 machines. This causes the following exception: 1:mon> dx cpu 0x1: Vector: 700 (Program Check) at [c000000423c93450] pc: c00000000009cfa4: opal_xive_dump+0x50/0x68 lr: c0000000000997b8: opal_return+0x0/0x50 This patch simply checks if XIVE is enabled before calling XIVE functions. Fixes: 243e25112d06 ("powerpc/xive: Native exploitation of the XIVE interrupt controller") Suggested-by: Guilherme G. Piccoli Signed-off-by: Breno Leitao Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 4679aeb84767..2e2320edb96b 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2508,6 +2508,11 @@ static void dump_xives(void) unsigned long num; int c; + if (!xive_enabled()) { + printf("Xive disabled on this system\n"); + return; + } + c = inchar(); if (c == 'a') { dump_all_xives(); -- cgit From 6773027205ea4ccf17055d7f0a0cd813f22fe127 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 16 Oct 2017 12:41:00 +0530 Subject: powerpc/mm/radix: Drop unneeded NULL check We call these functions with non-NULL mm or vma. Hence we can skip the NULL check in these functions. We also remove now unused function __local_flush_hugetlb_page(). Signed-off-by: Aneesh Kumar K.V [mpe: Drop the checks with is_vm_hugetlb_page() as noticed by Nick] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hugetlb.h | 6 ------ arch/powerpc/mm/tlb-radix.c | 22 ++++++++++------------ 2 files changed, 10 insertions(+), 18 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index b8a0fb442c64..795d825c2edd 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -40,12 +40,6 @@ static inline void flush_hugetlb_page(struct vm_area_struct *vma, return radix__flush_hugetlb_page(vma, vmaddr); } -static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma, - unsigned long vmaddr) -{ - if (radix_enabled()) - return radix__local_flush_hugetlb_page(vma, vmaddr); -} #else static inline pte_t *hugepd_page(hugepd_t hpd) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 5a1f46eff3a2..3a07d7a5e2fe 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -164,7 +164,7 @@ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmadd unsigned long ap = mmu_get_ap(psize); preempt_disable(); - pid = mm ? mm->context.id : 0; + pid = mm->context.id; if (pid != MMU_NO_CONTEXT) _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); preempt_enable(); @@ -174,11 +174,10 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd { #ifdef CONFIG_HUGETLB_PAGE /* need the return fix for nohash.c */ - if (vma && is_vm_hugetlb_page(vma)) - return __local_flush_hugetlb_page(vma, vmaddr); + if (is_vm_hugetlb_page(vma)) + return radix__local_flush_hugetlb_page(vma, vmaddr); #endif - radix__local_flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr, - mmu_virtual_psize); + radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); } EXPORT_SYMBOL(radix__local_flush_tlb_page); @@ -232,7 +231,7 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, unsigned long ap = mmu_get_ap(psize); preempt_disable(); - pid = mm ? mm->context.id : 0; + pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto bail; if (!mm_is_thread_local(mm)) @@ -246,11 +245,10 @@ bail: void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) { #ifdef CONFIG_HUGETLB_PAGE - if (vma && is_vm_hugetlb_page(vma)) - return flush_hugetlb_page(vma, vmaddr); + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_page(vma, vmaddr); #endif - radix__flush_tlb_page_psize(vma ? vma->vm_mm : NULL, vmaddr, - mmu_virtual_psize); + radix__flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize); } EXPORT_SYMBOL(radix__flush_tlb_page); @@ -330,7 +328,7 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, preempt_disable(); - pid = mm ? mm->context.id : 0; + pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto err_out; @@ -361,7 +359,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) unsigned long pid, end; - pid = mm ? mm->context.id : 0; + pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto no_context; -- cgit From deed2ec5a786f037b00136792e08404c84ec557a Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Thu, 12 Oct 2017 13:16:59 +1100 Subject: powerpc/configs: Enable I2C_CHARDEV for pseries and powernv i2c-dev provides an interface for userspace programs to interact with I2C devices, and is very helpful for I2C-related debugging. Enable it in pseries_defconfig and powernv_defconfig. It's already enabled in many other powerpc defconfigs. Signed-off-by: Andrew Donnellan Signed-off-by: Michael Ellerman --- arch/powerpc/configs/powernv_defconfig | 1 + arch/powerpc/configs/pseries_defconfig | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 3fb2a7ab54f1..4891bbed6258 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -192,6 +192,7 @@ CONFIG_IPMI_DEVICE_INTERFACE=y CONFIG_IPMI_POWERNV=y CONFIG_RAW_DRIVER=y CONFIG_MAX_RAW_DEVS=1024 +CONFIG_I2C_CHARDEV=y CONFIG_DRM=y CONFIG_DRM_AST=y CONFIG_FIRMWARE_EDID=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 3d935969e5a2..bde2cd1005a2 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -193,6 +193,7 @@ CONFIG_VIRTIO_CONSOLE=m CONFIG_IBM_BSR=m CONFIG_RAW_DRIVER=y CONFIG_MAX_RAW_DEVS=1024 +CONFIG_I2C_CHARDEV=y CONFIG_FB=y CONFIG_FIRMWARE_EDID=y CONFIG_FB_OF=y -- cgit From 90d6473747b289739b435d5fa6ae8e38ae2921d1 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 9 Oct 2017 21:59:32 +1100 Subject: powerpc/xmon: Add kstack base to paca dump When dumping the paca in xmon we currently show kstack. Although it's not hard it's a bit fiddly to work out what the bounds of the kernel stack should be based on the kstack value. To make life easier and "kstack_base" which is the base (lowest address) of the kernel stack, eg: kstack = 0xc0000000f1a7be30 (0x258) kstack_base = 0xc0000000f1a78000 Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 2e2320edb96b..83e7faf5b3d3 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2384,6 +2384,7 @@ static void dump_one_paca(int cpu) #endif DUMP(p, __current, "p"); DUMP(p, kstack, "lx"); + printf(" kstack_base = 0x%016lx\n", p->kstack & ~(THREAD_SIZE - 1)); DUMP(p, stab_rr, "lx"); DUMP(p, saved_r1, "lx"); DUMP(p, trap_save, "x"); -- cgit From 5c2511bff47ba7fb09268ee8ebbf5237da5c0b84 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 9 Oct 2017 21:54:04 +1100 Subject: powerpc/64s: Move the two FAST_ENDIAN macros next to each other So we can #ifdef them in the next patch. Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ce04b1c2ac8e..acddfe79395d 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -865,12 +865,6 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -#define SYSCALL_FASTENDIAN_TEST \ -BEGIN_FTR_SECTION \ - cmpdi r0,0x1ebe ; \ - beq- 1f ; \ -END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - /* * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, * and HMT_MEDIUM. @@ -885,6 +879,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . ; /* prevent speculative execution */ +#define SYSCALL_FASTENDIAN_TEST \ +BEGIN_FTR_SECTION \ + cmpdi r0,0x1ebe ; \ + beq- 1f ; \ +END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ + #define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ -- cgit From 727f13616c45caa72e4325c5027c1a8f41e745a9 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 9 Oct 2017 21:54:05 +1100 Subject: powerpc: Disable the fast-endian switch syscall by default Back in 2008 we added support for "fast little-endian switch" in the syscall path. This added a special case syscall number 0x1ebe, which is caught very early in the system call exception and switches endian with as little overhead as possible. See commit 745a14cc264b ("[POWERPC] Add fast little-endian switch system call") for full details. Although it is fast, it's also completely non standard. The "syscall number" is out of the range of normal syscalls, it can't be traced or audited, and it's a bit of a wart. To the best of our knowledge it was only used by one program, now long since discontinued. So in an effort to shake out any current users, put it behind a config option, and make it default n. If anyone *is* using it they can quickly reinstate it with a rebuild, and we can flip it to default y. Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig.debug | 6 ++++++ arch/powerpc/kernel/exceptions-64s.S | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index c86df246339e..b8a1eb42a38d 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -369,4 +369,10 @@ config PPC_HTDUMP def_bool y depends on PPC_PTDUMP && PPC_BOOK3S +config PPC_FAST_ENDIAN_SWITCH + bool "Deprecated fast endian-switch syscall" + depends on DEBUG_KERNEL && PPC_BOOK3S_64 + help + If you're unsure what this is, say N. + endmenu diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index acddfe79395d..d001aa4c69a1 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -879,6 +879,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) rfid ; \ b . ; /* prevent speculative execution */ +#ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH #define SYSCALL_FASTENDIAN_TEST \ BEGIN_FTR_SECTION \ cmpdi r0,0x1ebe ; \ @@ -893,6 +894,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . ; /* prevent speculative execution */ +#else +#define SYSCALL_FASTENDIAN_TEST +#define SYSCALL_FASTENDIAN +#endif /* CONFIG_PPC_FAST_ENDIAN_SWITCH */ #if defined(CONFIG_RELOCATABLE) /* -- cgit From 05c14c03138532a3cb2aa29c2960445c8753343b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 9 Oct 2017 21:52:44 +1100 Subject: powerpc/perf/hv-24x7: Fix incorrect comparison in memord In the hv-24x7 code there is a function memord() which tries to implement a sort function return -1, 0, 1. However one of the conditions is incorrect, such that it can never be true, because we will have already returned. I don't believe there is a bug in practice though, because the comparisons are an optimisation prior to calling memcmp(). Fix it by swapping the second comparision, so it can be true. Reported-by: David Binderman Signed-off-by: Michael Ellerman --- arch/powerpc/perf/hv-24x7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 9c88b82f6229..72238eedc360 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -540,7 +540,7 @@ static int memord(const void *d1, size_t s1, const void *d2, size_t s2) { if (s1 < s2) return 1; - if (s2 > s1) + if (s1 > s2) return -1; return memcmp(d1, d2, s1); -- cgit From 3d2d4339cc326c427638daa67e264dd455ee1899 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 2 Jun 2017 14:38:47 +0200 Subject: powerpc: dts: acadia: DT fix s/#interrupts-parent/#interrupt-parent/ Signed-off-by: Geert Uytterhoeven Acked-by: Rob Herring Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/acadia.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/boot/dts/acadia.dts b/arch/powerpc/boot/dts/acadia.dts index 57291f61ffe7..86266159521e 100644 --- a/arch/powerpc/boot/dts/acadia.dts +++ b/arch/powerpc/boot/dts/acadia.dts @@ -183,7 +183,7 @@ usb@ef603000 { compatible = "ohci-be"; reg = <0xef603000 0x80>; - interrupts-parent = <&UIC0>; + interrupt-parent = <&UIC0>; interrupts = <0xd 0x4 0xe 0x4>; }; -- cgit From a7cd4586210c69c19aef26f4f83d3e492040de1c Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 5 Sep 2017 18:47:02 +0200 Subject: powerpc/axonram: Drop unnecessary variable initialisation The local variable "rc" will eventually be set only to an error code. Thus omit the explicit initialisation at the beginning. Signed-off-by: Markus Elfring Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/axonram.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index c60e84e4558d..1b307c80b401 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -184,7 +184,7 @@ static int axon_ram_probe(struct platform_device *device) static int axon_ram_bank_id = -1; struct axon_ram_bank *bank; struct resource resource; - int rc = 0; + int rc; axon_ram_bank_id++; -- cgit From c28237f1d4ed2c9022f9eed656ecf36999e34f47 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Tue, 17 Oct 2017 13:31:42 +0200 Subject: powerpc-opal: Fix a typo in a comment line of two file headers Fix a word in these descriptions. Signed-off-by: Markus Elfring Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-hmi.c | 2 +- arch/powerpc/platforms/powernv/opal-memory-errors.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index d78fed728cdf..c9e1a4ff295c 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -1,5 +1,5 @@ /* - * OPAL hypervisor Maintenance interrupt handling support in PowreNV. + * OPAL hypervisor Maintenance interrupt handling support in PowerNV. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index 4495f428b500..d9916ea62305 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -1,5 +1,5 @@ /* - * OPAL asynchronus Memory error handling support in PowreNV. + * OPAL asynchronus Memory error handling support in PowerNV. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by -- cgit From 4dd9eab39c71628d113168a01473ee17b5f61eac Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Wed, 18 Oct 2017 20:48:52 +0200 Subject: powerpc/pseries: Cleanup error handling in iommu_pseries_alloc_group() Although kfree(NULL) is legal, it's a bit lazy to rely on that to implement the error handling. So do it the normal Linux way using labels for each failure path. Signed-off-by: Markus Elfring [mpe: Squash a few patches and rewrite change log] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/iommu.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 7c181467d0ad..69921f72e2da 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -55,23 +55,23 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) { - struct iommu_table_group *table_group = NULL; - struct iommu_table *tbl = NULL; - struct iommu_table_group_link *tgl = NULL; + struct iommu_table_group *table_group; + struct iommu_table *tbl; + struct iommu_table_group_link *tgl; table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, node); if (!table_group) - goto fail_exit; + return NULL; tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); if (!tbl) - goto fail_exit; + goto free_group; tgl = kzalloc_node(sizeof(struct iommu_table_group_link), GFP_KERNEL, node); if (!tgl) - goto fail_exit; + goto free_table; INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); @@ -82,11 +82,10 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node) return table_group; -fail_exit: - kfree(tgl); - kfree(table_group); +free_table: kfree(tbl); - +free_group: + kfree(table_group); return NULL; } -- cgit From 6de6638b35daa0dfb7daefd78d55015655b4d8f3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 5 Nov 2017 23:33:55 +1100 Subject: KVM: PPC: Book3S HV: Handle host system reset in guest mode If the host takes a system reset interrupt while a guest is running, the CPU must exit the guest before processing the host exception handler. After this patch, taking a sysrq+x with a CPU running in a guest gives a trace like this: cpu 0x27: Vector: 100 (System Reset) at [c000000fdf5776f0] pc: c008000010158b80: kvmppc_run_core+0x16b8/0x1ad0 [kvm_hv] lr: c008000010158b80: kvmppc_run_core+0x16b8/0x1ad0 [kvm_hv] sp: c000000fdf577850 msr: 9000000002803033 current = 0xc000000fdf4b1e00 paca = 0xc00000000fd4d680 softe: 3 irq_happened: 0x01 pid = 6608, comm = qemu-system-ppc Linux version 4.14.0-rc7-01489-g47e1893a404a-dirty #26 SMP [c000000fdf577a00] c008000010159dd4 kvmppc_vcpu_run_hv+0x3dc/0x12d0 [kvm_hv] [c000000fdf577b30] c0080000100a537c kvmppc_vcpu_run+0x44/0x60 [kvm] [c000000fdf577b60] c0080000100a1ae0 kvm_arch_vcpu_ioctl_run+0x118/0x310 [kvm] [c000000fdf577c00] c008000010093e98 kvm_vcpu_ioctl+0x530/0x7c0 [kvm] [c000000fdf577d50] c000000000357bf8 do_vfs_ioctl+0xd8/0x8c0 [c000000fdf577df0] c000000000358448 SyS_ioctl+0x68/0x100 [c000000fdf577e30] c00000000000b220 system_call+0x58/0x6c --- Exception: c01 (System Call) at 00007fff76868df0 SP (7fff7069baf0) is in userspace Fixes: e36d0a2ed5 ("powerpc/powernv: Implement NMI IPI with OPAL_SIGNAL_SYSTEM_RESET") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hw_irq.h | 1 + arch/powerpc/kernel/exceptions-64s.S | 2 ++ arch/powerpc/kernel/irq.c | 3 ++- arch/powerpc/kvm/book3s_hv.c | 7 ++++++- 4 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index c1dd1929342d..d2ef36111518 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -31,6 +31,7 @@ #ifndef __ASSEMBLY__ +extern void replay_system_reset(void); extern void __replay_interrupt(unsigned int vector); extern void timer_interrupt(struct pt_regs *); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d001aa4c69a1..64c2bc35e4ff 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -113,6 +113,7 @@ EXC_VIRT_NONE(0x4000, 0x100) cmpwi cr3,r10,2 ; \ BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ + KVMTEST_PR(n) ; \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else #define IDLETEST NOTEST @@ -129,6 +130,7 @@ EXC_REAL_BEGIN(system_reset, 0x100, 0x100) EXC_REAL_END(system_reset, 0x100, 0x100) EXC_VIRT_NONE(0x4100, 0x100) +TRAMP_KVM(PACA_EXNMI, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 4813b83b22aa..6a6995173f51 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -415,7 +415,7 @@ static const u8 srr1_to_lazyirq[0x10] = { PACA_IRQ_HMI, 0, 0, 0, 0, 0 }; -static noinline void replay_system_reset(void) +void replay_system_reset(void) { struct pt_regs regs; @@ -425,6 +425,7 @@ static noinline void replay_system_reset(void) system_reset_exception(®s); get_paca()->in_nmi = 0; } +EXPORT_SYMBOL_GPL(replay_system_reset); void irq_set_pending_from_srr1(unsigned long srr1) { diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8f34715cfbff..31a362669fea 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -47,6 +47,7 @@ #include #include +#include #include #include #include @@ -1089,9 +1090,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; - /* HMI is hypervisor interrupt and host has handled it. Resume guest.*/ + /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/ case BOOK3S_INTERRUPT_HMI: case BOOK3S_INTERRUPT_PERFMON: + case BOOK3S_INTERRUPT_SYSTEM_RESET: r = RESUME_GUEST; break; case BOOK3S_INTERRUPT_MACHINE_CHECK: @@ -2604,6 +2606,9 @@ static void set_irq_happened(int trap) case BOOK3S_INTERRUPT_HMI: local_paca->irq_happened |= PACA_IRQ_HMI; break; + case BOOK3S_INTERRUPT_SYSTEM_RESET: + replay_system_reset(); + break; } } -- cgit From ff967900c9d4740f6337c6456f111104267633cb Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 21 Oct 2017 17:56:06 +1000 Subject: powerpc/64: Fix latency tracing for lazy irq replay When returning from an exception to a soft-enabled context, pending IRQs are replayed but IRQ tracing is not reset, so a number of them can get chained together into the same IRQ-disabled trace. Fix this by having __check_irq_replay re-set IRQ trace. This is conceptually where we respond to the next interrupt, so it fits the semantics of the IRQ tracer. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 6a6995173f51..b7a84522e652 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -143,6 +143,13 @@ notrace unsigned int __check_irq_replay(void) */ unsigned char happened = local_paca->irq_happened; + /* + * We are responding to the next interrupt, so interrupt-off + * latencies should be reset here. + */ + trace_hardirqs_on(); + trace_hardirqs_off(); + if (happened & PACA_IRQ_HARD_DIS) { /* Clear bit 0 which we wouldn't clear otherwise */ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; @@ -270,6 +277,7 @@ notrace void arch_local_irq_restore(unsigned long en) #endif /* CONFIG_TRACE_IRQFLAGS */ set_soft_enabled(0); + trace_hardirqs_off(); /* * Check if anything needs to be re-emitted. We haven't @@ -279,6 +287,7 @@ notrace void arch_local_irq_restore(unsigned long en) replay = __check_irq_replay(); /* We can soft-enable now */ + trace_hardirqs_on(); set_soft_enabled(1); /* -- cgit From f848ea7f5960ec2684c3bd1c0692e6f5ad47a27d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Oct 2017 17:08:13 +1000 Subject: powerpc/book3s: Use label for FIXUP_ENDIAN macro branch Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc_asm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 36f3e41c9fbe..d6b56aebe602 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -776,7 +776,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) #else #define FIXUP_ENDIAN \ tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \ - b $+44; /* Skip trampoline if endian is good */ \ + b 191f; /* Skip trampoline if endian is good */ \ .long 0xa600607d; /* mfmsr r11 */ \ .long 0x01006b69; /* xori r11,r11,1 */ \ .long 0x00004039; /* li r10,0 */ \ @@ -786,7 +786,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) .long 0x14004a39; /* addi r10,r10,20 */ \ .long 0xa6035a7d; /* mtsrr0 r10 */ \ .long 0xa6037b7d; /* mtsrr1 r11 */ \ - .long 0x2400004c /* rfid */ + .long 0x2400004c; /* rfid */ \ +191: #endif /* !CONFIG_PPC_BOOK3E */ -- cgit From 8ca9c08d0c8f6d68db9b592dc149cb8284173ca8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Oct 2017 17:08:14 +1000 Subject: powerpc/book3s: Add an HV variant of FIXUP_ENDIAN that is recoverable Add an HV variant of FIXUP_ENDIAN which uses HSRR[01] and does not clear MSR[RI], which improves recoverability. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc_asm.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index d6b56aebe602..ae94b3626b6c 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -774,6 +774,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) #ifdef CONFIG_PPC_BOOK3E #define FIXUP_ENDIAN #else +/* + * This version may be used in in HV or non-HV context. + * MSR[EE] must be disabled. + */ #define FIXUP_ENDIAN \ tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \ b 191f; /* Skip trampoline if endian is good */ \ @@ -789,6 +793,24 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601) .long 0x2400004c; /* rfid */ \ 191: +/* + * This version that may only be used with MSR[HV]=1 + * - Does not clear MSR[RI], so more robust. + * - Slightly smaller and faster. + */ +#define FIXUP_ENDIAN_HV \ + tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \ + b 191f; /* Skip trampoline if endian is good */ \ + .long 0xa600607d; /* mfmsr r11 */ \ + .long 0x01006b69; /* xori r11,r11,1 */ \ + .long 0x05009f42; /* bcl 20,31,$+4 */ \ + .long 0xa602487d; /* mflr r10 */ \ + .long 0x14004a39; /* addi r10,r10,20 */ \ + .long 0xa64b5a7d; /* mthsrr0 r10 */ \ + .long 0xa64b7b7d; /* mthsrr1 r11 */ \ + .long 0x2402004c; /* hrfid */ \ +191: + #endif /* !CONFIG_PPC_BOOK3E */ #endif /* __ASSEMBLY__ */ -- cgit From 63c9d8a4b394f9d8e995292a7c74648760235b44 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Oct 2017 17:08:15 +1000 Subject: powerpc/powernv: Use FIXUP_ENDIAN_HV in OPAL return Close the recoverability gap for OPAL calls by using FIXUP_ENDIAN_HV in the return path. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-wrappers.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 37cd170201a2..6f4b00a2ac46 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -94,7 +94,7 @@ opal_return: * bytes (always BE) since MSR:LE will end up fixed up as a side * effect of the rfid. */ - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r4,8(r1); ld r5,PPC_LR_STKOFF(r1); @@ -120,7 +120,7 @@ opal_real_call: hrfid opal_return_realmode: - FIXUP_ENDIAN + FIXUP_ENDIAN_HV ld r2,PACATOC(r13); lwz r11,8(r1); ld r12,PPC_LR_STKOFF(r1) -- cgit From dffe8449c5dd63ff18b47709de75553586582cd8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Oct 2017 23:06:53 +1000 Subject: powerpc/64s/radix: Improve preempt handling in TLB code Preempt should be consistently disabled for mm_is_thread_local tests, so bring the rest of these under preempt_disable(). Preempt does not need to be disabled for the mm->context.id tests, which allows simplification and removal of gotos. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 48 ++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 3a07d7a5e2fe..67191fe63283 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -186,16 +186,15 @@ void radix__flush_tlb_mm(struct mm_struct *mm) { unsigned long pid; - preempt_disable(); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; + return; + preempt_disable(); if (!mm_is_thread_local(mm)) _tlbie_pid(pid, RIC_FLUSH_TLB); else _tlbiel_pid(pid, RIC_FLUSH_TLB); -no_context: preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -204,16 +203,15 @@ void radix__flush_all_mm(struct mm_struct *mm) { unsigned long pid; - preempt_disable(); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; + return; + preempt_disable(); if (!mm_is_thread_local(mm)) _tlbie_pid(pid, RIC_FLUSH_ALL); else _tlbiel_pid(pid, RIC_FLUSH_ALL); -no_context: preempt_enable(); } EXPORT_SYMBOL(radix__flush_all_mm); @@ -230,15 +228,15 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, unsigned long pid; unsigned long ap = mmu_get_ap(psize); - preempt_disable(); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto bail; + return; + + preempt_disable(); if (!mm_is_thread_local(mm)) _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB); else _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); -bail: preempt_enable(); } @@ -322,46 +320,44 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, { unsigned long pid; unsigned long addr; - int local = mm_is_thread_local(mm); + bool local; unsigned long ap = mmu_get_ap(psize); unsigned long page_size = 1UL << mmu_psize_defs[psize].shift; - - preempt_disable(); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto err_out; + return; + preempt_disable(); + local = mm_is_thread_local(mm); if (end == TLB_FLUSH_ALL || (end - start) > tlb_single_page_flush_ceiling * page_size) { if (local) _tlbiel_pid(pid, RIC_FLUSH_TLB); else _tlbie_pid(pid, RIC_FLUSH_TLB); - goto err_out; - } - for (addr = start; addr < end; addr += page_size) { + } else { + for (addr = start; addr < end; addr += page_size) { - if (local) - _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); - else - _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + if (local) + _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + else + _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + } } -err_out: preempt_enable(); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) { - int local = mm_is_thread_local(mm); unsigned long ap = mmu_get_ap(mmu_virtual_psize); unsigned long pid, end; - + bool local; pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) - goto no_context; + return; /* 4k page size, just blow the world */ if (PAGE_SIZE == 0x1000) { @@ -369,6 +365,8 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) return; } + preempt_disable(); + local = mm_is_thread_local(mm); /* Otherwise first do the PWC */ if (local) _tlbiel_pid(pid, RIC_FLUSH_PWC); @@ -383,7 +381,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) else _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); } -no_context: + preempt_enable(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit From 30b49ec798f0984b905fd94d1957d62530f08578 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Oct 2017 23:06:54 +1000 Subject: powerpc/64s/radix: Fix process table entry cache invalidation According to the architecture, the process table entry cache must be flushed with tlbie RIC=2. Currently the process table entry is set to invalid right before the PID is returned to the allocator, with no invalidation. This works on existing implementations that are known to not cache the process table entry for any except the current PIDR. It is architecturally correct and cleaner to invalidate with RIC=2 after clearing the process table entry and before the PID is returned to the allocator. This can be done in arch_exit_mmap that runs before the final flush, and to ensure the final flush (fullmm) is always a RIC=2 variant. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mmu_context.h | 4 ++++ arch/powerpc/mm/mmu_context_book3s64.c | 25 ++++++++++++++++++++----- arch/powerpc/mm/tlb-radix.c | 6 +++++- 3 files changed, 29 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index a0d7145d6cd2..20eae6f76247 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -164,9 +164,13 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm, { } +#ifndef CONFIG_PPC_BOOK3S_64 static inline void arch_exit_mmap(struct mm_struct *mm) { } +#else +extern void arch_exit_mmap(struct mm_struct *mm); +#endif static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 05e15386d4cb..6d724dab27c2 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -216,19 +216,34 @@ void destroy_context(struct mm_struct *mm) #ifdef CONFIG_SPAPR_TCE_IOMMU WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list)); #endif + if (radix_enabled()) + WARN_ON(process_tb[mm->context.id].prtb0 != 0); + else + subpage_prot_free(mm); + destroy_pagetable_page(mm); + __destroy_context(mm->context.id); + mm->context.id = MMU_NO_CONTEXT; +} + +void arch_exit_mmap(struct mm_struct *mm) +{ if (radix_enabled()) { /* * Radix doesn't have a valid bit in the process table * entries. However we know that at least P9 implementation * will avoid caching an entry with an invalid RTS field, * and 0 is invalid. So this will do. + * + * This runs before the "fullmm" tlb flush in exit_mmap, + * which does a RIC=2 tlbie to clear the process table + * entry. See the "fullmm" comments in tlb-radix.c. + * + * No barrier required here after the store because + * this process will do the invalidate, which starts with + * ptesync. */ process_tb[mm->context.id].prtb0 = 0; - } else - subpage_prot_free(mm); - destroy_pagetable_page(mm); - __destroy_context(mm->context.id); - mm->context.id = MMU_NO_CONTEXT; + } } #ifdef CONFIG_PPC_RADIX_MMU diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 67191fe63283..63e277b6e60c 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -298,10 +298,14 @@ void radix__tlb_flush(struct mmu_gather *tlb) psize = radix_get_mmu_psize(page_size); /* * if page size is not something we understand, do a full mm flush + * + * A "fullmm" flush must always do a flush_all_mm (RIC=2) flush + * that flushes the process table entry cache upon process teardown. + * See the comment for radix in arch_exit_mmap(). */ if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); - else if (tlb->need_flush_all) { + else if (tlb->fullmm || tlb->need_flush_all) { tlb->need_flush_all = 0; radix__flush_all_mm(mm); } else -- cgit From b8f89fea599d91e674497aad572613eb63181f31 Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Thu, 28 Sep 2017 20:19:20 -0400 Subject: powerpc/pseries/vio: Dispose of virq mapping on vdevice unregister When a vdevice is DLPAR removed from the system the vio subsystem doesn't bother unmapping the virq from the irq_domain. As a result we have a virq mapped to a hardware irq that is no longer valid for the irq_domain. A side effect is that we are left with /proc/irq/ affinity entries, and attempts to modify the smp_affinity of the irq will fail. In the following observed example the kernel log is spammed by ics_rtas_set_affinity errors after the removal of a VSCSI adapter. This is a result of irqbalance trying to adjust the affinity every 10 seconds. rpadlpar_io: slot U8408.E8E.10A7ACV-V5-C25 removed ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3 ics_rtas_set_affinity: ibm,set-xive irq=655385 returns -3 This patch fixes the issue by calling irq_dispose_mapping() on the virq of the viodev on unregister. Fixes: f2ab6219969f ("powerpc/pseries: Add PFO support to the VIO bus") Signed-off-by: Tyrel Datwyler Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/vio.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 12277bc9fd9e..d86938260a86 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -1592,6 +1592,8 @@ ATTRIBUTE_GROUPS(vio_dev); void vio_unregister_device(struct vio_dev *viodev) { device_unregister(&viodev->dev); + if (viodev->family == VDEVICE) + irq_dispose_mapping(viodev->irq); } EXPORT_SYMBOL(vio_unregister_device); -- cgit From d6f934fd48803d9e58040e2cbab2feafe9bb9f01 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 27 Sep 2017 16:52:31 +1000 Subject: powerpc/powernv: Reserve a hole which appears after enabling IOV In order to make generic IOV code work, the physical function IOV BAR should start from offset of the first VF. Since M64 segments share PE number space across PHB, and some PEs may be in use at the time when IOV is enabled, the existing code shifts the IOV BAR to the index of the first PE/VF. This creates a hole in IOMEM space which can be potentially taken by some other device. This reserves a temporary hole on a parent and releases it when IOV is disabled; the temporary resources are stored in pci_dn to avoid kmalloc/free. Signed-off-by: Alexey Kardashevskiy Acked-by: Bjorn Helgaas Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/pci-bridge.h | 1 + arch/powerpc/platforms/powernv/pci-ioda.c | 24 +++++++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 0b8aa1fe2d5f..62ed83db04ae 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -218,6 +218,7 @@ struct pci_dn { #endif struct list_head child_list; struct list_head list; + struct resource holes[PCI_SRIOV_NUM_BARS]; }; /* Get the pointer to a device_node's pci_dn */ diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fb5cd7511189..7e87867984e7 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1002,9 +1002,12 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) } /* - * After doing so, there would be a "hole" in the /proc/iomem when - * offset is a positive value. It looks like the device return some - * mmio back to the system, which actually no one could use it. + * Since M64 BAR shares segments among all possible 256 PEs, + * we have to shift the beginning of PF IOV BAR to make it start from + * the segment which belongs to the PE number assigned to the first VF. + * This creates a "hole" in the /proc/iomem which could be used for + * allocating other resources so we reserve this area below and + * release when IOV is released. */ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { res = &dev->resource[i + PCI_IOV_RESOURCES]; @@ -1018,7 +1021,22 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", i, &res2, res, (offset > 0) ? "En" : "Dis", num_vfs, offset); + + if (offset < 0) { + devm_release_resource(&dev->dev, &pdn->holes[i]); + memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); + } + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + + if (offset > 0) { + pdn->holes[i].start = res2.start; + pdn->holes[i].end = res2.start + size * offset - 1; + pdn->holes[i].flags = IORESOURCE_BUS; + pdn->holes[i].name = "pnv_iov_reserved"; + devm_request_resource(&dev->dev, res->parent, + &pdn->holes[i]); + } } return 0; } -- cgit From 6b148a7ce72a7f87c81cbcde48af014abc0516a9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 18 Oct 2017 11:16:47 +0200 Subject: powerpc/ipic: Fix status get and status clear IPIC Status is provided by register IPIC_SERSR and not by IPIC_SERMR which is the mask register. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/ipic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 16f1edd78c40..535cf1f6941c 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -846,12 +846,12 @@ void ipic_disable_mcp(enum ipic_mcp_irq mcp_irq) u32 ipic_get_mcp_status(void) { - return ipic_read(primary_ipic->regs, IPIC_SERMR); + return ipic_read(primary_ipic->regs, IPIC_SERSR); } void ipic_clear_mcp_status(u32 mask) { - ipic_write(primary_ipic->regs, IPIC_SERMR, mask); + ipic_write(primary_ipic->regs, IPIC_SERSR, mask); } /* Return an interrupt vector or 0 if no interrupt is pending. */ -- cgit From 7f142661d41e5243c99c6ad2ff72aa8647f5eb87 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 16 Oct 2017 12:31:40 +0530 Subject: powerpc/mm/hash: Add pr_fmt() to hash_utils64.c Make the printks look a bit nicer by adding a prefix. Radix config now do radix-mmu: Page sizes from device-tree: radix-mmu: Page size shift = 12 AP=0x0 radix-mmu: Page size shift = 16 AP=0x5 radix-mmu: Page size shift = 21 AP=0x1 radix-mmu: Page size shift = 30 AP=0x2 This patch update hash config to do similar dmesg output. With the patch we have hash-mmu: Page sizes from device-tree: hash-mmu: base_shift=12: shift=12, sllp=0x0000, avpnm=0x00000000, tlbiel=1, penc=0 hash-mmu: base_shift=12: shift=16, sllp=0x0000, avpnm=0x00000000, tlbiel=1, penc=7 hash-mmu: base_shift=12: shift=24, sllp=0x0000, avpnm=0x00000000, tlbiel=1, penc=56 hash-mmu: base_shift=16: shift=16, sllp=0x0110, avpnm=0x00000000, tlbiel=1, penc=1 hash-mmu: base_shift=16: shift=24, sllp=0x0110, avpnm=0x00000000, tlbiel=1, penc=8 hash-mmu: base_shift=20: shift=20, sllp=0x0111, avpnm=0x00000000, tlbiel=0, penc=2 hash-mmu: base_shift=24: shift=24, sllp=0x0100, avpnm=0x00000001, tlbiel=0, penc=0 hash-mmu: base_shift=34: shift=34, sllp=0x0120, avpnm=0x000007ff, tlbiel=0, penc=3 Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hash_utils_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 67ec2e927253..655a5a9a183d 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -21,6 +21,7 @@ #undef DEBUG #undef DEBUG_LOW +#define pr_fmt(fmt) "hash-mmu: " fmt #include #include #include -- cgit From c1807e3f84668c4c1e838386fdc3f6e63df1156c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 19 Oct 2017 15:08:19 +1100 Subject: powerpc/64: Free up CPU_FTR_ICSWX The last user of CPU_FTR_ICSWX was removed in commit 6ff4d3e96652 ("powerpc: Remove old unused icswx based coprocessor support"), so free the bit up for future use. Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cputable.h | 6 +++--- arch/powerpc/kernel/dt_cpu_ftrs.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index a9bf921f4efc..e188dd57bf51 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -206,7 +206,7 @@ enum { #define CPU_FTR_STCX_CHECKS_ADDRESS LONG_ASM_CONST(0x0004000000000000) #define CPU_FTR_POPCNTB LONG_ASM_CONST(0x0008000000000000) #define CPU_FTR_POPCNTD LONG_ASM_CONST(0x0010000000000000) -#define CPU_FTR_ICSWX LONG_ASM_CONST(0x0020000000000000) +/* Free LONG_ASM_CONST(0x0020000000000000) */ #define CPU_FTR_VMX_COPY LONG_ASM_CONST(0x0040000000000000) #define CPU_FTR_TM LONG_ASM_CONST(0x0080000000000000) #define CPU_FTR_CFAR LONG_ASM_CONST(0x0100000000000000) @@ -451,7 +451,7 @@ enum { CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | \ + CPU_FTR_CFAR | CPU_FTR_HVMODE | \ CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR | CPU_FTR_DABRX) #define CPU_FTRS_POWER8 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\ @@ -460,7 +460,7 @@ enum { CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ + CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \ CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \ CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP) #define CPU_FTRS_POWER8E (CPU_FTRS_POWER8 | CPU_FTR_PMAO_BUG) diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 1df770e8cbe0..c7bfcacc2cb6 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -634,7 +634,7 @@ static struct dt_cpu_feature_match __initdata {"no-execute", feat_enable, 0}, {"strong-access-ordering", feat_enable, CPU_FTR_SAO}, {"cache-inhibited-large-page", feat_enable_large_ci, 0}, - {"coprocessor-icswx", feat_enable, CPU_FTR_ICSWX}, + {"coprocessor-icswx", feat_enable, 0}, {"hypervisor-virtualization-interrupt", feat_enable_hvi, 0}, {"program-priority-register", feat_enable, CPU_FTR_HAS_PPR}, {"wait", feat_enable, 0}, -- cgit From 4e003747043d57aa75c9762fa148ef38afe68dd8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 19 Oct 2017 15:08:43 +1100 Subject: powerpc/64s: Replace CONFIG_PPC_STD_MMU_64 with CONFIG_PPC_BOOK3S_64 CONFIG_PPC_STD_MMU_64 indicates support for the "standard" powerpc MMU on 64-bit CPUs. The "standard" MMU refers to the hash page table MMU found in "server" processors, from IBM mainly. Currently CONFIG_PPC_STD_MMU_64 is == CONFIG_PPC_BOOK3S_64. While it's annoying to have two symbols that always have the same value, it's not quite annoying enough to bother removing one. However with the arrival of Power9, we now have the situation where CONFIG_PPC_STD_MMU_64 is enabled, but the kernel is running using the Radix MMU - *not* the "standard" MMU. So it is now actively confusing to use it, because it implies that code is disabled or inactive when the Radix MMU is in use, however that is not necessarily true. So s/CONFIG_PPC_STD_MMU_64/CONFIG_PPC_BOOK3S_64/, and do some minor formatting updates of some of the affected lines. This will be a pain for backports, but c'est la vie. Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 6 +++--- arch/powerpc/include/asm/nohash/64/pgtable.h | 2 +- arch/powerpc/include/asm/paca.h | 10 +++++----- arch/powerpc/include/asm/page_64.h | 6 +++--- arch/powerpc/include/asm/pgtable-be-types.h | 2 +- arch/powerpc/include/asm/pgtable-types.h | 4 ++-- arch/powerpc/include/asm/tlbflush.h | 2 +- arch/powerpc/kernel/asm-offsets.c | 4 ++-- arch/powerpc/kernel/entry_64.S | 4 ++-- arch/powerpc/kernel/exceptions-64s.S | 8 ++++---- arch/powerpc/kernel/machine_kexec_64.c | 4 ++-- arch/powerpc/kernel/mce_power.c | 4 ++-- arch/powerpc/kernel/paca.c | 12 ++++++------ arch/powerpc/kernel/pci_64.c | 4 ++-- arch/powerpc/kernel/process.c | 12 ++++++------ arch/powerpc/kernel/prom.c | 2 +- arch/powerpc/kernel/setup-common.c | 4 ++-- arch/powerpc/mm/Makefile | 6 +++--- arch/powerpc/mm/dump_hashpagetable.c | 2 +- arch/powerpc/mm/dump_linuxpagetables.c | 10 +++++----- arch/powerpc/mm/init_64.c | 8 ++++---- arch/powerpc/mm/pgtable_64.c | 2 +- arch/powerpc/platforms/Kconfig.cputype | 6 +----- arch/powerpc/platforms/pseries/lpar.c | 8 ++++---- arch/powerpc/platforms/pseries/lparcfg.c | 2 +- arch/powerpc/xmon/xmon.c | 6 +++--- 26 files changed, 68 insertions(+), 72 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 809c468edab1..b8b75b423316 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -334,7 +334,7 @@ config PPC_OF_PLATFORM_PCI default n config ARCH_SUPPORTS_DEBUG_PAGEALLOC - depends on PPC32 || PPC_STD_MMU_64 + depends on PPC32 || PPC_BOOK3S_64 def_bool y config ARCH_SUPPORTS_UPROBES @@ -721,7 +721,7 @@ config PPC_16K_PAGES config PPC_64K_PAGES bool "64k page size" - depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64) + depends on !PPC_FSL_BOOK3E && (44x || PPC_BOOK3S_64 || PPC_BOOK3E_64) select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64 config PPC_256K_PAGES @@ -780,7 +780,7 @@ config FORCE_MAX_ZONEORDER config PPC_SUBPAGE_PROT bool "Support setting protections for 4k subpages" - depends on PPC_STD_MMU_64 && PPC_64K_PAGES + depends on PPC_BOOK3S_64 && PPC_64K_PAGES help This option adds support for a system call to allow user programs to set access permissions (read/write, readonly, or no access) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index f0ff384d4ca5..b1c42ac54d96 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -203,7 +203,7 @@ static inline unsigned long pte_update(struct mm_struct *mm, if (!huge) assert_pte_locked(mm, addr); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (old & _PAGE_HASHPTE) hpte_need_flush(mm, addr, ptep, old, huge); #endif diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 1e06310ccc09..c907ae23c956 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -91,14 +91,14 @@ struct paca_struct { u8 cpu_start; /* At startup, processor spins until */ /* this becomes non-zero. */ u8 kexec_state; /* set when kexec down has irqs off */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 struct slb_shadow *slb_shadow_ptr; struct dtl_entry *dispatch_log; struct dtl_entry *dispatch_log_end; -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif u64 dscr_default; /* per-CPU default DSCR */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * Now, starting in cacheline 2, the exception save areas */ @@ -110,7 +110,7 @@ struct paca_struct { u16 vmalloc_sllp; u16 slb_cache_ptr; u32 slb_cache[SLB_CACHE_ENTRIES]; -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_BOOK3E u64 exgen[8] __aligned(0x40); @@ -192,7 +192,7 @@ struct paca_struct { struct stop_sprs stop_sprs; #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* Non-maskable exceptions that are not performance critical */ u64 exnmi[EX_SIZE]; /* used for system reset (nmi) */ u64 exmc[EX_SIZE]; /* used for machine checks */ diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index c4d9654bd637..56234c6fcd61 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -117,21 +117,21 @@ extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start, #endif /* __ASSEMBLY__ */ #else #define slice_init() -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #define get_slice_psize(mm, addr) ((mm)->context.user_psize) #define slice_set_user_psize(mm, psize) \ do { \ (mm)->context.user_psize = (psize); \ (mm)->context.sllp = SLB_VSID_USER | mmu_psize_defs[(psize)].sllp; \ } while (0) -#else /* CONFIG_PPC_STD_MMU_64 */ +#else /* !CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_64K_PAGES #define get_slice_psize(mm, addr) MMU_PAGE_64K #else /* CONFIG_PPC_64K_PAGES */ #define get_slice_psize(mm, addr) MMU_PAGE_4K #endif /* !CONFIG_PPC_64K_PAGES */ #define slice_set_user_psize(mm, psize) do { BUG(); } while(0) -#endif /* !CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #define slice_set_range_psize(mm, start, len, psize) \ slice_set_user_psize((mm), (psize)) diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index 67e7e3d990f4..03ab638593e1 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -76,7 +76,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 369a164b545c..5d2c38d26396 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -49,13 +49,13 @@ typedef struct { unsigned long pgprot; } pgprot_t; * With hash config 64k pages additionally define a bigger "real PTE" type that * gathers the "second half" part of the PTE for pseudo 64k pages */ -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) +#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_BOOK3S_64) typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; #else typedef struct { pte_t pte; } real_pte_t; #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #include static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h index 13dbcd41885e..7d5a157c7832 100644 --- a/arch/powerpc/include/asm/tlbflush.h +++ b/arch/powerpc/include/asm/tlbflush.h @@ -77,7 +77,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) flush_tlb_mm(mm); } -#elif defined(CONFIG_PPC_STD_MMU_64) +#elif defined(CONFIG_PPC_BOOK3S_64) #include #else #error Unsupported MMU type diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8cfb20e38cfe..200623e71474 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -208,7 +208,7 @@ int main(void) OFFSET(TCD_ESEL_FIRST, tlb_core_data, esel_first); #endif /* CONFIG_PPC_BOOK3E */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACASLBCACHE, paca_struct, slb_cache); OFFSET(PACASLBCACHEPTR, paca_struct, slb_cache_ptr); OFFSET(PACAVMALLOCSLLP, paca_struct, vmalloc_sllp); @@ -230,7 +230,7 @@ int main(void) OFFSET(LPPACA_DTLIDX, lppaca, dtl_idx); OFFSET(LPPACA_YIELDCOUNT, lppaca, yield_count); OFFSET(PACA_DTL_RIDX, paca_struct, dtl_ridx); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ OFFSET(PACAEMERGSP, paca_struct, emergency_sp); #ifdef CONFIG_PPC_BOOK3S_64 OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 4a0fd4f40245..3320bcac7192 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -539,7 +539,7 @@ _GLOBAL(_switch) std r6,PACACURRENT(r13) /* Set new 'current' */ ld r8,KSP(r4) /* new stack pointer */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION b 2f END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) @@ -588,7 +588,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) slbmte r7,r0 isync 2: -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 64c2bc35e4ff..6879aed47377 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -607,7 +607,7 @@ EXC_COMMON_BEGIN(slb_miss_common) cmpdi cr5,r11,MSR_RI crset 4*cr0+eq -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 BEGIN_MMU_FTR_SECTION bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) @@ -1497,7 +1497,7 @@ USE_TEXT_SECTION() */ .balign IFETCH_ALIGN_BYTES do_hash_page: - #ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 lis r0,DSISR_BAD_FAULT_64S@h ori r0,r0,DSISR_BAD_FAULT_64S@l and. r0,r4,r0 /* weird error? */ @@ -1528,7 +1528,7 @@ do_hash_page: /* Reload DSISR into r4 for the DABR check below */ ld r4,_DSISR(r1) -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: @@ -1557,7 +1557,7 @@ handle_dabr_fault: 12: b ret_from_except_lite -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* We have a page fault that hash_page could handle but HV refused * the PTE insertion */ diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 5c12e21d0d1a..49d34d7271e7 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -360,7 +360,7 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* Values we need to export to the second kernel via the device tree. */ static unsigned long htab_base; static unsigned long htab_size; @@ -402,4 +402,4 @@ static int __init export_htab_values(void) return 0; } late_initcall(export_htab_values); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 82cabfc7c2f6..1600ab194ff9 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -168,7 +168,7 @@ void __flush_tlb_power9(unsigned int action) /* flush SLBs and reload */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void flush_and_reload_slb(void) { struct slb_shadow *slb; @@ -215,7 +215,7 @@ static void flush_erat(void) static int mce_flush(int what) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (what == MCE_FLUSH_SLB) { flush_and_reload_slb(); return 1; diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 2ff2b8a19f71..5d38d5ea9a24 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -90,7 +90,7 @@ static inline void free_lppacas(void) { } #endif /* CONFIG_PPC_BOOK3S */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * 3 persistent SLBs are registered here. The buffer will be zero @@ -135,11 +135,11 @@ static struct slb_shadow * __init init_slb_shadow(int cpu) return s; } -#else /* CONFIG_PPC_STD_MMU_64 */ +#else /* !CONFIG_PPC_BOOK3S_64 */ static void __init allocate_slb_shadows(int nr_cpus, int limit) { } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* The Paca is an array with one entry per processor. Each contains an * lppaca, which contains the information shared between the @@ -170,9 +170,9 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu) new_paca->kexec_state = KEXEC_STATE_NONE; new_paca->__current = &init_task; new_paca->data_offset = 0xfeeeeeeeeeeeeeeeULL; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 new_paca->slb_shadow_ptr = init_slb_shadow(cpu); -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif #ifdef CONFIG_PPC_BOOK3E /* For now -- if we have threads this will be adjusted later */ @@ -271,7 +271,7 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_user_psize = context->user_psize; get_paca()->mm_ctx_sllp = context->sllp; #endif -#else /* CONFIG_PPC_BOOK3S */ +#else /* !CONFIG_PPC_BOOK3S */ return; #endif } diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 932b9741aa8f..15ce0306b092 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -90,14 +90,14 @@ int pcibios_unmap_io_space(struct pci_bus *bus) * to do an appropriate TLB flush here too */ if (bus->self) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 struct resource *res = bus->resource[0]; #endif pr_debug("IO unmapping for PCI-PCI bridge %s\n", pci_name(bus->self)); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 __flush_hash_table_range(&init_mm, res->start + _IO_BASE, res->end + _IO_BASE + 1); #endif diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index c051dc2b42ad..2b602d60a9c7 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1164,7 +1164,7 @@ struct task_struct *__switch_to(struct task_struct *prev, } #endif /* CONFIG_PPC64 */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 batch = this_cpu_ptr(&ppc64_tlb_batch); if (batch->active) { current_thread_info()->local_flags |= _TLF_LAZY_MMU; @@ -1172,7 +1172,7 @@ struct task_struct *__switch_to(struct task_struct *prev, __flush_tlb_pending(batch); batch->active = 0; } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_PPC_ADV_DEBUG_REGS switch_booke_debug_regs(&new->thread.debug); @@ -1218,7 +1218,7 @@ struct task_struct *__switch_to(struct task_struct *prev, last = _switch(old_thread, new_thread); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (current_thread_info()->local_flags & _TLF_LAZY_MMU) { current_thread_info()->local_flags &= ~_TLF_LAZY_MMU; batch = this_cpu_ptr(&ppc64_tlb_batch); @@ -1247,7 +1247,7 @@ struct task_struct *__switch_to(struct task_struct *prev, : : "r"(dummy_copy_buffer), "r"(0)); } } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ return last; } @@ -1476,7 +1476,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) static void setup_ksp_vsid(struct task_struct *p, unsigned long sp) { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 unsigned long sp_vsid; unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp; @@ -2056,7 +2056,7 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) unsigned long base = mm->brk; unsigned long ret; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * If we are using 1TB segments and we are allowed to randomise * the heap, we can put it above 1TB so it is backed by a 1TB diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index 101822be525a..b15bae265c90 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -229,7 +229,7 @@ static void __init check_cpu_pa_features(unsigned long node) ibm_pa_features, ARRAY_SIZE(ibm_pa_features)); } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static void __init init_mmu_slb_size(unsigned long node) { const __be32 *slb_size_ptr; diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 0ac741fae90e..da4fd54e6914 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -773,7 +773,7 @@ void arch_setup_pdev_archdata(struct platform_device *pdev) static __init void print_system_info(void) { pr_info("-----------------------------------------------------\n"); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); #endif #ifdef CONFIG_PPC_STD_MMU_32 @@ -800,7 +800,7 @@ static __init void print_system_info(void) pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); #endif -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 if (htab_address) pr_info("htab_address = 0x%p\n", htab_address); if (htab_hash_mask) diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index fb844d2f266e..915178423c3a 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -14,11 +14,11 @@ obj-$(CONFIG_PPC_MMU_NOHASH) += mmu_context_nohash.o tlb_nohash.o \ obj-$(CONFIG_PPC_BOOK3E) += tlb_low_$(BITS)e.o hash64-$(CONFIG_PPC_NATIVE) := hash_native_64.o obj-$(CONFIG_PPC_BOOK3E_64) += pgtable-book3e.o -obj-$(CONFIG_PPC_STD_MMU_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o +obj-$(CONFIG_PPC_BOOK3S_64) += pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o obj-$(CONFIG_PPC_RADIX_MMU) += pgtable-radix.o tlb-radix.o obj-$(CONFIG_PPC_STD_MMU_32) += ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o obj-$(CONFIG_PPC_STD_MMU) += tlb_hash$(BITS).o -ifeq ($(CONFIG_PPC_STD_MMU_64),y) +ifeq ($(CONFIG_PPC_BOOK3S_64),y) obj-$(CONFIG_PPC_4K_PAGES) += hash64_4k.o obj-$(CONFIG_PPC_64K_PAGES) += hash64_64k.o endif @@ -31,7 +31,7 @@ obj-$(CONFIG_PPC_SPLPAR) += vphn.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-y += hugetlbpage.o ifeq ($(CONFIG_HUGETLB_PAGE),y) -obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +obj-$(CONFIG_PPC_BOOK3S_64) += hugetlbpage-hash64.o obj-$(CONFIG_PPC_RADIX_MMU) += hugetlbpage-radix.o obj-$(CONFIG_PPC_BOOK3E_MMU) += hugetlbpage-book3e.o endif diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c index 5c4c93dcff19..14cfb11b09d0 100644 --- a/arch/powerpc/mm/dump_hashpagetable.c +++ b/arch/powerpc/mm/dump_hashpagetable.c @@ -500,7 +500,7 @@ static void populate_markers(void) address_markers[6].start_address = PHB_IO_END; address_markers[7].start_address = IOREMAP_BASE; address_markers[8].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 address_markers[9].start_address = H_VMEMMAP_BASE; #else address_markers[9].start_address = VMEMMAP_BASE; diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c index c9282d27b203..c2e7dea59490 100644 --- a/arch/powerpc/mm/dump_linuxpagetables.c +++ b/arch/powerpc/mm/dump_linuxpagetables.c @@ -112,7 +112,7 @@ struct flag_info { static const struct flag_info flag_array[] = { { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_PRIVILEGED, .val = 0, #else @@ -147,7 +147,7 @@ static const struct flag_info flag_array[] = { .set = "present", .clear = " ", }, { -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 .mask = H_PAGE_HASHPTE, .val = H_PAGE_HASHPTE, #else @@ -157,7 +157,7 @@ static const struct flag_info flag_array[] = { .set = "hpte", .clear = " ", }, { -#ifndef CONFIG_PPC_STD_MMU_64 +#ifndef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_GUARDED, .val = _PAGE_GUARDED, .set = "guarded", @@ -174,7 +174,7 @@ static const struct flag_info flag_array[] = { .set = "accessed", .clear = " ", }, { -#ifndef CONFIG_PPC_STD_MMU_64 +#ifndef CONFIG_PPC_BOOK3S_64 .mask = _PAGE_WRITETHRU, .val = _PAGE_WRITETHRU, .set = "write through", @@ -450,7 +450,7 @@ static void populate_markers(void) address_markers[i++].start_address = PHB_IO_END; address_markers[i++].start_address = IOREMAP_BASE; address_markers[i++].start_address = IOREMAP_END; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 address_markers[i++].start_address = H_VMEMMAP_BASE; #else address_markers[i++].start_address = VMEMMAP_BASE; diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 588a521966ec..12b3025fadac 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -68,11 +68,11 @@ #include "mmu_decl.h" -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ phys_addr_t memstart_addr = ~0; EXPORT_SYMBOL_GPL(memstart_addr); @@ -367,7 +367,7 @@ EXPORT_SYMBOL_GPL(realmode_pfn_to_page); #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static bool disable_radix; static int __init parse_disable_radix(char *p) { @@ -444,4 +444,4 @@ void __init mmu_early_init_devtree(void) else hash__early_init_devtree(); } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index ac0717a90ca6..2851282edcd4 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -57,7 +57,7 @@ #include "mmu_decl.h" -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT)) #error TASK_SIZE_USER64 exceeds user VSID range #endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 13663efc1d31..f8928ee85f6b 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -294,10 +294,6 @@ config PPC_STD_MMU_32 def_bool y depends on PPC_STD_MMU && PPC32 -config PPC_STD_MMU_64 - def_bool y - depends on PPC_STD_MMU && PPC64 - config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 @@ -323,7 +319,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if PPC_STD_MMU_64 + default y if PPC_BOOK3S_64 default n config PPC_HAVE_PMU_SUPPORT diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 495ba4e7336d..0ee4a469a4ae 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -93,7 +93,7 @@ void vpa_init(int cpu) return; } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 /* * PAPR says this feature is SLB-Buffer but firmware never * reports that. All SPLPAR support SLB shadow buffer. @@ -106,7 +106,7 @@ void vpa_init(int cpu) "cpu %d (hw %d) of area %lx failed with %ld\n", cpu, hwcpu, addr, ret); } -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ /* * Register dispatch trace log, if one has been allocated. @@ -129,7 +129,7 @@ void vpa_init(int cpu) } } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 static long pSeries_lpar_hpte_insert(unsigned long hpte_group, unsigned long vpn, unsigned long pa, @@ -824,7 +824,7 @@ void arch_free_page(struct page *page, int order) EXPORT_SYMBOL(arch_free_page); #endif /* CONFIG_PPC_SMLPAR */ -#endif /* CONFIG_PPC_STD_MMU_64 */ +#endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_TRACEPOINTS #ifdef HAVE_JUMP_LABEL diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 779fc2a1c8f7..b2706c483067 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -485,7 +485,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) seq_printf(m, "shared_processor_mode=%d\n", lppaca_shared_proc(get_lppaca())); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 seq_printf(m, "slb_size=%d\n", mmu_slb_size); #endif parse_em_data(m); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 83e7faf5b3d3..6b1236b85622 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2312,7 +2312,7 @@ static void dump_tracing(void) static void dump_one_paca(int cpu) { struct paca_struct *p; -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 int i = 0; #endif @@ -2353,7 +2353,7 @@ static void dump_one_paca(int cpu) DUMP(p, hw_cpu_id, "x"); DUMP(p, cpu_start, "x"); DUMP(p, kexec_state, "x"); -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 for (i = 0; i < SLB_NUM_BOLTED; i++) { u64 esid, vsid; @@ -3263,7 +3263,7 @@ static void xmon_print_symbol(unsigned long address, const char *mid, printf("%s", after); } -#ifdef CONFIG_PPC_STD_MMU_64 +#ifdef CONFIG_PPC_BOOK3S_64 void dump_segments(void) { int i; -- cgit From 1fd6c02207107c8892219dacef01de7ced3d4ce7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 24 Oct 2017 17:48:49 +0200 Subject: powerpc/mm: Add a CONFIG option to choose if radix is used by default Currently if the hardware supports the radix MMU we will use it, *unless* "disable_radix" is passed on the kernel command line. However some users would like the reverse semantics. ie. The kernel uses the hash MMU by default, unless radix is explicitly requested on the command line. So add a CONFIG option to choose whether we use radix by default or not, and expand the disable_radix command line option to allow "disable_radix=no" which *enables* radix. Signed-off-by: Michael Ellerman --- arch/powerpc/mm/init_64.c | 13 +++++++++++-- arch/powerpc/platforms/Kconfig.cputype | 13 +++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 12b3025fadac..a07722531b32 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -368,10 +368,19 @@ EXPORT_SYMBOL_GPL(realmode_pfn_to_page); #endif /* CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_PPC_BOOK3S_64 -static bool disable_radix; +static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT); + static int __init parse_disable_radix(char *p) { - disable_radix = true; + bool val; + + if (strlen(p) == 0) + val = true; + else if (kstrtobool(p, &val)) + return -EINVAL; + + disable_radix = val; + return 0; } early_param("disable_radix", parse_disable_radix); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index f8928ee85f6b..596bd9091478 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -304,6 +304,19 @@ config PPC_RADIX_MMU is only implemented by IBM Power9 CPUs, if you don't have one of them you can probably disable this. +config PPC_RADIX_MMU_DEFAULT + bool "Default to using the Radix MMU when possible" + depends on PPC_RADIX_MMU + default y + help + When the hardware supports the Radix MMU, default to using it unless + "disable_radix[=yes]" is specified on the kernel command line. + + If this option is disabled, the Hash MMU will be used by default, + unless "disable_radix=no" is specified on the kernel command line. + + If you're unsure, say Y. + config ARCH_ENABLE_HUGEPAGE_MIGRATION def_bool y depends on PPC_BOOK3S_64 && HUGETLB_PAGE && MIGRATION -- cgit From 632f0574167ad3f5d646dad6af87d976a703d93e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 12 Oct 2017 15:45:25 +1100 Subject: powerpc/tm: Don't check for WARN in TM Bad Thing handling Currently when we take a TM Bad Thing program check exception, we search the bug table to see if the program check was generated by a WARN/WARN_ON etc. That makes no sense, the WARN macros use trap instructions, which should never generate a TM Bad Thing exception. If they ever did that would be a bug and we should oops. We do have some hand-coded bugs in tm.S, using EMIT_BUG_ENTRY, but those are all BUGs not WARNs, and they all use trap instructions anyway. Almost certainly this check was incorrectly copied from the REASON_TRAP handling in the same function. Remove it. Signed-off-by: Michael Ellerman Acked-By: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 9ae1924c7d1a..0e4099fef198 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1337,13 +1337,8 @@ void program_check_exception(struct pt_regs *regs) * - A treclaim is attempted when non transactional. * - A tend is illegally attempted. * - writing a TM SPR when transactional. - */ - if (!user_mode(regs) && - report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) { - regs->nip += 4; - goto bail; - } - /* If usermode caused this, it's done something illegal and + * + * If usermode caused this, it's done something illegal and * gets a SIGILL slap on the wrist. We call it an illegal * operand to distinguish from the instruction just being bad * (e.g. executing a 'tend' on a CPU without TM!); it's an -- cgit From ac0761ebcb08830d8f64b9181f6736b1a00bf746 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Sat, 2 Sep 2017 00:23:01 +0530 Subject: bpf: take advantage of stack_depth tracking in powerpc JIT Take advantage of stack_depth tracking, originally introduced for x64, in powerpc JIT as well. Round up allocated stack by 16 bytes to make sure it stays aligned for functions called from JITed bpf program. Signed-off-by: Sandipan Das Reviewed-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/net/bpf_jit64.h | 7 ++++--- arch/powerpc/net/bpf_jit_comp64.c | 16 ++++++++++------ 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/net/bpf_jit64.h b/arch/powerpc/net/bpf_jit64.h index 62fa7589db2b..8bdef7ed28a8 100644 --- a/arch/powerpc/net/bpf_jit64.h +++ b/arch/powerpc/net/bpf_jit64.h @@ -23,7 +23,7 @@ * [ nv gpr save area ] 8*8 | * [ tail_call_cnt ] 8 | * [ local_tmp_var ] 8 | - * fp (r31) --> [ ebpf stack space ] 512 | + * fp (r31) --> [ ebpf stack space ] upto 512 | * [ frame header ] 32/112 | * sp (r1) ---> [ stack pointer ] -------------- */ @@ -32,8 +32,8 @@ #define BPF_PPC_STACK_SAVE (8*8) /* for bpf JIT code internal usage */ #define BPF_PPC_STACK_LOCALS 16 -/* Ensure this is quadword aligned */ -#define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + MAX_BPF_STACK + \ +/* stack frame excluding BPF stack, ensure this is quadword aligned */ +#define BPF_PPC_STACKFRAME (STACK_FRAME_MIN_SIZE + \ BPF_PPC_STACK_LOCALS + BPF_PPC_STACK_SAVE) #ifndef __ASSEMBLY__ @@ -103,6 +103,7 @@ struct codegen_context { */ unsigned int seen; unsigned int idx; + unsigned int stack_size; }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index a66e64b0b251..46d74e81aff1 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -69,7 +69,7 @@ static inline bool bpf_has_stack_frame(struct codegen_context *ctx) static int bpf_jit_stack_local(struct codegen_context *ctx) { if (bpf_has_stack_frame(ctx)) - return STACK_FRAME_MIN_SIZE + MAX_BPF_STACK; + return STACK_FRAME_MIN_SIZE + ctx->stack_size; else return -(BPF_PPC_STACK_SAVE + 16); } @@ -82,8 +82,9 @@ static int bpf_jit_stack_tailcallcnt(struct codegen_context *ctx) static int bpf_jit_stack_offsetof(struct codegen_context *ctx, int reg) { if (reg >= BPF_PPC_NVR_MIN && reg < 32) - return (bpf_has_stack_frame(ctx) ? BPF_PPC_STACKFRAME : 0) - - (8 * (32 - reg)); + return (bpf_has_stack_frame(ctx) ? + (BPF_PPC_STACKFRAME + ctx->stack_size) : 0) + - (8 * (32 - reg)); pr_err("BPF JIT is asking about unknown registers"); BUG(); @@ -134,7 +135,7 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) PPC_BPF_STL(0, 1, PPC_LR_STKOFF); } - PPC_BPF_STLU(1, 1, -BPF_PPC_STACKFRAME); + PPC_BPF_STLU(1, 1, -(BPF_PPC_STACKFRAME + ctx->stack_size)); } /* @@ -161,7 +162,7 @@ static void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx) /* Setup frame pointer to point to the bpf stack area */ if (bpf_is_seen_register(ctx, BPF_REG_FP)) PPC_ADDI(b2p[BPF_REG_FP], 1, - STACK_FRAME_MIN_SIZE + MAX_BPF_STACK); + STACK_FRAME_MIN_SIZE + ctx->stack_size); } static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx) @@ -183,7 +184,7 @@ static void bpf_jit_emit_common_epilogue(u32 *image, struct codegen_context *ctx /* Tear down our stack frame */ if (bpf_has_stack_frame(ctx)) { - PPC_ADDI(1, 1, BPF_PPC_STACKFRAME); + PPC_ADDI(1, 1, BPF_PPC_STACKFRAME + ctx->stack_size); if (ctx->seen & SEEN_FUNC) { PPC_BPF_LL(0, 1, PPC_LR_STKOFF); PPC_MTLR(0); @@ -1013,6 +1014,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) memset(&cgctx, 0, sizeof(struct codegen_context)); + /* Make sure that the stack is quadword aligned. */ + cgctx.stack_size = round_up(fp->aux->stack_depth, 16); + /* Scouting faux-generate pass 0 */ if (bpf_jit_build_body(fp, 0, &cgctx, addrs)) { /* We hit something illegal or unsupported. */ -- cgit From edfd17ff39bce59886e8249e645d6ecfc3e3829c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 4 Nov 2017 22:26:52 +0100 Subject: powerpc/eeh: Stop using do_gettimeofday() This interface is inefficient and deprecated because of the y2038 overflow. ktime_get_seconds() is an appropriate replacement here, since it has sufficient granularity but is more efficient and uses monotonic time. Signed-off-by: Arnd Bergmann Reviewed-by: Andrew Donnellan Acked-by: Russell Currey Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 2 +- arch/powerpc/kernel/eeh_driver.c | 2 +- arch/powerpc/kernel/eeh_pe.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index cd1df96335e5..5161c37dd039 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -93,7 +93,7 @@ struct eeh_pe { struct pci_bus *bus; /* Top PCI bus for bus PE */ int check_count; /* Times of ignored error */ int freeze_count; /* Times of froze up */ - struct timeval tstamp; /* Time on first-time freeze */ + time64_t tstamp; /* Time on first-time freeze */ int false_positives; /* Times of reported #ff's */ atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE */ diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 8b840191df59..57d54163bf94 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -623,7 +623,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, struct eeh_rmv_data *rmv_data) { struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); - struct timeval tstamp; + time64_t tstamp; int cnt, rc; struct eeh_dev *edev; diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 2e8d1b2b5af4..2d4956e97aa9 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -526,16 +526,16 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) */ void eeh_pe_update_time_stamp(struct eeh_pe *pe) { - struct timeval tstamp; + time64_t tstamp; if (!pe) return; if (pe->freeze_count <= 0) { pe->freeze_count = 0; - do_gettimeofday(&pe->tstamp); + pe->tstamp = ktime_get_seconds(); } else { - do_gettimeofday(&tstamp); - if (tstamp.tv_sec - pe->tstamp.tv_sec > 3600) { + tstamp = ktime_get_seconds(); + if (tstamp - pe->tstamp > 3600) { pe->tstamp = tstamp; pe->freeze_count = 0; } -- cgit From 71e24d7731a2903b1ae2bba2b2971c654d9c2aa6 Mon Sep 17 00:00:00 2001 From: "William A. Kennington III" Date: Fri, 22 Sep 2017 16:58:00 -0700 Subject: powerpc/opal: Fix EBUSY bug in acquiring tokens The current code checks the completion map to look for the first token that is complete. In some cases, a completion can come in but the token can still be on lease to the caller processing the completion. If this completed but unreleased token is the first token found in the bitmap by another tasks trying to acquire a token, then the __test_and_set_bit call will fail since the token will still be on lease. The acquisition will then fail with an EBUSY. This patch reorganizes the acquisition code to look at the opal_async_token_map for an unleased token. If the token has no lease it must have no outstanding completions so we should never see an EBUSY, unless we have leased out too many tokens. Since opal_async_get_token_inrerruptible is protected by a semaphore, we will practically never see EBUSY anymore. Fixes: 8d7248232208 ("powerpc/powernv: Infrastructure to support OPAL async completion") Signed-off-by: William A. Kennington III Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index cf33769a7b72..45b3feb8aa2f 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -39,18 +39,18 @@ int __opal_async_get_token(void) int token; spin_lock_irqsave(&opal_async_comp_lock, flags); - token = find_first_bit(opal_async_complete_map, opal_max_async_tokens); + token = find_first_zero_bit(opal_async_token_map, opal_max_async_tokens); if (token >= opal_max_async_tokens) { token = -EBUSY; goto out; } - if (__test_and_set_bit(token, opal_async_token_map)) { + if (!__test_and_clear_bit(token, opal_async_complete_map)) { token = -EBUSY; goto out; } - __clear_bit(token, opal_async_complete_map); + __set_bit(token, opal_async_token_map); out: spin_unlock_irqrestore(&opal_async_comp_lock, flags); -- cgit From 59cf9a1cfcd9de6392d218fcd69413f2e77babbe Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:41 +1100 Subject: powerpc/opal: Make __opal_async_{get, release}_token() static There are no callers of both __opal_async_get_token() and __opal_async_release_token(). This patch also removes the possibility of "emergency through synchronous call to __opal_async_get_token()" as such it makes more sense to initialise opal_sync_sem for the maximum number of async tokens. Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal.h | 2 -- arch/powerpc/platforms/powernv/opal-async.c | 10 +++------- 2 files changed, 3 insertions(+), 9 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 04c32b08ffa1..00aae3189311 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -306,9 +306,7 @@ extern void opal_notifier_enable(void); extern void opal_notifier_disable(void); extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val); -extern int __opal_async_get_token(void); extern int opal_async_get_token_interruptible(void); -extern int __opal_async_release_token(int token); extern int opal_async_release_token(int token); extern int opal_async_wait_response(uint64_t token, struct opal_msg *msg); extern int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data); diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 45b3feb8aa2f..64255d3ee14a 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -33,7 +33,7 @@ static struct semaphore opal_async_sem; static struct opal_msg *opal_async_responses; static unsigned int opal_max_async_tokens; -int __opal_async_get_token(void) +static int __opal_async_get_token(void) { unsigned long flags; int token; @@ -73,7 +73,7 @@ int opal_async_get_token_interruptible(void) } EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible); -int __opal_async_release_token(int token) +static int __opal_async_release_token(int token) { unsigned long flags; @@ -199,11 +199,7 @@ int __init opal_async_comp_init(void) goto out_opal_node; } - /* Initialize to 1 less than the maximum tokens available, as we may - * require to pop one during emergency through synchronous call to - * __opal_async_get_token() - */ - sema_init(&opal_async_sem, opal_max_async_tokens - 1); + sema_init(&opal_async_sem, opal_max_async_tokens); out_opal_node: of_node_put(opal_node); -- cgit From 86cd6d98020924f65a6773784c66c5b842e3e320 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:42 +1100 Subject: powerpc/opal: Rework the opal-async interface Future work will add an opal_async_wait_response_interruptible() which will call wait_event_interruptible(). This work requires extra token state to be tracked as wait_event_interruptible() can return and the caller could release the token before OPAL responds. Currently token state is tracked with two bitfields which are 64 bits big but may not need to be as OPAL informs Linux how many async tokens there are. It also uses an array indexed by token to store response messages for each token. The bitfields make it difficult to add more state and also provide a hard maximum as to how many tokens there can be - it is possible that OPAL will inform Linux that there are more than 64 tokens. Rather than add a bitfield to track the extra state, rework the internals slightly. Signed-off-by: Cyril Bur [mpe: Fix __opal_async_get_token() when no tokens are free] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-async.c | 92 ++++++++++++++++------------- 1 file changed, 51 insertions(+), 41 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 64255d3ee14a..a8a57310759a 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -1,7 +1,7 @@ /* * PowerNV OPAL asynchronous completion interfaces * - * Copyright 2013 IBM Corp. + * Copyright 2013-2017 IBM Corp. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -23,40 +23,47 @@ #include #include -#define N_ASYNC_COMPLETIONS 64 +enum opal_async_token_state { + ASYNC_TOKEN_UNALLOCATED = 0, + ASYNC_TOKEN_ALLOCATED, + ASYNC_TOKEN_COMPLETED +}; + +struct opal_async_token { + enum opal_async_token_state state; + struct opal_msg response; +}; -static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL}; -static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS); static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait); static DEFINE_SPINLOCK(opal_async_comp_lock); static struct semaphore opal_async_sem; -static struct opal_msg *opal_async_responses; static unsigned int opal_max_async_tokens; +static struct opal_async_token *opal_async_tokens; static int __opal_async_get_token(void) { unsigned long flags; - int token; + int i, token = -EBUSY; spin_lock_irqsave(&opal_async_comp_lock, flags); - token = find_first_zero_bit(opal_async_token_map, opal_max_async_tokens); - if (token >= opal_max_async_tokens) { - token = -EBUSY; - goto out; - } - if (!__test_and_clear_bit(token, opal_async_complete_map)) { - token = -EBUSY; - goto out; + for (i = 0; i < opal_max_async_tokens; i++) { + if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) { + opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED; + token = i; + break; + } } - __set_bit(token, opal_async_token_map); - -out: spin_unlock_irqrestore(&opal_async_comp_lock, flags); return token; } +/* + * Note: If the returned token is used in an opal call and opal returns + * OPAL_ASYNC_COMPLETION you MUST call opal_async_wait_response() before + * calling another other opal_async_* function + */ int opal_async_get_token_interruptible(void) { int token; @@ -76,6 +83,7 @@ EXPORT_SYMBOL_GPL(opal_async_get_token_interruptible); static int __opal_async_release_token(int token) { unsigned long flags; + int rc; if (token < 0 || token >= opal_max_async_tokens) { pr_err("%s: Passed token is out of range, token %d\n", @@ -84,11 +92,18 @@ static int __opal_async_release_token(int token) } spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); - __clear_bit(token, opal_async_token_map); + switch (opal_async_tokens[token].state) { + case ASYNC_TOKEN_COMPLETED: + case ASYNC_TOKEN_ALLOCATED: + opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED; + rc = 0; + break; + default: + rc = 1; + } spin_unlock_irqrestore(&opal_async_comp_lock, flags); - return 0; + return rc; } int opal_async_release_token(int token) @@ -96,12 +111,10 @@ int opal_async_release_token(int token) int ret; ret = __opal_async_release_token(token); - if (ret) - return ret; - - up(&opal_async_sem); + if (!ret) + up(&opal_async_sem); - return 0; + return ret; } EXPORT_SYMBOL_GPL(opal_async_release_token); @@ -122,13 +135,15 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) * functional. */ opal_wake_poller(); - wait_event(opal_async_wait, test_bit(token, opal_async_complete_map)); - memcpy(msg, &opal_async_responses[token], sizeof(*msg)); + wait_event(opal_async_wait, opal_async_tokens[token].state + == ASYNC_TOKEN_COMPLETED); + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); return 0; } EXPORT_SYMBOL_GPL(opal_async_wait_response); +/* Called from interrupt context */ static int opal_async_comp_event(struct notifier_block *nb, unsigned long msg_type, void *msg) { @@ -140,9 +155,9 @@ static int opal_async_comp_event(struct notifier_block *nb, return 0; token = be64_to_cpu(comp_msg->params[0]); - memcpy(&opal_async_responses[token], comp_msg, sizeof(*comp_msg)); + memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); spin_lock_irqsave(&opal_async_comp_lock, flags); - __set_bit(token, opal_async_complete_map); + opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED; spin_unlock_irqrestore(&opal_async_comp_lock, flags); wake_up(&opal_async_wait); @@ -178,24 +193,19 @@ int __init opal_async_comp_init(void) } opal_max_async_tokens = be32_to_cpup(async); - if (opal_max_async_tokens > N_ASYNC_COMPLETIONS) - opal_max_async_tokens = N_ASYNC_COMPLETIONS; + opal_async_tokens = kcalloc(opal_max_async_tokens, + sizeof(*opal_async_tokens), GFP_KERNEL); + if (!opal_async_tokens) { + err = -ENOMEM; + goto out_opal_node; + } err = opal_message_notifier_register(OPAL_MSG_ASYNC_COMP, &opal_async_comp_nb); if (err) { pr_err("%s: Can't register OPAL event notifier (%d)\n", __func__, err); - goto out_opal_node; - } - - opal_async_responses = kzalloc( - sizeof(*opal_async_responses) * opal_max_async_tokens, - GFP_KERNEL); - if (!opal_async_responses) { - pr_err("%s: Out of memory, failed to do asynchronous " - "completion init\n", __func__); - err = -ENOMEM; + kfree(opal_async_tokens); goto out_opal_node; } -- cgit From 95e1bc1daaeee4d598b235dc85b64b7a0bcc3060 Mon Sep 17 00:00:00 2001 From: Stewart Smith Date: Fri, 3 Nov 2017 13:41:43 +1100 Subject: powernv/opal-sensor: remove not needed lock Parallel sensor reads could run out of async tokens due to opal_get_sensor_data grabbing tokens but then doing the sensor read behind a mutex, essentially serializing the (possibly asynchronous and relatively slow) sensor read. It turns out that the mutex isn't needed at all, not only should the OPAL interface allow concurrent reads, the implementation is certainly safe for that, and if any sensor we were reading from somewhere isn't, doing the mutual exclusion in the kernel is the wrong place to do it, OPAL should be doing it for the kernel. So, remove the mutex. Additionally, we shouldn't be printing out an error when we don't get a token as the only way this should happen is if we've been interrupted in down_interruptible() on the semaphore. Reported-by: Robert Lippert Signed-off-by: Stewart Smith Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-sensor.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index aa267f120033..0a7074bb91dc 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -19,13 +19,10 @@ */ #include -#include #include #include #include -static DEFINE_MUTEX(opal_sensor_mutex); - /* * This will return sensor information to driver based on the requested sensor * handle. A handle is an opaque id for the powernv, read by the driver from the @@ -38,13 +35,9 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) __be32 data; token = opal_async_get_token_interruptible(); - if (token < 0) { - pr_err("%s: Couldn't get the token, returning\n", __func__); - ret = token; - goto out; - } + if (token < 0) + return token; - mutex_lock(&opal_sensor_mutex); ret = opal_sensor_read(sensor_hndl, token, &data); switch (ret) { case OPAL_ASYNC_COMPLETION: @@ -52,7 +45,7 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) if (ret) { pr_err("%s: Failed to wait for the async response, %d\n", __func__, ret); - goto out_token; + goto out; } ret = opal_error_code(opal_get_async_rc(msg)); @@ -73,10 +66,8 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) break; } -out_token: - mutex_unlock(&opal_sensor_mutex); - opal_async_release_token(token); out: + opal_async_release_token(token); return ret; } EXPORT_SYMBOL_GPL(opal_get_sensor_data); -- cgit From 9aab24495c5644b25ced0d11816cc3c061bf74fc Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:44 +1100 Subject: powerpc/opal: Add opal_async_wait_response_interruptible() to opal-async This patch adds an _interruptible version of opal_async_wait_response(). This is useful when a long running OPAL call is performed on behalf of a userspace thread, for example, the opal_flash_{read,write,erase} functions performed by the powernv-flash MTD driver. It is foreseeable that these functions would take upwards of two minutes causing the wait_event() to block long enough to cause hung task warnings. Furthermore, wait_event_interruptible() is preferable as otherwise there is no way for signals to stop the process which is going to be confusing in userspace. Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/opal.h | 2 + arch/powerpc/platforms/powernv/opal-async.c | 84 +++++++++++++++++++++++++++-- 2 files changed, 82 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 00aae3189311..0c545f7fc77b 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -309,6 +309,8 @@ extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val); extern int opal_async_get_token_interruptible(void); extern int opal_async_release_token(int token); extern int opal_async_wait_response(uint64_t token, struct opal_msg *msg); +extern int opal_async_wait_response_interruptible(uint64_t token, + struct opal_msg *msg); extern int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data); struct rtc_time; diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index a8a57310759a..18a355fa15e8 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -26,6 +26,8 @@ enum opal_async_token_state { ASYNC_TOKEN_UNALLOCATED = 0, ASYNC_TOKEN_ALLOCATED, + ASYNC_TOKEN_DISPATCHED, + ASYNC_TOKEN_ABANDONED, ASYNC_TOKEN_COMPLETED }; @@ -61,8 +63,9 @@ static int __opal_async_get_token(void) /* * Note: If the returned token is used in an opal call and opal returns - * OPAL_ASYNC_COMPLETION you MUST call opal_async_wait_response() before - * calling another other opal_async_* function + * OPAL_ASYNC_COMPLETION you MUST call one of opal_async_wait_response() or + * opal_async_wait_response_interruptible() at least once before calling another + * opal_async_* function */ int opal_async_get_token_interruptible(void) { @@ -98,6 +101,14 @@ static int __opal_async_release_token(int token) opal_async_tokens[token].state = ASYNC_TOKEN_UNALLOCATED; rc = 0; break; + /* + * DISPATCHED and ABANDONED tokens must wait for OPAL to respond. + * Mark a DISPATCHED token as ABANDONED so that the response handling + * code knows no one cares and that it can free it then. + */ + case ASYNC_TOKEN_DISPATCHED: + opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED; + /* Fall through */ default: rc = 1; } @@ -130,7 +141,11 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) return -EINVAL; } - /* Wakeup the poller before we wait for events to speed things + /* + * There is no need to mark the token as dispatched, wait_event() + * will block until the token completes. + * + * Wakeup the poller before we wait for events to speed things * up on platforms or simulators where the interrupts aren't * functional. */ @@ -143,11 +158,66 @@ int opal_async_wait_response(uint64_t token, struct opal_msg *msg) } EXPORT_SYMBOL_GPL(opal_async_wait_response); +int opal_async_wait_response_interruptible(uint64_t token, struct opal_msg *msg) +{ + unsigned long flags; + int ret; + + if (token >= opal_max_async_tokens) { + pr_err("%s: Invalid token passed\n", __func__); + return -EINVAL; + } + + if (!msg) { + pr_err("%s: Invalid message pointer passed\n", __func__); + return -EINVAL; + } + + /* + * The first time this gets called we mark the token as DISPATCHED + * so that if wait_event_interruptible() returns not zero and the + * caller frees the token, we know not to actually free the token + * until the response comes. + * + * Only change if the token is ALLOCATED - it may have been + * completed even before the caller gets around to calling this + * the first time. + * + * There is also a dirty great comment at the token allocation + * function that if the opal call returns OPAL_ASYNC_COMPLETION to + * the caller then the caller *must* call this or the not + * interruptible version before doing anything else with the + * token. + */ + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) { + spin_lock_irqsave(&opal_async_comp_lock, flags); + if (opal_async_tokens[token].state == ASYNC_TOKEN_ALLOCATED) + opal_async_tokens[token].state = ASYNC_TOKEN_DISPATCHED; + spin_unlock_irqrestore(&opal_async_comp_lock, flags); + } + + /* + * Wakeup the poller before we wait for events to speed things + * up on platforms or simulators where the interrupts aren't + * functional. + */ + opal_wake_poller(); + ret = wait_event_interruptible(opal_async_wait, + opal_async_tokens[token].state == + ASYNC_TOKEN_COMPLETED); + if (!ret) + memcpy(msg, &opal_async_tokens[token].response, sizeof(*msg)); + + return ret; +} +EXPORT_SYMBOL_GPL(opal_async_wait_response_interruptible); + /* Called from interrupt context */ static int opal_async_comp_event(struct notifier_block *nb, unsigned long msg_type, void *msg) { struct opal_msg *comp_msg = msg; + enum opal_async_token_state state; unsigned long flags; uint64_t token; @@ -155,11 +225,17 @@ static int opal_async_comp_event(struct notifier_block *nb, return 0; token = be64_to_cpu(comp_msg->params[0]); - memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); spin_lock_irqsave(&opal_async_comp_lock, flags); + state = opal_async_tokens[token].state; opal_async_tokens[token].state = ASYNC_TOKEN_COMPLETED; spin_unlock_irqrestore(&opal_async_comp_lock, flags); + if (state == ASYNC_TOKEN_ABANDONED) { + /* Free the token, no one else will */ + opal_async_release_token(token); + return 0; + } + memcpy(&opal_async_tokens[token].response, comp_msg, sizeof(*comp_msg)); wake_up(&opal_async_wait); return 0; -- cgit From 77adbd2207e858f5923aa94e4a7d2f29f09217ed Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Fri, 3 Nov 2017 13:41:45 +1100 Subject: powerpc/powernv: Add OPAL_BUSY to opal_error_code() Also export opal_error_code() so that it can be used in modules Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 65c79ecf5a4d..041ddbd1fc57 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -998,6 +998,7 @@ int opal_error_code(int rc) case OPAL_PARAMETER: return -EINVAL; case OPAL_ASYNC_COMPLETION: return -EINPROGRESS; + case OPAL_BUSY: case OPAL_BUSY_EVENT: return -EBUSY; case OPAL_NO_MEM: return -ENOMEM; case OPAL_PERMISSION: return -EPERM; @@ -1037,3 +1038,4 @@ EXPORT_SYMBOL_GPL(opal_write_oppanel_async); /* Export this for KVM */ EXPORT_SYMBOL_GPL(opal_int_set_mfrr); EXPORT_SYMBOL_GPL(opal_int_eoi); +EXPORT_SYMBOL_GPL(opal_error_code); -- cgit From a7771176b4392fbc3a17399c51a8c11f2f681afe Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Thu, 2 Nov 2017 14:09:03 +1100 Subject: powerpc: Don't enable FP/Altivec if not checkpointed Lazy save and restore of FP/Altivec means that a userspace process can be sent to userspace with FP or Altivec disabled and loaded only as required (by way of an FP/Altivec unavailable exception). Transactional Memory complicates this situation as a transaction could be started without FP/Altivec being loaded up. This causes the hardware to checkpoint incorrect registers. Handling FP/Altivec unavailable exceptions while a thread is transactional requires a reclaim and recheckpoint to ensure the CPU has correct state for both sets of registers. Lazy save and restore of FP/Altivec cannot be done if a process is transactional. If a facility was enabled it must remain enabled whenever a thread is transactional. Commit dc16b553c949 ("powerpc: Always restore FPU/VEC/VSX if hardware transactional memory in use") ensures that the facilities are always enabled if a thread is transactional. A bug in the introduced code may cause it to inadvertently enable a facility that was (and should remain) disabled. The problem with this extraneous enablement is that the registers for the erroneously enabled facility have not been correctly recheckpointed - the recheckpointing code assumed the facility would remain disabled. Further compounding the issue, the transactional {fp,altivec,vsx} unavailable code has been incorrectly using the MSR to enable facilities. The presence of the {FP,VEC,VSX} bit in the regs->msr simply means if the registers are live on the CPU, not if the kernel should load them before returning to userspace. This has worked due to the bug mentioned above. This causes transactional threads which return to their failure handler to observe incorrect checkpointed registers. Perhaps an example will help illustrate the problem: A userspace process is running and uses both FP and Altivec registers. This process then continues to run for some time without touching either sets of registers. The kernel subsequently disables the facilities as part of lazy save and restore. The userspace process then performs a tbegin and the CPU checkpoints 'junk' FP and Altivec registers. The process then performs a floating point instruction triggering a fp unavailable exception in the kernel. The kernel then loads the FP registers - and only the FP registers. Since the thread is transactional it must perform a reclaim and recheckpoint to ensure both the checkpointed registers and the transactional registers are correct. It then (correctly) enables MSR[FP] for the process. Later (on exception exist) the kernel also (inadvertently) enables MSR[VEC]. The process is then returned to userspace. Since the act of loading the FP registers doomed the transaction we know CPU will fail the transaction, restore its checkpointed registers, and return the process to its failure handler. The problem is that we're now running with Altivec enabled and the 'junk' checkpointed registers are restored. The kernel had only recheckpointed FP. This patch solves this by only activating FP/Altivec if userspace was using them when it entered the kernel and not simply if the process is transactional. Fixes: dc16b553c949 ("powerpc: Always restore FPU/VEC/VSX if hardware transactional memory in use") Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 18 ++++++++++++++++-- arch/powerpc/kernel/traps.c | 8 ++++---- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 2b602d60a9c7..e2980a22c487 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -104,9 +104,23 @@ static inline bool msr_tm_active(unsigned long msr) { return MSR_TM_ACTIVE(msr); } + +static bool tm_active_with_fp(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_FP); +} + +static bool tm_active_with_altivec(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_VEC); +} #else static inline bool msr_tm_active(unsigned long msr) { return false; } static inline void check_if_tm_restore_required(struct task_struct *tsk) { } +static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; } +static inline bool tm_active_with_altivec(struct task_struct *tsk) { return false; } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ bool strict_msr_control; @@ -239,7 +253,7 @@ EXPORT_SYMBOL(enable_kernel_fp); static int restore_fp(struct task_struct *tsk) { - if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) { + if (tsk->thread.load_fp || tm_active_with_fp(tsk)) { load_fp_state(¤t->thread.fp_state); current->thread.load_fp++; return 1; @@ -321,7 +335,7 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread); static int restore_altivec(struct task_struct *tsk) { if (cpu_has_feature(CPU_FTR_ALTIVEC) && - (tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) { + (tsk->thread.load_vec || tm_active_with_altivec(tsk))) { load_vr_state(&tsk->thread.vr_state); tsk->thread.used_vr = 1; tsk->thread.load_vec++; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 0e4099fef198..863c6858ed0f 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1679,7 +1679,7 @@ void fp_unavailable_tm(struct pt_regs *regs) /* Reclaim didn't save out any FPRs to transact_fprs. */ /* Enable FP for the task: */ - regs->msr |= (MSR_FP | current->thread.fpexc_mode); + current->thread.load_fp = 1; /* This loads and recheckpoints the FP registers from * thread.fpr[]. They will remain in registers after the @@ -1708,7 +1708,7 @@ void altivec_unavailable_tm(struct pt_regs *regs) "MSR=%lx\n", regs->nip, regs->msr); tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC; + current->thread.load_vec = 1; tm_recheckpoint(¤t->thread, MSR_VEC); current->thread.used_vr = 1; @@ -1745,8 +1745,8 @@ void vsx_unavailable_tm(struct pt_regs *regs) /* This reclaims FP and/or VR regs if they're already enabled */ tm_reclaim_current(TM_CAUSE_FAC_UNAV); - regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode | - MSR_VSX; + current->thread.load_vec = 1; + current->thread.load_fp = 1; /* This loads & recheckpoints FP and VRs; but we have * to be sure not to overwrite previously-valid state. -- cgit From 91381b9cb1c3afc162f830ebf698721402c7d577 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Thu, 2 Nov 2017 14:09:04 +1100 Subject: powerpc: Force reload for recheckpoint during tm {fp, vec, vsx} unavailable exception Lazy save and restore of FP/Altivec means that a userspace process can be sent to userspace with FP or Altivec disabled and loaded only as required (by way of an FP/Altivec unavailable exception). Transactional Memory complicates this situation as a transaction could be started without FP/Altivec being loaded up. This causes the hardware to checkpoint incorrect registers. Handling FP/Altivec unavailable exceptions while a thread is transactional requires a reclaim and recheckpoint to ensure the CPU has correct state for both sets of registers. tm_reclaim() has optimisations to not always save the FP/Altivec registers to the checkpointed save area. This was originally done because the caller might have information that the checkpointed registers aren't valid due to lazy save and restore. We've also been a little vague as to how tm_reclaim() leaves the FP/Altivec state since it doesn't necessarily always save it to the thread struct. This has lead to an (incorrect) assumption that it leaves the checkpointed state on the CPU. tm_recheckpoint() has similar optimisations in reverse. It may not always reload the checkpointed FP/Altivec registers from the thread struct before the trecheckpoint. It is therefore quite unclear where it expects to get the state from. This didn't help with the assumption made about tm_reclaim(). This patch is a minimal fix for ease of backporting. A more correct fix which removes the msr parameter to tm_reclaim() and tm_recheckpoint() altogether has been upstreamed to apply on top of this patch. Fixes: dc3106690b20 ("powerpc: tm: Always use fp_state and vr_state to store live registers") Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/process.c | 4 ++-- arch/powerpc/kernel/traps.c | 22 +++++++++++++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index e2980a22c487..ca8c33b41989 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -874,6 +874,8 @@ static void tm_reclaim_thread(struct thread_struct *thr, if (!MSR_TM_SUSPENDED(mfmsr())) return; + giveup_all(container_of(thr, struct task_struct, thread)); + /* * If we are in a transaction and FP is off then we can't have * used FP inside that transaction. Hence the checkpointed @@ -893,8 +895,6 @@ static void tm_reclaim_thread(struct thread_struct *thr, memcpy(&thr->ckvr_state, &thr->vr_state, sizeof(struct thread_vr_state)); - giveup_all(container_of(thr, struct task_struct, thread)); - tm_reclaim(thr, thr->ckpt_regs.msr, cause); } diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 863c6858ed0f..e099e61c702b 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1663,6 +1663,12 @@ out: void fp_unavailable_tm(struct pt_regs *regs) { + /* + * Save the MSR now because tm_reclaim_current() is likely to + * change it + */ + unsigned long orig_msr = regs->msr; + /* Note: This does not handle any kind of FP laziness. */ TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n", @@ -1687,10 +1693,10 @@ void fp_unavailable_tm(struct pt_regs *regs) * If VMX is in use, the VRs now hold checkpointed values, * so we don't want to load the VRs from the thread_struct. */ - tm_recheckpoint(¤t->thread, MSR_FP); + tm_recheckpoint(¤t->thread, orig_msr | MSR_FP); /* If VMX is in use, get the transactional values back */ - if (regs->msr & MSR_VEC) { + if (orig_msr & MSR_VEC) { msr_check_and_set(MSR_VEC); load_vr_state(¤t->thread.vr_state); /* At this point all the VSX state is loaded, so enable it */ @@ -1700,6 +1706,12 @@ void fp_unavailable_tm(struct pt_regs *regs) void altivec_unavailable_tm(struct pt_regs *regs) { + /* + * Save the MSR now because tm_reclaim_current() is likely to + * change it + */ + unsigned long orig_msr = regs->msr; + /* See the comments in fp_unavailable_tm(). This function operates * the same way. */ @@ -1709,10 +1721,10 @@ void altivec_unavailable_tm(struct pt_regs *regs) regs->nip, regs->msr); tm_reclaim_current(TM_CAUSE_FAC_UNAV); current->thread.load_vec = 1; - tm_recheckpoint(¤t->thread, MSR_VEC); + tm_recheckpoint(¤t->thread, orig_msr | MSR_VEC); current->thread.used_vr = 1; - if (regs->msr & MSR_FP) { + if (orig_msr & MSR_FP) { msr_check_and_set(MSR_FP); load_fp_state(¤t->thread.fp_state); regs->msr |= MSR_VSX; @@ -1751,7 +1763,7 @@ void vsx_unavailable_tm(struct pt_regs *regs) /* This loads & recheckpoints FP and VRs; but we have * to be sure not to overwrite previously-valid state. */ - tm_recheckpoint(¤t->thread, regs->msr & ~orig_msr); + tm_recheckpoint(¤t->thread, orig_msr | MSR_FP | MSR_VEC); msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC)); -- cgit From eb5c3f1c86470fc1a57ab28cce15c12e4d6cdf8b Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Thu, 2 Nov 2017 14:09:05 +1100 Subject: powerpc: Always save/restore checkpointed regs during treclaim/trecheckpoint Lazy save and restore of FP/Altivec means that a userspace process can be sent to userspace with FP or Altivec disabled and loaded only as required (by way of an FP/Altivec unavailable exception). Transactional Memory complicates this situation as a transaction could be started without FP/Altivec being loaded up. This causes the hardware to checkpoint incorrect registers. Handling FP/Altivec unavailable exceptions while a thread is transactional requires a reclaim and recheckpoint to ensure the CPU has correct state for both sets of registers. tm_reclaim() has optimisations to not always save the FP/Altivec registers to the checkpointed save area. This was originally done because the caller might have information that the checkpointed registers aren't valid due to lazy save and restore. We've also been a little vague as to how tm_reclaim() leaves the FP/Altivec state since it doesn't necessarily always save it to the thread struct. This has lead to an (incorrect) assumption that it leaves the checkpointed state on the CPU. tm_recheckpoint() has similar optimisations in reverse. It may not always reload the checkpointed FP/Altivec registers from the thread struct before the trecheckpoint. It is therefore quite unclear where it expects to get the state from. This didn't help with the assumption made about tm_reclaim(). These optimisations sit in what is by definition a slow path. If a process has to go through a reclaim/recheckpoint then its transaction will be doomed on returning to userspace. This mean that the process will be unable to complete its transaction and be forced to its failure handler. This is already an out if line case for userspace. Furthermore, the cost of copying 64 times 128 bits from registers isn't very long[0] (at all) on modern processors. As such it appears these optimisations have only served to increase code complexity and are unlikely to have had a measurable performance impact. Our transactional memory handling has been riddled with bugs. A cause of this has been difficulty in following the code flow, code complexity has not been our friend here. It makes sense to remove these optimisations in favour of a (hopefully) more stable implementation. This patch does mean that some times the assembly will needlessly save 'junk' registers which will subsequently get overwritten with the correct value by the C code which calls the assembly function. This small inefficiency is far outweighed by the reduction in complexity for general TM code, context switching paths, and transactional facility unavailable exception handler. 0: I tried to measure it once for other work and found that it was hiding in the noise of everything else I was working with. I find it exceedingly likely this will be the case here. Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/tm.h | 5 ++-- arch/powerpc/kernel/process.c | 22 ++++++--------- arch/powerpc/kernel/signal_32.c | 2 +- arch/powerpc/kernel/signal_64.c | 2 +- arch/powerpc/kernel/tm.S | 59 ++++++++++++----------------------------- arch/powerpc/kernel/traps.c | 26 +++++------------- 6 files changed, 35 insertions(+), 81 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h index ad19fe41931b..79d4156554b4 100644 --- a/arch/powerpc/include/asm/tm.h +++ b/arch/powerpc/include/asm/tm.h @@ -11,10 +11,9 @@ extern void tm_enable(void); extern void tm_reclaim(struct thread_struct *thread, - unsigned long orig_msr, uint8_t cause); + uint8_t cause); extern void tm_reclaim_current(uint8_t cause); -extern void tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr); +extern void tm_recheckpoint(struct thread_struct *thread); extern void tm_abort(uint8_t cause); extern void tm_save_sprs(struct thread_struct *thread); extern void tm_restore_sprs(struct thread_struct *thread); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ca8c33b41989..4083b737decb 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -876,6 +876,8 @@ static void tm_reclaim_thread(struct thread_struct *thr, giveup_all(container_of(thr, struct task_struct, thread)); + tm_reclaim(thr, cause); + /* * If we are in a transaction and FP is off then we can't have * used FP inside that transaction. Hence the checkpointed @@ -894,8 +896,6 @@ static void tm_reclaim_thread(struct thread_struct *thr, if ((thr->ckpt_regs.msr & MSR_VEC) == 0) memcpy(&thr->ckvr_state, &thr->vr_state, sizeof(struct thread_vr_state)); - - tm_reclaim(thr, thr->ckpt_regs.msr, cause); } void tm_reclaim_current(uint8_t cause) @@ -946,11 +946,9 @@ out_and_saveregs: tm_save_sprs(thr); } -extern void __tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr); +extern void __tm_recheckpoint(struct thread_struct *thread); -void tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr) +void tm_recheckpoint(struct thread_struct *thread) { unsigned long flags; @@ -969,15 +967,13 @@ void tm_recheckpoint(struct thread_struct *thread, */ tm_restore_sprs(thread); - __tm_recheckpoint(thread, orig_msr); + __tm_recheckpoint(thread); local_irq_restore(flags); } static inline void tm_recheckpoint_new_task(struct task_struct *new) { - unsigned long msr; - if (!cpu_has_feature(CPU_FTR_TM)) return; @@ -996,13 +992,11 @@ static inline void tm_recheckpoint_new_task(struct task_struct *new) tm_restore_sprs(&new->thread); return; } - msr = new->thread.ckpt_regs.msr; /* Recheckpoint to restore original checkpointed register state. */ - TM_DEBUG("*** tm_recheckpoint of pid %d " - "(new->msr 0x%lx, new->origmsr 0x%lx)\n", - new->pid, new->thread.regs->msr, msr); + TM_DEBUG("*** tm_recheckpoint of pid %d (new->msr 0x%lx)\n", + new->pid, new->thread.regs->msr); - tm_recheckpoint(&new->thread, msr); + tm_recheckpoint(&new->thread); /* * The checkpointed state has been restored but the live state has diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 1dd5fa0f65fd..16d16583cf11 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -880,7 +880,7 @@ static long restore_tm_user_regs(struct pt_regs *regs, /* Make sure the transaction is marked as failed */ current->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(¤t->thread, msr); + tm_recheckpoint(¤t->thread); /* This loads the speculative FP/VEC state, if used */ msr_check_and_set(msr & (MSR_FP | MSR_VEC)); diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index 4dfdd8e56836..528ea744911c 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -552,7 +552,7 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, /* Make sure the transaction is marked as failed */ tsk->thread.tm_texasr |= TEXASR_FS; /* This loads the checkpointed FP/VEC state, if used */ - tm_recheckpoint(&tsk->thread, msr); + tm_recheckpoint(&tsk->thread); msr_check_and_set(msr & (MSR_FP | MSR_VEC)); if (msr & MSR_FP) { diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index c4ba37822ba0..d89fb0e6f9ed 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -79,15 +79,12 @@ _GLOBAL(tm_abort) blr /* void tm_reclaim(struct thread_struct *thread, - * unsigned long orig_msr, * uint8_t cause) * * - Performs a full reclaim. This destroys outstanding * transactions and updates thread->regs.tm_ckpt_* with the * original checkpointed state. Note that thread->regs is * unchanged. - * - FP regs are written back to thread->transact_fpr before - * reclaiming. These are the transactional (current) versions. * * Purpose is to both abort transactions of, and preserve the state of, * a transactions at a context switch. We preserve/restore both sets of process @@ -98,9 +95,9 @@ _GLOBAL(tm_abort) * Call with IRQs off, stacks get all out of sync for some periods in here! */ _GLOBAL(tm_reclaim) - mfcr r6 + mfcr r5 mflr r0 - stw r6, 8(r1) + stw r5, 8(r1) std r0, 16(r1) std r2, STK_GOT(r1) stdu r1, -TM_FRAME_SIZE(r1) @@ -108,7 +105,6 @@ _GLOBAL(tm_reclaim) /* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */ std r3, STK_PARAM(R3)(r1) - std r4, STK_PARAM(R4)(r1) SAVE_NVGPRS(r1) /* We need to setup MSR for VSX register save instructions. */ @@ -138,8 +134,8 @@ _GLOBAL(tm_reclaim) std r1, PACAR1(r13) /* Clear MSR RI since we are about to change r1, EE is already off. */ - li r4, 0 - mtmsrd r4, 1 + li r5, 0 + mtmsrd r5, 1 /* * BE CAREFUL HERE: @@ -151,7 +147,7 @@ _GLOBAL(tm_reclaim) * to user register state. (FPRs, CCR etc. also!) * Use an sprg and a tm_scratch in the PACA to shuffle. */ - TRECLAIM(R5) /* Cause in r5 */ + TRECLAIM(R4) /* Cause in r4 */ /* ******************** GPRs ******************** */ /* Stash the checkpointed r13 away in the scratch SPR and get the real @@ -242,40 +238,30 @@ _GLOBAL(tm_reclaim) /* ******************** FPR/VR/VSRs ************ - * After reclaiming, capture the checkpointed FPRs/VRs /if used/. - * - * (If VSX used, FP and VMX are implied. Or, we don't need to look - * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.) - * - * We're passed the thread's MSR as the second parameter + * After reclaiming, capture the checkpointed FPRs/VRs. * * We enabled VEC/FP/VSX in the msr above, so we can execute these * instructions! */ - ld r4, STK_PARAM(R4)(r1) /* Second parameter, MSR * */ mr r3, r12 - andis. r0, r4, MSR_VEC@h - beq dont_backup_vec + /* Altivec (VEC/VMX/VR)*/ addi r7, r3, THREAD_CKVRSTATE SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */ mfvscr v0 li r6, VRSTATE_VSCR stvx v0, r7, r6 -dont_backup_vec: + + /* VRSAVE */ mfspr r0, SPRN_VRSAVE std r0, THREAD_CKVRSAVE(r3) - andi. r0, r4, MSR_FP - beq dont_backup_fp - + /* Floating Point (FP) */ addi r7, r3, THREAD_CKFPSTATE SAVE_32FPRS_VSRS(0, R6, R7) /* r6 scratch, r7 transact fp state */ - mffs fr0 stfd fr0,FPSTATE_FPSCR(r7) -dont_backup_fp: /* TM regs, incl TEXASR -- these live in thread_struct. Note they've * been updated by the treclaim, to explain to userland the failure @@ -343,22 +329,19 @@ _GLOBAL(__tm_recheckpoint) */ subi r7, r7, STACK_FRAME_OVERHEAD + /* We need to setup MSR for FP/VMX/VSX register save instructions. */ mfmsr r6 - /* R4 = original MSR to indicate whether thread used FP/Vector etc. */ - - /* Enable FP/vec in MSR if necessary! */ - lis r5, MSR_VEC@h + mr r5, r6 ori r5, r5, MSR_FP - and. r5, r4, r5 - beq restore_gprs /* if neither, skip both */ - +#ifdef CONFIG_ALTIVEC + oris r5, r5, MSR_VEC@h +#endif #ifdef CONFIG_VSX BEGIN_FTR_SECTION - oris r5, r5, MSR_VSX@h + oris r5,r5, MSR_VSX@h END_FTR_SECTION_IFSET(CPU_FTR_VSX) #endif - or r5, r6, r5 /* Set MSR.FP+.VSX/.VEC */ - mtmsr r5 + mtmsrd r5 #ifdef CONFIG_ALTIVEC /* @@ -367,28 +350,20 @@ _GLOBAL(__tm_recheckpoint) * thread.fp_state[] version holds the 'live' (transactional) * and will be loaded subsequently by any FPUnavailable trap. */ - andis. r0, r4, MSR_VEC@h - beq dont_restore_vec - addi r8, r3, THREAD_CKVRSTATE li r5, VRSTATE_VSCR lvx v0, r8, r5 mtvscr v0 REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */ -dont_restore_vec: ld r5, THREAD_CKVRSAVE(r3) mtspr SPRN_VRSAVE, r5 #endif - andi. r0, r4, MSR_FP - beq dont_restore_fp - addi r8, r3, THREAD_CKFPSTATE lfd fr0, FPSTATE_FPSCR(r8) MTFSF_L(fr0) REST_32FPRS_VSRS(0, R4, R8) -dont_restore_fp: mtmsr r6 /* FP/Vec off again! */ restore_gprs: diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index e099e61c702b..4e5a543d0b74 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1693,7 +1693,7 @@ void fp_unavailable_tm(struct pt_regs *regs) * If VMX is in use, the VRs now hold checkpointed values, * so we don't want to load the VRs from the thread_struct. */ - tm_recheckpoint(¤t->thread, orig_msr | MSR_FP); + tm_recheckpoint(¤t->thread); /* If VMX is in use, get the transactional values back */ if (orig_msr & MSR_VEC) { @@ -1721,7 +1721,7 @@ void altivec_unavailable_tm(struct pt_regs *regs) regs->nip, regs->msr); tm_reclaim_current(TM_CAUSE_FAC_UNAV); current->thread.load_vec = 1; - tm_recheckpoint(¤t->thread, orig_msr | MSR_VEC); + tm_recheckpoint(¤t->thread); current->thread.used_vr = 1; if (orig_msr & MSR_FP) { @@ -1733,8 +1733,6 @@ void altivec_unavailable_tm(struct pt_regs *regs) void vsx_unavailable_tm(struct pt_regs *regs) { - unsigned long orig_msr = regs->msr; - /* See the comments in fp_unavailable_tm(). This works similarly, * though we're loading both FP and VEC registers in here. * @@ -1748,29 +1746,17 @@ void vsx_unavailable_tm(struct pt_regs *regs) current->thread.used_vsr = 1; - /* If FP and VMX are already loaded, we have all the state we need */ - if ((orig_msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC)) { - regs->msr |= MSR_VSX; - return; - } - /* This reclaims FP and/or VR regs if they're already enabled */ tm_reclaim_current(TM_CAUSE_FAC_UNAV); current->thread.load_vec = 1; current->thread.load_fp = 1; - /* This loads & recheckpoints FP and VRs; but we have - * to be sure not to overwrite previously-valid state. - */ - tm_recheckpoint(¤t->thread, orig_msr | MSR_FP | MSR_VEC); - - msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC)); + tm_recheckpoint(¤t->thread); - if (orig_msr & MSR_FP) - load_fp_state(¤t->thread.fp_state); - if (orig_msr & MSR_VEC) - load_vr_state(¤t->thread.vr_state); + msr_check_and_set(MSR_FP | MSR_VEC); + load_fp_state(¤t->thread.fp_state); + load_vr_state(¤t->thread.vr_state); } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ -- cgit From 6f700d38a8c9dc4d786ac4a831073194f17c5076 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Thu, 2 Nov 2017 14:09:06 +1100 Subject: powerpc: Remove facility loadups on transactional {fp, vec, vsx} unavailable After handling a transactional FP, Altivec or VSX unavailable exception. The return to userspace code will detect that the TIF_RESTORE_TM bit is set and call restore_tm_state(). restore_tm_state() will call restore_math() to ensure that the correct facilities are loaded. This means that all the loadup code in {fp,altivec,vsx}_unavailable_tm() is doing pointless work and can simply be removed. Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/traps.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 4e5a543d0b74..f3eb61be0d30 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1663,12 +1663,6 @@ out: void fp_unavailable_tm(struct pt_regs *regs) { - /* - * Save the MSR now because tm_reclaim_current() is likely to - * change it - */ - unsigned long orig_msr = regs->msr; - /* Note: This does not handle any kind of FP laziness. */ TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n", @@ -1694,24 +1688,10 @@ void fp_unavailable_tm(struct pt_regs *regs) * so we don't want to load the VRs from the thread_struct. */ tm_recheckpoint(¤t->thread); - - /* If VMX is in use, get the transactional values back */ - if (orig_msr & MSR_VEC) { - msr_check_and_set(MSR_VEC); - load_vr_state(¤t->thread.vr_state); - /* At this point all the VSX state is loaded, so enable it */ - regs->msr |= MSR_VSX; - } } void altivec_unavailable_tm(struct pt_regs *regs) { - /* - * Save the MSR now because tm_reclaim_current() is likely to - * change it - */ - unsigned long orig_msr = regs->msr; - /* See the comments in fp_unavailable_tm(). This function operates * the same way. */ @@ -1723,12 +1703,6 @@ void altivec_unavailable_tm(struct pt_regs *regs) current->thread.load_vec = 1; tm_recheckpoint(¤t->thread); current->thread.used_vr = 1; - - if (orig_msr & MSR_FP) { - msr_check_and_set(MSR_FP); - load_fp_state(¤t->thread.fp_state); - regs->msr |= MSR_VSX; - } } void vsx_unavailable_tm(struct pt_regs *regs) @@ -1753,10 +1727,6 @@ void vsx_unavailable_tm(struct pt_regs *regs) current->thread.load_fp = 1; tm_recheckpoint(¤t->thread); - - msr_check_and_set(MSR_FP | MSR_VEC); - load_fp_state(¤t->thread.fp_state); - load_vr_state(¤t->thread.vr_state); } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ -- cgit From b6b3755e9bec9c686a34ec81eacced0075370cbc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Nov 2017 15:13:19 +1100 Subject: powerpc: add POWER9_DD20 feature Cc: Michael Neuling Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cputable.h | 5 ++++- arch/powerpc/kernel/cputable.c | 20 ++++++++++++++++++++ arch/powerpc/kernel/dt_cpu_ftrs.c | 2 ++ 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index e188dd57bf51..1ca26ca85ec6 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -215,6 +215,7 @@ enum { #define CPU_FTR_DABRX LONG_ASM_CONST(0x0800000000000000) #define CPU_FTR_PMAO_BUG LONG_ASM_CONST(0x1000000000000000) #define CPU_FTR_POWER9_DD1 LONG_ASM_CONST(0x4000000000000000) +#define CPU_FTR_POWER9_DD20 LONG_ASM_CONST(0x8000000000000000) #ifndef __ASSEMBLY__ @@ -477,6 +478,7 @@ enum { CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300) #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \ (~CPU_FTR_SAO)) +#define CPU_FTRS_POWER9_DD20 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD20) #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ @@ -495,7 +497,8 @@ enum { (CPU_FTRS_POWER4 | CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \ CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \ CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \ - CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD1) + CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | \ + CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD20) #endif #else enum { diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 760872916013..171820190de7 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -547,6 +547,26 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early = __machine_check_early_realmode_p9, .platform = "power9", }, + { /* Power9 DD2.0 */ + .pvr_mask = 0xffffefff, + .pvr_value = 0x004e0200, + .cpu_name = "POWER9 (raw)", + .cpu_features = CPU_FTRS_POWER9_DD20, + .cpu_user_features = COMMON_USER_POWER9, + .cpu_user_features2 = COMMON_USER2_POWER9, + .mmu_features = MMU_FTRS_POWER9, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power9", + .oprofile_type = PPC_OPROFILE_INVALID, + .cpu_setup = __setup_cpu_power9, + .cpu_restore = __restore_cpu_power9, + .flush_tlb = __flush_tlb_power9, + .machine_check_early = __machine_check_early_realmode_p9, + .platform = "power9", + }, { /* Power9 */ .pvr_mask = 0xffff0000, .pvr_value = 0x004e0000, diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index c7bfcacc2cb6..a7ad181d0deb 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -735,6 +735,8 @@ static __init void cpufeatures_cpu_quirks(void) */ if ((version & 0xffffff00) == 0x004e0100) cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1; + else if ((version & 0xffffefff) == 0x004e0200) + cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD20; } static void __init cpufeatures_setup_finished(void) -- cgit From 9d2f510a66ec60411536b83dab831ee091c84307 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Nov 2017 15:13:20 +1100 Subject: powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 ERAT workaround on DD2.1 DD2.1 does not have to flush the ERAT after a state-loss idle. Performance testing was done on a DD2.1 using only the stop0 idle state (the shallowest state which supports state loss), using context_switch selftest configured to ping-poing between two threads on the same core and two different cores. Performance improvement for same core is 7.0%, different cores is 14.8%. Reviewed-by: Vaidyanathan Srinivasan Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 175d49f468af..d77766814090 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -542,7 +542,9 @@ pnv_restore_hyp_resource_arch300: * then clear bit 60 in MMCRA to ensure the PMU starts running. */ blt cr3,1f +BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20) ld r1,PACAR1(r13) mfspr r4,SPRN_MMCRA ori r4,r4,(1 << (63-60)) -- cgit From e3646330cf6667d5cced780ade80f67850c81872 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 3 Nov 2017 15:13:21 +1100 Subject: powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 PMU workaround on DD2.1 DD2.1 does not have to save MMCR0 for all state-loss idle states, only after deep idle states (like other PMU registers). Reviewed-by: Vaidyanathan Srinivasan Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index d77766814090..59657783d1d5 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -112,12 +112,14 @@ power9_save_additional_sprs: std r4, STOP_HFSCR(r13) mfspr r3, SPRN_MMCRA - mfspr r4, SPRN_MMCR1 + mfspr r4, SPRN_MMCR0 std r3, STOP_MMCRA(r13) - std r4, STOP_MMCR1(r13) + std r4, _MMCR0(r1) - mfspr r3, SPRN_MMCR2 - std r3, STOP_MMCR2(r13) + mfspr r3, SPRN_MMCR1 + mfspr r4, SPRN_MMCR2 + std r3, STOP_MMCR1(r13) + std r4, STOP_MMCR2(r13) blr power9_restore_additional_sprs: @@ -135,11 +137,14 @@ power9_restore_additional_sprs: ld r4, STOP_MMCRA(r13) mtspr SPRN_HFSCR, r3 mtspr SPRN_MMCRA, r4 - /* We have already restored PACA_MMCR0 */ - ld r3, STOP_MMCR1(r13) - ld r4, STOP_MMCR2(r13) - mtspr SPRN_MMCR1, r3 - mtspr SPRN_MMCR2, r4 + + ld r3, _MMCR0(r1) + ld r4, STOP_MMCR1(r13) + mtspr SPRN_MMCR0, r3 + mtspr SPRN_MMCR1, r4 + + ld r3, STOP_MMCR2(r13) + mtspr SPRN_MMCR2, r3 blr /* @@ -350,6 +355,7 @@ power_enter_stop: b pnv_wakeup_noloss .Lhandle_esl_ec_set: +BEGIN_FTR_SECTION /* * POWER9 DD2 can incorrectly set PMAO when waking up after a * state-loss idle. Saving and restoring MMCR0 over idle is a @@ -357,6 +363,7 @@ power_enter_stop: */ mfspr r4,SPRN_MMCR0 std r4,_MMCR0(r1) +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20) /* * Check if the requested state is a deep idle state. @@ -544,15 +551,15 @@ pnv_restore_hyp_resource_arch300: blt cr3,1f BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT -END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20) ld r1,PACAR1(r13) + ld r4,_MMCR0(r1) + mtspr SPRN_MMCR0,r4 +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20) mfspr r4,SPRN_MMCRA ori r4,r4,(1 << (63-60)) mtspr SPRN_MMCRA,r4 xori r4,r4,(1 << (63-60)) mtspr SPRN_MMCRA,r4 - ld r4,_MMCR0(r1) - mtspr SPRN_MMCR0,r4 1: /* * POWER ISA 3. Use PSSCR to determine if we -- cgit From cd77b5ce208c153260ed7882d8910f2395bfaabd Mon Sep 17 00:00:00 2001 From: Shriya Date: Fri, 13 Oct 2017 10:06:41 +0530 Subject: powerpc/powernv/cpufreq: Fix the frequency read by /proc/cpuinfo The call to /proc/cpuinfo in turn calls cpufreq_quick_get() which returns the last frequency requested by the kernel, but may not reflect the actual frequency the processor is running at. This patch makes a call to cpufreq_get() instead which returns the current frequency reported by the hardware. Fixes: fb5153d05a7d ("powerpc: powernv: Implement ppc_md.get_proc_freq()") Signed-off-by: Shriya Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index d23f148a11f0..62f4a5ad8594 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -335,7 +335,7 @@ static unsigned long pnv_get_proc_freq(unsigned int cpu) { unsigned long ret_freq; - ret_freq = cpufreq_quick_get(cpu) * 1000ul; + ret_freq = cpufreq_get(cpu) * 1000ul; /* * If the backend cpufreq driver does not exist, -- cgit From 9003a249815a15704f415954039d1c7ea27da9ad Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 7 Nov 2017 14:43:01 +1100 Subject: powerpc/powernv/ioda: Remove explicit max window size check DMA windows can only have a size of power of two on IODA2 hardware and using memory_hotplug_max() to determine the upper limit won't work correcly if it returns not power of two value. This removes the check as the platform code does this check in pnv_pci_ioda2_setup_default_config() anyway; the other client is VFIO and that thing checks against locked_vm limit which prevents the userspace from locking too much memory. It is expected to impact DPDK on machines with non-power-of-two RAM size, mostly. KVM guests are less likely to be affected as usually guests get less than half of hosts RAM. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7e87867984e7..749055553064 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2797,7 +2797,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; - if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) + if (!is_power_of_2(window_size)) return -EINVAL; /* Adjust direct table size from window_size and levels */ -- cgit From bf751e30b41cdf2568cccaf5de4156a3d78683df Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Tue, 11 Jul 2017 13:26:26 +0200 Subject: powerpc/mm/hash: Remove stale comment. In commit e6f81a92015b ("powerpc/mm/hash: Support 68 bit VA") the masking is folded into ASM_VSID_SCRAMBLE but the comment about masking is removed only from the firt use of ASM_VSID_SCRAMBLE. Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slb_low.S | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 906a86fe457b..ed60ad861dfa 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -309,10 +309,6 @@ slb_compare_rr_to_size: srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ rldimi r10,r9,ESID_BITS_1T,0 ASM_VSID_SCRAMBLE(r10,r9,r11,1T) - /* - * bits above VSID_BITS_1T need to be ignored from r10 - * also combine VSID and flags - */ li r10,MMU_SEGSIZE_1T rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ -- cgit From 80eff6c484799722736471d15ff9cc86b64cae7a Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 30 Oct 2017 22:01:12 +1100 Subject: powerpc/xmon: Support dumping software pagetables It would be nice to be able to dump page tables in a particular context. eg: dumping vmalloc space: 0:mon> dv 0xd00037fffff00000 pgd @ 0xc0000000017c0000 pgdp @ 0xc0000000017c00d8 = 0x00000000f10b1000 pudp @ 0xc0000000f10b13f8 = 0x00000000f10d0000 pmdp @ 0xc0000000f10d1ff8 = 0x00000000f1102000 ptep @ 0xc0000000f1102780 = 0xc0000000f1ba018e Maps physical address = 0x00000000f1ba0000 Flags = Accessed Dirty Read Write This patch does not replicate the complex code of dump_pagetable and has no support for bolted linear mapping, thats why I've it's called dump virtual page table support. The format of the PTE can be expanded even further to add more useful information about the flags in the PTE if required. Signed-off-by: Balbir Singh [mpe: Bike shed the output format, show the pgdir, fix build failures] Signed-off-by: Michael Ellerman --- arch/powerpc/xmon/xmon.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 6b1236b85622..1b2d8cb49abb 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -127,6 +128,7 @@ static void byterev(unsigned char *, int); static void memex(void); static int bsesc(void); static void dump(void); +static void show_pte(unsigned long); static void prdump(unsigned long, long); static int ppc_inst_dump(unsigned long, long, int); static void dump_log_buf(void); @@ -234,6 +236,7 @@ Commands:\n\ #endif "\ dr dump stream of raw bytes\n\ + dv dump virtual address translation \n\ dt dump the tracing buffers (uses printk)\n\ dtc dump the tracing buffers for current CPU (uses printk)\n\ " @@ -2613,6 +2616,9 @@ dump(void) dump_log_buf(); } else if (c == 'o') { dump_opal_msglog(); + } else if (c == 'v') { + /* dump virtual to physical translation */ + show_pte(adrs); } else if (c == 'r') { scanhex(&ndump); if (ndump == 0) @@ -2946,6 +2952,116 @@ static void show_task(struct task_struct *tsk) tsk->comm); } +#ifdef CONFIG_PPC_BOOK3S_64 +void format_pte(void *ptep, unsigned long pte) +{ + printf("ptep @ 0x%016lx = 0x%016lx\n", (unsigned long)ptep, pte); + printf("Maps physical address = 0x%016lx\n", pte & PTE_RPN_MASK); + + printf("Flags = %s%s%s%s%s\n", + (pte & _PAGE_ACCESSED) ? "Accessed " : "", + (pte & _PAGE_DIRTY) ? "Dirty " : "", + (pte & _PAGE_READ) ? "Read " : "", + (pte & _PAGE_WRITE) ? "Write " : "", + (pte & _PAGE_EXEC) ? "Exec " : ""); +} + +static void show_pte(unsigned long addr) +{ + unsigned long tskv = 0; + struct task_struct *tsk = NULL; + struct mm_struct *mm; + pgd_t *pgdp, *pgdir; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + if (!scanhex(&tskv)) + mm = &init_mm; + else + tsk = (struct task_struct *)tskv; + + if (tsk == NULL) + mm = &init_mm; + else + mm = tsk->active_mm; + + if (setjmp(bus_error_jmp) != 0) { + catch_memory_errors = 0; + printf("*** Error dumping pte for task %p\n", tsk); + return; + } + + catch_memory_errors = 1; + sync(); + + if (mm == &init_mm) { + pgdp = pgd_offset_k(addr); + pgdir = pgd_offset_k(0); + } else { + pgdp = pgd_offset(mm, addr); + pgdir = pgd_offset(mm, 0); + } + + if (pgd_none(*pgdp)) { + printf("no linux page table for address\n"); + return; + } + + printf("pgd @ 0x%016lx\n", pgdir); + + if (pgd_huge(*pgdp)) { + format_pte(pgdp, pgd_val(*pgdp)); + return; + } + printf("pgdp @ 0x%016lx = 0x%016lx\n", pgdp, pgd_val(*pgdp)); + + pudp = pud_offset(pgdp, addr); + + if (pud_none(*pudp)) { + printf("No valid PUD\n"); + return; + } + + if (pud_huge(*pudp)) { + format_pte(pudp, pud_val(*pudp)); + return; + } + + printf("pudp @ 0x%016lx = 0x%016lx\n", pudp, pud_val(*pudp)); + + pmdp = pmd_offset(pudp, addr); + + if (pmd_none(*pmdp)) { + printf("No valid PMD\n"); + return; + } + + if (pmd_huge(*pmdp)) { + format_pte(pmdp, pmd_val(*pmdp)); + return; + } + printf("pmdp @ 0x%016lx = 0x%016lx\n", pmdp, pmd_val(*pmdp)); + + ptep = pte_offset_map(pmdp, addr); + if (pte_none(*ptep)) { + printf("no valid PTE\n"); + return; + } + + format_pte(ptep, pte_val(*ptep)); + + sync(); + __delay(200); + catch_memory_errors = 0; +} +#else +static void show_pte(unsigned long addr) +{ + printf("show_pte not yet implemented\n"); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + static void show_tasks(void) { unsigned long tskv; -- cgit From 14001c60939a754717893672209160cec1e2ac18 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Nov 2017 18:53:05 +1100 Subject: powerpc/64s/radix: Optimize TLB range flush barriers Short range flushes issue a sequences of tlbie(l) instructions for individual effective addresses. These do not all require individual barrier sequences, only one covering all tlbie(l) instructions. Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync") made a similar optimization for tlbiel for PID flushing. For tlbie, the ISA says: The tlbsync instruction provides an ordering function for the effects of all tlbie instructions executed by the thread executing the tlbsync instruction, with respect to the memory barrier created by a subsequent ptesync instruction executed by the same thread. Time to munmap 30 pages of memory (after mmap, touch): local global vanilla 10.9us 22.3us patched 3.4us 14.4us Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 63e277b6e60c..22b657e4b01a 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -84,7 +84,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void _tlbiel_va(unsigned long va, unsigned long pid, +static inline void __tlbiel_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -95,14 +95,20 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid, prs = 1; /* process scoped */ r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - asm volatile("ptesync": : :"memory"); trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void _tlbie_va(unsigned long va, unsigned long pid, +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) +{ + asm volatile("ptesync": : :"memory"); + __tlbiel_va(va, pid, ap, ric); + asm volatile("ptesync": : :"memory"); +} + +static inline void __tlbie_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -113,13 +119,20 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, prs = 1; /* process scoped */ r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); trace_tlbie(0, 0, rb, rs, ric, prs, r); } +static inline void _tlbie_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) +{ + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, ric); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + + /* * Base TLB flushing operations: * @@ -341,13 +354,17 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, else _tlbie_pid(pid, RIC_FLUSH_TLB); } else { + asm volatile("ptesync": : :"memory"); for (addr = start; addr < end; addr += page_size) { - if (local) - _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); else - _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); } + if (local) + asm volatile("ptesync": : :"memory"); + else + asm volatile("eieio; tlbsync; ptesync": : :"memory"); } preempt_enable(); } @@ -378,6 +395,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) _tlbie_pid(pid, RIC_FLUSH_PWC); /* Then iterate the pages */ + asm volatile("ptesync": : :"memory"); end = addr + HPAGE_PMD_SIZE; for (; addr < end; addr += PAGE_SIZE) { if (local) @@ -386,6 +404,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); } + if (local) + asm volatile("ptesync": : :"memory"); + else + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + preempt_enable(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit From d665767e39fa4a9e725f92d77ba2060c5ce273dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Nov 2017 18:53:06 +1100 Subject: powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions Move the barriers and range iteration down into the _tlbie* level, which improves readability. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 69 +++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 30 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 22b657e4b01a..9916ea2fff43 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -85,7 +85,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) } static inline void __tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -101,13 +101,28 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid, } static inline void _tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) + unsigned long psize, unsigned long ric) { + unsigned long ap = mmu_get_ap(psize); + asm volatile("ptesync": : :"memory"); __tlbiel_va(va, pid, ap, ric); asm volatile("ptesync": : :"memory"); } +static inline void _tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + for (addr = start; addr < end; addr += page_size) + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + asm volatile("ptesync": : :"memory"); +} + static inline void __tlbie_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { @@ -125,13 +140,27 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid, } static inline void _tlbie_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) + unsigned long psize, unsigned long ric) { + unsigned long ap = mmu_get_ap(psize); + asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, ap, ric); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +static inline void _tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + for (addr = start; addr < end; addr += page_size) + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} /* * Base TLB flushing operations: @@ -174,12 +203,11 @@ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmadd int psize) { unsigned long pid; - unsigned long ap = mmu_get_ap(psize); preempt_disable(); pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -239,7 +267,6 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { unsigned long pid; - unsigned long ap = mmu_get_ap(psize); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) @@ -247,9 +274,9 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, preempt_disable(); if (!mm_is_thread_local(mm)) - _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); else - _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -336,9 +363,7 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize) { unsigned long pid; - unsigned long addr; bool local; - unsigned long ap = mmu_get_ap(psize); unsigned long page_size = 1UL << mmu_psize_defs[psize].shift; pid = mm->context.id; @@ -354,17 +379,10 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, else _tlbie_pid(pid, RIC_FLUSH_TLB); } else { - asm volatile("ptesync": : :"memory"); - for (addr = start; addr < end; addr += page_size) { - if (local) - __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); - else - __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); - } if (local) - asm volatile("ptesync": : :"memory"); + _tlbiel_va_range(start, end, pid, page_size, psize); else - asm volatile("eieio; tlbsync; ptesync": : :"memory"); + _tlbie_va_range(start, end, pid, page_size, psize); } preempt_enable(); } @@ -372,7 +390,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, #ifdef CONFIG_TRANSPARENT_HUGEPAGE void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) { - unsigned long ap = mmu_get_ap(mmu_virtual_psize); unsigned long pid, end; bool local; @@ -395,19 +412,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) _tlbie_pid(pid, RIC_FLUSH_PWC); /* Then iterate the pages */ - asm volatile("ptesync": : :"memory"); end = addr + HPAGE_PMD_SIZE; - for (; addr < end; addr += PAGE_SIZE) { - if (local) - _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); - else - _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); - } - if (local) - asm volatile("ptesync": : :"memory"); + _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize); else - asm volatile("eieio; tlbsync; ptesync": : :"memory"); + _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize); preempt_enable(); } -- cgit From cbf09c837720f72f5e63ab7a2d331ec6cc9a3417 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Nov 2017 18:53:07 +1100 Subject: powerpc/64s/radix: Optimize flush_tlb_range Currently for radix, flush_tlb_range flushes the entire PID, because the Linux mm code does not tell us about page size here for THP vs regular pages. This is quite sub-optimal for small mremap / mprotect / change_protection. So implement va range flushes with two flush passes, one for each page size (regular and THP). The second flush has an order of matnitude fewer tlbie instructions than the first, so it is a relatively small additional cost. There is still room for improvement here with some changes to generic APIs, particularly if there are mostly THP pages to be invalidated, the small page flushes could be reduced. Time to mprotect 1 page of memory (after mmap, touch): vanilla 2.9us 1.8us patched 1.2us 1.6us Time to mprotect 30 pages of memory (after mmap, touch): vanilla 8.2us 7.2us patched 6.9us 17.9us Time to mprotect 34 pages of memory (after mmap, touch): vanilla 9.1us 8.0us patched 9.0us 8.0us 34 pages is the point at which the invalidation switches from va to entire PID, which tlbie can do in a single instruction. This is why in the case of 30 pages, the new code runs slower for this test. This is a deliberate tradeoff already present in the unmap and THP promotion code, the idea is that the benefit from avoiding flushing entire TLB for this PID on all threads in the system. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 138 +++++++++++++++++++++++++++++++++----------- 1 file changed, 103 insertions(+), 35 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 9916ea2fff43..b4b49de551a9 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -100,6 +100,17 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid, trace_tlbie(0, 1, rb, rs, ric, prs, r); } +static inline void __tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); +} + static inline void _tlbiel_va(unsigned long va, unsigned long pid, unsigned long psize, unsigned long ric) { @@ -114,12 +125,8 @@ static inline void _tlbiel_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, unsigned long psize) { - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - asm volatile("ptesync": : :"memory"); - for (addr = start; addr < end; addr += page_size) - __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + __tlbiel_va_range(start, end, pid, page_size, psize); asm volatile("ptesync": : :"memory"); } @@ -139,6 +146,17 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid, trace_tlbie(0, 0, rb, rs, ric, prs, r); } +static inline void __tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); +} + static inline void _tlbie_va(unsigned long va, unsigned long pid, unsigned long psize, unsigned long ric) { @@ -153,12 +171,8 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, unsigned long psize) { - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - asm volatile("ptesync": : :"memory"); - for (addr = start; addr < end; addr += page_size) - __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + __tlbie_va_range(start, end, pid, page_size, psize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -300,17 +314,78 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); +#define TLB_FLUSH_ALL -1UL + /* - * Currently, for range flushing, we just do a full mm flush. Because - * we use this in code path where we don' track the page size. + * Number of pages above which we invalidate the entire PID rather than + * flush individual pages, for local and global flushes respectively. + * + * tlbie goes out to the interconnect and individual ops are more costly. + * It also does not iterate over sets like the local tlbiel variant when + * invalidating a full PID, so it has a far lower threshold to change from + * individual page flushes to full-pid flushes. */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; + void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; + unsigned long pid; + unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; + +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) + return radix__flush_hugetlb_tlb_range(vma, start, end); +#endif - radix__flush_tlb_mm(mm); + pid = mm->context.id; + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + preempt_disable(); + local = mm_is_thread_local(mm); + full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling); + + if (full) { + if (local) + _tlbiel_pid(pid, RIC_FLUSH_TLB); + else + _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + bool hflush = false; + unsigned long hstart, hend; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT; + hend = end >> HPAGE_PMD_SHIFT; + if (hstart < hend) { + hstart <<= HPAGE_PMD_SHIFT; + hend <<= HPAGE_PMD_SHIFT; + hflush = true; + } +#endif + + asm volatile("ptesync": : :"memory"); + if (local) { + __tlbiel_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbiel_va_range(hstart, hend, pid, + HPAGE_PMD_SIZE, MMU_PAGE_2M); + asm volatile("ptesync": : :"memory"); + } else { + __tlbie_va_range(start, end, pid, page_size, mmu_virtual_psize); + if (hflush) + __tlbie_va_range(hstart, hend, pid, + HPAGE_PMD_SIZE, MMU_PAGE_2M); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); + } + } + preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_range); @@ -352,19 +427,14 @@ void radix__tlb_flush(struct mmu_gather *tlb) radix__flush_tlb_mm(mm); } -#define TLB_FLUSH_ALL -1UL -/* - * Number of pages above which we will do a bcast tlbie. Just a - * number at this point copied from x86 - */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; - void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize) { unsigned long pid; - bool local; - unsigned long page_size = 1UL << mmu_psize_defs[psize].shift; + unsigned int page_shift = mmu_psize_defs[psize].shift; + unsigned long page_size = 1UL << page_shift; + unsigned long nr_pages = (end - start) >> page_shift; + bool local, full; pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) @@ -372,8 +442,9 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, preempt_disable(); local = mm_is_thread_local(mm); - if (end == TLB_FLUSH_ALL || - (end - start) > tlb_single_page_flush_ceiling * page_size) { + full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling); + + if (full) { if (local) _tlbiel_pid(pid, RIC_FLUSH_TLB); else @@ -391,7 +462,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) { unsigned long pid, end; - bool local; pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) @@ -403,20 +473,18 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) return; } + end = addr + HPAGE_PMD_SIZE; + + /* Otherwise first do the PWC, then iterate the pages. */ preempt_disable(); - local = mm_is_thread_local(mm); - /* Otherwise first do the PWC */ - if (local) - _tlbiel_pid(pid, RIC_FLUSH_PWC); - else - _tlbie_pid(pid, RIC_FLUSH_PWC); - /* Then iterate the pages */ - end = addr + HPAGE_PMD_SIZE; - if (local) + if (mm_is_thread_local(mm)) { + _tlbiel_pid(pid, RIC_FLUSH_PWC); _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize); - else + } else { + _tlbie_pid(pid, RIC_FLUSH_PWC); _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize); + } preempt_enable(); } -- cgit From f6f27951fdf84a6edca3ea14077268ad629b57ac Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Nov 2017 18:53:08 +1100 Subject: powerpc/64s/radix: Introduce local single page ceiling for TLB range flush The single page flush ceiling is the cut-off point at which we switch from invalidating individual pages, to invalidating the entire process address space in response to a range flush. Introduce a local variant of this heuristic because local and global tlbie have significantly different properties: - Local tlbiel requires 128 instructions to invalidate a PID, global tlbie only 1 instruction. - Global tlbie instructions are expensive broadcast operations. The local ceiling has been made much higher, 2x the number of instructions required to invalidate the entire PID (i.e., 256 pages). Time to mprotect N pages of memory (after mmap, touch), local invalidate: N 32 34 64 128 256 512 vanilla 7.4us 9.0us 14.6us 26.4us 50.2us 98.3us patched 7.4us 7.8us 13.8us 26.4us 51.9us 98.3us The behaviour of both is identical at N=32 and N=512. Between there, the vanilla kernel does a PID invalidate and the patched kernel does a va range invalidate. At N=128, these require the same number of tlbiel instructions, so the patched version can be sen to be cheaper when < 128, and more expensive when > 128. However this does not well capture the cost of invalidated TLB. The additional cost at 256 pages does not seem prohibitive. It may be the case that increasing the limit further would continue to be beneficial to avoid invalidating all of the process's TLB entries. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index b4b49de551a9..cfa08da534a7 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -326,6 +326,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); * individual page flushes to full-pid flushes. */ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) @@ -348,8 +349,15 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, return; preempt_disable(); - local = mm_is_thread_local(mm); - full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling); + if (mm_is_thread_local(mm)) { + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } else { + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } if (full) { if (local) @@ -441,8 +449,15 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, return; preempt_disable(); - local = mm_is_thread_local(mm); - full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling); + if (mm_is_thread_local(mm)) { + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } else { + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } if (full) { if (local) -- cgit From 0b2f5a8a792755c88bd786f89712a9fac9967b2b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 7 Nov 2017 18:53:09 +1100 Subject: powerpc/64s/radix: Improve TLB flushing for page table freeing Unmaps that free page tables always flush the entire PID, which is sub-optimal. Provide TLB range flushing with an additional PWC flush that can be use for va range invalidations with PWC flush. Time to munmap N pages of memory including last level page table teardown (after mmap, touch), local invalidate: N 1 2 4 8 16 32 64 vanilla 3.2us 3.3us 3.4us 3.6us 4.1us 5.2us 7.2us patched 1.4us 1.5us 1.7us 1.9us 2.6us 3.7us 6.2us Global invalidate: N 1 2 4 8 16 32 64 vanilla 2.2us 2.3us 2.4us 2.6us 3.2us 4.1us 6.2us patched 2.1us 2.5us 3.4us 5.2us 8.7us 15.7us 6.2us Local invalidates get much better across the board. Global ones have the same issue where multiple tlbies for va flush do get slower than the single tlbie to invalidate the PID. None of this test captures the TLB benefits of avoiding killing everything. Global gets worse, but it is brought in to line with global invalidate for munmap()s that do not free page tables. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/mm/tlb-radix.c | 90 ++++++++++++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 29 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index cfa08da534a7..884f4b705b57 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -39,6 +39,20 @@ static inline void __tlbiel_pid(unsigned long pid, int set, trace_tlbie(0, 1, rb, rs, ric, prs, r); } +static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + /* * We use 128 set in radix mode and 256 set in hpt mode. */ @@ -70,18 +84,9 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) static inline void _tlbie_pid(unsigned long pid, unsigned long ric) { - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(53); /* IS = 1 */ - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) - : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + __tlbie_pid(pid, ric); asm volatile("eieio; tlbsync; ptesync": : :"memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); } static inline void __tlbiel_va(unsigned long va, unsigned long pid, @@ -123,9 +128,11 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid, static inline void _tlbiel_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, - unsigned long psize) + unsigned long psize, bool also_pwc) { asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); __tlbiel_va_range(start, end, pid, page_size, psize); asm volatile("ptesync": : :"memory"); } @@ -169,9 +176,11 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, static inline void _tlbie_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, - unsigned long psize) + unsigned long psize, bool also_pwc) { asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbie_pid(pid, RIC_FLUSH_PWC); __tlbie_va_range(start, end, pid, page_size, psize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -412,13 +421,15 @@ static int radix_get_mmu_psize(int page_size) return psize; } +static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize); + void radix__tlb_flush(struct mmu_gather *tlb) { int psize = 0; struct mm_struct *mm = tlb->mm; int page_size = tlb->page_size; - psize = radix_get_mmu_psize(page_size); /* * if page size is not something we understand, do a full mm flush * @@ -426,17 +437,28 @@ void radix__tlb_flush(struct mmu_gather *tlb) * that flushes the process table entry cache upon process teardown. * See the comment for radix in arch_exit_mmap(). */ - if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) - radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); - else if (tlb->fullmm || tlb->need_flush_all) { - tlb->need_flush_all = 0; + if (tlb->fullmm) { radix__flush_all_mm(mm); - } else - radix__flush_tlb_mm(mm); + } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) { + if (!tlb->need_flush_all) + radix__flush_tlb_mm(mm); + else + radix__flush_all_mm(mm); + } else { + unsigned long start = tlb->start; + unsigned long end = tlb->end; + + if (!tlb->need_flush_all) + radix__flush_tlb_range_psize(mm, start, end, psize); + else + radix__flush_tlb_pwc_range_psize(mm, start, end, psize); + } + tlb->need_flush_all = 0; } -void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, - unsigned long end, int psize) +static inline void __radix__flush_tlb_range_psize(struct mm_struct *mm, + unsigned long start, unsigned long end, + int psize, bool also_pwc) { unsigned long pid; unsigned int page_shift = mmu_psize_defs[psize].shift; @@ -461,18 +483,30 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, if (full) { if (local) - _tlbiel_pid(pid, RIC_FLUSH_TLB); + _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : RIC_FLUSH_TLB); else - _tlbie_pid(pid, RIC_FLUSH_TLB); + _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL: RIC_FLUSH_TLB); } else { if (local) - _tlbiel_va_range(start, end, pid, page_size, psize); + _tlbiel_va_range(start, end, pid, page_size, psize, also_pwc); else - _tlbie_va_range(start, end, pid, page_size, psize); + _tlbie_va_range(start, end, pid, page_size, psize, also_pwc); } preempt_enable(); } +void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + return __radix__flush_tlb_range_psize(mm, start, end, psize, false); +} + +static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize) +{ + __radix__flush_tlb_range_psize(mm, start, end, psize, true); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) { @@ -494,11 +528,9 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) preempt_disable(); if (mm_is_thread_local(mm)) { - _tlbiel_pid(pid, RIC_FLUSH_PWC); - _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize); + _tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } else { - _tlbie_pid(pid, RIC_FLUSH_PWC); - _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize); + _tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize, true); } preempt_enable(); -- cgit From 339a3293f4e493a6c40f71e4faab0c8389174313 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 23 Oct 2017 18:05:07 +1000 Subject: powerpc/powernv: Avoid waiting for secondary hold spinloop with OPAL OPAL boot does not insert secondaries at 0x60 to wait at the secondary hold spinloop. Instead they are started later, and inserted at generic_secondary_smp_init(), which is after the secondary hold spinloop. Avoid waiting on this spinloop when booting with OPAL firmware. This wait always times out that case. This saves 100ms boot time on powernv, and 10s of seconds of real time when booting on the simulator in SMP. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/head_64.S | 16 +++++++++++----- arch/powerpc/kernel/setup_64.c | 10 +++++++++- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index ff8511d6d8ea..aa71a90f5222 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -55,12 +55,18 @@ * * For pSeries or server processors: * 1. The MMU is off & open firmware is running in real mode. - * 2. The kernel is entered at __start + * 2. The primary CPU enters at __start. + * 3. If the RTAS supports "query-cpu-stopped-state", then secondary + * CPUs will enter as directed by "start-cpu" RTAS call, which is + * generic_secondary_smp_init, with PIR in r3. + * 4. Else the secondary CPUs will enter at secondary_hold (0x60) as + * directed by the "start-cpu" RTS call, with PIR in r3. * -or- For OPAL entry: - * 1. The MMU is off, processor in HV mode, primary CPU enters at 0 - * with device-tree in gpr3. We also get OPAL base in r8 and - * entry in r9 for debugging purposes - * 2. Secondary processors enter at 0x60 with PIR in gpr3 + * 1. The MMU is off, processor in HV mode. + * 2. The primary CPU enters at 0 with device-tree in r3, OPAL base + * in r8, and entry in r9 for debugging purposes. + * 3. Secondary CPUs enter as directed by OPAL_START_CPU call, which + * is at generic_secondary_smp_init, with PIR in r3. * * For Book3E processors: * 1. The MMU is on running in AS0 in a state defined in ePAPR diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index b89c6aac48c9..87b4fe75fd45 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -360,8 +360,16 @@ void early_setup_secondary(void) #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE) static bool use_spinloop(void) { - if (!IS_ENABLED(CONFIG_PPC_BOOK3E)) + if (IS_ENABLED(CONFIG_PPC_BOOK3S)) { + /* + * See comments in head_64.S -- not all platforms insert + * secondaries at __secondary_hold and wait at the spin + * loop. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) + return false; return true; + } /* * When book3e boots from kexec, the ePAPR spin table does -- cgit From 1696d0fb7fcd18160c9cc92a3f2b2d68e6923dd8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 24 Oct 2017 21:44:44 +1000 Subject: powerpc/64: Set DSCR default initially from SPR Take the DSCR value set by firmware as the dscr_default value, rather than zero. POWER9 recommends DSCR default to a non-zero value. Signed-off-by: From: Nicholas Piggin [mpe: Make record_spr_defaults() __init] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup.h | 6 ++++++ arch/powerpc/kernel/setup_64.c | 9 +++++++++ arch/powerpc/kernel/sysfs.c | 11 +++++++++++ 3 files changed, 26 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/setup.h b/arch/powerpc/kernel/setup.h index cfba134b3024..21c18071d9d5 100644 --- a/arch/powerpc/kernel/setup.h +++ b/arch/powerpc/kernel/setup.h @@ -45,6 +45,12 @@ void emergency_stack_init(void); static inline void emergency_stack_init(void) { }; #endif +#ifdef CONFIG_PPC64 +void record_spr_defaults(void); +#else +static inline void record_spr_defaults(void) { }; +#endif + /* * Having this in kvm_ppc.h makes include dependencies too * tricky to solve for setup-common.c so have it here. diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 87b4fe75fd45..8956a9856604 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -69,6 +69,8 @@ #include #include +#include "setup.h" + #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) #else @@ -316,6 +318,13 @@ void __init early_setup(unsigned long dt_ptr) /* Initialize the hash table or TLB handling */ early_init_mmu(); + /* + * After firmware and early platform setup code has set things up, + * we note the SPR values for configurable control/performance + * registers, and use those as initial defaults. + */ + record_spr_defaults(); + /* * At this point, we can let interrupts switch to virtual mode * (the MMU has been setup), so adjust the MSR in the PACA to diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 4437c70c7c2b..b8d4a1dac39f 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -590,6 +590,17 @@ static void sysfs_create_dscr_default(void) if (cpu_has_feature(CPU_FTR_DSCR)) err = device_create_file(cpu_subsys.dev_root, &dev_attr_dscr_default); } + +void __init record_spr_defaults(void) +{ + int cpu; + + if (cpu_has_feature(CPU_FTR_DSCR)) { + dscr_default = mfspr(SPRN_DSCR); + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + paca[cpu].dscr_default = dscr_default; + } +} #endif /* CONFIG_PPC64 */ #ifdef HAS_PPC_PMC_PA6T -- cgit From e34917fbee1226144c94413697ddbf2d5b06d0d3 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:41 -0800 Subject: powerpc/vas: init missing fields from [rt]xattr Initialize a few missing window context fields from the window attributes specified by the caller. These fields are currently set to their default values by the caller (NX-842), but would be good to apply them anyway. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 5aae845b8cd9..cec7ab7119df 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -679,10 +679,13 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->nx_win = rxattr->nx_win; winctx->fault_win = rxattr->fault_win; + winctx->user_win = rxattr->user_win; + winctx->rej_no_credit = rxattr->rej_no_credit; winctx->rx_word_mode = rxattr->rx_win_ord_mode; winctx->tx_word_mode = rxattr->tx_win_ord_mode; winctx->rx_wcred_mode = rxattr->rx_wcred_mode; winctx->tx_wcred_mode = rxattr->tx_wcred_mode; + winctx->notify_early = rxattr->notify_early; if (winctx->nx_win) { winctx->data_stamp = true; @@ -889,11 +892,14 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->user_win = txattr->user_win; winctx->nx_win = txwin->rxwin->nx_win; winctx->pin_win = txattr->pin_win; + winctx->rej_no_credit = txattr->rej_no_credit; + winctx->rsvd_txbuf_enable = txattr->rsvd_txbuf_enable; winctx->rx_wcred_mode = txattr->rx_wcred_mode; winctx->tx_wcred_mode = txattr->tx_wcred_mode; winctx->rx_word_mode = txattr->rx_win_ord_mode; winctx->tx_word_mode = txattr->tx_win_ord_mode; + winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count; if (winctx->nx_win) { winctx->data_stamp = true; -- cgit From 51b537124fc24074aee67cae9ca94ec4d9c204fc Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:42 -0800 Subject: powerpc/vas: Validate window credits NX-842, the only user of VAS, sets the window credits to default values but VAS should check the credits against the possible max values. The VAS_WCREDS_MIN is not needed and can be dropped. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 6 ++++++ arch/powerpc/platforms/powernv/vas.h | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index cec7ab7119df..a2fe120ac06d 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -738,6 +738,9 @@ static bool rx_win_args_valid(enum vas_cop_type cop, if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) return false; + if (attr->wcreds_max > VAS_RX_WCREDS_MAX) + return false; + if (attr->nx_win) { /* cannot be fault or user window if it is nx */ if (attr->fault_win || attr->user_win) @@ -927,6 +930,9 @@ static bool tx_win_args_valid(enum vas_cop_type cop, if (cop > VAS_COP_TYPE_MAX) return false; + if (attr->wcreds_max > VAS_TX_WCREDS_MAX) + return false; + if (attr->user_win && (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count)) return false; diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 38dee5d50f31..fea0de44f076 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -106,8 +106,8 @@ * * TODO: Needs tuning for per-process credits */ -#define VAS_WCREDS_MIN 16 -#define VAS_WCREDS_MAX ((64 << 10) - 1) +#define VAS_RX_WCREDS_MAX ((64 << 10) - 1) +#define VAS_TX_WCREDS_MAX ((4 << 10) - 1) #define VAS_WCREDS_DEFAULT (1 << 10) /* -- cgit From 0a2c2c24cf78473da785654361ec957f129f4820 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:43 -0800 Subject: powerpc/vas: Cleanup some debug code Clean up vas.h and the debug code around ifdef vas_debug. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 8 +++-- arch/powerpc/platforms/powernv/vas.h | 54 ++++++----------------------- 2 files changed, 17 insertions(+), 45 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index a2fe120ac06d..67ffc5d994cc 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -726,7 +726,10 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, static bool rx_win_args_valid(enum vas_cop_type cop, struct vas_rx_win_attr *attr) { - dump_rx_win_attr(attr); + pr_debug("Rxattr: fault %d, notify %d, intr %d, early %d, fifo %d\n", + attr->fault_win, attr->notify_disable, + attr->intr_disable, attr->notify_early, + attr->rx_fifo_size); if (cop >= VAS_COP_TYPE_MAX) return false; @@ -1050,7 +1053,8 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) else rc = -EINVAL; - print_fifo_msg_count(txwin); + pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid, + read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); return rc; } diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index fea0de44f076..63e8e037eda0 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -258,6 +258,16 @@ #define VAS_NX_UTIL_ADDER_OFFSET 0x180 #define VAS_NX_UTIL_ADDER PPC_BITMASK(32, 63) +/* + * VREG(x): + * Expand a register's short name (eg: LPID) into two parameters: + * - the register's short name in string form ("LPID"), and + * - the name of the macro (eg: VAS_LPID_OFFSET), defining the + * register's offset in the window context + */ +#define VREG_SFX(n, s) __stringify(n), VAS_##n##s +#define VREG(r) VREG_SFX(r, _OFFSET) + /* * Local Notify Scope Control Register. (Receive windows only). */ @@ -385,43 +395,15 @@ struct vas_winctx { extern struct vas_instance *find_vas_instance(int vasid); -/* - * VREG(x): - * Expand a register's short name (eg: LPID) into two parameters: - * - the register's short name in string form ("LPID"), and - * - the name of the macro (eg: VAS_LPID_OFFSET), defining the - * register's offset in the window context - */ -#define VREG_SFX(n, s) __stringify(n), VAS_##n##s -#define VREG(r) VREG_SFX(r, _OFFSET) - -#ifdef vas_debug -static inline void dump_rx_win_attr(struct vas_rx_win_attr *attr) -{ - pr_err("fault %d, notify %d, intr %d early %d\n", - attr->fault_win, attr->notify_disable, - attr->intr_disable, attr->notify_early); - - pr_err("rx_fifo_size %d, max value %d\n", - attr->rx_fifo_size, VAS_RX_FIFO_SIZE_MAX); -} - static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) { if (val) - pr_err("%swin #%d: %s reg %p, val 0x%016llx\n", + pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n", win->tx_win ? "Tx" : "Rx", win->winid, name, regptr, val); } -#else /* vas_debug */ - -#define vas_log_write(win, name, reg, val) -#define dump_rx_win_attr(attr) - -#endif /* vas_debug */ - static inline void write_uwc_reg(struct vas_window *win, char *name, s32 reg, u64 val) { @@ -450,18 +432,4 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } -#ifdef vas_debug - -static void print_fifo_msg_count(struct vas_window *txwin) -{ - uint64_t read_hvwc_reg(struct vas_window *w, char *n, uint64_t o); - pr_devel("Winid %d, Msg count %llu\n", txwin->winid, - (uint64_t)read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); -} -#else /* vas_debug */ - -#define print_fifo_msg_count(window) - -#endif /* vas_debug */ - #endif /* _VAS_H */ -- cgit From 4963ac3632dda7433db5149d6abdfc644a8d8ab2 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:44 -0800 Subject: powerpc/vas: Drop poll_window_cast_out(). Polling for window cast out is listed in the spec, but turns out that it is not strictly necessary and slows down window close. Making it a stub for now. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 67ffc5d994cc..8ab8a8208347 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1079,25 +1079,25 @@ retry: } } +/* + * Have the hardware cast a window out of cache and wait for it to + * be completed. + * + * NOTE: It can take a relatively long time to cast the window context + * out of the cache. It is not strictly necessary to cast out if: + * + * - we clear the "Pin Window" bit (so hardware is free to evict) + * + * - we re-initialize the window context when it is reassigned. + * + * We do the former in vas_win_close() and latter in vas_win_open(). + * So, ignoring the cast-out for now. We can add it as needed. If + * casting out becomes necessary we should consider offloading the + * job to a worker thread, so the window close can proceed quickly. + */ static void poll_window_castout(struct vas_window *window) { - int cached; - u64 val; - - /* Cast window context out of the cache */ -retry: - val = read_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL)); - cached = GET_FIELD(VAS_WIN_CACHE_STATUS, val); - if (cached) { - val = 0ULL; - val = SET_FIELD(VAS_CASTOUT_REQ, val, 1); - val = SET_FIELD(VAS_PUSH_TO_MEM, val, 0); - write_hvwc_reg(window, VREG(WIN_CTX_CACHING_CTL), val); - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - goto retry; - } + /* stub for now */ } /* -- cgit From 36a288fe9dab9a6b0b50ffdb5c34f04c42cee2ac Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:45 -0800 Subject: powerpc/vas: Use helper to unpin/close window Use a helper to have the hardware unpin and mark a window closed. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 8ab8a8208347..95622a984b05 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1100,6 +1100,20 @@ static void poll_window_castout(struct vas_window *window) /* stub for now */ } +/* + * Unpin and close a window so no new requests are accepted and the + * hardware can evict this window from cache if necessary. + */ +static void unpin_close_window(struct vas_window *window) +{ + u64 val; + + val = read_hvwc_reg(window, VREG(WINCTL)); + val = SET_FIELD(VAS_WINCTL_PIN, val, 0); + val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); + write_hvwc_reg(window, VREG(WINCTL), val); +} + /* * Close a window. * @@ -1114,8 +1128,6 @@ static void poll_window_castout(struct vas_window *window) */ int vas_win_close(struct vas_window *window) { - u64 val; - if (!window) return 0; @@ -1131,11 +1143,7 @@ int vas_win_close(struct vas_window *window) poll_window_busy_state(window); - /* Unpin window from cache and close it */ - val = read_hvwc_reg(window, VREG(WINCTL)); - val = SET_FIELD(VAS_WINCTL_PIN, val, 0); - val = SET_FIELD(VAS_WINCTL_OPEN, val, 0); - write_hvwc_reg(window, VREG(WINCTL), val); + unpin_close_window(window); poll_window_castout(window); -- cgit From dfe954e4456277effffb2c5add47fa25390f8cea Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:46 -0800 Subject: powerpc/vas: Reduce polling interval for busy state A VAS window is normally in "busy" state for only a short duration. Reduce the time we wait for the window to go to "not-busy" state to speed-up vas_win_close() a bit. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 95622a984b05..1422cdd7d917 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1060,21 +1060,23 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) } EXPORT_SYMBOL_GPL(vas_paste_crb); +/* + * Wait for the window to go to "not-busy" state. It should only take a + * short time to queue a CRB, so window should not be busy for too long. + * Trying 5ms intervals. + */ static void poll_window_busy_state(struct vas_window *window) { int busy; u64 val; retry: - /* - * Poll Window Busy flag - */ val = read_hvwc_reg(window, VREG(WIN_STATUS)); busy = GET_FIELD(VAS_WIN_BUSY, val); if (busy) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); + schedule_timeout(msecs_to_jiffies(5)); goto retry; } } -- cgit From 62f659e08ccd657ead6901011f5e542dbdc477c5 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:47 -0800 Subject: powerpc/vas: Save configured window credits Save the configured max window credits for a window in the vas_window structure. We will need this when polling for return of window credits. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 6 ++++-- arch/powerpc/platforms/powernv/vas.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 1422cdd7d917..a59a187c0cd1 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -674,7 +674,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->rx_fifo = rxattr->rx_fifo; winctx->rx_fifo_size = rxattr->rx_fifo_size; - winctx->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->wcreds_max = rxwin->wcreds_max; winctx->pin_win = rxattr->pin_win; winctx->nx_win = rxattr->nx_win; @@ -844,6 +844,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, rxwin->nx_win = rxattr->nx_win; rxwin->user_win = rxattr->user_win; rxwin->cop = cop; + rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; if (rxattr->user_win) rxwin->pid = task_pid_vnr(current); @@ -893,7 +894,7 @@ static void init_winctx_for_txwin(struct vas_window *txwin, */ memset(winctx, 0, sizeof(struct vas_winctx)); - winctx->wcreds_max = txattr->wcreds_max ?: VAS_WCREDS_DEFAULT; + winctx->wcreds_max = txwin->wcreds_max; winctx->user_win = txattr->user_win; winctx->nx_win = txwin->rxwin->nx_win; @@ -978,6 +979,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, txwin->nx_win = txwin->rxwin->nx_win; txwin->pid = attr->pid; txwin->user_win = attr->user_win; + txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT; init_winctx_for_txwin(txwin, attr, &winctx); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 63e8e037eda0..02d8a31d9051 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -332,6 +332,7 @@ struct vas_window { void *hvwc_map; /* HV window context */ void *uwc_map; /* OS/User window context */ pid_t pid; /* Linux process id of owner */ + int wcreds_max; /* Window credits */ /* Fields applicable only to send windows */ void *paste_kaddr; -- cgit From 6fccac16c578c699bf0714a6c930b0ceb81305a0 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:48 -0800 Subject: powerpc/vas: poll for return of window credits Normally, the NX driver waits for the CRBs to be processed before closing the window. But it is better to ensure that the credits are returned before the window gets reassigned later. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index a59a187c0cd1..23c13a7dcf89 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1062,6 +1062,49 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) } EXPORT_SYMBOL_GPL(vas_paste_crb); +/* + * If credit checking is enabled for this window, poll for the return + * of window credits (i.e for NX engines to process any outstanding CRBs). + * Since NX-842 waits for the CRBs to be processed before closing the + * window, we should not have to wait for too long. + * + * TODO: We retry in 10ms intervals now. We could/should probably peek at + * the VAS_LRFIFO_PUSH_OFFSET register to get an estimate of pending + * CRBs on the FIFO and compute the delay dynamically on each retry. + * But that is not really needed until we support NX-GZIP access from + * user space. (NX-842 driver waits for CSB and Fast thread-wakeup + * doesn't use credit checking). + */ +static void poll_window_credits(struct vas_window *window) +{ + u64 val; + int creds, mode; + + val = read_hvwc_reg(window, VREG(WINCTL)); + if (window->tx_win) + mode = GET_FIELD(VAS_WINCTL_TX_WCRED_MODE, val); + else + mode = GET_FIELD(VAS_WINCTL_RX_WCRED_MODE, val); + + if (!mode) + return; +retry: + if (window->tx_win) { + val = read_hvwc_reg(window, VREG(TX_WCRED)); + creds = GET_FIELD(VAS_TX_WCRED, val); + } else { + val = read_hvwc_reg(window, VREG(LRX_WCRED)); + creds = GET_FIELD(VAS_LRX_WCRED, val); + } + + if (creds < window->wcreds_max) { + val = 0; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(msecs_to_jiffies(10)); + goto retry; + } +} + /* * Wait for the window to go to "not-busy" state. It should only take a * short time to queue a CRB, so window should not be busy for too long. @@ -1149,6 +1192,8 @@ int vas_win_close(struct vas_window *window) unpin_close_window(window); + poll_window_credits(window); + poll_window_castout(window); /* if send window, drop reference to matching receive window */ -- cgit From ca03258b6b338b392c778bed9c7dd56e6a513012 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:49 -0800 Subject: powerpc/vas: Create cpu to vas id mapping Create a cpu to vasid mapping so callers can specify -1 instead of trying to find a VAS id. Changelog[v2] [Michael Ellerman] Use per-cpu variables to simplify code. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index 565a4878fefa..abb7090a22b4 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -18,15 +18,18 @@ #include #include #include +#include #include "vas.h" static DEFINE_MUTEX(vas_mutex); static LIST_HEAD(vas_instances); +static DEFINE_PER_CPU(int, cpu_vas_id); + static int init_vas_instance(struct platform_device *pdev) { - int rc, vasid; + int rc, cpu, vasid; struct resource *res; struct vas_instance *vinst; struct device_node *dn = pdev->dev.of_node; @@ -74,6 +77,11 @@ static int init_vas_instance(struct platform_device *pdev) "paste_win_id_shift 0x%llx\n", pdev->name, vasid, vinst->paste_base_addr, vinst->paste_win_id_shift); + for_each_possible_cpu(cpu) { + if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn)) + per_cpu(cpu_vas_id, cpu) = vasid; + } + mutex_lock(&vas_mutex); list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); @@ -98,6 +106,10 @@ struct vas_instance *find_vas_instance(int vasid) struct vas_instance *vinst; mutex_lock(&vas_mutex); + + if (vasid == -1) + vasid = per_cpu(cpu_vas_id, smp_processor_id()); + list_for_each(ent, &vas_instances) { vinst = list_entry(ent, struct vas_instance, node); if (vinst->vas_id == vasid) { -- cgit From d4ef61b5e8955fb913e2e1a6c1533414859a839d Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:50 -0800 Subject: powerpc/vas, nx-842: Define and use chip_to_vas_id() Define a helper, chip_to_vas_id() to map a given chip id to corresponding vas id. Normally, callers of vas_rx_win_open() and vas_tx_win_open() want the VAS window to be on the same chip where the calling thread is executing. These callers can pass in -1 for the VAS id. This interface will be useful if a thread running on one chip wants to open a window on another chip (like the NX-842 driver does during start up). Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/vas.h | 9 +++++++++ arch/powerpc/platforms/powernv/vas.c | 11 +++++++++++ 2 files changed, 20 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h index fd5963acd658..044748f547f5 100644 --- a/arch/powerpc/include/asm/vas.h +++ b/arch/powerpc/include/asm/vas.h @@ -103,6 +103,15 @@ struct vas_tx_win_attr { bool rx_win_ord_mode; }; +/* + * Helper to map a chip id to VAS id. + * For POWER9, this is a 1:1 mapping. In the future this maybe a 1:N + * mapping in which case, we will need to update this helper. + * + * Return the VAS id or -1 if no matching vasid is found. + */ +int chip_to_vas_id(int chipid); + /* * Helper to initialize receive window attributes to defaults for an * NX window. diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index abb7090a22b4..cd9a733d05e2 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -123,6 +123,17 @@ struct vas_instance *find_vas_instance(int vasid) return NULL; } +int chip_to_vas_id(int chipid) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu_to_chip_id(cpu) == chipid) + return per_cpu(cpu_vas_id, cpu); + } + return -1; +} + static int vas_probe(struct platform_device *pdev) { return init_vas_instance(pdev); -- cgit From ece4e51291485bb4a71ff554964948b02ab89823 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:51 -0800 Subject: powerpc/vas: Export HVWC to debugfs Export the VAS Window context information to debugfs. We need to hold a mutex when closing the window to prevent a race with the debugfs read(). Rather than introduce a per-instance mutex, we use the global vas_mutex for now, since it is not heavily contended. The window->cop field is only relevant to a receive window so we were not setting it for a send window (which is is paired to a receive window anyway). But to simplify reporting in debugfs, set the 'cop' field for the send window also. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/Makefile | 3 +- arch/powerpc/platforms/powernv/vas-debug.c | 209 ++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/vas-window.c | 34 ++++- arch/powerpc/platforms/powernv/vas.c | 6 +- arch/powerpc/platforms/powernv/vas.h | 14 ++ 5 files changed, 257 insertions(+), 9 deletions(-) create mode 100644 arch/powerpc/platforms/powernv/vas-debug.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 37d60f7dd86d..17921c45d10b 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -14,4 +14,5 @@ obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o -obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o +obj-$(CONFIG_PPC_FTW) += nx-ftw.o diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c new file mode 100644 index 000000000000..ca22f1eae050 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-debug.c @@ -0,0 +1,209 @@ +/* + * Copyright 2016-17 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include +#include +#include +#include +#include "vas.h" + +static struct dentry *vas_debugfs; + +static char *cop_to_str(int cop) +{ + switch (cop) { + case VAS_COP_TYPE_FAULT: return "Fault"; + case VAS_COP_TYPE_842: return "NX-842 Normal Priority"; + case VAS_COP_TYPE_842_HIPRI: return "NX-842 High Priority"; + case VAS_COP_TYPE_GZIP: return "NX-GZIP Normal Priority"; + case VAS_COP_TYPE_GZIP_HIPRI: return "NX-GZIP High Priority"; + case VAS_COP_TYPE_FTW: return "Fast Thread-wakeup"; + default: return "Unknown"; + } +} + +static int info_dbg_show(struct seq_file *s, void *private) +{ + struct vas_window *window = s->private; + + mutex_lock(&vas_mutex); + + /* ensure window is not unmapped */ + if (!window->hvwc_map) + goto unlock; + + seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop), + window->tx_win ? "Send" : "Receive"); + seq_printf(s, "Pid : %d\n", window->pid); + +unlock: + mutex_unlock(&vas_mutex); + return 0; +} + +static int info_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, info_dbg_show, inode->i_private); +} + +static const struct file_operations info_fops = { + .open = info_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static inline void print_reg(struct seq_file *s, struct vas_window *win, + char *name, u32 reg) +{ + seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name); +} + +static int hvwc_dbg_show(struct seq_file *s, void *private) +{ + struct vas_window *window = s->private; + + mutex_lock(&vas_mutex); + + /* ensure window is not unmapped */ + if (!window->hvwc_map) + goto unlock; + + print_reg(s, window, VREG(LPID)); + print_reg(s, window, VREG(PID)); + print_reg(s, window, VREG(XLATE_MSR)); + print_reg(s, window, VREG(XLATE_LPCR)); + print_reg(s, window, VREG(XLATE_CTL)); + print_reg(s, window, VREG(AMR)); + print_reg(s, window, VREG(SEIDR)); + print_reg(s, window, VREG(FAULT_TX_WIN)); + print_reg(s, window, VREG(OSU_INTR_SRC_RA)); + print_reg(s, window, VREG(HV_INTR_SRC_RA)); + print_reg(s, window, VREG(PSWID)); + print_reg(s, window, VREG(LFIFO_BAR)); + print_reg(s, window, VREG(LDATA_STAMP_CTL)); + print_reg(s, window, VREG(LDMA_CACHE_CTL)); + print_reg(s, window, VREG(LRFIFO_PUSH)); + print_reg(s, window, VREG(CURR_MSG_COUNT)); + print_reg(s, window, VREG(LNOTIFY_AFTER_COUNT)); + print_reg(s, window, VREG(LRX_WCRED)); + print_reg(s, window, VREG(LRX_WCRED_ADDER)); + print_reg(s, window, VREG(TX_WCRED)); + print_reg(s, window, VREG(TX_WCRED_ADDER)); + print_reg(s, window, VREG(LFIFO_SIZE)); + print_reg(s, window, VREG(WINCTL)); + print_reg(s, window, VREG(WIN_STATUS)); + print_reg(s, window, VREG(WIN_CTX_CACHING_CTL)); + print_reg(s, window, VREG(TX_RSVD_BUF_COUNT)); + print_reg(s, window, VREG(LRFIFO_WIN_PTR)); + print_reg(s, window, VREG(LNOTIFY_CTL)); + print_reg(s, window, VREG(LNOTIFY_PID)); + print_reg(s, window, VREG(LNOTIFY_LPID)); + print_reg(s, window, VREG(LNOTIFY_TID)); + print_reg(s, window, VREG(LNOTIFY_SCOPE)); + print_reg(s, window, VREG(NX_UTIL_ADDER)); +unlock: + mutex_unlock(&vas_mutex); + return 0; +} + +static int hvwc_dbg_open(struct inode *inode, struct file *file) +{ + return single_open(file, hvwc_dbg_show, inode->i_private); +} + +static const struct file_operations hvwc_fops = { + .open = hvwc_dbg_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +void vas_window_free_dbgdir(struct vas_window *window) +{ + if (window->dbgdir) { + debugfs_remove_recursive(window->dbgdir); + kfree(window->dbgname); + window->dbgdir = NULL; + window->dbgname = NULL; + } +} + +void vas_window_init_dbgdir(struct vas_window *window) +{ + struct dentry *f, *d; + + if (!window->vinst->dbgdir) + return; + + window->dbgname = kzalloc(16, GFP_KERNEL); + if (!window->dbgname) + return; + + snprintf(window->dbgname, 16, "w%d", window->winid); + + d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir); + if (IS_ERR(d)) + goto free_name; + + window->dbgdir = d; + + f = debugfs_create_file("info", 0444, d, window, &info_fops); + if (IS_ERR(f)) + goto remove_dir; + + f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops); + if (IS_ERR(f)) + goto remove_dir; + + return; + +free_name: + kfree(window->dbgname); + window->dbgname = NULL; + +remove_dir: + debugfs_remove_recursive(window->dbgdir); + window->dbgdir = NULL; +} + +void vas_instance_init_dbgdir(struct vas_instance *vinst) +{ + struct dentry *d; + + if (!vas_debugfs) + return; + + vinst->dbgname = kzalloc(16, GFP_KERNEL); + if (!vinst->dbgname) + return; + + snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id); + + d = debugfs_create_dir(vinst->dbgname, vas_debugfs); + if (IS_ERR(d)) + goto free_name; + + vinst->dbgdir = d; + return; + +free_name: + kfree(vinst->dbgname); + vinst->dbgname = NULL; + vinst->dbgdir = NULL; +} + +void vas_init_dbgdir(void) +{ + vas_debugfs = debugfs_create_dir("vas", NULL); + if (IS_ERR(vas_debugfs)) + vas_debugfs = NULL; +} diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 23c13a7dcf89..c030d4cf982e 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -145,23 +145,37 @@ static void unmap_paste_region(struct vas_window *window) } /* - * Unmap the MMIO regions for a window. + * Unmap the MMIO regions for a window. Hold the vas_mutex so we don't + * unmap when the window's debugfs dir is in use. This serializes close + * of a window even on another VAS instance but since its not a critical + * path, just minimize the time we hold the mutex for now. We can add + * a per-instance mutex later if necessary. */ static void unmap_winctx_mmio_bars(struct vas_window *window) { int len; + void *uwc_map; + void *hvwc_map; u64 busaddr_start; - if (window->hvwc_map) { + mutex_lock(&vas_mutex); + + hvwc_map = window->hvwc_map; + window->hvwc_map = NULL; + + uwc_map = window->uwc_map; + window->uwc_map = NULL; + + mutex_unlock(&vas_mutex); + + if (hvwc_map) { get_hvwc_mmio_bar(window, &busaddr_start, &len); - unmap_region(window->hvwc_map, busaddr_start, len); - window->hvwc_map = NULL; + unmap_region(hvwc_map, busaddr_start, len); } - if (window->uwc_map) { + if (uwc_map) { get_uwc_mmio_bar(window, &busaddr_start, &len); - unmap_region(window->uwc_map, busaddr_start, len); - window->uwc_map = NULL; + unmap_region(uwc_map, busaddr_start, len); } } @@ -528,6 +542,9 @@ static void vas_window_free(struct vas_window *window) struct vas_instance *vinst = window->vinst; unmap_winctx_mmio_bars(window); + + vas_window_free_dbgdir(window); + kfree(window); vas_release_window_id(&vinst->ida, winid); @@ -552,6 +569,8 @@ static struct vas_window *vas_window_alloc(struct vas_instance *vinst) if (map_winctx_mmio_bars(window)) goto out_free; + vas_window_init_dbgdir(window); + return window; out_free: @@ -974,6 +993,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, goto put_rxwin; } + txwin->cop = cop; txwin->tx_win = 1; txwin->rxwin = rxwin; txwin->nx_win = txwin->rxwin->nx_win; diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index cd9a733d05e2..c488621dbec3 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -22,7 +22,7 @@ #include "vas.h" -static DEFINE_MUTEX(vas_mutex); +DEFINE_MUTEX(vas_mutex); static LIST_HEAD(vas_instances); static DEFINE_PER_CPU(int, cpu_vas_id); @@ -86,6 +86,8 @@ static int init_vas_instance(struct platform_device *pdev) list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); + vas_instance_init_dbgdir(vinst); + dev_set_drvdata(&pdev->dev, vinst); return 0; @@ -157,6 +159,8 @@ static int __init vas_init(void) int found = 0; struct device_node *dn; + vas_init_dbgdir(); + platform_driver_register(&vas_driver); for_each_compatible_node(dn, NULL, "ibm,vas") { diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 02d8a31d9051..756cbc5335bc 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include /* * Overview of Virtual Accelerator Switchboard (VAS). @@ -317,6 +319,9 @@ struct vas_instance { struct mutex mutex; struct vas_window *rxwin[VAS_COP_TYPE_MAX]; struct vas_window *windows[VAS_WINDOWS_PER_CHIP]; + + char *dbgname; + struct dentry *dbgdir; }; /* @@ -334,6 +339,9 @@ struct vas_window { pid_t pid; /* Linux process id of owner */ int wcreds_max; /* Window credits */ + char *dbgname; + struct dentry *dbgdir; + /* Fields applicable only to send windows */ void *paste_kaddr; char *paste_addr_name; @@ -394,7 +402,13 @@ struct vas_winctx { enum vas_notify_after_count notify_after_count; }; +extern struct mutex vas_mutex; + extern struct vas_instance *find_vas_instance(int vasid); +extern void vas_init_dbgdir(void); +extern void vas_instance_init_dbgdir(struct vas_instance *vinst); +extern void vas_window_init_dbgdir(struct vas_window *win); +extern void vas_window_free_dbgdir(struct vas_window *win); static inline void vas_log_write(struct vas_window *win, char *name, void *regptr, u64 val) -- cgit From ec233ede4c8654894610ea54f4dae7adc954ac62 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:53 -0800 Subject: powerpc: Add support for setting SPRN_TIDR We need the SPRN_TIDR to be set for use with fast thread-wakeup (core- to-core wakeup) and also with CAPI. Each thread in a process needs to have a unique id within the process. But for now, we assign globally unique thread ids to all threads in the system. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Philippe Bergheaud Signed-off-by: Christophe Lombard [mpe: Simplify tidr clearing on fork() and ctx switch code] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/processor.h | 1 + arch/powerpc/include/asm/switch_to.h | 3 + arch/powerpc/kernel/process.c | 116 +++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index fab7ff877304..58cc21274423 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -329,6 +329,7 @@ struct thread_struct { */ int dscr_inherit; unsigned long ppr; /* used to save/restore SMT priority */ + unsigned long tidr; #endif #ifdef CONFIG_PPC_BOOK3S_64 unsigned long tar; diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 17c8380673a6..f5da32f43fed 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -91,4 +91,7 @@ static inline void clear_task_ebb(struct task_struct *t) #endif } +extern int set_thread_tidr(struct task_struct *t); +extern void clear_thread_tidr(struct task_struct *t); + #endif /* _ASM_POWERPC_SWITCH_TO_H */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 4083b737decb..887ae5047288 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1136,6 +1136,10 @@ static inline void restore_sprs(struct thread_struct *old_thread, if (old_thread->tar != new_thread->tar) mtspr(SPRN_TAR, new_thread->tar); } + + if (cpu_has_feature(CPU_FTR_ARCH_300) && + old_thread->tidr != new_thread->tidr) + mtspr(SPRN_TIDR, new_thread->tidr); #endif } @@ -1451,6 +1455,116 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } +#ifdef CONFIG_PPC64 +static DEFINE_SPINLOCK(vas_thread_id_lock); +static DEFINE_IDA(vas_thread_ida); + +/* + * We need to assign a unique thread id to each thread in a process. + * + * This thread id, referred to as TIDR, and separate from the Linux's tgid, + * is intended to be used to direct an ASB_Notify from the hardware to the + * thread, when a suitable event occurs in the system. + * + * One such event is a "paste" instruction in the context of Fast Thread + * Wakeup (aka Core-to-core wake up in the Virtual Accelerator Switchboard + * (VAS) in POWER9. + * + * To get a unique TIDR per process we could simply reuse task_pid_nr() but + * the problem is that task_pid_nr() is not yet available copy_thread() is + * called. Fixing that would require changing more intrusive arch-neutral + * code in code path in copy_process()?. + * + * Further, to assign unique TIDRs within each process, we need an atomic + * field (or an IDR) in task_struct, which again intrudes into the arch- + * neutral code. So try to assign globally unique TIDRs for now. + * + * NOTE: TIDR 0 indicates that the thread does not need a TIDR value. + * For now, only threads that expect to be notified by the VAS + * hardware need a TIDR value and we assign values > 0 for those. + */ +#define MAX_THREAD_CONTEXT ((1 << 16) - 1) +static int assign_thread_tidr(void) +{ + int index; + int err; + +again: + if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL)) + return -ENOMEM; + + spin_lock(&vas_thread_id_lock); + err = ida_get_new_above(&vas_thread_ida, 1, &index); + spin_unlock(&vas_thread_id_lock); + + if (err == -EAGAIN) + goto again; + else if (err) + return err; + + if (index > MAX_THREAD_CONTEXT) { + spin_lock(&vas_thread_id_lock); + ida_remove(&vas_thread_ida, index); + spin_unlock(&vas_thread_id_lock); + return -ENOMEM; + } + + return index; +} + +static void free_thread_tidr(int id) +{ + spin_lock(&vas_thread_id_lock); + ida_remove(&vas_thread_ida, id); + spin_unlock(&vas_thread_id_lock); +} + +/* + * Clear any TIDR value assigned to this thread. + */ +void clear_thread_tidr(struct task_struct *t) +{ + if (!t->thread.tidr) + return; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + WARN_ON_ONCE(1); + return; + } + + mtspr(SPRN_TIDR, 0); + free_thread_tidr(t->thread.tidr); + t->thread.tidr = 0; +} + +void arch_release_task_struct(struct task_struct *t) +{ + clear_thread_tidr(t); +} + +/* + * Assign a unique TIDR (thread id) for task @t and set it in the thread + * structure. For now, we only support setting TIDR for 'current' task. + */ +int set_thread_tidr(struct task_struct *t) +{ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -EINVAL; + + if (t != current) + return -EINVAL; + + t->thread.tidr = assign_thread_tidr(); + if (t->thread.tidr < 0) + return t->thread.tidr; + + mtspr(SPRN_TIDR, t->thread.tidr); + + return 0; +} + +#endif /* CONFIG_PPC64 */ + void release_thread(struct task_struct *t) { @@ -1597,6 +1711,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, } if (cpu_has_feature(CPU_FTR_HAS_PPR)) p->thread.ppr = INIT_PPR; + + p->thread.tidr = 0; #endif kregs->nip = ppc_function_entry(f); return 0; -- cgit From 9d2a4d71332cfdf4ea90754ad9b2f05a5ee5f6c7 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:54 -0800 Subject: powerpc: Define set_thread_uses_vas() A CP_ABORT instruction is required in processes that have mapped a VAS "paste address" with the intention of using COPY/PASTE instructions. But since CP_ABORT is expensive, we want to restrict it to only processes that use/intend to use COPY/PASTE. Define an interface, set_thread_uses_vas(), that VAS can use to indicate that the current process opened a send window. During context switch, issue CP_ABORT only for processes that have the flag set. Thanks for input from Nick Piggin, Michael Ellerman. Signed-off-by: Sukadev Bhattiprolu [mpe: Fix to not use new_thread after _switch() returns] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/processor.h | 2 ++ arch/powerpc/include/asm/switch_to.h | 2 ++ arch/powerpc/kernel/process.c | 41 +++++++++++++++++++++++++++--------- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 58cc21274423..bdab3b74eb98 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -341,7 +341,9 @@ struct thread_struct { unsigned long sier; unsigned long mmcr2; unsigned mmcr0; + unsigned used_ebb; + unsigned int used_vas; #endif }; diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index f5da32f43fed..07ff06e9dae1 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -91,6 +91,8 @@ static inline void clear_task_ebb(struct task_struct *t) #endif } +extern int set_thread_uses_vas(void); + extern int set_thread_tidr(struct task_struct *t); extern void clear_thread_tidr(struct task_struct *t); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 887ae5047288..bfdd783e3916 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1244,17 +1244,17 @@ struct task_struct *__switch_to(struct task_struct *prev, * The copy-paste buffer can only store into foreign real * addresses, so unprivileged processes can not see the * data or use it in any way unless they have foreign real - * mappings. We don't have a VAS driver that allocates those - * yet, so no cpabort is required. + * mappings. If the new process has the foreign real address + * mappings, we must issue a cp_abort to clear any state and + * prevent snooping, corruption or a covert channel. + * + * DD1 allows paste into normal system memory so we do an + * unpaired copy, rather than cp_abort, to clear the buffer, + * since cp_abort is quite expensive. */ - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - /* - * DD1 allows paste into normal system memory, so we - * do an unpaired copy here to clear the buffer and - * prevent a covert channel being set up. - * - * cpabort is not used because it is quite expensive. - */ + if (current_thread_info()->task->thread.used_vas) { + asm volatile(PPC_CP_ABORT); + } else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { asm volatile(PPC_COPY(%0, %1) : : "r"(dummy_copy_buffer), "r"(0)); } @@ -1455,6 +1455,27 @@ void flush_thread(void) #endif /* CONFIG_HAVE_HW_BREAKPOINT */ } +int set_thread_uses_vas(void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -EINVAL; + + current->thread.used_vas = 1; + + /* + * Even a process that has no foreign real address mapping can use + * an unpaired COPY instruction (to no real effect). Issue CP_ABORT + * to clear any pending COPY and prevent a covert channel. + * + * __switch_to() will issue CP_ABORT on future context switches. + */ + asm volatile(PPC_CP_ABORT); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + return 0; +} + #ifdef CONFIG_PPC64 static DEFINE_SPINLOCK(vas_thread_id_lock); static DEFINE_IDA(vas_thread_ida); -- cgit From 5676be2fb7035ac32da3a96241611e7eddff6157 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:56 -0800 Subject: powerpc/vas: Define vas_win_paste_addr() Define an interface that the NX drivers can use to find the physical paste address of a send window. This interface is expected to be used with the mmap() operation of the NX driver's device. i.e the user space process can use driver's mmap() operation to map the send window's paste address into their address space and then use copy and paste instructions to submit the CRBs to the NX engine. Note that kernel drivers will use vas_paste_crb() directly and don't need this interface. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/vas.h | 7 +++++++ arch/powerpc/platforms/powernv/vas-window.c | 10 ++++++++++ 2 files changed, 17 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h index 044748f547f5..f98ade8a2603 100644 --- a/arch/powerpc/include/asm/vas.h +++ b/arch/powerpc/include/asm/vas.h @@ -10,6 +10,8 @@ #ifndef _ASM_POWERPC_VAS_H #define _ASM_POWERPC_VAS_H +struct vas_window; + /* * Min and max FIFO sizes are based on Version 1.05 Section 3.1.4.25 * (Local FIFO Size Register) of the VAS workbook. @@ -165,4 +167,9 @@ int vas_copy_crb(void *crb, int offset); */ int vas_paste_crb(struct vas_window *win, int offset, bool re); +/* + * Return the power bus paste address associated with @win so the caller + * can map that address into their address space. + */ +extern u64 vas_win_paste_addr(struct vas_window *win); #endif /* __ASM_POWERPC_VAS_H */ diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index c030d4cf982e..d7d06533a1e9 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -40,6 +40,16 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr); } +u64 vas_win_paste_addr(struct vas_window *win) +{ + u64 addr; + + compute_paste_address(win, &addr, NULL); + + return addr; +} +EXPORT_SYMBOL(vas_win_paste_addr); + static inline void get_hvwc_mmio_bar(struct vas_window *window, u64 *start, int *len) { -- cgit From 61f3cca8cda979646c24accd9dbf3e2de7ea6ceb Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:57 -0800 Subject: powerpc/vas: Define vas_win_id() Define an interface to return a system-wide unique id for a given VAS window. The vas_win_id() will be used in a follow-on patch to generate an unique handle for a user space receive window. Applications can use this handle to pair send and receive windows for fast thread-wakeup. The hardware refers to this system-wide unique id as a Partition Send Window ID which is expected to be used during fault handling. Hence the "pswid" in the function names. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/vas.h | 5 +++++ arch/powerpc/platforms/powernv/vas-window.c | 9 +++++++++ arch/powerpc/platforms/powernv/vas.h | 28 ++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h index f98ade8a2603..771456227496 100644 --- a/arch/powerpc/include/asm/vas.h +++ b/arch/powerpc/include/asm/vas.h @@ -167,6 +167,11 @@ int vas_copy_crb(void *crb, int offset); */ int vas_paste_crb(struct vas_window *win, int offset, bool re); +/* + * Return a system-wide unique id for the VAS window @win. + */ +extern u32 vas_win_id(struct vas_window *win); + /* * Return the power bus paste address associated with @win so the caller * can map that address into their address space. diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index d7d06533a1e9..82754924c2bc 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1235,3 +1235,12 @@ int vas_win_close(struct vas_window *window) return 0; } EXPORT_SYMBOL_GPL(vas_win_close); + +/* + * Return a system-wide unique window id for the window @win. + */ +u32 vas_win_id(struct vas_window *win) +{ + return encode_pswid(win->vinst->vas_id, win->winid); +} +EXPORT_SYMBOL_GPL(vas_win_id); diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index 756cbc5335bc..ae0100fd35bb 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -447,4 +447,32 @@ static inline u64 read_hvwc_reg(struct vas_window *win, return in_be64(win->hvwc_map+reg); } +/* + * Encode/decode the Partition Send Window ID (PSWID) for a window in + * a way that we can uniquely identify any window in the system. i.e. + * we should be able to locate the 'struct vas_window' given the PSWID. + * + * Bits Usage + * 0:7 VAS id (8 bits) + * 8:15 Unused, 0 (3 bits) + * 16:31 Window id (16 bits) + */ +static inline u32 encode_pswid(int vasid, int winid) +{ + u32 pswid = 0; + + pswid |= vasid << (31 - 7); + pswid |= winid; + + return pswid; +} + +static inline void decode_pswid(u32 pswid, int *vasid, int *winid) +{ + if (vasid) + *vasid = pswid >> (31 - 7) & 0xFF; + + if (winid) + *winid = pswid & 0xFFFF; +} #endif /* _VAS_H */ -- cgit From 6c8e6bb2a52d5e7ae5bbde21c21f6d5dfd6e9ae8 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Tue, 7 Nov 2017 18:23:58 -0800 Subject: powerpc/vas: Add support for user receive window Add support for user space receive window (for the Fast thread-wakeup coprocessor type) Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/vas-window.c | 56 +++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index 82754924c2bc..2b3eb01ab110 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -16,7 +16,8 @@ #include #include #include - +#include +#include #include "vas.h" #include "copy-paste.h" @@ -597,6 +598,32 @@ static void put_rx_win(struct vas_window *rxwin) atomic_dec(&rxwin->num_txwins); } +/* + * Find the user space receive window given the @pswid. + * - We must have a valid vasid and it must belong to this instance. + * (so both send and receive windows are on the same VAS instance) + * - The window must refer to an OPEN, FTW, RECEIVE window. + * + * NOTE: We access ->windows[] table and assume that vinst->mutex is held. + */ +static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid) +{ + int vasid, winid; + struct vas_window *rxwin; + + decode_pswid(pswid, &vasid, &winid); + + if (vinst->vas_id != vasid) + return ERR_PTR(-EINVAL); + + rxwin = vinst->windows[winid]; + + if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW) + return ERR_PTR(-EINVAL); + + return rxwin; +} + /* * Get the VAS receive window associated with NX engine identified * by @cop and if applicable, @pswid. @@ -610,10 +637,10 @@ static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst, mutex_lock(&vinst->mutex); - if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) - rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); + if (cop == VAS_COP_TYPE_FTW) + rxwin = get_user_rxwin(vinst, pswid); else - rxwin = ERR_PTR(-EINVAL); + rxwin = vinst->rxwin[cop] ?: ERR_PTR(-EINVAL); if (!IS_ERR(rxwin)) atomic_inc(&rxwin->num_txwins); @@ -937,10 +964,9 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->tx_word_mode = txattr->tx_win_ord_mode; winctx->rsvd_txbuf_count = txattr->rsvd_txbuf_count; - if (winctx->nx_win) { + winctx->intr_disable = true; + if (winctx->nx_win) winctx->data_stamp = true; - winctx->intr_disable = true; - } winctx->lpid = txattr->lpid; winctx->pidr = txattr->pidr; @@ -985,6 +1011,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, if (!tx_win_args_valid(cop, attr)) return ERR_PTR(-EINVAL); + /* + * If caller did not specify a vasid but specified the PSWID of a + * receive window (applicable only to FTW windows), use the vasid + * from that receive window. + */ + if (vasid == -1 && attr->pswid) + decode_pswid(attr->pswid, &vasid, NULL); + vinst = find_vas_instance(vasid); if (!vinst) { pr_devel("vasid %d not found!\n", vasid); @@ -1031,6 +1065,14 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, } } + /* + * Now that we have a send window, ensure context switch issues + * CP_ABORT for this thread. + */ + rc = -EINVAL; + if (set_thread_uses_vas() < 0) + goto free_window; + set_vinst_win(vinst, txwin); return txwin; -- cgit From f79ad50ea3c73fb1ea5b09e95c864e5bb263adfb Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 16 Oct 2017 16:21:35 +1100 Subject: powerpc/mm/radix: Fix crashes on Power9 DD1 with radix MMU and STRICT_RWX When using the radix MMU on Power9 DD1, to work around a hardware problem, radix__pte_update() is required to do a two stage update of the PTE. First we write a zero value into the PTE, then we flush the TLB, and then we write the new PTE value. In the normal case that works OK, but it does not work if we're updating the PTE that maps the code we're executing, because the mapping is removed by the TLB flush and we can no longer execute from it. Unfortunately the STRICT_RWX code needs to do exactly that. The exact symptoms when we hit this case vary, sometimes we print an oops and then get stuck after that, but I've also seen a machine just get stuck continually page faulting with no oops printed. The variance is presumably due to the exact layout of the text and the page size used for the mappings. In all cases we are unable to boot to a shell. There are possible solutions such as creating a second mapping of the TLB flush code, executing from that, and then jumping back to the original. However we don't want to add that level of complexity for a DD1 work around. So just detect that we're running on Power9 DD1 and refrain from changing the permissions, effectively disabling STRICT_RWX on Power9 DD1. Fixes: 7614ff3272a1 ("powerpc/mm/radix: Implement STRICT_RWX/mark_rodata_ro() for Radix") Cc: stable@vger.kernel.org # v4.13+ Reported-by: Andrew Jeffery [Changelog as suggested by Michael Ellerman ] Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/mm/pgtable-radix.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 39c252b54d16..cfbbee941a76 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -169,6 +169,16 @@ void radix__mark_rodata_ro(void) { unsigned long start, end; + /* + * mark_rodata_ro() will mark itself as !writable at some point. + * Due to DD1 workaround in radix__pte_update(), we'll end up with + * an invalid pte and the system will crash quite severly. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n"); + return; + } + start = (unsigned long)_stext; end = (unsigned long)__init_begin; -- cgit From fc2a5a6161a26b386eb0936b74a852ff83793fb6 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 2 Nov 2017 17:49:36 +1100 Subject: powerpc/64s: ppc_save_regs is now needed for all 64s builds Commit 78adf6c214f0 ("powerpc/64s: Implement system reset idle wakeup reason"), added a call to ppc_save_regs() in the book3s code. ppc_save_regs() is only built if XMON and/or KEXEC_CORE are enabled, which is usually the case, however if they're not enabled then the build breaks. Fix it by making the Makefile check also build ppc_save_regs.o if CONFIG_PPC_BOOK3S is enabled. Fixes: 78adf6c214f0 ("powerpc/64s: Implement system reset idle wakeup reason") Signed-off-by: Stephen Rothwell [mpe: Write change log] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 91960f83039c..34b6e729f38c 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -128,7 +128,7 @@ obj64-$(CONFIG_PPC_TRANSACTIONAL_MEM) += tm.o obj-$(CONFIG_PPC64) += $(obj64-y) obj-$(CONFIG_PPC32) += $(obj32-y) -ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE),) +ifneq ($(CONFIG_XMON)$(CONFIG_KEXEC_CORE)(CONFIG_PPC_BOOK3S),) obj-y += ppc_save_regs.o endif -- cgit From 8a2d71a3f2737e2448aa68de2b6052cb570d3d2a Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 23 Oct 2017 22:07:38 +0530 Subject: powerpc/kprobes: Disable preemption before invoking probe handler for optprobes Per Documentation/kprobes.txt, probe handlers need to be invoked with preemption disabled. Update optimized_callback() to do so. Also move get_kprobe_ctlblk() invocation post preemption disable, since it accesses pre-cpu data. This was not an issue so far since optprobes wasn't selected if CONFIG_PREEMPT was enabled. Commit a30b85df7d599f ("kprobes: Use synchronize_rcu_tasks() for optprobe with CONFIG_PREEMPT=y") changes this. Signed-off-by: Naveen N. Rao Acked-by: Masami Hiramatsu Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/optprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 91e037ab20a1..60ba7f1370a8 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -115,7 +115,6 @@ static unsigned long can_optimize(struct kprobe *p) static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); unsigned long flags; /* This is possible if op is under delayed unoptimizing */ @@ -124,13 +123,14 @@ static void optimized_callback(struct optimized_kprobe *op, local_irq_save(flags); hard_irq_disable(); + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); } else { __this_cpu_write(current_kprobe, &op->kp); regs->nip = (unsigned long)op->kp.addr; - kcb->kprobe_status = KPROBE_HIT_ACTIVE; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; opt_pre_handler(&op->kp, regs); __this_cpu_write(current_kprobe, NULL); } @@ -140,6 +140,7 @@ static void optimized_callback(struct optimized_kprobe *op, * local_irq_restore() will re-enable interrupts, * if they were hard disabled. */ + preempt_enable_no_resched(); local_irq_restore(flags); } NOKPROBE_SYMBOL(optimized_callback); -- cgit From f72180cc93a2c64d4efe0129fa2396ad78be80e3 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 23 Oct 2017 22:07:39 +0530 Subject: powerpc/kprobes: Do not disable interrupts for optprobes and kprobes_on_ftrace Per Documentation/kprobes.txt, we don't necessarily need to disable interrupts before invoking the kprobe handlers. Masami submitted similar changes for x86 via commit a19b2e3d783964 ("kprobes/x86: Remove IRQ disabling from ftrace-based/optimized kprobes"). Do the same for powerpc. Signed-off-by: Naveen N. Rao Acked-by: Masami Hiramatsu Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes-ftrace.c | 10 ++-------- arch/powerpc/kernel/optprobes.c | 10 ---------- 2 files changed, 2 insertions(+), 18 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 4b1f34f685b1..7a1f99f1b47f 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -75,11 +75,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, { struct kprobe *p; struct kprobe_ctlblk *kcb; - unsigned long flags; - /* Disable irq for emulating a breakpoint and avoiding preempt */ - local_irq_save(flags); - hard_irq_disable(); preempt_disable(); p = get_kprobe((kprobe_opcode_t *)nip); @@ -105,16 +101,14 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, else { /* * If pre_handler returns !0, it sets regs->nip and - * resets current kprobe. In this case, we still need - * to restore irq, but not preemption. + * resets current kprobe. In this case, we should not + * re-enable preemption. */ - local_irq_restore(flags); return; } } end: preempt_enable_no_resched(); - local_irq_restore(flags); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 60ba7f1370a8..8237884ca389 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -115,14 +115,10 @@ static unsigned long can_optimize(struct kprobe *p) static void optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { - unsigned long flags; - /* This is possible if op is under delayed unoptimizing */ if (kprobe_disabled(&op->kp)) return; - local_irq_save(flags); - hard_irq_disable(); preempt_disable(); if (kprobe_running()) { @@ -135,13 +131,7 @@ static void optimized_callback(struct optimized_kprobe *op, __this_cpu_write(current_kprobe, NULL); } - /* - * No need for an explicit __hard_irq_enable() here. - * local_irq_restore() will re-enable interrupts, - * if they were hard disabled. - */ preempt_enable_no_resched(); - local_irq_restore(flags); } NOKPROBE_SYMBOL(optimized_callback); -- cgit From 67ac0bfe29e9b4359012a8961c196baa813c707a Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 23 Oct 2017 22:07:40 +0530 Subject: powerpc/kprobes: Blacklist emulate_update_regs() from kprobes Commit 3cdfcbfd32b9d ("powerpc: Change analyse_instr so it doesn't modify *regs") introduced emulate_update_regs() to perform part of what emulate_step() was doing earlier. However, this function was not added to the kprobes blacklist. Add it so as to prevent it from being probed. Signed-off-by: Naveen N. Rao Acked-by: Masami Hiramatsu Signed-off-by: Michael Ellerman --- arch/powerpc/lib/sstep.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 8c3955e183d4..70274b7b4773 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -2717,6 +2717,7 @@ void emulate_update_regs(struct pt_regs *regs, struct instruction_op *op) } regs->nip = next_pc; } +NOKPROBE_SYMBOL(emulate_update_regs); /* * Emulate a previously-analysed load or store instruction. -- cgit From acdfe93101d0f5b968d1bd7ab125517bf7444047 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Mon, 23 Oct 2017 22:07:41 +0530 Subject: powerpc/kprobes: refactor kprobe_lookup_name for safer string operations Use safer string manipulation functions when dealing with a user-provided string in kprobe_lookup_name(). Reported-by: David Laight Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes.c | 47 ++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index a20ce12adab1..ca5d5a081e75 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -53,7 +53,7 @@ bool arch_within_kprobe_blacklist(unsigned long addr) kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) { - kprobe_opcode_t *addr; + kprobe_opcode_t *addr = NULL; #ifdef PPC64_ELF_ABI_v2 /* PPC64 ABIv2 needs local entry point */ @@ -85,36 +85,29 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset) * Also handle format. */ char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; - const char *modsym; bool dot_appended = false; - if ((modsym = strchr(name, ':')) != NULL) { - modsym++; - if (*modsym != '\0' && *modsym != '.') { - /* Convert to */ - strncpy(dot_name, name, modsym - name); - dot_name[modsym - name] = '.'; - dot_name[modsym - name + 1] = '\0'; - strncat(dot_name, modsym, - sizeof(dot_name) - (modsym - name) - 2); - dot_appended = true; - } else { - dot_name[0] = '\0'; - strncat(dot_name, name, sizeof(dot_name) - 1); - } - } else if (name[0] != '.') { - dot_name[0] = '.'; - dot_name[1] = '\0'; - strncat(dot_name, name, KSYM_NAME_LEN - 2); + const char *c; + ssize_t ret = 0; + int len = 0; + + if ((c = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) { + c++; + len = c - name; + memcpy(dot_name, name, len); + } else + c = name; + + if (*c != '\0' && *c != '.') { + dot_name[len++] = '.'; dot_appended = true; - } else { - dot_name[0] = '\0'; - strncat(dot_name, name, KSYM_NAME_LEN - 1); } - addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); - if (!addr && dot_appended) { - /* Let's try the original non-dot symbol lookup */ + ret = strscpy(dot_name + len, c, KSYM_NAME_LEN); + if (ret > 0) + addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); + + /* Fallback to the original non-dot symbol lookup */ + if (!addr && dot_appended) addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); - } #else addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); #endif -- cgit From 2a31ad093bb1b8c1f1e04cbe222ad17bc09c4534 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 6 Sep 2017 11:48:59 +1000 Subject: powerpc/powernv/npu: Use flush_all_mm() instead of flush_tlb_mm() With the optimisations introduced by commit a46cc7a908 ("powerpc/mm/radix: Improve TLB/PWC flushes"), flush_tlb_mm() no longer flushes the page walk cache with radix. Switch to using flush_all_mm() to ensure the pwc and tlb are properly flushed on the nmmu. Signed-off-by: Alistair Popple Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 2cb6cbea4b3b..2fff9a65975b 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -549,7 +549,7 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, * Unfortunately the nest mmu does not support flushing specific * addresses so we have to flush the whole mm. */ - flush_tlb_mm(npu_context->mm); + flush_all_mm(npu_context->mm); /* * Loop over all the NPUs this process is active on and launch -- cgit From 1b2c2b12386f9bb009a2249eca00e01a9d76d7c1 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 6 Sep 2017 11:49:00 +1000 Subject: powerpc/powernv/npu: Don't explicitly flush nmmu tlb The nest mmu required an explicit flush as a tlbi would not flush it in the same way as the core. However an alternate firmware fix exists which should eliminate the need for this flush, so instead add a device-tree property (ibm,nmmu-flush) on the NVLink2 PHB to enable it only if required. Signed-off-by: Alistair Popple Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/npu-dma.c | 28 +++++++++++++++++++++++----- arch/powerpc/platforms/powernv/pci.h | 3 +++ 2 files changed, 26 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 2fff9a65975b..f6cbc1a71472 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -395,6 +395,7 @@ struct npu_context { struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; struct mmu_notifier mn; struct kref kref; + bool nmmu_flush; /* Callback to stop translation requests on a given GPU */ struct npu_context *(*release_cb)(struct npu_context *, void *); @@ -545,11 +546,13 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; - /* - * Unfortunately the nest mmu does not support flushing specific - * addresses so we have to flush the whole mm. - */ - flush_all_mm(npu_context->mm); + if (npu_context->nmmu_flush) + /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm once before + * shooting down the GPU translation. + */ + flush_all_mm(npu_context->mm); /* * Loop over all the NPUs this process is active on and launch @@ -722,6 +725,16 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, return ERR_PTR(-ENODEV); npu_context->npdev[npu->index][nvlink_index] = npdev; + if (!nphb->npu.nmmu_flush) { + /* + * If we're not explicitly flushing ourselves we need to mark + * the thread for global flushes + */ + npu_context->nmmu_flush = false; + mm_context_add_copro(mm); + } else + npu_context->nmmu_flush = true; + return npu_context; } EXPORT_SYMBOL(pnv_npu2_init_context); @@ -731,6 +744,9 @@ static void pnv_npu2_release_context(struct kref *kref) struct npu_context *npu_context = container_of(kref, struct npu_context, kref); + if (!npu_context->nmmu_flush) + mm_context_remove_copro(npu_context->mm); + npu_context->mm->context.npu_context = NULL; mmu_notifier_unregister(&npu_context->mn, npu_context->mm); @@ -819,6 +835,8 @@ int pnv_npu2_init(struct pnv_phb *phb) static int npu_index; uint64_t rc = 0; + phb->npu.nmmu_flush = + of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); if (gpdev) { diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 56d1f272d4ad..96151b3a2dd4 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -187,6 +187,9 @@ struct pnv_phb { /* Bitmask for MMIO register usage */ unsigned long mmio_atsd_usage; + + /* Do we need to explicitly flush the nest mmu? */ + bool nmmu_flush; } npu; #ifdef CONFIG_CXL_BASE -- cgit From 32ce3862af3c42a3890e99846a8d1ad8871a49f9 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Thu, 19 Oct 2017 18:13:54 +1100 Subject: powerpc/lib: Implement PMEM API Implement the architecture specific cache maintence functions that make up the "PMEM API". Currently the writeback and invalidate functions are the same since the function of the DCBST (data cache block store) instruction is typically interpreted as "writeback to the point of coherency" rather than to memory. As a result implementing the API requires a full cache flush rather than just a cache write back. This will probably change in the not-too-distant future. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + arch/powerpc/lib/Makefile | 2 +- arch/powerpc/lib/pmem.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/lib/pmem.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b8b75b423316..c844dbc04153 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -138,6 +138,7 @@ config PPC select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_PMEM_API if PPC64 select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 50d5bf954cff..7aa237d10546 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -23,7 +23,7 @@ endif obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \ copyuser_power7.o string_64.o copypage_power7.o memcpy_power7.o \ - memcpy_64.o memcmp_64.o + memcpy_64.o memcmp_64.o pmem.o obj64-$(CONFIG_SMP) += locks.o obj64-$(CONFIG_ALTIVEC) += vmx-helper.o diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c new file mode 100644 index 000000000000..0fa09262ca13 --- /dev/null +++ b/arch/powerpc/lib/pmem.c @@ -0,0 +1,34 @@ +/* + * Copyright(c) 2017 IBM Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include + +#include + +/* + * CONFIG_ARCH_HAS_PMEM_API symbols + */ +void arch_wb_cache_pmem(void *addr, size_t size) +{ + unsigned long start = (unsigned long) addr; + flush_inval_dcache_range(start, start + size); +} +EXPORT_SYMBOL(arch_wb_cache_pmem); + +void arch_invalidate_pmem(void *addr, size_t size) +{ + unsigned long start = (unsigned long) addr; + flush_inval_dcache_range(start, start + size); +} +EXPORT_SYMBOL(arch_invalidate_pmem); -- cgit From 6c44741d75a27985a0496cd58b123a58c7177108 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Thu, 19 Oct 2017 18:13:55 +1100 Subject: powerpc/lib: Implement UACCESS_FLUSHCACHE API Implement the architecture specific portitions of the UACCESS_FLUSHCACHE API. This provides functions for the copy_user_flushcache iterator that ensure that when the copy is finished the destination buffer contains a copy of the original and that the destination buffer is clean in the processor caches. Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/string.h | 2 ++ arch/powerpc/include/asm/uaccess.h | 5 +++++ arch/powerpc/lib/pmem.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 41 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index c844dbc04153..bbc9336231db 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -142,6 +142,7 @@ config PPC select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_UACCESS_FLUSHCACHE if PPC64 select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index cc9addefb51c..2796eda17d92 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -11,6 +11,7 @@ #define __HAVE_ARCH_MEMCMP #define __HAVE_ARCH_MEMCHR #define __HAVE_ARCH_MEMSET16 +#define __HAVE_ARCH_MEMCPY_FLUSHCACHE extern char * strcpy(char *,const char *); extern char * strncpy(char *,const char *, __kernel_size_t); @@ -23,6 +24,7 @@ extern void * memcpy(void *,const void *,__kernel_size_t); extern void * memmove(void *,const void *,__kernel_size_t); extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); +extern void * memcpy_flushcache(void *,const void *,__kernel_size_t); #ifdef CONFIG_PPC64 #define __HAVE_ARCH_MEMSET32 diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index e34f15e727d9..19a3f75ed610 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -356,4 +356,9 @@ static inline unsigned long clear_user(void __user *addr, unsigned long size) extern long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); +extern long __copy_from_user_flushcache(void *dst, const void __user *src, + unsigned size); +extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len); + #endif /* _ARCH_POWERPC_UACCESS_H */ diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c index 0fa09262ca13..53c018762e1c 100644 --- a/arch/powerpc/lib/pmem.c +++ b/arch/powerpc/lib/pmem.c @@ -13,6 +13,7 @@ #include #include +#include #include @@ -32,3 +33,35 @@ void arch_invalidate_pmem(void *addr, size_t size) flush_inval_dcache_range(start, start + size); } EXPORT_SYMBOL(arch_invalidate_pmem); + +/* + * CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE symbols + */ +long __copy_from_user_flushcache(void *dest, const void __user *src, + unsigned size) +{ + unsigned long copied, start = (unsigned long) dest; + + copied = __copy_from_user(dest, src, size); + flush_inval_dcache_range(start, start + size); + + return copied; +} + +void *memcpy_flushcache(void *dest, const void *src, size_t size) +{ + unsigned long start = (unsigned long) dest; + + memcpy(dest, src, size); + flush_inval_dcache_range(start, start + size); + + return dest; +} +EXPORT_SYMBOL(memcpy_flushcache); + +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len) +{ + memcpy_flushcache(to, page_to_virt(page) + offset, len); +} +EXPORT_SYMBOL(memcpy_page_flushcache); -- cgit From dcdc46794b7bb76733d9792cca2f45871d66881f Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Mon, 26 Jun 2017 16:06:01 +0200 Subject: powerpc/fadump: use kstrtoint to handle sysfs store Currently sysfs store handlers in fadump use if buf[0] == 'char'. This means input "100foo" is interpreted as '1' and "01" as '0'. Change to kstrtoint so leading zeroes and the like is handled in expected way. Signed-off-by: Michal Suchanek Acked-by: Hari Bathini Signed-off-by: Michal Suchanek <msuchanek@suse.de> Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index e1431800bfb9..04ea5c04fd24 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1270,10 +1270,15 @@ static ssize_t fadump_release_memory_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + int input = -1; + if (!fw_dump.dump_active) return -EPERM; - if (buf[0] == '1') { + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { /* * Take away the '/proc/vmcore'. We are releasing the dump * memory, hence it will not be valid anymore. @@ -1307,21 +1312,25 @@ static ssize_t fadump_register_store(struct kobject *kobj, const char *buf, size_t count) { int ret = 0; + int input = -1; if (!fw_dump.fadump_enabled || fdm_active) return -EPERM; + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + mutex_lock(&fadump_mutex); - switch (buf[0]) { - case '0': + switch (input) { + case 0: if (fw_dump.dump_registered == 0) { goto unlock_out; } /* Un-register Firmware-assisted dump */ fadump_unregister_dump(&fdm); break; - case '1': + case 1: if (fw_dump.dump_registered == 1) { ret = -EEXIST; goto unlock_out; -- cgit From 46725b17f1c6c815a41429259b3f070c01e71bc1 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 31 Aug 2017 21:55:57 +0530 Subject: powerpc/signal: Properly handle return value from uprobe_deny_signal() When a uprobe is installed on an instruction that we currently do not emulate, we copy the instruction into a xol buffer and single step that instruction. If that instruction generates a fault, we abort the single stepping before invoking the signal handler. Once the signal handler is done, the uprobe trap is hit again since the instruction is retried and the process repeats. We use uprobe_deny_signal() to detect if the xol instruction triggered a signal. If so, we clear TIF_SIGPENDING and set TIF_UPROBE so that the signal is not handled until after the single stepping is aborted. In this case, uprobe_deny_signal() returns true and get_signal() ends up returning 0. However, in do_signal(), we are not looking at the return value, but depending on ksig.sig for further action, all with an uninitialized ksig that is not touched in this scenario. Fix the same by initializing ksig.sig to 0. Fixes: 129b69df9c90 ("powerpc: Use get_signal() signal_setup_done()") Cc: stable@vger.kernel.org # v3.17+ Reported-by: Anton Blanchard Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e9436c5e1e09..3d7539b90010 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -103,7 +103,7 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, static void do_signal(struct task_struct *tsk) { sigset_t *oldset = sigmask_to_save(); - struct ksignal ksig; + struct ksignal ksig = { .sig = 0 }; int ret; int is32 = is_32bit_task(); -- cgit From f23ab3efb1b30cc5c5ef5ae4ef294ed467f30675 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 10 Nov 2017 12:15:00 +1100 Subject: powerpc: Fix DABR match on hash based systems Commit 398a719d34a1 ("powerpc/mm: Update bits used to skip hash_page") mistakenly dropped the DSISR_DABRMATCH bit from the mask of bit tested to skip trying to hash a page. As a result, the DABR matches would no longer be detected. This adds it back. We open code it in the 2 places where it matters rather than fold it into DSISR_BAD_FAULT_32S/64S because this isn't technically a bad fault and while we would never hit it with the current code, I prefer if page_fault_is_bad() didn't trigger on these. Fixes: 398a719d34a1 ("powerpc/mm: Update bits used to skip hash_page") Cc: stable@vger.kernel.org # v4.14 Tested-by: Pedro Miraglia Franco de Carvalho Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/kernel/head_32.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6879aed47377..445137e2d0ca 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1498,7 +1498,7 @@ USE_TEXT_SECTION() .balign IFETCH_ALIGN_BYTES do_hash_page: #ifdef CONFIG_PPC_BOOK3S_64 - lis r0,DSISR_BAD_FAULT_64S@h + lis r0,(DSISR_BAD_FAULT_64S|DSISR_DABRMATCH)@h ori r0,r0,DSISR_BAD_FAULT_64S@l and. r0,r4,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 8c54166491e7..29b2fed93289 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -388,7 +388,7 @@ DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR stw r10,_DSISR(r11) - andis. r0,r10,DSISR_BAD_FAULT_32S@h + andis. r0,r10,(DSISR_BAD_FAULT_32S|DSISR_DABRMATCH)@h bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ -- cgit From 7ece370996b694ae263025e056ad785afc1be5ab Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 13 Nov 2017 23:17:28 +1100 Subject: powerpc/64s/hash: Fix 512T hint detection to use >= 128T Currently userspace is able to request mmap() search between 128T-512T by specifying a hint address that is greater than 128T. But that means a hint of 128T exactly will return an address below 128T, which is confusing and wrong. So fix the logic to check the hint is greater than *or equal* to 128T. Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB") Cc: stable@vger.kernel.org # v4.12+ Suggested-by: Aneesh Kumar K.V Suggested-by: Nicholas Piggin [mpe: Split out of Nick's bigger patch] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 45f6740dd407..48a5312103a1 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -419,7 +419,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* * Check if we need to expland slice area. */ - if (unlikely(addr > mm->context.addr_limit && + if (unlikely(addr >= mm->context.addr_limit && mm->context.addr_limit != TASK_SIZE)) { mm->context.addr_limit = TASK_SIZE; on_each_cpu(slice_flush_segments, mm, 1); @@ -427,7 +427,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* * This mmap request can allocate upt to 512TB */ - if (addr > DEFAULT_MAP_WINDOW) + if (addr >= DEFAULT_MAP_WINDOW) high_limit = mm->context.addr_limit; else high_limit = DEFAULT_MAP_WINDOW; -- cgit From 6a72dc038b615229a1b285829d6c8378d15c2347 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 10 Nov 2017 04:27:36 +1100 Subject: powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation When allocating VA space with a hint that crosses 128TB, the SLB addr_limit variable is not expanded if addr is not > 128TB, but the slice allocation looks at task_size, which is 512TB. This results in slice_check_fit() incorrectly succeeding because the slice_count truncates off bit 128 of the requested mask, so the comparison to the available mask succeeds. Fix this by using mm->context.addr_limit instead of mm->task_size for testing allocation limits. This causes such allocations to fail. Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB") Cc: stable@vger.kernel.org # v4.12+ Reported-by: Florian Weimer Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 50 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 48a5312103a1..3889201b560c 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *vma; - if ((mm->task_size - len) < addr) + if ((mm->context.addr_limit - len) < addr) return 0; vma = find_vma(mm, addr); return (!vma || (addr + len) <= vm_start_gap(vma)); @@ -133,7 +133,7 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret) if (!slice_low_has_vma(mm, i)) ret->low_slices |= 1u << i; - if (mm->task_size <= SLICE_LOW_TOP) + if (mm->context.addr_limit <= SLICE_LOW_TOP) return; for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) @@ -412,25 +412,31 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, struct slice_mask compat_mask; int fixed = (flags & MAP_FIXED); int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned long page_size = 1UL << pshift; struct mm_struct *mm = current->mm; unsigned long newaddr; unsigned long high_limit; - /* - * Check if we need to expland slice area. - */ - if (unlikely(addr >= mm->context.addr_limit && - mm->context.addr_limit != TASK_SIZE)) { - mm->context.addr_limit = TASK_SIZE; + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit) + high_limit = TASK_SIZE; + + if (len > high_limit) + return -ENOMEM; + if (len & (page_size - 1)) + return -EINVAL; + if (fixed) { + if (addr & (page_size - 1)) + return -EINVAL; + if (addr > high_limit - len) + return -ENOMEM; + } + + if (high_limit > mm->context.addr_limit) { + mm->context.addr_limit = high_limit; on_each_cpu(slice_flush_segments, mm, 1); } - /* - * This mmap request can allocate upt to 512TB - */ - if (addr >= DEFAULT_MAP_WINDOW) - high_limit = mm->context.addr_limit; - else - high_limit = DEFAULT_MAP_WINDOW; + /* * init different masks */ @@ -446,27 +452,19 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Sanity checks */ BUG_ON(mm->task_size == 0); + BUG_ON(mm->context.addr_limit == 0); VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", addr, len, flags, topdown); - if (len > mm->task_size) - return -ENOMEM; - if (len & ((1ul << pshift) - 1)) - return -EINVAL; - if (fixed && (addr & ((1ul << pshift) - 1))) - return -EINVAL; - if (fixed && addr > (mm->task_size - len)) - return -ENOMEM; - /* If hint, make sure it matches our alignment restrictions */ if (!fixed && addr) { - addr = _ALIGN_UP(addr, 1ul << pshift); + addr = _ALIGN_UP(addr, page_size); slice_dbg(" aligned addr=%lx\n", addr); /* Ignore hint if it's too large or overlaps a VMA */ - if (addr > mm->task_size - len || + if (addr > high_limit - len || !slice_area_is_free(mm, addr, len)) addr = 0; } -- cgit From effc1b25088502fbd30305c79773de2d1f7470a6 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 10 Nov 2017 04:27:37 +1100 Subject: powerpc/64s/hash: Fix fork() with 512TB process address space Hash unconditionally resets the addr_limit to default (128TB) when the mm context is initialised. If a process has > 128TB mappings when it forks, the child will not get the 512TB addr_limit, so accesses to valid > 128TB mappings will fail in the child. Fix this by only resetting the addr_limit to default if it was 0. Non zero indicates it was duplicated from the parent (0 means exec()). Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB") Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/mmu_context_book3s64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 6d724dab27c2..846cbad45fce 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -93,11 +93,11 @@ static int hash__init_new_context(struct mm_struct *mm) return index; /* - * We do switch_slb() early in fork, even before we setup the - * mm->context.addr_limit. Default to max task size so that we copy the - * default values to paca which will help us to handle slb miss early. + * In the case of exec, use the default limit, + * otherwise inherit it from the mm we are duplicating. */ - mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64; + if (!mm->context.addr_limit) + mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64; /* * The old code would re-promote on fork, we don't do that when using -- cgit From 35602f82d0c765f991420e319c8d3a596c921eb8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 10 Nov 2017 04:27:38 +1100 Subject: powerpc/64s/hash: Allow MAP_FIXED allocations to cross 128TB boundary While mapping hints with a length that cross 128TB are disallowed, MAP_FIXED allocations that cross 128TB are allowed. These are failing on hash (on radix they succeed). Add an additional case for fixed mappings to expand the addr_limit when crossing 128TB. Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB") Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/slice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 3889201b560c..a4f93699194b 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -418,7 +418,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, unsigned long high_limit; high_limit = DEFAULT_MAP_WINDOW; - if (addr >= high_limit) + if (addr >= high_limit || (fixed && (addr + len > high_limit))) high_limit = TASK_SIZE; if (len > high_limit) -- cgit From 85e3f1adcb9d49300b0a943bb93f9604be375bfb Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 10 Nov 2017 04:27:39 +1100 Subject: powerpc/64s/radix: Fix 128TB-512TB virtual address boundary case allocation Radix VA space allocations test addresses against mm->task_size which is 512TB, even in cases where the intention is to limit allocation to below 128TB. This results in mmap with a hint address below 128TB but address + length above 128TB succeeding when it should fail (as hash does after the previous patch). Set the high address limit to be considered up front, and base subsequent allocation checks on that consistently. Fixes: f4ea6dcb08ea ("powerpc/mm: Enable mappings above 128TB") Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/mm/hugetlbpage-radix.c | 26 ++++++++++++------ arch/powerpc/mm/mmap.c | 55 ++++++++++++++++++++++--------------- 2 files changed, 50 insertions(+), 31 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index a12e86395025..0a3d71aae175 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -48,17 +48,28 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct hstate *h = hstate_file(file); + int fixed = (flags & MAP_FIXED); + unsigned long high_limit; struct vm_unmapped_area_info info; - if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) - mm->context.addr_limit = TASK_SIZE; + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; if (len & ~huge_page_mask(h)) return -EINVAL; - if (len > mm->task_size) + if (len > high_limit) return -ENOMEM; + if (fixed) { + if (addr > high_limit - len) + return -ENOMEM; + } - if (flags & MAP_FIXED) { + if (unlikely(addr > mm->context.addr_limit && + mm->context.addr_limit != TASK_SIZE)) + mm->context.addr_limit = TASK_SIZE; + + if (fixed) { if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; @@ -67,7 +78,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (mm->task_size - len >= addr && + if (high_limit - len >= addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -78,12 +89,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = current->mm->mmap_base; + info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; - if (addr > DEFAULT_MAP_WINDOW) - info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; - return vm_unmapped_area(&info); } diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 5d78b193fec4..6d476a7b5611 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -106,22 +106,32 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + int fixed = (flags & MAP_FIXED); + unsigned long high_limit; struct vm_unmapped_area_info info; + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; + + if (len > high_limit) + return -ENOMEM; + if (fixed) { + if (addr > high_limit - len) + return -ENOMEM; + } + if (unlikely(addr > mm->context.addr_limit && mm->context.addr_limit != TASK_SIZE)) mm->context.addr_limit = TASK_SIZE; - if (len > mm->task_size - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) + if (fixed) return addr; if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (mm->task_size - len >= addr && addr >= mmap_min_addr && + if (high_limit - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -129,13 +139,9 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; + info.high_limit = high_limit; info.align_mask = 0; - if (unlikely(addr > DEFAULT_MAP_WINDOW)) - info.high_limit = mm->context.addr_limit; - else - info.high_limit = DEFAULT_MAP_WINDOW; - return vm_unmapped_area(&info); } @@ -149,37 +155,42 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, struct vm_area_struct *vma; struct mm_struct *mm = current->mm; unsigned long addr = addr0; + int fixed = (flags & MAP_FIXED); + unsigned long high_limit; struct vm_unmapped_area_info info; + high_limit = DEFAULT_MAP_WINDOW; + if (addr >= high_limit || (fixed && (addr + len > high_limit))) + high_limit = TASK_SIZE; + + if (len > high_limit) + return -ENOMEM; + if (fixed) { + if (addr > high_limit - len) + return -ENOMEM; + } + if (unlikely(addr > mm->context.addr_limit && mm->context.addr_limit != TASK_SIZE)) mm->context.addr_limit = TASK_SIZE; - /* requested length too big for entire address space */ - if (len > mm->task_size - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) + if (fixed) return addr; - /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (mm->task_size - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) + if (high_limit - len >= addr && addr >= mmap_min_addr && + (!vma || addr + len <= vm_start_gap(vma))) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); - info.high_limit = mm->mmap_base; + info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW); info.align_mask = 0; - if (addr > DEFAULT_MAP_WINDOW) - info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; - addr = vm_unmapped_area(&info); if (!(addr & ~PAGE_MASK)) return addr; -- cgit From 4722476bce283149986a3463f61009dec0e7f6a1 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 10 Nov 2017 04:27:40 +1100 Subject: powerpc/64s: mm_context.addr_limit is only used on hash Radix keeps no meaningful state in addr_limit, so remove it from radix code and rename to slb_addr_limit to make it clear it applies to hash only. Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 2 +- arch/powerpc/include/asm/book3s/64/mmu.h | 2 +- arch/powerpc/include/asm/paca.h | 2 +- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/paca.c | 4 ++-- arch/powerpc/kernel/setup-common.c | 3 ++- arch/powerpc/mm/hugetlbpage-radix.c | 8 +------- arch/powerpc/mm/mmap.c | 18 ++++-------------- arch/powerpc/mm/mmu_context_book3s64.c | 4 ++-- arch/powerpc/mm/slb_low.S | 2 +- arch/powerpc/mm/slice.c | 22 +++++++++++----------- 11 files changed, 27 insertions(+), 42 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 508275bb05d5..e91e115a816f 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -606,7 +606,7 @@ extern void slb_set_size(u16 size); /* 4 bits per slice and we have one slice per 1TB */ #define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) -#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.addr_limit >> 41) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.slb_addr_limit >> 41) #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index c3b00e8ff791..49a07c5d9e50 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -92,7 +92,7 @@ typedef struct { #ifdef CONFIG_PPC_MM_SLICES u64 low_slices_psize; /* SLB page size encodings */ unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; - unsigned long addr_limit; + unsigned long slb_addr_limit; #else u16 sllp; /* SLB page size encoding */ #endif diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index c907ae23c956..3892db93b837 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -143,7 +143,7 @@ struct paca_struct { #ifdef CONFIG_PPC_MM_SLICES u64 mm_ctx_low_slices_psize; unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; - unsigned long addr_limit; + unsigned long mm_ctx_slb_addr_limit; #else u16 mm_ctx_user_psize; u16 mm_ctx_sllp; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 200623e71474..9aace433491a 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -185,7 +185,7 @@ int main(void) #ifdef CONFIG_PPC_MM_SLICES OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); - DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit)); + OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit); DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); #endif /* CONFIG_PPC_MM_SLICES */ #endif diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 5d38d5ea9a24..d6597038931d 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -262,8 +262,8 @@ void copy_mm_to_paca(struct mm_struct *mm) get_paca()->mm_ctx_id = context->id; #ifdef CONFIG_PPC_MM_SLICES - VM_BUG_ON(!mm->context.addr_limit); - get_paca()->addr_limit = mm->context.addr_limit; + VM_BUG_ON(!mm->context.slb_addr_limit); + get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit; get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; memcpy(&get_paca()->mm_ctx_high_slices_psize, &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index fa661ed616f5..2075322cd225 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -898,7 +898,8 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PPC_MM_SLICES #ifdef CONFIG_PPC64 - init_mm.context.addr_limit = DEFAULT_MAP_WINDOW_USER64; + if (!radix_enabled()) + init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; #else #error "context.addr_limit not initialized." #endif diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index 0a3d71aae175..b54b581a2f7d 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -60,16 +60,10 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -EINVAL; if (len > high_limit) return -ENOMEM; + if (fixed) { if (addr > high_limit - len) return -ENOMEM; - } - - if (unlikely(addr > mm->context.addr_limit && - mm->context.addr_limit != TASK_SIZE)) - mm->context.addr_limit = TASK_SIZE; - - if (fixed) { if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index 6d476a7b5611..d503f344e476 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -116,17 +116,12 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, if (len > high_limit) return -ENOMEM; + if (fixed) { if (addr > high_limit - len) return -ENOMEM; - } - - if (unlikely(addr > mm->context.addr_limit && - mm->context.addr_limit != TASK_SIZE)) - mm->context.addr_limit = TASK_SIZE; - - if (fixed) return addr; + } if (addr) { addr = PAGE_ALIGN(addr); @@ -165,17 +160,12 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, if (len > high_limit) return -ENOMEM; + if (fixed) { if (addr > high_limit - len) return -ENOMEM; - } - - if (unlikely(addr > mm->context.addr_limit && - mm->context.addr_limit != TASK_SIZE)) - mm->context.addr_limit = TASK_SIZE; - - if (fixed) return addr; + } if (addr) { addr = PAGE_ALIGN(addr); diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 846cbad45fce..5e193e444ee8 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -96,8 +96,8 @@ static int hash__init_new_context(struct mm_struct *mm) * In the case of exec, use the default limit, * otherwise inherit it from the mm we are duplicating. */ - if (!mm->context.addr_limit) - mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64; + if (!mm->context.slb_addr_limit) + mm->context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64; /* * The old code would re-promote on fork, we don't do that when using diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index ed60ad861dfa..2cf5ef3fc50d 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -167,7 +167,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) /* * user space make sure we are within the allowed limit */ - ld r11,PACA_ADDR_LIMIT(r13) + ld r11,PACA_SLB_ADDR_LIMIT(r13) cmpld r3,r11 bge- 8f diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index a4f93699194b..564fff06f5c1 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, { struct vm_area_struct *vma; - if ((mm->context.addr_limit - len) < addr) + if ((mm->context.slb_addr_limit - len) < addr) return 0; vma = find_vma(mm, addr); return (!vma || (addr + len) <= vm_start_gap(vma)); @@ -133,10 +133,10 @@ static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret) if (!slice_low_has_vma(mm, i)) ret->low_slices |= 1u << i; - if (mm->context.addr_limit <= SLICE_LOW_TOP) + if (mm->context.slb_addr_limit <= SLICE_LOW_TOP) return; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) if (!slice_high_has_vma(mm, i)) __set_bit(i, ret->high_slices); } @@ -157,7 +157,7 @@ static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_ma ret->low_slices |= 1u << i; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) { + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { mask_index = i & 0x1; index = i >> 1; if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) @@ -169,7 +169,7 @@ static int slice_check_fit(struct mm_struct *mm, struct slice_mask mask, struct slice_mask available) { DECLARE_BITMAP(result, SLICE_NUM_HIGH); - unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit); + unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); bitmap_and(result, mask.high_slices, available.high_slices, slice_count); @@ -219,7 +219,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz mm->context.low_slices_psize = lpsizes; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) { + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.slb_addr_limit); i++) { mask_index = i & 0x1; index = i >> 1; if (test_bit(i, mask.high_slices)) @@ -329,8 +329,8 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * Only for that request for which high_limit is above * DEFAULT_MAP_WINDOW we should apply this. */ - if (high_limit > DEFAULT_MAP_WINDOW) - addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW; + if (high_limit > DEFAULT_MAP_WINDOW) + addr += mm->context.slb_addr_limit - DEFAULT_MAP_WINDOW; while (addr > PAGE_SIZE) { info.high_limit = addr; @@ -432,8 +432,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, return -ENOMEM; } - if (high_limit > mm->context.addr_limit) { - mm->context.addr_limit = high_limit; + if (high_limit > mm->context.slb_addr_limit) { + mm->context.slb_addr_limit = high_limit; on_each_cpu(slice_flush_segments, mm, 1); } @@ -452,7 +452,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Sanity checks */ BUG_ON(mm->task_size == 0); - BUG_ON(mm->context.addr_limit == 0); + BUG_ON(mm->context.slb_addr_limit == 0); VM_BUG_ON(radix_enabled()); slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); -- cgit From 475b581ff57bc01437cbc680e281869918447763 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 14 Nov 2017 15:48:47 +1100 Subject: powerpc/64s: Fix masking of SRR1 bits on instruction fault On 64-bit Book3s, when we take an instruction fault the reason for the fault may be reported in SRR1. For data faults the reason is reported in DSISR (Data Storage Instruction Status Register). The reasons reported in each do not necessarily correspond, so we mask the SRR1 bits before copying them to the DSISR, which is then used by the page fault code. Prior to commit b4c001dc44f0 ("powerpc/mm: Use symbolic constants for filtering SRR1 bits on ISIs") we used a hard-coded mask of 0x58200000, which corresponds to: DSISR_NOHPTE 0x40000000 /* no translation found */ DSISR_NOEXEC_OR_G 0x10000000 /* exec of no-exec or guarded */ DSISR_PROTFAULT 0x08000000 /* protection fault */ DSISR_KEYFAULT 0x00200000 /* Storage Key fault */ That commit added a #define for the mask, DSISR_SRR1_MATCH_64S, but incorrectly used a different similarly named DSISR_BAD_FAULT_64S. This had the effect of changing the mask to 0xa43a0000, which omits everything but DSISR_KEYFAULT. Luckily this had no visible effect, because in practice we hardly use the DSISR bits. The lack of DSISR_NOHPTE means a TLB flush optimisation was missed in the native HPTE code, and DSISR_NOEXEC_OR_G and DSISR_PROTFAULT are both only used to trigger rare warnings. So we got lucky, but let's fix it. The new value only has bits between 17 and 30 set, so we can continue to use andis. Fixes: b4c001dc44f0 ("powerpc/mm: Use symbolic constants for filtering SRR1 bits on ISIs") Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 445137e2d0ca..4c4d20a29726 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -543,7 +543,7 @@ EXC_COMMON_BEGIN(instruction_access_common) RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,_NIP(r1) - andis. r4,r12,DSISR_BAD_FAULT_64S@h + andis. r4,r12,DSISR_SRR1_MATCH_64S@h li r5,0x400 std r3,_DAR(r1) std r4,_DSISR(r1) -- cgit From 3ffa9d9e2a7c10127d8cbf91ea2be15390b450ed Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 15 Nov 2017 14:25:42 +1100 Subject: powerpc/64s: Fix Power9 DD2.0 workarounds by adding DD2.1 feature Recently we added a CPU feature for Power9 DD2.0, to capture the fact that some workarounds are required only on Power9 DD1 and DD2.0 but not DD2.1 or later. Then in commit 9d2f510a66ec ("powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 ERAT workaround on DD2.1") and commit e3646330cf66 "powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 PMU workaround on DD2.1") we changed CPU_FTR_SECTIONs to check for DD1 or DD20, eg: BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20) Unfortunately although this reads as "if set DD1 or DD2.0", the or is a bitwise or and actually generates a mask of both bits. The code that does the feature patching then checks that the value of the CPU features masked with that mask are equal to the mask. So the end result is we're checking for DD1 and DD20 being set, which never happens. Yes the API is terrible. Removing the ERAT workaround on DD2.0 results in random SEGVs, the system tends to boot, but things randomly die including sometimes dhclient, udev etc. To fix the problem and hopefully avoid it in future, we remove the DD2.0 CPU feature and instead add a DD2.1 (or later) feature. This allows us to easily express that the workarounds are required if DD2.1 is not set. At some point we will drop the DD1 workarounds entirely and some of this can be cleaned up. Fixes: 9d2f510a66ec ("powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 ERAT workaround on DD2.1") Fixes: e3646330cf66 ("powerpc/64s/idle: avoid POWER9 DD1 and DD2.0 PMU workaround on DD2.1") Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/cputable.h | 7 ++++--- arch/powerpc/kernel/cputable.c | 6 +++--- arch/powerpc/kernel/dt_cpu_ftrs.c | 2 +- arch/powerpc/kernel/idle_book3s.S | 8 ++++---- 4 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 1ca26ca85ec6..04c31db5e7c3 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -215,7 +215,7 @@ enum { #define CPU_FTR_DABRX LONG_ASM_CONST(0x0800000000000000) #define CPU_FTR_PMAO_BUG LONG_ASM_CONST(0x1000000000000000) #define CPU_FTR_POWER9_DD1 LONG_ASM_CONST(0x4000000000000000) -#define CPU_FTR_POWER9_DD20 LONG_ASM_CONST(0x8000000000000000) +#define CPU_FTR_POWER9_DD2_1 LONG_ASM_CONST(0x8000000000000000) #ifndef __ASSEMBLY__ @@ -478,7 +478,8 @@ enum { CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300) #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \ (~CPU_FTR_SAO)) -#define CPU_FTRS_POWER9_DD20 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD20) +#define CPU_FTRS_POWER9_DD2_0 CPU_FTRS_POWER9 +#define CPU_FTRS_POWER9_DD2_1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD2_1) #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ @@ -498,7 +499,7 @@ enum { CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \ CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \ CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | \ - CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD20) + CPU_FTRS_POWER9_DD1 | CPU_FTRS_POWER9_DD2_1) #endif #else enum { diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 171820190de7..1350f49d81a8 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -551,7 +551,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .pvr_mask = 0xffffefff, .pvr_value = 0x004e0200, .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9_DD20, + .cpu_features = CPU_FTRS_POWER9_DD2_0, .cpu_user_features = COMMON_USER_POWER9, .cpu_user_features2 = COMMON_USER2_POWER9, .mmu_features = MMU_FTRS_POWER9, @@ -567,11 +567,11 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check_early = __machine_check_early_realmode_p9, .platform = "power9", }, - { /* Power9 */ + { /* Power9 DD 2.1 or later (see DD2.0 above) */ .pvr_mask = 0xffff0000, .pvr_value = 0x004e0000, .cpu_name = "POWER9 (raw)", - .cpu_features = CPU_FTRS_POWER9, + .cpu_features = CPU_FTRS_POWER9_DD2_1, .cpu_user_features = COMMON_USER_POWER9, .cpu_user_features2 = COMMON_USER2_POWER9, .mmu_features = MMU_FTRS_POWER9, diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 58e3b51de31c..602e0fde19b4 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -736,7 +736,7 @@ static __init void cpufeatures_cpu_quirks(void) if ((version & 0xffffff00) == 0x004e0100) cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD1; else if ((version & 0xffffefff) == 0x004e0200) - cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD20; + cur_cpu_spec->cpu_features &= ~CPU_FTR_POWER9_DD2_1; } static void __init cpufeatures_setup_finished(void) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 59657783d1d5..01e1c1997893 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -357,13 +357,13 @@ power_enter_stop: .Lhandle_esl_ec_set: BEGIN_FTR_SECTION /* - * POWER9 DD2 can incorrectly set PMAO when waking up after a - * state-loss idle. Saving and restoring MMCR0 over idle is a + * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after + * a state-loss idle. Saving and restoring MMCR0 over idle is a * workaround. */ mfspr r4,SPRN_MMCR0 std r4,_MMCR0(r1) -END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20) +END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) /* * Check if the requested state is a deep idle state. @@ -554,7 +554,7 @@ BEGIN_FTR_SECTION ld r1,PACAR1(r13) ld r4,_MMCR0(r1) mtspr SPRN_MMCR0,r4 -END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1 | CPU_FTR_POWER9_DD20) +END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1) mfspr r4,SPRN_MMCRA ori r4,r4,(1 << (63-60)) mtspr SPRN_MMCRA,r4 -- cgit