Diffstat (limited to 'arch/powerpc/platforms/powernv')
57 files changed, 6832 insertions, 5640 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 850eee860cf2..b5ad7c173ef0 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -2,48 +2,39 @@ config PPC_POWERNV depends on PPC64 && PPC_BOOK3S bool "IBM PowerNV (Non-Virtualized) platform support" - select PPC_NATIVE + select PPC_HASH_MMU_NATIVE if PPC_64S_HASH_MMU select PPC_XICS select PPC_ICP_NATIVE select PPC_XIVE_NATIVE select PPC_P7_NAP select FORCE_PCI select PCI_MSI + select IRQ_MSI_LIB select EPAPR_BOOT select PPC_INDIRECT_PIO select PPC_UDBG_16550 - select PPC_SCOM - select ARCH_RANDOM select CPU_FREQ select PPC_DOORBELL select MMU_NOTIFIER select FORCE_SMP + select ARCH_SUPPORTS_PER_VMA_LOCK + select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU default y config OPAL_PRD - tristate 'OPAL PRD driver' + tristate "OPAL PRD driver" depends on PPC_POWERNV help This enables the opal-prd driver, a facility to run processor recovery diagnostics on OpenPower machines config PPC_MEMTRACE - bool "Enable removal of RAM from kernel mappings for tracing" - depends on PPC_POWERNV && MEMORY_HOTREMOVE + bool "Enable runtime allocation of RAM for tracing" + depends on PPC_POWERNV && MEMORY_HOTPLUG && CONTIG_ALLOC help - Enabling this option allows for the removal of memory (RAM) - from the kernel mappings to be used for hardware tracing. + Enabling this option allows for runtime allocation of memory (RAM) + for hardware tracing. -config PPC_VAS - bool "IBM Virtual Accelerator Switchboard (VAS)" - depends on PPC_POWERNV && PPC_64K_PAGES - default y - help - This enables support for IBM Virtual Accelerator Switchboard (VAS). - - VAS allows accelerators in co-processors like NX-GZIP and NX-842 - to be accessible to kernel subsystems and user processes. - - VAS adapters are found in POWER9 based systems. - - If unsure, say N. 
+config SCOM_DEBUGFS + bool "Expose SCOM controllers via debugfs" + depends on DEBUG_FS diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index b540ce8eec55..9e5d0c847ee2 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,19 +1,32 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o -obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o + +# nothing that deals with real mode is safe to KASAN +# in particular, idle code runs a bunch of things in real mode +KASAN_SANITIZE_idle.o := n +KASAN_SANITIZE_pci-ioda.o := n +KASAN_SANITIZE_pci-ioda-tce.o := n +# pnv_machine_check_early +KASAN_SANITIZE_setup.o := n + +obj-y += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o +obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o +obj-y += ultravisor.o obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o -obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o -obj-$(CONFIG_CXL_BASE) += pci-cxl.o +obj-$(CONFIG_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o +obj-$(CONFIG_OPAL_CORE) += opal-core.o +obj-$(CONFIG_PCI) += pci.o pci-ioda.o pci-ioda-tce.o +obj-$(CONFIG_PCI_IOV) += pci-sriov.o obj-$(CONFIG_EEH) += eeh-powernv.o -obj-$(CONFIG_PPC_SCOM) += opal-xscom.o obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o -obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o obj-$(CONFIG_PERF_EVENTS) += opal-imc.o obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o -obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o +obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o vas-fault.o obj-$(CONFIG_OCXL_BASE) += ocxl.o +obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o +obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o diff --git a/arch/powerpc/platforms/powernv/copy-paste.h b/arch/powerpc/platforms/powernv/copy-paste.h index cb36f9fbcef3..f063807eda9a 100644 --- a/arch/powerpc/platforms/powernv/copy-paste.h +++ b/arch/powerpc/platforms/powernv/copy-paste.h @@ -1,10 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright 2016-17 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <asm/ppc-opcode.h> #include <asm/reg.h> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index f38078976c5d..db3370d1673c 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -1,14 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * The file intends to implement the platform dependent EEH operations on - * powernv platform. Actually, the powernv was created in order to fully - * hypervisor support. + * PowerNV Platform dependent EEH operations * * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
*/ #include <linux/atomic.h> @@ -17,6 +11,7 @@ #include <linux/export.h> #include <linux/init.h> #include <linux/interrupt.h> +#include <linux/irqdomain.h> #include <linux/list.h> #include <linux/msi.h> #include <linux/of.h> @@ -40,71 +35,14 @@ #include "powernv.h" #include "pci.h" +#include "../../../../drivers/pci/pci.h" static int eeh_event_irq = -EINVAL; -void pnv_pcibios_bus_add_device(struct pci_dev *pdev) +static void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { - struct pci_dn *pdn = pci_get_pdn(pdev); - - if (!pdev->is_virtfn) - return; - - /* - * The following operations will fail if VF's sysfs files - * aren't created or its resources aren't finalized. - */ - eeh_add_device_early(pdn); - eeh_add_device_late(pdev); - eeh_sysfs_add_device(pdev); -} - -static int pnv_eeh_init(void) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - int max_diag_size = PNV_PCI_DIAG_BUF_SIZE; - - if (!firmware_has_feature(FW_FEATURE_OPAL)) { - pr_warn("%s: OPAL is required !\n", - __func__); - return -EINVAL; - } - - /* Set probe mode */ - eeh_add_flag(EEH_PROBE_MODE_DEV); - - /* - * P7IOC blocks PCI config access to frozen PE, but PHB3 - * doesn't do that. So we have to selectively enable I/O - * prior to collecting error log. - */ - list_for_each_entry(hose, &hose_list, list_node) { - phb = hose->private_data; - - if (phb->model == PNV_PHB_MODEL_P7IOC) - eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); - - if (phb->diag_data_size > max_diag_size) - max_diag_size = phb->diag_data_size; - - /* - * PE#0 should be regarded as valid by EEH core - * if it's not the reserved one. Currently, we - * have the reserved PE#255 and PE#127 for PHB3 - * and P7IOC separately. So we should regard - * PE#0 as valid for PHB3 and P7IOC. - */ - if (phb->ioda.reserved_pe_idx != 0) - eeh_add_flag(EEH_VALID_PE_ZERO); - - break; - } - - eeh_set_pe_aux_size(max_diag_size); - ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device; - - return 0; + dev_dbg(&pdev->dev, "EEH: Setting up device\n"); + eeh_probe_device(pdev); } static irqreturn_t pnv_eeh_event(int irq, void *data) @@ -150,7 +88,7 @@ static ssize_t pnv_eeh_ei_write(struct file *filp, return -EINVAL; /* Retrieve PE */ - pe = eeh_pe_get(hose, pe_no, 0); + pe = eeh_pe_get(hose, pe_no); if (!pe) return -ENODEV; @@ -161,7 +99,6 @@ static ssize_t pnv_eeh_ei_write(struct file *filp, static const struct file_operations pnv_eeh_ei_fops = { .open = simple_open, - .llseek = no_llseek, .write = pnv_eeh_ei_write, }; @@ -205,6 +142,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10); #endif /* CONFIG_DEBUG_FS */ +static void pnv_eeh_enable_phbs(void) +{ + struct pci_controller *hose; + struct pnv_phb *phb; + + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + /* + * If EEH is enabled, we're going to rely on that. + * Otherwise, we restore to conventional mechanism + * to clear frozen PE during PCI config access. 
+ */ + if (eeh_enabled()) + phb->flags |= PNV_PHB_FLAG_EEH; + else + phb->flags &= ~PNV_PHB_FLAG_EEH; + } +} + /** * pnv_eeh_post_init - EEH platform dependent post initialization * @@ -219,9 +175,7 @@ int pnv_eeh_post_init(void) struct pnv_phb *phb; int ret = 0; - /* Probe devices & build address cache */ - eeh_probe_devices(); - eeh_addr_cache_build(); + eeh_show_enabled(); /* Register OPAL event notifier */ eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR)); @@ -243,19 +197,11 @@ int pnv_eeh_post_init(void) if (!eeh_enabled()) disable_irq(eeh_event_irq); + pnv_eeh_enable_phbs(); + list_for_each_entry(hose, &hose_list, list_node) { phb = hose->private_data; - /* - * If EEH is enabled, we're going to rely on that. - * Otherwise, we restore to conventional mechanism - * to clear frozen PE during PCI config access. - */ - if (eeh_enabled()) - phb->flags |= PNV_PHB_FLAG_EEH; - else - phb->flags &= ~PNV_PHB_FLAG_EEH; - /* Create debugfs entries */ #ifdef CONFIG_DEBUG_FS if (phb->has_dbgfs || !phb->dbgfs) @@ -344,28 +290,41 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap) return 0; } +static struct eeh_pe *pnv_eeh_get_upstream_pe(struct pci_dev *pdev) +{ + struct pci_controller *hose = pdev->bus->sysdata; + struct pnv_phb *phb = hose->private_data; + struct pci_dev *parent = pdev->bus->self; + +#ifdef CONFIG_PCI_IOV + /* for VFs we use the PF's PE as the upstream PE */ + if (pdev->is_virtfn) + parent = pdev->physfn; +#endif + + /* otherwise use the PE of our parent bridge */ + if (parent) { + struct pnv_ioda_pe *ioda_pe = pnv_ioda_get_pe(parent); + + return eeh_pe_get(phb->hose, ioda_pe->pe_number); + } + + return NULL; +} + /** * pnv_eeh_probe - Do probe on PCI device - * @pdn: PCI device node - * @data: unused + * @pdev: pci_dev to probe * - * When EEH module is installed during system boot, all PCI devices - * are checked one by one to see if it supports EEH. The function - * is introduced for the purpose. By default, EEH has been enabled - * on all PCI devices. That's to say, we only need do necessary - * initialization on the corresponding eeh device and create PE - * accordingly. - * - * It's notable that's unsafe to retrieve the EEH device through - * the corresponding PCI device. During the PCI device hotplug, which - * was possiblly triggered by EEH core, the binding between EEH device - * and the PCI device isn't built yet. + * Create, or find the existing, eeh_dev for this pci_dev. */ -static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) +static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev) { + struct pci_dn *pdn = pci_get_pdn(pdev); struct pci_controller *hose = pdn->phb; struct pnv_phb *phb = hose->private_data; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); + struct eeh_pe *upstream_pe; uint32_t pcie_flags; int ret; int config_addr = (pdn->busno << 8) | (pdn->devfn); @@ -379,18 +338,27 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) if (!edev || edev->pe) return NULL; + /* already configured? 
*/ + if (edev->pdev) { + pr_debug("%s: found existing edev for %04x:%02x:%02x.%01x\n", + __func__, hose->global_number, config_addr >> 8, + PCI_SLOT(config_addr), PCI_FUNC(config_addr)); + return edev; + } + /* Skip for PCI-ISA bridge */ - if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA) + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) return NULL; + eeh_edev_dbg(edev, "Probing device\n"); + /* Initialize eeh device */ - edev->class_code = pdn->class_code; edev->mode &= 0xFFFFFF00; edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX); edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP); edev->af_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF); edev->aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR); - if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) { + if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) { edev->mode |= EEH_DEV_BRIDGE; if (edev->pcie_cap) { pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS, @@ -405,12 +373,12 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) edev->pe_config_addr = phb->ioda.pe_rmap[config_addr]; + upstream_pe = pnv_eeh_get_upstream_pe(pdev); + /* Create PE */ - ret = eeh_add_to_parent_pe(edev); + ret = eeh_pe_tree_insert(edev, upstream_pe); if (ret) { - pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n", - __func__, hose->global_number, pdn->busno, - PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret); + eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret); return NULL; } @@ -422,7 +390,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) * should be blocked until PE reset. MMIO access is dropped * by hardware certainly. In order to drop PCI config requests, * one more flag (EEH_PE_CFG_RESTRICTED) is introduced, which - * will be checked in the backend for PE state retrival. If + * will be checked in the backend for PE state retrieval. If * the PE becomes frozen for the first time and the flag has * been set for the PE, we will set EEH_PE_CFG_BLOCKED for * that PE to block its config space. @@ -459,12 +427,18 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data) * Enable EEH explicitly so that we will do EEH check * while accessing I/O stuff */ - eeh_add_flag(EEH_ENABLED); + if (!eeh_has_flag(EEH_ENABLED)) { + enable_irq(eeh_event_irq); + pnv_eeh_enable_phbs(); + eeh_add_flag(EEH_ENABLED); + } /* Save memory bars */ eeh_save_bars(edev); - return NULL; + eeh_edev_dbg(edev, "EEH enabled on device\n"); + + return edev; } /** @@ -537,18 +511,6 @@ static int pnv_eeh_set_option(struct eeh_pe *pe, int option) return 0; } -/** - * pnv_eeh_get_pe_addr - Retrieve PE address - * @pe: EEH PE - * - * Retrieve the PE address according to the given tranditional - * PCI BDF (Bus/Device/Function) address. - */ -static int pnv_eeh_get_pe_addr(struct eeh_pe *pe) -{ - return pe->addr; -} - static void pnv_eeh_get_phb_diag(struct eeh_pe *pe) { struct pnv_phb *phb = pe->phb->private_data; @@ -843,7 +805,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option) int aer = edev ? 
edev->aer_cap : 0; u32 ctrl; - pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n", + pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n", __func__, pci_domain_nr(dev->bus), dev->bus->number, option); @@ -852,32 +814,32 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option) case EEH_RESET_HOT: /* Don't report linkDown event */ if (aer) { - eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, &ctrl); ctrl |= PCI_ERR_UNC_SURPDN; - eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, ctrl); } - eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl); + eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl); ctrl |= PCI_BRIDGE_CTL_BUS_RESET; - eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl); + eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl); msleep(EEH_PE_RST_HOLD_TIME); break; case EEH_RESET_DEACTIVATE: - eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl); + eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl); ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET; - eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl); + eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl); msleep(EEH_PE_RST_SETTLE_TIME); /* Continue reporting linkDown event */ if (aer) { - eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, &ctrl); ctrl &= ~PCI_ERR_UNC_SURPDN; - eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK, + eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK, 4, ctrl); } @@ -892,15 +854,18 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option) struct pci_controller *hose = pci_bus_to_host(pdev->bus); struct pnv_phb *phb = hose->private_data; struct device_node *dn = pci_device_to_OF_node(pdev); - uint64_t id = PCI_SLOT_ID(phb->opal_id, - (pdev->bus->number << 8) | pdev->devfn); + uint64_t id = PCI_SLOT_ID(phb->opal_id, pci_dev_id(pdev)); uint8_t scope; int64_t rc; /* Hot reset to the bus if firmware cannot handle */ - if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL)) + if (!dn || !of_property_present(dn, "ibm,reset-by-firmware")) return __pnv_eeh_bridge_reset(pdev, option); + pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n", + __func__, pci_domain_nr(pdev->bus), + pdev->bus->number, option); + switch (option) { case EEH_RESET_FUNDAMENTAL: scope = OPAL_RESET_PCI_FUNDAMENTAL; @@ -942,11 +907,12 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev) static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type, int pos, u16 mask) { + struct eeh_dev *edev = pdn->edev; int i, status = 0; /* Wait for Transaction Pending bit to be cleared */ for (i = 0; i < 4; i++) { - eeh_ops->read_config(pdn, pos, 2, &status); + eeh_ops->read_config(edev, pos, 2, &status); if (!(status & mask)) return; @@ -967,7 +933,7 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option) if (WARN_ON(!edev->pcie_cap)) return -ENOTTY; - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP, 4, ®); + eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCAP, 4, ®); if (!(reg & PCI_EXP_DEVCAP_FLR)) return -ENOTTY; @@ -977,18 +943,18 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option) pnv_eeh_wait_for_pending(pdn, "", edev->pcie_cap + PCI_EXP_DEVSTA, PCI_EXP_DEVSTA_TRPND); - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, ®); reg |= PCI_EXP_DEVCTL_BCR_FLR; - 
eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, reg); msleep(EEH_PE_RST_HOLD_TIME); break; case EEH_RESET_DEACTIVATE: - eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, ®); reg &= ~PCI_EXP_DEVCTL_BCR_FLR; - eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL, + eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL, 4, reg); msleep(EEH_PE_RST_SETTLE_TIME); break; @@ -1005,7 +971,7 @@ static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option) if (WARN_ON(!edev->af_cap)) return -ENOTTY; - eeh_ops->read_config(pdn, edev->af_cap + PCI_AF_CAP, 1, &cap); + eeh_ops->read_config(edev, edev->af_cap + PCI_AF_CAP, 1, &cap); if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR)) return -ENOTTY; @@ -1014,18 +980,18 @@ static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option) case EEH_RESET_FUNDAMENTAL: /* * Wait for Transaction Pending bit to clear. A word-aligned - * test is used, so we use the conrol offset rather than status + * test is used, so we use the control offset rather than status * and shift the test bit to match. */ pnv_eeh_wait_for_pending(pdn, "AF", edev->af_cap + PCI_AF_CTRL, PCI_AF_STATUS_TP << 8); - eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, + eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL, 1, PCI_AF_CTRL_FLR); msleep(EEH_PE_RST_HOLD_TIME); break; case EEH_RESET_DEACTIVATE: - eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, 1, 0); + eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL, 1, 0); msleep(EEH_PE_RST_SETTLE_TIME); break; } @@ -1081,7 +1047,7 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) * frozen state during PE reset. However, the good idea here from * benh is to keep frozen state before we get PE reset done completely * (until BAR restore). With the frozen state, HW drops illegal IO - * or MMIO access, which can incur recrusive frozen PE during PE + * or MMIO access, which can incur recursive frozen PE during PE * reset. The side effect is that EEH core has to clear the frozen * state explicitly after BAR restore. */ @@ -1119,17 +1085,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option) return -EIO; } + if (pci_is_root_bus(bus)) + return pnv_eeh_root_reset(hose, option); + /* - * If dealing with the root bus (or the bus underneath the - * root port), we reset the bus underneath the root port. + * For hot resets try use the generic PCI error recovery reset + * functions. These correctly handles the case where the secondary + * bus is behind a hotplug slot and it will use the slot provided + * reset methods to prevent spurious hotplug events during the reset. * - * The cxl driver depends on this behaviour for bi-modal card - * switching. + * Fundamental resets need to be handled internally to EEH since the + * PCI core doesn't really have a concept of a fundamental reset, + * mainly because there's no standard way to generate one. Only a + * few devices require an FRESET so it should be fine. */ - if (pci_is_root_bus(bus) || - pci_is_root_bus(bus->parent)) - return pnv_eeh_root_reset(hose, option); + if (option != EEH_RESET_FUNDAMENTAL) { + /* + * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the + * de-assert step. It's like the OPAL reset API was + * poorly designed or something... + */ + if (option == EEH_RESET_DEACTIVATE) + return 0; + rc = pci_bus_error_reset(bus->self); + if (!rc) + return 0; + } + + /* otherwise, use the generic bridge reset. 
this might call into FW */ + if (pci_is_root_bus(bus->parent)) + return pnv_eeh_root_reset(hose, option); return pnv_eeh_bridge_reset(bus->self, option); } @@ -1239,9 +1225,11 @@ static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn) return false; } -static int pnv_eeh_read_config(struct pci_dn *pdn, +static int pnv_eeh_read_config(struct eeh_dev *edev, int where, int size, u32 *val) { + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; @@ -1253,9 +1241,11 @@ static int pnv_eeh_read_config(struct pci_dn *pdn, return pnv_pci_cfg_read(pdn, where, size, val); } -static int pnv_eeh_write_config(struct pci_dn *pdn, +static int pnv_eeh_write_config(struct eeh_dev *edev, int where, int size, u32 val) { + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + if (!pdn) return PCIBIOS_DEVICE_NOT_FOUND; @@ -1367,7 +1357,7 @@ static int pnv_eeh_get_pe(struct pci_controller *hose, } /* Find the PE according to PE# */ - dev_pe = eeh_pe_get(hose, pe_no, 0); + dev_pe = eeh_pe_get(hose, pe_no); if (!dev_pe) return -EEXIST; @@ -1609,34 +1599,24 @@ static int pnv_eeh_next_error(struct eeh_pe **pe) return ret; } -static int pnv_eeh_restore_config(struct pci_dn *pdn) +static int pnv_eeh_restore_config(struct eeh_dev *edev) { - struct eeh_dev *edev = pdn_to_eeh_dev(pdn); struct pnv_phb *phb; s64 ret = 0; - int config_addr = (pdn->busno << 8) | (pdn->devfn); if (!edev) return -EEXIST; - /* - * We have to restore the PCI config space after reset since the - * firmware can't see SRIOV VFs. - * - * FIXME: The MPS, error routing rules, timeout setting are worthy - * to be exported by firmware in extendible way. - */ - if (edev->physfn) { - ret = eeh_restore_vf_config(pdn); - } else { - phb = pdn->phb->private_data; - ret = opal_pci_reinit(phb->opal_id, - OPAL_REINIT_PCI_DEV, config_addr); - } + if (edev->physfn) + return 0; + + phb = edev->controller->private_data; + ret = opal_pci_reinit(phb->opal_id, + OPAL_REINIT_PCI_DEV, edev->bdfn); if (ret) { pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n", - __func__, config_addr, ret); + __func__, edev->bdfn, ret); return -EIO; } @@ -1645,10 +1625,8 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn) static struct eeh_ops pnv_eeh_ops = { .name = "powernv", - .init = pnv_eeh_init, .probe = pnv_eeh_probe, .set_option = pnv_eeh_set_option, - .get_pe_addr = pnv_eeh_get_pe_addr, .get_state = pnv_eeh_get_state, .reset = pnv_eeh_reset, .get_log = pnv_eeh_get_log, @@ -1661,24 +1639,6 @@ static struct eeh_ops pnv_eeh_ops = { .notify_resume = NULL }; -#ifdef CONFIG_PCI_IOV -static void pnv_pci_fixup_vf_mps(struct pci_dev *pdev) -{ - struct pci_dn *pdn = pci_get_pdn(pdev); - int parent_mps; - - if (!pdev->is_virtfn) - return; - - /* Synchronize MPS for VF and PF */ - parent_mps = pcie_get_mps(pdev->physfn); - if ((128 << pdev->pcie_mpss) >= parent_mps) - pcie_set_mps(pdev, parent_mps); - pdn->mps = pcie_get_mps(pdev); -} -DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps); -#endif /* CONFIG_PCI_IOV */ - /** * eeh_powernv_init - Register platform dependent EEH operations * @@ -1687,9 +1647,44 @@ DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps); */ static int __init eeh_powernv_init(void) { + int max_diag_size = PNV_PCI_DIAG_BUF_SIZE; + struct pci_controller *hose; + struct pnv_phb *phb; int ret = -EINVAL; - ret = eeh_ops_register(&pnv_eeh_ops); + if (!firmware_has_feature(FW_FEATURE_OPAL)) { + pr_warn("%s: OPAL is required !\n", __func__); + return -EINVAL; + } + + /* Set probe mode */ + 
eeh_add_flag(EEH_PROBE_MODE_DEV); + + /* + * P7IOC blocks PCI config access to frozen PE, but PHB3 + * doesn't do that. So we have to selectively enable I/O + * prior to collecting error log. + */ + list_for_each_entry(hose, &hose_list, list_node) { + phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_P7IOC) + eeh_add_flag(EEH_ENABLE_IO_FOR_LOG); + + if (phb->diag_data_size > max_diag_size) + max_diag_size = phb->diag_data_size; + + break; + } + + /* + * eeh_init() allocates the eeh_pe and its aux data buf so the + * size needs to be set before calling eeh_init(). + */ + eeh_set_pe_aux_size(max_diag_size); + ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device; + + ret = eeh_init(&pnv_eeh_ops); if (!ret) pr_info("EEH: PowerNV platform initialized\n"); else @@ -1697,4 +1692,4 @@ static int __init eeh_powernv_init(void) return ret; } -machine_early_initcall(powernv, eeh_powernv_init); +machine_arch_initcall(powernv, eeh_powernv_init); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 35f699ebb662..d98b933e4984 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV cpuidle code * * Copyright 2015 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/types.h> @@ -17,11 +13,12 @@ #include <linux/cpu.h> #include <asm/firmware.h> +#include <asm/interrupt.h> #include <asm/machdep.h> #include <asm/opal.h> #include <asm/cputhreads.h> #include <asm/cpuidle.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/smp.h> #include <asm/runlatch.h> #include <asm/dbell.h> @@ -48,10 +45,10 @@ static u64 pnv_default_stop_mask; static bool default_stop_found; /* - * First deep stop state. Used to figure out when to save/restore - * hypervisor context. + * First stop state levels when SPR and TB loss can occur. */ -u64 pnv_first_deep_stop_state = MAX_STOP_STATE; +static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1; +static u64 deep_spr_loss_state = MAX_STOP_STATE + 1; /* * psscr value and mask of the deepest stop idle state. @@ -62,7 +59,9 @@ static u64 pnv_deepest_stop_psscr_mask; static u64 pnv_deepest_stop_flag; static bool deepest_stop_found; -static int pnv_save_sprs_for_deep_states(void) +static unsigned long power7_offline_type; + +static int __init pnv_save_sprs_for_deep_states(void) { int cpu; int rc; @@ -72,12 +71,9 @@ static int pnv_save_sprs_for_deep_states(void) * all cpus at boot. Get these reg values of current cpu and use the * same across all cpus. 
*/ - uint64_t lpcr_val = mfspr(SPRN_LPCR); - uint64_t hid0_val = mfspr(SPRN_HID0); - uint64_t hid1_val = mfspr(SPRN_HID1); - uint64_t hid4_val = mfspr(SPRN_HID4); - uint64_t hid5_val = mfspr(SPRN_HID5); - uint64_t hmeer_val = mfspr(SPRN_HMEER); + uint64_t lpcr_val = mfspr(SPRN_LPCR); + uint64_t hid0_val = mfspr(SPRN_HID0); + uint64_t hmeer_val = mfspr(SPRN_HMEER); uint64_t msr_val = MSR_IDLE; uint64_t psscr_val = pnv_deepest_stop_psscr_val; @@ -116,8 +112,11 @@ static int pnv_save_sprs_for_deep_states(void) if (rc != 0) return rc; - /* Only p8 needs to set extra HID regiters */ + /* Only p8 needs to set extra HID registers */ if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + uint64_t hid1_val = mfspr(SPRN_HID1); + uint64_t hid4_val = mfspr(SPRN_HID4); + uint64_t hid5_val = mfspr(SPRN_HID5); rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val); if (rc != 0) @@ -137,89 +136,6 @@ static int pnv_save_sprs_for_deep_states(void) return 0; } -static void pnv_alloc_idle_core_states(void) -{ - int i, j; - int nr_cores = cpu_nr_cores(); - u32 *core_idle_state; - - /* - * core_idle_state - The lower 8 bits track the idle state of - * each thread of the core. - * - * The most significant bit is the lock bit. - * - * Initially all the bits corresponding to threads_per_core - * are set. They are cleared when the thread enters deep idle - * state like sleep and winkle/stop. - * - * Initially the lock bit is cleared. The lock bit has 2 - * purposes: - * a. While the first thread in the core waking up from - * idle is restoring core state, it prevents other - * threads in the core from switching to process - * context. - * b. While the last thread in the core is saving the - * core state, it prevents a different thread from - * waking up. - */ - for (i = 0; i < nr_cores; i++) { - int first_cpu = i * threads_per_core; - int node = cpu_to_node(first_cpu); - size_t paca_ptr_array_size; - - core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); - *core_idle_state = (1 << threads_per_core) - 1; - paca_ptr_array_size = (threads_per_core * - sizeof(struct paca_struct *)); - - for (j = 0; j < threads_per_core; j++) { - int cpu = first_cpu + j; - - paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state; - paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING; - paca_ptrs[cpu]->thread_mask = 1 << j; - } - } - - update_subcore_sibling_mask(); - - if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) { - int rc = pnv_save_sprs_for_deep_states(); - - if (likely(!rc)) - return; - - /* - * The stop-api is unable to restore hypervisor - * resources on wakeup from platform idle states which - * lose full context. So disable such states. - */ - supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT; - pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n"); - pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n"); - - if (cpu_has_feature(CPU_FTR_ARCH_300) && - (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) { - /* - * Use the default stop state for CPU-Hotplug - * if available. 
- */ - if (default_stop_found) { - pnv_deepest_stop_psscr_val = - pnv_default_stop_val; - pnv_deepest_stop_psscr_mask = - pnv_default_stop_mask; - pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n", - pnv_deepest_stop_psscr_val); - } else { /* Fallback to snooze loop for CPU-Hotplug */ - deepest_stop_found = false; - pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n"); - } - } - } -} - u32 pnv_get_supported_cpuidle_states(void) { return supported_cpuidle_states; @@ -229,15 +145,22 @@ EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); static void pnv_fastsleep_workaround_apply(void *info) { + int cpu = smp_processor_id(); int rc; int *err = info; + if (cpu_first_thread_sibling(cpu) != cpu) + return; + rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP, OPAL_CONFIG_IDLE_APPLY); if (rc) *err = 1; } +static bool power7_fastsleep_workaround_entry = true; +static bool power7_fastsleep_workaround_exit = true; + /* * Used to store fastsleep workaround state * 0 - Workaround applied/undone at fastsleep entry/exit path (Default) @@ -255,7 +178,6 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - cpumask_t primary_thread_mask; int err; u8 val; @@ -269,40 +191,25 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, * fastsleep_workaround_applyonce = 1 implies * fastsleep workaround needs to be left in 'applied' state on all * the cores. Do this by- - * 1. Patching out the call to 'undo' workaround in fastsleep exit path - * 2. Sending ipi to all the cores which have at least one online thread - * 3. Patching out the call to 'apply' workaround in fastsleep entry - * path + * 1. Disable the 'undo' workaround in fastsleep exit path + * 2. Sendi IPIs to all the cores which have at least one online thread + * 3. Disable the 'apply' workaround in fastsleep entry path + * * There is no need to send ipi to cores which have all threads * offlined, as last thread of the core entering fastsleep or deeper * state would have applied workaround. 
*/ - err = patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_exit, - PPC_INST_NOP); - if (err) { - pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit"); - goto fail; - } + power7_fastsleep_workaround_exit = false; - get_online_cpus(); - primary_thread_mask = cpu_online_cores_map(); - on_each_cpu_mask(&primary_thread_mask, - pnv_fastsleep_workaround_apply, - &err, 1); - put_online_cpus(); + cpus_read_lock(); + on_each_cpu(pnv_fastsleep_workaround_apply, &err, 1); + cpus_read_unlock(); if (err) { pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply"); goto fail; } - err = patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_entry, - PPC_INST_NOP); - if (err) { - pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry"); - goto fail; - } + power7_fastsleep_workaround_entry = false; fastsleep_workaround_applyonce = 1; @@ -315,31 +222,353 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, show_fastsleep_workaround_applyonce, store_fastsleep_workaround_applyonce); -static unsigned long __power7_idle_type(unsigned long type) +static inline void atomic_start_thread_idle(void) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + int thread_nr = cpu_thread_in_core(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + + clear_bit(thread_nr, state); +} + +static inline void atomic_stop_thread_idle(void) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + int thread_nr = cpu_thread_in_core(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + + set_bit(thread_nr, state); +} + +static inline void atomic_lock_thread_idle(void) { + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *lock = &paca_ptrs[first]->idle_lock; + + while (unlikely(test_and_set_bit_lock(NR_PNV_CORE_IDLE_LOCK_BIT, lock))) + barrier(); +} + +static inline void atomic_unlock_and_stop_thread_idle(void) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long thread = 1UL << cpu_thread_in_core(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + unsigned long *lock = &paca_ptrs[first]->idle_lock; + u64 s = READ_ONCE(*state); + u64 new, tmp; + + BUG_ON(!(READ_ONCE(*lock) & PNV_CORE_IDLE_LOCK_BIT)); + BUG_ON(s & thread); + +again: + new = s | thread; + tmp = cmpxchg(state, s, new); + if (unlikely(tmp != s)) { + s = tmp; + goto again; + } + clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, lock); +} + +static inline void atomic_unlock_thread_idle(void) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *lock = &paca_ptrs[first]->idle_lock; + + BUG_ON(!test_bit(NR_PNV_CORE_IDLE_LOCK_BIT, lock)); + clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, lock); +} + +/* P7 and P8 */ +struct p7_sprs { + /* per core */ + u64 tscr; + u64 worc; + + /* per subcore */ + u64 sdr1; + u64 rpr; + + /* per thread */ + u64 lpcr; + u64 hfscr; + u64 fscr; + u64 purr; + u64 spurr; + u64 dscr; + u64 wort; + + /* per thread SPRs that get lost in shallow states */ + u64 amr; + u64 iamr; + u64 uamor; + /* amor is restored to constant ~0 */ +}; + +static unsigned long power7_idle_insn(unsigned long type) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + unsigned long thread = 1UL << 
cpu_thread_in_core(cpu); + unsigned long core_thread_mask = (1UL << threads_per_core) - 1; unsigned long srr1; + bool full_winkle; + struct p7_sprs sprs = {}; /* avoid false use-uninitialised */ + bool sprs_saved = false; + int rc; - if (!prep_irq_for_idle_irqsoff()) - return 0; + if (unlikely(type != PNV_THREAD_NAP)) { + atomic_lock_thread_idle(); + + BUG_ON(!(*state & thread)); + *state &= ~thread; + + if (power7_fastsleep_workaround_entry) { + if ((*state & core_thread_mask) == 0) { + rc = opal_config_cpu_idle_state( + OPAL_CONFIG_IDLE_FASTSLEEP, + OPAL_CONFIG_IDLE_APPLY); + BUG_ON(rc); + } + } + + if (type == PNV_THREAD_WINKLE) { + sprs.tscr = mfspr(SPRN_TSCR); + sprs.worc = mfspr(SPRN_WORC); + + sprs.sdr1 = mfspr(SPRN_SDR1); + sprs.rpr = mfspr(SPRN_RPR); + + sprs.lpcr = mfspr(SPRN_LPCR); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + sprs.hfscr = mfspr(SPRN_HFSCR); + sprs.fscr = mfspr(SPRN_FSCR); + } + sprs.purr = mfspr(SPRN_PURR); + sprs.spurr = mfspr(SPRN_SPURR); + sprs.dscr = mfspr(SPRN_DSCR); + sprs.wort = mfspr(SPRN_WORT); + + sprs_saved = true; + + /* + * Increment winkle counter and set all winkle bits if + * all threads are winkling. This allows wakeup side to + * distinguish between fast sleep and winkle state + * loss. Fast sleep still has to resync the timebase so + * this may not be a really big win. + */ + *state += 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT; + if ((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) + >> PNV_CORE_IDLE_WINKLE_COUNT_SHIFT + == threads_per_core) + *state |= PNV_CORE_IDLE_THREAD_WINKLE_BITS; + WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0); + } + + atomic_unlock_thread_idle(); + } + + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + sprs.amr = mfspr(SPRN_AMR); + sprs.iamr = mfspr(SPRN_IAMR); + sprs.uamor = mfspr(SPRN_UAMOR); + } + + local_paca->thread_idle_state = type; + srr1 = isa206_idle_insn_mayloss(type); /* go idle */ + local_paca->thread_idle_state = PNV_THREAD_RUNNING; + + WARN_ON_ONCE(!srr1); + WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); + + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { + /* + * We don't need an isync after the mtsprs here because + * the upcoming mtmsrd is execution synchronizing. 
+ */ + mtspr(SPRN_AMR, sprs.amr); + mtspr(SPRN_IAMR, sprs.iamr); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_UAMOR, sprs.uamor); + } + } + + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) + hmi_exception_realmode(NULL); + + if (likely((srr1 & SRR1_WAKESTATE) != SRR1_WS_HVLOSS)) { + if (unlikely(type != PNV_THREAD_NAP)) { + atomic_lock_thread_idle(); + if (type == PNV_THREAD_WINKLE) { + WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0); + *state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT; + *state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT); + } + atomic_unlock_and_stop_thread_idle(); + } + return srr1; + } + + /* HV state loss */ + BUG_ON(type == PNV_THREAD_NAP); + + atomic_lock_thread_idle(); + + full_winkle = false; + if (type == PNV_THREAD_WINKLE) { + WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0); + *state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT; + if (*state & (thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT)) { + *state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT); + full_winkle = true; + BUG_ON(!sprs_saved); + } + } + + WARN_ON(*state & thread); + + if ((*state & core_thread_mask) != 0) + goto core_woken; + + /* Per-core SPRs */ + if (full_winkle) { + mtspr(SPRN_TSCR, sprs.tscr); + mtspr(SPRN_WORC, sprs.worc); + } + + if (power7_fastsleep_workaround_exit) { + rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP, + OPAL_CONFIG_IDLE_UNDO); + BUG_ON(rc); + } + + /* TB */ + if (opal_resync_timebase() != OPAL_SUCCESS) + BUG(); + +core_woken: + if (!full_winkle) + goto subcore_woken; + + if ((*state & local_paca->subcore_sibling_mask) != 0) + goto subcore_woken; + + /* Per-subcore SPRs */ + mtspr(SPRN_SDR1, sprs.sdr1); + mtspr(SPRN_RPR, sprs.rpr); + +subcore_woken: + /* + * isync after restoring shared SPRs and before unlocking. Unlock + * only contains hwsync which does not necessarily do the right + * thing for SPRs. + */ + isync(); + atomic_unlock_and_stop_thread_idle(); + + /* Fast sleep does not lose SPRs */ + if (!full_winkle) + return srr1; + + /* Per-thread SPRs */ + mtspr(SPRN_LPCR, sprs.lpcr); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + mtspr(SPRN_HFSCR, sprs.hfscr); + mtspr(SPRN_FSCR, sprs.fscr); + } + mtspr(SPRN_PURR, sprs.purr); + mtspr(SPRN_SPURR, sprs.spurr); + mtspr(SPRN_DSCR, sprs.dscr); + mtspr(SPRN_WORT, sprs.wort); + + mtspr(SPRN_SPRG3, local_paca->sprg_vdso); + +#ifdef CONFIG_PPC_64S_HASH_MMU + /* + * The SLB has to be restored here, but it sometimes still + * contains entries, so the __ variant must be used to prevent + * multi hits. + */ + __slb_restore_bolted_realmode(); +#endif + + return srr1; +} + +extern unsigned long idle_kvm_start_guest(unsigned long srr1); + +#ifdef CONFIG_HOTPLUG_CPU +static unsigned long power7_offline(void) +{ + unsigned long srr1; + + mtmsr(MSR_IDLE); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* Tell KVM we're entering idle. */ + /******************************************************/ + /* N O T E W E L L ! ! ! N O T E W E L L */ + /* The following store to HSTATE_HWTHREAD_STATE(r13) */ + /* MUST occur in real mode, i.e. with the MMU off, */ + /* and the MMU must stay off until we clear this flag */ + /* and test HSTATE_HWTHREAD_REQ(r13) in */ + /* pnv_powersave_wakeup in this file. 
*/ + /* The reason is that another thread can switch the */ + /* MMU to a guest context whenever this flag is set */ + /* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */ + /* that would potentially cause this thread to start */ + /* executing instructions from guest memory in */ + /* hypervisor mode, leading to a host crash or data */ + /* corruption, or worse. */ + /******************************************************/ + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE; +#endif __ppc64_runlatch_off(); - srr1 = power7_idle_insn(type); + srr1 = power7_idle_insn(power7_offline_type); __ppc64_runlatch_on(); - fini_irq_for_idle_irqsoff(); +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL; + /* Order setting hwthread_state vs. testing hwthread_req */ + smp_mb(); + if (local_paca->kvm_hstate.hwthread_req) + srr1 = idle_kvm_start_guest(srr1); +#endif + + mtmsr(MSR_KERNEL); return srr1; } +#endif void power7_idle_type(unsigned long type) { unsigned long srr1; - srr1 = __power7_idle_type(type); + if (!prep_irq_for_idle_irqsoff()) + return; + + mtmsr(MSR_IDLE); + __ppc64_runlatch_off(); + srr1 = power7_idle_insn(type); + __ppc64_runlatch_on(); + mtmsr(MSR_KERNEL); + + fini_irq_for_idle_irqsoff(); irq_set_pending_from_srr1(srr1); } -void power7_idle(void) +static void power7_idle(void) { if (!powersave_nap) return; @@ -347,42 +576,232 @@ void power7_idle(void) power7_idle_type(PNV_THREAD_NAP); } -static unsigned long __power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask) +struct p9_sprs { + /* per core */ + u64 ptcr; + u64 rpr; + u64 tscr; + u64 ldbar; + + /* per thread */ + u64 lpcr; + u64 hfscr; + u64 fscr; + u64 pid; + u64 purr; + u64 spurr; + u64 dscr; + u64 ciabr; + + u64 mmcra; + u32 mmcr0; + u32 mmcr1; + u64 mmcr2; + + /* per thread SPRs that get lost in shallow states */ + u64 amr; + u64 iamr; + u64 amor; + u64 uamor; +}; + +static unsigned long power9_idle_stop(unsigned long psscr) { - unsigned long psscr; + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + unsigned long core_thread_mask = (1UL << threads_per_core) - 1; unsigned long srr1; + unsigned long pls; + unsigned long mmcr0 = 0; + unsigned long mmcra = 0; + struct p9_sprs sprs = {}; /* avoid false used-uninitialised */ + bool sprs_saved = false; - if (!prep_irq_for_idle_irqsoff()) - return 0; + if (!(psscr & (PSSCR_EC|PSSCR_ESL))) { + /* EC=ESL=0 case */ + + /* + * Wake synchronously. SRESET via xscom may still cause + * a 0x100 powersave wakeup with SRR1 reason! + */ + srr1 = isa300_idle_stop_noloss(psscr); /* go idle */ + if (likely(!srr1)) + return 0; + + /* + * Registers not saved, can't recover! + * This would be a hardware bug + */ + BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS); + + goto out; + } + + /* EC=ESL=1 case */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (cpu_has_feature(CPU_FTR_P9_TM_XER_SO_BUG)) { + local_paca->requested_psscr = psscr; + /* order setting requested_psscr vs testing dont_stop */ + smp_mb(); + if (atomic_read(&local_paca->dont_stop)) { + local_paca->requested_psscr = 0; + return 0; + } + } +#endif + + if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) { + /* + * POWER9 DD2 can incorrectly set PMAO when waking up + * after a state-loss idle. Saving and restoring MMCR0 + * over idle is a workaround. 
+ */ + mmcr0 = mfspr(SPRN_MMCR0); + } + + if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) { + sprs.lpcr = mfspr(SPRN_LPCR); + sprs.hfscr = mfspr(SPRN_HFSCR); + sprs.fscr = mfspr(SPRN_FSCR); + sprs.pid = mfspr(SPRN_PID); + sprs.purr = mfspr(SPRN_PURR); + sprs.spurr = mfspr(SPRN_SPURR); + sprs.dscr = mfspr(SPRN_DSCR); + sprs.ciabr = mfspr(SPRN_CIABR); + + sprs.mmcra = mfspr(SPRN_MMCRA); + sprs.mmcr0 = mfspr(SPRN_MMCR0); + sprs.mmcr1 = mfspr(SPRN_MMCR1); + sprs.mmcr2 = mfspr(SPRN_MMCR2); + + sprs.ptcr = mfspr(SPRN_PTCR); + sprs.rpr = mfspr(SPRN_RPR); + sprs.tscr = mfspr(SPRN_TSCR); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + sprs.ldbar = mfspr(SPRN_LDBAR); + + sprs_saved = true; + + atomic_start_thread_idle(); + } + + sprs.amr = mfspr(SPRN_AMR); + sprs.iamr = mfspr(SPRN_IAMR); + sprs.uamor = mfspr(SPRN_UAMOR); + + srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */ + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + local_paca->requested_psscr = 0; +#endif psscr = mfspr(SPRN_PSSCR); - psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; - __ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr); - __ppc64_runlatch_on(); + WARN_ON_ONCE(!srr1); + WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); - fini_irq_for_idle_irqsoff(); + if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) { + /* + * We don't need an isync after the mtsprs here because the + * upcoming mtmsrd is execution synchronizing. + */ + mtspr(SPRN_AMR, sprs.amr); + mtspr(SPRN_IAMR, sprs.iamr); + mtspr(SPRN_AMOR, ~0); + mtspr(SPRN_UAMOR, sprs.uamor); - return srr1; -} + /* + * Workaround for POWER9 DD2.0, if we lost resources, the ERAT + * might have been corrupted and needs flushing. We also need + * to reload MMCR0 (see mmcr0 comment above). + */ + if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) { + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT); + mtspr(SPRN_MMCR0, mmcr0); + } -void power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask) -{ - unsigned long srr1; + /* + * DD2.2 and earlier need to set then clear bit 60 in MMCRA + * to ensure the PMU starts running. + */ + mmcra = mfspr(SPRN_MMCRA); + mmcra |= PPC_BIT(60); + mtspr(SPRN_MMCRA, mmcra); + mmcra &= ~PPC_BIT(60); + mtspr(SPRN_MMCRA, mmcra); + } - srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask); - irq_set_pending_from_srr1(srr1); -} + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) + hmi_exception_realmode(NULL); -/* - * Used for ppc_md.power_save which needs a function with no parameters - */ -void power9_idle(void) -{ - power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask); + /* + * On POWER9, SRR1 bits do not match exactly as expected. + * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so + * just always test PSSCR for SPR/TB state loss. + */ + pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT; + if (likely(pls < deep_spr_loss_state)) { + if (sprs_saved) + atomic_stop_thread_idle(); + goto out; + } + + /* HV state loss */ + BUG_ON(!sprs_saved); + + atomic_lock_thread_idle(); + + if ((*state & core_thread_mask) != 0) + goto core_woken; + + /* Per-core SPRs */ + mtspr(SPRN_PTCR, sprs.ptcr); + mtspr(SPRN_RPR, sprs.rpr); + mtspr(SPRN_TSCR, sprs.tscr); + + if (pls >= pnv_first_tb_loss_level) { + /* TB loss */ + if (opal_resync_timebase() != OPAL_SUCCESS) + BUG(); + } + + /* + * isync after restoring shared SPRs and before unlocking. Unlock + * only contains hwsync which does not necessarily do the right + * thing for SPRs. 
+ */ + isync(); + +core_woken: + atomic_unlock_and_stop_thread_idle(); + + /* Per-thread SPRs */ + mtspr(SPRN_LPCR, sprs.lpcr); + mtspr(SPRN_HFSCR, sprs.hfscr); + mtspr(SPRN_FSCR, sprs.fscr); + mtspr(SPRN_PID, sprs.pid); + mtspr(SPRN_PURR, sprs.purr); + mtspr(SPRN_SPURR, sprs.spurr); + mtspr(SPRN_DSCR, sprs.dscr); + mtspr(SPRN_CIABR, sprs.ciabr); + + mtspr(SPRN_MMCRA, sprs.mmcra); + mtspr(SPRN_MMCR0, sprs.mmcr0); + mtspr(SPRN_MMCR1, sprs.mmcr1); + mtspr(SPRN_MMCR2, sprs.mmcr2); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + mtspr(SPRN_LDBAR, sprs.ldbar); + + mtspr(SPRN_SPRG3, local_paca->sprg_vdso); + + if (!radix_enabled()) + __slb_restore_bolted_realmode(); + +out: + mtmsr(MSR_KERNEL); + + return srr1; } #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -409,7 +828,7 @@ void pnv_power9_force_smt4_catch(void) atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop); } /* order setting dont_stop vs testing requested_psscr */ - mb(); + smp_mb(); for (thr = 0; thr < threads_per_core; ++thr) { if (!paca_ptrs[cpu0+thr]->requested_psscr) ++awake_threads; @@ -457,8 +876,168 @@ void pnv_power9_force_smt4_release(void) EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ +struct p10_sprs { + /* + * SPRs that get lost in shallow states: + * + * P10 loses CR, LR, CTR, FPSCR, VSCR, XER, TAR, SPRG2, and HSPRG1 + * isa300 idle routines restore CR, LR. + * CTR is volatile + * idle thread doesn't use FP or VEC + * kernel doesn't use TAR + * HSPRG1 is only live in HV interrupt entry + * SPRG2 is only live in KVM guests, KVM handles it. + */ +}; + +static unsigned long power10_idle_stop(unsigned long psscr) +{ + int cpu = raw_smp_processor_id(); + int first = cpu_first_thread_sibling(cpu); + unsigned long *state = &paca_ptrs[first]->idle_state; + unsigned long core_thread_mask = (1UL << threads_per_core) - 1; + unsigned long srr1; + unsigned long pls; +// struct p10_sprs sprs = {}; /* avoid false used-uninitialised */ + bool sprs_saved = false; + + if (!(psscr & (PSSCR_EC|PSSCR_ESL))) { + /* EC=ESL=0 case */ + + /* + * Wake synchronously. SRESET via xscom may still cause + * a 0x100 powersave wakeup with SRR1 reason! + */ + srr1 = isa300_idle_stop_noloss(psscr); /* go idle */ + if (likely(!srr1)) + return 0; + + /* + * Registers not saved, can't recover! + * This would be a hardware bug + */ + BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS); + + goto out; + } + + /* EC=ESL=1 case */ + if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) { + /* XXX: save SPRs for deep state loss here. */ + + sprs_saved = true; + + atomic_start_thread_idle(); + } + + srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */ + + psscr = mfspr(SPRN_PSSCR); + + WARN_ON_ONCE(!srr1); + WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR)); + + if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)) + hmi_exception_realmode(NULL); + + /* + * On POWER10, SRR1 bits do not match exactly as expected. + * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so + * just always test PSSCR for SPR/TB state loss. + */ + pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT; + if (likely(pls < deep_spr_loss_state)) { + if (sprs_saved) + atomic_stop_thread_idle(); + goto out; + } + + /* HV state loss */ + BUG_ON(!sprs_saved); + + atomic_lock_thread_idle(); + + if ((*state & core_thread_mask) != 0) + goto core_woken; + + /* XXX: restore per-core SPRs here */ + + if (pls >= pnv_first_tb_loss_level) { + /* TB loss */ + if (opal_resync_timebase() != OPAL_SUCCESS) + BUG(); + } + + /* + * isync after restoring shared SPRs and before unlocking. 
Unlock + * only contains hwsync which does not necessarily do the right + * thing for SPRs. + */ + isync(); + +core_woken: + atomic_unlock_and_stop_thread_idle(); + + /* XXX: restore per-thread SPRs here */ + + if (!radix_enabled()) + __slb_restore_bolted_realmode(); + +out: + mtmsr(MSR_KERNEL); + + return srr1; +} + +#ifdef CONFIG_HOTPLUG_CPU +static unsigned long arch300_offline_stop(unsigned long psscr) +{ + unsigned long srr1; + + if (cpu_has_feature(CPU_FTR_ARCH_31)) + srr1 = power10_idle_stop(psscr); + else + srr1 = power9_idle_stop(psscr); + + return srr1; +} +#endif + +void arch300_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + unsigned long psscr; + unsigned long srr1; + + if (!prep_irq_for_idle_irqsoff()) + return; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + __ppc64_runlatch_off(); + if (cpu_has_feature(CPU_FTR_ARCH_31)) + srr1 = power10_idle_stop(psscr); + else + srr1 = power9_idle_stop(psscr); + __ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + irq_set_pending_from_srr1(srr1); +} + +/* + * Used for ppc_md.power_save which needs a function with no parameters + */ +static void arch300_idle(void) +{ + arch300_idle_type(pnv_default_stop_val, pnv_default_stop_mask); +} + #ifdef CONFIG_HOTPLUG_CPU -static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) + +void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) { u64 pir = get_hard_smp_processor_id(cpu); @@ -480,21 +1059,6 @@ static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) unsigned long pnv_cpu_offline(unsigned int cpu) { unsigned long srr1; - u32 idle_states = pnv_get_supported_cpuidle_states(); - u64 lpcr_val; - - /* - * We don't want to take decrementer interrupts while we are - * offline, so clear LPCR:PECE1. We keep PECE2 (and - * LPCR_PECE_HVEE on P9) enabled as to let IPIs in. - * - * If the CPU gets woken up by a special wakeup, ensure that - * the SLW engine sets LPCR with decrementer bit cleared, else - * the CPU will come back to the kernel due to a spurious - * wakeup. - */ - lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; - pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); __ppc64_runlatch_off(); @@ -504,16 +1068,9 @@ unsigned long pnv_cpu_offline(unsigned int cpu) psscr = mfspr(SPRN_PSSCR); psscr = (psscr & ~pnv_deepest_stop_psscr_mask) | pnv_deepest_stop_psscr_val; - srr1 = power9_offline_stop(psscr); - - } else if ((idle_states & OPAL_PM_WINKLE_ENABLED) && - (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) { - srr1 = power7_idle_insn(PNV_THREAD_WINKLE); - } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || - (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_idle_insn(PNV_THREAD_SLEEP); - } else if (idle_states & OPAL_PM_NAP_ENABLED) { - srr1 = power7_idle_insn(PNV_THREAD_NAP); + srr1 = arch300_offline_stop(psscr); + } else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) { + srr1 = power7_offline(); } else { /* This is the fallback method. We emulate snooze */ while (!generic_check_cpu_restart(cpu)) { @@ -526,16 +1083,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu) __ppc64_runlatch_on(); - /* - * Re-enable decrementer interrupts in LPCR. - * - * Further, we want stop states to be woken up by decrementer - * for non-hotplug cases. So program the LPCR via stop api as - * well. 
- */ - lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; - pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); - return srr1; } #endif @@ -577,7 +1124,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) * stop instruction */ -int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags) +int __init validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags) { int err = 0; @@ -619,33 +1166,53 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags) * @dt_idle_states: Number of idle state entries * Returns 0 on success */ -static int __init pnv_power9_idle_init(void) +static void __init pnv_arch300_idle_init(void) { u64 max_residency_ns = 0; int i; + /* stop is not really architected, we only have p9,p10 drivers */ + if (!pvr_version_is(PVR_POWER10) && !pvr_version_is(PVR_POWER9)) + return; + /* - * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask}, - * and the pnv_default_stop_{val,mask}. - * - * pnv_first_deep_stop_state should be set to the first stop - * level to cause hypervisor state loss. - * * pnv_deepest_stop_{val,mask} should be set to values corresponding to * the deepest stop state. * * pnv_default_stop_{val,mask} should be set to values corresponding to - * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state. + * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state. */ - pnv_first_deep_stop_state = MAX_STOP_STATE; + pnv_first_tb_loss_level = MAX_STOP_STATE + 1; + deep_spr_loss_state = MAX_STOP_STATE + 1; for (i = 0; i < nr_pnv_idle_states; i++) { int err; struct pnv_idle_states_t *state = &pnv_idle_states[i]; u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK; + /* No deep loss driver implemented for POWER10 yet */ + if (pvr_version_is(PVR_POWER10) && + state->flags & (OPAL_PM_TIMEBASE_STOP|OPAL_PM_LOSE_FULL_CONTEXT)) + continue; + + if ((state->flags & OPAL_PM_TIMEBASE_STOP) && + (pnv_first_tb_loss_level > psscr_rl)) + pnv_first_tb_loss_level = psscr_rl; + if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) && - pnv_first_deep_stop_state > psscr_rl) - pnv_first_deep_stop_state = psscr_rl; + (deep_spr_loss_state > psscr_rl)) + deep_spr_loss_state = psscr_rl; + + /* + * The idle code does not deal with TB loss occurring + * in a shallower state than SPR loss, so force it to + * behave like SPRs are lost if TB is lost. POWER9 would + * never encounter this, but a POWER8 core would if it + * implemented the stop instruction. So this is for forward + * compatibility. + */ + if ((state->flags & OPAL_PM_TIMEBASE_STOP) && + (deep_spr_loss_state > psscr_rl)) + deep_spr_loss_state = psscr_rl; err = validate_psscr_val_mask(&state->psscr_val, &state->psscr_mask, @@ -670,13 +1237,14 @@ static int __init pnv_power9_idle_init(void) pnv_default_stop_val = state->psscr_val; pnv_default_stop_mask = state->psscr_mask; default_stop_found = true; + WARN_ON(state->flags & OPAL_PM_LOSE_FULL_CONTEXT); } } if (unlikely(!default_stop_found)) { pr_warn("cpuidle-powernv: No suitable default stop state found. 
Disabling platform idle.\n"); } else { - ppc_md.power_save = power9_idle; + ppc_md.power_save = arch300_idle; pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_default_stop_val, pnv_default_stop_mask); } @@ -689,10 +1257,40 @@ static int __init pnv_power9_idle_init(void) pnv_deepest_stop_psscr_mask); } - pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n", - pnv_first_deep_stop_state); + pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n", + deep_spr_loss_state); - return 0; + pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n", + pnv_first_tb_loss_level); +} + +static void __init pnv_disable_deep_states(void) +{ + /* + * The stop-api is unable to restore hypervisor + * resources on wakeup from platform idle states which + * lose full context. So disable such states. + */ + supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT; + pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n"); + pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n"); + + if (cpu_has_feature(CPU_FTR_ARCH_300) && + (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) { + /* + * Use the default stop state for CPU-Hotplug + * if available. + */ + if (default_stop_found) { + pnv_deepest_stop_psscr_val = pnv_default_stop_val; + pnv_deepest_stop_psscr_mask = pnv_default_stop_mask; + pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n", + pnv_deepest_stop_psscr_val); + } else { /* Fallback to snooze loop for CPU-Hotplug */ + deepest_stop_found = false; + pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n"); + } + } } /* @@ -707,10 +1305,8 @@ static void __init pnv_probe_idle_states(void) return; } - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - if (pnv_power9_idle_init()) - return; - } + if (cpu_has_feature(CPU_FTR_ARCH_300)) + pnv_arch300_idle_init(); for (i = 0; i < nr_pnv_idle_states; i++) supported_cpuidle_states |= pnv_idle_states[i].flags; @@ -722,7 +1318,7 @@ static void __init pnv_probe_idle_states(void) * which is the number of cpuidle states discovered through device-tree. 
*/ -static int pnv_parse_cpuidle_dt(void) +static int __init pnv_parse_cpuidle_dt(void) { struct device_node *np; int nr_idle_states, i; @@ -774,14 +1370,14 @@ static int pnv_parse_cpuidle_dt(void) /* Read residencies */ if (of_property_read_u32_array(np, "ibm,cpu-idle-state-residency-ns", temp_u32, nr_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-latencies-ns in DT\n"); + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n"); rc = -EINVAL; goto out; } for (i = 0; i < nr_idle_states; i++) pnv_idle_states[i].residency_ns = temp_u32[i]; - /* For power9 */ + /* For power9 and later */ if (cpu_has_feature(CPU_FTR_ARCH_300)) { /* Read pm_crtl_val */ if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr", @@ -817,7 +1413,7 @@ static int pnv_parse_cpuidle_dt(void) goto out; } for (i = 0; i < nr_idle_states; i++) - strlcpy(pnv_idle_states[i].name, temp_string[i], + strscpy(pnv_idle_states[i].name, temp_string[i], PNV_IDLE_NAME_LEN); nr_pnv_idle_states = nr_idle_states; rc = 0; @@ -825,16 +1421,39 @@ out: kfree(temp_u32); kfree(temp_u64); kfree(temp_string); + of_node_put(np); return rc; } static int __init pnv_init_idle_states(void) { + int cpu; int rc = 0; - supported_cpuidle_states = 0; + + /* Set up PACA fields */ + for_each_present_cpu(cpu) { + struct paca_struct *p = paca_ptrs[cpu]; + + p->idle_state = 0; + if (cpu == cpu_first_thread_sibling(cpu)) + p->idle_state = (1 << threads_per_core) - 1; + + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* P7/P8 nap */ + p->thread_idle_state = PNV_THREAD_RUNNING; + } else if (pvr_version_is(PVR_POWER9)) { + /* P9 stop workarounds */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + p->requested_psscr = 0; + atomic_set(&p->dont_stop, 0); +#endif + } + } /* In case we error out nr_pnv_idle_states will be zero */ nr_pnv_idle_states = 0; + supported_cpuidle_states = 0; + if (cpuidle_disable != IDLE_NO_OVERRIDE) goto out; rc = pnv_parse_cpuidle_dt(); @@ -842,27 +1461,45 @@ static int __init pnv_init_idle_states(void) return rc; pnv_probe_idle_states(); - if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_entry, - PPC_INST_NOP); - patch_instruction( - (unsigned int *)pnv_fastsleep_workaround_at_exit, - PPC_INST_NOP); - } else { - /* - * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that - * workaround is needed to use fastsleep. Provide sysfs - * control to choose how this workaround has to be applied. - */ - device_create_file(cpu_subsys.dev_root, - &dev_attr_fastsleep_workaround_applyonce); - } + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + power7_fastsleep_workaround_entry = false; + power7_fastsleep_workaround_exit = false; + } else { + struct device *dev_root; + /* + * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that + * workaround is needed to use fastsleep. Provide sysfs + * control to choose how this workaround has to be + * applied. 
+ */ + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + device_create_file(dev_root, + &dev_attr_fastsleep_workaround_applyonce); + put_device(dev_root); + } + } + + update_subcore_sibling_mask(); - pnv_alloc_idle_core_states(); + if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) { + ppc_md.power_save = power7_idle; + power7_offline_type = PNV_THREAD_NAP; + } + + if ((supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) && + (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)) + power7_offline_type = PNV_THREAD_WINKLE; + else if ((supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED) || + (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) + power7_offline_type = PNV_THREAD_SLEEP; + } - if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) - ppc_md.power_save = power7_idle; + if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) { + if (pnv_save_sprs_for_deep_states()) + pnv_disable_deep_states(); + } out: return 0; diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 84d038ed3882..2ea30b343354 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -1,11 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) IBM Corporation, 2014, 2017 * Anton Blanchard, Rashmica Gupta. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. */ #define pr_fmt(fmt) "memtrace: " fmt @@ -20,8 +16,9 @@ #include <linux/slab.h> #include <linux/memory.h> #include <linux/memory_hotplug.h> +#include <linux/numa.h> #include <asm/machdep.h> -#include <asm/debugfs.h> +#include <asm/cacheflush.h> /* This enables us to keep track of the memory removed from each node. 
*/ struct memtrace_entry { @@ -33,6 +30,7 @@ struct memtrace_entry { char name[16]; }; +static DEFINE_MUTEX(memtrace_mutex); static u64 memtrace_size; static struct memtrace_entry *memtrace_array; @@ -47,90 +45,87 @@ static ssize_t memtrace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size); } -static const struct file_operations memtrace_fops = { - .llseek = default_llseek, - .read = memtrace_read, - .open = simple_open, -}; - -static int check_memblock_online(struct memory_block *mem, void *arg) +static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma) { - if (mem->state != MEM_ONLINE) - return -1; - - return 0; -} + struct memtrace_entry *ent = filp->private_data; + unsigned long ent_nrpages = ent->size >> PAGE_SHIFT; + unsigned long vma_nrpages = vma_pages(vma); -static int change_memblock_state(struct memory_block *mem, void *arg) -{ - unsigned long state = (unsigned long)arg; + /* The requested page offset should be within object's page count */ + if (vma->vm_pgoff >= ent_nrpages) + return -EINVAL; - mem->state = state; + /* The requested mapping range should remain within the bounds */ + if (vma_nrpages > ent_nrpages - vma->vm_pgoff) + return -EINVAL; - return 0; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + return remap_pfn_range(vma, vma->vm_start, PHYS_PFN(ent->start) + vma->vm_pgoff, + vma->vm_end - vma->vm_start, vma->vm_page_prot); } -/* called with device_hotplug_lock held */ -static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages) -{ - u64 end_pfn = start_pfn + nr_pages - 1; - - if (walk_memory_range(start_pfn, end_pfn, NULL, - check_memblock_online)) - return false; +static const struct file_operations memtrace_fops = { + .llseek = default_llseek, + .read = memtrace_read, + .open = simple_open, + .mmap = memtrace_mmap, +}; - walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE, - change_memblock_state); +#define FLUSH_CHUNK_SIZE SZ_1G +/** + * flush_dcache_range_chunked(): Write any modified data cache blocks out to + * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE + * Does not invalidate the corresponding instruction cache blocks. + * + * @start: the start address + * @stop: the stop address (exclusive) + * @chunk: the max size of the chunks + */ +static void flush_dcache_range_chunked(unsigned long start, unsigned long stop, + unsigned long chunk) +{ + unsigned long i; - if (offline_pages(start_pfn, nr_pages)) { - walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE, - change_memblock_state); - return false; + for (i = start; i < stop; i += chunk) { + flush_dcache_range(i, min(stop, i + chunk)); + cond_resched(); } - - walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE, - change_memblock_state); - - - return true; } static u64 memtrace_alloc_node(u32 nid, u64 size) { - u64 start_pfn, end_pfn, nr_pages, pfn; - u64 base_pfn; - u64 bytes = memory_block_size_bytes(); + const unsigned long nr_pages = PHYS_PFN(size); + unsigned long pfn, start_pfn; + struct page *page; - if (!node_spanned_pages(nid)) + /* + * Trace memory needs to be aligned to the size, which is guaranteed + * by alloc_contig_pages(). 
+ */ + page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE | + __GFP_NOWARN | __GFP_ZERO, nid, NULL); + if (!page) return 0; + start_pfn = page_to_pfn(page); - start_pfn = node_start_pfn(nid); - end_pfn = node_end_pfn(nid); - nr_pages = size >> PAGE_SHIFT; - - /* Trace memory needs to be aligned to the size */ - end_pfn = round_down(end_pfn - nr_pages, nr_pages); - - lock_device_hotplug(); - for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) { - if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) { - /* - * Remove memory in memory block size chunks so that - * iomem resources are always split to the same size and - * we never try to remove memory that spans two iomem - * resources. - */ - end_pfn = base_pfn + nr_pages; - for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) { - __remove_memory(nid, pfn << PAGE_SHIFT, bytes); - } - unlock_device_hotplug(); - return base_pfn << PAGE_SHIFT; - } - } - unlock_device_hotplug(); + /* + * Before we go ahead and use this range as cache inhibited range + * flush the cache. + */ + flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn), + (unsigned long)pfn_to_kaddr(start_pfn + nr_pages), + FLUSH_CHUNK_SIZE); - return 0; + /* + * Set pages PageOffline(), to indicate that nobody (e.g., hibernation, + * dumping, ...) should be touching these pages. + */ + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) + __SetPageOffline(pfn_to_page(pfn)); + + arch_remove_linear_mapping(PFN_PHYS(start_pfn), size); + + return PFN_PHYS(start_pfn); } static int memtrace_init_regions_runtime(u64 size) @@ -190,14 +185,9 @@ static int memtrace_init_debugfs(void) snprintf(ent->name, 16, "%08x", ent->nid); dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir); - if (!dir) { - pr_err("Failed to create debugfs directory for node %d\n", - ent->nid); - return -1; - } ent->dir = dir; - debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops); + debugfs_create_file_unsafe("trace", 0600, dir, ent, &memtrace_fops); debugfs_create_x64("start", 0400, dir, &ent->start); debugfs_create_x64("size", 0400, dir, &ent->size); } @@ -205,16 +195,30 @@ static int memtrace_init_debugfs(void) return ret; } -static int online_mem_block(struct memory_block *mem, void *arg) +static int memtrace_free(int nid, u64 start, u64 size) { - return device_online(&mem->dev); + struct mhp_params params = { .pgprot = PAGE_KERNEL }; + const unsigned long nr_pages = PHYS_PFN(size); + const unsigned long start_pfn = PHYS_PFN(start); + unsigned long pfn; + int ret; + + ret = arch_create_linear_mapping(nid, start, size, ¶ms); + if (ret) + return ret; + + for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) + __ClearPageOffline(pfn_to_page(pfn)); + + free_contig_range(start_pfn, nr_pages); + return 0; } /* - * Iterate through the chunks of memory we have removed from the kernel - * and attempt to add them back to the kernel. + * Iterate through the chunks of memory we allocated and attempt to expose + * them back to the kernel. 
*/ -static int memtrace_online(void) +static int memtrace_free_regions(void) { int i, ret = 0; struct memtrace_entry *ent; @@ -222,8 +226,8 @@ static int memtrace_online(void) for (i = memtrace_array_nr - 1; i >= 0; i--) { ent = &memtrace_array[i]; - /* We have onlined this chunk previously */ - if (ent->nid == -1) + /* We have freed this chunk previously */ + if (ent->nid == NUMA_NO_NODE) continue; /* Remove from io mappings */ @@ -232,37 +236,25 @@ static int memtrace_online(void) ent->mem = 0; } - if (add_memory(ent->nid, ent->start, ent->size)) { - pr_err("Failed to add trace memory to node %d\n", + if (memtrace_free(ent->nid, ent->start, ent->size)) { + pr_err("Failed to free trace memory on node %d\n", ent->nid); ret += 1; continue; } /* - * If kernel isn't compiled with the auto online option - * we need to online the memory ourselves. - */ - if (!memhp_auto_online) { - lock_device_hotplug(); - walk_memory_range(PFN_DOWN(ent->start), - PFN_UP(ent->start + ent->size - 1), - NULL, online_mem_block); - unlock_device_hotplug(); - } - - /* - * Memory was added successfully so clean up references to it - * so on reentry we can tell that this chunk was added. + * Memory was freed successfully so clean up references to it + * so on reentry we can tell that this chunk was freed. */ debugfs_remove_recursive(ent->dir); - pr_info("Added trace memory back to node %d\n", ent->nid); - ent->size = ent->start = ent->nid = -1; + pr_info("Freed trace memory back on node %d\n", ent->nid); + ent->size = ent->start = ent->nid = NUMA_NO_NODE; } if (ret) return ret; - /* If all chunks of memory were added successfully, reset globals */ + /* If all chunks of memory were freed successfully, reset globals */ kfree(memtrace_array); memtrace_array = NULL; memtrace_size = 0; @@ -272,6 +264,7 @@ static int memtrace_online(void) static int memtrace_enable_set(void *data, u64 val) { + int rc = -EAGAIN; u64 bytes; /* @@ -284,25 +277,29 @@ static int memtrace_enable_set(void *data, u64 val) return -EINVAL; } - /* Re-add/online previously removed/offlined memory */ - if (memtrace_size) { - if (memtrace_online()) - return -EAGAIN; - } + mutex_lock(&memtrace_mutex); - if (!val) - return 0; + /* Free all previously allocated memory. */ + if (memtrace_size && memtrace_free_regions()) + goto out_unlock; + + if (!val) { + rc = 0; + goto out_unlock; + } - /* Offline and remove memory */ + /* Allocate memory. */ if (memtrace_init_regions_runtime(val)) - return -EINVAL; + goto out_unlock; if (memtrace_init_debugfs()) - return -EINVAL; + goto out_unlock; memtrace_size = val; - - return 0; + rc = 0; +out_unlock: + mutex_unlock(&memtrace_mutex); + return rc; } static int memtrace_enable_get(void *data, u64 *val) @@ -317,9 +314,7 @@ DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, static int memtrace_init(void) { memtrace_debugfs_dir = debugfs_create_dir("memtrace", - powerpc_debugfs_root); - if (!memtrace_debugfs_dir) - return -1; + arch_debugfs_dir); debugfs_create_file("enable", 0600, memtrace_debugfs_dir, NULL, &memtrace_init_fops); diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c deleted file mode 100644 index 3f58c7dbd581..000000000000 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ /dev/null @@ -1,1291 +0,0 @@ -/* - * This file implements the DMA operations for NVLink devices. The NPU - * devices all point to the same iommu table as the parent PCI device. - * - * Copyright Alistair Popple, IBM Corporation 2015. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ - -#include <linux/mmu_notifier.h> -#include <linux/mmu_context.h> -#include <linux/of.h> -#include <linux/pci.h> -#include <linux/memblock.h> -#include <linux/sizes.h> - -#include <asm/debugfs.h> -#include <asm/powernv.h> -#include <asm/opal.h> - -#include "pci.h" - -/* - * spinlock to protect initialisation of an npu_context for a particular - * mm_struct. - */ -static DEFINE_SPINLOCK(npu_context_lock); - -/* - * Other types of TCE cache invalidation are not functional in the - * hardware. - */ -static struct pci_dev *get_pci_dev(struct device_node *dn) -{ - struct pci_dn *pdn = PCI_DN(dn); - - return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus), - pdn->busno, pdn->devfn); -} - -/* Given a NPU device get the associated PCI device. */ -struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev) -{ - struct device_node *dn; - struct pci_dev *gpdev; - - if (WARN_ON(!npdev)) - return NULL; - - if (WARN_ON(!npdev->dev.of_node)) - return NULL; - - /* Get assoicated PCI device */ - dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0); - if (!dn) - return NULL; - - gpdev = get_pci_dev(dn); - of_node_put(dn); - - return gpdev; -} -EXPORT_SYMBOL(pnv_pci_get_gpu_dev); - -/* Given the real PCI device get a linked NPU device. */ -struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) -{ - struct device_node *dn; - struct pci_dev *npdev; - - if (WARN_ON(!gpdev)) - return NULL; - - /* Not all PCI devices have device-tree nodes */ - if (!gpdev->dev.of_node) - return NULL; - - /* Get assoicated PCI device */ - dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index); - if (!dn) - return NULL; - - npdev = get_pci_dev(dn); - of_node_put(dn); - - return npdev; -} -EXPORT_SYMBOL(pnv_pci_get_npu_dev); - -/* - * Returns the PE assoicated with the PCI device of the given - * NPU. Returns the linked pci device if pci_dev != NULL. - */ -static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe, - struct pci_dev **gpdev) -{ - struct pnv_phb *phb; - struct pci_controller *hose; - struct pci_dev *pdev; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - - pdev = pnv_pci_get_gpu_dev(npe->pdev); - if (!pdev) - return NULL; - - pdn = pci_get_pdn(pdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return NULL; - - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - pe = &phb->ioda.pe_array[pdn->pe_number]; - - if (gpdev) - *gpdev = pdev; - - return pe; -} - -static long pnv_npu_unset_window(struct iommu_table_group *table_group, - int num); - -static long pnv_npu_set_window(struct iommu_table_group *table_group, int num, - struct iommu_table *tbl) -{ - struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, - table_group); - struct pnv_phb *phb = npe->phb; - int64_t rc; - const unsigned long size = tbl->it_indirect_levels ? - tbl->it_level_size : tbl->it_size; - const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; - const __u64 win_size = tbl->it_size << tbl->it_page_shift; - int num2 = (num == 0) ? 
1 : 0; - - /* NPU has just one TVE so if there is another table, remove it first */ - if (npe->table_group.tables[num2]) - pnv_npu_unset_window(&npe->table_group, num2); - - pe_info(npe, "Setting up window %llx..%llx pg=%lx\n", - start_addr, start_addr + win_size - 1, - IOMMU_PAGE_SIZE(tbl)); - - rc = opal_pci_map_pe_dma_window(phb->opal_id, - npe->pe_number, - npe->pe_number, - tbl->it_indirect_levels + 1, - __pa(tbl->it_base), - size << 3, - IOMMU_PAGE_SIZE(tbl)); - if (rc) { - pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); - return rc; - } - pnv_pci_ioda2_tce_invalidate_entire(phb, false); - - /* Add the table to the list so its TCE cache will get invalidated */ - pnv_pci_link_table_and_group(phb->hose->node, num, - tbl, &npe->table_group); - - return 0; -} - -static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num) -{ - struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, - table_group); - struct pnv_phb *phb = npe->phb; - int64_t rc; - - if (!npe->table_group.tables[num]) - return 0; - - pe_info(npe, "Removing DMA window\n"); - - rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, - npe->pe_number, - 0/* levels */, 0/* table address */, - 0/* table size */, 0/* page size */); - if (rc) { - pe_err(npe, "Unmapping failed, ret = %lld\n", rc); - return rc; - } - pnv_pci_ioda2_tce_invalidate_entire(phb, false); - - pnv_pci_unlink_table_and_group(npe->table_group.tables[num], - &npe->table_group); - - return 0; -} - -/* - * Enables 32 bit DMA on NPU. - */ -static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) -{ - struct pci_dev *gpdev; - struct pnv_ioda_pe *gpe; - int64_t rc; - - /* - * Find the assoicated PCI devices and get the dma window - * information from there. - */ - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) - return; - - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); - if (!gpe) - return; - - rc = pnv_npu_set_window(&npe->table_group, 0, - gpe->table_group.tables[0]); - - /* - * NVLink devices use the same TCE table configuration as - * their parent device so drivers shouldn't be doing DMA - * operations directly on these devices. - */ - set_dma_ops(&npe->pdev->dev, NULL); -} - -/* - * Enables bypass mode on the NPU. The NPU only supports one - * window per link, so bypass needs to be explicitly enabled or - * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be - * active at the same time. 
- */ -static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) -{ - struct pnv_phb *phb = npe->phb; - int64_t rc = 0; - phys_addr_t top = memblock_end_of_DRAM(); - - if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev) - return -EINVAL; - - rc = pnv_npu_unset_window(&npe->table_group, 0); - if (rc != OPAL_SUCCESS) - return rc; - - /* Enable the bypass window */ - - top = roundup_pow_of_two(top); - dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n", - npe->pe_number); - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - 0 /* bypass base */, top); - - if (rc == OPAL_SUCCESS) - pnv_pci_ioda2_tce_invalidate_entire(phb, false); - - return rc; -} - -void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) -{ - int i; - struct pnv_phb *phb; - struct pci_dn *pdn; - struct pnv_ioda_pe *npe; - struct pci_dev *npdev; - - for (i = 0; ; ++i) { - npdev = pnv_pci_get_npu_dev(gpdev, i); - - if (!npdev) - break; - - pdn = pci_get_pdn(npdev); - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return; - - phb = pci_bus_to_host(npdev->bus)->private_data; - - /* We only do bypass if it's enabled on the linked device */ - npe = &phb->ioda.pe_array[pdn->pe_number]; - - if (bypass) { - dev_info(&npdev->dev, - "Using 64-bit DMA iommu bypass\n"); - pnv_npu_dma_set_bypass(npe); - } else { - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); - pnv_npu_dma_set_32(npe); - } - } -} - -#ifdef CONFIG_IOMMU_API -/* Switch ownership from platform code to external user (e.g. VFIO) */ -static void pnv_npu_take_ownership(struct iommu_table_group *table_group) -{ - struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, - table_group); - struct pnv_phb *phb = npe->phb; - int64_t rc; - struct pci_dev *gpdev = NULL; - - /* - * Note: NPU has just a single TVE in the hardware which means that - * while used by the kernel, it can have either 32bit window or - * DMA bypass but never both. So we deconfigure 32bit window only - * if it was enabled at the moment of ownership change. - */ - if (npe->table_group.tables[0]) { - pnv_npu_unset_window(&npe->table_group, 0); - return; - } - - /* Disable bypass */ - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, - npe->pe_number, npe->pe_number, - 0 /* bypass base */, 0); - if (rc) { - pe_err(npe, "Failed to disable bypass, err %lld\n", rc); - return; - } - pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); - - get_gpu_pci_dev_and_pe(npe, &gpdev); - if (gpdev) - pnv_npu2_unmap_lpar_dev(gpdev); -} - -static void pnv_npu_release_ownership(struct iommu_table_group *table_group) -{ - struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe, - table_group); - struct pci_dev *gpdev = NULL; - - get_gpu_pci_dev_and_pe(npe, &gpdev); - if (gpdev) - pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV); -} - -static struct iommu_table_group_ops pnv_pci_npu_ops = { - .set_window = pnv_npu_set_window, - .unset_window = pnv_npu_unset_window, - .take_ownership = pnv_npu_take_ownership, - .release_ownership = pnv_npu_release_ownership, -}; -#endif /* !CONFIG_IOMMU_API */ - -/* - * NPU2 ATS - */ -/* Maximum possible number of ATSD MMIO registers per NPU */ -#define NV_NMMU_ATSD_REGS 8 -#define NV_NPU_MAX_PE_NUM 16 - -/* - * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or - * up to 3 x (GPU + 2xNPUs) (POWER9). 
- */ -struct npu_comp { - struct iommu_table_group table_group; - int pe_num; - struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM]; -}; - -/* An NPU descriptor, valid for POWER9 only */ -struct npu { - int index; - __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; - unsigned int mmio_atsd_count; - - /* Bitmask for MMIO register usage */ - unsigned long mmio_atsd_usage; - - /* Do we need to explicitly flush the nest mmu? */ - bool nmmu_flush; - - struct npu_comp npucomp; -}; - -#ifdef CONFIG_IOMMU_API -static long pnv_npu_peers_create_table_userspace( - struct iommu_table_group *table_group, - int num, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table **ptbl) -{ - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - if (!npucomp->pe_num || !npucomp->pe[0] || - !npucomp->pe[0]->table_group.ops || - !npucomp->pe[0]->table_group.ops->create_table) - return -EFAULT; - - return npucomp->pe[0]->table_group.ops->create_table( - &npucomp->pe[0]->table_group, num, page_shift, - window_size, levels, ptbl); -} - -static long pnv_npu_peers_set_window(struct iommu_table_group *table_group, - int num, struct iommu_table *tbl) -{ - int i, j; - long ret = 0; - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - for (i = 0; i < npucomp->pe_num; ++i) { - struct pnv_ioda_pe *pe = npucomp->pe[i]; - - if (!pe->table_group.ops->set_window) - continue; - - ret = pe->table_group.ops->set_window(&pe->table_group, - num, tbl); - if (ret) - break; - } - - if (ret) { - for (j = 0; j < i; ++j) { - struct pnv_ioda_pe *pe = npucomp->pe[j]; - - if (!pe->table_group.ops->unset_window) - continue; - - ret = pe->table_group.ops->unset_window( - &pe->table_group, num); - if (ret) - break; - } - } else { - table_group->tables[num] = iommu_tce_table_get(tbl); - } - - return ret; -} - -static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group, - int num) -{ - int i, j; - long ret = 0; - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - for (i = 0; i < npucomp->pe_num; ++i) { - struct pnv_ioda_pe *pe = npucomp->pe[i]; - - WARN_ON(npucomp->table_group.tables[num] != - table_group->tables[num]); - if (!npucomp->table_group.tables[num]) - continue; - - if (!pe->table_group.ops->unset_window) - continue; - - ret = pe->table_group.ops->unset_window(&pe->table_group, num); - if (ret) - break; - } - - if (ret) { - for (j = 0; j < i; ++j) { - struct pnv_ioda_pe *pe = npucomp->pe[j]; - - if (!npucomp->table_group.tables[num]) - continue; - - if (!pe->table_group.ops->set_window) - continue; - - ret = pe->table_group.ops->set_window(&pe->table_group, - num, table_group->tables[num]); - if (ret) - break; - } - } else if (table_group->tables[num]) { - iommu_tce_table_put(table_group->tables[num]); - table_group->tables[num] = NULL; - } - - return ret; -} - -static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group) -{ - int i; - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - for (i = 0; i < npucomp->pe_num; ++i) { - struct pnv_ioda_pe *pe = npucomp->pe[i]; - - if (!pe->table_group.ops->take_ownership) - continue; - pe->table_group.ops->take_ownership(&pe->table_group); - } -} - -static void pnv_npu_peers_release_ownership( - struct iommu_table_group *table_group) -{ - int i; - struct npu_comp *npucomp = container_of(table_group, struct npu_comp, - table_group); - - for (i = 0; i < npucomp->pe_num; ++i) { - struct pnv_ioda_pe *pe = 
npucomp->pe[i]; - - if (!pe->table_group.ops->release_ownership) - continue; - pe->table_group.ops->release_ownership(&pe->table_group); - } -} - -static struct iommu_table_group_ops pnv_npu_peers_ops = { - .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_npu_peers_create_table_userspace, - .set_window = pnv_npu_peers_set_window, - .unset_window = pnv_npu_peers_unset_window, - .take_ownership = pnv_npu_peers_take_ownership, - .release_ownership = pnv_npu_peers_release_ownership, -}; - -static void pnv_comp_attach_table_group(struct npu_comp *npucomp, - struct pnv_ioda_pe *pe) -{ - if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM)) - return; - - npucomp->pe[npucomp->pe_num] = pe; - ++npucomp->pe_num; -} - -struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe) -{ - struct iommu_table_group *table_group; - struct npu_comp *npucomp; - struct pci_dev *gpdev = NULL; - struct pci_controller *hose; - struct pci_dev *npdev = NULL; - - list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) { - npdev = pnv_pci_get_npu_dev(gpdev, 0); - if (npdev) - break; - } - - if (!npdev) - /* It is not an NPU attached device, skip */ - return NULL; - - hose = pci_bus_to_host(npdev->bus); - - if (hose->npu) { - table_group = &hose->npu->npucomp.table_group; - - if (!table_group->group) { - table_group->ops = &pnv_npu_peers_ops; - iommu_register_group(table_group, - hose->global_number, - pe->pe_number); - } - } else { - /* Create a group for 1 GPU and attached NPUs for POWER8 */ - pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL); - table_group = &pe->npucomp->table_group; - table_group->ops = &pnv_npu_peers_ops; - iommu_register_group(table_group, hose->global_number, - pe->pe_number); - } - - /* Steal capabilities from a GPU PE */ - table_group->max_dynamic_windows_supported = - pe->table_group.max_dynamic_windows_supported; - table_group->tce32_start = pe->table_group.tce32_start; - table_group->tce32_size = pe->table_group.tce32_size; - table_group->max_levels = pe->table_group.max_levels; - if (!table_group->pgsizes) - table_group->pgsizes = pe->table_group.pgsizes; - - npucomp = container_of(table_group, struct npu_comp, table_group); - pnv_comp_attach_table_group(npucomp, pe); - - return table_group; -} - -struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe) -{ - struct iommu_table_group *table_group; - struct npu_comp *npucomp; - struct pci_dev *gpdev = NULL; - struct pci_dev *npdev; - struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev); - - WARN_ON(!(pe->flags & PNV_IODA_PE_DEV)); - if (!gpe) - return NULL; - - /* - * IODA2 bridges get this set up from pci_controller_ops::setup_bridge - * but NPU bridges do not have this hook defined so we do it here. - * We do not setup other table group parameters as they won't be used - * anyway - NVLink bridges are subordinate PEs. - */ - pe->table_group.ops = &pnv_pci_npu_ops; - - table_group = iommu_group_get_iommudata( - iommu_group_get(&gpdev->dev)); - - /* - * On P9 NPU PHB and PCI PHB support different page sizes, - * keep only matching. We expect here that NVLink bridge PE pgsizes is - * initialized by the caller. 
- */ - table_group->pgsizes &= pe->table_group.pgsizes; - npucomp = container_of(table_group, struct npu_comp, table_group); - pnv_comp_attach_table_group(npucomp, pe); - - list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) { - struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev); - - if (gpdevtmp != gpdev) - continue; - - iommu_add_device(table_group, &npdev->dev); - } - - return table_group; -} -#endif /* CONFIG_IOMMU_API */ - -/* Maximum number of nvlinks per npu */ -#define NV_MAX_LINKS 6 - -/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */ -static int max_npu2_index; - -struct npu_context { - struct mm_struct *mm; - struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; - struct mmu_notifier mn; - struct kref kref; - bool nmmu_flush; - - /* Callback to stop translation requests on a given GPU */ - void (*release_cb)(struct npu_context *context, void *priv); - - /* - * Private pointer passed to the above callback for usage by - * device drivers. - */ - void *priv; -}; - -struct mmio_atsd_reg { - struct npu *npu; - int reg; -}; - -/* - * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC - * if none are available. - */ -static int get_mmio_atsd_reg(struct npu *npu) -{ - int i; - - for (i = 0; i < npu->mmio_atsd_count; i++) { - if (!test_bit(i, &npu->mmio_atsd_usage)) - if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage)) - return i; - } - - return -ENOSPC; -} - -static void put_mmio_atsd_reg(struct npu *npu, int reg) -{ - clear_bit_unlock(reg, &npu->mmio_atsd_usage); -} - -/* MMIO ATSD register offsets */ -#define XTS_ATSD_LAUNCH 0 -#define XTS_ATSD_AVA 1 -#define XTS_ATSD_STAT 2 - -static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize) -{ - unsigned long launch = 0; - - if (psize == MMU_PAGE_COUNT) { - /* IS set to invalidate entire matching PID */ - launch |= PPC_BIT(12); - } else { - /* AP set to invalidate region of psize */ - launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17); - } - - /* PRS set to process-scoped */ - launch |= PPC_BIT(13); - - /* PID */ - launch |= pid << PPC_BITLSHIFT(38); - - /* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */ - - return launch; -} - -static void mmio_atsd_regs_write(struct mmio_atsd_reg - mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset, - unsigned long val) -{ - struct npu *npu; - int i, reg; - - for (i = 0; i <= max_npu2_index; i++) { - reg = mmio_atsd_reg[i].reg; - if (reg < 0) - continue; - - npu = mmio_atsd_reg[i].npu; - __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset); - } -} - -static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], - unsigned long pid) -{ - unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT); - - /* Invalidating the entire process doesn't use a va */ - mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); -} - -static void mmio_invalidate_range(struct mmio_atsd_reg - mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid, - unsigned long start, unsigned long psize) -{ - unsigned long launch = get_atsd_launch_val(pid, psize); - - /* Write all VAs first */ - mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start); - - /* Issue one barrier for all address writes */ - eieio(); - - /* Launch */ - mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); -} - -#define mn_to_npu_context(x) container_of(x, struct npu_context, mn) - -static void mmio_invalidate_wait( - struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) -{ - struct npu *npu; - int i, reg; - - /* Wait for all invalidations to 
complete */ - for (i = 0; i <= max_npu2_index; i++) { - if (mmio_atsd_reg[i].reg < 0) - continue; - - /* Wait for completion */ - npu = mmio_atsd_reg[i].npu; - reg = mmio_atsd_reg[i].reg; - while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) - cpu_relax(); - } -} - -/* - * Acquires all the address translation shootdown (ATSD) registers required to - * launch an ATSD on all links this npu_context is active on. - */ -static void acquire_atsd_reg(struct npu_context *npu_context, - struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) -{ - int i, j; - struct npu *npu; - struct pci_dev *npdev; - - for (i = 0; i <= max_npu2_index; i++) { - mmio_atsd_reg[i].reg = -1; - for (j = 0; j < NV_MAX_LINKS; j++) { - /* - * There are no ordering requirements with respect to - * the setup of struct npu_context, but to ensure - * consistent behaviour we need to ensure npdev[][] is - * only read once. - */ - npdev = READ_ONCE(npu_context->npdev[i][j]); - if (!npdev) - continue; - - npu = pci_bus_to_host(npdev->bus)->npu; - if (!npu) - continue; - - mmio_atsd_reg[i].npu = npu; - mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); - while (mmio_atsd_reg[i].reg < 0) { - mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu); - cpu_relax(); - } - break; - } - } -} - -/* - * Release previously acquired ATSD registers. To avoid deadlocks the registers - * must be released in the same order they were acquired above in - * acquire_atsd_reg. - */ -static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) -{ - int i; - - for (i = 0; i <= max_npu2_index; i++) { - /* - * We can't rely on npu_context->npdev[][] being the same here - * as when acquire_atsd_reg() was called, hence we use the - * values stored in mmio_atsd_reg during the acquire phase - * rather than re-reading npdev[][]. - */ - if (mmio_atsd_reg[i].reg < 0) - continue; - - put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg); - } -} - -/* - * Invalidate a virtual address range - */ -static void mmio_invalidate(struct npu_context *npu_context, - unsigned long start, unsigned long size) -{ - struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; - unsigned long pid = npu_context->mm->context.id; - unsigned long atsd_start = 0; - unsigned long end = start + size - 1; - int atsd_psize = MMU_PAGE_COUNT; - - /* - * Convert the input range into one of the supported sizes. If the range - * doesn't fit, use the next larger supported size. Invalidation latency - * is high, so over-invalidation is preferred to issuing multiple - * invalidates. - * - * A 4K page size isn't supported by NPU/GPU ATS, so that case is - * ignored. - */ - if (size == SZ_64K) { - atsd_start = start; - atsd_psize = MMU_PAGE_64K; - } else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) { - atsd_start = ALIGN_DOWN(start, SZ_2M); - atsd_psize = MMU_PAGE_2M; - } else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) { - atsd_start = ALIGN_DOWN(start, SZ_1G); - atsd_psize = MMU_PAGE_1G; - } - - if (npu_context->nmmu_flush) - /* - * Unfortunately the nest mmu does not support flushing specific - * addresses so we have to flush the whole mm once before - * shooting down the GPU translation. - */ - flush_all_mm(npu_context->mm); - - /* - * Loop over all the NPUs this process is active on and launch - * an invalidate. 
- */ - acquire_atsd_reg(npu_context, mmio_atsd_reg); - - if (atsd_psize == MMU_PAGE_COUNT) - mmio_invalidate_pid(mmio_atsd_reg, pid); - else - mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start, - atsd_psize); - - mmio_invalidate_wait(mmio_atsd_reg); - - /* - * The GPU requires two flush ATSDs to ensure all entries have been - * flushed. We use PID 0 as it will never be used for a process on the - * GPU. - */ - mmio_invalidate_pid(mmio_atsd_reg, 0); - mmio_invalidate_wait(mmio_atsd_reg); - mmio_invalidate_pid(mmio_atsd_reg, 0); - mmio_invalidate_wait(mmio_atsd_reg); - - release_atsd_reg(mmio_atsd_reg); -} - -static void pnv_npu2_mn_release(struct mmu_notifier *mn, - struct mm_struct *mm) -{ - struct npu_context *npu_context = mn_to_npu_context(mn); - - /* Call into device driver to stop requests to the NMMU */ - if (npu_context->release_cb) - npu_context->release_cb(npu_context, npu_context->priv); - - /* - * There should be no more translation requests for this PID, but we - * need to ensure any entries for it are removed from the TLB. - */ - mmio_invalidate(npu_context, 0, ~0UL); -} - -static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long address, - pte_t pte) -{ - struct npu_context *npu_context = mn_to_npu_context(mn); - mmio_invalidate(npu_context, address, PAGE_SIZE); -} - -static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - struct npu_context *npu_context = mn_to_npu_context(mn); - mmio_invalidate(npu_context, start, end - start); -} - -static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { - .release = pnv_npu2_mn_release, - .change_pte = pnv_npu2_mn_change_pte, - .invalidate_range = pnv_npu2_mn_invalidate_range, -}; - -/* - * Call into OPAL to setup the nmmu context for the current task in - * the NPU. This must be called to setup the context tables before the - * GPU issues ATRs. pdev should be a pointed to PCIe GPU device. - * - * A release callback should be registered to allow a device driver to - * be notified that it should not launch any new translation requests - * as the final TLB invalidate is about to occur. - * - * Returns an error if there no contexts are currently available or a - * npu_context which should be passed to pnv_npu2_handle_fault(). - * - * mmap_sem must be held in write mode and must not be called from interrupt - * context. - */ -struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, - unsigned long flags, - void (*cb)(struct npu_context *, void *), - void *priv) -{ - int rc; - u32 nvlink_index; - struct device_node *nvlink_dn; - struct mm_struct *mm = current->mm; - struct npu *npu; - struct npu_context *npu_context; - struct pci_controller *hose; - - /* - * At present we don't support GPUs connected to multiple NPUs and I'm - * not sure the hardware does either. - */ - struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); - - if (!npdev) - /* No nvlink associated with this GPU device */ - return ERR_PTR(-ENODEV); - - /* We only support DR/PR/HV in pnv_npu2_map_lpar_dev() */ - if (flags & ~(MSR_DR | MSR_PR | MSR_HV)) - return ERR_PTR(-EINVAL); - - nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); - if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", - &nvlink_index))) - return ERR_PTR(-ENODEV); - - if (!mm || mm->context.id == 0) { - /* - * Kernel thread contexts are not supported and context id 0 is - * reserved on the GPU. 
- */ - return ERR_PTR(-EINVAL); - } - - hose = pci_bus_to_host(npdev->bus); - npu = hose->npu; - if (!npu) - return ERR_PTR(-ENODEV); - - /* - * We store the npu pci device so we can more easily get at the - * associated npus. - */ - spin_lock(&npu_context_lock); - npu_context = mm->context.npu_context; - if (npu_context) { - if (npu_context->release_cb != cb || - npu_context->priv != priv) { - spin_unlock(&npu_context_lock); - return ERR_PTR(-EINVAL); - } - - WARN_ON(!kref_get_unless_zero(&npu_context->kref)); - } - spin_unlock(&npu_context_lock); - - if (!npu_context) { - /* - * We can set up these fields without holding the - * npu_context_lock as the npu_context hasn't been returned to - * the caller meaning it can't be destroyed. Parallel allocation - * is protected against by mmap_sem. - */ - rc = -ENOMEM; - npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL); - if (npu_context) { - kref_init(&npu_context->kref); - npu_context->mm = mm; - npu_context->mn.ops = &nv_nmmu_notifier_ops; - rc = __mmu_notifier_register(&npu_context->mn, mm); - } - - if (rc) { - kfree(npu_context); - return ERR_PTR(rc); - } - - mm->context.npu_context = npu_context; - } - - npu_context->release_cb = cb; - npu_context->priv = priv; - - /* - * npdev is a pci_dev pointer setup by the PCI code. We assign it to - * npdev[][] to indicate to the mmu notifiers that an invalidation - * should also be sent over this nvlink. The notifiers don't use any - * other fields in npu_context, so we just need to ensure that when they - * deference npu_context->npdev[][] it is either a valid pointer or - * NULL. - */ - WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev); - - if (!npu->nmmu_flush) { - /* - * If we're not explicitly flushing ourselves we need to mark - * the thread for global flushes - */ - npu_context->nmmu_flush = false; - mm_context_add_copro(mm); - } else - npu_context->nmmu_flush = true; - - return npu_context; -} -EXPORT_SYMBOL(pnv_npu2_init_context); - -static void pnv_npu2_release_context(struct kref *kref) -{ - struct npu_context *npu_context = - container_of(kref, struct npu_context, kref); - - if (!npu_context->nmmu_flush) - mm_context_remove_copro(npu_context->mm); - - npu_context->mm->context.npu_context = NULL; -} - -/* - * Destroy a context on the given GPU. May free the npu_context if it is no - * longer active on any GPUs. Must not be called from interrupt context. - */ -void pnv_npu2_destroy_context(struct npu_context *npu_context, - struct pci_dev *gpdev) -{ - int removed; - struct npu *npu; - struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); - struct device_node *nvlink_dn; - u32 nvlink_index; - struct pci_controller *hose; - - if (WARN_ON(!npdev)) - return; - - hose = pci_bus_to_host(npdev->bus); - npu = hose->npu; - if (!npu) - return; - nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); - if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", - &nvlink_index))) - return; - WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL); - spin_lock(&npu_context_lock); - removed = kref_put(&npu_context->kref, pnv_npu2_release_context); - spin_unlock(&npu_context_lock); - - /* - * We need to do this outside of pnv_npu2_release_context so that it is - * outside the spinlock as mmu_notifier_destroy uses SRCU. - */ - if (removed) { - mmu_notifier_unregister(&npu_context->mn, - npu_context->mm); - - kfree(npu_context); - } - -} -EXPORT_SYMBOL(pnv_npu2_destroy_context); - -/* - * Assumes mmap_sem is held for the contexts associated mm. 
- */ -int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, - unsigned long *flags, unsigned long *status, int count) -{ - u64 rc = 0, result = 0; - int i, is_write; - struct page *page[1]; - const char __user *u; - char c; - - /* mmap_sem should be held so the struct_mm must be present */ - struct mm_struct *mm = context->mm; - - WARN_ON(!rwsem_is_locked(&mm->mmap_sem)); - - for (i = 0; i < count; i++) { - is_write = flags[i] & NPU2_WRITE; - rc = get_user_pages_remote(NULL, mm, ea[i], 1, - is_write ? FOLL_WRITE : 0, - page, NULL, NULL); - - if (rc != 1) { - status[i] = rc; - result = -EFAULT; - continue; - } - - /* Make sure partition scoped tree gets a pte */ - u = page_address(page[0]); - if (__get_user(c, u)) - result = -EFAULT; - - status[i] = 0; - put_page(page[0]); - } - - return result; -} -EXPORT_SYMBOL(pnv_npu2_handle_fault); - -int pnv_npu2_init(struct pci_controller *hose) -{ - unsigned int i; - u64 mmio_atsd; - static int npu_index; - struct npu *npu; - int ret; - - npu = kzalloc(sizeof(*npu), GFP_KERNEL); - if (!npu) - return -ENOMEM; - - npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush"); - - for (i = 0; i < ARRAY_SIZE(npu->mmio_atsd_regs) && - !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", - i, &mmio_atsd); i++) - npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); - - pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i); - npu->mmio_atsd_count = i; - npu->mmio_atsd_usage = 0; - npu_index++; - if (WARN_ON(npu_index >= NV_MAX_NPUS)) { - ret = -ENOSPC; - goto fail_exit; - } - max_npu2_index = npu_index; - npu->index = npu_index; - hose->npu = npu; - - return 0; - -fail_exit: - for (i = 0; i < npu->mmio_atsd_count; ++i) - iounmap(npu->mmio_atsd_regs[i]); - - kfree(npu); - - return ret; -} - -int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid, - unsigned long msr) -{ - int ret; - struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); - struct pci_controller *hose; - struct pnv_phb *nphb; - - if (!npdev) - return -ENODEV; - - hose = pci_bus_to_host(npdev->bus); - nphb = hose->private_data; - - dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n", - nphb->opal_id, lparid); - /* - * Currently we only support radix and non-zero LPCR only makes sense - * for hash tables so skiboot expects the LPCR parameter to be a zero. 
- */ - ret = opal_npu_map_lpar(nphb->opal_id, - PCI_DEVID(gpdev->bus->number, gpdev->devfn), lparid, - 0 /* LPCR bits */); - if (ret) { - dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret); - return ret; - } - - dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n", - nphb->opal_id, msr); - ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr, - PCI_DEVID(gpdev->bus->number, gpdev->devfn)); - if (ret < 0) - dev_err(&gpdev->dev, "Failed to init context: %d\n", ret); - else - ret = 0; - - return 0; -} -EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev); - -void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr) -{ - struct pci_dev *gpdev; - - list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list) - pnv_npu2_map_lpar_dev(gpdev, 0, msr); -} - -int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev) -{ - int ret; - struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); - struct pci_controller *hose; - struct pnv_phb *nphb; - - if (!npdev) - return -ENODEV; - - hose = pci_bus_to_host(npdev->bus); - nphb = hose->private_data; - - dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n", - nphb->opal_id); - ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/, - PCI_DEVID(gpdev->bus->number, gpdev->devfn)); - if (ret < 0) { - dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret); - return ret; - } - - /* Set LPID to 0 anyway, just to be safe */ - dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id); - ret = opal_npu_map_lpar(nphb->opal_id, - PCI_DEVID(gpdev->bus->number, gpdev->devfn), 0 /*LPID*/, - 0 /* LPCR bits */); - if (ret) - dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret); - - return ret; -} -EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev); diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c index 8c65aacda9c8..f8139948348e 100644 --- a/arch/powerpc/platforms/powernv/ocxl.c +++ b/arch/powerpc/platforms/powernv/ocxl.c @@ -2,7 +2,6 @@ // Copyright 2017 IBM Corp. 
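The ocxl.c hunks that follow replace several mutex_lock()/mutex_unlock() pairs around links_list_lock with guard(mutex)(), the scoped-lock helper from <linux/cleanup.h>, and drop the explicit unlocks on the early-return paths. As a rough illustration of why those unlocks disappear, here is a minimal, self-contained sketch; the lock and the lookup functions are hypothetical stand-ins for links_list_lock/find_link()-style code and are not part of this patch.

#include <linux/mutex.h>
#include <linux/cleanup.h>	/* guard() scoped cleanup helpers */
#include <linux/types.h>
#include <linux/errno.h>

static DEFINE_MUTEX(example_lock);	/* hypothetical, stands in for links_list_lock */

/* Open-coded locking: every early exit must pair with an unlock. */
static int lookup_manual(bool found)
{
	int ret = 0;

	mutex_lock(&example_lock);
	if (!found)
		ret = -ENODEV;
	mutex_unlock(&example_lock);
	return ret;
}

/* guard(mutex)() releases the lock automatically on every return path. */
static int lookup_scoped(bool found)
{
	guard(mutex)(&example_lock);

	if (!found)
		return -ENODEV;	/* lock dropped here */
	return 0;		/* and here */
}

The guard variable's cleanup handler runs when it goes out of scope, which is why the -ENODEV returns in pnv_ocxl_get_actag() and pnv_ocxl_get_pasid_count() below no longer carry a mutex_unlock() of their own.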
#include <asm/pnv-ocxl.h> #include <asm/opal.h> -#include <asm/xive.h> #include <misc/ocxl-config.h> #include "pci.h" @@ -108,7 +107,8 @@ static int get_max_afu_index(struct pci_dev *dev, int *afu_idx) int pos; u32 val; - pos = find_dvsec_from_pos(dev, OCXL_DVSEC_FUNC_ID, 0); + pos = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_IBM, + OCXL_DVSEC_FUNC_ID); if (!pos) return -ESRCH; @@ -172,12 +172,11 @@ static void pnv_ocxl_fixup_actag(struct pci_dev *dev) if (phb->type != PNV_PHB_NPU_OCAPI) return; - mutex_lock(&links_list_lock); + guard(mutex)(&links_list_lock); link = find_link(dev); if (!link) { dev_warn(&dev->dev, "couldn't update actag information\n"); - mutex_unlock(&links_list_lock); return; } @@ -206,7 +205,6 @@ static void pnv_ocxl_fixup_actag(struct pci_dev *dev) dev_dbg(&dev->dev, "total actags for function: %d\n", link->fn_desired_actags[PCI_FUNC(dev->devfn)]); - mutex_unlock(&links_list_lock); } DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_ocxl_fixup_actag); @@ -253,12 +251,11 @@ int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled, { struct npu_link *link; - mutex_lock(&links_list_lock); + guard(mutex)(&links_list_lock); link = find_link(dev); if (!link) { dev_err(&dev->dev, "actag information not found\n"); - mutex_unlock(&links_list_lock); return -ENODEV; } /* @@ -274,7 +271,6 @@ int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled, *enabled = link->fn_actags[PCI_FUNC(dev->devfn)].count; *supported = link->fn_desired_actags[PCI_FUNC(dev->devfn)]; - mutex_unlock(&links_list_lock); return 0; } EXPORT_SYMBOL_GPL(pnv_ocxl_get_actag); @@ -289,16 +285,15 @@ int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count) * be used by a function depends on how many functions exist * on the device. The NPU needs to be configured to know how * many bits are available to PASIDs and how many are to be - * used by the function BDF indentifier. + * used by the function BDF identifier. * * We only support one AFU-carrying function for now. */ - mutex_lock(&links_list_lock); + guard(mutex)(&links_list_lock); link = find_link(dev); if (!link) { dev_err(&dev->dev, "actag information not found\n"); - mutex_unlock(&links_list_lock); return -ENODEV; } @@ -309,7 +304,6 @@ int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count) break; } - mutex_unlock(&links_list_lock); dev_dbg(&dev->dev, "%d PASIDs available for function\n", rc ? 
0 : *count); return rc; @@ -449,7 +443,7 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, if (!data) return -ENOMEM; - bdfn = (dev->bus->number << 8) | dev->devfn; + bdfn = pci_dev_id(dev); rc = opal_npu_spa_setup(phb->opal_id, bdfn, virt_to_phys(spa_mem), PE_mask); if (rc) { @@ -478,38 +472,121 @@ EXPORT_SYMBOL_GPL(pnv_ocxl_spa_release); int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle) { struct spa_data *data = (struct spa_data *) platform_data; - int rc; - rc = opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle); - return rc; + return opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle); } EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache); -int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr) +int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid, + uint64_t lpcr, void __iomem **arva) { - __be64 flags, trigger_page; - s64 rc; - u32 hwirq; - - hwirq = xive_native_alloc_irq(); - if (!hwirq) - return -ENOENT; - - rc = opal_xive_get_irq_info(hwirq, &flags, NULL, &trigger_page, NULL, - NULL); - if (rc || !trigger_page) { - xive_native_free_irq(hwirq); - return -ENOENT; + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + u64 mmio_atsd; + int rc; + + /* ATSD physical address. + * ATSD LAUNCH register: write access initiates a shoot down to + * initiate the TLB Invalidate command. + */ + rc = of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", + 0, &mmio_atsd); + if (rc) { + dev_info(&dev->dev, "No available ATSD found\n"); + return rc; + } + + /* Assign a register set to a Logical Partition and MMIO ATSD + * LPARID register to the required value. + */ + rc = opal_npu_map_lpar(phb->opal_id, pci_dev_id(dev), + lparid, lpcr); + if (rc) { + dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc); + return rc; + } + + *arva = ioremap(mmio_atsd, 24); + if (!(*arva)) { + dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", mmio_atsd); + rc = -ENOMEM; } - *irq = hwirq; - *trigger_addr = be64_to_cpu(trigger_page); - return 0; + return rc; +} +EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar); + +void pnv_ocxl_unmap_lpar(void __iomem *arva) +{ + iounmap(arva); } -EXPORT_SYMBOL_GPL(pnv_ocxl_alloc_xive_irq); +EXPORT_SYMBOL_GPL(pnv_ocxl_unmap_lpar); -void pnv_ocxl_free_xive_irq(u32 irq) +void pnv_ocxl_tlb_invalidate(void __iomem *arva, + unsigned long pid, + unsigned long addr, + unsigned long page_size) { - xive_native_free_irq(irq); + unsigned long timeout = jiffies + (HZ * PNV_OCXL_ATSD_TIMEOUT); + u64 val = 0ull; + int pend; + u8 size; + + if (!(arva)) + return; + + if (addr) { + /* load Abbreviated Virtual Address register with + * the necessary value + */ + val |= FIELD_PREP(PNV_OCXL_ATSD_AVA_AVA, addr >> (63-51)); + out_be64(arva + PNV_OCXL_ATSD_AVA, val); + } + + /* Write access initiates a shoot down to initiate the + * TLB Invalidate command + */ + val = PNV_OCXL_ATSD_LNCH_R; + val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_RIC, 0b10); + if (addr) + val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_IS, 0b00); + else { + val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_IS, 0b01); + val |= PNV_OCXL_ATSD_LNCH_OCAPI_SINGLETON; + } + val |= PNV_OCXL_ATSD_LNCH_PRS; + /* Actual Page Size to be invalidated + * 000 4KB + * 101 64KB + * 001 2MB + * 010 1GB + */ + size = 0b101; + if (page_size == 0x1000) + size = 0b000; + if (page_size == 0x200000) + size = 0b001; + if (page_size == 0x40000000) + size = 0b010; + val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_AP, size); + val |= 
FIELD_PREP(PNV_OCXL_ATSD_LNCH_PID, pid); + out_be64(arva + PNV_OCXL_ATSD_LNCH, val); + + /* Poll the ATSD status register to determine when the + * TLB Invalidate has been completed. + */ + val = in_be64(arva + PNV_OCXL_ATSD_STAT); + pend = val >> 63; + + while (pend) { + if (time_after_eq(jiffies, timeout)) { + pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n", + __func__, val, pid); + return; + } + cpu_relax(); + val = in_be64(arva + PNV_OCXL_ATSD_STAT); + pend = val >> 63; + } } -EXPORT_SYMBOL_GPL(pnv_ocxl_free_xive_irq); +EXPORT_SYMBOL_GPL(pnv_ocxl_tlb_invalidate); diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c index 18a355fa15e8..c094fdf5825c 100644 --- a/arch/powerpc/platforms/powernv/opal-async.c +++ b/arch/powerpc/platforms/powernv/opal-async.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV OPAL asynchronous completion interfaces * * Copyright 2013-2017 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #undef DEBUG @@ -108,7 +104,7 @@ static int __opal_async_release_token(int token) */ case ASYNC_TOKEN_DISPATCHED: opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED; - /* Fall through */ + fallthrough; default: rc = 1; } diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c new file mode 100644 index 000000000000..021b0ec29e24 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/percpu.h> +#include <linux/jump_label.h> +#include <asm/interrupt.h> +#include <asm/opal-api.h> +#include <asm/trace.h> +#include <asm/asm-prototypes.h> + +#ifdef CONFIG_TRACEPOINTS +/* + * Since the tracing code might execute OPAL calls we need to guard against + * recursion. 
+ */ +static DEFINE_PER_CPU(unsigned int, opal_trace_depth); + +static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode) +{ + unsigned int *depth; + unsigned long args[8]; + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + return; + + args[0] = a0; + args[1] = a1; + args[2] = a2; + args[3] = a3; + args[4] = a4; + args[5] = a5; + args[6] = a6; + args[7] = a7; + + (*depth)++; + trace_opal_entry(opcode, &args[0]); + (*depth)--; +} + +static void __trace_opal_exit(unsigned long opcode, unsigned long retval) +{ + unsigned int *depth; + + depth = this_cpu_ptr(&opal_trace_depth); + + if (*depth) + return; + + (*depth)++; + trace_opal_exit(opcode, retval); + (*depth)--; +} + +static DEFINE_STATIC_KEY_FALSE(opal_tracepoint_key); + +int opal_tracepoint_regfunc(void) +{ + static_branch_inc(&opal_tracepoint_key); + return 0; +} + +void opal_tracepoint_unregfunc(void) +{ + static_branch_dec(&opal_tracepoint_key); +} + +static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode, unsigned long msr) +{ + s64 ret; + + __trace_opal_entry(a0, a1, a2, a3, a4, a5, a6, a7, opcode); + ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + __trace_opal_exit(opcode, ret); + + return ret; +} + +#define DO_TRACE (static_branch_unlikely(&opal_tracepoint_key)) + +#else /* CONFIG_TRACEPOINTS */ + +static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, + s64 a4, s64 a5, s64 a6, s64 a7, + unsigned long opcode, unsigned long msr) +{ + return 0; +} + +#define DO_TRACE false +#endif /* CONFIG_TRACEPOINTS */ + +static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3, + int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode) +{ + unsigned long flags; + unsigned long msr = mfmsr(); + bool mmu = (msr & (MSR_IR|MSR_DR)); + int64_t ret; + + /* OPAL call / firmware may use SRR and/or HSRR */ + srr_regs_clobbered(); + + msr &= ~MSR_EE; + + if (unlikely(!mmu)) + return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + + local_save_flags(flags); + hard_irq_disable(); + + if (DO_TRACE) { + ret = __opal_call_trace(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + } else { + ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr); + } + + local_irq_restore(flags); + + return ret; +} + +#define OPAL_CALL(name, opcode) \ +int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \ + int64_t a4, int64_t a5, int64_t a6, int64_t a7); \ +int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \ + int64_t a4, int64_t a5, int64_t a6, int64_t a7) \ +{ \ + return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \ +} + +OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); +OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); +OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); +OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); +OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); +OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); +OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); +OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); +OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2); +OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); +OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); +OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); +OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); +OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); +OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); +OPAL_CALL(opal_pci_config_read_byte, 
OPAL_PCI_CONFIG_READ_BYTE); +OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); +OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); +OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); +OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); +OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); +OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); +OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); +OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); +OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); +OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); +OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET); +OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT); +OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); +OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); +OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); +OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); +OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); +OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); +OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); +OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); +OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); +OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); +OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); +OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); +OPAL_CALL(opal_start_cpu, OPAL_START_CPU); +OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); +OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); +OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); +OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); +OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); +OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); +OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); +OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); +OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); +OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); +OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); +OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); +OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS); +OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); +OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); +OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); +OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); +OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); +OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); +OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); +OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); +OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); +OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); +OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); +OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); +OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); +OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); +OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); +OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); +OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); +OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); +OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); +OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); +OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); +OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); +OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); +OPAL_CALL(opal_dump_info2, 
OPAL_DUMP_INFO2); +OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); +OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); +OPAL_CALL(opal_get_msg, OPAL_GET_MSG); +OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC); +OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); +OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); +OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); +OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); +OPAL_CALL(opal_get_param, OPAL_GET_PARAM); +OPAL_CALL(opal_set_param, OPAL_SET_PARAM); +OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); +OPAL_CALL(opal_handle_hmi2, OPAL_HANDLE_HMI2); +OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); +OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); +OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); +OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); +OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE); +OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); +OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); +OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); +OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); +OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); +OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); +OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); +OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); +OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); +OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); +OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); +OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); +OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE); +OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); +OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); +OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); +OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); +OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); +OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); +OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); +OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); +OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); +OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); +OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); +OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); +OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); +OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); +OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); +OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); +OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); +OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); +OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); +OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); +OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); +OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); +OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_xive_get_queue_state, OPAL_XIVE_GET_QUEUE_STATE); +OPAL_CALL(opal_xive_set_queue_state, OPAL_XIVE_SET_QUEUE_STATE); +OPAL_CALL(opal_xive_get_vp_state, OPAL_XIVE_GET_VP_STATE); +OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); +OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); +OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); +OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); +OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); +OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); +OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); +OPAL_CALL(opal_get_power_shift_ratio, 
OPAL_GET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); +OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); +OPAL_CALL(opal_quiesce, OPAL_QUIESCE); +OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP); +OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE); +OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET); +OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR); +OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); +OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); +OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); +OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); +OPAL_CALL(opal_mpipl_update, OPAL_MPIPL_UPDATE); +OPAL_CALL(opal_mpipl_register_tag, OPAL_MPIPL_REGISTER_TAG); +OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG); +OPAL_CALL(opal_secvar_get, OPAL_SECVAR_GET); +OPAL_CALL(opal_secvar_get_next, OPAL_SECVAR_GET_NEXT); +OPAL_CALL(opal_secvar_enqueue_update, OPAL_SECVAR_ENQUEUE_UPDATE); diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c new file mode 100644 index 000000000000..784602a48afb --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -0,0 +1,663 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Interface for exporting the OPAL ELF core. + * Heavily inspired from fs/proc/vmcore.c + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "opal core: " fmt + +#include <linux/memblock.h> +#include <linux/uaccess.h> +#include <linux/proc_fs.h> +#include <linux/elf.h> +#include <linux/elfcore.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/vmcore_info.h> +#include <linux/of.h> + +#include <asm/page.h> +#include <asm/opal.h> +#include <asm/fadump-internal.h> + +#include "opal-fadump.h" + +#define MAX_PT_LOAD_CNT 8 + +/* NT_AUXV note related info */ +#define AUXV_CNT 1 +#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off)) + +struct opalcore_config { + u32 num_cpus; + /* PIR value of crashing CPU */ + u32 crashing_cpu; + + /* CPU state data info from F/W */ + u64 cpu_state_destination_vaddr; + u64 cpu_state_data_size; + u64 cpu_state_entry_size; + + /* OPAL memory to be exported as PT_LOAD segments */ + u64 ptload_addr[MAX_PT_LOAD_CNT]; + u64 ptload_size[MAX_PT_LOAD_CNT]; + u64 ptload_cnt; + + /* Pointer to the first PT_LOAD in the ELF core file */ + Elf64_Phdr *ptload_phdr; + + /* Total size of opalcore file. */ + size_t opalcore_size; + + /* Buffer for all the ELF core headers and the PT_NOTE */ + size_t opalcorebuf_sz; + char *opalcorebuf; + + /* NT_AUXV buffer */ + char auxv_buf[AUXV_DESC_SZ]; +}; + +struct opalcore { + struct list_head list; + u64 paddr; + size_t size; + loff_t offset; +}; + +static LIST_HEAD(opalcore_list); +static struct opalcore_config *oc_conf; +static const struct opal_mpipl_fadump *opalc_metadata; +static const struct opal_mpipl_fadump *opalc_cpu_metadata; +static struct kobject *mpipl_kobj; + +/* + * Set crashing CPU's signal to SIGUSR1. if the kernel is triggered + * by kernel, SIGTERM otherwise. + */ +bool kernel_initiated; + +static struct opalcore * __init get_new_element(void) +{ + return kzalloc(sizeof(struct opalcore), GFP_KERNEL); +} + +static inline int is_opalcore_usable(void) +{ + return (oc_conf && oc_conf->opalcorebuf != NULL) ? 
1 : 0; +} + +static Elf64_Word *__init append_elf64_note(Elf64_Word *buf, char *name, + u32 type, void *data, + size_t data_len) +{ + Elf64_Nhdr *note = (Elf64_Nhdr *)buf; + Elf64_Word namesz = strlen(name) + 1; + + note->n_namesz = cpu_to_be32(namesz); + note->n_descsz = cpu_to_be32(data_len); + note->n_type = cpu_to_be32(type); + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word)); + memcpy(buf, name, namesz); + buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word)); + + return buf; +} + +static void __init fill_prstatus(struct elf_prstatus *prstatus, int pir, + struct pt_regs *regs) +{ + memset(prstatus, 0, sizeof(struct elf_prstatus)); + elf_core_copy_regs(&(prstatus->pr_reg), regs); + + /* + * Overload PID with PIR value. + * As a PIR value could also be '0', add an offset of '100' + * to every PIR to avoid misinterpretations in GDB. + */ + prstatus->common.pr_pid = cpu_to_be32(100 + pir); + prstatus->common.pr_ppid = cpu_to_be32(1); + + /* + * Indicate SIGUSR1 for crash initiated from kernel. + * SIGTERM otherwise. + */ + if (pir == oc_conf->crashing_cpu) { + short sig; + + sig = kernel_initiated ? SIGUSR1 : SIGTERM; + prstatus->common.pr_cursig = cpu_to_be16(sig); + } +} + +static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word *buf, + u64 opal_boot_entry) +{ + Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf; + int idx = 0; + + memset(bufp, 0, AUXV_DESC_SZ); + + /* Entry point of OPAL */ + bufp[idx++] = cpu_to_be64(AT_ENTRY); + bufp[idx++] = cpu_to_be64(opal_boot_entry); + + /* end of vector */ + bufp[idx++] = cpu_to_be64(AT_NULL); + + buf = append_elf64_note(buf, NN_AUXV, NT_AUXV, + oc_conf->auxv_buf, AUXV_DESC_SZ); + return buf; +} + +/* + * Read from the ELF header and then the crash dump. + * Returns number of bytes read on success, -errno on failure. + */ +static ssize_t read_opalcore(struct file *file, struct kobject *kobj, + const struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + struct opalcore *m; + ssize_t tsz, avail; + loff_t tpos = pos; + + if (pos >= oc_conf->opalcore_size) + return 0; + + /* Adjust count if it goes beyond opalcore size */ + avail = oc_conf->opalcore_size - pos; + if (count > avail) + count = avail; + + if (count == 0) + return 0; + + /* Read ELF core header and/or PT_NOTE segment */ + if (tpos < oc_conf->opalcorebuf_sz) { + tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count); + memcpy(to, oc_conf->opalcorebuf + tpos, tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + + list_for_each_entry(m, &opalcore_list, list) { + /* nothing more to read here */ + if (count == 0) + break; + + if (tpos < m->offset + m->size) { + void *addr; + + tsz = min_t(size_t, m->offset + m->size - tpos, count); + addr = (void *)(m->paddr + tpos - m->offset); + memcpy(to, __va(addr), tsz); + to += tsz; + tpos += tsz; + count -= tsz; + } + } + + return (tpos - pos); +} + +static struct bin_attribute opal_core_attr __ro_after_init = { + .attr = {.name = "core", .mode = 0400}, + .read = read_opalcore +}; + +/* + * Read CPU state dump data and convert it into ELF notes. + * + * Each register entry is of 16 bytes, A numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. 
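+ * Concretely, an entry has the layout of struct hdat_fadump_reg_entry + * (see opal-fadump.h in this directory).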
+ */ +static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf) +{ + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + struct elf_prstatus prstatus; + Elf64_Word *first_cpu_note; + struct pt_regs regs; + char *bufp; + int i; + + size_per_thread = oc_conf->cpu_state_entry_size; + bufp = __va(oc_conf->cpu_state_destination_vaddr); + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", oc_conf->num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + /* + * Skip past the first CPU note. Fill this note with the + * crashing CPU's prstatus. + */ + first_cpu_note = buf; + buf = append_elf64_note(buf, NN_PRSTATUS, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + + for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + thread_pir = be32_to_cpu(thdr->pir); + + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, false, ®s); + + pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir, + be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip)); + fill_prstatus(&prstatus, thread_pir, ®s); + + if (thread_pir != oc_conf->crashing_cpu) { + buf = append_elf64_note(buf, NN_PRSTATUS, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } else { + /* + * Add crashing CPU as the first NT_PRSTATUS note for + * GDB to process the core file appropriately. 
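+			 * Debuggers conventionally treat the thread in the first + * NT_PRSTATUS note as the current/faulting thread, which is why a + * placeholder note was reserved above and is overwritten with the + * crashing CPU's data here.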
+ */ + append_elf64_note(first_cpu_note, NN_PRSTATUS, + NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + } + } + + return buf; +} + +static int __init create_opalcore(void) +{ + u64 opal_boot_entry, opal_base_addr, paddr; + u32 hdr_size, cpu_notes_size, count; + struct device_node *dn; + struct opalcore *new; + loff_t opalcore_off; + struct page *page; + Elf64_Phdr *phdr; + Elf64_Ehdr *elf; + int i, ret; + char *bufp; + + /* Get size of header & CPU notes for OPAL core */ + hdr_size = (sizeof(Elf64_Ehdr) + + ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr))); + cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + + CRASH_CORE_NOTE_DESC_BYTES)) + + (CRASH_CORE_NOTE_HEAD_BYTES + + CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ)); + + /* Allocate buffer to setup OPAL core */ + oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size); + oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz, + GFP_KERNEL | __GFP_ZERO); + if (!oc_conf->opalcorebuf) { + pr_err("Not enough memory to setup OPAL core (size: %lu)\n", + oc_conf->opalcorebuf_sz); + oc_conf->opalcorebuf_sz = 0; + return -ENOMEM; + } + count = oc_conf->opalcorebuf_sz / PAGE_SIZE; + page = virt_to_page(oc_conf->opalcorebuf); + for (i = 0; i < count; i++) + mark_page_reserved(page + i); + + pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf); + + /* Read OPAL related device-tree entries */ + dn = of_find_node_by_name(NULL, "ibm,opal"); + if (dn) { + ret = of_property_read_u64(dn, "opal-base-address", + &opal_base_addr); + pr_debug("opal-base-address: %llx\n", opal_base_addr); + ret |= of_property_read_u64(dn, "opal-boot-address", + &opal_boot_entry); + pr_debug("opal-boot-address: %llx\n", opal_boot_entry); + } + if (!dn || ret) + pr_warn("WARNING: Failed to read OPAL base & entry values\n"); + + of_node_put(dn); + + /* Use count to keep track of the program headers */ + count = 0; + + bufp = oc_conf->opalcorebuf; + elf = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELFDATA2MSB; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = cpu_to_be16(ET_CORE); + elf->e_machine = cpu_to_be16(ELF_ARCH); + elf->e_version = cpu_to_be32(EV_CURRENT); + elf->e_entry = 0; + elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr)); + elf->e_shoff = 0; + elf->e_flags = 0; + + elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr)); + elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr)); + elf->e_phnum = 0; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_NOTE); + phdr->p_flags = 0; + phdr->p_align = 0; + phdr->p_paddr = phdr->p_vaddr = 0; + phdr->p_offset = cpu_to_be64(hdr_size); + phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size); + count++; + + opalcore_off = oc_conf->opalcorebuf_sz; + oc_conf->ptload_phdr = (Elf64_Phdr *)bufp; + paddr = 0; + for (i = 0; i < oc_conf->ptload_cnt; i++) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = cpu_to_be32(PT_LOAD); + phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X); + phdr->p_align = 0; + + new = get_new_element(); + if (!new) + return -ENOMEM; + new->paddr = oc_conf->ptload_addr[i]; + new->size = oc_conf->ptload_size[i]; + new->offset = opalcore_off; + list_add_tail(&new->list, &opalcore_list); + + phdr->p_paddr = 
cpu_to_be64(paddr); + phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr); + phdr->p_filesz = phdr->p_memsz = + cpu_to_be64(oc_conf->ptload_size[i]); + phdr->p_offset = cpu_to_be64(opalcore_off); + + count++; + opalcore_off += oc_conf->ptload_size[i]; + paddr += oc_conf->ptload_size[i]; + } + + elf->e_phnum = cpu_to_be16(count); + + bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp); + bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry); + + oc_conf->opalcore_size = opalcore_off; + return 0; +} + +static void opalcore_cleanup(void) +{ + if (oc_conf == NULL) + return; + + /* Remove OPAL core sysfs file */ + sysfs_remove_bin_file(mpipl_kobj, &opal_core_attr); + oc_conf->ptload_phdr = NULL; + oc_conf->ptload_cnt = 0; + + /* free the buffer used for setting up OPAL core */ + if (oc_conf->opalcorebuf) { + void *end = (void *)((u64)oc_conf->opalcorebuf + + oc_conf->opalcorebuf_sz); + + free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL); + oc_conf->opalcorebuf = NULL; + oc_conf->opalcorebuf_sz = 0; + } + + kfree(oc_conf); + oc_conf = NULL; +} +__exitcall(opalcore_cleanup); + +static void __init opalcore_config_init(void) +{ + u32 idx, cpu_data_version; + struct device_node *np; + const __be32 *prop; + u64 addr = 0; + int i, ret; + + np = of_find_node_by_path("/ibm,opal/dump"); + if (np == NULL) + return; + + if (!of_device_is_compatible(np, "ibm,opal-dump")) { + pr_warn("Support missing for this f/w version!\n"); + return; + } + + /* Check if dump has been initiated on last reboot */ + prop = of_get_property(np, "mpipl-boot", NULL); + if (!prop) { + of_node_put(np); + return; + } + + /* Get OPAL metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("OPAL metadata addr: %llx\n", addr); + opalc_metadata = __va(addr); + + /* Get OPAL CPU metadata */ + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get OPAL CPU metadata (%d)\n", ret); + goto error_out; + } + + addr = be64_to_cpu(addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opalc_cpu_metadata = __va(addr); + + /* Allocate memory for config buffer */ + oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL); + if (oc_conf == NULL) + goto error_out; + + /* Parse OPAL metadata */ + if (opalc_metadata->version != OPAL_MPIPL_VERSION) { + pr_warn("Supported OPAL metadata version: %u, found: %u!\n", + OPAL_MPIPL_VERSION, opalc_metadata->version); + pr_warn("WARNING: F/W using newer OPAL metadata format!!\n"); + } + + oc_conf->ptload_cnt = 0; + idx = be32_to_cpu(opalc_metadata->region_cnt); + if (idx > MAX_PT_LOAD_CNT) { + pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)", + idx, MAX_PT_LOAD_CNT); + idx = MAX_PT_LOAD_CNT; + } + for (i = 0; i < idx; i++) { + oc_conf->ptload_addr[oc_conf->ptload_cnt] = + be64_to_cpu(opalc_metadata->region[i].dest); + oc_conf->ptload_size[oc_conf->ptload_cnt++] = + be64_to_cpu(opalc_metadata->region[i].size); + } + oc_conf->ptload_cnt = i; + oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir); + + if (!oc_conf->ptload_cnt) { + pr_err("OPAL memory regions not found\n"); + goto error_out; + } + + /* Parse OPAL CPU metadata */ + cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version); + if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU data version: %u, found: %u!\n", + 
HDAT_FADUMP_CPU_DATA_VER, cpu_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest); + if (!addr) { + pr_err("CPU state data not found!\n"); + goto error_out; + } + oc_conf->cpu_state_destination_vaddr = (u64)__va(addr); + + oc_conf->cpu_state_data_size = + be64_to_cpu(opalc_cpu_metadata->region[0].size); + oc_conf->cpu_state_entry_size = + be32_to_cpu(opalc_cpu_metadata->cpu_data_size); + + if ((oc_conf->cpu_state_entry_size == 0) || + (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid.\n"); + goto error_out; + } + oc_conf->num_cpus = (oc_conf->cpu_state_data_size / + oc_conf->cpu_state_entry_size); + + of_node_put(np); + return; + +error_out: + pr_err("Could not export /sys/firmware/opal/core\n"); + opalcore_cleanup(); + of_node_put(np); +} + +static ssize_t release_core_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int input = -1; + + if (kstrtoint(buf, 0, &input)) + return -EINVAL; + + if (input == 1) { + if (oc_conf == NULL) { + pr_err("'/sys/firmware/opal/core' file not accessible!\n"); + return -EPERM; + } + + /* + * Take away '/sys/firmware/opal/core' and release all memory + * used for exporting this file. + */ + opalcore_cleanup(); + } else + return -EINVAL; + + return count; +} + +static struct kobj_attribute opalcore_rel_attr = __ATTR_WO(release_core); + +static struct attribute *mpipl_attr[] = { + &opalcore_rel_attr.attr, + NULL, +}; + +static const struct bin_attribute *const mpipl_bin_attr[] = { + &opal_core_attr, + NULL, + +}; + +static const struct attribute_group mpipl_group = { + .attrs = mpipl_attr, + .bin_attrs = mpipl_bin_attr, +}; + +static int __init opalcore_init(void) +{ + int rc = -1; + + opalcore_config_init(); + + if (oc_conf == NULL) + return rc; + + create_opalcore(); + + /* + * If oc_conf->opalcorebuf= is set in the 2nd kernel, + * then capture the dump. + */ + if (!(is_opalcore_usable())) { + pr_err("Failed to export /sys/firmware/opal/mpipl/core\n"); + opalcore_cleanup(); + return rc; + } + + /* Set OPAL core file size */ + opal_core_attr.size = oc_conf->opalcore_size; + + mpipl_kobj = kobject_create_and_add("mpipl", opal_kobj); + if (!mpipl_kobj) { + pr_err("unable to create mpipl kobject\n"); + return -ENOMEM; + } + + /* Export OPAL core sysfs file */ + rc = sysfs_create_group(mpipl_kobj, &mpipl_group); + if (rc) { + pr_err("mpipl sysfs group creation failed (%d)", rc); + opalcore_cleanup(); + return rc; + } + /* The /sys/firmware/opal/core is moved to /sys/firmware/opal/mpipl/ + * directory, need to create symlink at old location to maintain + * backward compatibility. + */ + rc = compat_only_sysfs_link_entry_to_kobj(opal_kobj, mpipl_kobj, + "core", NULL); + if (rc) { + pr_err("unable to create core symlink (%d)\n", rc); + return rc; + } + + return 0; +} +fs_initcall(opalcore_init); diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c index 198143833f00..cc3cc9ddf9d1 100644 --- a/arch/powerpc/platforms/powernv/opal-dump.c +++ b/arch/powerpc/platforms/powernv/opal-dump.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV OPAL Dump Interface * * Copyright 2013,2014 IBM Corp. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kobject.h> @@ -92,9 +88,14 @@ static ssize_t dump_ack_store(struct dump_obj *dump_obj, const char *buf, size_t count) { - dump_send_ack(dump_obj->id); - sysfs_remove_file_self(&dump_obj->kobj, &attr->attr); - kobject_put(&dump_obj->kobj); + /* + * Try to self remove this attribute. If we are successful, + * delete the kobject itself. + */ + if (sysfs_remove_file_self(&dump_obj->kobj, &attr->attr)) { + dump_send_ack(dump_obj->id); + kobject_put(&dump_obj->kobj); + } return count; } @@ -149,7 +150,7 @@ static struct attribute *initiate_attrs[] = { NULL, }; -static struct attribute_group initiate_attr_group = { +static const struct attribute_group initiate_attr_group = { .attrs = initiate_attrs, }; @@ -207,11 +208,12 @@ static struct attribute *dump_default_attrs[] = { &ack_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(dump_default); -static struct kobj_type dump_ktype = { +static const struct kobj_type dump_ktype = { .sysfs_ops = &dump_sysfs_ops, .release = &dump_release, - .default_attrs = dump_default_attrs, + .default_groups = dump_default_groups, }; static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type) @@ -284,7 +286,7 @@ out: } static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t count) { ssize_t rc; @@ -322,15 +324,14 @@ static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj, return count; } -static struct dump_obj *create_dump_obj(uint32_t id, size_t size, - uint32_t type) +static void create_dump_obj(uint32_t id, size_t size, uint32_t type) { struct dump_obj *dump; int rc; dump = kzalloc(sizeof(*dump), GFP_KERNEL); if (!dump) - return NULL; + return; dump->kobj.kset = dump_kset; @@ -350,21 +351,39 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size, rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id); if (rc) { kobject_put(&dump->kobj); - return NULL; + return; } + /* + * As soon as the sysfs file for this dump is created/activated there is + * a chance the opal_errd daemon (or any userspace) might read and + * acknowledge the dump before kobject_uevent() is called. If that + * happens then there is a potential race between + * dump_ack_store->kobject_put() and kobject_uevent() which leads to a + * use-after-free of a kernfs object resulting in a kernel crash. + * + * To avoid that, we need to take a reference on behalf of the bin file, + * so that our reference remains valid while we call kobject_uevent(). + * We then drop our reference before exiting the function, leaving the + * bin file to drop the last reference (if it hasn't already). + */ + + /* Take a reference for the bin file */ + kobject_get(&dump->kobj); rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr); - if (rc) { + if (rc == 0) { + kobject_uevent(&dump->kobj, KOBJ_ADD); + + pr_info("%s: New platform dump. ID = 0x%x Size %u\n", + __func__, dump->id, dump->size); + } else { + /* Drop reference count taken for bin file */ kobject_put(&dump->kobj); - return NULL; } - pr_info("%s: New platform dump. 
ID = 0x%x Size %u\n", - __func__, dump->id, dump->size); - - kobject_uevent(&dump->kobj, KOBJ_ADD); - - return dump; + /* Drop our reference */ + kobject_put(&dump->kobj); + return; } static irqreturn_t process_dump(int irq, void *data) @@ -401,7 +420,7 @@ void __init opal_platform_dump_init(void) int rc; int dump_irq; - /* ELOG not supported by firmware */ + /* Dump not supported by firmware */ if (!opal_check_token(OPAL_DUMP_READ)) return; diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c index ba6e437abb4b..c3fc5d258146 100644 --- a/arch/powerpc/platforms/powernv/opal-elog.c +++ b/arch/powerpc/platforms/powernv/opal-elog.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Error log support on PowerNV. * * Copyright 2013,2014 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> #include <linux/init.h> @@ -76,9 +72,14 @@ static ssize_t elog_ack_store(struct elog_obj *elog_obj, const char *buf, size_t count) { - opal_send_ack_elog(elog_obj->id); - sysfs_remove_file_self(&elog_obj->kobj, &attr->attr); - kobject_put(&elog_obj->kobj); + /* + * Try to self remove this attribute. If we are successful, + * delete the kobject itself. + */ + if (sysfs_remove_file_self(&elog_obj->kobj, &attr->attr)) { + opal_send_ack_elog(elog_obj->id); + kobject_put(&elog_obj->kobj); + } return count; } @@ -143,18 +144,19 @@ static struct attribute *elog_default_attrs[] = { &ack_attribute.attr, NULL, }; +ATTRIBUTE_GROUPS(elog_default); -static struct kobj_type elog_ktype = { +static const struct kobj_type elog_ktype = { .sysfs_ops = &elog_sysfs_ops, .release = &elog_release, - .default_attrs = elog_default_attrs, + .default_groups = elog_default_groups, }; /* Maximum size of a single log on FSP is 16KB */ #define OPAL_MAX_ERRLOG_SIZE 16384 static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t count) { int opal_rc; @@ -170,8 +172,8 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj, opal_rc = opal_read_elog(__pa(elog->buffer), elog->size, elog->id); if (opal_rc != OPAL_SUCCESS) { - pr_err("ELOG: log read failed for log-id=%llx\n", - elog->id); + pr_err_ratelimited("ELOG: log read failed for log-id=%llx\n", + elog->id); kfree(elog->buffer); elog->buffer = NULL; return -EIO; @@ -183,14 +185,14 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj, return count; } -static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type) +static void create_elog_obj(uint64_t id, size_t size, uint64_t type) { struct elog_obj *elog; int rc; elog = kzalloc(sizeof(*elog), GFP_KERNEL); if (!elog) - return NULL; + return; elog->kobj.kset = elog_kset; @@ -223,18 +225,37 @@ static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type) rc = kobject_add(&elog->kobj, NULL, "0x%llx", id); if (rc) { kobject_put(&elog->kobj); - return NULL; + return; } + /* + * As soon as the sysfs file for this elog is created/activated there is + * a chance the opal_errd daemon (or any userspace) might read and + * acknowledge the elog before kobject_uevent() is called. 
If that + * happens then there is a potential race between + * elog_ack_store->kobject_put() and kobject_uevent() which leads to a + * use-after-free of a kernfs object resulting in a kernel crash. + * + * To avoid that, we need to take a reference on behalf of the bin file, + * so that our reference remains valid while we call kobject_uevent(). + * We then drop our reference before exiting the function, leaving the + * bin file to drop the last reference (if it hasn't already). + */ + + /* Take a reference for the bin file */ + kobject_get(&elog->kobj); rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr); - if (rc) { + if (rc == 0) { + kobject_uevent(&elog->kobj, KOBJ_ADD); + } else { + /* Drop the reference taken for the bin file */ kobject_put(&elog->kobj); - return NULL; } - kobject_uevent(&elog->kobj, KOBJ_ADD); + /* Drop our reference */ + kobject_put(&elog->kobj); - return elog; + return; } static irqreturn_t elog_event(int irq, void *data) diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c new file mode 100644 index 000000000000..c9c1dfb35464 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.c @@ -0,0 +1,719 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#define pr_fmt(fmt) "opal fadump: " fmt + +#include <linux/string.h> +#include <linux/seq_file.h> +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> +#include <linux/mm.h> +#include <linux/crash_dump.h> + +#include <asm/page.h> +#include <asm/opal.h> +#include <asm/fadump-internal.h> + +#include "opal-fadump.h" + + +#ifdef CONFIG_PRESERVE_FA_DUMP +/* + * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel, + * ensure crash data is preserved in hope that the subsequent memory + * preserving kernel boot is going to process this crash data. + */ +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + const struct opal_fadump_mem_struct *opal_fdm_active; + const __be32 *prop; + unsigned long dn; + u64 addr = 0; + s64 ret; + + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) + return; + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_debug("Could not get Kernel metadata (%lld)\n", ret); + return; + } + + /* + * Preserve memory only if kernel memory regions are registered + * with f/w for MPIPL. + */ + addr = be64_to_cpu(addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + opal_fdm_active = (void *)addr; + if (be16_to_cpu(opal_fdm_active->registered_regions) == 0) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr); + if ((ret != OPAL_SUCCESS) || !addr) { + pr_err("Failed to get boot memory tag (%lld)\n", ret); + return; + } + + /* + * Memory below this address can be used for booting a + * capture kernel or petitboot kernel. Preserve everything + * above this address for processing crashdump. 
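+ * For example, if the crashed kernel registered a boot memory top of + * 0x30000000 (the 768MB minimum boot memory size defined in opal-fadump.h), + * only the first 768MB is reused for booting and all memory at or above + * that address is preserved for the dump.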
+ */ + fadump_conf->boot_mem_top = be64_to_cpu(addr); + pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top); + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; +} + +#else /* CONFIG_PRESERVE_FA_DUMP */ +static const struct opal_fadump_mem_struct *opal_fdm_active; +static const struct opal_mpipl_fadump *opal_cpu_metadata; +static struct opal_fadump_mem_struct *opal_fdm; + +#ifdef CONFIG_OPAL_CORE +extern bool kernel_initiated; +#endif + +static int opal_fadump_unregister(struct fw_dump *fadump_conf); + +static void opal_fadump_update_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + pr_debug("Boot memory regions count: %d\n", be16_to_cpu(fdm->region_cnt)); + + /* + * The destination address of the first boot memory region is the + * destination address of boot memory regions. + */ + fadump_conf->boot_mem_dest_addr = be64_to_cpu(fdm->rgn[0].dest); + pr_debug("Destination address of boot memory regions: %#016llx\n", + fadump_conf->boot_mem_dest_addr); + + fadump_conf->fadumphdr_addr = be64_to_cpu(fdm->fadumphdr_addr); +} + +/* + * This function is called in the capture kernel to get configuration details + * from metadata setup by the first kernel. + */ +static void __init opal_fadump_get_config(struct fw_dump *fadump_conf, + const struct opal_fadump_mem_struct *fdm) +{ + unsigned long base, size, last_end, hole_size; + int i; + + if (!fadump_conf->dump_active) + return; + + last_end = 0; + hole_size = 0; + fadump_conf->boot_memory_size = 0; + + pr_debug("Boot memory regions:\n"); + for (i = 0; i < be16_to_cpu(fdm->region_cnt); i++) { + base = be64_to_cpu(fdm->rgn[i].src); + size = be64_to_cpu(fdm->rgn[i].size); + pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size); + + fadump_conf->boot_mem_addr[i] = base; + fadump_conf->boot_mem_sz[i] = size; + fadump_conf->boot_memory_size += size; + hole_size += (base - last_end); + + last_end = base + size; + } + + /* + * Start address of reserve dump area (permanent reservation) for + * re-registering FADump after dump capture. + */ + fadump_conf->reserve_dump_area_start = be64_to_cpu(fdm->rgn[0].dest); + + /* + * Rarely, but it can so happen that system crashes before all + * boot memory regions are registered for MPIPL. In such + * cases, warn that the vmcore may not be accurate and proceed + * anyway as that is the best bet considering free pages, cache + * pages, user pages, etc are usually filtered out. + * + * Hope the memory that could not be preserved only has pages + * that are usually filtered out while saving the vmcore. + */ + if (be16_to_cpu(fdm->region_cnt) > be16_to_cpu(fdm->registered_regions)) { + pr_warn("Not all memory regions were saved!!!\n"); + pr_warn(" Unsaved memory regions:\n"); + i = be16_to_cpu(fdm->registered_regions); + while (i < be16_to_cpu(fdm->region_cnt)) { + pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n", + i, be64_to_cpu(fdm->rgn[i].src), + be64_to_cpu(fdm->rgn[i].size)); + i++; + } + + pr_warn("If the unsaved regions only contain pages that are filtered out (eg. 
free/user pages), the vmcore should still be usable.\n"); + pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n"); + } + + fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size); + fadump_conf->boot_mem_regs_cnt = be16_to_cpu(fdm->region_cnt); + opal_fadump_update_config(fadump_conf, fdm); +} + +/* Initialize kernel metadata */ +static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm) +{ + fdm->version = OPAL_FADUMP_VERSION; + fdm->region_cnt = cpu_to_be16(0); + fdm->registered_regions = cpu_to_be16(0); + fdm->fadumphdr_addr = cpu_to_be64(0); +} + +static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf) +{ + u64 addr = fadump_conf->reserve_dump_area_start; + u16 reg_cnt; + int i; + + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + /* Boot memory regions */ + reg_cnt = be16_to_cpu(opal_fdm->region_cnt); + for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) { + opal_fdm->rgn[i].src = cpu_to_be64(fadump_conf->boot_mem_addr[i]); + opal_fdm->rgn[i].dest = cpu_to_be64(addr); + opal_fdm->rgn[i].size = cpu_to_be64(fadump_conf->boot_mem_sz[i]); + + reg_cnt++; + addr += fadump_conf->boot_mem_sz[i]; + } + opal_fdm->region_cnt = cpu_to_be16(reg_cnt); + + /* + * Kernel metadata is passed to f/w and retrieved in capture kernel. + * So, use it to save fadump header address instead of calculating it. + */ + opal_fdm->fadumphdr_addr = cpu_to_be64(be64_to_cpu(opal_fdm->rgn[0].dest) + + fadump_conf->boot_memory_size); + + opal_fadump_update_config(fadump_conf, opal_fdm); + + return addr; +} + +static u64 opal_fadump_get_metadata_size(void) +{ + return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct)); +} + +static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf) +{ + int err = 0; + s64 ret; + + /* + * Use the last page(s) in FADump memory reservation for + * kernel metadata. + */ + fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start + + fadump_conf->reserve_dump_area_size - + opal_fadump_get_metadata_size()); + pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata); + + /* Initialize kernel metadata before registering the address with f/w */ + opal_fdm = __va(fadump_conf->kernel_metadata); + opal_fadump_init_metadata(opal_fdm); + + /* + * Register metadata address with f/w. Can be retrieved in + * the capture kernel. + */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, + fadump_conf->kernel_metadata); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set kernel metadata tag!\n"); + err = -EPERM; + } + + /* + * Register boot memory top address with f/w. Should be retrieved + * by a kernel that intends to preserve crash'ed kernel's memory. 
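+	 * (That consumer is the CONFIG_PRESERVE_FA_DUMP variant of + * opal_fadump_dt_scan() above, which reads the tag back with + * opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr) after the crash.)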
+ */ + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM, + fadump_conf->boot_mem_top); + if (ret != OPAL_SUCCESS) { + pr_err("Failed to set boot memory tag!\n"); + err = -EPERM; + } + + return err; +} + +static u64 opal_fadump_get_bootmem_min(void) +{ + return OPAL_FADUMP_MIN_BOOT_MEM; +} + +static int opal_fadump_register(struct fw_dump *fadump_conf) +{ + s64 rc = OPAL_PARAMETER; + u16 registered_regs; + int i, err = -EIO; + + registered_regs = be16_to_cpu(opal_fdm->registered_regions); + for (i = 0; i < be16_to_cpu(opal_fdm->region_cnt); i++) { + rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE, + be64_to_cpu(opal_fdm->rgn[i].src), + be64_to_cpu(opal_fdm->rgn[i].dest), + be64_to_cpu(opal_fdm->rgn[i].size)); + if (rc != OPAL_SUCCESS) + break; + + registered_regs++; + } + opal_fdm->registered_regions = cpu_to_be16(registered_regs); + + switch (rc) { + case OPAL_SUCCESS: + pr_info("Registration is successful!\n"); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_RESOURCE: + /* If MAX regions limit in f/w is hit, warn and proceed. */ + pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n", + (be16_to_cpu(opal_fdm->region_cnt) - + be16_to_cpu(opal_fdm->registered_regions))); + fadump_conf->dump_registered = 1; + err = 0; + break; + case OPAL_PARAMETER: + pr_err("Failed to register. Parameter Error(%lld).\n", rc); + break; + case OPAL_HARDWARE: + pr_err("Support not available.\n"); + fadump_conf->fadump_supported = 0; + fadump_conf->fadump_enabled = 0; + break; + default: + pr_err("Failed to register. Unknown Error(%lld).\n", rc); + break; + } + + /* + * If some regions were registered before OPAL_MPIPL_ADD_RANGE + * OPAL call failed, unregister all regions. + */ + if ((err < 0) && (be16_to_cpu(opal_fdm->registered_regions) > 0)) + opal_fadump_unregister(fadump_conf); + + return err; +} + +static int opal_fadump_unregister(struct fw_dump *fadump_conf) +{ + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0); + if (rc) { + pr_err("Failed to un-register - unexpected Error(%lld).\n", rc); + return -EIO; + } + + opal_fdm->registered_regions = cpu_to_be16(0); + fadump_conf->dump_registered = 0; + return 0; +} + +static int opal_fadump_invalidate(struct fw_dump *fadump_conf) +{ + s64 rc; + + rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0); + if (rc) { + pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc); + return -EIO; + } + + fadump_conf->dump_active = 0; + opal_fdm_active = NULL; + return 0; +} + +static void opal_fadump_cleanup(struct fw_dump *fadump_conf) +{ + s64 ret; + + ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0); + if (ret != OPAL_SUCCESS) + pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret); +} + +/* + * Verify if CPU state data is available. If available, do a bit of sanity + * checking before processing this data. 
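+ * The data is only used if the destination address is set and the per-thread + * entry size is sane (non-zero and no larger than the total CPU state data + * size); a newer CPU data version is warned about but tolerated.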
+ */ +static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf) +{ + if (!opal_cpu_metadata) + return false; + + fadump_conf->cpu_state_data_version = + be32_to_cpu(opal_cpu_metadata->cpu_data_version); + fadump_conf->cpu_state_entry_size = + be32_to_cpu(opal_cpu_metadata->cpu_data_size); + fadump_conf->cpu_state_dest_vaddr = + (u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest)); + fadump_conf->cpu_state_data_size = + be64_to_cpu(opal_cpu_metadata->region[0].size); + + if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) { + pr_warn("Supported CPU state data version: %u, found: %d!\n", + HDAT_FADUMP_CPU_DATA_VER, + fadump_conf->cpu_state_data_version); + pr_warn("WARNING: F/W using newer CPU state data format!!\n"); + } + + if ((fadump_conf->cpu_state_dest_vaddr == 0) || + (fadump_conf->cpu_state_entry_size == 0) || + (fadump_conf->cpu_state_entry_size > + fadump_conf->cpu_state_data_size)) { + pr_err("CPU state data is invalid. Ignoring!\n"); + return false; + } + + return true; +} + +/* + * Convert CPU state data saved at the time of crash into ELF notes. + * + * While the crashing CPU's register data is saved by the kernel, CPU state + * data for all CPUs is saved by f/w. In CPU state data provided by f/w, + * each register entry is of 16 bytes, a numerical identifier along with + * a GPR/SPR flag in the first 8 bytes and the register value in the next + * 8 bytes. For more details refer to F/W documentation. If this data is + * missing or in unsupported format, append crashing CPU's register data + * saved by the kernel in the PT_NOTE, to have something to work with in + * the vmcore file. + */ +static int __init +opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf, + struct fadump_crash_info_header *fdh) +{ + u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize; + struct hdat_fadump_thread_hdr *thdr; + bool is_cpu_data_valid = false; + u32 num_cpus = 1, *note_buf; + struct pt_regs regs; + char *bufp; + int rc, i; + + if (is_opal_fadump_cpu_data_valid(fadump_conf)) { + size_per_thread = fadump_conf->cpu_state_entry_size; + num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread); + bufp = __va(fadump_conf->cpu_state_dest_vaddr); + is_cpu_data_valid = true; + } + + rc = fadump_setup_cpu_notes_buf(num_cpus); + if (rc != 0) + return rc; + + note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr; + if (!is_cpu_data_valid) + goto out; + + /* + * Offset for register entries, entry size and registers count is + * duplicated in every thread header in keeping with HDAT format. + * Use these values from the first thread header. + */ + thdr = (struct hdat_fadump_thread_hdr *)bufp; + regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) + + be32_to_cpu(thdr->offset)); + reg_esize = be32_to_cpu(thdr->esize); + regs_cnt = be32_to_cpu(thdr->ecnt); + + pr_debug("--------CPU State Data------------\n"); + pr_debug("NumCpus : %u\n", num_cpus); + pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n", + regs_offset, reg_esize, regs_cnt); + + for (i = 0; i < num_cpus; i++, bufp += size_per_thread) { + thdr = (struct hdat_fadump_thread_hdr *)bufp; + + thread_pir = be32_to_cpu(thdr->pir); + pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n", + i, thread_pir, thdr->core_state); + + /* + * If this is kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. 
Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu == thread_pir) { + note_buf = fadump_regs_to_elf_notes(note_buf, + &fdh->regs); + pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + fdh->crashing_cpu, fdh->regs.gpr[1], + fdh->regs.nip); + continue; + } + + /* + * Register state data of MAX cores is provided by firmware, + * but some of this cores may not be active. So, while + * processing register state data, check core state and + * skip threads that belong to inactive cores. + */ + if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE) + continue; + + opal_fadump_read_regs((bufp + regs_offset), regs_cnt, + reg_esize, true, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n", + thread_pir, regs.gpr[1], regs.nip); + } + +out: + /* + * CPU state data is invalid/unsupported. Try appending crashing CPU's + * register data, if it is saved by the kernel. + */ + if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) { + if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) { + fadump_free_cpu_notes_buf(); + return -ENODEV; + } + + pr_warn("WARNING: appending only crashing CPU's register data\n"); + note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs)); + } + + final_note(note_buf); + + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fadump_conf->elfcorehdr_addr); + fadump_update_elfcore_header((char *)fadump_conf->elfcorehdr_addr); + return 0; +} + +static int __init opal_fadump_process(struct fw_dump *fadump_conf) +{ + struct fadump_crash_info_header *fdh; + int rc = -EINVAL; + + if (!opal_fdm_active || !fadump_conf->fadumphdr_addr) + return rc; + + fdh = __va(fadump_conf->fadumphdr_addr); + +#ifdef CONFIG_OPAL_CORE + /* + * If this is a kernel initiated crash, crashing_cpu would be set + * appropriately and register data of the crashing CPU saved by + * crashing kernel. Add this saved register data of crashing CPU + * to elf notes and populate the pt_regs for the remaining CPUs + * from register state data provided by firmware. + */ + if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN) + kernel_initiated = true; +#endif + + return opal_fadump_build_cpu_notes(fadump_conf, fdh); +} + +static void opal_fadump_region_show(struct fw_dump *fadump_conf, + struct seq_file *m) +{ + const struct opal_fadump_mem_struct *fdm_ptr; + u64 dumped_bytes = 0; + int i; + + if (fadump_conf->dump_active) + fdm_ptr = opal_fdm_active; + else + fdm_ptr = opal_fdm; + + for (i = 0; i < be16_to_cpu(fdm_ptr->region_cnt); i++) { + /* + * Only regions that are registered for MPIPL + * would have dump data. + */ + if ((fadump_conf->dump_active) && + (i < be16_to_cpu(fdm_ptr->registered_regions))) + dumped_bytes = be64_to_cpu(fdm_ptr->rgn[i].size); + + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ", + be64_to_cpu(fdm_ptr->rgn[i].src), + be64_to_cpu(fdm_ptr->rgn[i].dest)); + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n", + be64_to_cpu(fdm_ptr->rgn[i].size), dumped_bytes); + } + + /* Dump is active. Show preserved area start address. */ + if (fadump_conf->dump_active) { + seq_printf(m, "\nMemory above %#016llx is reserved for saving crash dump\n", + fadump_conf->boot_mem_top); + } +} + +static void opal_fadump_trigger(struct fadump_crash_info_header *fdh, + const char *msg) +{ + int rc; + + /* + * Unlike on pSeries platform, logical CPU number is not provided + * with architected register state data. 
So, store the crashing + * CPU's PIR instead to plug the appropriate register data for + * crashing CPU in the vmcore file. + */ + fdh->crashing_cpu = (u32)mfspr(SPRN_PIR); + + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg); + if (rc == OPAL_UNSUPPORTED) { + pr_emerg("Reboot type %d not supported.\n", + OPAL_REBOOT_MPIPL); + } else if (rc == OPAL_HARDWARE) + pr_emerg("No backend support for MPIPL!\n"); +} + +/* FADUMP_MAX_MEM_REGS or lower */ +static int opal_fadump_max_boot_mem_rgns(void) +{ + return FADUMP_MAX_MEM_REGS; +} + +static struct fadump_ops opal_fadump_ops = { + .fadump_init_mem_struct = opal_fadump_init_mem_struct, + .fadump_get_metadata_size = opal_fadump_get_metadata_size, + .fadump_setup_metadata = opal_fadump_setup_metadata, + .fadump_get_bootmem_min = opal_fadump_get_bootmem_min, + .fadump_register = opal_fadump_register, + .fadump_unregister = opal_fadump_unregister, + .fadump_invalidate = opal_fadump_invalidate, + .fadump_cleanup = opal_fadump_cleanup, + .fadump_process = opal_fadump_process, + .fadump_region_show = opal_fadump_region_show, + .fadump_trigger = opal_fadump_trigger, + .fadump_max_boot_mem_rgns = opal_fadump_max_boot_mem_rgns, +}; + +void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node) +{ + const __be32 *prop; + unsigned long dn; + __be64 be_addr; + u64 addr = 0; + int i, len; + s64 ret; + + /* + * Check if Firmware-Assisted Dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + dn = of_get_flat_dt_subnode_by_name(node, "dump"); + if (dn == -FDT_ERR_NOTFOUND) { + pr_debug("FADump support is missing!\n"); + return; + } + + if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) { + pr_err("Support missing for this f/w version!\n"); + return; + } + + prop = of_get_flat_dt_prop(dn, "fw-load-area", &len); + if (prop) { + /* + * Each f/w load area is an (address,size) pair, + * 2 cells each, totalling 4 cells per range. + */ + for (i = 0; i < len / (sizeof(*prop) * 4); i++) { + u64 base, end; + + base = of_read_number(prop + (i * 4) + 0, 2); + end = base; + end += of_read_number(prop + (i * 4) + 2, 2); + if (end > OPAL_FADUMP_MIN_BOOT_MEM) { + pr_err("F/W load area: 0x%llx-0x%llx\n", + base, end); + pr_err("F/W version not supported!\n"); + return; + } + } + } + + fadump_conf->ops = &opal_fadump_ops; + fadump_conf->fadump_supported = 1; + /* TODO: Add support to pass additional parameters */ + fadump_conf->param_area_supported = 0; + + /* + * Firmware supports 32-bit field for size. Align it to PAGE_SIZE + * and request firmware to copy multiple kernel boot memory regions. + */ + fadump_conf->max_copy_size = ALIGN_DOWN(U32_MAX, PAGE_SIZE); + + /* + * Check if dump has been initiated on last reboot. + */ + prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL); + if (!prop) + return; + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &be_addr); + if ((ret != OPAL_SUCCESS) || !be_addr) { + pr_err("Failed to get Kernel metadata (%lld)\n", ret); + return; + } + + addr = be64_to_cpu(be_addr); + pr_debug("Kernel metadata addr: %llx\n", addr); + + opal_fdm_active = __va(addr); + if (opal_fdm_active->version != OPAL_FADUMP_VERSION) { + pr_warn("Supported kernel metadata version: %u, found: %d!\n", + OPAL_FADUMP_VERSION, opal_fdm_active->version); + pr_warn("WARNING: Kernel metadata format mismatch identified! 
Core file maybe corrupted..\n"); + } + + /* Kernel regions not registered with f/w for MPIPL */ + if (be16_to_cpu(opal_fdm_active->registered_regions) == 0) { + opal_fdm_active = NULL; + return; + } + + ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &be_addr); + if (be_addr) { + addr = be64_to_cpu(be_addr); + pr_debug("CPU metadata addr: %llx\n", addr); + opal_cpu_metadata = __va(addr); + } + + pr_info("Firmware-assisted dump is active.\n"); + fadump_conf->dump_active = 1; + opal_fadump_get_config(fadump_conf, opal_fdm_active); +} +#endif /* !CONFIG_PRESERVE_FA_DUMP */ diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h new file mode 100644 index 000000000000..5eeb794b5eb1 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -0,0 +1,146 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Firmware-Assisted Dump support on POWER platform (OPAL). + * + * Copyright 2019, Hari Bathini, IBM Corporation. + */ + +#ifndef _POWERNV_OPAL_FADUMP_H +#define _POWERNV_OPAL_FADUMP_H + +#include <asm/reg.h> + +/* + * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum + * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't + * mess with crash'ed kernel's memory during MPIPL. + */ +#define OPAL_FADUMP_MIN_BOOT_MEM (0x30000000UL) + +/* + * OPAL FADump metadata structure format version + * + * OPAL FADump kernel metadata structure stores kernel metadata needed to + * register-for/process crash dump. Format version is used to keep a tab on + * the changes in the structure format. The changes, if any, to the format + * are expected to be minimal and backward compatible. + */ +#define OPAL_FADUMP_VERSION 0x1 + +/* + * OPAL FADump kernel metadata + * + * The address of this structure will be registered with f/w for retrieving + * in the capture kernel to process the crash dump. + */ +struct opal_fadump_mem_struct { + u8 version; + u8 reserved[3]; + __be16 region_cnt; /* number of regions */ + __be16 registered_regions; /* Regions registered for MPIPL */ + __be64 fadumphdr_addr; + struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS]; +} __packed; + +/* + * CPU state data + * + * CPU state data information is provided by f/w. The format for this data + * is defined in the HDAT spec. Version is used to keep a tab on the changes + * in this CPU state data format. Changes to this format are unlikely, but + * if there are any changes, please refer to latest HDAT specification. + */ +#define HDAT_FADUMP_CPU_DATA_VER 1 + +#define HDAT_FADUMP_CORE_INACTIVE (0x0F) + +/* HDAT thread header for register entries */ +struct hdat_fadump_thread_hdr { + __be32 pir; + /* 0x00 - 0x0F - The corresponding stop state of the core */ + u8 core_state; + u8 reserved[3]; + + __be32 offset; /* Offset to Register Entries array */ + __be32 ecnt; /* Number of entries */ + __be32 esize; /* Alloc size of each array entry in bytes */ + __be32 eactsz; /* Actual size of each array entry in bytes */ +} __packed; + +/* Register types populated by f/w */ +#define HDAT_FADUMP_REG_TYPE_GPR 0x01 +#define HDAT_FADUMP_REG_TYPE_SPR 0x02 + +/* ID numbers used by f/w while populating certain registers */ +#define HDAT_FADUMP_REG_ID_NIP 0x7D0 +#define HDAT_FADUMP_REG_ID_MSR 0x7D1 +#define HDAT_FADUMP_REG_ID_CCR 0x7D2 + +/* HDAT register entry. 
*/ +struct hdat_fadump_reg_entry { + __be32 reg_type; + __be32 reg_num; + __be64 reg_val; +} __packed; + +static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs, + u32 reg_type, u32 reg_num, + u64 reg_val) +{ + if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) { + if (reg_num < 32) + regs->gpr[reg_num] = reg_val; + return; + } + + switch (reg_num) { + case SPRN_CTR: + regs->ctr = reg_val; + break; + case SPRN_LR: + regs->link = reg_val; + break; + case SPRN_XER: + regs->xer = reg_val; + break; + case SPRN_DAR: + regs->dar = reg_val; + break; + case SPRN_DSISR: + regs->dsisr = reg_val; + break; + case HDAT_FADUMP_REG_ID_NIP: + regs->nip = reg_val; + break; + case HDAT_FADUMP_REG_ID_MSR: + regs->msr = reg_val; + break; + case HDAT_FADUMP_REG_ID_CCR: + regs->ccr = reg_val; + break; + } +} + +static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, + unsigned int reg_entry_size, + bool cpu_endian, + struct pt_regs *regs) +{ + struct hdat_fadump_reg_entry *reg_entry; + u64 val; + int i; + + memset(regs, 0, sizeof(struct pt_regs)); + + for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { + reg_entry = (struct hdat_fadump_reg_entry *)bufp; + val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) : + (u64 __force)(reg_entry->reg_val)); + opal_fadump_set_regval_regnum(regs, + be32_to_cpu(reg_entry->reg_type), + be32_to_cpu(reg_entry->reg_num), + val); + } +} + +#endif /* _POWERNV_OPAL_FADUMP_H */ diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c index b37015101bf6..a3f7a2928767 100644 --- a/arch/powerpc/platforms/powernv/opal-flash.c +++ b/arch/powerpc/platforms/powernv/opal-flash.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV OPAL Firmware Update Interface * * Copyright 2013 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define DEBUG @@ -436,7 +432,7 @@ static int alloc_image_buf(char *buffer, size_t count) * and pre-allocate required memory. */ static ssize_t image_data_write(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buffer, loff_t pos, size_t count) { int rc; @@ -516,7 +512,7 @@ static struct attribute *image_op_attrs[] = { NULL /* need to NULL terminate the list of attributes */ }; -static struct attribute_group image_op_attr_group = { +static const struct attribute_group image_op_attr_group = { .attrs = image_op_attrs, }; @@ -524,6 +520,10 @@ void __init opal_flash_update_init(void) { int ret; + /* Firmware update is not supported by firmware */ + if (!opal_check_token(OPAL_FLASH_VALIDATE)) + return; + /* Allocate validate image buffer */ validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL); if (!validate_flash_data.buf) { diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index 586ec71a4e17..f0c1830deb51 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -1,19 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * OPAL hypervisor Maintenance interrupt handling support in PowerNV. 
* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; If not, see <http://www.gnu.org/licenses/>. - * * Copyright 2014 IBM Corporation * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> */ @@ -149,6 +137,43 @@ static void print_nx_checkstop_reason(const char *level, xstop_reason[i].description); } +static void print_npu_checkstop_reason(const char *level, + struct OpalHMIEvent *hmi_evt) +{ + uint8_t reason, reason_count, i; + + /* + * We may not have a checkstop reason on some combination of + * hardware and/or skiboot version + */ + if (!hmi_evt->u.xstop_error.xstop_reason) { + printk("%s NPU checkstop on chip %x\n", level, + be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id)); + return; + } + + /* + * NPU2 has 3 FIRs. Reason encoded on a byte as: + * 2 bits for the FIR number + * 6 bits for the bit number + * It may be possible to find several reasons. + * + * We don't display a specific message per FIR bit as there + * are too many and most are meaningless without the workbook + * and/or hw team help anyway. + */ + reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) / + sizeof(reason); + for (i = 0; i < reason_count; i++) { + reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF; + if (reason) + printk("%s NPU checkstop on chip %x: FIR%d bit %d is set\n", + level, + be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id), + reason >> 6, reason & 0x3F); + } +} + static void print_checkstop_reason(const char *level, struct OpalHMIEvent *hmi_evt) { @@ -160,6 +185,9 @@ static void print_checkstop_reason(const char *level, case CHECKSTOP_TYPE_NX: print_nx_checkstop_reason(level, hmi_evt); break; + case CHECKSTOP_TYPE_NPU: + print_npu_checkstop_reason(level, hmi_evt); + break; default: printk("%s Unknown Malfunction Alert of type %d\n", level, type); @@ -185,6 +213,8 @@ static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt) "A hypervisor resource error occurred", "CAPP recovery process is in progress", }; + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); /* Print things out */ if (hmi_evt->version < OpalHMIEvt_V1) { @@ -212,19 +242,22 @@ static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt) break; } - printk("%s%s Hypervisor Maintenance interrupt [%s]\n", - level, sevstr, - hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ? - "Recovered" : "Not recovered"); - error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ? - hmi_error_types[hmi_evt->type] - : "Unknown"; - printk("%s Error detail: %s\n", level, error_info); - printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer)); - if ((hmi_evt->type == OpalHMI_ERROR_TFAC) || - (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY)) - printk("%s TFMR: %016llx\n", level, + if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) { + printk("%s%s Hypervisor Maintenance interrupt [%s]\n", + level, sevstr, + hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ? 
+ "Recovered" : "Not recovered"); + error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ? + hmi_error_types[hmi_evt->type] + : "Unknown"; + printk("%s Error detail: %s\n", level, error_info); + printk("%s HMER: %016llx\n", level, + be64_to_cpu(hmi_evt->hmer)); + if ((hmi_evt->type == OpalHMI_ERROR_TFAC) || + (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY)) + printk("%s TFMR: %016llx\n", level, be64_to_cpu(hmi_evt->tfmr)); + } if (hmi_evt->version < OpalHMIEvt_V2) return; diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 58a07948c76e..828fc4d88471 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * OPAL IMC interface detection driver * Supported on POWERNV platform @@ -5,23 +6,17 @@ * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation. * (C) 2017 Anju T Sudhakar, IBM Corporation. * (C) 2017 Hemant K Shaw, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or later version. */ #include <linux/kernel.h> #include <linux/platform_device.h> #include <linux/of.h> #include <linux/of_address.h> -#include <linux/of_platform.h> #include <linux/crash_dump.h> +#include <linux/debugfs.h> #include <asm/opal.h> #include <asm/io.h> #include <asm/imc-pmu.h> #include <asm/cputhreads.h> -#include <asm/debugfs.h> static struct dentry *imc_debugfs_parent; @@ -39,11 +34,10 @@ static int imc_mem_set(void *data, u64 val) } DEFINE_DEBUGFS_ATTRIBUTE(fops_imc_x64, imc_mem_get, imc_mem_set, "0x%016llx\n"); -static struct dentry *imc_debugfs_create_x64(const char *name, umode_t mode, - struct dentry *parent, u64 *value) +static void imc_debugfs_create_x64(const char *name, umode_t mode, + struct dentry *parent, u64 *value) { - return debugfs_create_file_unsafe(name, mode, parent, - value, &fops_imc_x64); + debugfs_create_file_unsafe(name, mode, parent, value, &fops_imc_x64); } /* @@ -57,41 +51,28 @@ static void export_imc_mode_and_cmd(struct device_node *node, struct imc_pmu *pmu_ptr) { static u64 loc, *imc_mode_addr, *imc_cmd_addr; - int chip = 0, nid; char mode[16], cmd[16]; u32 cb_offset; + struct imc_mem_info *ptr = pmu_ptr->mem_info; - imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); - - /* - * Return here, either because 'imc' directory already exists, - * Or failed to create a new one. 
- */ - if (!imc_debugfs_parent) - return; + imc_debugfs_parent = debugfs_create_dir("imc", arch_debugfs_dir); if (of_property_read_u32(node, "cb_offset", &cb_offset)) cb_offset = IMC_CNTL_BLK_OFFSET; - for_each_node(nid) { - loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset; + while (ptr->vbase != NULL) { + loc = (u64)(ptr->vbase) + cb_offset; imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET); - sprintf(mode, "imc_mode_%d", nid); - if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent, - imc_mode_addr)) - goto err; + sprintf(mode, "imc_mode_%d", (u32)(ptr->id)); + imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent, + imc_mode_addr); imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET); - sprintf(cmd, "imc_cmd_%d", nid); - if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent, - imc_cmd_addr)) - goto err; - chip++; + sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id)); + imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent, + imc_cmd_addr); + ptr++; } - return; - -err: - debugfs_remove_recursive(imc_debugfs_parent); } /* @@ -127,7 +108,7 @@ static int imc_get_mem_addr_nest(struct device_node *node, nr_chips)) goto error; - pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(*pmu_ptr->mem_info), + pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info), GFP_KERNEL); if (!pmu_ptr->mem_info) goto error; @@ -139,7 +120,6 @@ static int imc_get_mem_addr_nest(struct device_node *node, } pmu_ptr->imc_counter_mmaped = true; - export_imc_mode_and_cmd(node, pmu_ptr); kfree(base_addr_arr); kfree(chipid_arr); return 0; @@ -155,31 +135,31 @@ error: * and domain as the inputs. * Allocates memory for the struct imc_pmu, sets up its domain, size and offsets */ -static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) +static struct imc_pmu *imc_pmu_create(struct device_node *parent, int pmu_index, int domain) { int ret = 0; struct imc_pmu *pmu_ptr; u32 offset; + /* Return for unknown domain */ + if (domain < 0) + return NULL; + /* memory for pmu */ pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL); if (!pmu_ptr) - return -ENOMEM; + return NULL; /* Set the domain */ pmu_ptr->domain = domain; ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size); - if (ret) { - ret = -EINVAL; + if (ret) goto free_pmu; - } if (!of_property_read_u32(parent, "offset", &offset)) { - if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) { - ret = -EINVAL; + if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) goto free_pmu; - } } /* Function to register IMC pmu */ @@ -190,14 +170,14 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) if (pmu_ptr->domain == IMC_DOMAIN_NEST) kfree(pmu_ptr->mem_info); kfree(pmu_ptr); - return ret; + return NULL; } - return 0; + return pmu_ptr; free_pmu: kfree(pmu_ptr); - return ret; + return NULL; } static void disable_nest_pmu_counters(void) @@ -205,7 +185,7 @@ static void disable_nest_pmu_counters(void) int nid, cpu; const struct cpumask *l_cpumask; - get_online_cpus(); + cpus_read_lock(); for_each_node_with_cpus(nid) { l_cpumask = cpumask_of_node(nid); cpu = cpumask_first_and(l_cpumask, cpu_online_mask); @@ -214,25 +194,25 @@ static void disable_nest_pmu_counters(void) opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, get_hard_smp_processor_id(cpu)); } - put_online_cpus(); + cpus_read_unlock(); } static void disable_core_pmu_counters(void) { - cpumask_t cores_map; int cpu, rc; - get_online_cpus(); + cpus_read_lock(); /* Disable the IMC Core functions */ - cores_map = cpu_online_cores_map(); - for_each_cpu(cpu, 
&cores_map) { + for_each_online_cpu(cpu) { + if (cpu_first_thread_sibling(cpu) != cpu) + continue; rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, get_hard_smp_processor_id(cpu)); if (rc) pr_err("%s: Failed to stop Core (cpu = %d)\n", - __FUNCTION__, cpu); + __func__, cpu); } - put_online_cpus(); + cpus_read_unlock(); } int get_max_nest_dev(void) @@ -254,6 +234,7 @@ int get_max_nest_dev(void) static int opal_imc_counters_probe(struct platform_device *pdev) { struct device_node *imc_dev = pdev->dev.of_node; + struct imc_pmu *pmu; int pmu_count = 0, domain; bool core_imc_reg = false, thread_imc_reg = false; u32 type; @@ -269,6 +250,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev) } for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) { + pmu = NULL; if (of_property_read_u32(imc_dev, "type", &type)) { pr_warn("IMC Device without type property\n"); continue; @@ -284,15 +266,22 @@ static int opal_imc_counters_probe(struct platform_device *pdev) case IMC_TYPE_THREAD: domain = IMC_DOMAIN_THREAD; break; + case IMC_TYPE_TRACE: + domain = IMC_DOMAIN_TRACE; + break; default: pr_warn("IMC Unknown Device type \n"); domain = -1; break; } - if (!imc_pmu_create(imc_dev, pmu_count, domain)) { - if (domain == IMC_DOMAIN_NEST) + pmu = imc_pmu_create(imc_dev, pmu_count, domain); + if (pmu != NULL) { + if (domain == IMC_DOMAIN_NEST) { + if (!imc_debugfs_parent) + export_imc_mode_and_cmd(imc_dev, pmu); pmu_count++; + } if (domain == IMC_DOMAIN_CORE) core_imc_reg = true; if (domain == IMC_DOMAIN_THREAD) @@ -300,10 +289,6 @@ static int opal_imc_counters_probe(struct platform_device *pdev) } } - /* If none of the nest units are registered, remove debugfs interface */ - if (pmu_count == 0) - debugfs_remove_recursive(imc_debugfs_parent); - /* If core imc is not registered, unregister thread-imc */ if (!core_imc_reg && thread_imc_reg) unregister_thread_imc(); diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index bc97770a67db..e180bd8e1400 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * This file implements an irqchip for OPAL events. Whenever there is * an interrupt that is handled by OPAL we get passed a list of events @@ -5,11 +6,6 @@ * interrupts to Linux so we implement an irqchip to handle them. * * Copyright Alistair Popple, IBM Corporation 2014. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. 
*/ #include <linux/bitops.h> #include <linux/irq.h> @@ -50,23 +46,20 @@ void opal_handle_events(void) e = READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask; again: while (e) { - int virq, hwirq; + int hwirq; hwirq = fls64(e) - 1; e &= ~BIT_ULL(hwirq); local_irq_disable(); - virq = irq_find_mapping(opal_event_irqchip.domain, hwirq); - if (virq) { - irq_enter(); - generic_handle_irq(virq); - irq_exit(); - } + irq_enter(); + generic_handle_domain_irq(opal_event_irqchip.domain, hwirq); + irq_exit(); local_irq_enable(); cond_resched(); } - last_outstanding_events = 0; + WRITE_ONCE(last_outstanding_events, 0); if (opal_poll_events(&events) != OPAL_SUCCESS) return; e = be64_to_cpu(events) & opal_event_irqchip.mask; @@ -76,7 +69,7 @@ again: bool opal_have_pending_events(void) { - if (last_outstanding_events & opal_event_irqchip.mask) + if (READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask) return true; return false; } @@ -131,7 +124,7 @@ static irqreturn_t opal_interrupt(int irq, void *data) __be64 events; opal_handle_interrupt(virq_to_hw(irq), &events); - last_outstanding_events = be64_to_cpu(events); + WRITE_ONCE(last_outstanding_events, be64_to_cpu(events)); if (opal_have_pending_events()) opal_wake_poller(); @@ -198,7 +191,8 @@ int __init opal_event_init(void) * fall back to the legacy method (opal_event_request(...)) * anyway. */ dn = of_find_compatible_node(NULL, NULL, "ibm,opal-event"); - opal_event_irqchip.domain = irq_domain_add_linear(dn, MAX_NUM_EVENTS, + opal_event_irqchip.domain = irq_domain_create_linear(of_fwnode_handle(dn), + MAX_NUM_EVENTS, &opal_event_domain_ops, &opal_event_irqchip); of_node_put(dn); if (!opal_event_irqchip.domain) { @@ -282,11 +276,14 @@ int __init opal_event_init(void) else name = kasprintf(GFP_KERNEL, "opal"); + if (!name) + continue; /* Install interrupt handler */ rc = request_irq(r->start, opal_interrupt, r->flags & IRQD_TRIGGER_MASK, name, NULL); if (rc) { pr_warn("Error %d requesting OPAL irq %d\n", rc, (int)r->start); + kfree(name); continue; } } diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c index 55691950d981..bb4218fa796e 100644 --- a/arch/powerpc/platforms/powernv/opal-kmsg.c +++ b/arch/powerpc/platforms/powernv/opal-kmsg.c @@ -1,14 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * kmsg dumper that ensures the OPAL console fully flushes panic messages * * Author: Russell Currey <ruscur@russell.cc> * * Copyright 2015 IBM Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. */ #include <linux/kmsg_dump.h> @@ -24,13 +20,13 @@ * message, it just ensures that OPAL completely flushes the console buffer. */ static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + struct kmsg_dump_detail *detail) { /* * Outside of a panic context the pollers will continue to run, * so we don't need to do any special flushing. 
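The opal-kmsg.c dumper whose comment closes just below only acts on the panic path, since the regular pollers keep the OPAL console drained otherwise. For reference, a hedged sketch of a minimal dumper using the kmsg_dump_detail-based callback shown in that hunk (my_panic_flush and my_dumper are made-up names; the actual console flush is elided):

#include <linux/kmsg_dump.h>

static void my_panic_flush(struct kmsg_dumper *dumper,
			   struct kmsg_dump_detail *detail)
{
	/* outside a panic the normal pollers keep flushing, so do nothing */
	if (detail->reason != KMSG_DUMP_PANIC)
		return;

	/* flush the hardware/firmware console here */
}

static struct kmsg_dumper my_dumper = {
	.dump = my_panic_flush,
};

/* registered once at init time: kmsg_dump_register(&my_dumper); */
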
*/ - if (reason != KMSG_DUMP_PANIC) + if (detail->reason != KMSG_DUMP_PANIC) return; opal_flush_console(0); diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index 2623996a193a..8a7f39e106bd 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV LPC bus handling. * * Copyright 2013 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> @@ -14,13 +10,13 @@ #include <linux/bug.h> #include <linux/io.h> #include <linux/slab.h> +#include <linux/debugfs.h> #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/opal.h> #include <asm/prom.h> #include <linux/uaccess.h> -#include <asm/debugfs.h> #include <asm/isa-bridge.h> static int opal_lpc_chip_id = -1; @@ -201,7 +197,7 @@ static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf, /* * Select access size based on count and alignment and - * access type. IO and MEM only support byte acceses, + * access type. IO and MEM only support byte accesses, * FW supports all 3. */ len = 1; @@ -375,7 +371,7 @@ static int opal_lpc_init_debugfs(void) if (opal_lpc_chip_id < 0) return -ENODEV; - root = debugfs_create_dir("lpc", powerpc_debugfs_root); + root = debugfs_create_dir("lpc", arch_debugfs_dir); rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO); rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM); @@ -397,16 +393,17 @@ void __init opal_lpc_init(void) for_each_compatible_node(np, NULL, "ibm,power8-lpc") { if (!of_device_is_available(np)) continue; - if (!of_get_property(np, "primary", NULL)) + if (!of_property_present(np, "primary")) continue; opal_lpc_chip_id = of_get_ibm_chip_id(np); + of_node_put(np); break; } if (opal_lpc_chip_id < 0) return; /* Does it support direct mapping ? */ - if (of_get_property(np, "ranges", NULL)) { + if (of_property_present(np, "ranges")) { pr_info("OPAL: Found memory mapped LPC bus on chip %d\n", opal_lpc_chip_id); isa_bridge_init_non_pci(np); diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c index dcb42bcb5efa..a1754a28265d 100644 --- a/arch/powerpc/platforms/powernv/opal-memory-errors.c +++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c @@ -1,20 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * OPAL asynchronus Memory error handling support in PowerNV. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
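On the opal-lpc.c hunk above: breaking out of for_each_compatible_node() leaves the iterator's reference on the matched node held, which is why the of_node_put() is added there. The same pattern in isolation (find_primary_lpc_chip is a hypothetical helper used only for illustration):

#include <linux/of.h>
#include <asm/prom.h>	/* of_get_ibm_chip_id() */

static int find_primary_lpc_chip(void)
{
	struct device_node *np;
	int chip_id = -1;

	for_each_compatible_node(np, NULL, "ibm,power8-lpc") {
		if (!of_property_present(np, "primary"))
			continue;
		chip_id = of_get_ibm_chip_id(np);
		of_node_put(np);	/* drop the reference held across the break */
		break;
	}

	return chip_id;
}
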
- * * Copyright 2013 IBM Corporation * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com> */ @@ -95,7 +82,7 @@ static DECLARE_WORK(mem_error_work, mem_error_handler); /* * opal_memory_err_event - notifier handler that queues up the opal message - * to be preocessed later. + * to be processed later. */ static int opal_memory_err_event(struct notifier_block *nb, unsigned long msg_type, void *msg) diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index acd3206dfae3..992a6b379a66 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV OPAL in-memory console interface * * Copyright 2014 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <asm/io.h> @@ -16,6 +12,8 @@ #include <linux/types.h> #include <asm/barrier.h> +#include "powernv.h" + /* OPAL in-memory console. Defined in OPAL source at core/console.c */ struct memcons { __be64 magic; @@ -33,23 +31,23 @@ struct memcons { static struct memcons *opal_memcons = NULL; -ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) +ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count) { const char *conbuf; ssize_t ret; size_t first_read = 0; uint32_t out_pos, avail; - if (!opal_memcons) + if (!mc) return -ENODEV; - out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos)); + out_pos = be32_to_cpu(READ_ONCE(mc->out_pos)); /* Now we've read out_pos, put a barrier in before reading the new * data it points to in conbuf. */ smp_rmb(); - conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys)); + conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys)); /* When the buffer has wrapped, read from the out_pos marker to the end * of the buffer, and then read the remaining data as in the un-wrapped @@ -57,7 +55,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) if (out_pos & MEMCONS_OUT_POS_WRAP) { out_pos &= MEMCONS_OUT_POS_MASK; - avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos; + avail = be32_to_cpu(mc->obuf_size) - out_pos; ret = memory_read_from_buffer(to, count, &pos, conbuf + out_pos, avail); @@ -75,7 +73,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) } /* Sanity check. The firmware should not do this to us. */ - if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) { + if (out_pos > be32_to_cpu(mc->obuf_size)) { pr_err("OPAL: memory console corruption. 
Aborting read.\n"); return -EINVAL; } @@ -90,44 +88,65 @@ out: return ret; } +ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count) +{ + return memcons_copy(opal_memcons, to, pos, count); +} + static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj, - struct bin_attribute *bin_attr, char *to, + const struct bin_attribute *bin_attr, char *to, loff_t pos, size_t count) { return opal_msglog_copy(to, pos, count); } -static struct bin_attribute opal_msglog_attr = { - .attr = {.name = "msglog", .mode = 0444}, +static struct bin_attribute opal_msglog_attr __ro_after_init = { + .attr = {.name = "msglog", .mode = 0400}, .read = opal_msglog_read }; -void __init opal_msglog_init(void) +struct memcons *__init memcons_init(struct device_node *node, const char *mc_prop_name) { u64 mcaddr; struct memcons *mc; - if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) { - pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n"); - return; + if (of_property_read_u64(node, mc_prop_name, &mcaddr)) { + pr_warn("%s property not found, no message log\n", + mc_prop_name); + goto out_err; } mc = phys_to_virt(mcaddr); if (!mc) { - pr_warn("OPAL: memory console address is invalid\n"); - return; + pr_warn("memory console address is invalid\n"); + goto out_err; } if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) { - pr_warn("OPAL: memory console version is invalid\n"); - return; + pr_warn("memory console version is invalid\n"); + goto out_err; } - /* Report maximum size */ - opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) + - be32_to_cpu(mc->obuf_size); + return mc; + +out_err: + return NULL; +} + +u32 __init memcons_get_size(struct memcons *mc) +{ + return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size); +} + +void __init opal_msglog_init(void) +{ + opal_memcons = memcons_init(opal_node, "ibm,opal-memcons"); + if (!opal_memcons) { + pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n"); + return; + } - opal_memcons = mc; + opal_msglog_attr.size = memcons_get_size(opal_memcons); } void __init opal_msglog_sysfs_init(void) diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index 5584247f5029..380bc2d7ebbf 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV nvram code. * * Copyright 2011 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define DEBUG diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c index 89ab1da57657..db99ffcb7b82 100644 --- a/arch/powerpc/platforms/powernv/opal-power.c +++ b/arch/powerpc/platforms/powernv/opal-power.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV OPAL power control for graceful shutdown handling * * Copyright 2015 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ #define pr_fmt(fmt) "opal-power: " fmt @@ -57,7 +53,7 @@ static bool detect_epow(void) } /* Check for existing EPOW, DPO events */ -static bool poweroff_pending(void) +static bool __init poweroff_pending(void) { int rc; __be64 opal_dpo_timeout; diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c index d90ee4fc2c6a..ea917266aa17 100644 --- a/arch/powerpc/platforms/powernv/opal-powercap.c +++ b/arch/powerpc/platforms/powernv/opal-powercap.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV OPAL Powercap interface * * Copyright 2017 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "opal-powercap: " fmt @@ -17,7 +13,7 @@ #include <asm/opal.h> -DEFINE_MUTEX(powercap_mutex); +static DEFINE_MUTEX(powercap_mutex); static struct kobject *powercap_kobj; @@ -133,7 +129,7 @@ out_token: return ret; } -static void powercap_add_attr(int handle, const char *name, +static void __init powercap_add_attr(int handle, const char *name, struct powercap_attr *attr) { attr->handle = handle; @@ -157,7 +153,7 @@ void __init opal_powercap_init(void) pcaps = kcalloc(of_get_child_count(powercap), sizeof(*pcaps), GFP_KERNEL); if (!pcaps) - return; + goto out_put_powercap; powercap_kobj = kobject_create_and_add("powercap", opal_kobj); if (!powercap_kobj) { @@ -200,6 +196,12 @@ void __init opal_powercap_init(void) j = 0; pcaps[i].pg.name = kasprintf(GFP_KERNEL, "%pOFn", node); + if (!pcaps[i].pg.name) { + kfree(pcaps[i].pattrs); + kfree(pcaps[i].pg.attrs); + goto out_pcaps_pattrs; + } + if (has_min) { powercap_add_attr(min, "powercap-min", &pcaps[i].pattrs[j]); @@ -230,6 +232,7 @@ void __init opal_powercap_init(void) } i++; } + of_node_put(powercap); return; @@ -240,6 +243,9 @@ out_pcaps_pattrs: kfree(pcaps[i].pg.name); } kobject_put(powercap_kobj); + of_node_put(node); out_pcaps: kfree(pcaps); +out_put_powercap: + of_node_put(powercap); } diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 4070bb4e9da4..dc246ed4b7b4 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -1,17 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * OPAL Runtime Diagnostics interface driver * Supported on POWERNV platform * * Copyright IBM Corporation 2015 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #define pr_fmt(fmt) "opal-prd: " fmt @@ -32,13 +24,20 @@ #include <linux/uaccess.h> -/** +struct opal_prd_msg { + union { + struct opal_prd_msg_header header; + DECLARE_FLEX_ARRAY(u8, data); + }; +}; + +/* * The msg member must be at the end of the struct, as it's followed by the * message data. 
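On the opal-prd.c change above: wrapping the header in a union with DECLARE_FLEX_ARRAY() lets a queue item carry the full variable-length message, header included, directly after the list head. A hedged sketch of how such an item would be sized and filled; the helper name and GFP flag are illustrative, and the real notifier path appears further down in this hunk:

#include <linux/slab.h>
#include <linux/string.h>

static struct opal_prd_msg_queue_item *prd_queue_item_alloc(const void *msg,
							    size_t msg_size)
{
	struct opal_prd_msg_queue_item *item;
	size_t item_size;

	/* room for the list head plus the whole message, header included */
	item_size = msg_size + sizeof(*item) - sizeof(item->msg);

	item = kzalloc(item_size, GFP_ATOMIC);
	if (!item)
		return NULL;

	memcpy(&item->msg.data, msg, msg_size);
	return item;
}
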
*/ struct opal_prd_msg_queue_item { - struct list_head list; - struct opal_prd_msg_header msg; + struct list_head list; + struct opal_prd_msg msg; }; static struct device_node *prd_node; @@ -67,6 +66,8 @@ static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size) const char *label; addrp = of_get_address(node, 0, &range_size, NULL); + if (!addrp) + continue; range_addr = of_read_number(addrp, 2); range_end = range_addr + range_size; @@ -113,7 +114,6 @@ static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma) { size_t addr, size; pgprot_t page_prot; - int rc; pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, @@ -129,10 +129,8 @@ static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma) page_prot = phys_mem_access_prot(file, vma->vm_pgoff, size, vma->vm_page_prot); - rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size, page_prot); - - return rc; } static bool opal_msg_queue_empty(void) @@ -167,7 +165,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf, int rc; /* we need at least a header's worth of data */ - if (count < sizeof(item->msg)) + if (count < sizeof(item->msg.header)) return -EINVAL; if (*ppos) @@ -197,7 +195,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf, return -EINTR; } - size = be16_to_cpu(item->msg.size); + size = be16_to_cpu(item->msg.header.size); if (size > count) { err = -EINVAL; goto err_requeue; @@ -225,8 +223,8 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct opal_prd_msg_header hdr; + struct opal_prd_msg *msg; ssize_t size; - void *msg; int rc; size = sizeof(hdr); @@ -258,12 +256,12 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf, static int opal_prd_release(struct inode *inode, struct file *file) { - struct opal_prd_msg_header msg; + struct opal_prd_msg msg; - msg.size = cpu_to_be16(sizeof(msg)); - msg.type = OPAL_PRD_MSG_TYPE_FINI; + msg.header.size = cpu_to_be16(sizeof(msg)); + msg.header.type = OPAL_PRD_MSG_TYPE_FINI; - opal_prd_msg((struct opal_prd_msg *)&msg); + opal_prd_msg(&msg); atomic_xchg(&prd_usage, 0); @@ -350,7 +348,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb, int msg_size, item_size; unsigned long flags; - if (msg_type != OPAL_MSG_PRD) + if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2) return 0; /* Calculate total size of the message and item we need to store. 
The @@ -363,7 +361,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb, if (!item) return -ENOMEM; - memcpy(&item->msg, msg->params, msg_size); + memcpy(&item->msg.data, msg->params, msg_size); spin_lock_irqsave(&opal_prd_msg_queue_lock, flags); list_add_tail(&item->list, &opal_prd_msg_queue); @@ -380,6 +378,12 @@ static struct notifier_block opal_prd_event_nb = { .priority = 0, }; +static struct notifier_block opal_prd_event_nb2 = { + .notifier_call = opal_prd_msg_notifier, + .next = NULL, + .priority = 0, +}; + static int opal_prd_probe(struct platform_device *pdev) { int rc; @@ -401,22 +405,31 @@ static int opal_prd_probe(struct platform_device *pdev) return rc; } + rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb2); + if (rc) { + pr_err("Couldn't register PRD2 event notifier\n"); + opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb); + return rc; + } + rc = misc_register(&opal_prd_dev); if (rc) { pr_err("failed to register miscdev\n"); opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb); + opal_message_notifier_unregister(OPAL_MSG_PRD2, + &opal_prd_event_nb2); return rc; } return 0; } -static int opal_prd_remove(struct platform_device *pdev) +static void opal_prd_remove(struct platform_device *pdev) { misc_deregister(&opal_prd_dev); opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb); - return 0; + opal_message_notifier_unregister(OPAL_MSG_PRD2, &opal_prd_event_nb2); } static const struct of_device_id opal_prd_match[] = { @@ -430,7 +443,7 @@ static struct platform_driver opal_prd_driver = { .of_match_table = opal_prd_match, }, .probe = opal_prd_probe, - .remove = opal_prd_remove, + .remove = opal_prd_remove, }; module_platform_driver(opal_prd_driver); diff --git a/arch/powerpc/platforms/powernv/opal-psr.c b/arch/powerpc/platforms/powernv/opal-psr.c index 74986b35cf77..6441e17b6996 100644 --- a/arch/powerpc/platforms/powernv/opal-psr.c +++ b/arch/powerpc/platforms/powernv/opal-psr.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV OPAL Power-Shift-Ratio interface * * Copyright 2017 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "opal-psr: " fmt @@ -17,11 +13,11 @@ #include <asm/opal.h> -DEFINE_MUTEX(psr_mutex); +static DEFINE_MUTEX(psr_mutex); static struct kobject *psr_kobj; -struct psr_attr { +static struct psr_attr { u32 handle; struct kobj_attribute attr; } *psr_attrs; @@ -139,7 +135,7 @@ void __init opal_psr_init(void) psr_attrs = kcalloc(of_get_child_count(psr), sizeof(*psr_attrs), GFP_KERNEL); if (!psr_attrs) - return; + goto out_put_psr; psr_kobj = kobject_create_and_add("psr", opal_kobj); if (!psr_kobj) { @@ -166,10 +162,14 @@ void __init opal_psr_init(void) } i++; } + of_node_put(psr); return; out_kobj: + of_node_put(node); kobject_put(psr_kobj); out: kfree(psr_attrs); +out_put_psr: + of_node_put(psr); } diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c index 42ec642a3eba..79011a263aa6 100644 --- a/arch/powerpc/platforms/powernv/opal-rtc.c +++ b/arch/powerpc/platforms/powernv/opal-rtc.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV Real Time Clock. * * Copyright 2011 IBM Corp. 
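Why opal-prd.c above needs a second notifier_block for the same callback: a notifier_block is linked into exactly one chain through its next pointer, so handling both OPAL_MSG_PRD and OPAL_MSG_PRD2 with one handler requires two block instances. A short sketch with made-up names:

#include <linux/notifier.h>

static int prd_msg_handler(struct notifier_block *nb, unsigned long msg_type,
			   void *msg)
{
	/* queue the message for the userspace daemon, as the notifier above does */
	return NOTIFY_OK;
}

static struct notifier_block nb_prd  = { .notifier_call = prd_msg_handler };
static struct notifier_block nb_prd2 = { .notifier_call = prd_msg_handler };

/*
 * registration, one call per message type:
 *   opal_message_notifier_register(OPAL_MSG_PRD,  &nb_prd);
 *   opal_message_notifier_register(OPAL_MSG_PRD2, &nb_prd2);
 */
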
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ @@ -15,14 +11,15 @@ #include <linux/bcd.h> #include <linux/rtc.h> #include <linux/delay.h> -#include <linux/platform_device.h> +#include <linux/of.h> #include <linux/of_platform.h> +#include <linux/platform_device.h> #include <asm/opal.h> #include <asm/firmware.h> #include <asm/machdep.h> -static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) +static void __init opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm) { tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) + bcd2bin((y_m_d >> 16) & 0xff)) - 1900; diff --git a/arch/powerpc/platforms/powernv/opal-secvar.c b/arch/powerpc/platforms/powernv/opal-secvar.c new file mode 100644 index 000000000000..6ac410f4d3c7 --- /dev/null +++ b/arch/powerpc/platforms/powernv/opal-secvar.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PowerNV code for secure variables + * + * Copyright (C) 2019 IBM Corporation + * Author: Claudio Carvalho + * Nayna Jain + * + * APIs to access secure variables managed by OPAL. + */ + +#define pr_fmt(fmt) "secvar: "fmt + +#include <linux/types.h> +#include <linux/of.h> +#include <linux/platform_device.h> +#include <asm/opal.h> +#include <asm/secvar.h> +#include <asm/secure_boot.h> + +static int opal_status_to_err(int rc) +{ + int err; + + switch (rc) { + case OPAL_SUCCESS: + err = 0; + break; + case OPAL_UNSUPPORTED: + err = -ENXIO; + break; + case OPAL_PARAMETER: + err = -EINVAL; + break; + case OPAL_RESOURCE: + err = -ENOSPC; + break; + case OPAL_HARDWARE: + err = -EIO; + break; + case OPAL_NO_MEM: + err = -ENOMEM; + break; + case OPAL_EMPTY: + err = -ENOENT; + break; + case OPAL_PARTIAL: + err = -EFBIG; + break; + default: + err = -EINVAL; + } + + return err; +} + +static int opal_get_variable(const char *key, u64 ksize, u8 *data, u64 *dsize) +{ + int rc; + + if (!key || !dsize) + return -EINVAL; + + *dsize = cpu_to_be64(*dsize); + + rc = opal_secvar_get(key, ksize, data, dsize); + + *dsize = be64_to_cpu(*dsize); + + return opal_status_to_err(rc); +} + +static int opal_get_next_variable(const char *key, u64 *keylen, u64 keybufsize) +{ + int rc; + + if (!key || !keylen) + return -EINVAL; + + *keylen = cpu_to_be64(*keylen); + + rc = opal_secvar_get_next(key, keylen, keybufsize); + + *keylen = be64_to_cpu(*keylen); + + return opal_status_to_err(rc); +} + +static int opal_set_variable(const char *key, u64 ksize, u8 *data, u64 dsize) +{ + int rc; + + if (!key || !data) + return -EINVAL; + + rc = opal_secvar_enqueue_update(key, ksize, data, dsize); + + return opal_status_to_err(rc); +} + +static ssize_t opal_secvar_format(char *buf, size_t bufsize) +{ + ssize_t rc = 0; + struct device_node *node; + const char *format; + + node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); + if (!of_device_is_available(node)) { + rc = -ENODEV; + goto out; + } + + rc = of_property_read_string(node, "format", &format); + if (rc) + goto out; + + rc = snprintf(buf, bufsize, "%s", format); + +out: + of_node_put(node); + + return rc; +} + +static int opal_secvar_max_size(u64 *max_size) +{ + int rc; + struct device_node *node; + + node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend"); + if (!node) + return -ENODEV; + + if (!of_device_is_available(node)) { + rc = -ENODEV; + goto out; + } + + rc = of_property_read_u64(node, "max-var-size", 
max_size); + +out: + of_node_put(node); + return rc; +} + +static const struct secvar_operations opal_secvar_ops = { + .get = opal_get_variable, + .get_next = opal_get_next_variable, + .set = opal_set_variable, + .format = opal_secvar_format, + .max_size = opal_secvar_max_size, +}; + +static int opal_secvar_probe(struct platform_device *pdev) +{ + if (!opal_check_token(OPAL_SECVAR_GET) + || !opal_check_token(OPAL_SECVAR_GET_NEXT) + || !opal_check_token(OPAL_SECVAR_ENQUEUE_UPDATE)) { + pr_err("OPAL doesn't support secure variables\n"); + return -ENODEV; + } + + return set_secvar_ops(&opal_secvar_ops); +} + +static const struct of_device_id opal_secvar_match[] = { + { .compatible = "ibm,secvar-backend",}, + {}, +}; + +static struct platform_driver opal_secvar_driver = { + .driver = { + .name = "secvar", + .of_match_table = opal_secvar_match, + }, +}; + +static int __init opal_secvar_init(void) +{ + return platform_driver_probe(&opal_secvar_driver, opal_secvar_probe); +} +device_initcall(opal_secvar_init); diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c index 179609220e6f..9944376b115c 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor-groups.c +++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV OPAL Sensor-groups interface * * Copyright 2017 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "opal-sensor-groups: " fmt @@ -17,7 +13,7 @@ #include <asm/opal.h> -DEFINE_MUTEX(sg_mutex); +static DEFINE_MUTEX(sg_mutex); static struct kobject *sg_kobj; @@ -130,7 +126,7 @@ static void add_attr(int handle, struct sg_attr *attr, int index) attr->attr.store = ops_info[index].store; } -static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg, +static int __init add_attr_group(const __be32 *ops, int len, struct sensor_group *sg, u32 handle) { int i, j; @@ -148,7 +144,7 @@ static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg, return sysfs_create_group(sg_kobj, &sg->sg); } -static int get_nr_attrs(const __be32 *ops, int len) +static int __init get_nr_attrs(const __be32 *ops, int len) { int i, j; int nr_attrs = 0; @@ -174,7 +170,7 @@ void __init opal_sensor_groups_init(void) sgs = kcalloc(of_get_child_count(sg), sizeof(*sgs), GFP_KERNEL); if (!sgs) - return; + goto out_sg_put; sg_kobj = kobject_create_and_add("sensor_groups", opal_kobj); if (!sg_kobj) { @@ -226,6 +222,7 @@ void __init opal_sensor_groups_init(void) } i++; } + of_node_put(sg); return; @@ -235,6 +232,9 @@ out_sgs_sgattrs: kfree(sgs[i].sg.attrs); } kobject_put(sg_kobj); + of_node_put(node); out_sgs: kfree(sgs); +out_sg_put: + of_node_put(sg); } diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index 35a5f4b9aeb5..8880a1c14573 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -1,25 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV sensor code * * Copyright (C) 2013 IBM - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the 
License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/delay.h> +#include <linux/of.h> #include <linux/of_platform.h> +#include <linux/platform_device.h> #include <asm/opal.h> #include <asm/machdep.h> diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c index 916a4b7b1bb5..a12312afe4ef 100644 --- a/arch/powerpc/platforms/powernv/opal-sysparam.c +++ b/arch/powerpc/platforms/powernv/opal-sysparam.c @@ -1,21 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV system parameter code * * Copyright (C) 2013 IBM - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/kobject.h> diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c index f16a43540e30..91b36541b9e5 100644 --- a/arch/powerpc/platforms/powernv/opal-tracepoints.c +++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c @@ -2,7 +2,6 @@ #include <linux/percpu.h> #include <linux/jump_label.h> #include <asm/trace.h> -#include <asm/asm-prototypes.h> #ifdef CONFIG_JUMP_LABEL struct static_key opal_tracepoint_key = STATIC_KEY_INIT; diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index f4875fe3f8ff..0ed95f753416 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -1,12 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * PowerNV OPAL API wrappers * * Copyright 2011 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/jump_label.h> @@ -17,317 +13,51 @@ #include <asm/asm-compat.h> #include <asm/feature-fixups.h> - .section ".text" - -#ifdef CONFIG_TRACEPOINTS -#ifdef CONFIG_JUMP_LABEL -#define OPAL_BRANCH(LABEL) \ - ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key) -#else - - .section ".toc","aw" - - .globl opal_tracepoint_refcount -opal_tracepoint_refcount: - .8byte 0 - - .section ".text" + .section ".text" /* - * We branch around this in early init by using an unconditional cpu - * feature. 
+ * r3-r10 - OPAL call arguments + * STK_PARAM(R11) - OPAL opcode + * STK_PARAM(R12) - MSR to restore */ -#define OPAL_BRANCH(LABEL) \ -BEGIN_FTR_SECTION; \ - b 1f; \ -END_FTR_SECTION(0, 1); \ - ld r11,opal_tracepoint_refcount@toc(r2); \ - cmpdi r11,0; \ - bne- LABEL; \ -1: - -#endif - -#else -#define OPAL_BRANCH(LABEL) -#endif - -/* - * DO_OPAL_CALL assumes: - * r0 = opal call token - * r12 = msr - * LR has been saved - */ -#define DO_OPAL_CALL() \ - mfcr r11; \ - stw r11,8(r1); \ - li r11,0; \ - ori r11,r11,MSR_EE; \ - std r12,PACASAVEDMSR(r13); \ - andc r12,r12,r11; \ - mtmsrd r12,1; \ - LOAD_REG_ADDR(r11,opal_return); \ - mtlr r11; \ - li r11,MSR_DR|MSR_IR|MSR_LE;\ - andc r12,r12,r11; \ - mtspr SPRN_HSRR1,r12; \ - LOAD_REG_ADDR(r11,opal); \ - ld r12,8(r11); \ - ld r2,0(r11); \ - mtspr SPRN_HSRR0,r12; \ +_GLOBAL_TOC(__opal_call) + mflr r0 + std r0,PPC_LR_STKOFF(r1) + ld r12,STK_PARAM(R12)(r1) + li r0,MSR_IR|MSR_DR|MSR_LE + andc r12,r12,r0 + LOAD_REG_ADDR(r11, opal_return) + mtlr r11 + LOAD_REG_ADDR(r11, opal) + ld r2,0(r11) + ld r11,8(r11) + mtspr SPRN_HSRR0,r11 + mtspr SPRN_HSRR1,r12 + /* set token to r0 */ + ld r0,STK_PARAM(R11)(r1) hrfid - -#define OPAL_CALL(name, token) \ - _GLOBAL_TOC(name); \ - mfmsr r12; \ - mflr r0; \ - andi. r11,r12,MSR_IR|MSR_DR; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - beq opal_real_call; \ - OPAL_BRANCH(opal_tracepoint_entry) \ - DO_OPAL_CALL() - - opal_return: /* - * Fixup endian on OPAL return... we should be able to simplify - * this by instead converting the below trampoline to a set of - * bytes (always BE) since MSR:LE will end up fixed up as a side - * effect of the rfid. + * Restore MSR on OPAL return. The MSR is set to big-endian. */ - FIXUP_ENDIAN_HV - ld r2,PACATOC(r13); - lwz r4,8(r1); - ld r5,PPC_LR_STKOFF(r1); - ld r6,PACASAVEDMSR(r13); - mtcr r4; - mtspr SPRN_HSRR0,r5; - mtspr SPRN_HSRR1,r6; - hrfid - -opal_real_call: - mfcr r11 - stw r11,8(r1) - /* Set opal return address */ - LOAD_REG_ADDR(r11, opal_return_realmode) - mtlr r11 - li r11,MSR_LE - andc r12,r12,r11 - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid - -opal_return_realmode: - FIXUP_ENDIAN_HV - ld r2,PACATOC(r13); - lwz r11,8(r1); - ld r12,PPC_LR_STKOFF(r1) - mtcr r11; - mtlr r12 - blr - -#ifdef CONFIG_TRACEPOINTS -opal_tracepoint_entry: - stdu r1,-STACKFRAMESIZE(r1) - std r0,STK_REG(R23)(r1) - std r3,STK_REG(R24)(r1) - std r4,STK_REG(R25)(r1) - std r5,STK_REG(R26)(r1) - std r6,STK_REG(R27)(r1) - std r7,STK_REG(R28)(r1) - std r8,STK_REG(R29)(r1) - std r9,STK_REG(R30)(r1) - std r10,STK_REG(R31)(r1) - mr r3,r0 - addi r4,r1,STK_REG(R24) - bl __trace_opal_entry - ld r0,STK_REG(R23)(r1) - ld r3,STK_REG(R24)(r1) - ld r4,STK_REG(R25)(r1) - ld r5,STK_REG(R26)(r1) - ld r6,STK_REG(R27)(r1) - ld r7,STK_REG(R28)(r1) - ld r8,STK_REG(R29)(r1) - ld r9,STK_REG(R30)(r1) - ld r10,STK_REG(R31)(r1) - - /* setup LR so we return via tracepoint_return */ - LOAD_REG_ADDR(r11,opal_tracepoint_return) - std r11,16(r1) - - mfmsr r12 - DO_OPAL_CALL() - -opal_tracepoint_return: - std r3,STK_REG(R31)(r1) - mr r4,r3 - ld r3,STK_REG(R23)(r1) - bl __trace_opal_exit - ld r3,STK_REG(R31)(r1) - addi r1,r1,STACKFRAMESIZE - ld r0,16(r1) +#ifdef __BIG_ENDIAN__ + ld r11,STK_PARAM(R12)(r1) + mtmsrd r11 +#else + /* Endian can only be switched with rfi, must byte reverse MSR load */ + .short 0x4039 /* li r10,STK_PARAM(R12) */ + .byte (STK_PARAM(R12) >> 8) & 0xff + .byte STK_PARAM(R12) & 0xff + + .long 0x280c6a7d /* ldbrx r11,r10,r1 */ + .long 0x05009f42 
/* bcl 20,31,$+4 */ + .long 0xa602487d /* mflr r10 */ + .long 0x14004a39 /* addi r10,r10,20 */ + .long 0xa64b5a7d /* mthsrr0 r10 */ + .long 0xa64b7b7d /* mthsrr1 r11 */ + .long 0x2402004c /* hrfid */ +#endif + LOAD_PACA_TOC() + ld r0,PPC_LR_STKOFF(r1) mtlr r0 blr -#endif - - -OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); -OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); -OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ); -OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE); -OPAL_CALL(opal_rtc_read, OPAL_RTC_READ); -OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE); -OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN); -OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT); -OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2); -OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM); -OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM); -OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT); -OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS); -OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY); -OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY); -OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE); -OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD); -OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD); -OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); -OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); -OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); -OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); -OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); -OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); -OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); -OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR); -OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET); -OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT); -OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC); -OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE); -OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW); -OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW); -OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY); -OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE); -OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV); -OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE); -OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE); -OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE); -OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE); -OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE); -OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE); -OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32); -OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64); -OPAL_CALL(opal_start_cpu, OPAL_START_CPU); -OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS); -OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL); -OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW); -OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL); -OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET); -OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA); -OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA); -OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB); -OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT); -OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR); -OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS); -OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS); 
-OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS); -OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED); -OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR); -OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL); -OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI); -OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2); -OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ); -OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE); -OPAL_CALL(opal_lpc_read, OPAL_LPC_READ); -OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE); -OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU); -OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS); -OPAL_CALL(opal_read_elog, OPAL_ELOG_READ); -OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK); -OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE); -OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND); -OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE); -OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); -OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); -OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); -OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); -OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); -OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); -OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); -OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2); -OPAL_CALL(opal_dump_read, OPAL_DUMP_READ); -OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK); -OPAL_CALL(opal_get_msg, OPAL_GET_MSG); -OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC); -OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION); -OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND); -OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT); -OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); -OPAL_CALL(opal_get_param, OPAL_GET_PARAM); -OPAL_CALL(opal_set_param, OPAL_SET_PARAM); -OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); -OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); -OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); -OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); -OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); -OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE); -OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO); -OPAL_CALL(opal_tpo_read, OPAL_READ_TPO); -OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND); -OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV); -OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST); -OPAL_CALL(opal_flash_read, OPAL_FLASH_READ); -OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE); -OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE); -OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG); -OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR); -OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR); -OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH); -OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE); -OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); -OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); -OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); -OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); -OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); -OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); -OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); -OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); -OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); -OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); -OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); -OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); -OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); -OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); 
-OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); -OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); -OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); -OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); -OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); -OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); -OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); -OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); -OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); -OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); -OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET); -OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); -OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); -OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); -OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); -OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); -OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); -OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P); -OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP); -OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP); -OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO); -OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO); -OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR); -OPAL_CALL(opal_quiesce, OPAL_QUIESCE); -OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP); -OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE); -OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET); -OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR); -OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR); -OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64); -OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE); -OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index 22d5e1110dbb..748c2b97fa53 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -1,12 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* - * PowerNV LPC bus handling. + * PowerNV SCOM bus debugfs interface * + * Copyright 2010 Benjamin Herrenschmidt, IBM Corp + * <benh@kernel.crashing.org> + * and David Gibson, IBM Corporation. * Copyright 2013 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ #include <linux/kernel.h> @@ -14,62 +13,13 @@ #include <linux/bug.h> #include <linux/gfp.h> #include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/debugfs.h> #include <asm/machdep.h> #include <asm/firmware.h> #include <asm/opal.h> -#include <asm/scom.h> - -/* - * We could probably fit that inside the scom_map_t - * which is a void* after all but it's really too ugly - * so let's kmalloc it for now - */ -struct opal_scom_map { - uint32_t chip; - uint64_t addr; -}; - -static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count) -{ - struct opal_scom_map *m; - const __be32 *gcid; - - if (!of_get_property(dev, "scom-controller", NULL)) { - pr_err("%s: device %pOF is not a SCOM controller\n", - __func__, dev); - return SCOM_MAP_INVALID; - } - gcid = of_get_property(dev, "ibm,chip-id", NULL); - if (!gcid) { - pr_err("%s: device %pOF has no ibm,chip-id\n", - __func__, dev); - return SCOM_MAP_INVALID; - } - m = kmalloc(sizeof(*m), GFP_KERNEL); - if (!m) - return NULL; - m->chip = be32_to_cpup(gcid); - m->addr = reg; - - return (scom_map_t)m; -} - -static void opal_scom_unmap(scom_map_t map) -{ - kfree(map); -} - -static int opal_xscom_err_xlate(int64_t rc) -{ - switch(rc) { - case 0: - return 0; - /* Add more translations if necessary */ - default: - return -EIO; - } -} +#include <asm/prom.h> static u64 opal_scom_unmangle(u64 addr) { @@ -102,39 +52,159 @@ static u64 opal_scom_unmangle(u64 addr) return addr; } -static int opal_scom_read(scom_map_t map, u64 reg, u64 *value) +static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value) { - struct opal_scom_map *m = map; int64_t rc; __be64 v; - reg = opal_scom_unmangle(m->addr + reg); - rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v)); + reg = opal_scom_unmangle(addr + reg); + rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v)); + if (rc) { + *value = 0xfffffffffffffffful; + return -EIO; + } *value = be64_to_cpu(v); - return opal_xscom_err_xlate(rc); + return 0; } -static int opal_scom_write(scom_map_t map, u64 reg, u64 value) +static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value) { - struct opal_scom_map *m = map; int64_t rc; - reg = opal_scom_unmangle(m->addr + reg); - rc = opal_xscom_write(m->chip, reg, value); - return opal_xscom_err_xlate(rc); + reg = opal_scom_unmangle(addr + reg); + rc = opal_xscom_write(chip, reg, value); + if (rc) + return -EIO; + return 0; } -static const struct scom_controller opal_scom_controller = { - .map = opal_scom_map, - .unmap = opal_scom_unmap, - .read = opal_scom_read, - .write = opal_scom_write +struct scom_debug_entry { + u32 chip; + struct debugfs_blob_wrapper path; + char name[16]; }; -static int opal_xscom_init(void) +static ssize_t scom_debug_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) { - if (firmware_has_feature(FW_FEATURE_OPAL)) - scom_init(&opal_scom_controller); + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user *)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_base, reg_cnt, val; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg_base = off >> 3; + reg_cnt = count >> 3; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = opal_scom_read(ent->chip, reg_base, reg, &val); + if (!rc) + rc = put_user(val, ubuf64); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + *ppos += 8; + done += 8; + } + return done; +} + +static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t 
*ppos) +{ + struct scom_debug_entry *ent = filp->private_data; + u64 __user *ubuf64 = (u64 __user *)ubuf; + loff_t off = *ppos; + ssize_t done = 0; + u64 reg, reg_base, reg_cnt, val; + int rc; + + if (off < 0 || (off & 7) || (count & 7)) + return -EINVAL; + reg_base = off >> 3; + reg_cnt = count >> 3; + + for (reg = 0; reg < reg_cnt; reg++) { + rc = get_user(val, ubuf64); + if (!rc) + rc = opal_scom_write(ent->chip, reg_base, reg, val); + if (rc) { + if (!done) + done = rc; + break; + } + ubuf64++; + done += 8; + } + return done; +} + +static const struct file_operations scom_debug_fops = { + .read = scom_debug_read, + .write = scom_debug_write, + .open = simple_open, + .llseek = default_llseek, +}; + +static int scom_debug_init_one(struct dentry *root, struct device_node *dn, + int chip) +{ + struct scom_debug_entry *ent; + struct dentry *dir; + + ent = kzalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + ent->chip = chip; + snprintf(ent->name, 16, "%08x", chip); + ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn); + if (!ent->path.data) { + kfree(ent); + return -ENOMEM; + } + + ent->path.size = strlen((char *)ent->path.data); + + dir = debugfs_create_dir(ent->name, root); + if (IS_ERR(dir)) { + kfree(ent->path.data); + kfree(ent); + return -1; + } + + debugfs_create_blob("devspec", 0400, dir, &ent->path); + debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops); + return 0; } -machine_arch_initcall(powernv, opal_xscom_init); + +static int scom_debug_init(void) +{ + struct device_node *dn; + struct dentry *root; + int chip, rc; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return 0; + + root = debugfs_create_dir("scom", arch_debugfs_dir); + if (IS_ERR(root)) + return -1; + + rc = 0; + for_each_node_with_property(dn, "scom-controller") { + chip = of_get_ibm_chip_id(dn); + WARN_ON(chip == -1); + rc |= scom_debug_init_one(root, dn, chip); + } + + return rc; +} +device_initcall(scom_debug_init); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 79586f127521..09bd93464b4f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV OPAL high level interfaces * * Copyright 2011 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
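Stepping back to the SCOM interface added above (opal-xscom.c): every device-tree node with a "scom-controller" property gets a directory named after its chip id under the powerpc debugfs tree, holding a read-only "devspec" blob (the node's device-tree path) and an "access" file whose file offset is the SCOM register address scaled by 8. A small user-space sketch, assuming debugfs is mounted at /sys/kernel/debug and chip id 0; the register address is only an example:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* path and register address are illustrative assumptions */
	const char *path = "/sys/kernel/debug/powerpc/scom/00000000/access";
	uint64_t scom_addr = 0xf000f;
	uint64_t val;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return 1;

	/* transfers must be 8-byte sized and 8-byte aligned, otherwise the
	 * driver returns -EINVAL; the offset selects the SCOM register */
	if (pread(fd, &val, sizeof(val), scom_addr * 8) == sizeof(val))
		printf("scom 0x%llx = 0x%016llx\n",
		       (unsigned long long)scom_addr,
		       (unsigned long long)val);

	close(fd);
	return 0;
}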
*/ #define pr_fmt(fmt) "opal: " fmt @@ -26,7 +22,6 @@ #include <linux/memblock.h> #include <linux/kthread.h> #include <linux/freezer.h> -#include <linux/printk.h> #include <linux/kmsg_dump.h> #include <linux/console.h> #include <linux/sched/debug.h> @@ -40,6 +35,16 @@ #include "powernv.h" +#define OPAL_MSG_QUEUE_MAX 16 + +struct opal_msg_node { + struct list_head list; + struct opal_msg msg; +}; + +static DEFINE_SPINLOCK(msg_list_lock); +static LIST_HEAD(msg_list); + /* /sys/firmware/opal */ struct kobject *opal_kobj; @@ -55,6 +60,8 @@ struct mcheck_recoverable_range { u64 recover_addr; }; +static int msg_list_size; + static struct mcheck_recoverable_range *mc_recoverable_range; static int mc_recoverable_range_len; @@ -63,8 +70,10 @@ static DEFINE_SPINLOCK(opal_write_lock); static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX]; static uint32_t opal_heartbeat; static struct task_struct *kopald_tsk; +static struct opal_msg *opal_msg; +static u32 opal_msg_size __ro_after_init; -void opal_configure_cores(void) +void __init opal_configure_cores(void) { u64 reinit_flags = 0; @@ -171,8 +180,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node, /* * Allocate a buffer to hold the MC recoverable ranges. */ - mc_recoverable_range =__va(memblock_phys_alloc(size, __alignof__(u64))); - memset(mc_recoverable_range, 0, size); + mc_recoverable_range = memblock_alloc_or_panic(size, __alignof__(u64)); for (i = 0; i < mc_recoverable_range_len; i++) { mc_recoverable_range[i].start_addr = @@ -205,16 +213,18 @@ static int __init opal_register_exception_handlers(void) glue = 0x7000; /* - * Check if we are running on newer firmware that exports - * OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to patch - * the HMI interrupt and we catch it directly in Linux. + * Only ancient OPAL firmware requires this. + * Specifically, firmware from FW810.00 (released June 2014) + * through FW810.20 (Released October 2014). * - * For older firmware (i.e currently released POWER8 System Firmware - * as of today <= SV810_087), we fallback to old behavior and let OPAL - * patch the HMI vector and handle it inside OPAL firmware. + * Check if we are running on newer (post Oct 2014) firmware that + * exports the OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to + * patch the HMI interrupt and we catch it directly in Linux. * - * For newer firmware (in development/yet to be released) we will - * start catching/handling HMI directly in Linux. + * For older firmware (i.e < FW810.20), we fallback to old behavior and + * let OPAL patch the HMI vector and handle it inside OPAL firmware. + * + * For newer firmware we catch/handle the HMI directly in Linux. */ if (!opal_check_token(OPAL_HANDLE_HMI)) { pr_info("Old firmware detected, OPAL handles HMIs.\n"); @@ -224,6 +234,11 @@ static int __init opal_register_exception_handlers(void) glue += 128; } + /* + * Only applicable to ancient firmware, all modern + * (post March 2015/skiboot 5.0) firmware will just return + * OPAL_UNSUPPORTED. 
+ */ opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue); #endif @@ -231,6 +246,43 @@ static int __init opal_register_exception_handlers(void) } machine_early_initcall(powernv, opal_register_exception_handlers); +static void queue_replay_msg(void *msg) +{ + struct opal_msg_node *msg_node; + + if (msg_list_size < OPAL_MSG_QUEUE_MAX) { + msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC); + if (msg_node) { + INIT_LIST_HEAD(&msg_node->list); + memcpy(&msg_node->msg, msg, sizeof(struct opal_msg)); + list_add_tail(&msg_node->list, &msg_list); + msg_list_size++; + } else + pr_warn_once("message queue no memory\n"); + + if (msg_list_size >= OPAL_MSG_QUEUE_MAX) + pr_warn_once("message queue full\n"); + } +} + +static void dequeue_replay_msg(enum opal_msg_type msg_type) +{ + struct opal_msg_node *msg_node, *tmp; + + list_for_each_entry_safe(msg_node, tmp, &msg_list, list) { + if (be32_to_cpu(msg_node->msg.msg_type) != msg_type) + continue; + + atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type], + msg_type, + &msg_node->msg); + + list_del(&msg_node->list); + kfree(msg_node); + msg_list_size--; + } +} + /* * Opal message notifier based on message type. Allow subscribers to get * notified for specific messgae type. @@ -238,14 +290,30 @@ machine_early_initcall(powernv, opal_register_exception_handlers); int opal_message_notifier_register(enum opal_msg_type msg_type, struct notifier_block *nb) { + int ret; + unsigned long flags; + if (!nb || msg_type >= OPAL_MSG_TYPE_MAX) { pr_warn("%s: Invalid arguments, msg_type:%d\n", __func__, msg_type); return -EINVAL; } - return atomic_notifier_chain_register( - &opal_msg_notifier_head[msg_type], nb); + spin_lock_irqsave(&msg_list_lock, flags); + ret = atomic_notifier_chain_register( + &opal_msg_notifier_head[msg_type], nb); + + /* + * If the registration succeeded, replay any queued messages that came + * in prior to the notifier chain registration. msg_list_lock held here + * to ensure they're delivered prior to any subsequent messages. + */ + if (ret == 0) + dequeue_replay_msg(msg_type); + + spin_unlock_irqrestore(&msg_list_lock, flags); + + return ret; } EXPORT_SYMBOL_GPL(opal_message_notifier_register); @@ -259,6 +327,23 @@ EXPORT_SYMBOL_GPL(opal_message_notifier_unregister); static void opal_message_do_notify(uint32_t msg_type, void *msg) { + unsigned long flags; + bool queued = false; + + spin_lock_irqsave(&msg_list_lock, flags); + if (opal_msg_notifier_head[msg_type].head == NULL) { + /* + * Queue up the msg since no notifiers have registered + * yet for this msg_type. + */ + queue_replay_msg(msg); + queued = true; + } + spin_unlock_irqrestore(&msg_list_lock, flags); + + if (queued) + return; + /* notify subscribers */ atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type], msg_type, msg); @@ -267,14 +352,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg) static void opal_handle_message(void) { s64 ret; - /* - * TODO: pre-allocate a message buffer depending on opal-msg-size - * value in /proc/device-tree. - */ - static struct opal_msg msg; u32 type; - ret = opal_get_msg(__pa(&msg), sizeof(msg)); + ret = opal_get_msg(__pa(opal_msg), opal_msg_size); /* No opal message pending. 
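For context on the replay queue above: a subscriber that registers after a message of its type has already arrived still sees it, because up to OPAL_MSG_QUEUE_MAX messages are parked on msg_list and delivered from opal_message_notifier_register() under msg_list_lock. A minimal consumer sketch; the handler body and its use of params[0] are illustrative only:

#include <linux/notifier.h>
#include <linux/printk.h>
#include <asm/machdep.h>
#include <asm/opal.h>

static int example_opal_msg_notify(struct notifier_block *nb,
				   unsigned long msg_type, void *data)
{
	struct opal_msg *msg = data;

	/* OPAL message payload words are big-endian */
	pr_info("OPAL msg type %lu, param0 0x%llx\n", msg_type,
		(unsigned long long)be64_to_cpu(msg->params[0]));

	return NOTIFY_OK;
}

static struct notifier_block example_opal_msg_nb = {
	.notifier_call = example_opal_msg_notify,
};

static int __init example_opal_msg_init(void)
{
	/* any queued OPAL_MSG_ASYNC_COMP messages are replayed right here */
	return opal_message_notifier_register(OPAL_MSG_ASYNC_COMP,
					      &example_opal_msg_nb);
}
machine_device_initcall(powernv, example_opal_msg_init);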
*/ if (ret == OPAL_RESOURCE) return; @@ -286,14 +366,14 @@ static void opal_handle_message(void) return; } - type = be32_to_cpu(msg.msg_type); + type = be32_to_cpu(opal_msg->msg_type); /* Sanity check */ if (type >= OPAL_MSG_TYPE_MAX) { pr_warn_once("%s: Unknown message type: %u\n", __func__, type); return; } - opal_message_do_notify(type, (void *)&msg); + opal_message_do_notify(type, (void *)opal_msg); } static irqreturn_t opal_message_notify(int irq, void *data) @@ -302,10 +382,24 @@ static irqreturn_t opal_message_notify(int irq, void *data) return IRQ_HANDLED; } -static int __init opal_message_init(void) +static int __init opal_message_init(struct device_node *opal_node) { int ret, i, irq; + ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size); + if (ret) { + pr_notice("Failed to read opal-msg-size property\n"); + opal_msg_size = sizeof(struct opal_msg); + } + + opal_msg = kmalloc(opal_msg_size, GFP_KERNEL); + if (!opal_msg) { + opal_msg_size = sizeof(struct opal_msg); + /* Try to allocate fixed message size */ + opal_msg = kmalloc(opal_msg_size, GFP_KERNEL); + BUG_ON(opal_msg == NULL); + } + for (i = 0; i < OPAL_MSG_TYPE_MAX; i++) ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]); @@ -327,7 +421,7 @@ static int __init opal_message_init(void) return 0; } -int opal_get_chars(uint32_t vtermno, char *buf, int count) +ssize_t opal_get_chars(uint32_t vtermno, u8 *buf, size_t count) { s64 rc; __be64 evt, len; @@ -344,10 +438,11 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count) return 0; } -static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic) +static ssize_t __opal_put_chars(uint32_t vtermno, const u8 *data, + size_t total_len, bool atomic) { unsigned long flags = 0 /* shut up gcc */; - int written; + ssize_t written; __be64 olen; s64 rc; @@ -387,7 +482,7 @@ static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, b if (atomic) { /* Should not happen */ pr_warn("atomic console write returned partial " - "len=%d written=%d\n", total_len, written); + "len=%zu written=%zd\n", total_len, written); } if (!written) written = -EAGAIN; @@ -400,7 +495,7 @@ out: return written; } -int opal_put_chars(uint32_t vtermno, const char *data, int total_len) +ssize_t opal_put_chars(uint32_t vtermno, const u8 *data, size_t total_len) { return __opal_put_chars(vtermno, data, total_len, false); } @@ -411,7 +506,8 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len) * true at the moment because console space can race with OPAL's console * writes. */ -int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len) +ssize_t opal_put_chars_atomic(uint32_t vtermno, const u8 *data, + size_t total_len) { return __opal_put_chars(vtermno, data, total_len, true); } @@ -491,7 +587,7 @@ static int opal_recover_mce(struct pt_regs *regs, { int recovered = 0; - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; @@ -504,7 +600,7 @@ static int opal_recover_mce(struct pt_regs *regs, recovered = 0; } - if (!recovered && evt->severity == MCE_SEV_ERROR_SYNC) { + if (!recovered && evt->sync_error) { /* * Try to kill processes if we get a synchronous machine check * (e.g., one caused by execution of this instruction). 
This @@ -527,7 +623,7 @@ static int opal_recover_mce(struct pt_regs *regs, */ recovered = 0; } else { - die("Machine check", regs, SIGBUS); + die_mce("Machine check", regs, SIGBUS); recovered = 1; } } @@ -587,7 +683,7 @@ int opal_machine_check(struct pt_regs *regs) evt.version); return 0; } - machine_check_print_event_info(&evt, user_mode(regs)); + machine_check_print_event_info(&evt, user_mode(regs), false); if (opal_recover_mce(regs, &evt)) return 1; @@ -613,7 +709,28 @@ int opal_hmi_exception_early(struct pt_regs *regs) return 0; } -/* HMI exception handler called in virtual mode during check_irq_replay. */ +int opal_hmi_exception_early2(struct pt_regs *regs) +{ + s64 rc; + __be64 out_flags; + + /* + * call opal hmi handler. + * Check 64-bit flag mask to find out if an event was generated, + * and whether TB is still valid or not etc. + */ + rc = opal_handle_hmi2(&out_flags); + if (rc != OPAL_SUCCESS) + return 0; + + if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_NEW_EVENT) + local_paca->hmi_event_available = 1; + if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_TOD_TB_FAIL) + tb_invalid = true; + return 1; +} + +/* HMI exception handler called in virtual mode when irqs are next enabled. */ int opal_handle_hmi_exception(struct pt_regs *regs) { /* @@ -655,13 +772,13 @@ bool opal_mce_check_early_recovery(struct pt_regs *regs) * Setup regs->nip to rfi into fixup address. */ if (recover_addr) - regs->nip = recover_addr; + regs_set_return_ip(regs, recover_addr); out: return !!recover_addr; } -static int opal_sysfs_init(void) +static int __init opal_sysfs_init(void) { opal_kobj = kobject_create_and_add("opal", firmware_kobj); if (!opal_kobj) { @@ -672,45 +789,77 @@ static int opal_sysfs_init(void) return 0; } -static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, - char *buf, loff_t off, size_t count) +static int opal_add_one_export(struct kobject *parent, const char *export_name, + struct device_node *np, const char *prop_name) { - return memory_read_from_buffer(buf, count, &off, bin_attr->private, - bin_attr->size); -} + struct bin_attribute *attr = NULL; + const char *name = NULL; + u64 vals[2]; + int rc; -static BIN_ATTR_RO(symbol_map, 0); + rc = of_property_read_u64_array(np, prop_name, &vals[0], 2); + if (rc) + goto out; -static void opal_export_symmap(void) -{ - const __be64 *syms; - unsigned int size; - struct device_node *fw; - int rc; + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) { + rc = -ENOMEM; + goto out; + } + name = kstrdup(export_name, GFP_KERNEL); + if (!name) { + rc = -ENOMEM; + goto out; + } - fw = of_find_node_by_path("/ibm,opal/firmware"); - if (!fw) - return; - syms = of_get_property(fw, "symbol-map", &size); - if (!syms || size != 2 * sizeof(__be64)) - return; + sysfs_bin_attr_init(attr); + attr->attr.name = name; + attr->attr.mode = 0400; + attr->read = sysfs_bin_attr_simple_read; + attr->private = __va(vals[0]); + attr->size = vals[1]; - /* Setup attributes */ - bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0])); - bin_attr_symbol_map.size = be64_to_cpu(syms[1]); + rc = sysfs_create_bin_file(parent, attr); +out: + if (rc) { + kfree(name); + kfree(attr); + } - rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map); - if (rc) - pr_warn("Error %d creating OPAL symbols file\n", rc); + return rc; } -static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t off, size_t count) +static void opal_add_exported_attrs(struct device_node *np, + 
struct kobject *kobj) { - return memory_read_from_buffer(buf, count, &off, bin_attr->private, - bin_attr->size); + struct device_node *child; + struct property *prop; + + for_each_property_of_node(np, prop) { + int rc; + + if (!strcmp(prop->name, "name") || + !strcmp(prop->name, "phandle")) + continue; + + rc = opal_add_one_export(kobj, prop->name, np, prop->name); + if (rc) { + pr_warn("Unable to add export %pOF/%s, rc = %d!\n", + np, prop->name, rc); + } + } + + for_each_child_of_node(np, child) { + struct kobject *child_kobj; + + child_kobj = kobject_create_and_add(child->name, kobj); + if (!child_kobj) { + pr_err("Unable to create export dir for %pOF\n", child); + continue; + } + + opal_add_exported_attrs(child, child_kobj); + } } /* @@ -722,11 +871,8 @@ static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, */ static void opal_export_attrs(void) { - struct bin_attribute *attr; struct device_node *np; - struct property *prop; struct kobject *kobj; - u64 vals[2]; int rc; np = of_find_node_by_path("/ibm,opal/firmware/exports"); @@ -737,44 +883,20 @@ static void opal_export_attrs(void) kobj = kobject_create_and_add("exports", opal_kobj); if (!kobj) { pr_warn("kobject_create_and_add() of exports failed\n"); + of_node_put(np); return; } - for_each_property_of_node(np, prop) { - if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle")) - continue; - - if (of_property_read_u64_array(np, prop->name, &vals[0], 2)) - continue; - - attr = kzalloc(sizeof(*attr), GFP_KERNEL); + opal_add_exported_attrs(np, kobj); - if (attr == NULL) { - pr_warn("Failed kmalloc for bin_attribute!"); - continue; - } - - sysfs_bin_attr_init(attr); - attr->attr.name = kstrdup(prop->name, GFP_KERNEL); - attr->attr.mode = 0400; - attr->read = export_attr_read; - attr->private = __va(vals[0]); - attr->size = vals[1]; - - if (attr->attr.name == NULL) { - pr_warn("Failed kstrdup for bin_attribute attr.name"); - kfree(attr); - continue; - } - - rc = sysfs_create_bin_file(kobj, attr); - if (rc) { - pr_warn("Error %d creating OPAL sysfs exports/%s file\n", - rc, prop->name); - kfree(attr->attr.name); - kfree(attr); - } - } + /* + * NB: symbol_map existed before the generic export interface so it + * lives under the top level opal_kobj. 
+ */ + rc = opal_add_one_export(opal_kobj, "symbol_map", + np->parent, "symbol-map"); + if (rc) + pr_warn("Error %d creating OPAL symbols file\n", rc); of_node_put(np); } @@ -807,7 +929,7 @@ static void __init opal_dump_region_init(void) "rc = %d\n", rc); } -static void opal_pdev_init(const char *compatible) +static void __init opal_pdev_init(const char *compatible) { struct device_node *np; @@ -822,6 +944,8 @@ static void __init opal_imc_init_dev(void) np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT); if (np) of_platform_device_create(np, NULL, NULL); + + of_node_put(np); } static int kopald(void *unused) @@ -851,7 +975,7 @@ void opal_wake_poller(void) wake_up_process(kopald_tsk); } -static void opal_init_heartbeat(void) +static void __init opal_init_heartbeat(void) { /* Old firwmware, we assume the HVC heartbeat is sufficient */ if (of_property_read_u32(opal_node, "ibm,heartbeat-ms", @@ -885,7 +1009,7 @@ static int __init opal_init(void) } /* Initialise OPAL messaging system */ - opal_message_init(); + opal_message_init(opal_node); /* Initialise OPAL asynchronous completion interface */ opal_async_comp_init(); @@ -921,8 +1045,6 @@ static int __init opal_init(void) /* Create "opal" kobject under /sys/firmware */ rc = opal_sysfs_init(); if (rc == 0) { - /* Export symbol map to userspace */ - opal_export_symmap(); /* Setup dump region interface */ opal_dump_region_init(); /* Setup error log interface */ @@ -935,11 +1057,10 @@ static int __init opal_init(void) opal_sys_param_init(); /* Setup message log sysfs interface. */ opal_msglog_sysfs_init(); + /* Add all export properties*/ + opal_export_attrs(); } - /* Export all properties */ - opal_export_attrs(); - /* Initialize platform devices: IPMI backend, PRD & flash interface */ opal_pdev_init("ibm,opal-ipmi"); opal_pdev_init("ibm,opal-flash"); @@ -963,6 +1084,9 @@ static int __init opal_init(void) /* Initialise OPAL Power control interface */ opal_power_control_init(); + /* Initialize OPAL secure variables */ + opal_pdev_init("ibm,secvar-backend"); + return 0; } machine_subsys_initcall(powernv, opal_init); diff --git a/arch/powerpc/platforms/powernv/pci-cxl.c b/arch/powerpc/platforms/powernv/pci-cxl.c deleted file mode 100644 index 1b18111453d7..000000000000 --- a/arch/powerpc/platforms/powernv/pci-cxl.c +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright 2014-2016 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/module.h> -#include <asm/pnv-pci.h> -#include <asm/opal.h> - -#include "pci.h" - -int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - struct pnv_ioda_pe *pe; - int rc; - - pe = pnv_ioda_get_pe(dev); - if (!pe) - return -ENODEV; - - pe_info(pe, "Switching PHB to CXL\n"); - - rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number); - if (rc == OPAL_UNSUPPORTED) - dev_err(&dev->dev, "Required cxl mode not supported by firmware - update skiboot\n"); - else if (rc) - dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc); - - return rc; -} -EXPORT_SYMBOL(pnv_phb_to_cxl_mode); - -/* Find PHB for cxl dev and allocate MSI hwirqs? 
- * Returns the absolute hardware IRQ number - */ -int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num); - - if (hwirq < 0) { - dev_warn(&dev->dev, "Failed to find a free MSI\n"); - return -ENOSPC; - } - - return phb->msi_base + hwirq; -} -EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs); - -void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num); -} -EXPORT_SYMBOL(pnv_cxl_release_hwirqs); - -void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs, - struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - int i, hwirq; - - for (i = 1; i < CXL_IRQ_RANGES; i++) { - if (!irqs->range[i]) - continue; - pr_devel("cxl release irq range 0x%x: offset: 0x%lx limit: %ld\n", - i, irqs->offset[i], - irqs->range[i]); - hwirq = irqs->offset[i] - phb->msi_base; - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, - irqs->range[i]); - } -} -EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges); - -int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs, - struct pci_dev *dev, int num) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - int i, hwirq, try; - - memset(irqs, 0, sizeof(struct cxl_irq_ranges)); - - /* 0 is reserved for the multiplexed PSL DSI interrupt */ - for (i = 1; i < CXL_IRQ_RANGES && num; i++) { - try = num; - while (try) { - hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try); - if (hwirq >= 0) - break; - try /= 2; - } - if (!try) - goto fail; - - irqs->offset[i] = phb->msi_base + hwirq; - irqs->range[i] = try; - pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx limit: %li\n", - i, irqs->offset[i], irqs->range[i]); - num -= try; - } - if (num) - goto fail; - - return 0; -fail: - pnv_cxl_release_hwirq_ranges(irqs, dev); - return -ENOSPC; -} -EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges); - -int pnv_cxl_get_irq_count(struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - - return phb->msi_bmp.irq_count; -} -EXPORT_SYMBOL(pnv_cxl_get_irq_count); - -int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq, - unsigned int virq) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; - unsigned int xive_num = hwirq - phb->msi_base; - struct pnv_ioda_pe *pe; - int rc; - - if (!(pe = pnv_ioda_get_pe(dev))) - return -ENODEV; - - /* Assign XIVE to PE */ - rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num); - if (rc) { - pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x " - "hwirq 0x%x XIVE 0x%x PE\n", - pci_name(dev), rc, phb->msi_base, hwirq, xive_num); - return -EIO; - } - pnv_set_msi_irq_chip(phb, virq); - - return 0; -} -EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup); - -#if IS_MODULE(CONFIG_CXL) -static inline int get_cxl_module(void) -{ - struct module *cxl_module; - - mutex_lock(&module_mutex); - - cxl_module = find_module("cxl"); - if (cxl_module) - __module_get(cxl_module); - - mutex_unlock(&module_mutex); - - if (!cxl_module) - return -ENODEV; - - return 0; -} -#else -static inline int get_cxl_module(void) { return 0; } -#endif diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c 
b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 697449afb3f7..e96324502db0 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -17,6 +17,34 @@ #include <asm/tce.h> #include "pci.h" +unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) +{ + struct pci_controller *hose = phb->hose; + struct device_node *dn = hose->dn; + unsigned long mask = 0; + int i, rc, count; + u32 val; + + count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes"); + if (count <= 0) { + mask = SZ_4K | SZ_64K; + /* Add 16M for POWER8 by default */ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + !cpu_has_feature(CPU_FTR_ARCH_300)) + mask |= SZ_16M | SZ_256M; + return mask; + } + + for (i = 0; i < count; i++) { + rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes", + i, &val); + if (rc == 0) + mask |= 1ULL << val; + } + + return mask; +} + void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned int page_shift) @@ -36,7 +64,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) struct page *tce_mem = NULL; __be64 *addr; - tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT); + tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN, + shift - PAGE_SHIFT); if (!tce_mem) { pr_err("Failed to allocate a TCE memory, level shift=%d\n", shift); @@ -48,6 +77,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) return addr; } +static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, + unsigned long size, unsigned int levels); + static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) { __be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base; @@ -57,9 +89,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) while (level) { int n = (idx & mask) >> (level * shift); - unsigned long tce; + unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n])); - if (tmp[n] == 0) { + if (!tce) { __be64 *tmp2; if (!alloc) @@ -70,10 +102,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) if (!tmp2) return NULL; - tmp[n] = cpu_to_be64(__pa(tmp2) | - TCE_PCI_READ | TCE_PCI_WRITE); + tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE; + oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0, + cpu_to_be64(tce))); + if (oldtce) { + pnv_pci_ioda2_table_do_free_pages(tmp2, + ilog2(tbl->it_level_size) + 3, 1); + tce = oldtce; + } } - tce = be64_to_cpu(tmp[n]); tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); idx &= ~mask; @@ -108,8 +145,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, #ifdef CONFIG_IOMMU_API int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction, - bool alloc) + unsigned long *hpa, enum dma_data_direction *direction) { u64 proto_tce = iommu_direction_to_tce_perm(*direction); unsigned long newtce = *hpa | proto_tce, oldtce; @@ -127,9 +163,9 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index, } if (!ptce) { - ptce = pnv_tce(tbl, false, idx, alloc); + ptce = pnv_tce(tbl, false, idx, true); if (!ptce) - return alloc ? 
H_HARDWARE : H_TOO_HARD; + return -ENOMEM; } if (newtce & TCE_PCI_WRITE) @@ -161,6 +197,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) if (ptce) *ptce = cpu_to_be64(0); + else + /* Skip the rest of the level */ + i |= tbl->it_level_size - 1; } } @@ -260,7 +299,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, unsigned int table_shift = max_t(unsigned int, entries_shift + 3, PAGE_SHIFT); const unsigned long tce_table_size = 1UL << table_shift; - unsigned int tmplevels = levels; if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; @@ -268,9 +306,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!is_power_of_2(window_size)) return -EINVAL; - if (alloc_userspace_copy && (window_size > (1ULL << 32))) - tmplevels = 1; - /* Adjust direct table size from window_size and levels */ entries_shift = (entries_shift + levels - 1) / levels; level_shift = entries_shift + 3; @@ -281,7 +316,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, /* Allocate TCE table */ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - tmplevels, tce_table_size, &offset, &total_allocated); + 1, tce_table_size, &offset, &total_allocated); /* addr==NULL means that the first level allocation failed */ if (!addr) @@ -292,18 +327,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, * we did not allocate as much as we wanted, * release partially allocated table. */ - if (tmplevels == levels && offset < tce_table_size) + if (levels == 1 && offset < tce_table_size) goto free_tces_exit; /* Allocate userspace view of the TCE table */ if (alloc_userspace_copy) { offset = 0; uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - tmplevels, tce_table_size, &offset, + 1, tce_table_size, &offset, &total_allocated_uas); if (!uas) goto free_tces_exit; - if (tmplevels == levels && (offset < tce_table_size || + if (levels == 1 && (offset < tce_table_size || total_allocated_uas != total_allocated)) goto free_uas_exit; } @@ -313,13 +348,12 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, page_shift); tbl->it_level_size = 1ULL << (level_shift - 3); tbl->it_indirect_levels = levels - 1; - tbl->it_allocated_size = total_allocated; tbl->it_userspace = uas; tbl->it_nid = nid; pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n", window_size, tce_table_size, bus_offset, tbl->it_base, - tbl->it_userspace, tmplevels, levels); + tbl->it_userspace, 1, levels); return 0; @@ -333,14 +367,6 @@ free_tces_exit: return -ENOMEM; } -static void pnv_iommu_table_group_link_free(struct rcu_head *head) -{ - struct iommu_table_group_link *tgl = container_of(head, - struct iommu_table_group_link, rcu); - - kfree(tgl); -} - void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, struct iommu_table_group *table_group) { @@ -353,14 +379,18 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, /* Remove link to a group from table's list of attached groups */ found = false; + + rcu_read_lock(); list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { if (tgl->table_group == table_group) { list_del_rcu(&tgl->next); - call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free); + kfree_rcu(tgl, rcu); found = true; break; } } + rcu_read_unlock(); + if (WARN_ON(!found)) return; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7db3119f8a5b..b0c1d9d16fb5 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ 
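The pci-ioda-tce.c changes above make the lower levels of a multi-level TCE table populate lazily: only the top level is allocated up front, and pnv_tce() fills in missing levels from atomic context, publishing each new level with cmpxchg() so a mapper racing on another CPU reuses the winner's allocation. A simplified sketch of that publish-or-free pattern; the TCE permission bits and the real free path are omitted:

/* Simplified sketch of the lazy level population used by pnv_tce() above;
 * pnv_alloc_tce_level() is the GFP_ATOMIC allocator from this file. */
static __be64 *example_get_or_add_level(__be64 *slot, int nid,
					unsigned int shift)
{
	unsigned long entry = be64_to_cpu(READ_ONCE(*slot));
	__be64 *new_level;

	if (entry)
		return __va(entry);		/* already populated */

	new_level = pnv_alloc_tce_level(nid, shift);
	if (!new_level)
		return NULL;

	/* publish the new level: 0 -> physical address of the level */
	entry = be64_to_cpu(cmpxchg(slot, 0, cpu_to_be64(__pa(new_level))));
	if (entry) {
		/* lost the race: drop ours, use the winner's level */
		free_pages((unsigned long)new_level, shift - PAGE_SHIFT);
		return __va(entry);
	}

	return new_level;
}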
b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Support PCI/PCIe on PowerNV platforms * * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #undef DEBUG @@ -19,15 +15,18 @@ #include <linux/init.h> #include <linux/memblock.h> #include <linux/irq.h> +#include <linux/irqchip/irq-msi-lib.h> #include <linux/io.h> #include <linux/msi.h> #include <linux/iommu.h> #include <linux/rculist.h> #include <linux/sizes.h> +#include <linux/debugfs.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> #include <asm/sections.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> #include <asm/msi_bitmap.h> @@ -36,23 +35,19 @@ #include <asm/iommu.h> #include <asm/tce.h> #include <asm/xics.h> -#include <asm/debugfs.h> #include <asm/firmware.h> #include <asm/pnv-pci.h> #include <asm/mmzone.h> -#include <misc/cxl-base.h> - #include "powernv.h" #include "pci.h" #include "../../../../drivers/pci/pci.h" -#define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ -#define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ -#define PNV_IODA1_DMA32_SEGSIZE 0x10000000 +/* This array is indexed with enum pnv_phb_type */ +static const char * const pnv_phb_names[] = { "IODA2", "NPU_OCAPI" }; -static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK", - "NPU_OCAPI" }; +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +static void pnv_pci_configure_bus(struct pci_bus *bus); void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) @@ -67,7 +62,7 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, vaf.va = &args; if (pe->flags & PNV_IODA_PE_DEV) - strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); + strscpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) sprintf(pfix, "%04x:%02x ", pci_domain_nr(pe->pbus), pe->pbus->number); @@ -116,32 +111,13 @@ static int __init pci_reset_phbs_setup(char *str) early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup); -static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r) -{ - /* - * WARNING: We cannot rely on the resource flags. The Linux PCI - * allocation code sometimes decides to put a 64-bit prefetchable - * BAR in the 32-bit window, so we have to compare the addresses. - * - * For simplicity we only test resource start. 
- */ - return (r->start >= phb->ioda.m64_base && - r->start < (phb->ioda.m64_base + phb->ioda.m64_size)); -} - -static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags) -{ - unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); - - return (resource_flags & flags) == flags; -} - static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) { s64 rc; phb->ioda.pe_array[pe_no].phb = phb; phb->ioda.pe_array[pe_no].pe_number = pe_no; + phb->ioda.pe_array[pe_no].dma_setup_done = false; /* * Clear the PE frozen state as it might be put into frozen state @@ -165,35 +141,58 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) return; } + mutex_lock(&phb->ioda.pe_alloc_mutex); if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) pr_debug("%s: PE %x was reserved on PHB#%x\n", __func__, pe_no, phb->hose->global_number); + mutex_unlock(&phb->ioda.pe_alloc_mutex); pnv_ioda_init_pe(phb, pe_no); } -static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count) { - long pe; + struct pnv_ioda_pe *ret = NULL; + int run = 0, pe, i; + mutex_lock(&phb->ioda.pe_alloc_mutex); + + /* scan backwards for a run of @count cleared bits */ for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) { - if (!test_and_set_bit(pe, phb->ioda.pe_alloc)) - return pnv_ioda_init_pe(phb, pe); + if (test_bit(pe, phb->ioda.pe_alloc)) { + run = 0; + continue; + } + + run++; + if (run == count) + break; + } + if (run != count) + goto out; + + for (i = pe; i < pe + count; i++) { + set_bit(i, phb->ioda.pe_alloc); + pnv_ioda_init_pe(phb, i); } + ret = &phb->ioda.pe_array[pe]; - return NULL; +out: + mutex_unlock(&phb->ioda.pe_alloc_mutex); + return ret; } -static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) +void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) { struct pnv_phb *phb = pe->phb; unsigned int pe_num = pe->pe_number; WARN_ON(pe->pdev); - WARN_ON(pe->npucomp); /* NPUs are not supposed to be freed */ - kfree(pe->npucomp); memset(pe, 0, sizeof(struct pnv_ioda_pe)); + + mutex_lock(&phb->ioda.pe_alloc_mutex); clear_bit(pe_num, phb->ioda.pe_alloc); + mutex_unlock(&phb->ioda.pe_alloc_mutex); } /* The default M64 BAR is shared by all PEs */ @@ -253,8 +252,7 @@ fail: static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, unsigned long *pe_bitmap) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct resource *r; resource_size_t base, sgsz, start, end; int segno, i; @@ -266,8 +264,8 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, if (!r->parent || !pnv_pci_is_m64(phb, r)) continue; - start = _ALIGN_DOWN(r->start - base, sgsz); - end = _ALIGN_UP(r->end - base, sgsz); + start = ALIGN_DOWN(r->start - base, sgsz); + end = ALIGN(r->end - base, sgsz); for (segno = start / sgsz; segno < end / sgsz; segno++) { if (pe_bitmap) set_bit(segno, pe_bitmap); @@ -277,64 +275,6 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, } } -static int pnv_ioda1_init_m64(struct pnv_phb *phb) -{ - struct resource *r; - int index; - - /* - * There are 16 M64 BARs, each of which has 8 segments. So - * there are as many M64 segments as the maximum number of - * PEs, which is 128. 
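Throughout this file the old pci_bus_to_host()/hose->private_data pairs are collapsed into a pci_bus_to_pnvhb() helper. The helper itself is not part of this hunk; a sketch consistent with how it is used here, assuming it lives with the other inlines in the platform's pci.h:

/* Sketch of the assumed helper behind the conversions in this file. */
static inline struct pnv_phb *pci_bus_to_pnvhb(struct pci_bus *bus)
{
	struct pci_controller *hose = pci_bus_to_host(bus);

	return hose ? hose->private_data : NULL;
}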
- */ - for (index = 0; index < PNV_IODA1_M64_NUM; index++) { - unsigned long base, segsz = phb->ioda.m64_segsize; - int64_t rc; - - base = phb->ioda.m64_base + - index * PNV_IODA1_M64_SEGS * segsz; - rc = opal_pci_set_phb_mem_window(phb->opal_id, - OPAL_M64_WINDOW_TYPE, index, base, 0, - PNV_IODA1_M64_SEGS * segsz); - if (rc != OPAL_SUCCESS) { - pr_warn(" Error %lld setting M64 PHB#%x-BAR#%d\n", - rc, phb->hose->global_number, index); - goto fail; - } - - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, index, - OPAL_ENABLE_M64_SPLIT); - if (rc != OPAL_SUCCESS) { - pr_warn(" Error %lld enabling M64 PHB#%x-BAR#%d\n", - rc, phb->hose->global_number, index); - goto fail; - } - } - - /* - * Exclude the segments for reserved and root bus PE, which - * are first or last two PEs. - */ - r = &phb->hose->mem_resources[1]; - if (phb->ioda.reserved_pe_idx == 0) - r->start += (2 * phb->ioda.m64_segsize); - else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) - r->end -= (2 * phb->ioda.m64_segsize); - else - WARN(1, "Wrong reserved PE#%x on PHB#%x\n", - phb->ioda.reserved_pe_idx, phb->hose->global_number); - - return 0; - -fail: - for ( ; index >= 0; index--) - opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64); - - return -EIO; -} - static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, unsigned long *pe_bitmap, bool all) @@ -352,8 +292,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *master_pe, *pe; unsigned long size, *pe_alloc; int i; @@ -363,7 +302,7 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) return NULL; /* Allocate bitmap */ - size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); + size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); pe_alloc = kzalloc(size, GFP_KERNEL); if (!pe_alloc) { pr_warn("%s: Out of memory !\n", @@ -404,26 +343,6 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) pe->master = master_pe; list_add_tail(&pe->list, &master_pe->slaves); } - - /* - * P7IOC supports M64DT, which helps mapping M64 segment - * to one particular PE#. However, PHB3 has fixed mapping - * between M64 segment and PE#. In order to have same logic - * for P7IOC and PHB3, we enforce fixed mapping between M64 - * segment and PE# on P7IOC. - */ - if (phb->type == PNV_PHB_IODA1) { - int64_t rc; - - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_M64_WINDOW_TYPE, - pe->pe_number / PNV_IODA1_M64_SEGS, - pe->pe_number % PNV_IODA1_M64_SEGS); - if (rc != OPAL_SUCCESS) - pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n", - __func__, rc, phb->hose->global_number, - pe->pe_number); - } } kfree(pe_alloc); @@ -439,7 +358,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) const __be32 *r; u64 pci_addr; - if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) { + if (phb->type != PNV_PHB_IODA2) { pr_info(" Not support M64 window\n"); return; } @@ -514,10 +433,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) * Setup init functions for M64 based on IODA version, IODA3 uses * the IODA2 code. 
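For the M64 handling above: with the shared IODA2 M64 BAR the window is carved into total_pe_num equal segments (m64_segsize) and the segment index doubles as the PE number, which is why pnv_ioda_reserve_dev_m64_pe() can reserve PEs straight from the ALIGN_DOWN/ALIGN arithmetic. A worked sketch with illustrative numbers (64 GiB window, 512 PEs, so 128 MiB segments; not taken from the patch):

/* A prefetchable 64-bit BAR at offset 0x18000000 from m64_base,
 * 0x10000000 bytes long, crosses one 128 MiB segment boundary and so
 * reserves PE#3 and PE#4. */
static void example_reserve_m64_segments(struct pnv_phb *phb)
{
	resource_size_t sgsz = phb->ioda.m64_segsize;	/* 0x08000000 here */
	resource_size_t start = ALIGN_DOWN(0x18000000ULL, sgsz);
	resource_size_t end = ALIGN(0x18000000ULL + 0x10000000ULL, sgsz);
	int segno;

	for (segno = start / sgsz; segno < end / sgsz; segno++)
		pnv_ioda_reserve_pe(phb, segno);	/* segment# == PE# */
}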
*/ - if (phb->type == PNV_PHB_IODA1) - phb->init_m64 = pnv_ioda1_init_m64; - else - phb->init_m64 = pnv_ioda2_init_m64; + phb->init_m64 = pnv_ioda2_init_m64; } static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) @@ -662,10 +578,19 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) return state; } +struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn) +{ + int pe_number = phb->ioda.pe_rmap[bdfn]; + + if (pe_number == IODA_INVALID_PE) + return NULL; + + return &phb->ioda.pe_array[pe_number]; +} + struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); if (!pdn) @@ -779,7 +704,35 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, return 0; } -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +static void pnv_ioda_unset_peltv(struct pnv_phb *phb, + struct pnv_ioda_pe *pe, + struct pci_dev *parent) +{ + int64_t rc; + + while (parent) { + struct pci_dn *pdn = pci_get_pdn(parent); + + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, + pe->pe_number, + OPAL_REMOVE_PE_FROM_DOMAIN); + /* XXX What to do in case of error ? */ + } + parent = parent->bus->self; + } + + opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + /* Disassociate PE in PELT */ + rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); + if (rc) + pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc); +} + +int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; uint8_t bcomp, dcomp, fcomp; @@ -794,7 +747,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; parent = pe->pbus->self; if (pe->flags & PNV_IODA_PE_BUS_ALL) - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + count = resource_size(&pe->pbus->busn_res); else count = 1; @@ -829,29 +782,17 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) for (rid = pe->rid; rid < rid_end; rid++) phb->ioda.pe_rmap[rid] = IODA_INVALID_PE; - /* Release from all parents PELT-V */ - while (parent) { - struct pci_dn *pdn = pci_get_pdn(parent); - if (pdn && pdn->pe_number != IODA_INVALID_PE) { - rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); - /* XXX What to do in case of error ? */ - } - parent = parent->bus->self; - } - - opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + /* + * Release from all parents PELT-V. 
NPUs don't have a PELTV + * table + */ + if (phb->type != PNV_PHB_NPU_OCAPI) + pnv_ioda_unset_peltv(phb, pe, parent); - /* Disassociate PE in PELT */ - rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); - if (rc) - pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, bcomp, dcomp, fcomp, OPAL_UNMAP_PE); if (rc) - pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc); pe->pbus = NULL; pe->pdev = NULL; @@ -862,9 +803,8 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) return 0; } -static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - struct pci_dev *parent; uint8_t bcomp, dcomp, fcomp; long rc, rid_end, rid; @@ -874,9 +814,8 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; - parent = pe->pbus->self; if (pe->flags & PNV_IODA_PE_BUS_ALL) - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + count = resource_size(&pe->pbus->busn_res); else count = 1; @@ -895,12 +834,6 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) } rid_end = pe->rid + (count << 8); } else { -#ifdef CONFIG_PCI_IOV - if (pe->flags & PNV_IODA_PE_VF) - parent = pe->parent_dev; - else -#endif /* CONFIG_PCI_IOV */ - parent = pe->pdev->bus->self; bcomp = OpalPciBusAll; dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; @@ -924,128 +857,21 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) * Configure PELTV. NPUs don't have a PELTV table so skip * configuration on them. */ - if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI) + if (phb->type != PNV_PHB_NPU_OCAPI) pnv_ioda_set_peltv(phb, pe, true); /* Setup reverse map */ for (rid = pe->rid; rid < rid_end; rid++) phb->ioda.pe_rmap[rid] = pe->pe_number; - /* Setup one MVTs on IODA1 */ - if (phb->type != PNV_PHB_IODA1) { - pe->mve_number = 0; - goto out; - } - - pe->mve_number = pe->pe_number; - rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number); - if (rc != OPAL_SUCCESS) { - pe_err(pe, "OPAL error %ld setting up MVE %x\n", - rc, pe->mve_number); - pe->mve_number = -1; - } else { - rc = opal_pci_set_mve_enable(phb->opal_id, - pe->mve_number, OPAL_ENABLE_MVE); - if (rc) { - pe_err(pe, "OPAL error %ld enabling MVE %x\n", - rc, pe->mve_number); - pe->mve_number = -1; - } - } - -out: - return 0; -} - -#ifdef CONFIG_PCI_IOV -static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) -{ - struct pci_dn *pdn = pci_get_pdn(dev); - int i; - struct resource *res, res2; - resource_size_t size; - u16 num_vfs; - - if (!dev->is_physfn) - return -EINVAL; - - /* - * "offset" is in VFs. The M64 windows are sized so that when they - * are segmented, each segment is the same size as the IOV BAR. - * Each segment is in a separate PE, and the high order bits of the - * address are the PE number. Therefore, each VF's BAR is in a - * separate PE, and changing the IOV BAR start address changes the - * range of PEs the VFs are in. 
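Back in pnv_ioda_configure_pe() above, the reverse map covers every RID behind the PE: the bus range length becomes count, and rid_end = rid + (count << 8) because each bus contributes 256 devfn values. A worked sketch with illustrative values, not taken from the patch:

/* A PNV_IODA_PE_BUS_ALL PE whose secondary bus range is 0x04..0x07. */
static void example_fill_pe_rmap(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
	unsigned int count = 4;				/* buses 0x04..0x07 */
	unsigned int rid = 0x04 << 8;			/* 0x0400 */
	unsigned int rid_end = rid + (count << 8);	/* 0x0800 */

	/* 4 buses x 256 devfns = 1024 RIDs resolve to this PE number */
	for (; rid < rid_end; rid++)
		phb->ioda.pe_rmap[rid] = pe->pe_number;
}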
- */ - num_vfs = pdn->num_vfs; - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &dev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - /* - * The actual IOV BAR range is determined by the start address - * and the actual size for num_vfs VFs BAR. This check is to - * make sure that after shifting, the range will not overlap - * with another device. - */ - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); - res2.flags = res->flags; - res2.start = res->start + (size * offset); - res2.end = res2.start + (size * num_vfs) - 1; - - if (res2.end > res->end) { - dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", - i, &res2, res, num_vfs, offset); - return -EBUSY; - } - } - - /* - * Since M64 BAR shares segments among all possible 256 PEs, - * we have to shift the beginning of PF IOV BAR to make it start from - * the segment which belongs to the PE number assigned to the first VF. - * This creates a "hole" in the /proc/iomem which could be used for - * allocating other resources so we reserve this area below and - * release when IOV is released. - */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &dev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); - res2 = *res; - res->start += size * offset; - - dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", - i, &res2, res, (offset > 0) ? "En" : "Dis", - num_vfs, offset); - - if (offset < 0) { - devm_release_resource(&dev->dev, &pdn->holes[i]); - memset(&pdn->holes[i], 0, sizeof(pdn->holes[i])); - } - - pci_update_resource(dev, i + PCI_IOV_RESOURCES); + pe->mve_number = 0; - if (offset > 0) { - pdn->holes[i].start = res2.start; - pdn->holes[i].end = res2.start + size * offset - 1; - pdn->holes[i].flags = IORESOURCE_BUS; - pdn->holes[i].name = "pnv_iov_reserved"; - devm_request_resource(&dev->dev, res->parent, - &pdn->holes[i]); - } - } return 0; } -#endif /* CONFIG_PCI_IOV */ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; @@ -1057,27 +883,26 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pdn->pe_number != IODA_INVALID_PE) return NULL; - pe = pnv_ioda_alloc_pe(phb); + pe = pnv_ioda_alloc_pe(phb, 1); if (!pe) { pr_warn("%s: Not enough PE# available, disabling device\n", pci_name(dev)); return NULL; } - /* NOTE: We get only one ref to the pci_dev for the pdn, not for the - * pointer in the PE data structure, both should be destroyed at the - * same time. However, this needs to be looked at more closely again - * once we actually start removing things (Hotplug, SR-IOV, ...) + /* NOTE: We don't get a reference for the pointer in the PE + * data structure, both the device and PE structures should be + * destroyed at the same time. 
* * At some point we want to remove the PDN completely anyways */ - pci_dev_get(dev); pdn->pe_number = pe->pe_number; pe->flags = PNV_IODA_PE_DEV; pe->pdev = dev; pe->pbus = NULL; pe->mve_number = -1; pe->rid = dev->bus->number << 8 | pdn->devfn; + pe->device_count++; pe_info(pe, "Associated device to PE\n"); @@ -1086,44 +911,16 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) pnv_ioda_free_pe(pe); pdn->pe_number = IODA_INVALID_PE; pe->pdev = NULL; - pci_dev_put(dev); return NULL; } /* Put PE to the list */ + mutex_lock(&phb->ioda.pe_list_mutex); list_add_tail(&pe->list, &phb->ioda.pe_list); - + mutex_unlock(&phb->ioda.pe_list_mutex); return pe; } -static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - struct pci_dn *pdn = pci_get_pdn(dev); - - if (pdn == NULL) { - pr_warn("%s: No device node associated with device !\n", - pci_name(dev)); - continue; - } - - /* - * In partial hotplug case, the PCI device might be still - * associated with the PE and needn't attach it to the PE - * again. - */ - if (pdn->pe_number != IODA_INVALID_PE) - continue; - - pe->device_count++; - pdn->pe_number = pe->pe_number; - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_same_PE(dev->subordinate, pe); - } -} - /* * There're 2 types of PCI bus sensitive PEs: One that is compromised of * single PCI bus. Another one that contains the primary PCI bus and its @@ -1132,8 +929,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) */ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *pe = NULL; unsigned int pe_num; @@ -1142,15 +938,13 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) * We should reuse it instead of allocating a new one. */ pe_num = phb->ioda.pe_rmap[bus->number << 8]; - if (pe_num != IODA_INVALID_PE) { + if (WARN_ON(pe_num != IODA_INVALID_PE)) { pe = &phb->ioda.pe_array[pe_num]; - pnv_ioda_setup_same_PE(bus, pe); return NULL; } /* PE number for root bus should have been reserved */ - if (pci_is_root_bus(bus) && - phb->ioda.root_pe_idx != IODA_INVALID_PE) + if (pci_is_root_bus(bus)) pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx]; /* Check if PE is determined by M64 */ @@ -1159,7 +953,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) /* The PE number isn't pinned by M64 */ if (!pe) - pe = pnv_ioda_alloc_pe(phb); + pe = pnv_ioda_alloc_pe(phb, 1); if (!pe) { pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n", @@ -1174,11 +968,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) pe->rid = bus->busn_res.start << 8; if (all) - pe_info(pe, "Secondary bus %d..%d associated with PE#%x\n", - bus->busn_res.start, bus->busn_res.end, pe->pe_number); + pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n", + &bus->busn_res.start, &bus->busn_res.end, + pe->pe_number); else - pe_info(pe, "Secondary bus %d associated with PE#%x\n", - bus->busn_res.start, pe->pe_number); + pe_info(pe, "Secondary bus %pad associated with PE#%x\n", + &bus->busn_res.start, pe->pe_number); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? 
*/ @@ -1187,598 +982,66 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) return NULL; } - /* Associate it with all child devices */ - pnv_ioda_setup_same_PE(bus, pe); - /* Put PE to the list */ list_add_tail(&pe->list, &phb->ioda.pe_list); return pe; } -static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) +static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) { - int pe_num, found_pe = false, rc; - long rid; - struct pnv_ioda_pe *pe; - struct pci_dev *gpu_pdev; - struct pci_dn *npu_pdn; - struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus); - struct pnv_phb *phb = hose->private_data; - - /* - * Due to a hardware errata PE#0 on the NPU is reserved for - * error handling. This means we only have three PEs remaining - * which need to be assigned to four links, implying some - * links must share PEs. - * - * To achieve this we assign PEs such that NPUs linking the - * same GPU get assigned the same PE. - */ - gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev); - for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { - pe = &phb->ioda.pe_array[pe_num]; - if (!pe->pdev) - continue; - - if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) { - /* - * This device has the same peer GPU so should - * be assigned the same PE as the existing - * peer NPU. - */ - dev_info(&npu_pdev->dev, - "Associating to existing PE %x\n", pe_num); - pci_dev_get(npu_pdev); - npu_pdn = pci_get_pdn(npu_pdev); - rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; - npu_pdn->pe_number = pe_num; - phb->ioda.pe_rmap[rid] = pe->pe_number; - - /* Map the PE to this link */ - rc = opal_pci_set_pe(phb->opal_id, pe_num, rid, - OpalPciBusAll, - OPAL_COMPARE_RID_DEVICE_NUMBER, - OPAL_COMPARE_RID_FUNCTION_NUMBER, - OPAL_MAP_PE); - WARN_ON(rc != OPAL_SUCCESS); - found_pe = true; - break; - } - } - - if (!found_pe) - /* - * Could not find an existing PE so allocate a new - * one. 
- */ - return pnv_ioda_setup_dev_PE(npu_pdev); - else - return pe; -} - -static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) -{ - struct pci_dev *pdev; - - list_for_each_entry(pdev, &bus->devices, bus_list) - pnv_ioda_setup_npu_PE(pdev); -} - -static void pnv_pci_ioda_setup_PEs(void) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - struct pci_bus *bus; - struct pci_dev *pdev; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; - list_for_each_entry(hose, &hose_list, list_node) { - phb = hose->private_data; - if (phb->type == PNV_PHB_NPU_NVLINK) { - /* PE#0 is needed for error reporting */ - pnv_ioda_reserve_pe(phb, 0); - pnv_ioda_setup_npu_PEs(hose->bus); - if (phb->model == PNV_PHB_MODEL_NPU2) - WARN_ON_ONCE(pnv_npu2_init(hose)); - } - if (phb->type == PNV_PHB_NPU_OCAPI) { - bus = hose->bus; - list_for_each_entry(pdev, &bus->devices, bus_list) - pnv_ioda_setup_dev_PE(pdev); - } - } - list_for_each_entry(hose, &hose_list, list_node) { - phb = hose->private_data; - if (phb->type != PNV_PHB_IODA2) - continue; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) - pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV); - } -} - -#ifdef CONFIG_PCI_IOV -static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pci_dn *pdn; - int i, j; - int m64_bars; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (pdn->m64_single_mode) - m64_bars = num_vfs; - else - m64_bars = 1; - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < m64_bars; j++) { - if (pdn->m64_map[j][i] == IODA_INVALID_M64) - continue; - opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0); - clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc); - pdn->m64_map[j][i] = IODA_INVALID_M64; - } - - kfree(pdn->m64_map); - return 0; -} - -static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pci_dn *pdn; - unsigned int win; - struct resource *res; - int i, j; - int64_t rc; - int total_vfs; - resource_size_t size, start; - int pe_num; - int m64_bars; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - total_vfs = pci_sriov_get_totalvfs(pdev); - - if (pdn->m64_single_mode) - m64_bars = num_vfs; - else - m64_bars = 1; - - pdn->m64_map = kmalloc_array(m64_bars, - sizeof(*pdn->m64_map), - GFP_KERNEL); - if (!pdn->m64_map) - return -ENOMEM; - /* Initialize the m64_map to IODA_INVALID_M64 */ - for (i = 0; i < m64_bars ; i++) - for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) - pdn->m64_map[i][j] = IODA_INVALID_M64; - - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - for (j = 0; j < m64_bars; j++) { - do { - win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, - phb->ioda.m64_bar_idx + 1, 0); - - if (win >= phb->ioda.m64_bar_idx + 1) - goto m64_failed; - } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); - - pdn->m64_map[j][i] = win; - - if (pdn->m64_single_mode) { - size = pci_iov_resource_size(pdev, - PCI_IOV_RESOURCES + i); - start = res->start + size * j; - } else { - size = resource_size(res); - start = res->start; - } - - /* Map the M64 here */ - if (pdn->m64_single_mode) { - pe_num = pdn->pe_num_map[j]; - rc = 
opal_pci_map_pe_mmio_window(phb->opal_id, - pe_num, OPAL_M64_WINDOW_TYPE, - pdn->m64_map[j][i], 0); - } - - rc = opal_pci_set_phb_mem_window(phb->opal_id, - OPAL_M64_WINDOW_TYPE, - pdn->m64_map[j][i], - start, - 0, /* unused */ - size); - - - if (rc != OPAL_SUCCESS) { - dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", - win, rc); - goto m64_failed; - } - - if (pdn->m64_single_mode) - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2); - else - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1); - - if (rc != OPAL_SUCCESS) { - dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", - win, rc); - goto m64_failed; - } - } - } - return 0; - -m64_failed: - pnv_pci_vf_release_m64(pdev, num_vfs); - return -EBUSY; -} - -static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, - int num); - -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) -{ - struct iommu_table *tbl; - int64_t rc; - - tbl = pe->table_group.tables[0]; - rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); - if (rc) - pe_warn(pe, "OPAL error %ld release DMA window\n", rc); - - pnv_pci_ioda2_set_bypass(pe, false); - if (pe->table_group.group) { - iommu_group_put(pe->table_group.group); - BUG_ON(pe->table_group.group); - } - iommu_tce_table_put(tbl); -} - -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe, *pe_n; - struct pci_dn *pdn; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (!pdev->is_physfn) - return; - - list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { - if (pe->parent_dev != pdev) - continue; - - pnv_pci_ioda2_release_dma_pe(pdev, pe); - - /* Remove from list */ - mutex_lock(&phb->ioda.pe_list_mutex); - list_del(&pe->list); - mutex_unlock(&phb->ioda.pe_list_mutex); - - pnv_ioda_deconfigure_pe(phb, pe); - - pnv_ioda_free_pe(pe); - } -} - -void pnv_pci_sriov_disable(struct pci_dev *pdev) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - u16 num_vfs, i; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - num_vfs = pdn->num_vfs; - - /* Release VF PEs */ - pnv_ioda_release_vf_PE(pdev); - - if (phb->type == PNV_PHB_IODA2) { - if (!pdn->m64_single_mode) - pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map); - - /* Release M64 windows */ - pnv_pci_vf_release_m64(pdev, num_vfs); - - /* Release PE numbers */ - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] == IODA_INVALID_PE) - continue; - - pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; - pnv_ioda_free_pe(pe); - } - } else - bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - /* Releasing pe_num_map */ - kfree(pdn->pe_num_map); - } -} - -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe); -#ifdef CONFIG_IOMMU_API -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, - struct iommu_table_group *table_group, struct pci_bus *bus); - -#endif -static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - int pe_num; - u16 vf_index; - struct pci_dn *pdn; - - bus = pdev->bus; - hose = 
pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (!pdev->is_physfn) - return; - - /* Reserve PE for each VF */ - for (vf_index = 0; vf_index < num_vfs; vf_index++) { - if (pdn->m64_single_mode) - pe_num = pdn->pe_num_map[vf_index]; - else - pe_num = *pdn->pe_num_map + vf_index; - - pe = &phb->ioda.pe_array[pe_num]; - pe->pe_number = pe_num; - pe->phb = phb; - pe->flags = PNV_IODA_PE_VF; - pe->pbus = NULL; - pe->parent_dev = pdev; - pe->mve_number = -1; - pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | - pci_iov_virtfn_devfn(pdev, vf_index); - - pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", - hose->global_number, pdev->bus->number, - PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), - PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num); - - if (pnv_ioda_configure_pe(phb, pe)) { - /* XXX What do we do here ? */ - pnv_ioda_free_pe(pe); - pe->pdev = NULL; - continue; - } - - /* Put PE to the list */ - mutex_lock(&phb->ioda.pe_list_mutex); - list_add_tail(&pe->list, &phb->ioda.pe_list); - mutex_unlock(&phb->ioda.pe_list_mutex); - - pnv_pci_ioda2_setup_dma_pe(phb, pe); -#ifdef CONFIG_IOMMU_API - pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL); -#endif - } -} - -int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - int ret; - u16 i; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (phb->type == PNV_PHB_IODA2) { - if (!pdn->vfs_expanded) { - dev_info(&pdev->dev, "don't support this SRIOV device" - " with non 64bit-prefetchable IOV BAR\n"); - return -ENOSPC; - } - - /* - * When M64 BARs functions in Single PE mode, the number of VFs - * could be enabled must be less than the number of M64 BARs. - */ - if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) { - dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n"); - return -EBUSY; - } - - /* Allocating pe_num_map */ - if (pdn->m64_single_mode) - pdn->pe_num_map = kmalloc_array(num_vfs, - sizeof(*pdn->pe_num_map), - GFP_KERNEL); - else - pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL); - - if (!pdn->pe_num_map) - return -ENOMEM; - - if (pdn->m64_single_mode) - for (i = 0; i < num_vfs; i++) - pdn->pe_num_map[i] = IODA_INVALID_PE; - - /* Calculate available PE for required VFs */ - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - pe = pnv_ioda_alloc_pe(phb); - if (!pe) { - ret = -EBUSY; - goto m64_failed; - } + /* Check if the BDFN for this device is associated with a PE yet */ + pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev)); + if (!pe) { + /* VF PEs should be pre-configured in pnv_pci_sriov_enable() */ + if (WARN_ON(pdev->is_virtfn)) + return; - pdn->pe_num_map[i] = pe->pe_number; - } - } else { - mutex_lock(&phb->ioda.pe_alloc_mutex); - *pdn->pe_num_map = bitmap_find_next_zero_area( - phb->ioda.pe_alloc, phb->ioda.total_pe_num, - 0, num_vfs, 0); - if (*pdn->pe_num_map >= phb->ioda.total_pe_num) { - mutex_unlock(&phb->ioda.pe_alloc_mutex); - dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); - kfree(pdn->pe_num_map); - return -EBUSY; - } - bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - mutex_unlock(&phb->ioda.pe_alloc_mutex); - } - pdn->num_vfs = num_vfs; + pnv_pci_configure_bus(pdev->bus); + pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev)); + pci_info(pdev, "Configured PE#%x\n", pe ? 
pe->pe_number : 0xfffff); - /* Assign M64 window accordingly */ - ret = pnv_pci_vf_assign_m64(pdev, num_vfs); - if (ret) { - dev_info(&pdev->dev, "Not enough M64 window resources\n"); - goto m64_failed; - } /* - * When using one M64 BAR to map one IOV BAR, we need to shift - * the IOV BAR according to the PE# allocated to the VFs. - * Otherwise, the PE# for the VF will conflict with others. + * If we can't setup the IODA PE something has gone horribly + * wrong and we can't enable DMA for the device. */ - if (!pdn->m64_single_mode) { - ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map); - if (ret) - goto m64_failed; - } + if (WARN_ON(!pe)) + return; + } else { + pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number); } - /* Setup VF PEs */ - pnv_ioda_setup_vf_PE(pdev, num_vfs); - - return 0; - -m64_failed: - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] == IODA_INVALID_PE) - continue; - - pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; - pnv_ioda_free_pe(pe); - } - } else - bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - - /* Releasing pe_num_map */ - kfree(pdn->pe_num_map); - - return ret; -} - -int pnv_pcibios_sriov_disable(struct pci_dev *pdev) -{ - pnv_pci_sriov_disable(pdev); - - /* Release PCI data */ - remove_dev_pci_data(pdev); - return 0; -} - -int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) -{ - /* Allocate PCI data */ - add_dev_pci_data(pdev); - - return pnv_pci_sriov_enable(pdev, num_vfs); -} -#endif /* CONFIG_PCI_IOV */ - -static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) -{ - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *pe; - /* - * The function can be called while the PE# - * hasn't been assigned. Do nothing for the - * case. + * We assume that bridges *probably* don't need to do any DMA so we can + * skip allocating a TCE table, etc unless we get a non-bridge device. */ - if (!pdn || pdn->pe_number == IODA_INVALID_PE) - return; + if (!pe->dma_setup_done && !pci_is_bridge(pdev)) { + switch (phb->type) { + case PNV_PHB_IODA2: + pnv_pci_ioda2_setup_dma_pe(phb, pe); + break; + default: + pr_warn("%s: No DMA for PHB#%x (type %d)\n", + __func__, phb->hose->global_number, phb->type); + } + } + + if (pdn) + pdn->pe_number = pe->pe_number; + pe->device_count++; - pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_dma_offset(&pdev->dev, pe->tce_bypass_base); + pdev->dev.archdata.dma_offset = pe->tce_bypass_base; set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); - /* - * Note: iommu_add_device() will fail here as - * for physical PE: the device is already added by now; - * for virtual PE: sysfs entries are not ready yet and - * tce_iommu_bus_notifier will add the device to a group later. 
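In the rewritten pnv_pci_ioda_dma_dev_setup() the per-device DMA offset is simply the PE's TCE bypass base, which, as the IODA2 DMA setup later in this diff notes, is 1ull << 59 because PCI address bit 59 selects TVE#1, the identity-mapped window. A tiny sketch of how a CPU physical address becomes a bus address in that mode; the helper name is invented and the sample address is arbitrary.

#include <stdint.h>
#include <stdio.h>

/* TVE#1 (the bypass window) is selected by PCI address bit 59. */
#define TCE_BYPASS_BASE	(1ull << 59)

/* In bypass mode the mapping is just "physical address + offset". */
static uint64_t toy_bypass_dma_addr(uint64_t phys)
{
	return phys + TCE_BYPASS_BASE;
}

int main(void)
{
	uint64_t phys = 0x2000123456ull;
	uint64_t bus  = toy_bypass_dma_addr(phys);

	printf("phys 0x%llx -> bus 0x%llx (bit 59 %s)\n",
	       (unsigned long long)phys, (unsigned long long)bus,
	       (bus >> 59) & 1 ? "set" : "clear");
	return 0;
}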
- */ -} - -static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) -{ - unsigned short vendor = 0; - struct pci_dev *pdev; - - if (pe->device_count == 1) - return true; - - /* pe->pdev should be set if it's a single device, pe->pbus if not */ - if (!pe->pbus) - return true; - - list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { - if (!vendor) { - vendor = pdev->vendor; - continue; - } - - if (pdev->vendor != vendor) - return false; - } - return true; + /* PEs with a DMA weight of zero won't have a group */ + if (pe->table_group.group) + iommu_add_device(&pe->table_group, &pdev->dev); } /* @@ -1850,235 +1113,79 @@ err: return -EIO; } -static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, + u64 dma_mask) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; - uint64_t top; - bool bypass = false; - s64 rc; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return -ENODEV; + return false; pe = &phb->ioda.pe_array[pdn->pe_number]; if (pe->tce_bypass_enabled) { - top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; - bypass = (dma_mask >= top); - } - - if (bypass) { - dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); - set_dma_ops(&pdev->dev, &dma_nommu_ops); - } else { - /* - * If the device can't set the TCE bypass bit but still wants - * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to - * bypass the 32-bit region and be usable for 64-bit DMAs. - * The device needs to be able to address all of this space. - */ - if (dma_mask >> 32 && - dma_mask > (memory_hotplug_max() + (1ULL << 32)) && - pnv_pci_ioda_pe_single_vendor(pe) && - phb->model == PNV_PHB_MODEL_PHB3) { - /* Configure the bypass mode */ - rc = pnv_pci_ioda_dma_64bit_bypass(pe); - if (rc) - return rc; - /* 4GB offset bypasses 32-bit space */ - set_dma_offset(&pdev->dev, (1ULL << 32)); - set_dma_ops(&pdev->dev, &dma_nommu_ops); - } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) { - /* - * Fail the request if a DMA mask between 32 and 64 bits - * was requested but couldn't be fulfilled. Ideally we - * would do this for 64-bits but historically we have - * always fallen back to 32-bits. 
- */ - return -ENOMEM; - } else { - dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(&pdev->dev, &dma_iommu_ops); - } - } - *pdev->dev.dma_mask = dma_mask; - - /* Update peer npu devices */ - pnv_npu_try_dma_set_bypass(pdev, bypass); - - return 0; -} - -static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *pe; - u64 end, mask; - - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return 0; - - pe = &phb->ioda.pe_array[pdn->pe_number]; - if (!pe->tce_bypass_enabled) - return __dma_get_required_mask(&pdev->dev); - - - end = pe->tce_bypass_base + memblock_end_of_DRAM(); - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; -} - -static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); - set_dma_offset(&dev->dev, pe->tce_bypass_base); - - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_dma(pe, dev->subordinate); + u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; + if (dma_mask >= top) + return true; } -} - -static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb, - bool real_mode) -{ - return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) : - (phb->regs + 0x210); -} - -static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl, - unsigned long index, unsigned long npages, bool rm) -{ - struct iommu_table_group_link *tgl = list_first_entry_or_null( - &tbl->it_group_list, struct iommu_table_group_link, - next); - struct pnv_ioda_pe *pe = container_of(tgl->table_group, - struct pnv_ioda_pe, table_group); - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm); - unsigned long start, end, inc; - - start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset); - end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset + - npages - 1); - - /* p7ioc-style invalidation, 2 TCEs per write */ - start |= (1ull << 63); - end |= (1ull << 63); - inc = 16; - end |= inc - 1; /* round up end to be different than start */ - - mb(); /* Ensure above stores are visible */ - while (start <= end) { - if (rm) - __raw_rm_writeq_be(start, invalidate); - else - __raw_writeq_be(start, invalidate); - - start += inc; - } /* - * The iommu layer will do another mb() for us on build() - * and we don't care on free() + * If the device can't set the TCE bypass bit but still wants + * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to + * bypass the 32-bit region and be usable for 64-bit DMAs. + * The device needs to be able to address all of this space. 
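The new pnv_pci_ioda_iommu_bypass_supported() in this hunk folds the old dma_set_mask logic into a yes/no answer for the DMA API: plain bypass when the mask covers the bypass window up to the top of RAM, otherwise the PHB3-only trick of re-pointing TVE#0 at a 4GB-offset linear window for single-device PEs. A condensed standalone model of that decision tree; the toy_* names, the flattened "single_device" flag and the example sizes are illustrative, and the real code of course also reprograms the TVE when it takes the fallback path.

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

#define TCE_BYPASS_BASE  (1ull << 59)

struct toy_pe {
	bool tce_bypass_enabled;
	bool single_device;      /* models device_count == 1 || !pe->pbus */
	bool phb_is_phb3;
};

/* True when the device can skip the 32-bit TCE table entirely. */
static bool toy_bypass_supported(const struct toy_pe *pe, uint64_t dma_mask,
				 uint64_t top_of_ram)
{
	if (pe->tce_bypass_enabled &&
	    dma_mask >= TCE_BYPASS_BASE + top_of_ram - 1)
		return true;

	/*
	 * PHB3 fallback: a >32-bit mask that covers RAM plus a 4GB
	 * offset lets TVE#0 be reused as a linear window.
	 */
	if ((dma_mask >> 32) &&
	    dma_mask > top_of_ram + (1ull << 32) &&
	    pe->single_device && pe->phb_is_phb3)
		return true;

	return false;
}

int main(void)
{
	struct toy_pe pe = { true, true, true };
	uint64_t ram = 64ull << 30;   /* say, 64GB of RAM */

	printf("64-bit mask: %d\n", toy_bypass_supported(&pe, ~0ull, ram));
	printf("40-bit mask: %d\n",
	       toy_bypass_supported(&pe, (1ull << 40) - 1, ram));
	printf("31-bit mask: %d\n",
	       toy_bypass_supported(&pe, (1u << 31) - 1, ram));
	return 0;
}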
*/ -} - -static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, - long npages, unsigned long uaddr, - enum dma_data_direction direction, - unsigned long attrs) -{ - int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, - attrs); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false); + if (dma_mask >> 32 && + dma_mask > (memory_hotplug_max() + (1ULL << 32)) && + /* pe->pdev should be set if it's a single device, pe->pbus if not */ + (pe->device_count == 1 || !pe->pbus) && + phb->model == PNV_PHB_MODEL_PHB3) { + /* Configure the bypass mode */ + s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe); + if (rc) + return false; + /* 4GB offset bypasses 32-bit space */ + pdev->dev.archdata.dma_offset = (1ULL << 32); + return true; + } - return ret; + return false; } -#ifdef CONFIG_IOMMU_API -static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) +static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false); - - return ret; + return phb->regs + 0x210; } -static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, +#ifdef CONFIG_IOMMU_API +/* Common for IODA1 and IODA2 */ +static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); - - return ret; -} -#endif - -static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, - long npages) -{ - pnv_tce_free(tbl, index, npages); - - pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false); + return pnv_tce_xchg(tbl, index, hpa, direction); } - -static struct iommu_table_ops pnv_ioda1_iommu_ops = { - .set = pnv_ioda1_tce_build, -#ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda1_tce_xchg, - .exchange_rm = pnv_ioda1_tce_xchg_rm, - .useraddrptr = pnv_tce_useraddrptr, #endif - .clear = pnv_ioda1_tce_free, - .get = pnv_tce_get, -}; #define PHB3_TCE_KILL_INVAL_ALL PPC_BIT(0) #define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1) #define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2) -static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) -{ - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm); - const unsigned long val = PHB3_TCE_KILL_INVAL_ALL; - - mb(); /* Ensure previous TCE table stores are visible */ - if (rm) - __raw_rm_writeq_be(val, invalidate); - else - __raw_writeq_be(val, invalidate); -} - static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false); + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb); unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); mb(); /* Ensure above stores are visible */ __raw_writeq_be(val, invalidate); } -static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm, +static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, unsigned shift, unsigned long index, unsigned long npages) { - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm); + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb); unsigned long start, end, inc; /* We'll invalidate DMA address in PE scope */ @@ -2093,10 +1200,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool 
rm, mb(); while (start <= end) { - if (rm) - __raw_rm_writeq_be(start, invalidate); - else - __raw_writeq_be(start, invalidate); + __raw_writeq_be(start, invalidate); start += inc; } } @@ -2113,7 +1217,7 @@ static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) } static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, - unsigned long index, unsigned long npages, bool rm) + unsigned long index, unsigned long npages) { struct iommu_table_group_link *tgl; @@ -2123,22 +1227,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, struct pnv_phb *phb = pe->phb; unsigned int shift = tbl->it_page_shift; - /* - * NVLink1 can use the TCE kill register directly as - * it's the same as PHB3. NVLink2 is different and - * should go via the OPAL call. - */ - if (phb->model == PNV_PHB_MODEL_NPU) { - /* - * The NVLink hardware does not support TCE kill - * per TCE entry so we have to invalidate - * the entire cache for it. - */ - pnv_pci_phb3_tce_invalidate_entire(phb, rm); - continue; - } if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs) - pnv_pci_phb3_tce_invalidate(pe, rm, shift, + pnv_pci_phb3_tce_invalidate(pe, shift, index, npages); else opal_pci_tce_kill(phb->opal_id, @@ -2148,14 +1238,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, } } -void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) -{ - if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3) - pnv_pci_phb3_tce_invalidate_entire(phb, rm); - else - opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0); -} - static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, @@ -2165,48 +1247,24 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, attrs); if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); + pnv_pci_ioda2_tce_invalidate(tbl, index, npages); return ret; } -#ifdef CONFIG_IOMMU_API -static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); - - return ret; -} - -static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); - - return ret; -} -#endif - static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, long npages) { pnv_tce_free(tbl, index, npages); - pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); + pnv_pci_ioda2_tce_invalidate(tbl, index, npages); } static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda2_tce_xchg, - .exchange_rm = pnv_ioda2_tce_xchg_rm, + .xchg_no_kill = pnv_ioda_tce_xchg_no_kill, + .tce_kill = pnv_pci_ioda2_tce_invalidate, .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda2_tce_free, @@ -2214,178 +1272,6 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { .free = pnv_pci_ioda2_table_free_pages, }; -static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data) -{ - unsigned int *weight = (unsigned int *)data; - - /* This is quite simplistic. The "base" weight of a device - * is 10. 0 means no DMA is to be accounted for it. 
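The DMA-weight helpers removed below encoded a small heuristic for splitting 32-bit DMA segments between PEs: every normal-header device is worth 10, USB host controllers only 3, RAID controllers 15, and a PE's weight is the sum over its devices. They go away together with the rest of the IODA1/P7IOC handling in this diff. A standalone restatement of the per-device weighting, using the standard PCI class codes; the sample PE contents are made up.

#include <stdio.h>

/* Class codes as in include/linux/pci_ids.h */
#define PCI_CLASS_SERIAL_USB_UHCI  0x0c0300
#define PCI_CLASS_SERIAL_USB_OHCI  0x0c0310
#define PCI_CLASS_SERIAL_USB_EHCI  0x0c0320
#define PCI_CLASS_STORAGE_RAID     0x0104
#define PCI_HEADER_TYPE_NORMAL     0

/* "Base" DMA weight of one device, as in the removed helper. */
static unsigned int dev_dma_weight(unsigned int hdr_type, unsigned int class)
{
	if (hdr_type != PCI_HEADER_TYPE_NORMAL)
		return 0;          /* bridges don't count */

	if (class == PCI_CLASS_SERIAL_USB_UHCI ||
	    class == PCI_CLASS_SERIAL_USB_OHCI ||
	    class == PCI_CLASS_SERIAL_USB_EHCI)
		return 3;          /* USB HCs do comparatively little DMA */

	if ((class >> 8) == PCI_CLASS_STORAGE_RAID)
		return 15;         /* RAID HBAs are DMA heavy */

	return 10;
}

int main(void)
{
	/* A PE holding one RAID HBA and one EHCI controller. */
	unsigned int weight = dev_dma_weight(0, 0x010400) +
			      dev_dma_weight(0, PCI_CLASS_SERIAL_USB_EHCI);

	printf("PE weight = %u\n", weight);   /* prints 18 */
	return 0;
}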
- */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) - return 0; - - if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || - dev->class == PCI_CLASS_SERIAL_USB_OHCI || - dev->class == PCI_CLASS_SERIAL_USB_EHCI) - *weight += 3; - else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) - *weight += 15; - else - *weight += 10; - - return 0; -} - -static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe) -{ - unsigned int weight = 0; - - /* SRIOV VF has same DMA32 weight as its PF */ -#ifdef CONFIG_PCI_IOV - if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) { - pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight); - return weight; - } -#endif - - if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) { - pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight); - } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) { - struct pci_dev *pdev; - - list_for_each_entry(pdev, &pe->pbus->devices, bus_list) - pnv_pci_ioda_dev_dma_weight(pdev, &weight); - } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) { - pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight); - } - - return weight; -} - -static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) -{ - - struct page *tce_mem = NULL; - struct iommu_table *tbl; - unsigned int weight, total_weight = 0; - unsigned int tce32_segsz, base, segs, avail, i; - int64_t rc; - void *addr; - - /* XXX FIXME: Handle 64-bit only DMA devices */ - /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ - /* XXX FIXME: Allocate multi-level tables on PHB3 */ - weight = pnv_pci_ioda_pe_dma_weight(pe); - if (!weight) - return; - - pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight, - &total_weight); - segs = (weight * phb->ioda.dma32_count) / total_weight; - if (!segs) - segs = 1; - - /* - * Allocate contiguous DMA32 segments. We begin with the expected - * number of segments. With one more attempt, the number of DMA32 - * segments to be allocated is decreased by one until one segment - * is allocated successfully. - */ - do { - for (base = 0; base <= phb->ioda.dma32_count - segs; base++) { - for (avail = 0, i = base; i < base + segs; i++) { - if (phb->ioda.dma32_segmap[i] == - IODA_INVALID_PE) - avail++; - } - - if (avail == segs) - goto found; - } - } while (--segs); - - if (!segs) { - pe_warn(pe, "No available DMA32 segments\n"); - return; - } - -found: - tbl = pnv_pci_table_alloc(phb->hose->node); - if (WARN_ON(!tbl)) - return; - - iommu_register_group(&pe->table_group, phb->hose->global_number, - pe->pe_number); - pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); - - /* Grab a 32-bit TCE table */ - pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n", - weight, total_weight, base, segs); - pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", - base * PNV_IODA1_DMA32_SEGSIZE, - (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1); - - /* XXX Currently, we allocate one big contiguous table for the - * TCEs. 
We only really need one chunk per 256M of TCE space - * (ie per segment) but that's an optimization for later, it - * requires some added smarts with our get/put_tce implementation - * - * Each TCE page is 4KB in size and each TCE entry occupies 8 - * bytes - */ - tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3); - tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, - get_order(tce32_segsz * segs)); - if (!tce_mem) { - pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); - goto fail; - } - addr = page_address(tce_mem); - memset(addr, 0, tce32_segsz * segs); - - /* Configure HW */ - for (i = 0; i < segs; i++) { - rc = opal_pci_map_pe_dma_window(phb->opal_id, - pe->pe_number, - base + i, 1, - __pa(addr) + tce32_segsz * i, - tce32_segsz, IOMMU_PAGE_SIZE_4K); - if (rc) { - pe_err(pe, " Failed to configure 32-bit TCE table," - " err %ld\n", rc); - goto fail; - } - } - - /* Setup DMA32 segment mapping */ - for (i = base; i < base + segs; i++) - phb->ioda.dma32_segmap[i] = pe->pe_number; - - /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs, - base * PNV_IODA1_DMA32_SEGSIZE, - IOMMU_PAGE_SHIFT_4K); - - tbl->it_ops = &pnv_ioda1_iommu_ops; - pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; - pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; - iommu_init_table(tbl, phb->hose->node); - - if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus); - - return; - fail: - /* XXX Failure: Try to fallback to 64-bit only ? */ - if (tce_mem) - __free_pages(tce_mem, get_order(tce32_segsz * segs)); - if (tbl) { - pnv_pci_unlink_table_and_group(tbl, &pe->table_group); - iommu_tce_table_put(tbl); - } -} - static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, int num, struct iommu_table *tbl) { @@ -2398,9 +1284,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; const __u64 win_size = tbl->it_size << tbl->it_page_shift; - pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num, - start_addr, start_addr + win_size - 1, - IOMMU_PAGE_SIZE(tbl)); + pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n", + num, start_addr, start_addr + win_size - 1, + IOMMU_PAGE_SIZE(tbl)); /* * Map TCE table through TVT. The TVE index is the PE number @@ -2414,7 +1300,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, size << 3, IOMMU_PAGE_SIZE(tbl)); if (rc) { - pe_err(pe, "Failed to configure TCE table, err %ld\n", rc); + pe_err(pe, "Failed to configure TCE table, err %lld\n", rc); return rc; } @@ -2425,7 +1311,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, return 0; } -void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) { uint16_t window_id = (pe->pe_number << 1 ) + 1; int64_t rc; @@ -2487,6 +1373,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) { struct iommu_table *tbl = NULL; long rc; + unsigned long res_start, res_end; /* * crashkernel= specifies the kdump kernel's maximum memory at @@ -2500,35 +1387,70 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) * DMA window can be larger than available memory, which will * cause errors later. 
*/ - const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory); + const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_PAGE_ORDER); - rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, - IOMMU_PAGE_SHIFT_4K, - window_size, - POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl); + /* + * We create the default window as big as we can. The constraint is + * the max order of allocation possible. The TCE table is likely to + * end up being multilevel and with on-demand allocation in place, + * the initial use is not going to be huge as the default window aims + * to support crippled devices (i.e. not fully 64bit DMAble) only. + */ + /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */ + const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory); + /* Each TCE level cannot exceed maxblock so go multilevel if needed */ + unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT); + unsigned long tcelevel_order = ilog2(maxblock >> 3); + unsigned int levels = tces_order / tcelevel_order; + + if (tces_order % tcelevel_order) + levels += 1; + /* + * We try to stick to default levels (which is >1 at the moment) in + * order to save memory by relying on on-demain TCE level allocation. + */ + levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS); + + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT, + window_size, levels, false, &tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); return rc; } - iommu_init_table(tbl, pe->phb->hose->node); + /* We use top part of 32bit space for MMIO so exclude it from DMA */ + res_start = 0; + res_end = 0; + if (window_size > pe->phb->ioda.m32_pci_base) { + res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift; + res_end = min(window_size, SZ_4G) >> tbl->it_page_shift; + } - rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + tbl->it_index = (pe->phb->hose->global_number << 16) | pe->pe_number; + if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end)) + rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + else + rc = -ENOMEM; if (rc) { - pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", - rc); + pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc); iommu_tce_table_put(tbl); - return rc; + tbl = NULL; /* This clears iommu_table_base below */ } - if (!pnv_iommu_bypass_disabled) pnv_pci_ioda2_set_bypass(pe, true); + /* + * Set table base for the case of IOMMU DMA use. Usually this is done + * from dma_dev_setup() which is not called when a device is returned + * from VFIO so do it here. 
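The new default-window sizing in pnv_pci_ioda2_setup_default_config() (the hunk above) bounds the first-level allocation by the largest block the page allocator can hand out, then picks however many TCE levels are needed to cover the window, never fewer than POWERNV_IOMMU_DEFAULT_LEVELS. The arithmetic is compact enough to check standalone; the sketch below assumes 64K kernel pages, a maximum allocation order of 10, a 1TB memory ceiling and a default of two levels, which are plausible values rather than quotes from the tree.

#include <stdint.h>
#include <stdio.h>

/* Assumed values: 64K pages, max alloc order 10, default of 2 levels. */
#define PAGE_SHIFT      16
#define MAX_PAGE_ORDER  10
#define DEFAULT_LEVELS   2

static unsigned int ilog2_u64(uint64_t v)
{
	return 63 - __builtin_clzll(v);
}

int main(void)
{
	const uint64_t max_memory = 1ull << 40;   /* say, 1TB */
	const uint64_t maxblock = 1ull << (PAGE_SHIFT + MAX_PAGE_ORDER);

	/* it_map needs 1 bit per IOMMU page, hence the "* 8". */
	uint64_t window_size = (maxblock * 8) << PAGE_SHIFT;
	if (window_size > max_memory)
		window_size = max_memory;

	/* Split the TCE tree into levels no bigger than maxblock each. */
	unsigned int tces_order = ilog2_u64(window_size >> PAGE_SHIFT);
	unsigned int tcelevel_order = ilog2_u64(maxblock >> 3);
	unsigned int levels = tces_order / tcelevel_order;

	if (tces_order % tcelevel_order)
		levels++;
	if (levels < DEFAULT_LEVELS)
		levels = DEFAULT_LEVELS;

	printf("default window 0x%llx bytes, %u level(s)\n",
	       (unsigned long long)window_size, levels);
	return 0;
}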
+ */ + if (pe->pdev) + set_iommu_table_base(&pe->pdev->dev, tbl); + return 0; } -#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV) static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, int num) { @@ -2552,7 +1474,6 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, return ret; } -#endif #ifdef CONFIG_IOMMU_API unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, @@ -2576,7 +1497,7 @@ unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, direct_table_size = 1UL << table_shift; for ( ; levels; --levels) { - bytes += _ALIGN_UP(tce_table_size, direct_table_size); + bytes += ALIGN(tce_table_size, direct_table_size); tce_table_size /= direct_table_size; tce_table_size <<= 3; @@ -2592,29 +1513,63 @@ static long pnv_pci_ioda2_create_table_userspace( int num, __u32 page_shift, __u64 window_size, __u32 levels, struct iommu_table **ptbl) { - return pnv_pci_ioda2_create_table(table_group, + long ret = pnv_pci_ioda2_create_table(table_group, num, page_shift, window_size, levels, true, ptbl); + + if (!ret) + (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size( + page_shift, window_size, levels); + return ret; } -static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); + dev->dev.archdata.dma_offset = pe->tce_bypass_base; + + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate); + } +} + +static long pnv_ioda2_take_ownership(struct iommu_table_group *table_group, + struct device *dev __maybe_unused) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */ struct iommu_table *tbl = pe->table_group.tables[0]; + /* + * iommu_ops transfers the ownership per a device and we mode + * the group ownership with the first device in the group. 
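pnv_pci_ioda2_create_table_userspace() now records it_allocated_size via pnv_pci_ioda2_get_table_size(), whose per-level loop is visible in the hunk above (the _ALIGN_UP to ALIGN change): each level holds 8-byte entries pointing at the next one, so the bytes per level shrink by the direct-table fan-out while every level is rounded up to the direct table size. The sketch below is only a rough model of that accounting; the starting values (entries = window/page, fan-out split evenly across levels) are reconstructed for illustration, not copied from the tree.

#include <stdint.h>
#include <stdio.h>

static uint64_t align_up(uint64_t v, uint64_t a)
{
	return (v + a - 1) & ~(a - 1);   /* @a must be a power of two */
}

/* Rough estimate of the memory behind a multilevel TCE table. */
static uint64_t tce_table_bytes(uint64_t window_size, unsigned int page_shift,
				unsigned int levels)
{
	uint64_t entries = window_size >> page_shift;
	uint64_t tce_table_size = entries << 3;          /* 8 bytes/entry */

	/* Fan-out of one level if the entries are split evenly. */
	unsigned int entries_shift =
		(63 - __builtin_clzll(entries) + levels - 1) / levels;
	uint64_t direct_table_size = 1ull << (entries_shift + 3);
	uint64_t bytes = 0;

	for (; levels; levels--) {
		bytes += align_up(tce_table_size, direct_table_size);
		tce_table_size /= direct_table_size;
		tce_table_size <<= 3;
	}
	return bytes;
}

int main(void)
{
	/* 4GB window, 64K IOMMU pages, 2 levels. */
	printf("%llu bytes\n",
	       (unsigned long long)tce_table_bytes(4ull << 30, 16, 2));
	return 0;
}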
+ */ + if (!tbl) + return 0; + pnv_pci_ioda2_set_bypass(pe, false); pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (pe->pbus) pnv_ioda_setup_bus_dma(pe, pe->pbus); + else if (pe->pdev) + set_iommu_table_base(&pe->pdev->dev, NULL); iommu_tce_table_put(tbl); + + return 0; } -static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) +static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group, + struct device *dev __maybe_unused) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); + /* See the comment about iommu_ops above */ + if (pe->table_group.tables[0]) + return; pnv_pci_ioda2_setup_default_config(pe); if (pe->pbus) pnv_ioda_setup_bus_dma(pe, pe->pbus); @@ -2628,145 +1583,13 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .take_ownership = pnv_ioda2_take_ownership, .release_ownership = pnv_ioda2_release_ownership, }; - -static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe, - struct iommu_table_group *table_group, - struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - iommu_add_device(table_group, &dev->dev); - - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_iommu_group_add_devices(pe, - table_group, dev->subordinate); - } -} - -static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe, - struct iommu_table_group *table_group, struct pci_bus *bus) -{ - - if (pe->flags & PNV_IODA_PE_DEV) - iommu_add_device(table_group, &pe->pdev->dev); - - if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus) - pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group, - bus); -} - -static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); - -static void pnv_pci_ioda_setup_iommu_api(void) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - - /* - * There are 4 types of PEs: - * - PNV_IODA_PE_BUS: a downstream port with an adapter, - * created from pnv_pci_setup_bridge(); - * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it, - * created from pnv_pci_setup_bridge(); - * - PNV_IODA_PE_VF: a SRIOV virtual function, - * created from pnv_pcibios_sriov_enable(); - * - PNV_IODA_PE_DEV: an NPU or OCAPI device, - * created from pnv_pci_ioda_fixup(). - * - * Normally a PE is represented by an IOMMU group, however for - * devices with side channels the groups need to be more strict. - */ - list_for_each_entry(hose, &hose_list, list_node) { - phb = hose->private_data; - - if (phb->type == PNV_PHB_NPU_NVLINK || - phb->type == PNV_PHB_NPU_OCAPI) - continue; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - struct iommu_table_group *table_group; - - table_group = pnv_try_setup_npu_table_group(pe); - if (!table_group) { - if (!pnv_pci_ioda_pe_dma_weight(pe)) - continue; - - table_group = &pe->table_group; - iommu_register_group(&pe->table_group, - pe->phb->hose->global_number, - pe->pe_number); - } - pnv_ioda_setup_bus_iommu_group(pe, table_group, - pe->pbus); - } - } - - /* - * Now we have all PHBs discovered, time to add NPU devices to - * the corresponding IOMMU groups. 
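pnv_ioda_parse_tce_sizes() is dropped from this file in the next hunk (the call in pnv_pci_ioda2_setup_dma_pe stays), but the property format it parsed is worth keeping in mind when reading the IODA2 setup below: "ibm,supported-tce-sizes" is a list of page shifts, and the table_group pgsizes mask is simply the OR of 1 << shift for each entry, with a 4K+64K fallback (plus 16M/256M on POWER8) when the property is absent. A standalone version of that conversion; the sample shift list is an example, not a quote from any particular device tree.

#include <stdint.h>
#include <stdio.h>

#define SZ_4K   (1ull << 12)
#define SZ_64K  (1ull << 16)

/* "ibm,supported-tce-sizes" is an array of page shifts. */
static uint64_t tce_sizes_to_mask(const uint32_t *shifts, int count)
{
	uint64_t mask = 0;
	int i;

	if (count <= 0)
		return SZ_4K | SZ_64K;   /* conservative fallback */

	for (i = 0; i < count; i++)
		mask |= 1ull << shifts[i];

	return mask;
}

int main(void)
{
	/* For example: shifts for 4K, 64K, 2M and 1G IOMMU pages. */
	const uint32_t prop[] = { 12, 16, 21, 30 };
	uint64_t mask = tce_sizes_to_mask(prop, 4);

	printf("pgsizes mask = 0x%llx\n", (unsigned long long)mask);
	return 0;
}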
- */ - list_for_each_entry(hose, &hose_list, list_node) { - unsigned long pgsizes; - - phb = hose->private_data; - - if (phb->type != PNV_PHB_NPU_NVLINK) - continue; - - pgsizes = pnv_ioda_parse_tce_sizes(phb); - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - /* - * IODA2 bridges get this set up from - * pci_controller_ops::setup_bridge but NPU bridges - * do not have this hook defined so we do it here. - */ - pe->table_group.pgsizes = pgsizes; - pnv_npu_compound_attach(pe); - } - } -} -#else /* !CONFIG_IOMMU_API */ -static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif -static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb) -{ - struct pci_controller *hose = phb->hose; - struct device_node *dn = hose->dn; - unsigned long mask = 0; - int i, rc, count; - u32 val; - - count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes"); - if (count <= 0) { - mask = SZ_4K | SZ_64K; - /* Add 16M for POWER8 by default */ - if (cpu_has_feature(CPU_FTR_ARCH_207S) && - !cpu_has_feature(CPU_FTR_ARCH_300)) - mask |= SZ_16M | SZ_256M; - return mask; - } - - for (i = 0; i < count; i++) { - rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes", - i, &val); - if (rc == 0) - mask |= 1ULL << val; - } - - return mask; -} - -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) +void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) { int64_t rc; - if (!pnv_pci_ioda_pe_dma_weight(pe)) - return; - /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; @@ -2781,61 +1604,37 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, IOMMU_TABLE_GROUP_MAX_TABLES; pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS; pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb); -#ifdef CONFIG_IOMMU_API - pe->table_group.ops = &pnv_pci_ioda2_ops; -#endif rc = pnv_pci_ioda2_setup_default_config(pe); if (rc) return; - if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus); -} - -int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) -{ - struct pnv_phb *phb = container_of(chip, struct pnv_phb, - ioda.irq_chip); - - return opal_pci_msi_eoi(phb->opal_id, hw_irq); +#ifdef CONFIG_IOMMU_API + pe->table_group.ops = &pnv_pci_ioda2_ops; + iommu_register_group(&pe->table_group, phb->hose->global_number, + pe->pe_number); +#endif + pe->dma_setup_done = true; } -static void pnv_ioda2_msi_eoi(struct irq_data *d) +/* + * Called from KVM in real mode to EOI passthru interrupts. The ICP + * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru(). + * + * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call + * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ + * numbers of the in-the-middle MSI domain are vector numbers and it's + * good enough for OPAL. Use that. 
+ */ +int64_t pnv_opal_pci_msi_eoi(struct irq_data *d) { - int64_t rc; - unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); - struct irq_chip *chip = irq_data_get_irq_chip(d); - - rc = pnv_opal_pci_msi_eoi(chip, hw_irq); - WARN_ON_ONCE(rc); + struct pci_controller *hose = irq_data_get_irq_chip_data(d->parent_data); + struct pnv_phb *phb = hose->private_data; - icp_native_eoi(d); + return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq); } - -void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) -{ - struct irq_data *idata; - struct irq_chip *ichip; - - /* The MSI EOI OPAL call is only needed on PHB3 */ - if (phb->model != PNV_PHB_MODEL_PHB3) - return; - - if (!phb->ioda.irq_chip_init) { - /* - * First time we setup an MSI IRQ, we need to setup the - * corresponding IRQ chip to route correctly. - */ - idata = irq_get_irq_data(virq); - ichip = irq_data_get_irq_chip(idata); - phb->ioda.irq_chip_init = 1; - phb->ioda.irq_chip = *ichip; - phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; - } - irq_set_chip(virq, &phb->ioda.irq_chip); -} +static struct irq_chip pnv_pci_msi_irq_chip; /* * Returns true iff chip is something that we could call @@ -2843,19 +1642,21 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) */ bool is_pnv_opal_msi(struct irq_chip *chip) { - return chip->irq_eoi == pnv_ioda2_msi_eoi; + return chip == &pnv_pci_msi_irq_chip; } EXPORT_SYMBOL_GPL(is_pnv_opal_msi); -static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg) +static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int xive_num, + unsigned int is_64, struct msi_msg *msg) { struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); - unsigned int xive_num = hwirq - phb->msi_base; __be32 data; int rc; + dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__, + is_64 ? "64" : "32", xive_num); + /* No PE assigned ? bail out ... no MSI for you ! 
*/ if (pe == NULL) return -ENXIO; @@ -2903,17 +1704,188 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, } msg->data = be32_to_cpu(data); - pnv_set_msi_irq_chip(phb, virq); + return 0; +} + +static void pnv_msi_shutdown(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_shutdown) + d->chip->irq_shutdown(d); +} + +static bool pnv_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + struct irq_chip *chip = info->chip; + + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; + + chip->irq_shutdown = pnv_msi_shutdown; + return true; +} + +#define PNV_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT) +#define PNV_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_PCI_MSIX | \ + MSI_FLAG_MULTI_PCI_MSI) + +static const struct msi_parent_ops pnv_msi_parent_ops = { + .required_flags = PNV_PCI_MSI_FLAGS_REQUIRED, + .supported_flags = PNV_PCI_MSI_FLAGS_SUPPORTED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, + .bus_select_token = DOMAIN_BUS_NEXUS, + .bus_select_mask = MATCH_PCI_MSI, + .prefix = "PNV-", + .init_dev_msi_info = pnv_init_dev_msi_info, +}; + +static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_data_get_msi_desc(d); + struct pci_dev *pdev = msi_desc_to_pci_dev(entry); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + int rc; - pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," - " address=%x_%08x data=%x PE# %x\n", - pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, - msg->address_hi, msg->address_lo, data, pe->pe_number); + rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq, + entry->pci.msi_attrib.is_64, msg); + if (rc) + dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n", + entry->pci.msi_attrib.is_64 ? "64" : "32", d->hwirq, rc); +} + +/* + * The IRQ data is mapped in the MSI domain in which HW IRQ numbers + * correspond to vector numbers. + */ +static void pnv_msi_eoi(struct irq_data *d) +{ + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_PHB3) { + /* + * The EOI OPAL call takes an OPAL HW IRQ number but + * since it is translated into a vector number in + * OPAL, use that directly. 
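The new MSI path replaces the per-PHB irq_chip fixup with a parent IRQ domain: pnv_irq_domain_alloc() below grabs nr_irqs consecutive vectors from the PHB's msi_bmp and hands phb->msi_base + vector to the XICS parent domain as the hardware IRQ. A standalone sketch of that "bitmap slot to platform hwirq" step, with a trivial boolean array standing in for msi_bitmap and an invented msi_base value.

#include <stdio.h>
#include <stdbool.h>

#define NR_MSIS 64

static bool msi_used[NR_MSIS];

/* Find @count consecutive free vectors, mark them used, return the first. */
static int toy_msi_alloc(int count)
{
	for (int base = 0; base + count <= NR_MSIS; base++) {
		int i;

		for (i = 0; i < count && !msi_used[base + i]; i++)
			;
		if (i == count) {
			for (i = 0; i < count; i++)
				msi_used[base + i] = true;
			return base;
		}
	}
	return -1;   /* maps to -ENOSPC in the real code */
}

int main(void)
{
	const unsigned int msi_base = 0x7f0;   /* invented example base */
	int vec = toy_msi_alloc(2);            /* a 2-vector multi-MSI */

	if (vec >= 0)
		printf("vectors %d-%d -> parent hwirqs 0x%x-0x%x\n",
		       vec, vec + 1, msi_base + vec, msi_base + vec + 1);
	return 0;
}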
+ */ + WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq)); + } + + irq_chip_eoi_parent(d); +} + +static struct irq_chip pnv_msi_irq_chip = { + .name = "PNV-MSI", + .irq_shutdown = pnv_msi_shutdown, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = pnv_msi_eoi, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = pnv_msi_compose_msg, +}; + +static int pnv_irq_parent_domain_alloc(struct irq_domain *domain, + unsigned int virq, int hwirq) +{ + struct irq_fwspec parent_fwspec; + int ret; + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 2; + parent_fwspec.param[0] = hwirq; + parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); + if (ret) + return ret; + + return 0; +} + +static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct pci_controller *hose = domain->host_data; + struct pnv_phb *phb = hose->private_data; + msi_alloc_info_t *info = arg; + struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc); + int hwirq; + int i, ret; + + hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs); + if (hwirq < 0) { + dev_warn(&pdev->dev, "failed to find a free MSI\n"); + return -ENOSPC; + } + + dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__, + hose->dn, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + ret = pnv_irq_parent_domain_alloc(domain, virq + i, + phb->msi_base + hwirq + i); + if (ret) + goto out; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &pnv_msi_irq_chip, hose); + } + + return 0; + +out: + irq_domain_free_irqs_parent(domain, virq, i); + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs); + return ret; +} + +static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + pr_debug("%s bridge %pOF %d/%lx #%d\n", __func__, hose->dn, + virq, d->hwirq, nr_irqs); + + msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs); + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +} + +static const struct irq_domain_ops pnv_irq_domain_ops = { + .select = msi_lib_irq_domain_select, + .alloc = pnv_irq_domain_alloc, + .free = pnv_irq_domain_free, +}; + +static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count) +{ + struct irq_domain *parent = irq_get_default_domain(); + struct irq_domain_info info = { + .fwnode = of_fwnode_handle(hose->dn), + .ops = &pnv_irq_domain_ops, + .host_data = hose, + .size = count, + .parent = parent, + }; + + hose->dev_domain = msi_create_parent_irq_domain(&info, &pnv_msi_parent_ops); + if (!hose->dev_domain) { + pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); + return -ENOMEM; + } return 0; } -static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) +static void __init pnv_pci_init_ioda_msis(struct pnv_phb *phb) { unsigned int count; const __be32 *prop = of_get_property(phb->hose->dn, @@ -2933,102 +1905,11 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) return; } - phb->msi_setup = pnv_pci_ioda_msi_setup; - phb->msi32_support = 1; pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); -} - -#ifdef CONFIG_PCI_IOV -static void pnv_pci_ioda_fixup_iov_resources(struct 
pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - const resource_size_t gate = phb->ioda.m64_segsize >> 2; - struct resource *res; - int i; - resource_size_t size, total_vf_bar_sz; - struct pci_dn *pdn; - int mul, total_vfs; - - if (!pdev->is_physfn || pci_dev_is_added(pdev)) - return; - - pdn = pci_get_pdn(pdev); - pdn->vfs_expanded = 0; - pdn->m64_single_mode = false; - - total_vfs = pci_sriov_get_totalvfs(pdev); - mul = phb->ioda.total_pe_num; - total_vf_bar_sz = 0; - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || res->parent) - continue; - if (!pnv_pci_is_m64_flags(res->flags)) { - dev_warn(&pdev->dev, "Don't support SR-IOV with" - " non M64 VF BAR%d: %pR. \n", - i, res); - goto truncate_iov; - } - - total_vf_bar_sz += pci_iov_resource_size(pdev, - i + PCI_IOV_RESOURCES); - - /* - * If bigger than quarter of M64 segment size, just round up - * power of two. - * - * Generally, one M64 BAR maps one IOV BAR. To avoid conflict - * with other devices, IOV BAR size is expanded to be - * (total_pe * VF_BAR_size). When VF_BAR_size is half of M64 - * segment size , the expanded size would equal to half of the - * whole M64 space size, which will exhaust the M64 Space and - * limit the system flexibility. This is a design decision to - * set the boundary to quarter of the M64 segment size. - */ - if (total_vf_bar_sz > gate) { - mul = roundup_pow_of_two(total_vfs); - dev_info(&pdev->dev, - "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n", - total_vf_bar_sz, gate, mul); - pdn->m64_single_mode = true; - break; - } - } - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || res->parent) - continue; - - size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); - /* - * On PHB3, the minimum size alignment of M64 BAR in single - * mode is 32MB. - */ - if (pdn->m64_single_mode && (size < SZ_32M)) - goto truncate_iov; - dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); - res->end = res->start + size * mul - 1; - dev_dbg(&pdev->dev, " %pR\n", res); - dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", - i, res, mul); - } - pdn->vfs_expanded = mul; - - return; - -truncate_iov: - /* To save MMIO space, IOV BAR is truncated. */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - res->flags = 0; - res->end = res->start - 1; - } + pnv_msi_allocate_domains(phb->hose, count); } -#endif /* CONFIG_PCI_IOV */ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, struct resource *res) @@ -3038,7 +1919,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, int index; int64_t rc; - if (!res || !res->flags || res->start > res->end) + if (!res || !res->flags || res->start > res->end || + res->flags & IORESOURCE_UNSET) return; if (res->flags & IORESOURCE_IO) { @@ -3089,7 +1971,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, /* * This function is supposed to be called on basis of PE from top - * to bottom style. So the the I/O or MMIO segment assigned to + * to bottom style. So the I/O or MMIO segment assigned to * parent PE could be overridden by its child PEs if necessary. 
*/ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) @@ -3124,19 +2006,9 @@ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) #ifdef CONFIG_DEBUG_FS static int pnv_pci_diag_data_set(void *data, u64 val) { - struct pci_controller *hose; - struct pnv_phb *phb; + struct pnv_phb *phb = data; s64 ret; - if (val != 1ULL) - return -EINVAL; - - hose = (struct pci_controller *)data; - if (!hose || !hose->private_data) - return -ENODEV; - - phb = hose->private_data; - /* Retrieve the diag data from firmware */ ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, phb->diag_data_size); @@ -3148,8 +2020,35 @@ static int pnv_pci_diag_data_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, - pnv_pci_diag_data_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set, + "%llu\n"); + +static int pnv_pci_ioda_pe_dump(void *data, u64 val) +{ + struct pnv_phb *phb = data; + int pe_num; + + for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { + struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num]; + + if (!test_bit(pe_num, phb->ioda.pe_alloc)) + continue; + + pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n", + pe->rid, pe->device_count, + (pe->flags & PNV_IODA_PE_DEV) ? "dev " : "", + (pe->flags & PNV_IODA_PE_BUS) ? "bus " : "", + (pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "", + (pe->flags & PNV_IODA_PE_MASTER) ? "master " : "", + (pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "", + (pe->flags & PNV_IODA_PE_VF) ? "vf " : ""); + } + + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL, + pnv_pci_ioda_pe_dump, "%llu\n"); #endif /* CONFIG_DEBUG_FS */ @@ -3163,19 +2062,13 @@ static void pnv_pci_ioda_create_dbgfs(void) list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { phb = hose->private_data; - /* Notify initialization of PHB done */ - phb->initialized = 1; - sprintf(name, "PCI%04x", hose->global_number); - phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); - if (!phb->dbgfs) { - pr_warn("%s: Error on creating debugfs on PHB#%x\n", - __func__, hose->global_number); - continue; - } + phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir); - debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose, - &pnv_pci_diag_data_fops); + debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs, + phb, &pnv_pci_diag_data_fops); + debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs, + phb, &pnv_pci_ioda_pe_dump_fops); } #endif /* CONFIG_DEBUG_FS */ } @@ -3217,8 +2110,6 @@ static void pnv_pci_enable_bridges(void) static void pnv_pci_ioda_fixup(void) { - pnv_pci_ioda_setup_PEs(); - pnv_pci_ioda_setup_iommu_api(); pnv_pci_ioda_create_dbgfs(); pnv_pci_enable_bridges(); @@ -3243,10 +2134,9 @@ static void pnv_pci_ioda_fixup(void) static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, unsigned long type) { - struct pci_dev *bridge; - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); int num_pci_bridges = 0; + struct pci_dev *bridge; bridge = bus->self; while (bridge) { @@ -3330,28 +2220,16 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus, } } -static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type) +static void pnv_pci_configure_bus(struct pci_bus *bus) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; struct pci_dev *bridge = bus->self; struct pnv_ioda_pe 
*pe; - bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE); - - /* Extend bridge's windows if necessary */ - pnv_pci_fixup_bridge_resources(bus, type); + bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE); - /* The PE for root bus should be realized before any one else */ - if (!phb->ioda.root_pe_populated) { - pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false); - if (pe) { - phb->ioda.root_pe_idx = pe->pe_number; - phb->ioda.root_pe_populated = true; - } - } + dev_info(&bus->dev, "Configuring PE for bus\n"); /* Don't assign PE to PCI bus, which doesn't have subordinate devices */ - if (list_empty(&bus->devices)) + if (WARN_ON(list_empty(&bus->devices))) return; /* Reserve PEs according to used M64 resources */ @@ -3367,17 +2245,6 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type) return; pnv_ioda_setup_pe_seg(pe); - switch (phb->type) { - case PNV_PHB_IODA1: - pnv_pci_ioda1_setup_dma_pe(phb, pe); - break; - case PNV_PHB_IODA2: - pnv_pci_ioda2_setup_dma_pe(phb, pe); - break; - default: - pr_warn("%s: No DMA for PHB#%x (type %d)\n", - __func__, phb->hose->global_number, phb->type); - } } static resource_size_t pnv_pci_default_alignment(void) @@ -3385,134 +2252,50 @@ static resource_size_t pnv_pci_default_alignment(void) return PAGE_SIZE; } -#ifdef CONFIG_PCI_IOV -static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, - int resno) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(pdev); - resource_size_t align; - - /* - * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the - * SR-IOV. While from hardware perspective, the range mapped by M64 - * BAR should be size aligned. - * - * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra - * powernv-specific hardware restriction is gone. But if just use the - * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with - * in one segment of M64 #15, which introduces the PE conflict between - * PF and VF. Based on this, the minimum alignment of an IOV BAR is - * m64_segsize. - * - * This function returns the total IOV BAR size if M64 BAR is in - * Shared PE mode or just VF BAR size if not. - * If the M64 BAR is in Single PE mode, return the VF BAR size or - * M64 segment size if IOV BAR size is less. - */ - align = pci_iov_resource_size(pdev, resno); - if (!pdn->vfs_expanded) - return align; - if (pdn->m64_single_mode) - return max(align, (resource_size_t)phb->ioda.m64_segsize); - - return pdn->vfs_expanded * align; -} -#endif /* CONFIG_PCI_IOV */ - /* Prevent enabling devices for which we couldn't properly * assign a PE */ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn; - /* The function is probably called while the PEs have - * not be created yet. For example, resource reassignment - * during PCI probe period. We just skip the check if - * PEs isn't ready. 
- */ - if (!phb->initialized) - return true; - pdn = pci_get_pdn(dev); - if (!pdn || pdn->pe_number == IODA_INVALID_PE) + if (!pdn || pdn->pe_number == IODA_INVALID_PE) { + pci_err(dev, "pci_enable_device() blocked, no PE assigned.\n"); return false; - - return true; -} - -static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group, - int num) -{ - struct pnv_ioda_pe *pe = container_of(table_group, - struct pnv_ioda_pe, table_group); - struct pnv_phb *phb = pe->phb; - unsigned int idx; - long rc; - - pe_info(pe, "Removing DMA window #%d\n", num); - for (idx = 0; idx < phb->ioda.dma32_count; idx++) { - if (phb->ioda.dma32_segmap[idx] != pe->pe_number) - continue; - - rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - idx, 0, 0ul, 0ul, 0ul); - if (rc != OPAL_SUCCESS) { - pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n", - rc, idx); - return rc; - } - - phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE; } - pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); - return OPAL_SUCCESS; + return true; } -static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) +static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev) { - unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); - struct iommu_table *tbl = pe->table_group.tables[0]; - int64_t rc; - - if (!weight) - return; + struct pci_dn *pdn; + struct pnv_ioda_pe *pe; - rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0); - if (rc != OPAL_SUCCESS) - return; + pdn = pci_get_pdn(dev); + if (!pdn) + return false; - pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false); - if (pe->table_group.group) { - iommu_group_put(pe->table_group.group); - WARN_ON(pe->table_group.group); + if (pdn->pe_number == IODA_INVALID_PE) { + pe = pnv_ioda_setup_dev_PE(dev); + if (!pe) + return false; } - - free_pages(tbl->it_base, get_order(tbl->it_size << 3)); - iommu_tce_table_put(tbl); + return true; } -static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) +void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) { struct iommu_table *tbl = pe->table_group.tables[0]; - unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); -#ifdef CONFIG_IOMMU_API int64_t rc; -#endif - if (!weight) + if (!pe->dma_setup_done) return; -#ifdef CONFIG_IOMMU_API rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) - pe_warn(pe, "OPAL error %ld release DMA window\n", rc); -#endif + pe_warn(pe, "OPAL error %lld release DMA window\n", rc); pnv_pci_ioda2_set_bypass(pe, false); if (pe->table_group.group) { @@ -3535,17 +2318,11 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, if (map[idx] != pe->pe_number) continue; - if (win == OPAL_M64_WINDOW_TYPE) - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - phb->ioda.reserved_pe_idx, win, - idx / PNV_IODA1_M64_SEGS, - idx % PNV_IODA1_M64_SEGS); - else - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - phb->ioda.reserved_pe_idx, win, 0, idx); + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + phb->ioda.reserved_pe_idx, win, 0, idx); if (rc != OPAL_SUCCESS) - pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n", + pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n", rc, win, idx); map[idx] = IODA_INVALID_PE; @@ -3556,14 +2333,7 @@ static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe) { struct pnv_phb *phb = pe->phb; - if (phb->type == PNV_PHB_IODA1) { - pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE, - phb->ioda.io_segmap); - pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, - phb->ioda.m32_segmap); - pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE, - 
phb->ioda.m64_segmap); - } else if (phb->type == PNV_PHB_IODA2) { + if (phb->type == PNV_PHB_IODA2) { pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, phb->ioda.m32_segmap); } @@ -3574,14 +2344,18 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) struct pnv_phb *phb = pe->phb; struct pnv_ioda_pe *slave, *tmp; + pe_info(pe, "Releasing PE\n"); + + mutex_lock(&phb->ioda.pe_list_mutex); list_del(&pe->list); + mutex_unlock(&phb->ioda.pe_list_mutex); + switch (phb->type) { - case PNV_PHB_IODA1: - pnv_pci_ioda1_release_pe_dma(pe); - break; case PNV_PHB_IODA2: pnv_pci_ioda2_release_pe_dma(pe); break; + case PNV_PHB_NPU_OCAPI: + break; default: WARN_ON(1); } @@ -3603,26 +2377,35 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) * that it can be populated again in PCI hot add path. The PE * shouldn't be destroyed as it's the global reserved resource. */ - if (phb->ioda.root_pe_populated && - phb->ioda.root_pe_idx == pe->pe_number) - phb->ioda.root_pe_populated = false; - else - pnv_ioda_free_pe(pe); + if (phb->ioda.root_pe_idx == pe->pe_number) + return; + + pnv_ioda_free_pe(pe); } static void pnv_pci_release_device(struct pci_dev *pdev) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; + /* The VF PE state is torn down when sriov_disable() is called */ if (pdev->is_virtfn) return; if (!pdn || pdn->pe_number == IODA_INVALID_PE) return; +#ifdef CONFIG_PCI_IOV + /* + * FIXME: Try move this to sriov_disable(). It's here since we allocate + * the iov state at probe time since we need to fiddle with the IOV + * resources. + */ + if (pdev->is_physfn) + kfree(pdev->dev.archdata.iov_data); +#endif + /* * PCI hotplug can happen as part of EEH error recovery. The @pdn * isn't removed and added afterwards in this scenario. We should @@ -3639,15 +2422,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev) pnv_ioda_release_pe(pe); } -static void pnv_npu_disable_device(struct pci_dev *pdev) -{ - struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev); - struct eeh_pe *eehpe = edev ? 
edev->pe : NULL; - - if (eehpe && eeh_ops && eeh_ops->reset) - eeh_ops->reset(eehpe, EEH_RESET_HOT); -} - static void pnv_pci_ioda_shutdown(struct pci_controller *hose) { struct pnv_phb *phb = hose->private_data; @@ -3656,43 +2430,64 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) OPAL_ASSERT_RESET); } -static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, - .dma_bus_setup = pnv_pci_dma_bus_setup, - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, - .enable_device_hook = pnv_pci_enable_device_hook, - .release_device = pnv_pci_release_device, - .window_alignment = pnv_pci_window_alignment, - .setup_bridge = pnv_pci_setup_bridge, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, - .shutdown = pnv_pci_ioda_shutdown, -}; +static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus) +{ + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); + struct pnv_ioda_pe *pe; + + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) + continue; -static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) + if (!pe->pbus) + continue; + + if (bus->number == ((pe->rid >> 8) & 0xFF)) { + pe->pbus = bus; + break; + } + } +} + +#ifdef CONFIG_IOMMU_API +static struct iommu_group *pnv_pci_device_group(struct pci_controller *hose, + struct pci_dev *pdev) { - dev_err_once(&npdev->dev, - "%s operation unsupported for NVLink devices\n", - __func__); - return -EPERM; + struct pnv_phb *phb = hose->private_data; + struct pnv_ioda_pe *pe; + + if (WARN_ON(!phb)) + return ERR_PTR(-ENODEV); + + pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev)); + if (!pe) + return ERR_PTR(-ENODEV); + + if (!pe->table_group.group) + return ERR_PTR(-ENODEV); + + return iommu_group_ref_get(pe->table_group.group); } +#endif -static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, +static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { + .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, + .dma_bus_setup = pnv_pci_ioda_dma_bus_setup, + .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, .enable_device_hook = pnv_pci_enable_device_hook, + .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, + .setup_bridge = pnv_pci_fixup_bridge_resources, .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_npu_dma_set_mask, .shutdown = pnv_pci_ioda_shutdown, - .disable_device = pnv_npu_disable_device, +#ifdef CONFIG_IOMMU_API + .device_group = pnv_pci_device_group, +#endif }; static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = { - .enable_device_hook = pnv_pci_enable_device_hook, + .enable_device_hook = pnv_ocapi_enable_device_hook, + .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus = pnv_pci_reset_secondary_bus, .shutdown = pnv_pci_ioda_shutdown, @@ -3704,7 +2499,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, struct pci_controller *hose; struct pnv_phb *phb; unsigned long size, m64map_off, m32map_off, pemap_off; - unsigned long iomap_off = 0, dma32map_off = 0; + struct pnv_ioda_pe *root_pe; struct resource r; const __be64 *prop64; const __be32 *prop32; @@ -3727,14 
+2522,17 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb_id = be64_to_cpup(prop64); pr_debug(" PHB-ID : 0x%016llx\n", phb_id); - phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES); + phb = kzalloc(sizeof(*phb), GFP_KERNEL); + if (!phb) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(*phb)); /* Allocate PCI controller */ phb->hose = hose = pcibios_alloc_controller(np); if (!phb->hose) { pr_err(" Can't allocate PCI controller for %pOF\n", np); - memblock_free(__pa(phb), sizeof(struct pnv_phb)); + memblock_free(phb, sizeof(struct pnv_phb)); return; } @@ -3759,10 +2557,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->model = PNV_PHB_MODEL_P7IOC; else if (of_device_is_compatible(np, "ibm,power8-pciex")) phb->model = PNV_PHB_MODEL_PHB3; - else if (of_device_is_compatible(np, "ibm,power8-npu-pciex")) - phb->model = PNV_PHB_MODEL_NPU; - else if (of_device_is_compatible(np, "ibm,power9-npu-pciex")) - phb->model = PNV_PHB_MODEL_NPU2; else phb->model = PNV_PHB_MODEL_UNKNOWN; @@ -3773,7 +2567,10 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, else phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE; - phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES); + phb->diag_data = kzalloc(phb->diag_data_size, GFP_KERNEL); + if (!phb->diag_data) + panic("%s: Failed to allocate %u bytes\n", __func__, + phb->diag_data_size); /* Parse 32-bit and IO ranges (if any) */ pci_process_bridge_OF_ranges(hose, np, !hose->global_number); @@ -3812,27 +2609,19 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num; phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ - /* Calculate how many 32-bit TCE segments we have */ - phb->ioda.dma32_count = phb->ioda.m32_pci_base / - PNV_IODA1_DMA32_SEGSIZE; - /* Allocate aux data & arrays. 
We don't have IO ports on PHB3 */ - size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8, + size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8, sizeof(unsigned long)); m64map_off = size; size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]); m32map_off = size; size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]); - if (phb->type == PNV_PHB_IODA1) { - iomap_off = size; - size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]); - dma32map_off = size; - size += phb->ioda.dma32_count * - sizeof(phb->ioda.dma32_segmap[0]); - } pemap_off = size; size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe); - aux = memblock_alloc(size, SMP_CACHE_BYTES); + aux = kzalloc(size, GFP_KERNEL); + if (!aux) + panic("%s: Failed to allocate %lu bytes\n", __func__, size); + phb->ioda.pe_alloc = aux; phb->ioda.m64_segmap = aux + m64map_off; phb->ioda.m32_segmap = aux + m32map_off; @@ -3840,15 +2629,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.m64_segmap[segno] = IODA_INVALID_PE; phb->ioda.m32_segmap[segno] = IODA_INVALID_PE; } - if (phb->type == PNV_PHB_IODA1) { - phb->ioda.io_segmap = aux + iomap_off; - for (segno = 0; segno < phb->ioda.total_pe_num; segno++) - phb->ioda.io_segmap[segno] = IODA_INVALID_PE; - - phb->ioda.dma32_segmap = aux + dma32map_off; - for (segno = 0; segno < phb->ioda.dma32_count; segno++) - phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE; - } phb->ioda.pe_array = aux + pemap_off; /* @@ -3864,16 +2644,14 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1; pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx); } else { - phb->ioda.root_pe_idx = IODA_INVALID_PE; + /* otherwise just allocate one */ + root_pe = pnv_ioda_alloc_pe(phb, 1); + phb->ioda.root_pe_idx = root_pe->pe_number; } INIT_LIST_HEAD(&phb->ioda.pe_list); mutex_init(&phb->ioda.pe_list_mutex); - /* Calculate how many 32-bit TCE segments we have */ - phb->ioda.dma32_count = phb->ioda.m32_pci_base / - PNV_IODA1_DMA32_SEGSIZE; - #if 0 /* We should really do that ... */ rc = opal_pci_set_phb_mem_window(opal->phb_id, window_type, @@ -3912,21 +2690,17 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; switch (phb->type) { - case PNV_PHB_NPU_NVLINK: - hose->controller_ops = pnv_npu_ioda_controller_ops; - break; case PNV_PHB_NPU_OCAPI: hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops; break; default: - phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; hose->controller_ops = pnv_pci_ioda_controller_ops; } ppc_md.pcibios_default_alignment = pnv_pci_default_alignment; #ifdef CONFIG_PCI_IOV - ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; + ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov; ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable; ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable; @@ -3944,9 +2718,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, * shutdown PCI devices correctly. We already got IODA table * cleaned out. So we have to issue PHB reset to stop all PCI * transactions from previous kernel. The ppc_pci_reset_phbs - * kernel parameter will force this reset too. + * kernel parameter will force this reset too. Additionally, + * if the IODA reset above failed then use a bigger hammer. + * This can happen if we get a PHB fatal error in very early + * boot. 
*/ - if (is_kdump_kernel() || pci_reset_phbs) { + if (is_kdump_kernel() || pci_reset_phbs || rc) { pr_info(" Issue PHB reset ...\n"); pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE); @@ -3955,6 +2732,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* Remove M64 resource if we can't configure it successfully */ if (!phb->init_m64 || phb->init_m64(phb)) hose->mem_resources[1].flags = 0; + + /* create pci_dn's for DT nodes under this PHB */ + pci_devs_phb_init_dynamic(hose); } void __init pnv_pci_init_ioda2_phb(struct device_node *np) @@ -3962,11 +2742,6 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np) pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); } -void __init pnv_pci_init_npu_phb(struct device_node *np) -{ - pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK); -} - void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np) { pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI); @@ -3974,8 +2749,7 @@ void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np) static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); if (!machine_is(powernv)) return; @@ -3984,27 +2758,3 @@ static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev) dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE; } DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup); - -void __init pnv_pci_init_ioda_hub(struct device_node *np) -{ - struct device_node *phbn; - const __be64 *prop64; - u64 hub_id; - - pr_info("Probing IODA IO-Hub %pOF\n", np); - - prop64 = of_get_property(np, "ibm,opal-hubid", NULL); - if (!prop64) { - pr_err(" Missing \"ibm,opal-hubid\" property !\n"); - return; - } - hub_id = be64_to_cpup(prop64); - pr_devel(" HUB-ID : 0x%016llx\n", hub_id); - - /* Count child PHBs */ - for_each_child_of_node(np, phbn) { - /* Look for IODA1 PHBs */ - if (of_device_is_compatible(phbn, "ibm,ioda-phb")) - pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1); - } -} diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c new file mode 100644 index 000000000000..cc7b1dd54ac6 --- /dev/null +++ b/arch/powerpc/platforms/powernv/pci-sriov.c @@ -0,0 +1,760 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/kernel.h> +#include <linux/ioport.h> +#include <linux/bitmap.h> +#include <linux/pci.h> + +#include <asm/opal.h> + +#include "pci.h" + +/* + * The majority of the complexity in supporting SR-IOV on PowerNV comes from + * the need to put the MMIO space for each VF into a separate PE. Internally + * the PHB maps MMIO addresses to a specific PE using the "Memory BAR Table". + * The MBT historically only applied to the 64bit MMIO window of the PHB + * so it's common to see it referred to as the "M64BT". + * + * An MBT entry stores the mapped range as an <base>,<mask> pair. This forces + * the address range that we want to map to be power-of-two sized and aligned. + * For conventional PCI devices this isn't really an issue since PCI device BARs + * have the same requirement. + * + * For a SR-IOV BAR things are a little more awkward since size and alignment + * are not coupled. The alignment is set based on the per-VF BAR size, but + * the total BAR area is: number-of-vfs * per-vf-size. The number of VFs + * isn't necessarily a power of two, so neither is the total size. 
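+ *
+ * A purely illustrative example: a PF advertising 96 VFs with a 1MB
+ * per-VF BAR needs a 96MB IOV BAR. 96MB is not a power of two and need
+ * not be 96MB-aligned, so it cannot be described by a single
+ * <base>,<mask> MBT entry as-is.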
To fix that + * we need to finesse (read: hack) the Linux BAR allocator so that it will + * allocate the SR-IOV BARs in a way that lets us map them using the MBT. + * + * The changes to size and alignment that we need to do depend on the "mode" + * of MBT entry that we use. We only support SR-IOV on PHB3 (IODA2) and above, + * so as a baseline we can assume that we have the following BAR modes + * available: + * + * NB: $PE_COUNT is the number of PEs that the PHB supports. + * + * a) A segmented BAR that splits the mapped range into $PE_COUNT equally sized + * segments. The n'th segment is mapped to the n'th PE. + * b) An un-segmented BAR that maps the whole address range to a specific PE. + * + * + * We prefer to use mode a) since it only requires one MBT entry per SR-IOV BAR + * For comparison b) requires one entry per-VF per-BAR, or: + * (num-vfs * num-sriov-bars) in total. To use a) we need the size of each segment + * to equal the size of the per-VF BAR area. So: + * + * new_size = per-vf-size * number-of-PEs + * + * The alignment for the SR-IOV BAR also needs to be changed from per-vf-size + * to "new_size", calculated above. Implementing this is a convoluted process + * which requires several hooks in the PCI core: + * + * 1. In pcibios_device_add() we call pnv_pci_ioda_fixup_iov(). + * + * At this point the device has been probed and the device's BARs are sized, + * but no resource allocations have been done. The SR-IOV BARs are sized + * based on the maximum number of VFs supported by the device and we need + * to increase that to new_size. + * + * 2. Later, when Linux actually assigns resources it tries to make the resource + * allocations for each PCI bus as compact as possible. As a part of that it + * sorts the BARs on a bus by their required alignment, which is calculated + * using pci_resource_alignment(). + * + * For IOV resources this goes: + * pci_resource_alignment() + * pci_sriov_resource_alignment() + * pcibios_sriov_resource_alignment() + * pnv_pci_iov_resource_alignment() + * + * Our hook overrides the default alignment, equal to the per-vf-size, with + * new_size computed above. + * + * 3. When userspace enables VFs for a device: + * + * sriov_enable() + * pcibios_sriov_enable() + * pnv_pcibios_sriov_enable() + * + * This is where we actually allocate PE numbers for each VF and setup the + * MBT mapping for each SR-IOV BAR. In steps 1) and 2) we setup an "arena" + * where each MBT segment is equal in size to the VF BAR so we can shift + * around the actual SR-IOV BAR location within this arena. We need this + * ability because the PE space is shared by all devices on the same PHB. + * When using mode a) described above segment 0 in maps to PE#0 which might + * be already being used by another device on the PHB. + * + * As a result we need allocate a contigious range of PE numbers, then shift + * the address programmed into the SR-IOV BAR of the PF so that the address + * of VF0 matches up with the segment corresponding to the first allocated + * PE number. This is handled in pnv_pci_vf_resource_shift(). + * + * Once all that is done we return to the PCI core which then enables VFs, + * scans them and creates pci_devs for each. The init process for a VF is + * largely the same as a normal device, but the VF is inserted into the IODA + * PE that we allocated for it rather than the PE associated with the bus. + * + * 4. When userspace disables VFs we unwind the above in + * pnv_pcibios_sriov_disable(). 
Fortunately this is relatively simple since + * we don't need to validate anything, just tear down the mappings and + * move SR-IOV resource back to its "proper" location. + * + * That's how mode a) works. In theory mode b) (single PE mapping) is less work + * since we can map each individual VF with a separate BAR. However, there's a + * few limitations: + * + * 1) For IODA2 mode b) has a minimum alignment requirement of 32MB. This makes + * it only usable for devices with very large per-VF BARs. Such devices are + * similar to Big Foot. They definitely exist, but I've never seen one. + * + * 2) The number of MBT entries that we have is limited. PHB3 and PHB4 only + * 16 total and some are needed for. Most SR-IOV capable network cards can support + * more than 16 VFs on each port. + * + * We use b) when using a) would use more than 1/4 of the entire 64 bit MMIO + * window of the PHB. + * + * + * + * PHB4 (IODA3) added a few new features that would be useful for SR-IOV. It + * allowed the MBT to map 32bit MMIO space in addition to 64bit which allows + * us to support SR-IOV BARs in the 32bit MMIO window. This is useful since + * the Linux BAR allocation will place any BAR marked as non-prefetchable into + * the non-prefetchable bridge window, which is 32bit only. It also added two + * new modes: + * + * c) A segmented BAR similar to a), but each segment can be individually + * mapped to any PE. This is matches how the 32bit MMIO window worked on + * IODA1&2. + * + * d) A segmented BAR with 8, 64, or 128 segments. This works similarly to a), + * but with fewer segments and configurable base PE. + * + * i.e. The n'th segment maps to the (n + base)'th PE. + * + * The base PE is also required to be a multiple of the window size. + * + * Unfortunately, the OPAL API doesn't currently (as of skiboot v6.6) allow us + * to exploit any of the IODA3 features. + */ + +static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) +{ + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct resource *res; + int i; + resource_size_t vf_bar_sz; + struct pnv_iov_data *iov; + int mul; + + iov = kzalloc(sizeof(*iov), GFP_KERNEL); + if (!iov) + goto disable_iov; + pdev->dev.archdata.iov_data = iov; + mul = phb->ioda.total_pe_num; + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || res->parent) + continue; + if (!pnv_pci_is_m64_flags(res->flags)) { + dev_warn(&pdev->dev, "Don't support SR-IOV with non M64 VF BAR%d: %pR. \n", + i, res); + goto disable_iov; + } + + vf_bar_sz = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); + + /* + * Generally, one segmented M64 BAR maps one IOV BAR. However, + * if a VF BAR is too large we end up wasting a lot of space. + * If each VF needs more than 1/4 of the default m64 segment + * then each VF BAR should be mapped in single-PE mode to reduce + * the amount of space required. This does however limit the + * number of VFs we can support. + * + * The 1/4 limit is arbitrary and can be tweaked. + */ + if (vf_bar_sz > (phb->ioda.m64_segsize >> 2)) { + /* + * On PHB3, the minimum size alignment of M64 BAR in + * single mode is 32MB. If this VF BAR is smaller than + * 32MB, but still too large for a segmented window + * then we can't map it and need to disable SR-IOV for + * this device. 
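+ *
+ * Illustration with hypothetical numbers: given a 64MB m64 segment
+ * size the segmented cut-off above is 16MB, so a 20MB VF BAR is too
+ * large for a segmented window yet below the 32MB single-PE minimum,
+ * and SR-IOV ends up disabled for the device here.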
+ */ + if (vf_bar_sz < SZ_32M) { + pci_err(pdev, "VF BAR%d: %pR can't be mapped in single PE mode\n", + i, res); + goto disable_iov; + } + + iov->m64_single_mode[i] = true; + continue; + } + + /* + * This BAR can be mapped with one segmented window, so adjust + * te resource size to accommodate. + */ + pci_dbg(pdev, " Fixing VF BAR%d: %pR to\n", i, res); + res->end = res->start + vf_bar_sz * mul - 1; + pci_dbg(pdev, " %pR\n", res); + + pci_info(pdev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", + i, res, mul); + + iov->need_shift = true; + } + + return; + +disable_iov: + /* Save ourselves some MMIO space by disabling the unusable BARs */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + res->flags = 0; + res->end = res->start - 1; + } + + pdev->dev.archdata.iov_data = NULL; + kfree(iov); +} + +void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev) +{ + if (pdev->is_virtfn) { + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev); + + /* + * VF PEs are single-device PEs so their pdev pointer needs to + * be set. The pdev doesn't exist when the PE is allocated (in + * (pcibios_sriov_enable()) so we fix it up here. + */ + pe->pdev = pdev; + WARN_ON(!(pe->flags & PNV_IODA_PE_VF)); + } else if (pdev->is_physfn) { + /* + * For PFs adjust their allocated IOV resources to match what + * the PHB can support using its M64 BAR table. + */ + pnv_pci_ioda_fixup_iov_resources(pdev); + } +} + +resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, + int resno) +{ + resource_size_t align = pci_iov_resource_size(pdev, resno); + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct pnv_iov_data *iov = pnv_iov_get(pdev); + + /* + * iov can be null if we have an SR-IOV device with IOV BAR that can't + * be placed in the m64 space (i.e. The BAR is 32bit or non-prefetch). + * In that case we don't allow VFs to be enabled since one of their + * BARs would not be placed in the correct PE. + */ + if (!iov) + return align; + + /* + * If we're using single mode then we can just use the native VF BAR + * alignment. We validated that it's possible to use a single PE + * window above when we did the fixup. + */ + if (iov->m64_single_mode[resno - PCI_IOV_RESOURCES]) + return align; + + /* + * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the + * SR-IOV. While from hardware perspective, the range mapped by M64 + * BAR should be size aligned. + * + * This function returns the total IOV BAR size if M64 BAR is in + * Shared PE mode or just VF BAR size if not. + * If the M64 BAR is in Single PE mode, return the VF BAR size or + * M64 segment size if IOV BAR size is less. + */ + return phb->ioda.total_pe_num * align; +} + +static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_iov_data *iov; + struct pnv_phb *phb; + int window_id; + + phb = pci_bus_to_pnvhb(pdev->bus); + iov = pnv_iov_get(pdev); + + for_each_set_bit(window_id, iov->used_m64_bar_mask, MAX_M64_BARS) { + opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + 0); + + clear_bit(window_id, &phb->ioda.m64_bar_alloc); + } + + return 0; +} + + +/* + * PHB3 and beyond support segmented windows. The window's address range + * is subdivided into phb->ioda.total_pe_num segments and there's a 1-1 + * mapping between PEs and segments. 
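+ *
+ * E.g. on a hypothetical PHB with 512 PEs, a 512MB window is split
+ * into 512 x 1MB segments and an access landing in segment N is
+ * attributed to PE#N.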
+ */ +static int64_t pnv_ioda_map_m64_segmented(struct pnv_phb *phb, + int window_id, + resource_size_t start, + resource_size_t size) +{ + int64_t rc; + + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + start, + 0, /* unused */ + size); + if (rc) + goto out; + + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + OPAL_ENABLE_M64_SPLIT); +out: + if (rc) + pr_err("Failed to map M64 window #%d: %lld\n", window_id, rc); + + return rc; +} + +static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb, + int pe_num, + int window_id, + resource_size_t start, + resource_size_t size) +{ + int64_t rc; + + /* + * The API for setting up m64 mmio windows seems to have been designed + * with P7-IOC in mind. For that chip each M64 BAR (window) had a fixed + * split of 8 equally sized segments each of which could individually + * assigned to a PE. + * + * The problem with this is that the API doesn't have any way to + * communicate the number of segments we want on a BAR. This wasn't + * a problem for p7-ioc since you didn't have a choice, but the + * single PE windows added in PHB3 don't map cleanly to this API. + * + * As a result we've got this slightly awkward process where we + * call opal_pci_map_pe_mmio_window() to put the single in single + * PE mode, and set the PE for the window before setting the address + * bounds. We need to do it this way because the single PE windows + * for PHB3 have different alignment requirements on PHB3. + */ + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + pe_num, + OPAL_M64_WINDOW_TYPE, + window_id, + 0); + if (rc) + goto out; + + /* + * NB: In single PE mode the window needs to be aligned to 32MB + */ + rc = opal_pci_set_phb_mem_window(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + start, + 0, /* ignored by FW, m64 is 1-1 */ + size); + if (rc) + goto out; + + /* + * Now actually enable it. We specified the BAR should be in "non-split" + * mode so FW will validate that the BAR is in single PE mode. + */ + rc = opal_pci_phb_mmio_enable(phb->opal_id, + OPAL_M64_WINDOW_TYPE, + window_id, + OPAL_ENABLE_M64_NON_SPLIT); +out: + if (rc) + pr_err("Error mapping single PE BAR\n"); + + return rc; +} + +static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov) +{ + int win; + + do { + win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, + phb->ioda.m64_bar_idx + 1, 0); + + if (win >= phb->ioda.m64_bar_idx + 1) + return -1; + } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); + + set_bit(win, iov->used_m64_bar_mask); + + return win; +} + +static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_iov_data *iov; + struct pnv_phb *phb; + int win; + struct resource *res; + int i, j; + int64_t rc; + resource_size_t size, start; + int base_pe_num; + + phb = pci_bus_to_pnvhb(pdev->bus); + iov = pnv_iov_get(pdev); + + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &pdev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + + /* don't need single mode? map everything in one go! 
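+ * (i.e. one segmented window spans the whole expanded IOV BAR, so once
+ * the IOV BAR has been shifted to line up with the first allocated PE
+ * each per-VF slice lands in its own segment and therefore its own PE)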
*/ + if (!iov->m64_single_mode[i]) { + win = pnv_pci_alloc_m64_bar(phb, iov); + if (win < 0) + goto m64_failed; + + size = resource_size(res); + start = res->start; + + rc = pnv_ioda_map_m64_segmented(phb, win, start, size); + if (rc) + goto m64_failed; + + continue; + } + + /* otherwise map each VF with single PE BARs */ + size = pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i); + base_pe_num = iov->vf_pe_arr[0].pe_number; + + for (j = 0; j < num_vfs; j++) { + win = pnv_pci_alloc_m64_bar(phb, iov); + if (win < 0) + goto m64_failed; + + start = res->start + size * j; + rc = pnv_ioda_map_m64_single(phb, win, + base_pe_num + j, + start, + size); + if (rc) + goto m64_failed; + } + } + return 0; + +m64_failed: + pnv_pci_vf_release_m64(pdev, num_vfs); + return -EBUSY; +} + +static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) +{ + struct pnv_phb *phb; + struct pnv_ioda_pe *pe, *pe_n; + + phb = pci_bus_to_pnvhb(pdev->bus); + + if (!pdev->is_physfn) + return; + + /* FIXME: Use pnv_ioda_release_pe()? */ + list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { + if (pe->parent_dev != pdev) + continue; + + pnv_pci_ioda2_release_pe_dma(pe); + + /* Remove from list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_del(&pe->list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + pnv_ioda_deconfigure_pe(phb, pe); + + pnv_ioda_free_pe(pe); + } +} + +static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) +{ + struct resource *res, res2; + struct pnv_iov_data *iov; + resource_size_t size; + u16 num_vfs; + int i; + + if (!dev->is_physfn) + return -EINVAL; + iov = pnv_iov_get(dev); + + /* + * "offset" is in VFs. The M64 windows are sized so that when they + * are segmented, each segment is the same size as the IOV BAR. + * Each segment is in a separate PE, and the high order bits of the + * address are the PE number. Therefore, each VF's BAR is in a + * separate PE, and changing the IOV BAR start address changes the + * range of PEs the VFs are in. + */ + num_vfs = iov->num_vfs; + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + if (iov->m64_single_mode[i]) + continue; + + /* + * The actual IOV BAR range is determined by the start address + * and the actual size for num_vfs VFs BAR. This check is to + * make sure that after shifting, the range will not overlap + * with another device. + */ + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2.flags = res->flags; + res2.start = res->start + (size * offset); + res2.end = res2.start + (size * num_vfs) - 1; + + if (res2.end > res->end) { + dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", + i, &res2, res, num_vfs, offset); + return -EBUSY; + } + } + + /* + * Since M64 BAR shares segments among all possible 256 PEs, + * we have to shift the beginning of PF IOV BAR to make it start from + * the segment which belongs to the PE number assigned to the first VF. + * This creates a "hole" in the /proc/iomem which could be used for + * allocating other resources so we reserve this area below and + * release when IOV is released. 
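+ *
+ * Illustration (hypothetical values): with a 16MB per-VF BAR and the
+ * first VF PE being PE#4, res->start is bumped by 4 * 16MB = 64MB and
+ * the vacated 64MB range is reserved as "pnv_iov_reserved" until the
+ * VFs are disabled again.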
+ */ + for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { + res = &dev->resource[i + PCI_IOV_RESOURCES]; + if (!res->flags || !res->parent) + continue; + if (iov->m64_single_mode[i]) + continue; + + size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); + res2 = *res; + res->start += size * offset; + + dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", + i, &res2, res, (offset > 0) ? "En" : "Dis", + num_vfs, offset); + + if (offset < 0) { + devm_release_resource(&dev->dev, &iov->holes[i]); + memset(&iov->holes[i], 0, sizeof(iov->holes[i])); + } + + pci_update_resource(dev, i + PCI_IOV_RESOURCES); + + if (offset > 0) { + iov->holes[i].start = res2.start; + iov->holes[i].end = res2.start + size * offset - 1; + iov->holes[i].flags = IORESOURCE_BUS; + iov->holes[i].name = "pnv_iov_reserved"; + devm_request_resource(&dev->dev, res->parent, + &iov->holes[i]); + } + } + return 0; +} + +static void pnv_pci_sriov_disable(struct pci_dev *pdev) +{ + u16 num_vfs, base_pe; + struct pnv_iov_data *iov; + + iov = pnv_iov_get(pdev); + if (WARN_ON(!iov)) + return; + + num_vfs = iov->num_vfs; + base_pe = iov->vf_pe_arr[0].pe_number; + + /* Release VF PEs */ + pnv_ioda_release_vf_PE(pdev); + + /* Un-shift the IOV BARs if we need to */ + if (iov->need_shift) + pnv_pci_vf_resource_shift(pdev, -base_pe); + + /* Release M64 windows */ + pnv_pci_vf_release_m64(pdev, num_vfs); +} + +static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_phb *phb; + struct pnv_ioda_pe *pe; + int pe_num; + u16 vf_index; + struct pnv_iov_data *iov; + struct pci_dn *pdn; + + if (!pdev->is_physfn) + return; + + phb = pci_bus_to_pnvhb(pdev->bus); + pdn = pci_get_pdn(pdev); + iov = pnv_iov_get(pdev); + + /* Reserve PE for each VF */ + for (vf_index = 0; vf_index < num_vfs; vf_index++) { + int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index); + int vf_bus = pci_iov_virtfn_bus(pdev, vf_index); + struct pci_dn *vf_pdn; + + pe = &iov->vf_pe_arr[vf_index]; + pe->phb = phb; + pe->flags = PNV_IODA_PE_VF; + pe->pbus = NULL; + pe->parent_dev = pdev; + pe->mve_number = -1; + pe->rid = (vf_bus << 8) | vf_devfn; + + pe_num = pe->pe_number; + pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", + pci_domain_nr(pdev->bus), pdev->bus->number, + PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num); + + if (pnv_ioda_configure_pe(phb, pe)) { + /* XXX What do we do here ? */ + pnv_ioda_free_pe(pe); + pe->pdev = NULL; + continue; + } + + /* Put PE to the list */ + mutex_lock(&phb->ioda.pe_list_mutex); + list_add_tail(&pe->list, &phb->ioda.pe_list); + mutex_unlock(&phb->ioda.pe_list_mutex); + + /* associate this pe to its pdn */ + list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) { + if (vf_pdn->busno == vf_bus && + vf_pdn->devfn == vf_devfn) { + vf_pdn->pe_number = pe_num; + break; + } + } + + pnv_pci_ioda2_setup_dma_pe(phb, pe); + } +} + +static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + struct pnv_ioda_pe *base_pe; + struct pnv_iov_data *iov; + struct pnv_phb *phb; + int ret; + u16 i; + + phb = pci_bus_to_pnvhb(pdev->bus); + iov = pnv_iov_get(pdev); + + /* + * There's a calls to IODA2 PE setup code littered throughout. We could + * probably fix that, but we'd still have problems due to the + * restriction inherent on IODA1 PHBs. + * + * NB: We class IODA3 as IODA2 since they're very similar. 
+ */ + if (phb->type != PNV_PHB_IODA2) { + pci_err(pdev, "SR-IOV is not supported on this PHB\n"); + return -ENXIO; + } + + if (!iov) { + dev_info(&pdev->dev, "don't support this SRIOV device with non 64bit-prefetchable IOV BAR\n"); + return -ENOSPC; + } + + /* allocate a contiguous block of PEs for our VFs */ + base_pe = pnv_ioda_alloc_pe(phb, num_vfs); + if (!base_pe) { + pci_err(pdev, "Unable to allocate PEs for %d VFs\n", num_vfs); + return -EBUSY; + } + + iov->vf_pe_arr = base_pe; + iov->num_vfs = num_vfs; + + /* Assign M64 window accordingly */ + ret = pnv_pci_vf_assign_m64(pdev, num_vfs); + if (ret) { + dev_info(&pdev->dev, "Not enough M64 window resources\n"); + goto m64_failed; + } + + /* + * When using one M64 BAR to map one IOV BAR, we need to shift + * the IOV BAR according to the PE# allocated to the VFs. + * Otherwise, the PE# for the VF will conflict with others. + */ + if (iov->need_shift) { + ret = pnv_pci_vf_resource_shift(pdev, base_pe->pe_number); + if (ret) + goto shift_failed; + } + + /* Setup VF PEs */ + pnv_ioda_setup_vf_PE(pdev, num_vfs); + + return 0; + +shift_failed: + pnv_pci_vf_release_m64(pdev, num_vfs); + +m64_failed: + for (i = 0; i < num_vfs; i++) + pnv_ioda_free_pe(&iov->vf_pe_arr[i]); + + return ret; +} + +int pnv_pcibios_sriov_disable(struct pci_dev *pdev) +{ + pnv_pci_sriov_disable(pdev); + + /* Release PCI data */ + remove_sriov_vf_pdns(pdev); + return 0; +} + +int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) +{ + /* Allocate PCI data */ + add_sriov_vf_pdns(pdev); + + return pnv_pci_sriov_enable(pdev, num_vfs); +} diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 45fb70b4bfa7..b2c1da025410 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Support PCI/PCIe on PowerNV platforms * * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ #include <linux/kernel.h> @@ -18,11 +14,9 @@ #include <linux/io.h> #include <linux/msi.h> #include <linux/iommu.h> -#include <linux/sched/mm.h> #include <asm/sections.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> #include <asm/msi_bitmap.h> @@ -38,12 +32,9 @@ #include "powernv.h" #include "pci.h" -static DEFINE_MUTEX(p2p_mutex); -static DEFINE_MUTEX(tunnel_mutex); - int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) { - struct device_node *parent = np; + struct device_node *node = np; u32 bdfn; u64 phbid; int ret; @@ -53,24 +44,29 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id) return -ENXIO; bdfn = ((bdfn & 0x00ffff00) >> 8); - while ((parent = of_get_parent(parent))) { - if (!PCI_DN(parent)) { - of_node_put(parent); + for (node = np; node; node = of_get_parent(node)) { + if (!PCI_DN(node)) { + of_node_put(node); break; } - if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) { - of_node_put(parent); + if (!of_device_is_compatible(node, "ibm,ioda2-phb") && + !of_device_is_compatible(node, "ibm,ioda3-phb") && + !of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb")) { + of_node_put(node); continue; } - ret = of_property_read_u64(parent, "ibm,opal-phbid", &phbid); + ret = of_property_read_u64(node, "ibm,opal-phbid", &phbid); if (ret) { - of_node_put(parent); + of_node_put(node); return -ENXIO; } - *id = PCI_SLOT_ID(phbid, bdfn); + if (of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb")) + *id = PCI_PHB_SLOT_ID(phbid); + else + *id = PCI_SLOT_ID(phbid, bdfn); return 0; } @@ -160,75 +156,6 @@ exit: } EXPORT_SYMBOL_GPL(pnv_pci_set_power_state); -int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct msi_desc *entry; - struct msi_msg msg; - int hwirq; - unsigned int virq; - int rc; - - if (WARN_ON(!phb) || !phb->msi_bmp.bitmap) - return -ENODEV; - - if (pdev->no_64bit_msi && !phb->msi32_support) - return -ENODEV; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->msi_attrib.is_64 && !phb->msi32_support) { - pr_warn("%s: Supports only 64-bit MSIs\n", - pci_name(pdev)); - return -ENXIO; - } - hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1); - if (hwirq < 0) { - pr_warn("%s: Failed to find a free MSI\n", - pci_name(pdev)); - return -ENOSPC; - } - virq = irq_create_mapping(NULL, phb->msi_base + hwirq); - if (!virq) { - pr_warn("%s: Failed to map MSI to linux irq\n", - pci_name(pdev)); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); - return -ENOMEM; - } - rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq, - virq, entry->msi_attrib.is_64, &msg); - if (rc) { - pr_warn("%s: Failed to setup MSI\n", pci_name(pdev)); - irq_dispose_mapping(virq); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); - return rc; - } - irq_set_msi_desc(virq, entry); - pci_write_msi_msg(virq, &msg); - } - return 0; -} - -void pnv_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct msi_desc *entry; - irq_hw_number_t hwirq; - - if (WARN_ON(!phb)) - return; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->irq) - continue; - hwirq = virq_to_hw(entry->irq); - irq_set_msi_desc(entry->irq, NULL); - irq_dispose_mapping(entry->irq); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1); - } -} - /* Nicely print the contents of the PE State Tables (PEST). 
*/ static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size) { @@ -713,7 +640,7 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, return PCIBIOS_SUCCESSFUL; } -#if CONFIG_EEH +#ifdef CONFIG_EEH static bool pnv_pci_cfg_check(struct pci_dn *pdn) { struct eeh_dev *edev = NULL; @@ -814,259 +741,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid) return tbl; } -void pnv_pci_dma_dev_setup(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; -#ifdef CONFIG_PCI_IOV - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - - /* Fix the VF pdn PE number */ - if (pdev->is_virtfn) { - pdn = pci_get_pdn(pdev); - WARN_ON(pdn->pe_number != IODA_INVALID_PE); - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - if (pe->rid == ((pdev->bus->number << 8) | - (pdev->devfn & 0xff))) { - pdn->pe_number = pe->pe_number; - pe->pdev = pdev; - break; - } - } - } -#endif /* CONFIG_PCI_IOV */ - - if (phb && phb->dma_dev_setup) - phb->dma_dev_setup(phb, pdev); -} - -void pnv_pci_dma_bus_setup(struct pci_bus *bus) -{ - struct pci_controller *hose = bus->sysdata; - struct pnv_phb *phb = hose->private_data; - struct pnv_ioda_pe *pe; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) - continue; - - if (!pe->pbus) - continue; - - if (bus->number == ((pe->rid >> 8) & 0xFF)) { - pe->pbus = bus; - break; - } - } -} - -int pnv_pci_set_p2p(struct pci_dev *initiator, struct pci_dev *target, u64 desc) -{ - struct pci_controller *hose; - struct pnv_phb *phb_init, *phb_target; - struct pnv_ioda_pe *pe_init; - int rc; - - if (!opal_check_token(OPAL_PCI_SET_P2P)) - return -ENXIO; - - hose = pci_bus_to_host(initiator->bus); - phb_init = hose->private_data; - - hose = pci_bus_to_host(target->bus); - phb_target = hose->private_data; - - pe_init = pnv_ioda_get_pe(initiator); - if (!pe_init) - return -ENODEV; - - /* - * Configuring the initiator's PHB requires to adjust its - * TVE#1 setting. Since the same device can be an initiator - * several times for different target devices, we need to keep - * a reference count to know when we can restore the default - * bypass setting on its TVE#1 when disabling. Opal is not - * tracking PE states, so we add a reference count on the PE - * in linux. - * - * For the target, the configuration is per PHB, so we keep a - * target reference count on the PHB. 
- */ - mutex_lock(&p2p_mutex); - - if (desc & OPAL_PCI_P2P_ENABLE) { - /* always go to opal to validate the configuration */ - rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id, - desc, pe_init->pe_number); - - if (rc != OPAL_SUCCESS) { - rc = -EIO; - goto out; - } - - pe_init->p2p_initiator_count++; - phb_target->p2p_target_count++; - } else { - if (!pe_init->p2p_initiator_count || - !phb_target->p2p_target_count) { - rc = -EINVAL; - goto out; - } - - if (--pe_init->p2p_initiator_count == 0) - pnv_pci_ioda2_set_bypass(pe_init, true); - - if (--phb_target->p2p_target_count == 0) { - rc = opal_pci_set_p2p(phb_init->opal_id, - phb_target->opal_id, desc, - pe_init->pe_number); - if (rc != OPAL_SUCCESS) { - rc = -EIO; - goto out; - } - } - } - rc = 0; -out: - mutex_unlock(&p2p_mutex); - return rc; -} -EXPORT_SYMBOL_GPL(pnv_pci_set_p2p); - -struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - - return of_node_get(hose->dn); -} -EXPORT_SYMBOL(pnv_pci_get_phb_node); - -int pnv_pci_enable_tunnel(struct pci_dev *dev, u64 *asnind) -{ - struct device_node *np; - const __be32 *prop; - struct pnv_ioda_pe *pe; - uint16_t window_id; - int rc; - - if (!radix_enabled()) - return -ENXIO; - - if (!(np = pnv_pci_get_phb_node(dev))) - return -ENXIO; - - prop = of_get_property(np, "ibm,phb-indications", NULL); - of_node_put(np); - - if (!prop || !prop[1]) - return -ENXIO; - - *asnind = (u64)be32_to_cpu(prop[1]); - pe = pnv_ioda_get_pe(dev); - if (!pe) - return -ENODEV; - - /* Increase real window size to accept as_notify messages. */ - window_id = (pe->pe_number << 1 ) + 1; - rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, pe->pe_number, - window_id, pe->tce_bypass_base, - (uint64_t)1 << 48); - return opal_error_code(rc); -} -EXPORT_SYMBOL_GPL(pnv_pci_enable_tunnel); - -int pnv_pci_disable_tunnel(struct pci_dev *dev) -{ - struct pnv_ioda_pe *pe; - - pe = pnv_ioda_get_pe(dev); - if (!pe) - return -ENODEV; - - /* Restore default real window size. */ - pnv_pci_ioda2_set_bypass(pe, true); - return 0; -} -EXPORT_SYMBOL_GPL(pnv_pci_disable_tunnel); - -int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable) -{ - __be64 val; - struct pci_controller *hose; - struct pnv_phb *phb; - u64 tunnel_bar; - int rc; - - if (!opal_check_token(OPAL_PCI_GET_PBCQ_TUNNEL_BAR)) - return -ENXIO; - if (!opal_check_token(OPAL_PCI_SET_PBCQ_TUNNEL_BAR)) - return -ENXIO; - - hose = pci_bus_to_host(dev->bus); - phb = hose->private_data; - - mutex_lock(&tunnel_mutex); - rc = opal_pci_get_pbcq_tunnel_bar(phb->opal_id, &val); - if (rc != OPAL_SUCCESS) { - rc = -EIO; - goto out; - } - tunnel_bar = be64_to_cpu(val); - if (enable) { - /* - * Only one device per PHB can use atomics. - * Our policy is first-come, first-served. - */ - if (tunnel_bar) { - if (tunnel_bar != addr) - rc = -EBUSY; - else - rc = 0; /* Setting same address twice is ok */ - goto out; - } - } else { - /* - * The device that owns atomics and wants to release - * them must pass the same address with enable == 0. 
- */ - if (tunnel_bar != addr) { - rc = -EPERM; - goto out; - } - addr = 0x0ULL; - } - rc = opal_pci_set_pbcq_tunnel_bar(phb->opal_id, addr); - rc = opal_error_code(rc); -out: - mutex_unlock(&tunnel_mutex); - return rc; -} -EXPORT_SYMBOL_GPL(pnv_pci_set_tunnel_bar); - -#ifdef CONFIG_PPC64 /* for thread.tidr */ -int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid, u32 *pid, - u32 *tid) -{ - struct mm_struct *mm = NULL; - - if (task == NULL) - return -EINVAL; - - mm = get_task_mm(task); - if (mm == NULL) - return -EINVAL; - - *pid = mm->context.id; - mmput(mm); - - *tid = task->thread.tidr; - *lpid = mfspr(SPRN_LPID); - return 0; -} -EXPORT_SYMBOL_GPL(pnv_pci_get_as_notify_info); -#endif - void pnv_pci_shutdown(void) { struct pci_controller *hose; @@ -1079,7 +753,7 @@ void pnv_pci_shutdown(void) /* Fixup wrong class code in p7ioc and p8 root complex */ static void pnv_p7ioc_rc_quirk(struct pci_dev *dev) { - dev->class = PCI_CLASS_BRIDGE_PCI << 8; + dev->class = PCI_CLASS_BRIDGE_PCI_NORMAL; } DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk); @@ -1093,10 +767,22 @@ void __init pnv_pci_init(void) if (!firmware_has_feature(FW_FEATURE_OPAL)) return; - /* Look for IODA IO-Hubs. */ - for_each_compatible_node(np, NULL, "ibm,ioda-hub") { - pnv_pci_init_ioda_hub(np); - } +#ifdef CONFIG_PCIEPORTBUS + /* + * On PowerNV PCIe devices are (currently) managed in cooperation + * with firmware. This isn't *strictly* required, but there's enough + * assumptions baked into both firmware and the platform code that + * it's unwise to allow the portbus services to be used. + * + * We need to fix this eventually, but for now set this flag to disable + * the portbus driver. The AER service isn't required since that AER + * events are handled via EEH. The pciehp hotplug driver can't work + * without kernel changes (and portbus binding breaks pnv_php). The + * other services also require some thinking about how we're going + * to integrate them. + */ + pcie_ports_disabled = true; +#endif /* Look for ioda2 built-in PHB3's */ for_each_compatible_node(np, NULL, "ibm,ioda2-phb") @@ -1106,17 +792,6 @@ void __init pnv_pci_init(void) for_each_compatible_node(np, NULL, "ibm,ioda3-phb") pnv_pci_init_ioda2_phb(np); - /* Look for NPU PHBs */ - for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb") - pnv_pci_init_npu_phb(np); - - /* - * Look for NPU2 PHBs which we treat mostly as NPU PHBs with - * the exception of TCE kill which requires an OPAL call. 
- */ - for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb") - pnv_pci_init_npu_phb(np); - /* Look for NPU2 OpenCAPI PHBs */ for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-opencapi-phb") pnv_pci_init_npu2_opencapi_phb(np); @@ -1124,46 +799,3 @@ void __init pnv_pci_init(void) /* Configure IOMMU DMA hooks */ set_pci_dma_ops(&dma_iommu_ops); } - -static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct device *dev = data; - struct pci_dev *pdev; - struct pci_dn *pdn; - struct pnv_ioda_pe *pe; - struct pci_controller *hose; - struct pnv_phb *phb; - - switch (action) { - case BUS_NOTIFY_ADD_DEVICE: - pdev = to_pci_dev(dev); - pdn = pci_get_pdn(pdev); - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - - WARN_ON_ONCE(!phb); - if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb) - return 0; - - pe = &phb->ioda.pe_array[pdn->pe_number]; - iommu_add_device(&pe->table_group, dev); - return 0; - case BUS_NOTIFY_DEL_DEVICE: - iommu_del_device(dev); - return 0; - default: - return 0; - } -} - -static struct notifier_block pnv_tce_iommu_bus_nb = { - .notifier_call = pnv_tce_iommu_bus_notifier, -}; - -static int __init pnv_tce_iommu_bus_notifier_init(void) -{ - bus_register_notifier(&pci_bus_type, &pnv_tce_iommu_bus_nb); - return 0; -} -machine_subsys_initcall_sync(powernv, pnv_tce_iommu_bus_notifier_init); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 8e36da379252..42075501663b 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -2,6 +2,7 @@ #ifndef __POWERNV_PCI_H #define __POWERNV_PCI_H +#include <linux/compiler.h> /* for __printf */ #include <linux/iommu.h> #include <asm/iommu.h> #include <asm/msi_bitmap.h> @@ -9,10 +10,8 @@ struct pci_dn; enum pnv_phb_type { - PNV_PHB_IODA1 = 0, - PNV_PHB_IODA2 = 1, - PNV_PHB_NPU_NVLINK = 2, - PNV_PHB_NPU_OCAPI = 3, + PNV_PHB_IODA2, + PNV_PHB_NPU_OCAPI, }; /* Precise PHB model for error management */ @@ -20,8 +19,6 @@ enum pnv_phb_model { PNV_PHB_MODEL_UNKNOWN, PNV_PHB_MODEL_P7IOC, PNV_PHB_MODEL_PHB3, - PNV_PHB_MODEL_NPU, - PNV_PHB_MODEL_NPU2, }; #define PNV_PCI_DIAG_BUF_SIZE 8192 @@ -32,6 +29,24 @@ enum pnv_phb_model { #define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */ #define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */ +/* + * A brief note on PNV_IODA_PE_BUS_ALL + * + * This is needed because of the behaviour of PCIe-to-PCI bridges. The PHB uses + * the Requester ID field of the PCIe request header to determine the device + * (and PE) that initiated a DMA. In legacy PCI individual memory read/write + * requests aren't tagged with the RID. To work around this the PCIe-to-PCI + * bridge will use (secondary_bus_no << 8) | 0x00 as the RID on the PCIe side. + * + * PCIe-to-X bridges have a similar issue even though PCI-X requests also have + * a RID in the transaction header. The PCIe-to-X bridge is permitted to "take + * ownership" of a transaction by a PCI-X device when forwarding it to the PCIe + * side of the bridge. + * + * To work around these problems we use the BUS_ALL flag since every subordinate + * bus of the bridge should go into the same PE. + */ + /* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. 
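To put some numbers on the PNV_IODA_PE_BUS_ALL note above, here is a rough user-space sketch (illustrative values only) of how a Requester ID is formed and why every device behind a PCIe-to-PCI bridge collapses onto the bridge's secondary-bus RID as far as the PHB is concerned:

#include <stdint.h>
#include <stdio.h>

/* RID layout used by the PHB: bus in bits 15:8, devfn in bits 7:0 */
static uint16_t rid(uint8_t bus, uint8_t devfn)
{
	return ((uint16_t)bus << 8) | devfn;
}

int main(void)
{
	/*
	 * A conventional PCI device at 05:03.0 sits behind a PCIe-to-PCI
	 * bridge whose secondary bus is 5.  Its memory reads/writes carry
	 * no RID, so the bridge forwards them as (secondary_bus << 8) | 0.
	 */
	uint8_t devfn = (0x03 << 3) | 0x0;	/* device 3, function 0 */

	printf("config-space RID: %04x\n", rid(0x05, devfn));	/* 0518 */
	printf("RID seen on DMA : %04x\n", rid(0x05, 0x00));	/* 0500 */

	/*
	 * Every device on bus 5 (and any bus below it) produces the same
	 * DMA RID, which is why the whole subtree shares one PE.
	 */
	return 0;
}
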
*/ #define PNV_IODA_STOPPED_STATE 0x8000000000000000 @@ -62,13 +77,19 @@ struct pnv_ioda_pe { /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */ struct iommu_table_group table_group; - struct npu_comp *npucomp; /* 64-bit TCE bypass region */ bool tce_bypass_enabled; uint64_t tce_bypass_base; - /* MSIs. MVE index is identical for for 32 and 64 bit MSI + /* + * Used to track whether we've done DMA setup for this PE or not. We + * want to defer allocating TCE tables, etc until we've added a + * non-bridge device to the PE. + */ + bool dma_setup_done; + + /* MSIs. MVE index is identical for 32 and 64 bit MSI * and -1 if not supported. (It's actually identical to the * PE number) */ @@ -78,9 +99,6 @@ struct pnv_ioda_pe { struct pnv_ioda_pe *master; struct list_head slaves; - /* PCI peer-to-peer*/ - int p2p_initiator_count; - /* Link in list of PE#s */ struct list_head list; }; @@ -96,7 +114,6 @@ struct pnv_phb { int flags; void __iomem *regs; u64 regs_phys; - int initialized; spinlock_t lock; #ifdef CONFIG_DEBUG_FS @@ -105,12 +122,7 @@ struct pnv_phb { #endif unsigned int msi_base; - unsigned int msi32_support; struct msi_bitmap msi_bmp; - int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg); - void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev); int (*init_m64)(struct pnv_phb *phb); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); @@ -121,7 +133,6 @@ struct pnv_phb { unsigned int total_pe_num; unsigned int reserved_pe_idx; unsigned int root_pe_idx; - bool root_pe_populated; /* 32-bit MMIO window */ unsigned int m32_size; @@ -133,6 +144,7 @@ struct pnv_phb { unsigned long m64_size; unsigned long m64_segsize; unsigned long m64_base; +#define MAX_M64_BARS 64 unsigned long m64_bar_alloc; /* IO ports */ @@ -150,12 +162,7 @@ struct pnv_phb { unsigned int *m32_segmap; unsigned int *io_segmap; - /* DMA32 segment maps - IODA1 only */ - unsigned int dma32_count; - unsigned int *dma32_segmap; - /* IRQ chip */ - int irq_chip_init; struct irq_chip irq_chip; /* Sorted list of used PE's based @@ -171,10 +178,91 @@ struct pnv_phb { /* PHB and hub diagnostics */ unsigned int diag_data_size; u8 *diag_data; +}; + + +/* IODA PE management */ + +static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r) +{ + /* + * WARNING: We cannot rely on the resource flags. The Linux PCI + * allocation code sometimes decides to put a 64-bit prefetchable + * BAR in the 32-bit window, so we have to compare the addresses. + * + * For simplicity we only test resource start. + */ + return (r->start >= phb->ioda.m64_base && + r->start < (phb->ioda.m64_base + phb->ioda.m64_size)); +} + +static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags) +{ + unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); + + return (resource_flags & flags) == flags; +} + +int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); +int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); + +void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe); +void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe); + +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count); +void pnv_ioda_free_pe(struct pnv_ioda_pe *pe); - int p2p_target_count; +#ifdef CONFIG_PCI_IOV +/* + * For SR-IOV we want to put each VF's MMIO resource in to a separate PE. 
+ * This requires a bit of acrobatics with the MMIO -> PE configuration + * and this structure is used to keep track of it all. + */ +struct pnv_iov_data { + /* number of VFs enabled */ + u16 num_vfs; + + /* pointer to the array of VF PEs. num_vfs long*/ + struct pnv_ioda_pe *vf_pe_arr; + + /* Did we map the VF BAR with single-PE IODA BARs? */ + bool m64_single_mode[PCI_SRIOV_NUM_BARS]; + + /* + * True if we're using any segmented windows. In that case we need + * shift the start of the IOV resource the segment corresponding to + * the allocated PE. + */ + bool need_shift; + + /* + * Bit mask used to track which m64 windows are used to map the + * SR-IOV BARs for this device. + */ + DECLARE_BITMAP(used_m64_bar_mask, MAX_M64_BARS); + + /* + * If we map the SR-IOV BARs with a segmented window then + * parts of that window will be "claimed" by other PEs. + * + * "holes" here is used to reserve the leading portion + * of the window that is used by other (non VF) PEs. + */ + struct resource holes[PCI_SRIOV_NUM_BARS]; }; +static inline struct pnv_iov_data *pnv_iov_get(struct pci_dev *pdev) +{ + return pdev->dev.archdata.iov_data; +} + +void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev); +resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno); + +int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs); +int pnv_pcibios_sriov_disable(struct pci_dev *pdev); +#endif /* CONFIG_PCI_IOV */ + extern struct pci_ops pnv_pci_ops; void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, @@ -185,25 +273,18 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, int where, int size, u32 val); extern struct iommu_table *pnv_pci_table_alloc(int nid); -extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); -extern void pnv_pci_init_npu_phb(struct device_node *np); extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np); -extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); -extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev); -extern void pnv_pci_dma_bus_setup(struct pci_bus *bus); -extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); -extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); +extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); -extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); -extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels); extern int pnv_eeh_post_init(void); +__printf(3, 4) extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...); #define pe_err(pe, fmt, ...) \ @@ -213,17 +294,8 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, #define pe_info(pe, fmt, ...) 
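As a rough illustration of the need_shift and holes fields above (made-up numbers, not from any real machine): with a segmented M64 window each segment belongs to the PE with the same index, so the VF BARs have to be shifted up to the segment of the first VF PE, and the leading part of the window is reserved as a hole for the other (non-VF) PEs.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* illustrative values only */
	uint64_t window  = 0x3f00000000ULL;	/* start of the M64 window */
	uint64_t segsize = 0x08000000ULL;	/* one 128M segment per PE */
	unsigned int first_vf_pe = 40;		/* PE# assigned to VF0 */

	/* segment n of the window belongs to PE n, so shift the IOV BAR
	 * up by first_vf_pe segments... */
	uint64_t shift = (uint64_t)first_vf_pe * segsize;

	printf("IOV BAR start: 0x%llx\n",
	       (unsigned long long)(window + shift));

	/* ...and the leading part of the window becomes a "hole" owned
	 * by the non-VF PEs */
	printf("hole         : 0x%llx-0x%llx\n",
	       (unsigned long long)window,
	       (unsigned long long)(window + shift - 1));
	return 0;
}
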
\ pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) -/* Nvlink functions */ -extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); -extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); -extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); -extern struct iommu_table_group *pnv_try_setup_npu_table_group( - struct pnv_ioda_pe *pe); -extern struct iommu_table_group *pnv_npu_compound_attach( - struct pnv_ioda_pe *pe); - /* pci-ioda-tce.c */ -#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_DEFAULT_LEVELS 2 #define POWERNV_IOMMU_MAX_LEVELS 5 extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, @@ -231,8 +303,7 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long attrs); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction, - bool alloc); + unsigned long *hpa, enum dma_data_direction *direction); extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); @@ -251,4 +322,16 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, void *tce_mem, u64 tce_size, u64 dma_offset, unsigned int page_shift); +extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb); + +static inline struct pnv_phb *pci_bus_to_pnvhb(struct pci_bus *bus) +{ + struct pci_controller *hose = bus->sysdata; + + if (hose) + return hose->private_data; + + return NULL; +} + #endif /* __POWERNV_PCI_H */ diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index fd4a1c5a6369..866efdc103fd 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -2,6 +2,13 @@ #ifndef _POWERNV_H #define _POWERNV_H +/* + * There's various hacks scattered throughout the generic powerpc arch code + * that needs to call into powernv platform stuff. The prototypes for those + * functions are in asm/powernv.h + */ +#include <asm/powernv.h> + #ifdef CONFIG_SMP extern void pnv_smp_init(void); #else @@ -30,4 +37,11 @@ extern void opal_event_shutdown(void); bool cpu_core_split_required(void); +struct memcons; +ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count); +u32 __init memcons_get_size(struct memcons *mc); +struct memcons *__init memcons_init(struct device_node *node, const char *mc_prop_name); + +void pnv_rng_init(void); + #endif /* _POWERNV_H */ diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c index 718f50ed22f1..196aa70fe043 100644 --- a/arch/powerpc/platforms/powernv/rng.c +++ b/arch/powerpc/platforms/powernv/rng.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2013, Michael Ellerman, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ #define pr_fmt(fmt) "powernv-rng: " fmt @@ -21,33 +17,28 @@ #include <asm/prom.h> #include <asm/machdep.h> #include <asm/smp.h> +#include "powernv.h" #define DARN_ERR 0xFFFFFFFFFFFFFFFFul -struct powernv_rng { +struct pnv_rng { void __iomem *regs; void __iomem *regs_real; unsigned long mask; }; -static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng); +static DEFINE_PER_CPU(struct pnv_rng *, pnv_rng); - -int powernv_hwrng_present(void) -{ - struct powernv_rng *rng; - - rng = get_cpu_var(powernv_rng); - put_cpu_var(rng); - return rng != NULL; -} - -static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) +static unsigned long rng_whiten(struct pnv_rng *rng, unsigned long val) { unsigned long parity; /* Calculate the parity of the value */ - asm ("popcntd %0,%1" : "=r" (parity) : "r" (val)); + asm (".machine push; \ + .machine power7; \ + popcntd %0,%1; \ + .machine pop;" + : "=r" (parity) : "r" (val)); /* xor our value with the previous mask */ val ^= rng->mask; @@ -58,18 +49,7 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val) return val; } -int powernv_get_random_real_mode(unsigned long *v) -{ - struct powernv_rng *rng; - - rng = raw_cpu_read(powernv_rng); - - *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); - - return 1; -} - -int powernv_get_random_darn(unsigned long *v) +static int pnv_get_random_darn(unsigned long *v) { unsigned long val; @@ -84,7 +64,7 @@ int powernv_get_random_darn(unsigned long *v) return 1; } -static int initialise_darn(void) +static int __init initialise_darn(void) { unsigned long val; int i; @@ -93,32 +73,31 @@ static int initialise_darn(void) return -ENODEV; for (i = 0; i < 10; i++) { - if (powernv_get_random_darn(&val)) { - ppc_md.get_random_seed = powernv_get_random_darn; + if (pnv_get_random_darn(&val)) { + ppc_md.get_random_seed = pnv_get_random_darn; return 0; } } - - pr_warn("Unable to use DARN for get_random_seed()\n"); - return -EIO; } -int powernv_get_random_long(unsigned long *v) +int pnv_get_random_long(unsigned long *v) { - struct powernv_rng *rng; - - rng = get_cpu_var(powernv_rng); - - *v = rng_whiten(rng, in_be64(rng->regs)); - - put_cpu_var(rng); - + struct pnv_rng *rng; + + if (mfmsr() & MSR_DR) { + rng = get_cpu_var(pnv_rng); + *v = rng_whiten(rng, in_be64(rng->regs)); + put_cpu_var(rng); + } else { + rng = raw_cpu_read(pnv_rng); + *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); + } return 1; } -EXPORT_SYMBOL_GPL(powernv_get_random_long); +EXPORT_SYMBOL_GPL(pnv_get_random_long); -static __init void rng_init_per_cpu(struct powernv_rng *rng, +static __init void rng_init_per_cpu(struct pnv_rng *rng, struct device_node *dn) { int chip_id, cpu; @@ -128,16 +107,16 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng, pr_warn("No ibm,chip-id found for %pOF.\n", dn); for_each_possible_cpu(cpu) { - if (per_cpu(powernv_rng, cpu) == NULL || + if (per_cpu(pnv_rng, cpu) == NULL || cpu_to_chip_id(cpu) == chip_id) { - per_cpu(powernv_rng, cpu) = rng; + per_cpu(pnv_rng, cpu) = rng; } } } static __init int rng_create(struct device_node *dn) { - struct powernv_rng *rng; + struct pnv_rng *rng; struct resource res; unsigned long val; @@ -163,32 +142,59 @@ static __init int rng_create(struct device_node *dn) rng_init_per_cpu(rng, dn); - pr_info_once("Registering arch random hook.\n"); - - ppc_md.get_random_seed = powernv_get_random_long; + ppc_md.get_random_seed = pnv_get_random_long; return 0; } -static __init int rng_init(void) +static int __init pnv_get_random_long_early(unsigned long *v) { 
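The rng_whiten() hunk above only wraps popcntd in .machine directives so the assembler accepts it regardless of the build's default -mcpu; the whitening scheme itself is unchanged. A stand-alone sketch of that scheme, using the compiler's popcount builtin and assuming the usual (mask << 1) | parity update for the part of the function that lies outside the visible hunk:

#include <stdint.h>
#include <stdio.h>

static uint64_t mask;	/* plays the role of pnv_rng::mask */

static uint64_t whiten(uint64_t val)
{
	/* parity of the raw sample (popcntd in the kernel) */
	uint64_t parity = __builtin_popcountll(val) & 1;

	/* xor the sample with the previous mask... */
	val ^= mask;

	/* ...then fold the parity in for the next call (assumed update,
	 * the real statement sits outside the visible hunk) */
	mask = (mask << 1) | parity;

	return val;
}

int main(void)
{
	uint64_t raw[] = { 0xdeadbeefULL, 0x12345678ULL, 0xcafef00dULL };

	for (unsigned int i = 0; i < 3; i++)
		printf("0x%016llx -> 0x%016llx\n",
		       (unsigned long long)raw[i],
		       (unsigned long long)whiten(raw[i]));
	return 0;
}
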
struct device_node *dn; - int rc; - - for_each_compatible_node(dn, NULL, "ibm,power-rng") { - rc = rng_create(dn); - if (rc) { - pr_err("Failed creating rng for %pOF (%d).\n", - dn, rc); - continue; - } - /* Create devices for hwrng driver */ - of_platform_device_create(dn, NULL, NULL); - } + if (!slab_is_available()) + return 0; + + if (cmpxchg(&ppc_md.get_random_seed, pnv_get_random_long_early, + NULL) != pnv_get_random_long_early) + return 0; + + for_each_compatible_node(dn, NULL, "ibm,power-rng") + rng_create(dn); + + if (!ppc_md.get_random_seed) + return 0; + return ppc_md.get_random_seed(v); +} + +void __init pnv_rng_init(void) +{ + struct device_node *dn; + + /* Prefer darn over the rest. */ + if (!initialise_darn()) + return; + + dn = of_find_compatible_node(NULL, NULL, "ibm,power-rng"); + if (dn) + ppc_md.get_random_seed = pnv_get_random_long_early; + + of_node_put(dn); +} - initialise_darn(); +static int __init pnv_rng_late_init(void) +{ + struct device_node *dn; + unsigned long v; + + /* In case it wasn't called during init for some other reason. */ + if (ppc_md.get_random_seed == pnv_get_random_long_early) + pnv_get_random_long_early(&v); + + if (ppc_md.get_random_seed == pnv_get_random_long) { + for_each_compatible_node(dn, NULL, "ibm,power-rng") + of_platform_device_create(dn, NULL, NULL); + } return 0; } -machine_subsys_initcall(powernv, rng_init); +machine_subsys_initcall(powernv, pnv_rng_late_init); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 14befee4b3f1..4dbb47ddbdcc 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * PowerNV setup code. * * Copyright 2011 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
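pnv_get_random_long_early() above relies on cmpxchg() over ppc_md.get_random_seed so that exactly one of the racing callers does the "ibm,power-rng" probing while the others simply report no entropy. A rough user-space rendering of that one-shot pattern with C11 atomics (hypothetical names, stand-in hardware read):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef bool (*seed_fn)(unsigned long *v);

/* stand-in for the real per-chip RNG read */
static bool hw_seed(unsigned long *v) { *v = 0x1234; return true; }

static bool early_seed(unsigned long *v);
static _Atomic seed_fn seed_hook = early_seed;

static bool early_seed(unsigned long *v)
{
	seed_fn expected = early_seed;

	/* whoever wins this swap does the one-time probing; everyone
	 * else just says "no entropy this time" */
	if (!atomic_compare_exchange_strong(&seed_hook, &expected, (seed_fn)0))
		return false;

	/* ...probe the hardware here, then publish the real hook... */
	atomic_store(&seed_hook, hw_seed);
	return hw_seed(v);
}

int main(void)
{
	unsigned long v = 0;
	seed_fn fn = atomic_load(&seed_hook);
	int ok = fn && fn(&v);

	printf("seeded: %d, v=0x%lx\n", ok, v);
	return 0;
}
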
*/ #undef DEBUG @@ -21,6 +17,7 @@ #include <linux/console.h> #include <linux/delay.h> #include <linux/irq.h> +#include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/of.h> #include <linux/of_fdt.h> @@ -28,6 +25,7 @@ #include <linux/bug.h> #include <linux/pci.h> #include <linux/cpufreq.h> +#include <linux/memblock.h> #include <asm/machdep.h> #include <asm/firmware.h> @@ -43,7 +41,7 @@ #include "powernv.h" -static bool fw_feature_is(const char *state, const char *name, +static bool __init fw_feature_is(const char *state, const char *name, struct device_node *fw_features) { struct device_node *np; @@ -58,7 +56,7 @@ static bool fw_feature_is(const char *state, const char *name, return rc; } -static void init_fw_feat_flags(struct device_node *np) +static void __init init_fw_feat_flags(struct device_node *np) { if (fw_feature_is("enabled", "inst-spec-barrier-ori31,31,0", np)) security_ftr_set(SEC_FTR_SPEC_BAR_ORI31); @@ -99,9 +97,18 @@ static void init_fw_feat_flags(struct device_node *np) if (fw_feature_is("disabled", "needs-spec-barrier-for-bound-checks", np)) security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR); + + if (fw_feature_is("enabled", "no-need-l1d-flush-msr-pr-1-to-0", np)) + security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY); + + if (fw_feature_is("enabled", "no-need-l1d-flush-kernel-on-user-access", np)) + security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS); + + if (fw_feature_is("enabled", "no-need-store-drain-on-priv-state-switch", np)) + security_ftr_clear(SEC_FTR_STF_BARRIER); } -static void pnv_setup_rfi_flush(void) +static void __init pnv_setup_security_mitigations(void) { struct device_node *np, *fw_features; enum l1d_flush_type type; @@ -125,27 +132,68 @@ static void pnv_setup_rfi_flush(void) type = L1D_FLUSH_ORI; } + /* + * The issues addressed by the entry and uaccess flush don't affect P7 + * or P8, so on bare metal disable them explicitly in case firmware does + * not include the features to disable them. POWER9 and newer processors + * should have the appropriate firmware flags. + */ + if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p) || + pvr_version_is(PVR_POWER8E) || pvr_version_is(PVR_POWER8NVL) || + pvr_version_is(PVR_POWER8)) { + security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY); + security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS); + } + enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && \ (security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR) || \ security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV)); setup_rfi_flush(type, enable); setup_count_cache_flush(); + + enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && + security_ftr_enabled(SEC_FTR_L1D_FLUSH_ENTRY); + setup_entry_flush(enable); + + enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && + security_ftr_enabled(SEC_FTR_L1D_FLUSH_UACCESS); + setup_uaccess_flush(enable); + + setup_stf_barrier(); +} + +static void __init pnv_check_guarded_cores(void) +{ + struct device_node *dn; + int bad_count = 0; + + for_each_node_by_type(dn, "cpu") { + if (of_property_match_string(dn, "status", "bad") >= 0) + bad_count++; + } + + if (bad_count) { + printk(" _ _______________\n"); + pr_cont(" | | / \\\n"); + pr_cont(" | | | WARNING! 
|\n"); + pr_cont(" | | | |\n"); + pr_cont(" | | | It looks like |\n"); + pr_cont(" |_| | you have %*d |\n", 3, bad_count); + pr_cont(" _ | guarded cores |\n"); + pr_cont(" (_) \\_______________/\n"); + } } static void __init pnv_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); - pnv_setup_rfi_flush(); - setup_stf_barrier(); + pnv_setup_security_mitigations(); /* Initialize SMP */ pnv_smp_init(); - /* Setup PCI */ - pnv_pci_init(); - /* Setup RTC and NVRAM callbacks */ if (firmware_has_feature(FW_FEATURE_OPAL)) opal_nvram_init(); @@ -153,11 +201,36 @@ static void __init pnv_setup_arch(void) /* Enable NAP mode */ powersave_nap = 1; + pnv_check_guarded_cores(); + /* XXX PMCS */ + + pnv_rng_init(); +} + +static void __init pnv_add_hw_description(void) +{ + struct device_node *dn; + const char *s; + + dn = of_find_node_by_path("/ibm,opal/firmware"); + if (!dn) + return; + + if (of_property_read_string(dn, "version", &s) == 0 || + of_property_read_string(dn, "git-id", &s) == 0) + seq_buf_printf(&ppc_hw_desc, "opal:%s ", s); + + if (of_property_read_string(dn, "mi-version", &s) == 0) + seq_buf_printf(&ppc_hw_desc, "mi:%s ", s); + + of_node_put(dn); } static void __init pnv_init(void) { + pnv_add_hw_description(); + /* * Initialize the LPC bus now so that legacy serial * ports can be found on it @@ -170,6 +243,21 @@ static void __init pnv_init(void) else #endif add_preferred_console("hvc", 0, NULL); + +#ifdef CONFIG_PPC_64S_HASH_MMU + if (!radix_enabled()) { + size_t size = sizeof(struct slb_entry) * mmu_slb_size; + int i; + + /* Allocate per cpu area to save old slb contents during MCE */ + for_each_possible_cpu(i) { + paca_ptrs[i]->mce_faulty_slbs = + memblock_alloc_node(size, + __alignof__(struct slb_entry), + cpu_to_node(i)); + } + } +#endif } static void __init pnv_init_IRQ(void) @@ -224,10 +312,16 @@ static void __noreturn pnv_restart(char *cmd) pnv_prepare_going_down(); do { - if (!cmd) + if (!cmd || !strlen(cmd)) rc = opal_cec_reboot(); else if (strcmp(cmd, "full") == 0) rc = opal_cec_reboot2(OPAL_REBOOT_FULL_IPL, NULL); + else if (strcmp(cmd, "mpipl") == 0) + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, NULL); + else if (strcmp(cmd, "error") == 0) + rc = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, NULL); + else if (strcmp(cmd, "fast") == 0) + rc = opal_cec_reboot2(OPAL_REBOOT_FAST, NULL); else rc = OPAL_UNSUPPORTED; @@ -385,10 +479,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) } #endif /* CONFIG_KEXEC_CORE */ -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG static unsigned long pnv_memory_block_size(void) { - return 256UL * 1024 * 1024; + return memory_block_size; } #endif @@ -401,15 +495,15 @@ static void __init pnv_setup_machdep_opal(void) /* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */ ppc_md.machine_check_exception = opal_machine_check; ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery; - ppc_md.hmi_exception_early = opal_hmi_exception_early; + if (opal_check_token(OPAL_HANDLE_HMI2)) + ppc_md.hmi_exception_early = opal_hmi_exception_early2; + else + ppc_md.hmi_exception_early = opal_hmi_exception_early; ppc_md.handle_hmi_exception = opal_handle_hmi_exception; } static int __init pnv_probe(void) { - if (!of_machine_is_compatible("ibm,powernv")) - return 0; - if (firmware_has_feature(FW_FEATURE_OPAL)) pnv_setup_machdep_opal(); @@ -473,20 +567,21 @@ static long pnv_machine_check_early(struct pt_regs *regs) define_machine(powernv) { .name = "PowerNV", + .compatible = "ibm,powernv", .probe = 
pnv_probe, .setup_arch = pnv_setup_arch, .init_IRQ = pnv_init_IRQ, .show_cpuinfo = pnv_show_cpuinfo, .get_proc_freq = pnv_get_proc_freq, + .discover_phbs = pnv_pci_init, .progress = pnv_progress, .machine_shutdown = pnv_shutdown, .power_save = NULL, - .calibrate_decr = generic_calibrate_decr, .machine_check_early = pnv_machine_check_early, #ifdef CONFIG_KEXEC_CORE .kexec_cpu_down = pnv_kexec_cpu_down, #endif -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG .memory_block_size = pnv_memory_block_size, #endif }; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 0d354e19ef92..8f41ef364fc6 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * SMP support for PowerNV machines. * * Copyright 2011 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> @@ -32,13 +28,15 @@ #include <asm/xive.h> #include <asm/opal.h> #include <asm/runlatch.h> -#include <asm/code-patching.h> +#include <asm/text-patching.h> #include <asm/dbell.h> #include <asm/kvm_ppc.h> #include <asm/ppc-opcode.h> #include <asm/cpuidle.h> #include <asm/kexec.h> #include <asm/reg.h> +#include <asm/powernv.h> +#include <asm/systemcfg.h> #include "powernv.h" @@ -46,7 +44,7 @@ #include <asm/udbg.h> #define DBG(fmt...) udbg_printf(fmt) #else -#define DBG(fmt...) +#define DBG(fmt...) do { } while (0) #endif static void pnv_smp_setup_cpu(int cpu) @@ -139,32 +137,42 @@ static int pnv_smp_cpu_disable(void) * the generic fixup_irqs. --BenH. */ set_cpu_online(cpu, false); - vdso_data->processorCount--; +#ifdef CONFIG_PPC64_PROC_SYSTEMCFG + systemcfg->processorCount--; +#endif if (cpu == boot_cpuid) boot_cpuid = cpumask_any(cpu_online_mask); if (xive_enabled()) xive_smp_disable_cpu(); else xics_migrate_irqs_away(); + + cleanup_cpu_mmu_context(); + return 0; } -static void pnv_smp_cpu_kill_self(void) +static void pnv_flush_interrupts(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (xive_enabled()) + xive_flush_interrupt(); + else + icp_opal_flush_interrupt(); + } else { + icp_native_flush_interrupt(); + } +} + +static void pnv_cpu_offline_self(void) { + unsigned long srr1, unexpected_mask, wmask; unsigned int cpu; - unsigned long srr1, wmask; + u64 lpcr_val; /* Standard hot unplug procedure */ - /* - * This hard disables local interurpts, ensuring we have no lazy - * irqs pending. - */ - WARN_ON(irqs_disabled()); - hard_irq_disable(); - WARN_ON(lazy_irq_pending()); idle_task_exit(); - current->active_mm = NULL; /* for sanity */ cpu = smp_processor_id(); DBG("CPU%d offline\n", cpu); generic_set_cpu_dead(cpu); @@ -174,6 +182,40 @@ static void pnv_smp_cpu_kill_self(void) if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; + /* + * This turns the irq soft-disabled state we're called with, into a + * hard-disabled state with pending irq_happened interrupts cleared. + * + * PACA_IRQ_DEC - Decrementer should be ignored. + * PACA_IRQ_HMI - Can be ignored, processing is done in real mode. + * PACA_IRQ_DBELL, EE, PMI - Unexpected. 
+ */ + hard_irq_disable(); + if (generic_check_cpu_restart(cpu)) + goto out; + + unexpected_mask = ~(PACA_IRQ_DEC | PACA_IRQ_HMI | PACA_IRQ_HARD_DIS); + if (local_paca->irq_happened & unexpected_mask) { + if (local_paca->irq_happened & PACA_IRQ_EE) + pnv_flush_interrupts(); + DBG("CPU%d Unexpected exit while offline irq_happened=%lx!\n", + cpu, local_paca->irq_happened); + } + local_paca->irq_happened = PACA_IRQ_HARD_DIS; + + /* + * We don't want to take decrementer interrupts while we are + * offline, so clear LPCR:PECE1. We keep PECE2 (and + * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in. + * + * If the CPU gets woken up by a special wakeup, ensure that + * the SLW engine sets LPCR with decrementer bit cleared, else + * the CPU will come back to the kernel due to a spurious + * wakeup. + */ + lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -182,10 +224,11 @@ static void pnv_smp_cpu_kill_self(void) * for coming online, which are handled via * generic_check_cpu_restart() calls. */ - kvmppc_set_host_ipi(cpu, 0); + kvmppc_clear_host_ipi(cpu); srr1 = pnv_cpu_offline(cpu); + WARN_ON_ONCE(!irqs_disabled()); WARN_ON(lazy_irq_pending()); /* @@ -201,13 +244,7 @@ static void pnv_smp_cpu_kill_self(void) */ if (((srr1 & wmask) == SRR1_WAKEEE) || ((srr1 & wmask) == SRR1_WAKEHVI)) { - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - if (xive_enabled()) - xive_flush_interrupt(); - else - icp_opal_flush_interrupt(); - } else - icp_native_flush_interrupt(); + pnv_flush_interrupts(); } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) { unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); asm volatile(PPC_MSGCLR(%0) : : "r" (msg)); @@ -246,6 +283,16 @@ static void pnv_smp_cpu_kill_self(void) } + /* + * Re-enable decrementer interrupts in LPCR. + * + * Further, we want stop states to be woken up by decrementer + * for non-hotplug cases. So program the LPCR via stop api as + * well. + */ + lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); +out: DBG("CPU%d coming online...\n", cpu); } @@ -301,7 +348,7 @@ static void __init pnv_smp_probe(void) } } -static int pnv_system_reset_exception(struct pt_regs *regs) +noinstr static int pnv_system_reset_exception(struct pt_regs *regs) { if (smp_handle_nmi_ipi(regs)) return 1; @@ -376,6 +423,7 @@ static struct smp_ops_t pnv_smp_ops = { #ifdef CONFIG_HOTPLUG_CPU .cpu_disable = pnv_smp_cpu_disable, .cpu_die = generic_cpu_die, + .cpu_offline_self = pnv_cpu_offline_self, #endif /* CONFIG_HOTPLUG_CPU */ }; @@ -389,8 +437,7 @@ void __init pnv_smp_init(void) smp_ops = &pnv_smp_ops; #ifdef CONFIG_HOTPLUG_CPU - ppc_md.cpu_die = pnv_smp_cpu_kill_self; -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_DUMP crash_wake_offline = 1; #endif #endif diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S index 39bb24aa8f34..e038f6761790 100644 --- a/arch/powerpc/platforms/powernv/subcore-asm.S +++ b/arch/powerpc/platforms/powernv/subcore-asm.S @@ -1,10 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ #include <asm/asm-offsets.h> diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 45563004feda..393e747541fb 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "powernv: " fmt @@ -24,6 +20,8 @@ #include <asm/opal.h> #include <asm/smp.h> +#include <trace/events/ipi.h> + #include "subcore.h" #include "powernv.h" @@ -173,6 +171,16 @@ static void update_hid_in_slw(u64 hid0) } } +static inline void update_power8_hid0(unsigned long hid0) +{ + /* + * The HID0 update on Power8 should at the very least be + * preceded by a SYNC instruction followed by an ISYNC + * instruction + */ + asm volatile("sync; mtspr %0,%1; isync":: "i"(SPRN_HID0), "r"(hid0)); +} + static void unsplit_core(void) { u64 hid0, mask; @@ -183,7 +191,7 @@ static void unsplit_core(void) cpu = smp_processor_id(); if (cpu_thread_in_core(cpu) != 0) { while (mfspr(SPRN_HID0) & mask) - power7_idle_insn(PNV_THREAD_NAP); + power7_idle_type(PNV_THREAD_NAP); per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; return; @@ -409,13 +417,16 @@ static DEVICE_ATTR(subcores_per_core, 0644, static int subcore_init(void) { + struct device *dev_root; unsigned pvr_ver; + int rc = 0; pvr_ver = PVR_VER(mfspr(SPRN_PVR)); if (pvr_ver != PVR_POWER8 && pvr_ver != PVR_POWER8E && - pvr_ver != PVR_POWER8NVL) + pvr_ver != PVR_POWER8NVL && + pvr_ver != PVR_HX_C2000) return 0; /* @@ -429,7 +440,11 @@ static int subcore_init(void) set_subcores_per_core(1); - return device_create_file(cpu_subsys.dev_root, - &dev_attr_subcores_per_core); + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { + rc = device_create_file(dev_root, &dev_attr_subcores_per_core); + put_device(dev_root); + } + return rc; } machine_device_initcall(powernv, subcore_init); diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h index 84e02ae52895..413fd85d9bc2 100644 --- a/arch/powerpc/platforms/powernv/subcore.h +++ b/arch/powerpc/platforms/powernv/subcore.h @@ -1,10 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright 2013, Michael Ellerman, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
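subcore_init() above registers the subcores_per_core attribute on the cpu subsystem root, so it shows up as /sys/devices/system/cpu/subcores_per_core. A minimal user-space sketch of asking for two subcores per core (the split modes are 1, 2 or 4, and the write is generally rejected unless the secondary SMT threads are offline):

#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *path = "/sys/devices/system/cpu/subcores_per_core";
	FILE *f = fopen(path, "w");

	if (!f) {
		fprintf(stderr, "%s: %s\n", path, strerror(errno));
		return 1;
	}

	/* valid values are 1, 2 or 4 subcores per core */
	if (fprintf(f, "2\n") < 0 || fclose(f) != 0) {
		fprintf(stderr, "write failed: %s\n", strerror(errno));
		return 1;
	}
	return 0;
}
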
*/ /* These are ordered and tested with <= */ @@ -13,13 +9,13 @@ #define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */ #define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_SMP void split_core_secondary_loop(u8 *state); extern void update_subcore_sibling_mask(void); #else -static inline void update_subcore_sibling_mask(void) { }; +static inline void update_subcore_sibling_mask(void) { } #endif /* CONFIG_SMP */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c new file mode 100644 index 000000000000..c526871a1229 --- /dev/null +++ b/arch/powerpc/platforms/powernv/ultravisor.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Ultravisor high level interfaces + * + * Copyright 2019, IBM Corporation. + * + */ +#include <linux/init.h> +#include <linux/printk.h> +#include <linux/of_fdt.h> +#include <linux/of.h> + +#include <asm/ultravisor.h> +#include <asm/firmware.h> +#include <asm/machdep.h> + +#include "powernv.h" + +static struct kobject *ultravisor_kobj; + +int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname, + int depth, void *data) +{ + if (!of_flat_dt_is_compatible(node, "ibm,ultravisor")) + return 0; + + powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR; + pr_debug("Ultravisor detected!\n"); + return 1; +} + +static struct memcons *uv_memcons; + +static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj, + const struct bin_attribute *bin_attr, char *to, + loff_t pos, size_t count) +{ + return memcons_copy(uv_memcons, to, pos, count); +} + +static struct bin_attribute uv_msglog_attr __ro_after_init = { + .attr = {.name = "msglog", .mode = 0400}, + .read = uv_msglog_read +}; + +static int __init uv_init(void) +{ + struct device_node *node; + + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + return 0; + + node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware"); + if (!node) + return -ENODEV; + + uv_memcons = memcons_init(node, "memcons"); + of_node_put(node); + if (!uv_memcons) + return -ENOENT; + + uv_msglog_attr.size = memcons_get_size(uv_memcons); + + ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj); + if (!ultravisor_kobj) + return -ENOMEM; + + return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr); +} +machine_subsys_initcall(powernv, uv_init); diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c index 4d3929fbc08f..3ce89a4b54be 100644 --- a/arch/powerpc/platforms/powernv/vas-debug.c +++ b/arch/powerpc/platforms/powernv/vas-debug.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2016-17 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
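uv_init() above exposes the ultravisor console as a binary sysfs attribute under firmware_kobj, so it should appear as /sys/firmware/ultravisor/msglog on ultravisor-enabled systems. A small reader sketch:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/firmware/ultravisor/msglog";
	char buf[4096];
	size_t n;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}

	/* dump the in-memory console to stdout */
	while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
		fwrite(buf, 1, n, stdout);

	fclose(f);
	return 0;
}
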
*/ #define pr_fmt(fmt) "vas: " fmt @@ -13,6 +9,7 @@ #include <linux/slab.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <asm/vas.h> #include "vas.h" static struct dentry *vas_debugfs; @@ -32,7 +29,7 @@ static char *cop_to_str(int cop) static int info_show(struct seq_file *s, void *private) { - struct vas_window *window = s->private; + struct pnv_vas_window *window = s->private; mutex_lock(&vas_mutex); @@ -40,9 +37,9 @@ static int info_show(struct seq_file *s, void *private) if (!window->hvwc_map) goto unlock; - seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop), + seq_printf(s, "Type: %s, %s\n", cop_to_str(window->vas_win.cop), window->tx_win ? "Send" : "Receive"); - seq_printf(s, "Pid : %d\n", window->pid); + seq_printf(s, "Pid : %d\n", vas_window_pid(&window->vas_win)); unlock: mutex_unlock(&vas_mutex); @@ -51,7 +48,7 @@ unlock: DEFINE_SHOW_ATTRIBUTE(info); -static inline void print_reg(struct seq_file *s, struct vas_window *win, +static inline void print_reg(struct seq_file *s, struct pnv_vas_window *win, char *name, u32 reg) { seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name); @@ -59,7 +56,7 @@ static inline void print_reg(struct seq_file *s, struct vas_window *win, static int hvwc_show(struct seq_file *s, void *private) { - struct vas_window *window = s->private; + struct pnv_vas_window *window = s->private; mutex_lock(&vas_mutex); @@ -107,8 +104,10 @@ unlock: DEFINE_SHOW_ATTRIBUTE(hvwc); -void vas_window_free_dbgdir(struct vas_window *window) +void vas_window_free_dbgdir(struct pnv_vas_window *pnv_win) { + struct vas_window *window = &pnv_win->vas_win; + if (window->dbgdir) { debugfs_remove_recursive(window->dbgdir); kfree(window->dbgname); @@ -117,42 +116,24 @@ void vas_window_free_dbgdir(struct vas_window *window) } } -void vas_window_init_dbgdir(struct vas_window *window) +void vas_window_init_dbgdir(struct pnv_vas_window *window) { - struct dentry *f, *d; + struct dentry *d; if (!window->vinst->dbgdir) return; - window->dbgname = kzalloc(16, GFP_KERNEL); - if (!window->dbgname) + window->vas_win.dbgname = kzalloc(16, GFP_KERNEL); + if (!window->vas_win.dbgname) return; - snprintf(window->dbgname, 16, "w%d", window->winid); - - d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir); - if (IS_ERR(d)) - goto free_name; - - window->dbgdir = d; - - f = debugfs_create_file("info", 0444, d, window, &info_fops); - if (IS_ERR(f)) - goto remove_dir; + snprintf(window->vas_win.dbgname, 16, "w%d", window->vas_win.winid); - f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops); - if (IS_ERR(f)) - goto remove_dir; + d = debugfs_create_dir(window->vas_win.dbgname, window->vinst->dbgdir); + window->vas_win.dbgdir = d; - return; - -remove_dir: - debugfs_remove_recursive(window->dbgdir); - window->dbgdir = NULL; - -free_name: - kfree(window->dbgname); - window->dbgname = NULL; + debugfs_create_file("info", 0444, d, window, &info_fops); + debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops); } void vas_instance_init_dbgdir(struct vas_instance *vinst) @@ -160,8 +141,6 @@ void vas_instance_init_dbgdir(struct vas_instance *vinst) struct dentry *d; vas_init_dbgdir(); - if (!vas_debugfs) - return; vinst->dbgname = kzalloc(16, GFP_KERNEL); if (!vinst->dbgname) @@ -170,16 +149,7 @@ void vas_instance_init_dbgdir(struct vas_instance *vinst) snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id); d = debugfs_create_dir(vinst->dbgname, vas_debugfs); - if (IS_ERR(d)) - goto free_name; - vinst->dbgdir = d; - return; - -free_name: - 
kfree(vinst->dbgname); - vinst->dbgname = NULL; - vinst->dbgdir = NULL; } /* @@ -195,6 +165,4 @@ void vas_init_dbgdir(void) first_time = false; vas_debugfs = debugfs_create_dir("vas", NULL); - if (IS_ERR(vas_debugfs)) - vas_debugfs = NULL; } diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c new file mode 100644 index 000000000000..2b47d5a86328 --- /dev/null +++ b/arch/powerpc/platforms/powernv/vas-fault.c @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * VAS Fault handling. + * Copyright 2019, IBM Corporation + */ + +#define pr_fmt(fmt) "vas: " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/kthread.h> +#include <linux/sched/signal.h> +#include <linux/mmu_context.h> +#include <asm/icswx.h> + +#include "vas.h" + +/* + * The maximum FIFO size for fault window can be 8MB + * (VAS_RX_FIFO_SIZE_MAX). Using 4MB FIFO since each VAS + * instance will be having fault window. + * 8MB FIFO can be used if expects more faults for each VAS + * instance. + */ +#define VAS_FAULT_WIN_FIFO_SIZE (4 << 20) + +static void dump_fifo(struct vas_instance *vinst, void *entry) +{ + unsigned long *end = vinst->fault_fifo + vinst->fault_fifo_size; + unsigned long *fifo = entry; + int i; + + pr_err("Fault fifo size %d, Max crbs %d\n", vinst->fault_fifo_size, + vinst->fault_fifo_size / CRB_SIZE); + + /* Dump 10 CRB entries or until end of FIFO */ + pr_err("Fault FIFO Dump:\n"); + for (i = 0; i < 10*(CRB_SIZE/8) && fifo < end; i += 4, fifo += 4) { + pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n", + i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3)); + } +} + +/* + * Process valid CRBs in fault FIFO. + * NX process user space requests, return credit and update the status + * in CRB. If it encounters transalation error when accessing CRB or + * request buffers, raises interrupt on the CPU to handle the fault. + * It takes credit on fault window, updates nx_fault_stamp in CRB with + * the following information and pastes CRB in fault FIFO. + * + * pswid - window ID of the window on which the request is sent. + * fault_storage_addr - fault address + * + * It can raise a single interrupt for multiple faults. Expects OS to + * process all valid faults and return credit for each fault on user + * space and fault windows. This fault FIFO control will be done with + * credit mechanism. NX can continuously paste CRBs until credits are not + * available on fault window. Otherwise, returns with RMA_reject. + * + * Total credits available on fault window: FIFO_SIZE(4MB)/CRBS_SIZE(128) + * + */ +irqreturn_t vas_fault_thread_fn(int irq, void *data) +{ + struct vas_instance *vinst = data; + struct coprocessor_request_block *crb, *entry; + struct coprocessor_request_block buf; + struct pnv_vas_window *window; + unsigned long flags; + void *fifo; + + crb = &buf; + + /* + * VAS can interrupt with multiple page faults. So process all + * valid CRBs within fault FIFO until reaches invalid CRB. + * We use CCW[0] and pswid to validate CRBs: + * + * CCW[0] Reserved bit. When NX pastes CRB, CCW[0]=0 + * OS sets this bit to 1 after reading CRB. + * pswid NX assigns window ID. Set pswid to -1 after + * reading CRB from fault FIFO. + * + * We exit this function if no valid CRBs are available to process. + * So acquire fault_lock and reset fifo_in_progress to 0 before + * exit. 
+ * In case kernel receives another interrupt with different page + * fault, interrupt handler returns with IRQ_HANDLED if + * fifo_in_progress is set. Means these new faults will be + * handled by the current thread. Otherwise set fifo_in_progress + * and return IRQ_WAKE_THREAD to wake up thread. + */ + while (true) { + spin_lock_irqsave(&vinst->fault_lock, flags); + /* + * Advance the fault fifo pointer to next CRB. + * Use CRB_SIZE rather than sizeof(*crb) since the latter is + * aligned to CRB_ALIGN (256) but the CRB written to by VAS is + * only CRB_SIZE in len. + */ + fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE); + entry = fifo; + + if ((entry->stamp.nx.pswid == cpu_to_be32(FIFO_INVALID_ENTRY)) + || (entry->ccw & cpu_to_be32(CCW0_INVALID))) { + vinst->fifo_in_progress = 0; + spin_unlock_irqrestore(&vinst->fault_lock, flags); + return IRQ_HANDLED; + } + + spin_unlock_irqrestore(&vinst->fault_lock, flags); + vinst->fault_crbs++; + if (vinst->fault_crbs == (vinst->fault_fifo_size / CRB_SIZE)) + vinst->fault_crbs = 0; + + memcpy(crb, fifo, CRB_SIZE); + entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY); + entry->ccw |= cpu_to_be32(CCW0_INVALID); + /* + * Return credit for the fault window. + */ + vas_return_credit(vinst->fault_win, false); + + pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d\n", + vinst->vas_id, vinst->fault_fifo, fifo, + vinst->fault_crbs); + + vas_dump_crb(crb); + window = vas_pswid_to_window(vinst, + be32_to_cpu(crb->stamp.nx.pswid)); + + if (IS_ERR(window)) { + /* + * We got an interrupt about a specific send + * window but we can't find that window and we can't + * even clean it up (return credit on user space + * window). + * But we should not get here. + * TODO: Disable IRQ. + */ + dump_fifo(vinst, (void *)entry); + pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d bad CRB?\n", + vinst->vas_id, vinst->fault_fifo, fifo, + be32_to_cpu(crb->stamp.nx.pswid), + vinst->fault_crbs); + + WARN_ON_ONCE(1); + } else { + /* + * NX sees faults only with user space windows. + */ + if (window->user_win) + vas_update_csb(crb, &window->vas_win.task_ref); + else + WARN_ON_ONCE(!window->user_win); + + /* + * Return credit for send window after processing + * fault CRB. + */ + vas_return_credit(window, true); + } + } +} + +irqreturn_t vas_fault_handler(int irq, void *dev_id) +{ + struct vas_instance *vinst = dev_id; + irqreturn_t ret = IRQ_WAKE_THREAD; + unsigned long flags; + + /* + * NX can generate an interrupt for multiple faults. So the + * fault handler thread process all CRBs until finds invalid + * entry. In case if NX sees continuous faults, it is possible + * that the thread function entered with the first interrupt + * can execute and process all valid CRBs. + * So wake up thread only if the fault thread is not in progress. + */ + spin_lock_irqsave(&vinst->fault_lock, flags); + + if (vinst->fifo_in_progress) + ret = IRQ_HANDLED; + else + vinst->fifo_in_progress = 1; + + spin_unlock_irqrestore(&vinst->fault_lock, flags); + + return ret; +} + +/* + * Fault window is opened per VAS instance. NX pastes fault CRB in fault + * FIFO upon page faults. 
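The comments above size the fault window credits as FIFO_SIZE/CRB_SIZE, and vas_setup_fault_window() further below notes that the receive-credit field is only 16 bits wide. Spelling the arithmetic out:

#include <stdio.h>

#define CRB_SIZE	128	/* CRBS_SIZE(128) from the comment above */

int main(void)
{
	unsigned int fifo_4m = 4u << 20;
	unsigned int fifo_8m = 8u << 20;
	unsigned int creds_4m = fifo_4m / CRB_SIZE;	/* 32768 */
	unsigned int creds_8m = fifo_8m / CRB_SIZE;	/* 65536 */

	printf("4MB FIFO: %u credits\n", creds_4m);

	/* 65536 does not fit in a 16-bit creds field, hence the 0xffff cap */
	printf("8MB FIFO: %u credits, capped to 0x%x\n",
	       creds_8m, creds_8m > 0xffff ? 0xffff : creds_8m);
	return 0;
}
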
+ */ +int vas_setup_fault_window(struct vas_instance *vinst) +{ + struct vas_rx_win_attr attr; + struct vas_window *win; + + vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE; + vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL); + if (!vinst->fault_fifo) { + pr_err("Unable to alloc %d bytes for fault_fifo\n", + vinst->fault_fifo_size); + return -ENOMEM; + } + + /* + * Invalidate all CRB entries. NX pastes valid entry for each fault. + */ + memset(vinst->fault_fifo, FIFO_INVALID_ENTRY, vinst->fault_fifo_size); + vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT); + + attr.rx_fifo_size = vinst->fault_fifo_size; + attr.rx_fifo = __pa(vinst->fault_fifo); + + /* + * Max creds is based on number of CRBs can fit in the FIFO. + * (fault_fifo_size/CRB_SIZE). If 8MB FIFO is used, max creds + * will be 0xffff since the receive creds field is 16bits wide. + */ + attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE; + attr.lnotify_lpid = 0; + attr.lnotify_pid = mfspr(SPRN_PID); + attr.lnotify_tid = mfspr(SPRN_PID); + + win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT, &attr); + if (IS_ERR(win)) { + pr_err("VAS: Error %ld opening FaultWin\n", PTR_ERR(win)); + kfree(vinst->fault_fifo); + return PTR_ERR(win); + } + + vinst->fault_win = container_of(win, struct pnv_vas_window, vas_win); + + pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n", + vinst->fault_win->vas_win.winid, attr.lnotify_lpid, + attr.lnotify_pid, attr.lnotify_tid); + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/vas-trace.h b/arch/powerpc/platforms/powernv/vas-trace.h index a449b9f0c12e..ca2e08f2ddc0 100644 --- a/arch/powerpc/platforms/powernv/vas-trace.h +++ b/arch/powerpc/platforms/powernv/vas-trace.h @@ -80,7 +80,7 @@ TRACE_EVENT( vas_tx_win_open, TRACE_EVENT( vas_paste_crb, TP_PROTO(struct task_struct *tsk, - struct vas_window *win), + struct pnv_vas_window *win), TP_ARGS(tsk, win), @@ -96,7 +96,7 @@ TRACE_EVENT( vas_paste_crb, TP_fast_assign( __entry->pid = tsk->pid; __entry->vasid = win->vinst->vas_id; - __entry->winid = win->winid; + __entry->winid = win->vas_win.winid; __entry->paste_kaddr = (unsigned long)win->paste_kaddr ), diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c index e59e0e60e5b5..5147df3a18ac 100644 --- a/arch/powerpc/platforms/powernv/vas-window.c +++ b/arch/powerpc/platforms/powernv/vas-window.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2016-17 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) "vas: " fmt @@ -16,8 +12,11 @@ #include <linux/log2.h> #include <linux/rcupdate.h> #include <linux/cred.h> +#include <linux/sched/mm.h> +#include <linux/mmu_context.h> #include <asm/switch_to.h> #include <asm/ppc-opcode.h> +#include <asm/vas.h> #include "vas.h" #include "copy-paste.h" @@ -28,14 +27,14 @@ * Compute the paste address region for the window @window using the * ->paste_base_addr and ->paste_win_id_shift we got from device tree. 
*/ -static void compute_paste_address(struct vas_window *window, u64 *addr, int *len) +void vas_win_paste_addr(struct pnv_vas_window *window, u64 *addr, int *len) { int winid; u64 base, shift; base = window->vinst->paste_base_addr; shift = window->vinst->paste_win_id_shift; - winid = window->winid; + winid = window->vas_win.winid; *addr = base + (winid << shift); if (len) @@ -44,33 +43,23 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr); } -u64 vas_win_paste_addr(struct vas_window *win) -{ - u64 addr; - - compute_paste_address(win, &addr, NULL); - - return addr; -} -EXPORT_SYMBOL(vas_win_paste_addr); - -static inline void get_hvwc_mmio_bar(struct vas_window *window, +static inline void get_hvwc_mmio_bar(struct pnv_vas_window *window, u64 *start, int *len) { u64 pbaddr; pbaddr = window->vinst->hvwc_bar_start; - *start = pbaddr + window->winid * VAS_HVWC_SIZE; + *start = pbaddr + window->vas_win.winid * VAS_HVWC_SIZE; *len = VAS_HVWC_SIZE; } -static inline void get_uwc_mmio_bar(struct vas_window *window, +static inline void get_uwc_mmio_bar(struct pnv_vas_window *window, u64 *start, int *len) { u64 pbaddr; pbaddr = window->vinst->uwc_bar_start; - *start = pbaddr + window->winid * VAS_UWC_SIZE; + *start = pbaddr + window->vas_win.winid * VAS_UWC_SIZE; *len = VAS_UWC_SIZE; } @@ -79,7 +68,7 @@ static inline void get_uwc_mmio_bar(struct vas_window *window, * space. Unlike MMIO regions (map_mmio_region() below), paste region must * be mapped cache-able and is only applicable to send windows. */ -static void *map_paste_region(struct vas_window *txwin) +static void *map_paste_region(struct pnv_vas_window *txwin) { int len; void *map; @@ -87,12 +76,12 @@ static void *map_paste_region(struct vas_window *txwin) u64 start; name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id, - txwin->winid); + txwin->vas_win.winid); if (!name) goto free_name; txwin->paste_addr_name = name; - compute_paste_address(txwin, &start, &len); + vas_win_paste_addr(txwin, &start, &len); if (!request_mem_region(start, len, name)) { pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n", @@ -144,13 +133,13 @@ static void unmap_region(void *addr, u64 start, int len) /* * Unmap the paste address region for a window. */ -static void unmap_paste_region(struct vas_window *window) +static void unmap_paste_region(struct pnv_vas_window *window) { int len; u64 busaddr_start; if (window->paste_kaddr) { - compute_paste_address(window, &busaddr_start, &len); + vas_win_paste_addr(window, &busaddr_start, &len); unmap_region(window->paste_kaddr, busaddr_start, len); window->paste_kaddr = NULL; kfree(window->paste_addr_name); @@ -165,7 +154,7 @@ static void unmap_paste_region(struct vas_window *window) * path, just minimize the time we hold the mutex for now. We can add * a per-instance mutex later if necessary. */ -static void unmap_winctx_mmio_bars(struct vas_window *window) +static void unmap_winctx_mmio_bars(struct pnv_vas_window *window) { int len; void *uwc_map; @@ -198,7 +187,7 @@ static void unmap_winctx_mmio_bars(struct vas_window *window) * OS/User Window Context (UWC) MMIO Base Address Region for the given window. * Map these bus addresses and save the mapped kernel addresses in @window. 
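vas_win_paste_addr() above computes the paste address as paste_base_addr + (winid << paste_win_id_shift), both taken from the device tree. With made-up values (illustration only) the arithmetic looks like this:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* illustrative values, not from real firmware */
	uint64_t paste_base_addr    = 0x6017000000000ULL;
	uint64_t paste_win_id_shift = 16;
	int winid = 5;

	uint64_t addr = paste_base_addr +
			((uint64_t)winid << paste_win_id_shift);

	printf("window %d pastes at 0x%llx\n",
	       winid, (unsigned long long)addr);
	return 0;
}
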
*/ -int map_winctx_mmio_bars(struct vas_window *window) +static int map_winctx_mmio_bars(struct pnv_vas_window *window) { int len; u64 start; @@ -226,7 +215,7 @@ int map_winctx_mmio_bars(struct vas_window *window) * registers are not sequential. And, we can only write to offsets * with valid registers. */ -void reset_window_regs(struct vas_window *window) +static void reset_window_regs(struct pnv_vas_window *window) { write_hvwc_reg(window, VREG(LPID), 0ULL); write_hvwc_reg(window, VREG(PID), 0ULL); @@ -282,7 +271,7 @@ void reset_window_regs(struct vas_window *window) * want to add fields to vas_winctx and move the initialization to * init_vas_winctx_regs(). */ -static void init_xlate_regs(struct vas_window *window, bool user_win) +static void init_xlate_regs(struct pnv_vas_window *window, bool user_win) { u64 lpcr, val; @@ -347,7 +336,7 @@ static void init_xlate_regs(struct vas_window *window, bool user_win) * * TODO: Reserved (aka dedicated) send buffers are not supported yet. */ -static void init_rsvd_tx_buf_count(struct vas_window *txwin, +static void init_rsvd_tx_buf_count(struct pnv_vas_window *txwin, struct vas_winctx *winctx) { write_hvwc_reg(txwin, VREG(TX_RSVD_BUF_COUNT), 0ULL); @@ -369,7 +358,8 @@ static void init_rsvd_tx_buf_count(struct vas_window *txwin, * as a one-time task? That could work for NX but what about other * receivers? Let the receivers tell us the rx-fifo buffers for now. */ -int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) +static void init_winctx_regs(struct pnv_vas_window *window, + struct vas_winctx *winctx) { u64 val; int fifo_size; @@ -387,7 +377,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) init_xlate_regs(window, winctx->user_win); val = 0ULL; - val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0); + val = SET_FIELD(VAS_FAULT_TX_WIN, val, winctx->fault_win_id); write_hvwc_reg(window, VREG(FAULT_TX_WIN), val); /* In PowerNV, interrupts go to HV. */ @@ -414,7 +404,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) * * See also: Design note in function header. 
*/ - val = __pa(winctx->rx_fifo); + val = winctx->rx_fifo; val = SET_FIELD(VAS_PAGE_MIGRATION_SELECT, val, 0); write_hvwc_reg(window, VREG(LFIFO_BAR), val); @@ -511,8 +501,6 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx) val = SET_FIELD(VAS_WINCTL_NX_WIN, val, winctx->nx_win); val = SET_FIELD(VAS_WINCTL_OPEN, val, 1); write_hvwc_reg(window, VREG(WINCTL), val); - - return 0; } static void vas_release_window_id(struct ida *ida, int winid) @@ -532,10 +520,10 @@ static int vas_assign_window_id(struct ida *ida) return winid; } -static void vas_window_free(struct vas_window *window) +static void vas_window_free(struct pnv_vas_window *window) { - int winid = window->winid; struct vas_instance *vinst = window->vinst; + int winid = window->vas_win.winid; unmap_winctx_mmio_bars(window); @@ -546,10 +534,10 @@ static void vas_window_free(struct vas_window *window) vas_release_window_id(&vinst->ida, winid); } -static struct vas_window *vas_window_alloc(struct vas_instance *vinst) +static struct pnv_vas_window *vas_window_alloc(struct vas_instance *vinst) { int winid; - struct vas_window *window; + struct pnv_vas_window *window; winid = vas_assign_window_id(&vinst->ida); if (winid < 0) @@ -560,7 +548,7 @@ static struct vas_window *vas_window_alloc(struct vas_instance *vinst) goto out_free; window->vinst = vinst; - window->winid = winid; + window->vas_win.winid = winid; if (map_winctx_mmio_bars(window)) goto out_free; @@ -575,7 +563,7 @@ out_free: return ERR_PTR(-ENOMEM); } -static void put_rx_win(struct vas_window *rxwin) +static void put_rx_win(struct pnv_vas_window *rxwin) { /* Better not be a send window! */ WARN_ON_ONCE(rxwin->tx_win); @@ -591,10 +579,11 @@ static void put_rx_win(struct vas_window *rxwin) * * NOTE: We access ->windows[] table and assume that vinst->mutex is held. */ -static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid) +static struct pnv_vas_window *get_user_rxwin(struct vas_instance *vinst, + u32 pswid) { int vasid, winid; - struct vas_window *rxwin; + struct pnv_vas_window *rxwin; decode_pswid(pswid, &vasid, &winid); @@ -603,7 +592,7 @@ static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid) rxwin = vinst->windows[winid]; - if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW) + if (!rxwin || rxwin->tx_win || rxwin->vas_win.cop != VAS_COP_TYPE_FTW) return ERR_PTR(-EINVAL); return rxwin; @@ -615,10 +604,10 @@ static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid) * * See also function header of set_vinst_win(). */ -static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst, +static struct pnv_vas_window *get_vinst_rxwin(struct vas_instance *vinst, enum vas_cop_type cop, u32 pswid) { - struct vas_window *rxwin; + struct pnv_vas_window *rxwin; mutex_lock(&vinst->mutex); @@ -651,9 +640,9 @@ static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst, * window, we also save the window in the ->rxwin[] table. */ static void set_vinst_win(struct vas_instance *vinst, - struct vas_window *window) + struct pnv_vas_window *window) { - int id = window->winid; + int id = window->vas_win.winid; mutex_lock(&vinst->mutex); @@ -662,8 +651,8 @@ static void set_vinst_win(struct vas_instance *vinst, * unless its a user (FTW) window. 
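 *
 * Concretely (an illustration, not new behaviour): an NX-GZIP receive
 * window is recorded both in vinst->windows[winid] and in
 * vinst->rxwin[VAS_COP_TYPE_GZIP], while a user-space FTW window only
 * lands in vinst->windows[] and is found later by pswid via
 * get_user_rxwin().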
*/ if (!window->user_win && !window->tx_win) { - WARN_ON_ONCE(vinst->rxwin[window->cop]); - vinst->rxwin[window->cop] = window; + WARN_ON_ONCE(vinst->rxwin[window->vas_win.cop]); + vinst->rxwin[window->vas_win.cop] = window; } WARN_ON_ONCE(vinst->windows[id] != NULL); @@ -676,16 +665,16 @@ static void set_vinst_win(struct vas_instance *vinst, * Clear this window from the table(s) of windows for this VAS instance. * See also function header of set_vinst_win(). */ -static void clear_vinst_win(struct vas_window *window) +static void clear_vinst_win(struct pnv_vas_window *window) { - int id = window->winid; + int id = window->vas_win.winid; struct vas_instance *vinst = window->vinst; mutex_lock(&vinst->mutex); if (!window->user_win && !window->tx_win) { - WARN_ON_ONCE(!vinst->rxwin[window->cop]); - vinst->rxwin[window->cop] = NULL; + WARN_ON_ONCE(!vinst->rxwin[window->vas_win.cop]); + vinst->rxwin[window->vas_win.cop] = NULL; } WARN_ON_ONCE(vinst->windows[id] != window); @@ -694,7 +683,7 @@ static void clear_vinst_win(struct vas_window *window) mutex_unlock(&vinst->mutex); } -static void init_winctx_for_rxwin(struct vas_window *rxwin, +static void init_winctx_for_rxwin(struct pnv_vas_window *rxwin, struct vas_rx_win_attr *rxattr, struct vas_winctx *winctx) { @@ -715,7 +704,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->rx_fifo = rxattr->rx_fifo; winctx->rx_fifo_size = rxattr->rx_fifo_size; - winctx->wcreds_max = rxwin->wcreds_max; + winctx->wcreds_max = rxwin->vas_win.wcreds_max; winctx->pin_win = rxattr->pin_win; winctx->nx_win = rxattr->nx_win; @@ -750,7 +739,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, */ winctx->fifo_disable = true; winctx->intr_disable = true; - winctx->rx_fifo = NULL; + winctx->rx_fifo = 0; } winctx->lnotify_lpid = rxattr->lnotify_lpid; @@ -762,6 +751,8 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin, winctx->min_scope = VAS_SCOPE_LOCAL; winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + if (rxwin->vinst->virq) + winctx->irq_port = rxwin->vinst->irq_port; } static bool rx_win_args_valid(enum vas_cop_type cop, @@ -782,7 +773,7 @@ static bool rx_win_args_valid(enum vas_cop_type cop, if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX) return false; - if (attr->wcreds_max > VAS_RX_WCREDS_MAX) + if (!attr->wcreds_max) return false; if (attr->nx_win) { @@ -827,7 +818,8 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop) { memset(rxattr, 0, sizeof(*rxattr)); - if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) { + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI || + cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) { rxattr->pin_win = true; rxattr->nx_win = true; rxattr->fault_win = false; @@ -841,9 +833,9 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop) rxattr->fault_win = true; rxattr->notify_disable = true; rxattr->rx_wcred_mode = true; - rxattr->tx_wcred_mode = true; rxattr->rx_win_ord_mode = true; - rxattr->tx_win_ord_mode = true; + rxattr->rej_no_credit = true; + rxattr->tc_mode = VAS_THRESH_DISABLED; } else if (cop == VAS_COP_TYPE_FTW) { rxattr->user_win = true; rxattr->intr_disable = true; @@ -861,7 +853,7 @@ EXPORT_SYMBOL_GPL(vas_init_rx_win_attr); struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop, struct vas_rx_win_attr *rxattr) { - struct vas_window *rxwin; + struct pnv_vas_window *rxwin; struct vas_winctx winctx; struct vas_instance *vinst; @@ -880,23 +872,21 @@ struct vas_window 
*vas_rx_win_open(int vasid, enum vas_cop_type cop, rxwin = vas_window_alloc(vinst); if (IS_ERR(rxwin)) { pr_devel("Unable to allocate memory for Rx window\n"); - return rxwin; + return (struct vas_window *)rxwin; } rxwin->tx_win = false; rxwin->nx_win = rxattr->nx_win; rxwin->user_win = rxattr->user_win; - rxwin->cop = cop; - rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT; - if (rxattr->user_win) - rxwin->pid = task_pid_vnr(current); + rxwin->vas_win.cop = cop; + rxwin->vas_win.wcreds_max = rxattr->wcreds_max; init_winctx_for_rxwin(rxwin, rxattr, &winctx); init_winctx_regs(rxwin, &winctx); set_vinst_win(vinst, rxwin); - return rxwin; + return &rxwin->vas_win; } EXPORT_SYMBOL_GPL(vas_rx_win_open); @@ -904,7 +894,8 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop) { memset(txattr, 0, sizeof(*txattr)); - if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) { + if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI || + cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) { txattr->rej_no_credit = false; txattr->rx_wcred_mode = true; txattr->tx_wcred_mode = true; @@ -916,7 +907,7 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop) } EXPORT_SYMBOL_GPL(vas_init_tx_win_attr); -static void init_winctx_for_txwin(struct vas_window *txwin, +static void init_winctx_for_txwin(struct pnv_vas_window *txwin, struct vas_tx_win_attr *txattr, struct vas_winctx *winctx) { @@ -937,7 +928,7 @@ static void init_winctx_for_txwin(struct vas_window *txwin, */ memset(winctx, 0, sizeof(struct vas_winctx)); - winctx->wcreds_max = txwin->wcreds_max; + winctx->wcreds_max = txwin->vas_win.wcreds_max; winctx->user_win = txattr->user_win; winctx->nx_win = txwin->rxwin->nx_win; @@ -957,14 +948,24 @@ static void init_winctx_for_txwin(struct vas_window *txwin, winctx->lpid = txattr->lpid; winctx->pidr = txattr->pidr; - winctx->rx_win_id = txwin->rxwin->winid; + winctx->rx_win_id = txwin->rxwin->vas_win.winid; + /* + * IRQ and fault window setup is successful. Set fault window + * for the send window so that ready to handle faults. + */ + if (txwin->vinst->virq) + winctx->fault_win_id = txwin->vinst->fault_win->vas_win.winid; winctx->dma_type = VAS_DMA_TYPE_INJECT; winctx->tc_mode = txattr->tc_mode; winctx->min_scope = VAS_SCOPE_LOCAL; winctx->max_scope = VAS_SCOPE_VECTORED_GROUP; + if (txwin->vinst->virq) + winctx->irq_port = txwin->vinst->irq_port; - winctx->pswid = 0; + winctx->pswid = txattr->pswid ? 
txattr->pswid : + encode_pswid(txwin->vinst->vas_id, + txwin->vas_win.winid); } static bool tx_win_args_valid(enum vas_cop_type cop, @@ -979,9 +980,14 @@ static bool tx_win_args_valid(enum vas_cop_type cop, if (attr->wcreds_max > VAS_TX_WCREDS_MAX) return false; - if (attr->user_win && - (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count)) - return false; + if (attr->user_win) { + if (attr->rsvd_txbuf_count) + return false; + + if (cop != VAS_COP_TYPE_FTW && cop != VAS_COP_TYPE_GZIP && + cop != VAS_COP_TYPE_GZIP_HIPRI) + return false; + } return true; } @@ -990,8 +996,8 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, struct vas_tx_win_attr *attr) { int rc; - struct vas_window *txwin; - struct vas_window *rxwin; + struct pnv_vas_window *txwin; + struct pnv_vas_window *rxwin; struct vas_winctx winctx; struct vas_instance *vinst; @@ -1017,7 +1023,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, rxwin = get_vinst_rxwin(vinst, cop, attr->pswid); if (IS_ERR(rxwin)) { pr_devel("No RxWin for vasid %d, cop %d\n", vasid, cop); - return rxwin; + return (struct vas_window *)rxwin; } txwin = vas_window_alloc(vinst); @@ -1026,13 +1032,12 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, goto put_rxwin; } - txwin->cop = cop; + txwin->vas_win.cop = cop; txwin->tx_win = 1; txwin->rxwin = rxwin; txwin->nx_win = txwin->rxwin->nx_win; - txwin->pid = attr->pid; txwin->user_win = attr->user_win; - txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT; + txwin->vas_win.wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT; init_winctx_for_txwin(txwin, attr, &winctx); @@ -1054,17 +1059,24 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop, } } else { /* - * A user mapping must ensure that context switch issues - * CP_ABORT for this thread. + * Interrupt handler or fault window setup failed. Means + * NX can not generate fault for page fault. So not + * opening for user space tx window. */ - rc = set_thread_uses_vas(); + if (!vinst->virq) { + rc = -ENODEV; + goto free_window; + } + rc = get_vas_user_win_ref(&txwin->vas_win.task_ref); if (rc) goto free_window; + + vas_user_win_add_mm_context(&txwin->vas_win.task_ref); } set_vinst_win(vinst, txwin); - return txwin; + return &txwin->vas_win; free_window: vas_window_free(txwin); @@ -1083,12 +1095,14 @@ int vas_copy_crb(void *crb, int offset) EXPORT_SYMBOL_GPL(vas_copy_crb); #define RMA_LSMP_REPORT_ENABLE PPC_BIT(53) -int vas_paste_crb(struct vas_window *txwin, int offset, bool re) +int vas_paste_crb(struct vas_window *vwin, int offset, bool re) { + struct pnv_vas_window *txwin; int rc; void *addr; uint64_t val; + txwin = container_of(vwin, struct pnv_vas_window, vas_win); trace_vas_paste_crb(current, txwin); /* @@ -1118,7 +1132,7 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re) else rc = -EINVAL; - pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid, + pr_debug("Txwin #%d: Msg count %llu\n", txwin->vas_win.winid, read_hvwc_reg(txwin, VREG(LRFIFO_PUSH))); return rc; @@ -1138,10 +1152,11 @@ EXPORT_SYMBOL_GPL(vas_paste_crb); * user space. (NX-842 driver waits for CSB and Fast thread-wakeup * doesn't use credit checking). 
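 *
 * With the 10ms sleep used below, the "stuck" warning fires roughly once
 * every ten seconds (1000 retries) while a window refuses to drain.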
*/ -static void poll_window_credits(struct vas_window *window) +static void poll_window_credits(struct pnv_vas_window *window) { u64 val; int creds, mode; + int count = 0; val = read_hvwc_reg(window, VREG(WINCTL)); if (window->tx_win) @@ -1160,10 +1175,28 @@ retry: creds = GET_FIELD(VAS_LRX_WCRED, val); } - if (creds < window->wcreds_max) { + /* + * Takes around few milliseconds to complete all pending requests + * and return credits. + * TODO: Scan fault FIFO and invalidate CRBs points to this window + * and issue CRB Kill to stop all pending requests. Need only + * if there is a bug in NX or fault handling in kernel. + */ + if (creds < window->vas_win.wcreds_max) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(msecs_to_jiffies(10)); + count++; + /* + * Process can not close send window until all credits are + * returned. + */ + if (!(count % 1000)) + pr_warn_ratelimited("VAS: pid %d stuck. Waiting for credits returned for Window(%d). creds %d, Retries %d\n", + vas_window_pid(&window->vas_win), + window->vas_win.winid, + creds, count); + goto retry; } } @@ -1173,10 +1206,11 @@ retry: * short time to queue a CRB, so window should not be busy for too long. * Trying 5ms intervals. */ -static void poll_window_busy_state(struct vas_window *window) +static void poll_window_busy_state(struct pnv_vas_window *window) { int busy; u64 val; + int count = 0; retry: val = read_hvwc_reg(window, VREG(WIN_STATUS)); @@ -1184,7 +1218,17 @@ retry: if (busy) { val = 0; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(msecs_to_jiffies(5)); + schedule_timeout(msecs_to_jiffies(10)); + count++; + /* + * Takes around few milliseconds to process all pending + * requests. + */ + if (!(count % 1000)) + pr_warn_ratelimited("VAS: pid %d stuck. Window (ID=%d) is in busy state. Retries %d\n", + vas_window_pid(&window->vas_win), + window->vas_win.winid, count); + goto retry; } } @@ -1205,7 +1249,7 @@ retry: * casting out becomes necessary we should consider offloading the * job to a worker thread, so the window close can proceed quickly. */ -static void poll_window_castout(struct vas_window *window) +static void poll_window_castout(struct pnv_vas_window *window) { /* stub for now */ } @@ -1214,7 +1258,7 @@ static void poll_window_castout(struct vas_window *window) * Unpin and close a window so no new requests are accepted and the * hardware can evict this window from cache if necessary. */ -static void unpin_close_window(struct vas_window *window) +static void unpin_close_window(struct pnv_vas_window *window) { u64 val; @@ -1236,11 +1280,15 @@ static void unpin_close_window(struct vas_window *window) * * Besides the hardware, kernel has some bookkeeping of course. 
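 *
 * For user-space send windows the bookkeeping now includes dropping the
 * mm/task references taken at open time, via
 * mm_context_remove_vas_window() and put_vas_user_win_ref().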
*/ -int vas_win_close(struct vas_window *window) +int vas_win_close(struct vas_window *vwin) { - if (!window) + struct pnv_vas_window *window; + + if (!vwin) return 0; + window = container_of(vwin, struct pnv_vas_window, vas_win); + if (!window->tx_win && atomic_read(&window->num_txwins) != 0) { pr_devel("Attempting to close an active Rx window!\n"); WARN_ON_ONCE(1); @@ -1249,19 +1297,24 @@ int vas_win_close(struct vas_window *window) unmap_paste_region(window); - clear_vinst_win(window); - poll_window_busy_state(window); unpin_close_window(window); poll_window_credits(window); + clear_vinst_win(window); + poll_window_castout(window); /* if send window, drop reference to matching receive window */ - if (window->tx_win) + if (window->tx_win) { + if (window->user_win) { + mm_context_remove_vas_window(vwin->task_ref.mm); + put_vas_user_win_ref(&vwin->task_ref); + } put_rx_win(window->rxwin); + } vas_window_free(window); @@ -1270,10 +1323,149 @@ int vas_win_close(struct vas_window *window) EXPORT_SYMBOL_GPL(vas_win_close); /* - * Return a system-wide unique window id for the window @win. + * Return credit for the given window. + * Send windows and fault window uses credit mechanism as follows: + * + * Send windows: + * - The default number of credits available for each send window is + * 1024. It means 1024 requests can be issued asynchronously at the + * same time. If the credit is not available, that request will be + * returned with RMA_Busy. + * - One credit is taken when NX request is issued. + * - This credit is returned after NX processed that request. + * - If NX encounters translation error, kernel will return the + * credit on the specific send window after processing the fault CRB. + * + * Fault window: + * - The total number credits available is FIFO_SIZE/CRB_SIZE. + * Means 4MB/128 in the current implementation. If credit is not + * available, RMA_Reject is returned. + * - A credit is taken when NX pastes CRB in fault FIFO. + * - The kernel with return credit on fault window after reading entry + * from fault FIFO. + */ +void vas_return_credit(struct pnv_vas_window *window, bool tx) +{ + uint64_t val; + + val = 0ULL; + if (tx) { /* send window */ + val = SET_FIELD(VAS_TX_WCRED, val, 1); + write_hvwc_reg(window, VREG(TX_WCRED_ADDER), val); + } else { + val = SET_FIELD(VAS_LRX_WCRED, val, 1); + write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), val); + } +} + +struct pnv_vas_window *vas_pswid_to_window(struct vas_instance *vinst, + uint32_t pswid) +{ + struct pnv_vas_window *window; + int winid; + + if (!pswid) { + pr_devel("%s: called for pswid 0!\n", __func__); + return ERR_PTR(-ESRCH); + } + + decode_pswid(pswid, NULL, &winid); + + if (winid >= VAS_WINDOWS_PER_CHIP) + return ERR_PTR(-ESRCH); + + /* + * If application closes the window before the hardware + * returns the fault CRB, we should wait in vas_win_close() + * for the pending requests. so the window must be active + * and the process alive. + * + * If its a kernel process, we should not get any faults and + * should not get here. + */ + window = vinst->windows[winid]; + + if (!window) { + pr_err("PSWID decode: Could not find window for winid %d pswid %d vinst 0x%p\n", + winid, pswid, vinst); + return NULL; + } + + /* + * Do some sanity checks on the decoded window. Window should be + * NX GZIP user send window. FTW windows should not incur faults + * since their CRBs are ignored (not queued on FIFO or processed + * by NX). 
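+ *
+ * Worked example of the decode: encode_pswid() in vas.h ORs the window
+ * id with (vasid << (31 - 7)), so a pswid of 0x03000005 maps back to
+ * vasid 3, window id 5.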
+ */ + if (!window->tx_win || !window->user_win || !window->nx_win || + window->vas_win.cop == VAS_COP_TYPE_FAULT || + window->vas_win.cop == VAS_COP_TYPE_FTW) { + pr_err("PSWID decode: id %d, tx %d, user %d, nx %d, cop %d\n", + winid, window->tx_win, window->user_win, + window->nx_win, window->vas_win.cop); + WARN_ON(1); + } + + return window; +} + +static struct vas_window *vas_user_win_open(int vas_id, u64 flags, + enum vas_cop_type cop_type) +{ + struct vas_tx_win_attr txattr = {}; + + vas_init_tx_win_attr(&txattr, cop_type); + + txattr.lpid = mfspr(SPRN_LPID); + txattr.pidr = mfspr(SPRN_PID); + txattr.user_win = true; + txattr.rsvd_txbuf_count = false; + txattr.pswid = false; + + pr_devel("Pid %d: Opening txwin, PIDR %ld\n", txattr.pidr, + mfspr(SPRN_PID)); + + return vas_tx_win_open(vas_id, cop_type, &txattr); +} + +static u64 vas_user_win_paste_addr(struct vas_window *txwin) +{ + struct pnv_vas_window *win; + u64 paste_addr; + + win = container_of(txwin, struct pnv_vas_window, vas_win); + vas_win_paste_addr(win, &paste_addr, NULL); + + return paste_addr; +} + +static int vas_user_win_close(struct vas_window *txwin) +{ + vas_win_close(txwin); + + return 0; +} + +static const struct vas_user_win_ops vops = { + .open_win = vas_user_win_open, + .paste_addr = vas_user_win_paste_addr, + .close_win = vas_user_win_close, +}; + +/* + * Supporting only nx-gzip coprocessor type now, but this API code + * extended to other coprocessor types later. */ -u32 vas_win_id(struct vas_window *win) +int vas_register_api_powernv(struct module *mod, enum vas_cop_type cop_type, + const char *name) +{ + + return vas_register_coproc_api(mod, cop_type, name, &vops); +} +EXPORT_SYMBOL_GPL(vas_register_api_powernv); + +void vas_unregister_api_powernv(void) { - return encode_pswid(win->vinst->vas_id, win->winid); + vas_unregister_coproc_api(); } -EXPORT_SYMBOL_GPL(vas_win_id); +EXPORT_SYMBOL_GPL(vas_unregister_api_powernv); diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c index 5a2b24cbbc88..9c9650319f3b 100644 --- a/arch/powerpc/platforms/powernv/vas.c +++ b/arch/powerpc/platforms/powernv/vas.c @@ -1,10 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2016-17 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
*/ #define pr_fmt(fmt) "vas: " fmt @@ -18,7 +14,10 @@ #include <linux/of_platform.h> #include <linux/of_address.h> #include <linux/of.h> +#include <linux/irqdomain.h> +#include <linux/interrupt.h> #include <asm/prom.h> +#include <asm/xive.h> #include "vas.h" @@ -27,12 +26,35 @@ static LIST_HEAD(vas_instances); static DEFINE_PER_CPU(int, cpu_vas_id); +static int vas_irq_fault_window_setup(struct vas_instance *vinst) +{ + int rc = 0; + + rc = request_threaded_irq(vinst->virq, vas_fault_handler, + vas_fault_thread_fn, 0, vinst->name, vinst); + + if (rc) { + pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n", + vinst->vas_id, vinst->virq, rc); + goto out; + } + + rc = vas_setup_fault_window(vinst); + if (rc) + free_irq(vinst->virq, vinst); + +out: + return rc; +} + static int init_vas_instance(struct platform_device *pdev) { - int rc, cpu, vasid; - struct resource *res; - struct vas_instance *vinst; struct device_node *dn = pdev->dev.of_node; + struct vas_instance *vinst; + struct xive_irq_data *xd; + uint32_t chipid, hwirq; + struct resource *res; + int rc, cpu, vasid; rc = of_property_read_u32(dn, "ibm,vas-id", &vasid); if (rc) { @@ -40,6 +62,12 @@ static int init_vas_instance(struct platform_device *pdev) return -ENODEV; } + rc = of_property_read_u32(dn, "ibm,chip-id", &chipid); + if (rc) { + pr_err("No ibm,chip-id property for %s?\n", pdev->name); + return -ENODEV; + } + if (pdev->num_resources != 4) { pr_err("Unexpected DT configuration for [%s, %d]\n", pdev->name, vasid); @@ -50,6 +78,12 @@ static int init_vas_instance(struct platform_device *pdev) if (!vinst) return -ENOMEM; + vinst->name = kasprintf(GFP_KERNEL, "vas-%d", vasid); + if (!vinst->name) { + kfree(vinst); + return -ENOMEM; + } + INIT_LIST_HEAD(&vinst->node); ida_init(&vinst->ida); mutex_init(&vinst->mutex); @@ -73,9 +107,32 @@ static int init_vas_instance(struct platform_device *pdev) vinst->paste_win_id_shift = 63 - res->end; - pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, " - "paste_win_id_shift 0x%llx\n", pdev->name, vasid, - vinst->paste_base_addr, vinst->paste_win_id_shift); + hwirq = xive_native_alloc_irq_on_chip(chipid); + if (!hwirq) { + pr_err("Inst%d: Unable to allocate global irq for chip %d\n", + vinst->vas_id, chipid); + return -ENOENT; + } + + vinst->virq = irq_create_mapping(NULL, hwirq); + if (!vinst->virq) { + pr_err("Inst%d: Unable to map global irq %d\n", + vinst->vas_id, hwirq); + return -EINVAL; + } + + xd = irq_get_chip_data(vinst->virq); + if (!xd) { + pr_err("Inst%d: Invalid virq %d\n", + vinst->vas_id, vinst->virq); + return -EINVAL; + } + + vinst->irq_port = xd->trig_page; + pr_devel("Initialized instance [%s, %d] paste_base 0x%llx paste_win_id_shift 0x%llx IRQ %d Port 0x%llx\n", + pdev->name, vasid, vinst->paste_base_addr, + vinst->paste_win_id_shift, vinst->virq, + vinst->irq_port); for_each_possible_cpu(cpu) { if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn)) @@ -86,6 +143,22 @@ static int init_vas_instance(struct platform_device *pdev) list_add(&vinst->node, &vas_instances); mutex_unlock(&vas_mutex); + spin_lock_init(&vinst->fault_lock); + /* + * IRQ and fault handling setup is needed only for user space + * send windows. + */ + if (vinst->virq) { + rc = vas_irq_fault_window_setup(vinst); + /* + * Fault window is used only for user space send windows. + * So if vinst->virq is NULL, tx_win_open returns -ENODEV + * for user space. 
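+ *
+ * In other words the open paths just test vinst->virq: clearing it
+ * below is enough to make user-space send window opens fail with
+ * -ENODEV on this chip, while kernel (NX) windows are unaffected.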
+ */ + if (rc) + vinst->virq = 0; + } + vas_instance_init_dbgdir(vinst); dev_set_drvdata(&pdev->dev, vinst); @@ -93,6 +166,7 @@ static int init_vas_instance(struct platform_device *pdev) return 0; free_vinst: + kfree(vinst->name); kfree(vinst); return -ENODEV; diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h index f5493dbdd7ff..08d9d3d5a22b 100644 --- a/arch/powerpc/platforms/powernv/vas.h +++ b/arch/powerpc/platforms/powernv/vas.h @@ -1,10 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright 2016-17 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #ifndef _VAS_H @@ -105,11 +101,9 @@ /* * Initial per-process credits. * Max send window credits: 4K-1 (12-bits in VAS_TX_WCRED) - * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED) * * TODO: Needs tuning for per-process credits */ -#define VAS_RX_WCREDS_MAX ((64 << 10) - 1) #define VAS_TX_WCREDS_MAX ((4 << 10) - 1) #define VAS_WCREDS_DEFAULT (1 << 10) @@ -300,6 +294,22 @@ enum vas_notify_after_count { }; /* + * NX can generate an interrupt for multiple faults and expects kernel + * to process all of them. So read all valid CRB entries until find the + * invalid one. So use pswid which is pasted by NX and ccw[0] (reserved + * bit in BE) to check valid CRB. CCW[0] will not be touched by user + * space. Application gets CRB formt error if it updates this bit. + * + * Invalidate FIFO during allocation and process all entries from last + * successful read until finds invalid pswid and ccw[0] values. + * After reading each CRB entry from fault FIFO, the kernel invalidate + * it by updating pswid with FIFO_INVALID_ENTRY and CCW[0] with + * CCW0_INVALID. + */ +#define FIFO_INVALID_ENTRY 0xffffffff +#define CCW0_INVALID 1 + +/* * One per instance of VAS. Each instance will have a separate set of * receive windows, one per coprocessor type. * @@ -317,39 +327,43 @@ struct vas_instance { u64 paste_base_addr; u64 paste_win_id_shift; + u64 irq_port; + int virq; + int fault_crbs; + int fault_fifo_size; + int fifo_in_progress; /* To wake up thread or return IRQ_HANDLED */ + spinlock_t fault_lock; /* Protects fifo_in_progress update */ + void *fault_fifo; + struct pnv_vas_window *fault_win; /* Fault window */ + struct mutex mutex; - struct vas_window *rxwin[VAS_COP_TYPE_MAX]; - struct vas_window *windows[VAS_WINDOWS_PER_CHIP]; + struct pnv_vas_window *rxwin[VAS_COP_TYPE_MAX]; + struct pnv_vas_window *windows[VAS_WINDOWS_PER_CHIP]; + char *name; char *dbgname; struct dentry *dbgdir; }; /* - * In-kernel state a VAS window. One per window. + * In-kernel state a VAS window on PowerNV. One per window. 
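+ *
+ * The generic window state lives in the embedded vas_win; platform code
+ * gets back to this wrapper from a struct vas_window pointer with, for
+ * example:
+ *
+ *	win = container_of(vwin, struct pnv_vas_window, vas_win);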
*/ -struct vas_window { +struct pnv_vas_window { + struct vas_window vas_win; /* Fields common to send and receive windows */ struct vas_instance *vinst; - int winid; bool tx_win; /* True if send window */ bool nx_win; /* True if NX window */ bool user_win; /* True if user space window */ void *hvwc_map; /* HV window context */ void *uwc_map; /* OS/User window context */ - pid_t pid; /* Linux process id of owner */ - int wcreds_max; /* Window credits */ - - char *dbgname; - struct dentry *dbgdir; /* Fields applicable only to send windows */ void *paste_kaddr; char *paste_addr_name; - struct vas_window *rxwin; + struct pnv_vas_window *rxwin; - /* Feilds applicable only to receive windows */ - enum vas_cop_type cop; + /* Fields applicable only to receive windows */ atomic_t num_txwins; }; @@ -362,7 +376,7 @@ struct vas_window { * is a container for the register fields in the window context. */ struct vas_winctx { - void *rx_fifo; + u64 rx_fifo; int rx_fifo_size; int wcreds_max; int rsvd_txbuf_count; @@ -408,19 +422,32 @@ extern struct mutex vas_mutex; extern struct vas_instance *find_vas_instance(int vasid); extern void vas_init_dbgdir(void); extern void vas_instance_init_dbgdir(struct vas_instance *vinst); -extern void vas_window_init_dbgdir(struct vas_window *win); -extern void vas_window_free_dbgdir(struct vas_window *win); +extern void vas_window_init_dbgdir(struct pnv_vas_window *win); +extern void vas_window_free_dbgdir(struct pnv_vas_window *win); +extern int vas_setup_fault_window(struct vas_instance *vinst); +extern irqreturn_t vas_fault_thread_fn(int irq, void *data); +extern irqreturn_t vas_fault_handler(int irq, void *dev_id); +extern void vas_return_credit(struct pnv_vas_window *window, bool tx); +extern struct pnv_vas_window *vas_pswid_to_window(struct vas_instance *vinst, + uint32_t pswid); +extern void vas_win_paste_addr(struct pnv_vas_window *window, u64 *addr, + int *len); + +static inline int vas_window_pid(struct vas_window *window) +{ + return pid_vnr(window->task_ref.pid); +} -static inline void vas_log_write(struct vas_window *win, char *name, +static inline void vas_log_write(struct pnv_vas_window *win, char *name, void *regptr, u64 val) { if (val) pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n", - win->tx_win ? "Tx" : "Rx", win->winid, name, - regptr, val); + win->tx_win ? "Tx" : "Rx", win->vas_win.winid, + name, regptr, val); } -static inline void write_uwc_reg(struct vas_window *win, char *name, +static inline void write_uwc_reg(struct pnv_vas_window *win, char *name, s32 reg, u64 val) { void *regptr; @@ -431,7 +458,7 @@ static inline void write_uwc_reg(struct vas_window *win, char *name, out_be64(regptr, val); } -static inline void write_hvwc_reg(struct vas_window *win, char *name, +static inline void write_hvwc_reg(struct pnv_vas_window *win, char *name, s32 reg, u64 val) { void *regptr; @@ -442,7 +469,7 @@ static inline void write_hvwc_reg(struct vas_window *win, char *name, out_be64(regptr, val); } -static inline u64 read_hvwc_reg(struct vas_window *win, +static inline u64 read_hvwc_reg(struct pnv_vas_window *win, char *name __maybe_unused, s32 reg) { return in_be64(win->hvwc_map+reg); @@ -460,12 +487,7 @@ static inline u64 read_hvwc_reg(struct vas_window *win, */ static inline u32 encode_pswid(int vasid, int winid) { - u32 pswid = 0; - - pswid |= vasid << (31 - 7); - pswid |= winid; - - return pswid; + return ((u32)winid | (vasid << (31 - 7))); } static inline void decode_pswid(u32 pswid, int *vasid, int *winid) |
