summaryrefslogtreecommitdiff
path: root/arch/powerpc/platforms/powernv
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig28
-rw-r--r--arch/powerpc/platforms/powernv/Makefile14
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c279
-rw-r--r--arch/powerpc/platforms/powernv/idle.c356
-rw-r--r--arch/powerpc/platforms/powernv/memtrace.c230
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c630
-rw-r--r--arch/powerpc/platforms/powernv/ocxl.c139
-rw-r--r--arch/powerpc/platforms/powernv/opal-async.c2
-rw-r--r--arch/powerpc/platforms/powernv/opal-call.c8
-rw-r--r--arch/powerpc/platforms/powernv/opal-core.c75
-rw-r--r--arch/powerpc/platforms/powernv/opal-dump.c59
-rw-r--r--arch/powerpc/platforms/powernv/opal-elog.c51
-rw-r--r--arch/powerpc/platforms/powernv/opal-fadump.c106
-rw-r--r--arch/powerpc/platforms/powernv/opal-fadump.h10
-rw-r--r--arch/powerpc/platforms/powernv/opal-flash.c6
-rw-r--r--arch/powerpc/platforms/powernv/opal-hmi.c29
-rw-r--r--arch/powerpc/platforms/powernv/opal-imc.c54
-rw-r--r--arch/powerpc/platforms/powernv/opal-irqchip.c19
-rw-r--r--arch/powerpc/platforms/powernv/opal-lpc.c9
-rw-r--r--arch/powerpc/platforms/powernv/opal-memory-errors.c2
-rw-r--r--arch/powerpc/platforms/powernv/opal-msglog.c6
-rw-r--r--arch/powerpc/platforms/powernv/opal-power.c2
-rw-r--r--arch/powerpc/platforms/powernv/opal-powercap.c14
-rw-r--r--arch/powerpc/platforms/powernv/opal-prd.c53
-rw-r--r--arch/powerpc/platforms/powernv/opal-psr.c6
-rw-r--r--arch/powerpc/platforms/powernv/opal-rtc.c5
-rw-r--r--arch/powerpc/platforms/powernv/opal-secvar.c62
-rw-r--r--arch/powerpc/platforms/powernv/opal-sensor-groups.c10
-rw-r--r--arch/powerpc/platforms/powernv/opal-sensor.c2
-rw-r--r--arch/powerpc/platforms/powernv/opal-tracepoints.c1
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S2
-rw-r--r--arch/powerpc/platforms/powernv/opal-xscom.c13
-rw-r--r--arch/powerpc/platforms/powernv/opal.c181
-rw-r--r--arch/powerpc/platforms/powernv/pci-cxl.c23
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda-tce.c39
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c2288
-rw-r--r--arch/powerpc/platforms/powernv/pci-sriov.c760
-rw-r--r--arch/powerpc/platforms/powernv/pci.c214
-rw-r--r--arch/powerpc/platforms/powernv/pci.h162
-rw-r--r--arch/powerpc/platforms/powernv/powernv.h13
-rw-r--r--arch/powerpc/platforms/powernv/rng.c138
-rw-r--r--arch/powerpc/platforms/powernv/setup.c123
-rw-r--r--arch/powerpc/platforms/powernv/smp.c14
-rw-r--r--arch/powerpc/platforms/powernv/subcore.c25
-rw-r--r--arch/powerpc/platforms/powernv/subcore.h2
-rw-r--r--arch/powerpc/platforms/powernv/ultravisor.c1
-rw-r--r--arch/powerpc/platforms/powernv/vas-debug.c64
-rw-r--r--arch/powerpc/platforms/powernv/vas-fault.c245
-rw-r--r--arch/powerpc/platforms/powernv/vas-trace.h4
-rw-r--r--arch/powerpc/platforms/powernv/vas-window.c385
-rw-r--r--arch/powerpc/platforms/powernv/vas.c90
-rw-r--r--arch/powerpc/platforms/powernv/vas.h94
52 files changed, 3548 insertions, 3599 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 938803eab0ad..70a46acc70d6 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -2,7 +2,7 @@
config PPC_POWERNV
depends on PPC64 && PPC_BOOK3S
bool "IBM PowerNV (Non-Virtualized) platform support"
- select PPC_NATIVE
+ select PPC_HASH_MMU_NATIVE if PPC_64S_HASH_MMU
select PPC_XICS
select PPC_ICP_NATIVE
select PPC_XIVE_NATIVE
@@ -12,40 +12,26 @@ config PPC_POWERNV
select EPAPR_BOOT
select PPC_INDIRECT_PIO
select PPC_UDBG_16550
- select ARCH_RANDOM
select CPU_FREQ
select PPC_DOORBELL
select MMU_NOTIFIER
select FORCE_SMP
+ select ARCH_SUPPORTS_PER_VMA_LOCK
default y
config OPAL_PRD
- tristate 'OPAL PRD driver'
+ tristate "OPAL PRD driver"
depends on PPC_POWERNV
help
This enables the opal-prd driver, a facility to run processor
recovery diagnostics on OpenPower machines
config PPC_MEMTRACE
- bool "Enable removal of RAM from kernel mappings for tracing"
- depends on PPC_POWERNV && MEMORY_HOTREMOVE
+ bool "Enable runtime allocation of RAM for tracing"
+ depends on PPC_POWERNV && MEMORY_HOTPLUG && CONTIG_ALLOC
help
- Enabling this option allows for the removal of memory (RAM)
- from the kernel mappings to be used for hardware tracing.
-
-config PPC_VAS
- bool "IBM Virtual Accelerator Switchboard (VAS)"
- depends on PPC_POWERNV && PPC_64K_PAGES
- default y
- help
- This enables support for IBM Virtual Accelerator Switchboard (VAS).
-
- VAS allows accelerators in co-processors like NX-GZIP and NX-842
- to be accessible to kernel subsystems and user processes.
-
- VAS adapters are found in POWER9 based systems.
-
- If unsure, say N.
+ Enabling this option allows for runtime allocation of memory (RAM)
+ for hardware tracing.
config SCOM_DEBUGFS
bool "Expose SCOM controllers via debugfs"
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index c0f8120045c3..19f0fc5c6f1b 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,4 +1,13 @@
# SPDX-License-Identifier: GPL-2.0
+
+# nothing that deals with real mode is safe to KASAN
+# in particular, idle code runs a bunch of things in real mode
+KASAN_SANITIZE_idle.o := n
+KASAN_SANITIZE_pci-ioda.o := n
+KASAN_SANITIZE_pci-ioda-tce.o := n
+# pnv_machine_check_early
+KASAN_SANITIZE_setup.o := n
+
obj-y += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o
obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
@@ -10,14 +19,15 @@ obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
obj-$(CONFIG_FA_DUMP) += opal-fadump.o
obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o
obj-$(CONFIG_OPAL_CORE) += opal-core.o
-obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
+obj-$(CONFIG_PCI) += pci.o pci-ioda.o pci-ioda-tce.o
+obj-$(CONFIG_PCI_IOV) += pci-sriov.o
obj-$(CONFIG_CXL_BASE) += pci-cxl.o
obj-$(CONFIG_EEH) += eeh-powernv.o
obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
obj-$(CONFIG_OPAL_PRD) += opal-prd.o
obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
-obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o
+obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o vas-fault.o
obj-$(CONFIG_OCXL_BASE) += ocxl.o
obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6f300ab7f0e9..af3a5d37a149 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -11,6 +11,7 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
#include <linux/list.h>
#include <linux/msi.h>
#include <linux/of.h>
@@ -38,65 +39,10 @@
static int eeh_event_irq = -EINVAL;
-void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
+static void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
{
- struct pci_dn *pdn = pci_get_pdn(pdev);
-
- if (!pdn || eeh_has_flag(EEH_FORCE_DISABLED))
- return;
-
dev_dbg(&pdev->dev, "EEH: Setting up device\n");
- eeh_add_device_early(pdn);
- eeh_add_device_late(pdev);
- eeh_sysfs_add_device(pdev);
-}
-
-static int pnv_eeh_init(void)
-{
- struct pci_controller *hose;
- struct pnv_phb *phb;
- int max_diag_size = PNV_PCI_DIAG_BUF_SIZE;
-
- if (!firmware_has_feature(FW_FEATURE_OPAL)) {
- pr_warn("%s: OPAL is required !\n",
- __func__);
- return -EINVAL;
- }
-
- /* Set probe mode */
- eeh_add_flag(EEH_PROBE_MODE_DEV);
-
- /*
- * P7IOC blocks PCI config access to frozen PE, but PHB3
- * doesn't do that. So we have to selectively enable I/O
- * prior to collecting error log.
- */
- list_for_each_entry(hose, &hose_list, list_node) {
- phb = hose->private_data;
-
- if (phb->model == PNV_PHB_MODEL_P7IOC)
- eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
-
- if (phb->diag_data_size > max_diag_size)
- max_diag_size = phb->diag_data_size;
-
- /*
- * PE#0 should be regarded as valid by EEH core
- * if it's not the reserved one. Currently, we
- * have the reserved PE#255 and PE#127 for PHB3
- * and P7IOC separately. So we should regard
- * PE#0 as valid for PHB3 and P7IOC.
- */
- if (phb->ioda.reserved_pe_idx != 0)
- eeh_add_flag(EEH_VALID_PE_ZERO);
-
- break;
- }
-
- eeh_set_pe_aux_size(max_diag_size);
- ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device;
-
- return 0;
+ eeh_probe_device(pdev);
}
static irqreturn_t pnv_eeh_event(int irq, void *data)
@@ -142,7 +88,7 @@ static ssize_t pnv_eeh_ei_write(struct file *filp,
return -EINVAL;
/* Retrieve PE */
- pe = eeh_pe_get(hose, pe_no, 0);
+ pe = eeh_pe_get(hose, pe_no);
if (!pe)
return -ENODEV;
@@ -197,7 +143,7 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
#endif /* CONFIG_DEBUG_FS */
-void pnv_eeh_enable_phbs(void)
+static void pnv_eeh_enable_phbs(void)
{
struct pci_controller *hose;
struct pnv_phb *phb;
@@ -345,28 +291,41 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
return 0;
}
+static struct eeh_pe *pnv_eeh_get_upstream_pe(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pdev->bus->sysdata;
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dev *parent = pdev->bus->self;
+
+#ifdef CONFIG_PCI_IOV
+ /* for VFs we use the PF's PE as the upstream PE */
+ if (pdev->is_virtfn)
+ parent = pdev->physfn;
+#endif
+
+ /* otherwise use the PE of our parent bridge */
+ if (parent) {
+ struct pnv_ioda_pe *ioda_pe = pnv_ioda_get_pe(parent);
+
+ return eeh_pe_get(phb->hose, ioda_pe->pe_number);
+ }
+
+ return NULL;
+}
+
/**
* pnv_eeh_probe - Do probe on PCI device
- * @pdn: PCI device node
- * @data: unused
+ * @pdev: pci_dev to probe
*
- * When EEH module is installed during system boot, all PCI devices
- * are checked one by one to see if it supports EEH. The function
- * is introduced for the purpose. By default, EEH has been enabled
- * on all PCI devices. That's to say, we only need do necessary
- * initialization on the corresponding eeh device and create PE
- * accordingly.
- *
- * It's notable that's unsafe to retrieve the EEH device through
- * the corresponding PCI device. During the PCI device hotplug, which
- * was possiblly triggered by EEH core, the binding between EEH device
- * and the PCI device isn't built yet.
+ * Create, or find the existing, eeh_dev for this pci_dev.
*/
-static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
+static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev)
{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
struct pci_controller *hose = pdn->phb;
struct pnv_phb *phb = hose->private_data;
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ struct eeh_pe *upstream_pe;
uint32_t pcie_flags;
int ret;
int config_addr = (pdn->busno << 8) | (pdn->devfn);
@@ -380,20 +339,27 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
if (!edev || edev->pe)
return NULL;
+ /* already configured? */
+ if (edev->pdev) {
+ pr_debug("%s: found existing edev for %04x:%02x:%02x.%01x\n",
+ __func__, hose->global_number, config_addr >> 8,
+ PCI_SLOT(config_addr), PCI_FUNC(config_addr));
+ return edev;
+ }
+
/* Skip for PCI-ISA bridge */
- if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
+ if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
eeh_edev_dbg(edev, "Probing device\n");
/* Initialize eeh device */
- edev->class_code = pdn->class_code;
edev->mode &= 0xFFFFFF00;
edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
edev->af_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF);
edev->aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
- if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
+ if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
edev->mode |= EEH_DEV_BRIDGE;
if (edev->pcie_cap) {
pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
@@ -408,8 +374,10 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
edev->pe_config_addr = phb->ioda.pe_rmap[config_addr];
+ upstream_pe = pnv_eeh_get_upstream_pe(pdev);
+
/* Create PE */
- ret = eeh_add_to_parent_pe(edev);
+ ret = eeh_pe_tree_insert(edev, upstream_pe);
if (ret) {
eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
return NULL;
@@ -423,7 +391,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
* should be blocked until PE reset. MMIO access is dropped
* by hardware certainly. In order to drop PCI config requests,
* one more flag (EEH_PE_CFG_RESTRICTED) is introduced, which
- * will be checked in the backend for PE state retrival. If
+ * will be checked in the backend for PE state retrieval. If
* the PE becomes frozen for the first time and the flag has
* been set for the PE, we will set EEH_PE_CFG_BLOCKED for
* that PE to block its config space.
@@ -471,7 +439,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
eeh_edev_dbg(edev, "EEH enabled on device\n");
- return NULL;
+ return edev;
}
/**
@@ -544,18 +512,6 @@ static int pnv_eeh_set_option(struct eeh_pe *pe, int option)
return 0;
}
-/**
- * pnv_eeh_get_pe_addr - Retrieve PE address
- * @pe: EEH PE
- *
- * Retrieve the PE address according to the given tranditional
- * PCI BDF (Bus/Device/Function) address.
- */
-static int pnv_eeh_get_pe_addr(struct eeh_pe *pe)
-{
- return pe->addr;
-}
-
static void pnv_eeh_get_phb_diag(struct eeh_pe *pe)
{
struct pnv_phb *phb = pe->phb->private_data;
@@ -859,32 +815,32 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
case EEH_RESET_HOT:
/* Don't report linkDown event */
if (aer) {
- eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK,
4, &ctrl);
ctrl |= PCI_ERR_UNC_SURPDN;
- eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK,
4, ctrl);
}
- eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+ eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl);
ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
- eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl);
+ eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl);
msleep(EEH_PE_RST_HOLD_TIME);
break;
case EEH_RESET_DEACTIVATE:
- eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+ eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl);
ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
- eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl);
+ eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl);
msleep(EEH_PE_RST_SETTLE_TIME);
/* Continue reporting linkDown event */
if (aer) {
- eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK,
4, &ctrl);
ctrl &= ~PCI_ERR_UNC_SURPDN;
- eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK,
4, ctrl);
}
@@ -899,8 +855,7 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option)
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
struct pnv_phb *phb = hose->private_data;
struct device_node *dn = pci_device_to_OF_node(pdev);
- uint64_t id = PCI_SLOT_ID(phb->opal_id,
- (pdev->bus->number << 8) | pdev->devfn);
+ uint64_t id = PCI_SLOT_ID(phb->opal_id, pci_dev_id(pdev));
uint8_t scope;
int64_t rc;
@@ -953,11 +908,12 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type,
int pos, u16 mask)
{
+ struct eeh_dev *edev = pdn->edev;
int i, status = 0;
/* Wait for Transaction Pending bit to be cleared */
for (i = 0; i < 4; i++) {
- eeh_ops->read_config(pdn, pos, 2, &status);
+ eeh_ops->read_config(edev, pos, 2, &status);
if (!(status & mask))
return;
@@ -978,7 +934,7 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option)
if (WARN_ON(!edev->pcie_cap))
return -ENOTTY;
- eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg);
+ eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg);
if (!(reg & PCI_EXP_DEVCAP_FLR))
return -ENOTTY;
@@ -988,18 +944,18 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option)
pnv_eeh_wait_for_pending(pdn, "",
edev->pcie_cap + PCI_EXP_DEVSTA,
PCI_EXP_DEVSTA_TRPND);
- eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
4, &reg);
reg |= PCI_EXP_DEVCTL_BCR_FLR;
- eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
4, reg);
msleep(EEH_PE_RST_HOLD_TIME);
break;
case EEH_RESET_DEACTIVATE:
- eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
4, &reg);
reg &= ~PCI_EXP_DEVCTL_BCR_FLR;
- eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
4, reg);
msleep(EEH_PE_RST_SETTLE_TIME);
break;
@@ -1016,7 +972,7 @@ static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option)
if (WARN_ON(!edev->af_cap))
return -ENOTTY;
- eeh_ops->read_config(pdn, edev->af_cap + PCI_AF_CAP, 1, &cap);
+ eeh_ops->read_config(edev, edev->af_cap + PCI_AF_CAP, 1, &cap);
if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR))
return -ENOTTY;
@@ -1025,18 +981,18 @@ static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option)
case EEH_RESET_FUNDAMENTAL:
/*
* Wait for Transaction Pending bit to clear. A word-aligned
- * test is used, so we use the conrol offset rather than status
+ * test is used, so we use the control offset rather than status
* and shift the test bit to match.
*/
pnv_eeh_wait_for_pending(pdn, "AF",
edev->af_cap + PCI_AF_CTRL,
PCI_AF_STATUS_TP << 8);
- eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL,
+ eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL,
1, PCI_AF_CTRL_FLR);
msleep(EEH_PE_RST_HOLD_TIME);
break;
case EEH_RESET_DEACTIVATE:
- eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, 1, 0);
+ eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL, 1, 0);
msleep(EEH_PE_RST_SETTLE_TIME);
break;
}
@@ -1092,7 +1048,7 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
* frozen state during PE reset. However, the good idea here from
* benh is to keep frozen state before we get PE reset done completely
* (until BAR restore). With the frozen state, HW drops illegal IO
- * or MMIO access, which can incur recrusive frozen PE during PE
+ * or MMIO access, which can incur recursive frozen PE during PE
* reset. The side effect is that EEH core has to clear the frozen
* state explicitly after BAR restore.
*/
@@ -1139,8 +1095,8 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
* bus is behind a hotplug slot and it will use the slot provided
* reset methods to prevent spurious hotplug events during the reset.
*
- * Fundemental resets need to be handled internally to EEH since the
- * PCI core doesn't really have a concept of a fundemental reset,
+ * Fundamental resets need to be handled internally to EEH since the
+ * PCI core doesn't really have a concept of a fundamental reset,
* mainly because there's no standard way to generate one. Only a
* few devices require an FRESET so it should be fine.
*/
@@ -1270,9 +1226,11 @@ static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn)
return false;
}
-static int pnv_eeh_read_config(struct pci_dn *pdn,
+static int pnv_eeh_read_config(struct eeh_dev *edev,
int where, int size, u32 *val)
{
+ struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
if (!pdn)
return PCIBIOS_DEVICE_NOT_FOUND;
@@ -1284,9 +1242,11 @@ static int pnv_eeh_read_config(struct pci_dn *pdn,
return pnv_pci_cfg_read(pdn, where, size, val);
}
-static int pnv_eeh_write_config(struct pci_dn *pdn,
+static int pnv_eeh_write_config(struct eeh_dev *edev,
int where, int size, u32 val)
{
+ struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
if (!pdn)
return PCIBIOS_DEVICE_NOT_FOUND;
@@ -1398,7 +1358,7 @@ static int pnv_eeh_get_pe(struct pci_controller *hose,
}
/* Find the PE according to PE# */
- dev_pe = eeh_pe_get(hose, pe_no, 0);
+ dev_pe = eeh_pe_get(hose, pe_no);
if (!dev_pe)
return -EEXIST;
@@ -1640,34 +1600,24 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
return ret;
}
-static int pnv_eeh_restore_config(struct pci_dn *pdn)
+static int pnv_eeh_restore_config(struct eeh_dev *edev)
{
- struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
struct pnv_phb *phb;
s64 ret = 0;
- int config_addr = (pdn->busno << 8) | (pdn->devfn);
if (!edev)
return -EEXIST;
- /*
- * We have to restore the PCI config space after reset since the
- * firmware can't see SRIOV VFs.
- *
- * FIXME: The MPS, error routing rules, timeout setting are worthy
- * to be exported by firmware in extendible way.
- */
- if (edev->physfn) {
- ret = eeh_restore_vf_config(pdn);
- } else {
- phb = pdn->phb->private_data;
- ret = opal_pci_reinit(phb->opal_id,
- OPAL_REINIT_PCI_DEV, config_addr);
- }
+ if (edev->physfn)
+ return 0;
+
+ phb = edev->controller->private_data;
+ ret = opal_pci_reinit(phb->opal_id,
+ OPAL_REINIT_PCI_DEV, edev->bdfn);
if (ret) {
pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
- __func__, config_addr, ret);
+ __func__, edev->bdfn, ret);
return -EIO;
}
@@ -1676,10 +1626,8 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
static struct eeh_ops pnv_eeh_ops = {
.name = "powernv",
- .init = pnv_eeh_init,
.probe = pnv_eeh_probe,
.set_option = pnv_eeh_set_option,
- .get_pe_addr = pnv_eeh_get_pe_addr,
.get_state = pnv_eeh_get_state,
.reset = pnv_eeh_reset,
.get_log = pnv_eeh_get_log,
@@ -1692,24 +1640,6 @@ static struct eeh_ops pnv_eeh_ops = {
.notify_resume = NULL
};
-#ifdef CONFIG_PCI_IOV
-static void pnv_pci_fixup_vf_mps(struct pci_dev *pdev)
-{
- struct pci_dn *pdn = pci_get_pdn(pdev);
- int parent_mps;
-
- if (!pdev->is_virtfn)
- return;
-
- /* Synchronize MPS for VF and PF */
- parent_mps = pcie_get_mps(pdev->physfn);
- if ((128 << pdev->pcie_mpss) >= parent_mps)
- pcie_set_mps(pdev, parent_mps);
- pdn->mps = pcie_get_mps(pdev);
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps);
-#endif /* CONFIG_PCI_IOV */
-
/**
* eeh_powernv_init - Register platform dependent EEH operations
*
@@ -1718,9 +1648,44 @@ DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps);
*/
static int __init eeh_powernv_init(void)
{
+ int max_diag_size = PNV_PCI_DIAG_BUF_SIZE;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
int ret = -EINVAL;
- ret = eeh_ops_register(&pnv_eeh_ops);
+ if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+ pr_warn("%s: OPAL is required !\n", __func__);
+ return -EINVAL;
+ }
+
+ /* Set probe mode */
+ eeh_add_flag(EEH_PROBE_MODE_DEV);
+
+ /*
+ * P7IOC blocks PCI config access to frozen PE, but PHB3
+ * doesn't do that. So we have to selectively enable I/O
+ * prior to collecting error log.
+ */
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ if (phb->model == PNV_PHB_MODEL_P7IOC)
+ eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
+
+ if (phb->diag_data_size > max_diag_size)
+ max_diag_size = phb->diag_data_size;
+
+ break;
+ }
+
+ /*
+ * eeh_init() allocates the eeh_pe and its aux data buf so the
+ * size needs to be set before calling eeh_init().
+ */
+ eeh_set_pe_aux_size(max_diag_size);
+ ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device;
+
+ ret = eeh_init(&pnv_eeh_ops);
if (!ret)
pr_info("EEH: PowerNV platform initialized\n");
else
@@ -1728,4 +1693,4 @@ static int __init eeh_powernv_init(void)
return ret;
}
-machine_early_initcall(powernv, eeh_powernv_init);
+machine_arch_initcall(powernv, eeh_powernv_init);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..ad41dffe4d92 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -12,8 +12,8 @@
#include <linux/device.h>
#include <linux/cpu.h>
-#include <asm/asm-prototypes.h>
#include <asm/firmware.h>
+#include <asm/interrupt.h>
#include <asm/machdep.h>
#include <asm/opal.h>
#include <asm/cputhreads.h>
@@ -48,7 +48,7 @@ static bool default_stop_found;
* First stop state levels when SPR and TB loss can occur.
*/
static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
-static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
+static u64 deep_spr_loss_state = MAX_STOP_STATE + 1;
/*
* psscr value and mask of the deepest stop idle state.
@@ -61,7 +61,7 @@ static bool deepest_stop_found;
static unsigned long power7_offline_type;
-static int pnv_save_sprs_for_deep_states(void)
+static int __init pnv_save_sprs_for_deep_states(void)
{
int cpu;
int rc;
@@ -73,9 +73,6 @@ static int pnv_save_sprs_for_deep_states(void)
*/
uint64_t lpcr_val = mfspr(SPRN_LPCR);
uint64_t hid0_val = mfspr(SPRN_HID0);
- uint64_t hid1_val = mfspr(SPRN_HID1);
- uint64_t hid4_val = mfspr(SPRN_HID4);
- uint64_t hid5_val = mfspr(SPRN_HID5);
uint64_t hmeer_val = mfspr(SPRN_HMEER);
uint64_t msr_val = MSR_IDLE;
uint64_t psscr_val = pnv_deepest_stop_psscr_val;
@@ -115,8 +112,11 @@ static int pnv_save_sprs_for_deep_states(void)
if (rc != 0)
return rc;
- /* Only p8 needs to set extra HID regiters */
+ /* Only p8 needs to set extra HID registers */
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+ uint64_t hid1_val = mfspr(SPRN_HID1);
+ uint64_t hid4_val = mfspr(SPRN_HID4);
+ uint64_t hid5_val = mfspr(SPRN_HID5);
rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
if (rc != 0)
@@ -145,9 +145,13 @@ EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
static void pnv_fastsleep_workaround_apply(void *info)
{
+ int cpu = smp_processor_id();
int rc;
int *err = info;
+ if (cpu_first_thread_sibling(cpu) != cpu)
+ return;
+
rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
OPAL_CONFIG_IDLE_APPLY);
if (rc)
@@ -174,7 +178,6 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
struct device_attribute *attr, const char *buf,
size_t count)
{
- cpumask_t primary_thread_mask;
int err;
u8 val;
@@ -198,12 +201,9 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
*/
power7_fastsleep_workaround_exit = false;
- get_online_cpus();
- primary_thread_mask = cpu_online_cores_map();
- on_each_cpu_mask(&primary_thread_mask,
- pnv_fastsleep_workaround_apply,
- &err, 1);
- put_online_cpus();
+ cpus_read_lock();
+ on_each_cpu(pnv_fastsleep_workaround_apply, &err, 1);
+ cpus_read_unlock();
if (err) {
pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply");
goto fail;
@@ -246,9 +246,9 @@ static inline void atomic_lock_thread_idle(void)
{
int cpu = raw_smp_processor_id();
int first = cpu_first_thread_sibling(cpu);
- unsigned long *state = &paca_ptrs[first]->idle_state;
+ unsigned long *lock = &paca_ptrs[first]->idle_lock;
- while (unlikely(test_and_set_bit_lock(NR_PNV_CORE_IDLE_LOCK_BIT, state)))
+ while (unlikely(test_and_set_bit_lock(NR_PNV_CORE_IDLE_LOCK_BIT, lock)))
barrier();
}
@@ -258,29 +258,31 @@ static inline void atomic_unlock_and_stop_thread_idle(void)
int first = cpu_first_thread_sibling(cpu);
unsigned long thread = 1UL << cpu_thread_in_core(cpu);
unsigned long *state = &paca_ptrs[first]->idle_state;
+ unsigned long *lock = &paca_ptrs[first]->idle_lock;
u64 s = READ_ONCE(*state);
u64 new, tmp;
- BUG_ON(!(s & PNV_CORE_IDLE_LOCK_BIT));
+ BUG_ON(!(READ_ONCE(*lock) & PNV_CORE_IDLE_LOCK_BIT));
BUG_ON(s & thread);
again:
- new = (s | thread) & ~PNV_CORE_IDLE_LOCK_BIT;
+ new = s | thread;
tmp = cmpxchg(state, s, new);
if (unlikely(tmp != s)) {
s = tmp;
goto again;
}
+ clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, lock);
}
static inline void atomic_unlock_thread_idle(void)
{
int cpu = raw_smp_processor_id();
int first = cpu_first_thread_sibling(cpu);
- unsigned long *state = &paca_ptrs[first]->idle_state;
+ unsigned long *lock = &paca_ptrs[first]->idle_lock;
- BUG_ON(!test_bit(NR_PNV_CORE_IDLE_LOCK_BIT, state));
- clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, state);
+ BUG_ON(!test_bit(NR_PNV_CORE_IDLE_LOCK_BIT, lock));
+ clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, lock);
}
/* P7 and P8 */
@@ -305,8 +307,8 @@ struct p7_sprs {
/* per thread SPRs that get lost in shallow states */
u64 amr;
u64 iamr;
- u64 amor;
u64 uamor;
+ /* amor is restored to constant ~0 */
};
static unsigned long power7_idle_insn(unsigned long type)
@@ -377,7 +379,6 @@ static unsigned long power7_idle_insn(unsigned long type)
if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
sprs.amr = mfspr(SPRN_AMR);
sprs.iamr = mfspr(SPRN_IAMR);
- sprs.amor = mfspr(SPRN_AMOR);
sprs.uamor = mfspr(SPRN_UAMOR);
}
@@ -396,7 +397,7 @@ static unsigned long power7_idle_insn(unsigned long type)
*/
mtspr(SPRN_AMR, sprs.amr);
mtspr(SPRN_IAMR, sprs.iamr);
- mtspr(SPRN_AMOR, sprs.amor);
+ mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_UAMOR, sprs.uamor);
}
}
@@ -491,12 +492,14 @@ subcore_woken:
mtspr(SPRN_SPRG3, local_paca->sprg_vdso);
+#ifdef CONFIG_PPC_64S_HASH_MMU
/*
* The SLB has to be restored here, but it sometimes still
* contains entries, so the __ variant must be used to prevent
* multi hits.
*/
__slb_restore_bolted_realmode();
+#endif
return srr1;
}
@@ -565,7 +568,7 @@ void power7_idle_type(unsigned long type)
irq_set_pending_from_srr1(srr1);
}
-void power7_idle(void)
+static void power7_idle(void)
{
if (!powersave_nap)
return;
@@ -588,7 +591,7 @@ struct p9_sprs {
u64 purr;
u64 spurr;
u64 dscr;
- u64 wort;
+ u64 ciabr;
u64 mmcra;
u32 mmcr0;
@@ -602,7 +605,7 @@ struct p9_sprs {
u64 uamor;
};
-static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
+static unsigned long power9_idle_stop(unsigned long psscr)
{
int cpu = raw_smp_processor_id();
int first = cpu_first_thread_sibling(cpu);
@@ -611,14 +614,13 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
unsigned long srr1;
unsigned long pls;
unsigned long mmcr0 = 0;
+ unsigned long mmcra = 0;
struct p9_sprs sprs = {}; /* avoid false used-uninitialised */
bool sprs_saved = false;
if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
/* EC=ESL=0 case */
- BUG_ON(!mmu_on);
-
/*
* Wake synchronously. SRESET via xscom may still cause
* a 0x100 powersave wakeup with SRR1 reason!
@@ -657,7 +659,8 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
*/
mmcr0 = mfspr(SPRN_MMCR0);
}
- if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) {
+
+ if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) {
sprs.lpcr = mfspr(SPRN_LPCR);
sprs.hfscr = mfspr(SPRN_HFSCR);
sprs.fscr = mfspr(SPRN_FSCR);
@@ -665,7 +668,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
sprs.purr = mfspr(SPRN_PURR);
sprs.spurr = mfspr(SPRN_SPURR);
sprs.dscr = mfspr(SPRN_DSCR);
- sprs.wort = mfspr(SPRN_WORT);
+ sprs.ciabr = mfspr(SPRN_CIABR);
sprs.mmcra = mfspr(SPRN_MMCRA);
sprs.mmcr0 = mfspr(SPRN_MMCR0);
@@ -685,7 +688,6 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
sprs.amr = mfspr(SPRN_AMR);
sprs.iamr = mfspr(SPRN_IAMR);
- sprs.amor = mfspr(SPRN_AMOR);
sprs.uamor = mfspr(SPRN_UAMOR);
srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */
@@ -700,15 +702,13 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
- unsigned long mmcra;
-
/*
* We don't need an isync after the mtsprs here because the
* upcoming mtmsrd is execution synchronizing.
*/
mtspr(SPRN_AMR, sprs.amr);
mtspr(SPRN_IAMR, sprs.iamr);
- mtspr(SPRN_AMOR, sprs.amor);
+ mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_UAMOR, sprs.uamor);
/*
@@ -741,7 +741,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
* just always test PSSCR for SPR/TB state loss.
*/
pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
- if (likely(pls < pnv_first_spr_loss_level)) {
+ if (likely(pls < deep_spr_loss_state)) {
if (sprs_saved)
atomic_stop_thread_idle();
goto out;
@@ -784,7 +784,7 @@ core_woken:
mtspr(SPRN_PURR, sprs.purr);
mtspr(SPRN_SPURR, sprs.spurr);
mtspr(SPRN_DSCR, sprs.dscr);
- mtspr(SPRN_WORT, sprs.wort);
+ mtspr(SPRN_CIABR, sprs.ciabr);
mtspr(SPRN_MMCRA, sprs.mmcra);
mtspr(SPRN_MMCR0, sprs.mmcr0);
@@ -799,78 +799,10 @@ core_woken:
__slb_restore_bolted_realmode();
out:
- if (mmu_on)
- mtmsr(MSR_KERNEL);
-
- return srr1;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static unsigned long power9_offline_stop(unsigned long psscr)
-{
- unsigned long srr1;
-
-#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- __ppc64_runlatch_off();
- srr1 = power9_idle_stop(psscr, true);
- __ppc64_runlatch_on();
-#else
- /*
- * Tell KVM we're entering idle.
- * This does not have to be done in real mode because the P9 MMU
- * is independent per-thread. Some steppings share radix/hash mode
- * between threads, but in that case KVM has a barrier sync in real
- * mode before and after switching between radix and hash.
- *
- * kvm_start_guest must still be called in real mode though, hence
- * the false argument.
- */
- local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
-
- __ppc64_runlatch_off();
- srr1 = power9_idle_stop(psscr, false);
- __ppc64_runlatch_on();
-
- local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
- /* Order setting hwthread_state vs. testing hwthread_req */
- smp_mb();
- if (local_paca->kvm_hstate.hwthread_req)
- srr1 = idle_kvm_start_guest(srr1);
mtmsr(MSR_KERNEL);
-#endif
return srr1;
}
-#endif
-
-void power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask)
-{
- unsigned long psscr;
- unsigned long srr1;
-
- if (!prep_irq_for_idle_irqsoff())
- return;
-
- psscr = mfspr(SPRN_PSSCR);
- psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
-
- __ppc64_runlatch_off();
- srr1 = power9_idle_stop(psscr, true);
- __ppc64_runlatch_on();
-
- fini_irq_for_idle_irqsoff();
-
- irq_set_pending_from_srr1(srr1);
-}
-
-/*
- * Used for ppc_md.power_save which needs a function with no parameters
- */
-void power9_idle(void)
-{
- power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
-}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/*
@@ -944,6 +876,165 @@ void pnv_power9_force_smt4_release(void)
EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+struct p10_sprs {
+ /*
+ * SPRs that get lost in shallow states:
+ *
+ * P10 loses CR, LR, CTR, FPSCR, VSCR, XER, TAR, SPRG2, and HSPRG1
+ * isa300 idle routines restore CR, LR.
+ * CTR is volatile
+ * idle thread doesn't use FP or VEC
+ * kernel doesn't use TAR
+ * HSPRG1 is only live in HV interrupt entry
+ * SPRG2 is only live in KVM guests, KVM handles it.
+ */
+};
+
+static unsigned long power10_idle_stop(unsigned long psscr)
+{
+ int cpu = raw_smp_processor_id();
+ int first = cpu_first_thread_sibling(cpu);
+ unsigned long *state = &paca_ptrs[first]->idle_state;
+ unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
+ unsigned long srr1;
+ unsigned long pls;
+// struct p10_sprs sprs = {}; /* avoid false used-uninitialised */
+ bool sprs_saved = false;
+
+ if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
+ /* EC=ESL=0 case */
+
+ /*
+ * Wake synchronously. SRESET via xscom may still cause
+ * a 0x100 powersave wakeup with SRR1 reason!
+ */
+ srr1 = isa300_idle_stop_noloss(psscr); /* go idle */
+ if (likely(!srr1))
+ return 0;
+
+ /*
+ * Registers not saved, can't recover!
+ * This would be a hardware bug
+ */
+ BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS);
+
+ goto out;
+ }
+
+ /* EC=ESL=1 case */
+ if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) {
+ /* XXX: save SPRs for deep state loss here. */
+
+ sprs_saved = true;
+
+ atomic_start_thread_idle();
+ }
+
+ srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */
+
+ psscr = mfspr(SPRN_PSSCR);
+
+ WARN_ON_ONCE(!srr1);
+ WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
+
+ if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
+ hmi_exception_realmode(NULL);
+
+ /*
+ * On POWER10, SRR1 bits do not match exactly as expected.
+ * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so
+ * just always test PSSCR for SPR/TB state loss.
+ */
+ pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
+ if (likely(pls < deep_spr_loss_state)) {
+ if (sprs_saved)
+ atomic_stop_thread_idle();
+ goto out;
+ }
+
+ /* HV state loss */
+ BUG_ON(!sprs_saved);
+
+ atomic_lock_thread_idle();
+
+ if ((*state & core_thread_mask) != 0)
+ goto core_woken;
+
+ /* XXX: restore per-core SPRs here */
+
+ if (pls >= pnv_first_tb_loss_level) {
+ /* TB loss */
+ if (opal_resync_timebase() != OPAL_SUCCESS)
+ BUG();
+ }
+
+ /*
+ * isync after restoring shared SPRs and before unlocking. Unlock
+ * only contains hwsync which does not necessarily do the right
+ * thing for SPRs.
+ */
+ isync();
+
+core_woken:
+ atomic_unlock_and_stop_thread_idle();
+
+ /* XXX: restore per-thread SPRs here */
+
+ if (!radix_enabled())
+ __slb_restore_bolted_realmode();
+
+out:
+ mtmsr(MSR_KERNEL);
+
+ return srr1;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static unsigned long arch300_offline_stop(unsigned long psscr)
+{
+ unsigned long srr1;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ srr1 = power10_idle_stop(psscr);
+ else
+ srr1 = power9_idle_stop(psscr);
+
+ return srr1;
+}
+#endif
+
+void arch300_idle_type(unsigned long stop_psscr_val,
+ unsigned long stop_psscr_mask)
+{
+ unsigned long psscr;
+ unsigned long srr1;
+
+ if (!prep_irq_for_idle_irqsoff())
+ return;
+
+ psscr = mfspr(SPRN_PSSCR);
+ psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
+
+ __ppc64_runlatch_off();
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ srr1 = power10_idle_stop(psscr);
+ else
+ srr1 = power9_idle_stop(psscr);
+ __ppc64_runlatch_on();
+
+ fini_irq_for_idle_irqsoff();
+
+ irq_set_pending_from_srr1(srr1);
+}
+
+/*
+ * Used for ppc_md.power_save which needs a function with no parameters
+ */
+static void arch300_idle(void)
+{
+ arch300_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
@@ -977,7 +1068,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
psscr = mfspr(SPRN_PSSCR);
psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
pnv_deepest_stop_psscr_val;
- srr1 = power9_offline_stop(psscr);
+ srr1 = arch300_offline_stop(psscr);
} else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) {
srr1 = power7_offline();
} else {
@@ -1033,7 +1124,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
* stop instruction
*/
-int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
+int __init validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
{
int err = 0;
@@ -1075,11 +1166,15 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
* @dt_idle_states: Number of idle state entries
* Returns 0 on success
*/
-static void __init pnv_power9_idle_init(void)
+static void __init pnv_arch300_idle_init(void)
{
u64 max_residency_ns = 0;
int i;
+ /* stop is not really architected, we only have p9,p10 drivers */
+ if (!pvr_version_is(PVR_POWER10) && !pvr_version_is(PVR_POWER9))
+ return;
+
/*
* pnv_deepest_stop_{val,mask} should be set to values corresponding to
* the deepest stop state.
@@ -1088,31 +1183,36 @@ static void __init pnv_power9_idle_init(void)
* the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
*/
pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
- pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
+ deep_spr_loss_state = MAX_STOP_STATE + 1;
for (i = 0; i < nr_pnv_idle_states; i++) {
int err;
struct pnv_idle_states_t *state = &pnv_idle_states[i];
u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;
+ /* No deep loss driver implemented for POWER10 yet */
+ if (pvr_version_is(PVR_POWER10) &&
+ state->flags & (OPAL_PM_TIMEBASE_STOP|OPAL_PM_LOSE_FULL_CONTEXT))
+ continue;
+
if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
(pnv_first_tb_loss_level > psscr_rl))
pnv_first_tb_loss_level = psscr_rl;
if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
- (pnv_first_spr_loss_level > psscr_rl))
- pnv_first_spr_loss_level = psscr_rl;
+ (deep_spr_loss_state > psscr_rl))
+ deep_spr_loss_state = psscr_rl;
/*
* The idle code does not deal with TB loss occurring
* in a shallower state than SPR loss, so force it to
* behave like SPRs are lost if TB is lost. POWER9 would
- * never encouter this, but a POWER8 core would if it
+ * never encounter this, but a POWER8 core would if it
* implemented the stop instruction. So this is for forward
* compatibility.
*/
if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
- (pnv_first_spr_loss_level > psscr_rl))
- pnv_first_spr_loss_level = psscr_rl;
+ (deep_spr_loss_state > psscr_rl))
+ deep_spr_loss_state = psscr_rl;
err = validate_psscr_val_mask(&state->psscr_val,
&state->psscr_mask,
@@ -1144,7 +1244,7 @@ static void __init pnv_power9_idle_init(void)
if (unlikely(!default_stop_found)) {
pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
} else {
- ppc_md.power_save = power9_idle;
+ ppc_md.power_save = arch300_idle;
pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
pnv_default_stop_val, pnv_default_stop_mask);
}
@@ -1158,7 +1258,7 @@ static void __init pnv_power9_idle_init(void)
}
pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n",
- pnv_first_spr_loss_level);
+ deep_spr_loss_state);
pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n",
pnv_first_tb_loss_level);
@@ -1206,7 +1306,7 @@ static void __init pnv_probe_idle_states(void)
}
if (cpu_has_feature(CPU_FTR_ARCH_300))
- pnv_power9_idle_init();
+ pnv_arch300_idle_init();
for (i = 0; i < nr_pnv_idle_states; i++)
supported_cpuidle_states |= pnv_idle_states[i].flags;
@@ -1218,7 +1318,7 @@ static void __init pnv_probe_idle_states(void)
* which is the number of cpuidle states discovered through device-tree.
*/
-static int pnv_parse_cpuidle_dt(void)
+static int __init pnv_parse_cpuidle_dt(void)
{
struct device_node *np;
int nr_idle_states, i;
@@ -1270,14 +1370,14 @@ static int pnv_parse_cpuidle_dt(void)
/* Read residencies */
if (of_property_read_u32_array(np, "ibm,cpu-idle-state-residency-ns",
temp_u32, nr_idle_states)) {
- pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-latencies-ns in DT\n");
+ pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n");
rc = -EINVAL;
goto out;
}
for (i = 0; i < nr_idle_states; i++)
pnv_idle_states[i].residency_ns = temp_u32[i];
- /* For power9 */
+ /* For power9 and later */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
/* Read pm_crtl_val */
if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr",
@@ -1313,7 +1413,7 @@ static int pnv_parse_cpuidle_dt(void)
goto out;
}
for (i = 0; i < nr_idle_states; i++)
- strlcpy(pnv_idle_states[i].name, temp_string[i],
+ strscpy(pnv_idle_states[i].name, temp_string[i],
PNV_IDLE_NAME_LEN);
nr_pnv_idle_states = nr_idle_states;
rc = 0;
@@ -1321,6 +1421,7 @@ out:
kfree(temp_u32);
kfree(temp_u64);
kfree(temp_string);
+ of_node_put(np);
return rc;
}
@@ -1340,8 +1441,8 @@ static int __init pnv_init_idle_states(void)
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
/* P7/P8 nap */
p->thread_idle_state = PNV_THREAD_RUNNING;
- } else {
- /* P9 stop */
+ } else if (pvr_version_is(PVR_POWER9)) {
+ /* P9 stop workarounds */
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
p->requested_psscr = 0;
atomic_set(&p->dont_stop, 0);
@@ -1365,14 +1466,19 @@ static int __init pnv_init_idle_states(void)
power7_fastsleep_workaround_entry = false;
power7_fastsleep_workaround_exit = false;
} else {
+ struct device *dev_root;
/*
* OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
* workaround is needed to use fastsleep. Provide sysfs
* control to choose how this workaround has to be
* applied.
*/
- device_create_file(cpu_subsys.dev_root,
- &dev_attr_fastsleep_workaround_applyonce);
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ device_create_file(dev_root,
+ &dev_attr_fastsleep_workaround_applyonce);
+ put_device(dev_root);
+ }
}
update_subcore_sibling_mask();
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index eb2e75dac369..877720c64515 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -18,7 +18,7 @@
#include <linux/memory_hotplug.h>
#include <linux/numa.h>
#include <asm/machdep.h>
-#include <asm/debugfs.h>
+#include <asm/cacheflush.h>
/* This enables us to keep track of the memory removed from each node. */
struct memtrace_entry {
@@ -30,6 +30,7 @@ struct memtrace_entry {
char name[16];
};
+static DEFINE_MUTEX(memtrace_mutex);
static u64 memtrace_size;
static struct memtrace_entry *memtrace_array;
@@ -44,90 +45,102 @@ static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
}
+static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct memtrace_entry *ent = filp->private_data;
+
+ if (ent->size < vma->vm_end - vma->vm_start)
+ return -EINVAL;
+
+ if (vma->vm_pgoff << PAGE_SHIFT >= ent->size)
+ return -EINVAL;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ return remap_pfn_range(vma, vma->vm_start, PHYS_PFN(ent->start) + vma->vm_pgoff,
+ vma->vm_end - vma->vm_start, vma->vm_page_prot);
+}
+
static const struct file_operations memtrace_fops = {
.llseek = default_llseek,
.read = memtrace_read,
.open = simple_open,
+ .mmap = memtrace_mmap,
};
-static int check_memblock_online(struct memory_block *mem, void *arg)
-{
- if (mem->state != MEM_ONLINE)
- return -1;
-
- return 0;
-}
-
-static int change_memblock_state(struct memory_block *mem, void *arg)
+#define FLUSH_CHUNK_SIZE SZ_1G
+/**
+ * flush_dcache_range_chunked(): Write any modified data cache blocks out to
+ * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE
+ * Does not invalidate the corresponding instruction cache blocks.
+ *
+ * @start: the start address
+ * @stop: the stop address (exclusive)
+ * @chunk: the max size of the chunks
+ */
+static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
+ unsigned long chunk)
{
- unsigned long state = (unsigned long)arg;
-
- mem->state = state;
+ unsigned long i;
- return 0;
+ for (i = start; i < stop; i += chunk) {
+ flush_dcache_range(i, min(stop, i + chunk));
+ cond_resched();
+ }
}
-/* called with device_hotplug_lock held */
-static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
+static void memtrace_clear_range(unsigned long start_pfn,
+ unsigned long nr_pages)
{
- const unsigned long start = PFN_PHYS(start_pfn);
- const unsigned long size = PFN_PHYS(nr_pages);
+ unsigned long pfn;
- if (walk_memory_blocks(start, size, NULL, check_memblock_online))
- return false;
-
- walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
- change_memblock_state);
-
- if (offline_pages(start_pfn, nr_pages)) {
- walk_memory_blocks(start, size, (void *)MEM_ONLINE,
- change_memblock_state);
- return false;
+ /* As HIGHMEM does not apply, use clear_page() directly. */
+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) {
+ if (IS_ALIGNED(pfn, PAGES_PER_SECTION))
+ cond_resched();
+ clear_page(__va(PFN_PHYS(pfn)));
}
-
- walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
- change_memblock_state);
-
-
- return true;
+ /*
+ * Before we go ahead and use this range as cache inhibited range
+ * flush the cache.
+ */
+ flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
+ (unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
+ FLUSH_CHUNK_SIZE);
}
static u64 memtrace_alloc_node(u32 nid, u64 size)
{
- u64 start_pfn, end_pfn, nr_pages, pfn;
- u64 base_pfn;
- u64 bytes = memory_block_size_bytes();
+ const unsigned long nr_pages = PHYS_PFN(size);
+ unsigned long pfn, start_pfn;
+ struct page *page;
- if (!node_spanned_pages(nid))
+ /*
+ * Trace memory needs to be aligned to the size, which is guaranteed
+ * by alloc_contig_pages().
+ */
+ page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
+ __GFP_NOWARN, nid, NULL);
+ if (!page)
return 0;
+ start_pfn = page_to_pfn(page);
- start_pfn = node_start_pfn(nid);
- end_pfn = node_end_pfn(nid);
- nr_pages = size >> PAGE_SHIFT;
-
- /* Trace memory needs to be aligned to the size */
- end_pfn = round_down(end_pfn - nr_pages, nr_pages);
-
- lock_device_hotplug();
- for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
- if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
- /*
- * Remove memory in memory block size chunks so that
- * iomem resources are always split to the same size and
- * we never try to remove memory that spans two iomem
- * resources.
- */
- end_pfn = base_pfn + nr_pages;
- for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) {
- __remove_memory(nid, pfn << PAGE_SHIFT, bytes);
- }
- unlock_device_hotplug();
- return base_pfn << PAGE_SHIFT;
- }
- }
- unlock_device_hotplug();
+ /*
+ * Clear the range while we still have a linear mapping.
+ *
+ * TODO: use __GFP_ZERO with alloc_contig_pages() once supported.
+ */
+ memtrace_clear_range(start_pfn, nr_pages);
- return 0;
+ /*
+ * Set pages PageOffline(), to indicate that nobody (e.g., hibernation,
+ * dumping, ...) should be touching these pages.
+ */
+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
+ __SetPageOffline(pfn_to_page(pfn));
+
+ arch_remove_linear_mapping(PFN_PHYS(start_pfn), size);
+
+ return PFN_PHYS(start_pfn);
}
static int memtrace_init_regions_runtime(u64 size)
@@ -187,14 +200,9 @@ static int memtrace_init_debugfs(void)
snprintf(ent->name, 16, "%08x", ent->nid);
dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);
- if (!dir) {
- pr_err("Failed to create debugfs directory for node %d\n",
- ent->nid);
- return -1;
- }
ent->dir = dir;
- debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops);
+ debugfs_create_file_unsafe("trace", 0600, dir, ent, &memtrace_fops);
debugfs_create_x64("start", 0400, dir, &ent->start);
debugfs_create_x64("size", 0400, dir, &ent->size);
}
@@ -202,16 +210,30 @@ static int memtrace_init_debugfs(void)
return ret;
}
-static int online_mem_block(struct memory_block *mem, void *arg)
+static int memtrace_free(int nid, u64 start, u64 size)
{
- return device_online(&mem->dev);
+ struct mhp_params params = { .pgprot = PAGE_KERNEL };
+ const unsigned long nr_pages = PHYS_PFN(size);
+ const unsigned long start_pfn = PHYS_PFN(start);
+ unsigned long pfn;
+ int ret;
+
+ ret = arch_create_linear_mapping(nid, start, size, &params);
+ if (ret)
+ return ret;
+
+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
+ __ClearPageOffline(pfn_to_page(pfn));
+
+ free_contig_range(start_pfn, nr_pages);
+ return 0;
}
/*
- * Iterate through the chunks of memory we have removed from the kernel
- * and attempt to add them back to the kernel.
+ * Iterate through the chunks of memory we allocated and attempt to expose
+ * them back to the kernel.
*/
-static int memtrace_online(void)
+static int memtrace_free_regions(void)
{
int i, ret = 0;
struct memtrace_entry *ent;
@@ -219,7 +241,7 @@ static int memtrace_online(void)
for (i = memtrace_array_nr - 1; i >= 0; i--) {
ent = &memtrace_array[i];
- /* We have onlined this chunk previously */
+ /* We have freed this chunk previously */
if (ent->nid == NUMA_NO_NODE)
continue;
@@ -229,36 +251,25 @@ static int memtrace_online(void)
ent->mem = 0;
}
- if (add_memory(ent->nid, ent->start, ent->size)) {
- pr_err("Failed to add trace memory to node %d\n",
+ if (memtrace_free(ent->nid, ent->start, ent->size)) {
+ pr_err("Failed to free trace memory on node %d\n",
ent->nid);
ret += 1;
continue;
}
/*
- * If kernel isn't compiled with the auto online option
- * we need to online the memory ourselves.
- */
- if (!memhp_auto_online) {
- lock_device_hotplug();
- walk_memory_blocks(ent->start, ent->size, NULL,
- online_mem_block);
- unlock_device_hotplug();
- }
-
- /*
- * Memory was added successfully so clean up references to it
- * so on reentry we can tell that this chunk was added.
+ * Memory was freed successfully so clean up references to it
+ * so on reentry we can tell that this chunk was freed.
*/
debugfs_remove_recursive(ent->dir);
- pr_info("Added trace memory back to node %d\n", ent->nid);
+ pr_info("Freed trace memory back on node %d\n", ent->nid);
ent->size = ent->start = ent->nid = NUMA_NO_NODE;
}
if (ret)
return ret;
- /* If all chunks of memory were added successfully, reset globals */
+ /* If all chunks of memory were freed successfully, reset globals */
kfree(memtrace_array);
memtrace_array = NULL;
memtrace_size = 0;
@@ -268,6 +279,7 @@ static int memtrace_online(void)
static int memtrace_enable_set(void *data, u64 val)
{
+ int rc = -EAGAIN;
u64 bytes;
/*
@@ -280,25 +292,29 @@ static int memtrace_enable_set(void *data, u64 val)
return -EINVAL;
}
- /* Re-add/online previously removed/offlined memory */
- if (memtrace_size) {
- if (memtrace_online())
- return -EAGAIN;
- }
+ mutex_lock(&memtrace_mutex);
- if (!val)
- return 0;
+ /* Free all previously allocated memory. */
+ if (memtrace_size && memtrace_free_regions())
+ goto out_unlock;
+
+ if (!val) {
+ rc = 0;
+ goto out_unlock;
+ }
- /* Offline and remove memory */
+ /* Allocate memory. */
if (memtrace_init_regions_runtime(val))
- return -EINVAL;
+ goto out_unlock;
if (memtrace_init_debugfs())
- return -EINVAL;
+ goto out_unlock;
memtrace_size = val;
-
- return 0;
+ rc = 0;
+out_unlock:
+ mutex_unlock(&memtrace_mutex);
+ return rc;
}
static int memtrace_enable_get(void *data, u64 *val)
@@ -313,9 +329,7 @@ DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get,
static int memtrace_init(void)
{
memtrace_debugfs_dir = debugfs_create_dir("memtrace",
- powerpc_debugfs_root);
- if (!memtrace_debugfs_dir)
- return -1;
+ arch_debugfs_dir);
debugfs_create_file("enable", 0600, memtrace_debugfs_dir,
NULL, &memtrace_init_fops);
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
deleted file mode 100644
index b95b9e3c4c98..000000000000
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ /dev/null
@@ -1,630 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * This file implements the DMA operations for NVLink devices. The NPU
- * devices all point to the same iommu table as the parent PCI device.
- *
- * Copyright Alistair Popple, IBM Corporation 2015.
- */
-
-#include <linux/mmu_notifier.h>
-#include <linux/mmu_context.h>
-#include <linux/of.h>
-#include <linux/pci.h>
-#include <linux/memblock.h>
-#include <linux/sizes.h>
-
-#include <asm/debugfs.h>
-#include <asm/powernv.h>
-#include <asm/opal.h>
-
-#include "pci.h"
-
-static struct pci_dev *get_pci_dev(struct device_node *dn)
-{
- struct pci_dn *pdn = PCI_DN(dn);
- struct pci_dev *pdev;
-
- pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
- pdn->busno, pdn->devfn);
-
- /*
- * pci_get_domain_bus_and_slot() increased the reference count of
- * the PCI device, but callers don't need that actually as the PE
- * already holds a reference to the device. Since callers aren't
- * aware of the reference count change, call pci_dev_put() now to
- * avoid leaks.
- */
- if (pdev)
- pci_dev_put(pdev);
-
- return pdev;
-}
-
-/* Given a NPU device get the associated PCI device. */
-struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
-{
- struct device_node *dn;
- struct pci_dev *gpdev;
-
- if (WARN_ON(!npdev))
- return NULL;
-
- if (WARN_ON(!npdev->dev.of_node))
- return NULL;
-
- /* Get assoicated PCI device */
- dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
- if (!dn)
- return NULL;
-
- gpdev = get_pci_dev(dn);
- of_node_put(dn);
-
- return gpdev;
-}
-EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
-
-/* Given the real PCI device get a linked NPU device. */
-struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
-{
- struct device_node *dn;
- struct pci_dev *npdev;
-
- if (WARN_ON(!gpdev))
- return NULL;
-
- /* Not all PCI devices have device-tree nodes */
- if (!gpdev->dev.of_node)
- return NULL;
-
- /* Get assoicated PCI device */
- dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
- if (!dn)
- return NULL;
-
- npdev = get_pci_dev(dn);
- of_node_put(dn);
-
- return npdev;
-}
-EXPORT_SYMBOL(pnv_pci_get_npu_dev);
-
-#ifdef CONFIG_IOMMU_API
-/*
- * Returns the PE assoicated with the PCI device of the given
- * NPU. Returns the linked pci device if pci_dev != NULL.
- */
-static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
- struct pci_dev **gpdev)
-{
- struct pnv_phb *phb;
- struct pci_controller *hose;
- struct pci_dev *pdev;
- struct pnv_ioda_pe *pe;
- struct pci_dn *pdn;
-
- pdev = pnv_pci_get_gpu_dev(npe->pdev);
- if (!pdev)
- return NULL;
-
- pdn = pci_get_pdn(pdev);
- if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return NULL;
-
- hose = pci_bus_to_host(pdev->bus);
- phb = hose->private_data;
- pe = &phb->ioda.pe_array[pdn->pe_number];
-
- if (gpdev)
- *gpdev = pdev;
-
- return pe;
-}
-
-static long pnv_npu_unset_window(struct iommu_table_group *table_group,
- int num);
-
-static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
- struct iommu_table *tbl)
-{
- struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
- table_group);
- struct pnv_phb *phb = npe->phb;
- int64_t rc;
- const unsigned long size = tbl->it_indirect_levels ?
- tbl->it_level_size : tbl->it_size;
- const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
- const __u64 win_size = tbl->it_size << tbl->it_page_shift;
- int num2 = (num == 0) ? 1 : 0;
-
- /* NPU has just one TVE so if there is another table, remove it first */
- if (npe->table_group.tables[num2])
- pnv_npu_unset_window(&npe->table_group, num2);
-
- pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
- start_addr, start_addr + win_size - 1,
- IOMMU_PAGE_SIZE(tbl));
-
- rc = opal_pci_map_pe_dma_window(phb->opal_id,
- npe->pe_number,
- npe->pe_number,
- tbl->it_indirect_levels + 1,
- __pa(tbl->it_base),
- size << 3,
- IOMMU_PAGE_SIZE(tbl));
- if (rc) {
- pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
- return rc;
- }
- pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
- /* Add the table to the list so its TCE cache will get invalidated */
- pnv_pci_link_table_and_group(phb->hose->node, num,
- tbl, &npe->table_group);
-
- return 0;
-}
-
-static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
-{
- struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
- table_group);
- struct pnv_phb *phb = npe->phb;
- int64_t rc;
-
- if (!npe->table_group.tables[num])
- return 0;
-
- pe_info(npe, "Removing DMA window\n");
-
- rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
- npe->pe_number,
- 0/* levels */, 0/* table address */,
- 0/* table size */, 0/* page size */);
- if (rc) {
- pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
- return rc;
- }
- pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
- pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
- &npe->table_group);
-
- return 0;
-}
-
-/* Switch ownership from platform code to external user (e.g. VFIO) */
-static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
-{
- struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
- table_group);
- struct pnv_phb *phb = npe->phb;
- int64_t rc;
- struct pci_dev *gpdev = NULL;
-
- /*
- * Note: NPU has just a single TVE in the hardware which means that
- * while used by the kernel, it can have either 32bit window or
- * DMA bypass but never both. So we deconfigure 32bit window only
- * if it was enabled at the moment of ownership change.
- */
- if (npe->table_group.tables[0]) {
- pnv_npu_unset_window(&npe->table_group, 0);
- return;
- }
-
- /* Disable bypass */
- rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
- npe->pe_number, npe->pe_number,
- 0 /* bypass base */, 0);
- if (rc) {
- pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
- return;
- }
- pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
-
- get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (gpdev)
- pnv_npu2_unmap_lpar_dev(gpdev);
-}
-
-static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
-{
- struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
- table_group);
- struct pci_dev *gpdev = NULL;
-
- get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (gpdev)
- pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
-}
-
-static struct iommu_table_group_ops pnv_pci_npu_ops = {
- .set_window = pnv_npu_set_window,
- .unset_window = pnv_npu_unset_window,
- .take_ownership = pnv_npu_take_ownership,
- .release_ownership = pnv_npu_release_ownership,
-};
-#endif /* !CONFIG_IOMMU_API */
-
-/*
- * NPU2 ATS
- */
-/* Maximum possible number of ATSD MMIO registers per NPU */
-#define NV_NMMU_ATSD_REGS 8
-#define NV_NPU_MAX_PE_NUM 16
-
-/*
- * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
- * up to 3 x (GPU + 2xNPUs) (POWER9).
- */
-struct npu_comp {
- struct iommu_table_group table_group;
- int pe_num;
- struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
-};
-
-/* An NPU descriptor, valid for POWER9 only */
-struct npu {
- int index;
- struct npu_comp npucomp;
-};
-
-#ifdef CONFIG_IOMMU_API
-static long pnv_npu_peers_create_table_userspace(
- struct iommu_table_group *table_group,
- int num, __u32 page_shift, __u64 window_size, __u32 levels,
- struct iommu_table **ptbl)
-{
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- if (!npucomp->pe_num || !npucomp->pe[0] ||
- !npucomp->pe[0]->table_group.ops ||
- !npucomp->pe[0]->table_group.ops->create_table)
- return -EFAULT;
-
- return npucomp->pe[0]->table_group.ops->create_table(
- &npucomp->pe[0]->table_group, num, page_shift,
- window_size, levels, ptbl);
-}
-
-static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
- int num, struct iommu_table *tbl)
-{
- int i, j;
- long ret = 0;
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- for (i = 0; i < npucomp->pe_num; ++i) {
- struct pnv_ioda_pe *pe = npucomp->pe[i];
-
- if (!pe->table_group.ops->set_window)
- continue;
-
- ret = pe->table_group.ops->set_window(&pe->table_group,
- num, tbl);
- if (ret)
- break;
- }
-
- if (ret) {
- for (j = 0; j < i; ++j) {
- struct pnv_ioda_pe *pe = npucomp->pe[j];
-
- if (!pe->table_group.ops->unset_window)
- continue;
-
- ret = pe->table_group.ops->unset_window(
- &pe->table_group, num);
- if (ret)
- break;
- }
- } else {
- table_group->tables[num] = iommu_tce_table_get(tbl);
- }
-
- return ret;
-}
-
-static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
- int num)
-{
- int i, j;
- long ret = 0;
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- for (i = 0; i < npucomp->pe_num; ++i) {
- struct pnv_ioda_pe *pe = npucomp->pe[i];
-
- WARN_ON(npucomp->table_group.tables[num] !=
- table_group->tables[num]);
- if (!npucomp->table_group.tables[num])
- continue;
-
- if (!pe->table_group.ops->unset_window)
- continue;
-
- ret = pe->table_group.ops->unset_window(&pe->table_group, num);
- if (ret)
- break;
- }
-
- if (ret) {
- for (j = 0; j < i; ++j) {
- struct pnv_ioda_pe *pe = npucomp->pe[j];
-
- if (!npucomp->table_group.tables[num])
- continue;
-
- if (!pe->table_group.ops->set_window)
- continue;
-
- ret = pe->table_group.ops->set_window(&pe->table_group,
- num, table_group->tables[num]);
- if (ret)
- break;
- }
- } else if (table_group->tables[num]) {
- iommu_tce_table_put(table_group->tables[num]);
- table_group->tables[num] = NULL;
- }
-
- return ret;
-}
-
-static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
-{
- int i;
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- for (i = 0; i < npucomp->pe_num; ++i) {
- struct pnv_ioda_pe *pe = npucomp->pe[i];
-
- if (!pe->table_group.ops->take_ownership)
- continue;
- pe->table_group.ops->take_ownership(&pe->table_group);
- }
-}
-
-static void pnv_npu_peers_release_ownership(
- struct iommu_table_group *table_group)
-{
- int i;
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- for (i = 0; i < npucomp->pe_num; ++i) {
- struct pnv_ioda_pe *pe = npucomp->pe[i];
-
- if (!pe->table_group.ops->release_ownership)
- continue;
- pe->table_group.ops->release_ownership(&pe->table_group);
- }
-}
-
-static struct iommu_table_group_ops pnv_npu_peers_ops = {
- .get_table_size = pnv_pci_ioda2_get_table_size,
- .create_table = pnv_npu_peers_create_table_userspace,
- .set_window = pnv_npu_peers_set_window,
- .unset_window = pnv_npu_peers_unset_window,
- .take_ownership = pnv_npu_peers_take_ownership,
- .release_ownership = pnv_npu_peers_release_ownership,
-};
-
-static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
- struct pnv_ioda_pe *pe)
-{
- if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
- return;
-
- npucomp->pe[npucomp->pe_num] = pe;
- ++npucomp->pe_num;
-}
-
-struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
-{
- struct iommu_table_group *table_group;
- struct npu_comp *npucomp;
- struct pci_dev *gpdev = NULL;
- struct pci_controller *hose;
- struct pci_dev *npdev = NULL;
-
- list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
- npdev = pnv_pci_get_npu_dev(gpdev, 0);
- if (npdev)
- break;
- }
-
- if (!npdev)
- /* It is not an NPU attached device, skip */
- return NULL;
-
- hose = pci_bus_to_host(npdev->bus);
-
- if (hose->npu) {
- table_group = &hose->npu->npucomp.table_group;
-
- if (!table_group->group) {
- table_group->ops = &pnv_npu_peers_ops;
- iommu_register_group(table_group,
- hose->global_number,
- pe->pe_number);
- }
- } else {
- /* Create a group for 1 GPU and attached NPUs for POWER8 */
- pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL);
- table_group = &pe->npucomp->table_group;
- table_group->ops = &pnv_npu_peers_ops;
- iommu_register_group(table_group, hose->global_number,
- pe->pe_number);
- }
-
- /* Steal capabilities from a GPU PE */
- table_group->max_dynamic_windows_supported =
- pe->table_group.max_dynamic_windows_supported;
- table_group->tce32_start = pe->table_group.tce32_start;
- table_group->tce32_size = pe->table_group.tce32_size;
- table_group->max_levels = pe->table_group.max_levels;
- if (!table_group->pgsizes)
- table_group->pgsizes = pe->table_group.pgsizes;
-
- npucomp = container_of(table_group, struct npu_comp, table_group);
- pnv_comp_attach_table_group(npucomp, pe);
-
- return table_group;
-}
-
-struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
-{
- struct iommu_table_group *table_group;
- struct npu_comp *npucomp;
- struct pci_dev *gpdev = NULL;
- struct pci_dev *npdev;
- struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
-
- WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
- if (!gpe)
- return NULL;
-
- /*
- * IODA2 bridges get this set up from pci_controller_ops::setup_bridge
- * but NPU bridges do not have this hook defined so we do it here.
- * We do not setup other table group parameters as they won't be used
- * anyway - NVLink bridges are subordinate PEs.
- */
- pe->table_group.ops = &pnv_pci_npu_ops;
-
- table_group = iommu_group_get_iommudata(
- iommu_group_get(&gpdev->dev));
-
- /*
- * On P9 NPU PHB and PCI PHB support different page sizes,
- * keep only matching. We expect here that NVLink bridge PE pgsizes is
- * initialized by the caller.
- */
- table_group->pgsizes &= pe->table_group.pgsizes;
- npucomp = container_of(table_group, struct npu_comp, table_group);
- pnv_comp_attach_table_group(npucomp, pe);
-
- list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
- struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
-
- if (gpdevtmp != gpdev)
- continue;
-
- iommu_add_device(table_group, &npdev->dev);
- }
-
- return table_group;
-}
-#endif /* CONFIG_IOMMU_API */
-
-int pnv_npu2_init(struct pci_controller *hose)
-{
- static int npu_index;
- struct npu *npu;
- int ret;
-
- npu = kzalloc(sizeof(*npu), GFP_KERNEL);
- if (!npu)
- return -ENOMEM;
-
- npu_index++;
- if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
- ret = -ENOSPC;
- goto fail_exit;
- }
- npu->index = npu_index;
- hose->npu = npu;
-
- return 0;
-
-fail_exit:
- kfree(npu);
- return ret;
-}
-
-int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
- unsigned long msr)
-{
- int ret;
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
- struct pci_controller *hose;
- struct pnv_phb *nphb;
-
- if (!npdev)
- return -ENODEV;
-
- hose = pci_bus_to_host(npdev->bus);
- nphb = hose->private_data;
-
- dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
- nphb->opal_id, lparid);
- /*
- * Currently we only support radix and non-zero LPCR only makes sense
- * for hash tables so skiboot expects the LPCR parameter to be a zero.
- */
- ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), lparid,
- 0 /* LPCR bits */);
- if (ret) {
- dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
- return ret;
- }
-
- dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
- nphb->opal_id, msr);
- ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr,
- pci_dev_id(gpdev));
- if (ret < 0)
- dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
- else
- ret = 0;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev);
-
-void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr)
-{
- struct pci_dev *gpdev;
-
- list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
- pnv_npu2_map_lpar_dev(gpdev, 0, msr);
-}
-
-int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
-{
- int ret;
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
- struct pci_controller *hose;
- struct pnv_phb *nphb;
-
- if (!npdev)
- return -ENODEV;
-
- hose = pci_bus_to_host(npdev->bus);
- nphb = hose->private_data;
-
- dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
- nphb->opal_id);
- ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
- pci_dev_id(gpdev));
- if (ret < 0) {
- dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
- return ret;
- }
-
- /* Set LPID to 0 anyway, just to be safe */
- dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
- ret = opal_npu_map_lpar(nphb->opal_id, pci_dev_id(gpdev), 0 /*LPID*/,
- 0 /* LPCR bits */);
- if (ret)
- dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
index 8c65aacda9c8..64a9c7125c29 100644
--- a/arch/powerpc/platforms/powernv/ocxl.c
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -2,7 +2,6 @@
// Copyright 2017 IBM Corp.
#include <asm/pnv-ocxl.h>
#include <asm/opal.h>
-#include <asm/xive.h>
#include <misc/ocxl-config.h>
#include "pci.h"
@@ -108,7 +107,8 @@ static int get_max_afu_index(struct pci_dev *dev, int *afu_idx)
int pos;
u32 val;
- pos = find_dvsec_from_pos(dev, OCXL_DVSEC_FUNC_ID, 0);
+ pos = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_IBM,
+ OCXL_DVSEC_FUNC_ID);
if (!pos)
return -ESRCH;
@@ -289,7 +289,7 @@ int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count)
* be used by a function depends on how many functions exist
* on the device. The NPU needs to be configured to know how
* many bits are available to PASIDs and how many are to be
- * used by the function BDF indentifier.
+ * used by the function BDF identifier.
*
* We only support one AFU-carrying function for now.
*/
@@ -449,7 +449,7 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask,
if (!data)
return -ENOMEM;
- bdfn = (dev->bus->number << 8) | dev->devfn;
+ bdfn = pci_dev_id(dev);
rc = opal_npu_spa_setup(phb->opal_id, bdfn, virt_to_phys(spa_mem),
PE_mask);
if (rc) {
@@ -478,38 +478,121 @@ EXPORT_SYMBOL_GPL(pnv_ocxl_spa_release);
int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle)
{
struct spa_data *data = (struct spa_data *) platform_data;
- int rc;
- rc = opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle);
- return rc;
+ return opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle);
}
EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache);
-int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr)
+int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid,
+ uint64_t lpcr, void __iomem **arva)
{
- __be64 flags, trigger_page;
- s64 rc;
- u32 hwirq;
-
- hwirq = xive_native_alloc_irq();
- if (!hwirq)
- return -ENOENT;
-
- rc = opal_xive_get_irq_info(hwirq, &flags, NULL, &trigger_page, NULL,
- NULL);
- if (rc || !trigger_page) {
- xive_native_free_irq(hwirq);
- return -ENOENT;
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ u64 mmio_atsd;
+ int rc;
+
+ /* ATSD physical address.
+ * ATSD LAUNCH register: write access initiates a shoot down to
+ * initiate the TLB Invalidate command.
+ */
+ rc = of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
+ 0, &mmio_atsd);
+ if (rc) {
+ dev_info(&dev->dev, "No available ATSD found\n");
+ return rc;
+ }
+
+ /* Assign a register set to a Logical Partition and MMIO ATSD
+ * LPARID register to the required value.
+ */
+ rc = opal_npu_map_lpar(phb->opal_id, pci_dev_id(dev),
+ lparid, lpcr);
+ if (rc) {
+ dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc);
+ return rc;
+ }
+
+ *arva = ioremap(mmio_atsd, 24);
+ if (!(*arva)) {
+ dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", mmio_atsd);
+ rc = -ENOMEM;
}
- *irq = hwirq;
- *trigger_addr = be64_to_cpu(trigger_page);
- return 0;
+ return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar);
+
+void pnv_ocxl_unmap_lpar(void __iomem *arva)
+{
+ iounmap(arva);
}
-EXPORT_SYMBOL_GPL(pnv_ocxl_alloc_xive_irq);
+EXPORT_SYMBOL_GPL(pnv_ocxl_unmap_lpar);
-void pnv_ocxl_free_xive_irq(u32 irq)
+void pnv_ocxl_tlb_invalidate(void __iomem *arva,
+ unsigned long pid,
+ unsigned long addr,
+ unsigned long page_size)
{
- xive_native_free_irq(irq);
+ unsigned long timeout = jiffies + (HZ * PNV_OCXL_ATSD_TIMEOUT);
+ u64 val = 0ull;
+ int pend;
+ u8 size;
+
+ if (!(arva))
+ return;
+
+ if (addr) {
+ /* load Abbreviated Virtual Address register with
+ * the necessary value
+ */
+ val |= FIELD_PREP(PNV_OCXL_ATSD_AVA_AVA, addr >> (63-51));
+ out_be64(arva + PNV_OCXL_ATSD_AVA, val);
+ }
+
+ /* Write access initiates a shoot down to initiate the
+ * TLB Invalidate command
+ */
+ val = PNV_OCXL_ATSD_LNCH_R;
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_RIC, 0b10);
+ if (addr)
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_IS, 0b00);
+ else {
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_IS, 0b01);
+ val |= PNV_OCXL_ATSD_LNCH_OCAPI_SINGLETON;
+ }
+ val |= PNV_OCXL_ATSD_LNCH_PRS;
+ /* Actual Page Size to be invalidated
+ * 000 4KB
+ * 101 64KB
+ * 001 2MB
+ * 010 1GB
+ */
+ size = 0b101;
+ if (page_size == 0x1000)
+ size = 0b000;
+ if (page_size == 0x200000)
+ size = 0b001;
+ if (page_size == 0x40000000)
+ size = 0b010;
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_AP, size);
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_PID, pid);
+ out_be64(arva + PNV_OCXL_ATSD_LNCH, val);
+
+ /* Poll the ATSD status register to determine when the
+ * TLB Invalidate has been completed.
+ */
+ val = in_be64(arva + PNV_OCXL_ATSD_STAT);
+ pend = val >> 63;
+
+ while (pend) {
+ if (time_after_eq(jiffies, timeout)) {
+ pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n",
+ __func__, val, pid);
+ return;
+ }
+ cpu_relax();
+ val = in_be64(arva + PNV_OCXL_ATSD_STAT);
+ pend = val >> 63;
+ }
}
-EXPORT_SYMBOL_GPL(pnv_ocxl_free_xive_irq);
+EXPORT_SYMBOL_GPL(pnv_ocxl_tlb_invalidate);
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index 1656e8965d6b..c094fdf5825c 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -104,7 +104,7 @@ static int __opal_async_release_token(int token)
*/
case ASYNC_TOKEN_DISPATCHED:
opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED;
- /* Fall through */
+ fallthrough;
default:
rc = 1;
}
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
index 5cd0f52d258f..021b0ec29e24 100644
--- a/arch/powerpc/platforms/powernv/opal-call.c
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/percpu.h>
#include <linux/jump_label.h>
+#include <asm/interrupt.h>
#include <asm/opal-api.h>
#include <asm/trace.h>
#include <asm/asm-prototypes.h>
@@ -100,6 +101,9 @@ static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
bool mmu = (msr & (MSR_IR|MSR_DR));
int64_t ret;
+ /* OPAL call / firmware may use SRR and/or HSRR */
+ srr_regs_clobbered();
+
msr &= ~MSR_EE;
if (unlikely(!mmu))
@@ -163,8 +167,6 @@ OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
-OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE);
-OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
@@ -267,8 +269,6 @@ OPAL_CALL(opal_xive_get_queue_state, OPAL_XIVE_GET_QUEUE_STATE);
OPAL_CALL(opal_xive_set_queue_state, OPAL_XIVE_SET_QUEUE_STATE);
OPAL_CALL(opal_xive_get_vp_state, OPAL_XIVE_GET_VP_STATE);
OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET);
-OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
-OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT);
OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START);
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
index ed895d82c048..c9a9b759cc92 100644
--- a/arch/powerpc/platforms/powernv/opal-core.c
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -16,7 +16,7 @@
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
#include <linux/of.h>
#include <asm/page.h>
@@ -71,6 +71,7 @@ static LIST_HEAD(opalcore_list);
static struct opalcore_config *oc_conf;
static const struct opal_mpipl_fadump *opalc_metadata;
static const struct opal_mpipl_fadump *opalc_cpu_metadata;
+static struct kobject *mpipl_kobj;
/*
* Set crashing CPU's signal to SIGUSR1. if the kernel is triggered
@@ -88,7 +89,7 @@ static inline int is_opalcore_usable(void)
return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0;
}
-static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name,
+static Elf64_Word *__init append_elf64_note(Elf64_Word *buf, char *name,
u32 type, void *data,
size_t data_len)
{
@@ -107,19 +108,19 @@ static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name,
return buf;
}
-static void fill_prstatus(struct elf_prstatus *prstatus, int pir,
+static void __init fill_prstatus(struct elf_prstatus *prstatus, int pir,
struct pt_regs *regs)
{
memset(prstatus, 0, sizeof(struct elf_prstatus));
- elf_core_copy_kernel_regs(&(prstatus->pr_reg), regs);
+ elf_core_copy_regs(&(prstatus->pr_reg), regs);
/*
* Overload PID with PIR value.
* As a PIR value could also be '0', add an offset of '100'
* to every PIR to avoid misinterpretations in GDB.
*/
- prstatus->pr_pid = cpu_to_be32(100 + pir);
- prstatus->pr_ppid = cpu_to_be32(1);
+ prstatus->common.pr_pid = cpu_to_be32(100 + pir);
+ prstatus->common.pr_ppid = cpu_to_be32(1);
/*
* Indicate SIGUSR1 for crash initiated from kernel.
@@ -129,11 +130,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus, int pir,
short sig;
sig = kernel_initiated ? SIGUSR1 : SIGTERM;
- prstatus->pr_cursig = cpu_to_be16(sig);
+ prstatus->common.pr_cursig = cpu_to_be16(sig);
}
}
-static Elf64_Word *auxv_to_elf64_notes(Elf64_Word *buf,
+static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word *buf,
u64 opal_boot_entry)
{
Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf;
@@ -347,6 +348,8 @@ static int __init create_opalcore(void)
if (!dn || ret)
pr_warn("WARNING: Failed to read OPAL base & entry values\n");
+ of_node_put(dn);
+
/* Use count to keep track of the program headers */
count = 0;
@@ -428,7 +431,7 @@ static void opalcore_cleanup(void)
return;
/* Remove OPAL core sysfs file */
- sysfs_remove_bin_file(opal_kobj, &opal_core_attr);
+ sysfs_remove_bin_file(mpipl_kobj, &opal_core_attr);
oc_conf->ptload_phdr = NULL;
oc_conf->ptload_cnt = 0;
@@ -509,7 +512,7 @@ static void __init opalcore_config_init(void)
idx = be32_to_cpu(opalc_metadata->region_cnt);
if (idx > MAX_PT_LOAD_CNT) {
pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)",
- MAX_PT_LOAD_CNT, idx);
+ idx, MAX_PT_LOAD_CNT);
idx = MAX_PT_LOAD_CNT;
}
for (i = 0; i < idx; i++) {
@@ -563,9 +566,9 @@ error_out:
of_node_put(np);
}
-static ssize_t fadump_release_opalcore_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t release_core_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int input = -1;
@@ -589,9 +592,23 @@ static ssize_t fadump_release_opalcore_store(struct kobject *kobj,
return count;
}
-static struct kobj_attribute opalcore_rel_attr = __ATTR(fadump_release_opalcore,
- 0200, NULL,
- fadump_release_opalcore_store);
+static struct kobj_attribute opalcore_rel_attr = __ATTR_WO(release_core);
+
+static struct attribute *mpipl_attr[] = {
+ &opalcore_rel_attr.attr,
+ NULL,
+};
+
+static struct bin_attribute *mpipl_bin_attr[] = {
+ &opal_core_attr,
+ NULL,
+
+};
+
+static const struct attribute_group mpipl_group = {
+ .attrs = mpipl_attr,
+ .bin_attrs = mpipl_bin_attr,
+};
static int __init opalcore_init(void)
{
@@ -609,7 +626,7 @@ static int __init opalcore_init(void)
* then capture the dump.
*/
if (!(is_opalcore_usable())) {
- pr_err("Failed to export /sys/firmware/opal/core\n");
+ pr_err("Failed to export /sys/firmware/opal/mpipl/core\n");
opalcore_cleanup();
return rc;
}
@@ -617,18 +634,28 @@ static int __init opalcore_init(void)
/* Set OPAL core file size */
opal_core_attr.size = oc_conf->opalcore_size;
+ mpipl_kobj = kobject_create_and_add("mpipl", opal_kobj);
+ if (!mpipl_kobj) {
+ pr_err("unable to create mpipl kobject\n");
+ return -ENOMEM;
+ }
+
/* Export OPAL core sysfs file */
- rc = sysfs_create_bin_file(opal_kobj, &opal_core_attr);
- if (rc != 0) {
- pr_err("Failed to export /sys/firmware/opal/core\n");
+ rc = sysfs_create_group(mpipl_kobj, &mpipl_group);
+ if (rc) {
+ pr_err("mpipl sysfs group creation failed (%d)", rc);
opalcore_cleanup();
return rc;
}
-
- rc = sysfs_create_file(kernel_kobj, &opalcore_rel_attr.attr);
+ /* The /sys/firmware/opal/core is moved to /sys/firmware/opal/mpipl/
+ * directory, need to create symlink at old location to maintain
+ * backward compatibility.
+ */
+ rc = compat_only_sysfs_link_entry_to_kobj(opal_kobj, mpipl_kobj,
+ "core", NULL);
if (rc) {
- pr_warn("unable to create sysfs file fadump_release_opalcore (%d)\n",
- rc);
+ pr_err("unable to create core symlink (%d)\n", rc);
+ return rc;
}
return 0;
diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
index 543c816fa99e..16c5860f1372 100644
--- a/arch/powerpc/platforms/powernv/opal-dump.c
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -88,9 +88,14 @@ static ssize_t dump_ack_store(struct dump_obj *dump_obj,
const char *buf,
size_t count)
{
- dump_send_ack(dump_obj->id);
- sysfs_remove_file_self(&dump_obj->kobj, &attr->attr);
- kobject_put(&dump_obj->kobj);
+ /*
+ * Try to self remove this attribute. If we are successful,
+ * delete the kobject itself.
+ */
+ if (sysfs_remove_file_self(&dump_obj->kobj, &attr->attr)) {
+ dump_send_ack(dump_obj->id);
+ kobject_put(&dump_obj->kobj);
+ }
return count;
}
@@ -145,7 +150,7 @@ static struct attribute *initiate_attrs[] = {
NULL,
};
-static struct attribute_group initiate_attr_group = {
+static const struct attribute_group initiate_attr_group = {
.attrs = initiate_attrs,
};
@@ -203,11 +208,12 @@ static struct attribute *dump_default_attrs[] = {
&ack_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(dump_default);
static struct kobj_type dump_ktype = {
.sysfs_ops = &dump_sysfs_ops,
.release = &dump_release,
- .default_attrs = dump_default_attrs,
+ .default_groups = dump_default_groups,
};
static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type)
@@ -318,15 +324,14 @@ static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
return count;
}
-static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
- uint32_t type)
+static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
{
struct dump_obj *dump;
int rc;
dump = kzalloc(sizeof(*dump), GFP_KERNEL);
if (!dump)
- return NULL;
+ return;
dump->kobj.kset = dump_kset;
@@ -346,21 +351,39 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
if (rc) {
kobject_put(&dump->kobj);
- return NULL;
+ return;
}
+ /*
+ * As soon as the sysfs file for this dump is created/activated there is
+ * a chance the opal_errd daemon (or any userspace) might read and
+ * acknowledge the dump before kobject_uevent() is called. If that
+ * happens then there is a potential race between
+ * dump_ack_store->kobject_put() and kobject_uevent() which leads to a
+ * use-after-free of a kernfs object resulting in a kernel crash.
+ *
+ * To avoid that, we need to take a reference on behalf of the bin file,
+ * so that our reference remains valid while we call kobject_uevent().
+ * We then drop our reference before exiting the function, leaving the
+ * bin file to drop the last reference (if it hasn't already).
+ */
+
+ /* Take a reference for the bin file */
+ kobject_get(&dump->kobj);
rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
- if (rc) {
+ if (rc == 0) {
+ kobject_uevent(&dump->kobj, KOBJ_ADD);
+
+ pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
+ __func__, dump->id, dump->size);
+ } else {
+ /* Drop reference count taken for bin file */
kobject_put(&dump->kobj);
- return NULL;
}
- pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
- __func__, dump->id, dump->size);
-
- kobject_uevent(&dump->kobj, KOBJ_ADD);
-
- return dump;
+ /* Drop our reference */
+ kobject_put(&dump->kobj);
+ return;
}
static irqreturn_t process_dump(int irq, void *data)
@@ -397,7 +420,7 @@ void __init opal_platform_dump_init(void)
int rc;
int dump_irq;
- /* ELOG not supported by firmware */
+ /* Dump not supported by firmware */
if (!opal_check_token(OPAL_DUMP_READ))
return;
diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c
index 62ef7ad995da..554fdd7f88b8 100644
--- a/arch/powerpc/platforms/powernv/opal-elog.c
+++ b/arch/powerpc/platforms/powernv/opal-elog.c
@@ -72,9 +72,14 @@ static ssize_t elog_ack_store(struct elog_obj *elog_obj,
const char *buf,
size_t count)
{
- opal_send_ack_elog(elog_obj->id);
- sysfs_remove_file_self(&elog_obj->kobj, &attr->attr);
- kobject_put(&elog_obj->kobj);
+ /*
+ * Try to self remove this attribute. If we are successful,
+ * delete the kobject itself.
+ */
+ if (sysfs_remove_file_self(&elog_obj->kobj, &attr->attr)) {
+ opal_send_ack_elog(elog_obj->id);
+ kobject_put(&elog_obj->kobj);
+ }
return count;
}
@@ -139,11 +144,12 @@ static struct attribute *elog_default_attrs[] = {
&ack_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(elog_default);
static struct kobj_type elog_ktype = {
.sysfs_ops = &elog_sysfs_ops,
.release = &elog_release,
- .default_attrs = elog_default_attrs,
+ .default_groups = elog_default_groups,
};
/* Maximum size of a single log on FSP is 16KB */
@@ -166,8 +172,8 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
opal_rc = opal_read_elog(__pa(elog->buffer),
elog->size, elog->id);
if (opal_rc != OPAL_SUCCESS) {
- pr_err("ELOG: log read failed for log-id=%llx\n",
- elog->id);
+ pr_err_ratelimited("ELOG: log read failed for log-id=%llx\n",
+ elog->id);
kfree(elog->buffer);
elog->buffer = NULL;
return -EIO;
@@ -179,14 +185,14 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
return count;
}
-static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
+static void create_elog_obj(uint64_t id, size_t size, uint64_t type)
{
struct elog_obj *elog;
int rc;
elog = kzalloc(sizeof(*elog), GFP_KERNEL);
if (!elog)
- return NULL;
+ return;
elog->kobj.kset = elog_kset;
@@ -219,18 +225,37 @@ static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
if (rc) {
kobject_put(&elog->kobj);
- return NULL;
+ return;
}
+ /*
+ * As soon as the sysfs file for this elog is created/activated there is
+ * a chance the opal_errd daemon (or any userspace) might read and
+ * acknowledge the elog before kobject_uevent() is called. If that
+ * happens then there is a potential race between
+ * elog_ack_store->kobject_put() and kobject_uevent() which leads to a
+ * use-after-free of a kernfs object resulting in a kernel crash.
+ *
+ * To avoid that, we need to take a reference on behalf of the bin file,
+ * so that our reference remains valid while we call kobject_uevent().
+ * We then drop our reference before exiting the function, leaving the
+ * bin file to drop the last reference (if it hasn't already).
+ */
+
+ /* Take a reference for the bin file */
+ kobject_get(&elog->kobj);
rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
- if (rc) {
+ if (rc == 0) {
+ kobject_uevent(&elog->kobj, KOBJ_ADD);
+ } else {
+ /* Drop the reference taken for the bin file */
kobject_put(&elog->kobj);
- return NULL;
}
- kobject_uevent(&elog->kobj, KOBJ_ADD);
+ /* Drop our reference */
+ kobject_put(&elog->kobj);
- return elog;
+ return;
}
static irqreturn_t elog_event(int irq, void *data)
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index d361d37d975f..964f464b1b0e 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -60,7 +60,7 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
addr = be64_to_cpu(addr);
pr_debug("Kernel metadata addr: %llx\n", addr);
opal_fdm_active = (void *)addr;
- if (opal_fdm_active->registered_regions == 0)
+ if (be16_to_cpu(opal_fdm_active->registered_regions) == 0)
return;
ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr);
@@ -95,24 +95,24 @@ static int opal_fadump_unregister(struct fw_dump *fadump_conf);
static void opal_fadump_update_config(struct fw_dump *fadump_conf,
const struct opal_fadump_mem_struct *fdm)
{
- pr_debug("Boot memory regions count: %d\n", fdm->region_cnt);
+ pr_debug("Boot memory regions count: %d\n", be16_to_cpu(fdm->region_cnt));
/*
* The destination address of the first boot memory region is the
* destination address of boot memory regions.
*/
- fadump_conf->boot_mem_dest_addr = fdm->rgn[0].dest;
+ fadump_conf->boot_mem_dest_addr = be64_to_cpu(fdm->rgn[0].dest);
pr_debug("Destination address of boot memory regions: %#016llx\n",
fadump_conf->boot_mem_dest_addr);
- fadump_conf->fadumphdr_addr = fdm->fadumphdr_addr;
+ fadump_conf->fadumphdr_addr = be64_to_cpu(fdm->fadumphdr_addr);
}
/*
* This function is called in the capture kernel to get configuration details
* from metadata setup by the first kernel.
*/
-static void opal_fadump_get_config(struct fw_dump *fadump_conf,
+static void __init opal_fadump_get_config(struct fw_dump *fadump_conf,
const struct opal_fadump_mem_struct *fdm)
{
unsigned long base, size, last_end, hole_size;
@@ -126,9 +126,9 @@ static void opal_fadump_get_config(struct fw_dump *fadump_conf,
fadump_conf->boot_memory_size = 0;
pr_debug("Boot memory regions:\n");
- for (i = 0; i < fdm->region_cnt; i++) {
- base = fdm->rgn[i].src;
- size = fdm->rgn[i].size;
+ for (i = 0; i < be16_to_cpu(fdm->region_cnt); i++) {
+ base = be64_to_cpu(fdm->rgn[i].src);
+ size = be64_to_cpu(fdm->rgn[i].size);
pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size);
fadump_conf->boot_mem_addr[i] = base;
@@ -143,7 +143,7 @@ static void opal_fadump_get_config(struct fw_dump *fadump_conf,
* Start address of reserve dump area (permanent reservation) for
* re-registering FADump after dump capture.
*/
- fadump_conf->reserve_dump_area_start = fdm->rgn[0].dest;
+ fadump_conf->reserve_dump_area_start = be64_to_cpu(fdm->rgn[0].dest);
/*
* Rarely, but it can so happen that system crashes before all
@@ -155,13 +155,14 @@ static void opal_fadump_get_config(struct fw_dump *fadump_conf,
* Hope the memory that could not be preserved only has pages
* that are usually filtered out while saving the vmcore.
*/
- if (fdm->region_cnt > fdm->registered_regions) {
+ if (be16_to_cpu(fdm->region_cnt) > be16_to_cpu(fdm->registered_regions)) {
pr_warn("Not all memory regions were saved!!!\n");
pr_warn(" Unsaved memory regions:\n");
- i = fdm->registered_regions;
- while (i < fdm->region_cnt) {
+ i = be16_to_cpu(fdm->registered_regions);
+ while (i < be16_to_cpu(fdm->region_cnt)) {
pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n",
- i, fdm->rgn[i].src, fdm->rgn[i].size);
+ i, be64_to_cpu(fdm->rgn[i].src),
+ be64_to_cpu(fdm->rgn[i].size));
i++;
}
@@ -170,7 +171,7 @@ static void opal_fadump_get_config(struct fw_dump *fadump_conf,
}
fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size);
- fadump_conf->boot_mem_regs_cnt = fdm->region_cnt;
+ fadump_conf->boot_mem_regs_cnt = be16_to_cpu(fdm->region_cnt);
opal_fadump_update_config(fadump_conf, fdm);
}
@@ -178,35 +179,38 @@ static void opal_fadump_get_config(struct fw_dump *fadump_conf,
static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
{
fdm->version = OPAL_FADUMP_VERSION;
- fdm->region_cnt = 0;
- fdm->registered_regions = 0;
- fdm->fadumphdr_addr = 0;
+ fdm->region_cnt = cpu_to_be16(0);
+ fdm->registered_regions = cpu_to_be16(0);
+ fdm->fadumphdr_addr = cpu_to_be64(0);
}
static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
{
u64 addr = fadump_conf->reserve_dump_area_start;
+ u16 reg_cnt;
int i;
opal_fdm = __va(fadump_conf->kernel_metadata);
opal_fadump_init_metadata(opal_fdm);
/* Boot memory regions */
+ reg_cnt = be16_to_cpu(opal_fdm->region_cnt);
for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) {
- opal_fdm->rgn[i].src = fadump_conf->boot_mem_addr[i];
- opal_fdm->rgn[i].dest = addr;
- opal_fdm->rgn[i].size = fadump_conf->boot_mem_sz[i];
+ opal_fdm->rgn[i].src = cpu_to_be64(fadump_conf->boot_mem_addr[i]);
+ opal_fdm->rgn[i].dest = cpu_to_be64(addr);
+ opal_fdm->rgn[i].size = cpu_to_be64(fadump_conf->boot_mem_sz[i]);
- opal_fdm->region_cnt++;
+ reg_cnt++;
addr += fadump_conf->boot_mem_sz[i];
}
+ opal_fdm->region_cnt = cpu_to_be16(reg_cnt);
/*
- * Kernel metadata is passed to f/w and retrieved in capture kerenl.
+ * Kernel metadata is passed to f/w and retrieved in capture kernel.
* So, use it to save fadump header address instead of calculating it.
*/
- opal_fdm->fadumphdr_addr = (opal_fdm->rgn[0].dest +
- fadump_conf->boot_memory_size);
+ opal_fdm->fadumphdr_addr = cpu_to_be64(be64_to_cpu(opal_fdm->rgn[0].dest) +
+ fadump_conf->boot_memory_size);
opal_fadump_update_config(fadump_conf, opal_fdm);
@@ -269,18 +273,21 @@ static u64 opal_fadump_get_bootmem_min(void)
static int opal_fadump_register(struct fw_dump *fadump_conf)
{
s64 rc = OPAL_PARAMETER;
+ u16 registered_regs;
int i, err = -EIO;
- for (i = 0; i < opal_fdm->region_cnt; i++) {
+ registered_regs = be16_to_cpu(opal_fdm->registered_regions);
+ for (i = 0; i < be16_to_cpu(opal_fdm->region_cnt); i++) {
rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE,
- opal_fdm->rgn[i].src,
- opal_fdm->rgn[i].dest,
- opal_fdm->rgn[i].size);
+ be64_to_cpu(opal_fdm->rgn[i].src),
+ be64_to_cpu(opal_fdm->rgn[i].dest),
+ be64_to_cpu(opal_fdm->rgn[i].size));
if (rc != OPAL_SUCCESS)
break;
- opal_fdm->registered_regions++;
+ registered_regs++;
}
+ opal_fdm->registered_regions = cpu_to_be16(registered_regs);
switch (rc) {
case OPAL_SUCCESS:
@@ -291,7 +298,8 @@ static int opal_fadump_register(struct fw_dump *fadump_conf)
case OPAL_RESOURCE:
/* If MAX regions limit in f/w is hit, warn and proceed. */
pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n",
- (opal_fdm->region_cnt - opal_fdm->registered_regions));
+ (be16_to_cpu(opal_fdm->region_cnt) -
+ be16_to_cpu(opal_fdm->registered_regions)));
fadump_conf->dump_registered = 1;
err = 0;
break;
@@ -312,7 +320,7 @@ static int opal_fadump_register(struct fw_dump *fadump_conf)
* If some regions were registered before OPAL_MPIPL_ADD_RANGE
* OPAL call failed, unregister all regions.
*/
- if ((err < 0) && (opal_fdm->registered_regions > 0))
+ if ((err < 0) && (be16_to_cpu(opal_fdm->registered_regions) > 0))
opal_fadump_unregister(fadump_conf);
return err;
@@ -328,7 +336,7 @@ static int opal_fadump_unregister(struct fw_dump *fadump_conf)
return -EIO;
}
- opal_fdm->registered_regions = 0;
+ opal_fdm->registered_regions = cpu_to_be16(0);
fadump_conf->dump_registered = 0;
return 0;
}
@@ -563,25 +571,26 @@ static void opal_fadump_region_show(struct fw_dump *fadump_conf,
else
fdm_ptr = opal_fdm;
- for (i = 0; i < fdm_ptr->region_cnt; i++) {
+ for (i = 0; i < be16_to_cpu(fdm_ptr->region_cnt); i++) {
/*
* Only regions that are registered for MPIPL
* would have dump data.
*/
if ((fadump_conf->dump_active) &&
- (i < fdm_ptr->registered_regions))
- dumped_bytes = fdm_ptr->rgn[i].size;
+ (i < be16_to_cpu(fdm_ptr->registered_regions)))
+ dumped_bytes = be64_to_cpu(fdm_ptr->rgn[i].size);
seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
- fdm_ptr->rgn[i].src, fdm_ptr->rgn[i].dest);
+ be64_to_cpu(fdm_ptr->rgn[i].src),
+ be64_to_cpu(fdm_ptr->rgn[i].dest));
seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
- fdm_ptr->rgn[i].size, dumped_bytes);
+ be64_to_cpu(fdm_ptr->rgn[i].size), dumped_bytes);
}
- /* Dump is active. Show reserved area start address. */
+ /* Dump is active. Show preserved area start address. */
if (fadump_conf->dump_active) {
- seq_printf(m, "\nMemory above %#016lx is reserved for saving crash dump\n",
- fadump_conf->reserve_dump_area_start);
+ seq_printf(m, "\nMemory above %#016llx is reserved for saving crash dump\n",
+ fadump_conf->boot_mem_top);
}
}
@@ -624,6 +633,7 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
{
const __be32 *prop;
unsigned long dn;
+ __be64 be_addr;
u64 addr = 0;
int i, len;
s64 ret;
@@ -671,7 +681,7 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
* Firmware supports 32-bit field for size. Align it to PAGE_SIZE
* and request firmware to copy multiple kernel boot memory regions.
*/
- fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE);
+ fadump_conf->max_copy_size = ALIGN_DOWN(U32_MAX, PAGE_SIZE);
/*
* Check if dump has been initiated on last reboot.
@@ -680,13 +690,13 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
if (!prop)
return;
- ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
- if ((ret != OPAL_SUCCESS) || !addr) {
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &be_addr);
+ if ((ret != OPAL_SUCCESS) || !be_addr) {
pr_err("Failed to get Kernel metadata (%lld)\n", ret);
return;
}
- addr = be64_to_cpu(addr);
+ addr = be64_to_cpu(be_addr);
pr_debug("Kernel metadata addr: %llx\n", addr);
opal_fdm_active = __va(addr);
@@ -697,14 +707,14 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
}
/* Kernel regions not registered with f/w for MPIPL */
- if (opal_fdm_active->registered_regions == 0) {
+ if (be16_to_cpu(opal_fdm_active->registered_regions) == 0) {
opal_fdm_active = NULL;
return;
}
- ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
- if (addr) {
- addr = be64_to_cpu(addr);
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &be_addr);
+ if (be_addr) {
+ addr = be64_to_cpu(be_addr);
pr_debug("CPU metadata addr: %llx\n", addr);
opal_cpu_metadata = __va(addr);
}
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
index f1e9ecf548c5..5eeb794b5eb1 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.h
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -31,14 +31,14 @@
* OPAL FADump kernel metadata
*
* The address of this structure will be registered with f/w for retrieving
- * and processing during crash dump.
+ * in the capture kernel to process the crash dump.
*/
struct opal_fadump_mem_struct {
u8 version;
u8 reserved[3];
- u16 region_cnt; /* number of regions */
- u16 registered_regions; /* Regions registered for MPIPL */
- u64 fadumphdr_addr;
+ __be16 region_cnt; /* number of regions */
+ __be16 registered_regions; /* Regions registered for MPIPL */
+ __be64 fadumphdr_addr;
struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS];
} __packed;
@@ -135,7 +135,7 @@ static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
reg_entry = (struct hdat_fadump_reg_entry *)bufp;
val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) :
- reg_entry->reg_val);
+ (u64 __force)(reg_entry->reg_val));
opal_fadump_set_regval_regnum(regs,
be32_to_cpu(reg_entry->reg_type),
be32_to_cpu(reg_entry->reg_num),
diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c
index 7e7d38b17420..d5ea04e8e4c5 100644
--- a/arch/powerpc/platforms/powernv/opal-flash.c
+++ b/arch/powerpc/platforms/powernv/opal-flash.c
@@ -512,7 +512,7 @@ static struct attribute *image_op_attrs[] = {
NULL /* need to NULL terminate the list of attributes */
};
-static struct attribute_group image_op_attr_group = {
+static const struct attribute_group image_op_attr_group = {
.attrs = image_op_attrs,
};
@@ -520,6 +520,10 @@ void __init opal_flash_update_init(void)
{
int ret;
+ /* Firmware update is not supported by firmware */
+ if (!opal_check_token(OPAL_FLASH_VALIDATE))
+ return;
+
/* Allocate validate image buffer */
validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL);
if (!validate_flash_data.buf) {
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index 3e1f064a18db..f0c1830deb51 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -213,6 +213,8 @@ static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
"A hypervisor resource error occurred",
"CAPP recovery process is in progress",
};
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
/* Print things out */
if (hmi_evt->version < OpalHMIEvt_V1) {
@@ -240,19 +242,22 @@ static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
break;
}
- printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
- level, sevstr,
- hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
- "Recovered" : "Not recovered");
- error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
- hmi_error_types[hmi_evt->type]
- : "Unknown";
- printk("%s Error detail: %s\n", level, error_info);
- printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
- if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
- (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
- printk("%s TFMR: %016llx\n", level,
+ if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {
+ printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
+ level, sevstr,
+ hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
+ "Recovered" : "Not recovered");
+ error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
+ hmi_error_types[hmi_evt->type]
+ : "Unknown";
+ printk("%s Error detail: %s\n", level, error_info);
+ printk("%s HMER: %016llx\n", level,
+ be64_to_cpu(hmi_evt->hmer));
+ if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
+ (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
+ printk("%s TFMR: %016llx\n", level,
be64_to_cpu(hmi_evt->tfmr));
+ }
if (hmi_evt->version < OpalHMIEvt_V2)
return;
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 000b350d4060..828fc4d88471 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -11,13 +11,12 @@
#include <linux/platform_device.h>
#include <linux/of.h>
#include <linux/of_address.h>
-#include <linux/of_platform.h>
#include <linux/crash_dump.h>
+#include <linux/debugfs.h>
#include <asm/opal.h>
#include <asm/io.h>
#include <asm/imc-pmu.h>
#include <asm/cputhreads.h>
-#include <asm/debugfs.h>
static struct dentry *imc_debugfs_parent;
@@ -35,11 +34,10 @@ static int imc_mem_set(void *data, u64 val)
}
DEFINE_DEBUGFS_ATTRIBUTE(fops_imc_x64, imc_mem_get, imc_mem_set, "0x%016llx\n");
-static struct dentry *imc_debugfs_create_x64(const char *name, umode_t mode,
- struct dentry *parent, u64 *value)
+static void imc_debugfs_create_x64(const char *name, umode_t mode,
+ struct dentry *parent, u64 *value)
{
- return debugfs_create_file_unsafe(name, mode, parent,
- value, &fops_imc_x64);
+ debugfs_create_file_unsafe(name, mode, parent, value, &fops_imc_x64);
}
/*
@@ -57,10 +55,7 @@ static void export_imc_mode_and_cmd(struct device_node *node,
u32 cb_offset;
struct imc_mem_info *ptr = pmu_ptr->mem_info;
- imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
-
- if (!imc_debugfs_parent)
- return;
+ imc_debugfs_parent = debugfs_create_dir("imc", arch_debugfs_dir);
if (of_property_read_u32(node, "cb_offset", &cb_offset))
cb_offset = IMC_CNTL_BLK_OFFSET;
@@ -69,21 +64,15 @@ static void export_imc_mode_and_cmd(struct device_node *node,
loc = (u64)(ptr->vbase) + cb_offset;
imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
sprintf(mode, "imc_mode_%d", (u32)(ptr->id));
- if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
- imc_mode_addr))
- goto err;
+ imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
+ imc_mode_addr);
imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET);
sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id));
- if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
- imc_cmd_addr))
- goto err;
+ imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
+ imc_cmd_addr);
ptr++;
}
- return;
-
-err:
- debugfs_remove_recursive(imc_debugfs_parent);
}
/*
@@ -196,7 +185,7 @@ static void disable_nest_pmu_counters(void)
int nid, cpu;
const struct cpumask *l_cpumask;
- get_online_cpus();
+ cpus_read_lock();
for_each_node_with_cpus(nid) {
l_cpumask = cpumask_of_node(nid);
cpu = cpumask_first_and(l_cpumask, cpu_online_mask);
@@ -205,25 +194,25 @@ static void disable_nest_pmu_counters(void)
opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
get_hard_smp_processor_id(cpu));
}
- put_online_cpus();
+ cpus_read_unlock();
}
static void disable_core_pmu_counters(void)
{
- cpumask_t cores_map;
int cpu, rc;
- get_online_cpus();
+ cpus_read_lock();
/* Disable the IMC Core functions */
- cores_map = cpu_online_cores_map();
- for_each_cpu(cpu, &cores_map) {
+ for_each_online_cpu(cpu) {
+ if (cpu_first_thread_sibling(cpu) != cpu)
+ continue;
rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
get_hard_smp_processor_id(cpu));
if (rc)
pr_err("%s: Failed to stop Core (cpu = %d)\n",
- __FUNCTION__, cpu);
+ __func__, cpu);
}
- put_online_cpus();
+ cpus_read_unlock();
}
int get_max_nest_dev(void)
@@ -278,14 +267,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
domain = IMC_DOMAIN_THREAD;
break;
case IMC_TYPE_TRACE:
- /*
- * FIXME. Using trace_imc events to monitor application
- * or KVM thread performance can cause a checkstop
- * (system crash).
- * Disable it for now.
- */
- pr_info_once("IMC: disabling trace_imc PMU\n");
- domain = -1;
+ domain = IMC_DOMAIN_TRACE;
break;
default:
pr_warn("IMC Unknown Device type \n");
diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
index c164419e254d..56a1f7ce78d2 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -46,23 +46,20 @@ void opal_handle_events(void)
e = READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask;
again:
while (e) {
- int virq, hwirq;
+ int hwirq;
hwirq = fls64(e) - 1;
e &= ~BIT_ULL(hwirq);
local_irq_disable();
- virq = irq_find_mapping(opal_event_irqchip.domain, hwirq);
- if (virq) {
- irq_enter();
- generic_handle_irq(virq);
- irq_exit();
- }
+ irq_enter();
+ generic_handle_domain_irq(opal_event_irqchip.domain, hwirq);
+ irq_exit();
local_irq_enable();
cond_resched();
}
- last_outstanding_events = 0;
+ WRITE_ONCE(last_outstanding_events, 0);
if (opal_poll_events(&events) != OPAL_SUCCESS)
return;
e = be64_to_cpu(events) & opal_event_irqchip.mask;
@@ -72,7 +69,7 @@ again:
bool opal_have_pending_events(void)
{
- if (last_outstanding_events & opal_event_irqchip.mask)
+ if (READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask)
return true;
return false;
}
@@ -127,7 +124,7 @@ static irqreturn_t opal_interrupt(int irq, void *data)
__be64 events;
opal_handle_interrupt(virq_to_hw(irq), &events);
- last_outstanding_events = be64_to_cpu(events);
+ WRITE_ONCE(last_outstanding_events, be64_to_cpu(events));
if (opal_have_pending_events())
opal_wake_poller();
@@ -278,6 +275,8 @@ int __init opal_event_init(void)
else
name = kasprintf(GFP_KERNEL, "opal");
+ if (!name)
+ continue;
/* Install interrupt handler */
rc = request_irq(r->start, opal_interrupt, r->flags & IRQD_TRIGGER_MASK,
name, NULL);
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index 608569082ba0..a16f07cdab26 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -10,13 +10,13 @@
#include <linux/bug.h>
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/debugfs.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/opal.h>
#include <asm/prom.h>
#include <linux/uaccess.h>
-#include <asm/debugfs.h>
#include <asm/isa-bridge.h>
static int opal_lpc_chip_id = -1;
@@ -197,7 +197,7 @@ static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf,
/*
* Select access size based on count and alignment and
- * access type. IO and MEM only support byte acceses,
+ * access type. IO and MEM only support byte accesses,
* FW supports all 3.
*/
len = 1;
@@ -371,7 +371,7 @@ static int opal_lpc_init_debugfs(void)
if (opal_lpc_chip_id < 0)
return -ENODEV;
- root = debugfs_create_dir("lpc", powerpc_debugfs_root);
+ root = debugfs_create_dir("lpc", arch_debugfs_dir);
rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO);
rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM);
@@ -396,13 +396,14 @@ void __init opal_lpc_init(void)
if (!of_get_property(np, "primary", NULL))
continue;
opal_lpc_chip_id = of_get_ibm_chip_id(np);
+ of_node_put(np);
break;
}
if (opal_lpc_chip_id < 0)
return;
/* Does it support direct mapping ? */
- if (of_get_property(np, "ranges", NULL)) {
+ if (of_property_present(np, "ranges")) {
pr_info("OPAL: Found memory mapped LPC bus on chip %d\n",
opal_lpc_chip_id);
isa_bridge_init_non_pci(np);
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
index 1e8e17df9ce8..a1754a28265d 100644
--- a/arch/powerpc/platforms/powernv/opal-memory-errors.c
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -82,7 +82,7 @@ static DECLARE_WORK(mem_error_work, mem_error_handler);
/*
* opal_memory_err_event - notifier handler that queues up the opal message
- * to be preocessed later.
+ * to be processed later.
*/
static int opal_memory_err_event(struct notifier_block *nb,
unsigned long msg_type, void *msg)
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index d26da19a611f..22d6efe17b0d 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -12,6 +12,8 @@
#include <linux/types.h>
#include <asm/barrier.h>
+#include "powernv.h"
+
/* OPAL in-memory console. Defined in OPAL source at core/console.c */
struct memcons {
__be64 magic;
@@ -103,7 +105,7 @@ static struct bin_attribute opal_msglog_attr = {
.read = opal_msglog_read
};
-struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name)
+struct memcons *__init memcons_init(struct device_node *node, const char *mc_prop_name)
{
u64 mcaddr;
struct memcons *mc;
@@ -131,7 +133,7 @@ out_err:
return NULL;
}
-u32 memcons_get_size(struct memcons *mc)
+u32 __init memcons_get_size(struct memcons *mc)
{
return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size);
}
diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c
index 2a3717fc24ea..db99ffcb7b82 100644
--- a/arch/powerpc/platforms/powernv/opal-power.c
+++ b/arch/powerpc/platforms/powernv/opal-power.c
@@ -53,7 +53,7 @@ static bool detect_epow(void)
}
/* Check for existing EPOW, DPO events */
-static bool poweroff_pending(void)
+static bool __init poweroff_pending(void)
{
int rc;
__be64 opal_dpo_timeout;
diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c
index c16d44f6f1d1..ea917266aa17 100644
--- a/arch/powerpc/platforms/powernv/opal-powercap.c
+++ b/arch/powerpc/platforms/powernv/opal-powercap.c
@@ -129,7 +129,7 @@ out_token:
return ret;
}
-static void powercap_add_attr(int handle, const char *name,
+static void __init powercap_add_attr(int handle, const char *name,
struct powercap_attr *attr)
{
attr->handle = handle;
@@ -153,7 +153,7 @@ void __init opal_powercap_init(void)
pcaps = kcalloc(of_get_child_count(powercap), sizeof(*pcaps),
GFP_KERNEL);
if (!pcaps)
- return;
+ goto out_put_powercap;
powercap_kobj = kobject_create_and_add("powercap", opal_kobj);
if (!powercap_kobj) {
@@ -196,6 +196,12 @@ void __init opal_powercap_init(void)
j = 0;
pcaps[i].pg.name = kasprintf(GFP_KERNEL, "%pOFn", node);
+ if (!pcaps[i].pg.name) {
+ kfree(pcaps[i].pattrs);
+ kfree(pcaps[i].pg.attrs);
+ goto out_pcaps_pattrs;
+ }
+
if (has_min) {
powercap_add_attr(min, "powercap-min",
&pcaps[i].pattrs[j]);
@@ -226,6 +232,7 @@ void __init opal_powercap_init(void)
}
i++;
}
+ of_node_put(powercap);
return;
@@ -236,6 +243,9 @@ out_pcaps_pattrs:
kfree(pcaps[i].pg.name);
}
kobject_put(powercap_kobj);
+ of_node_put(node);
out_pcaps:
kfree(pcaps);
+out_put_powercap:
+ of_node_put(powercap);
}
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index 45f4223a790f..24f04f20d3e8 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -24,13 +24,20 @@
#include <linux/uaccess.h>
-/**
+struct opal_prd_msg {
+ union {
+ struct opal_prd_msg_header header;
+ DECLARE_FLEX_ARRAY(u8, data);
+ };
+};
+
+/*
* The msg member must be at the end of the struct, as it's followed by the
* message data.
*/
struct opal_prd_msg_queue_item {
- struct list_head list;
- struct opal_prd_msg_header msg;
+ struct list_head list;
+ struct opal_prd_msg msg;
};
static struct device_node *prd_node;
@@ -59,6 +66,8 @@ static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size)
const char *label;
addrp = of_get_address(node, 0, &range_size, NULL);
+ if (!addrp)
+ continue;
range_addr = of_read_number(addrp, 2);
range_end = range_addr + range_size;
@@ -105,7 +114,6 @@ static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t addr, size;
pgprot_t page_prot;
- int rc;
pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
@@ -121,10 +129,8 @@ static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma)
page_prot = phys_mem_access_prot(file, vma->vm_pgoff,
size, vma->vm_page_prot);
- rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
+ return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
page_prot);
-
- return rc;
}
static bool opal_msg_queue_empty(void)
@@ -159,7 +165,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf,
int rc;
/* we need at least a header's worth of data */
- if (count < sizeof(item->msg))
+ if (count < sizeof(item->msg.header))
return -EINVAL;
if (*ppos)
@@ -189,7 +195,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf,
return -EINTR;
}
- size = be16_to_cpu(item->msg.size);
+ size = be16_to_cpu(item->msg.header.size);
if (size > count) {
err = -EINVAL;
goto err_requeue;
@@ -217,8 +223,8 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct opal_prd_msg_header hdr;
+ struct opal_prd_msg *msg;
ssize_t size;
- void *msg;
int rc;
size = sizeof(hdr);
@@ -250,12 +256,12 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf,
static int opal_prd_release(struct inode *inode, struct file *file)
{
- struct opal_prd_msg_header msg;
+ struct opal_prd_msg msg;
- msg.size = cpu_to_be16(sizeof(msg));
- msg.type = OPAL_PRD_MSG_TYPE_FINI;
+ msg.header.size = cpu_to_be16(sizeof(msg));
+ msg.header.type = OPAL_PRD_MSG_TYPE_FINI;
- opal_prd_msg((struct opal_prd_msg *)&msg);
+ opal_prd_msg(&msg);
atomic_xchg(&prd_usage, 0);
@@ -355,7 +361,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
if (!item)
return -ENOMEM;
- memcpy(&item->msg, msg->params, msg_size);
+ memcpy(&item->msg.data, msg->params, msg_size);
spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
list_add_tail(&item->list, &opal_prd_msg_queue);
@@ -372,6 +378,12 @@ static struct notifier_block opal_prd_event_nb = {
.priority = 0,
};
+static struct notifier_block opal_prd_event_nb2 = {
+ .notifier_call = opal_prd_msg_notifier,
+ .next = NULL,
+ .priority = 0,
+};
+
static int opal_prd_probe(struct platform_device *pdev)
{
int rc;
@@ -393,9 +405,10 @@ static int opal_prd_probe(struct platform_device *pdev)
return rc;
}
- rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb);
+ rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb2);
if (rc) {
pr_err("Couldn't register PRD2 event notifier\n");
+ opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
return rc;
}
@@ -404,17 +417,19 @@ static int opal_prd_probe(struct platform_device *pdev)
pr_err("failed to register miscdev\n");
opal_message_notifier_unregister(OPAL_MSG_PRD,
&opal_prd_event_nb);
+ opal_message_notifier_unregister(OPAL_MSG_PRD2,
+ &opal_prd_event_nb2);
return rc;
}
return 0;
}
-static int opal_prd_remove(struct platform_device *pdev)
+static void opal_prd_remove(struct platform_device *pdev)
{
misc_deregister(&opal_prd_dev);
opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
- return 0;
+ opal_message_notifier_unregister(OPAL_MSG_PRD2, &opal_prd_event_nb2);
}
static const struct of_device_id opal_prd_match[] = {
@@ -428,7 +443,7 @@ static struct platform_driver opal_prd_driver = {
.of_match_table = opal_prd_match,
},
.probe = opal_prd_probe,
- .remove = opal_prd_remove,
+ .remove_new = opal_prd_remove,
};
module_platform_driver(opal_prd_driver);
diff --git a/arch/powerpc/platforms/powernv/opal-psr.c b/arch/powerpc/platforms/powernv/opal-psr.c
index 69d7e75950d1..6441e17b6996 100644
--- a/arch/powerpc/platforms/powernv/opal-psr.c
+++ b/arch/powerpc/platforms/powernv/opal-psr.c
@@ -135,7 +135,7 @@ void __init opal_psr_init(void)
psr_attrs = kcalloc(of_get_child_count(psr), sizeof(*psr_attrs),
GFP_KERNEL);
if (!psr_attrs)
- return;
+ goto out_put_psr;
psr_kobj = kobject_create_and_add("psr", opal_kobj);
if (!psr_kobj) {
@@ -162,10 +162,14 @@ void __init opal_psr_init(void)
}
i++;
}
+ of_node_put(psr);
return;
out_kobj:
+ of_node_put(node);
kobject_put(psr_kobj);
out:
kfree(psr_attrs);
+out_put_psr:
+ of_node_put(psr);
}
diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c
index 44d7dacb33a2..79011a263aa6 100644
--- a/arch/powerpc/platforms/powernv/opal-rtc.c
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -11,14 +11,15 @@
#include <linux/bcd.h>
#include <linux/rtc.h>
#include <linux/delay.h>
-#include <linux/platform_device.h>
+#include <linux/of.h>
#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <asm/opal.h>
#include <asm/firmware.h>
#include <asm/machdep.h>
-static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
+static void __init opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
{
tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) +
bcd2bin((y_m_d >> 16) & 0xff)) - 1900;
diff --git a/arch/powerpc/platforms/powernv/opal-secvar.c b/arch/powerpc/platforms/powernv/opal-secvar.c
index 14133e120bdd..6ac410f4d3c7 100644
--- a/arch/powerpc/platforms/powernv/opal-secvar.c
+++ b/arch/powerpc/platforms/powernv/opal-secvar.c
@@ -12,8 +12,8 @@
#define pr_fmt(fmt) "secvar: "fmt
#include <linux/types.h>
+#include <linux/of.h>
#include <linux/platform_device.h>
-#include <linux/of_platform.h>
#include <asm/opal.h>
#include <asm/secvar.h>
#include <asm/secure_boot.h>
@@ -54,8 +54,7 @@ static int opal_status_to_err(int rc)
return err;
}
-static int opal_get_variable(const char *key, uint64_t ksize,
- u8 *data, uint64_t *dsize)
+static int opal_get_variable(const char *key, u64 ksize, u8 *data, u64 *dsize)
{
int rc;
@@ -71,8 +70,7 @@ static int opal_get_variable(const char *key, uint64_t ksize,
return opal_status_to_err(rc);
}
-static int opal_get_next_variable(const char *key, uint64_t *keylen,
- uint64_t keybufsize)
+static int opal_get_next_variable(const char *key, u64 *keylen, u64 keybufsize)
{
int rc;
@@ -88,8 +86,7 @@ static int opal_get_next_variable(const char *key, uint64_t *keylen,
return opal_status_to_err(rc);
}
-static int opal_set_variable(const char *key, uint64_t ksize, u8 *data,
- uint64_t dsize)
+static int opal_set_variable(const char *key, u64 ksize, u8 *data, u64 dsize)
{
int rc;
@@ -101,10 +98,57 @@ static int opal_set_variable(const char *key, uint64_t ksize, u8 *data,
return opal_status_to_err(rc);
}
+static ssize_t opal_secvar_format(char *buf, size_t bufsize)
+{
+ ssize_t rc = 0;
+ struct device_node *node;
+ const char *format;
+
+ node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend");
+ if (!of_device_is_available(node)) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ rc = of_property_read_string(node, "format", &format);
+ if (rc)
+ goto out;
+
+ rc = snprintf(buf, bufsize, "%s", format);
+
+out:
+ of_node_put(node);
+
+ return rc;
+}
+
+static int opal_secvar_max_size(u64 *max_size)
+{
+ int rc;
+ struct device_node *node;
+
+ node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend");
+ if (!node)
+ return -ENODEV;
+
+ if (!of_device_is_available(node)) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ rc = of_property_read_u64(node, "max-var-size", max_size);
+
+out:
+ of_node_put(node);
+ return rc;
+}
+
static const struct secvar_operations opal_secvar_ops = {
.get = opal_get_variable,
.get_next = opal_get_next_variable,
.set = opal_set_variable,
+ .format = opal_secvar_format,
+ .max_size = opal_secvar_max_size,
};
static int opal_secvar_probe(struct platform_device *pdev)
@@ -116,9 +160,7 @@ static int opal_secvar_probe(struct platform_device *pdev)
return -ENODEV;
}
- set_secvar_ops(&opal_secvar_ops);
-
- return 0;
+ return set_secvar_ops(&opal_secvar_ops);
}
static const struct of_device_id opal_secvar_match[] = {
diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
index f8ae1fb0c102..9944376b115c 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor-groups.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
@@ -126,7 +126,7 @@ static void add_attr(int handle, struct sg_attr *attr, int index)
attr->attr.store = ops_info[index].store;
}
-static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg,
+static int __init add_attr_group(const __be32 *ops, int len, struct sensor_group *sg,
u32 handle)
{
int i, j;
@@ -144,7 +144,7 @@ static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg,
return sysfs_create_group(sg_kobj, &sg->sg);
}
-static int get_nr_attrs(const __be32 *ops, int len)
+static int __init get_nr_attrs(const __be32 *ops, int len)
{
int i, j;
int nr_attrs = 0;
@@ -170,7 +170,7 @@ void __init opal_sensor_groups_init(void)
sgs = kcalloc(of_get_child_count(sg), sizeof(*sgs), GFP_KERNEL);
if (!sgs)
- return;
+ goto out_sg_put;
sg_kobj = kobject_create_and_add("sensor_groups", opal_kobj);
if (!sg_kobj) {
@@ -222,6 +222,7 @@ void __init opal_sensor_groups_init(void)
}
i++;
}
+ of_node_put(sg);
return;
@@ -231,6 +232,9 @@ out_sgs_sgattrs:
kfree(sgs[i].sg.attrs);
}
kobject_put(sg_kobj);
+ of_node_put(node);
out_sgs:
kfree(sgs);
+out_sg_put:
+ of_node_put(sg);
}
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index 3192c614a1e1..8880a1c14573 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -6,7 +6,9 @@
*/
#include <linux/delay.h>
+#include <linux/of.h>
#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <asm/opal.h>
#include <asm/machdep.h>
diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c
index f16a43540e30..91b36541b9e5 100644
--- a/arch/powerpc/platforms/powernv/opal-tracepoints.c
+++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c
@@ -2,7 +2,6 @@
#include <linux/percpu.h>
#include <linux/jump_label.h>
#include <asm/trace.h>
-#include <asm/asm-prototypes.h>
#ifdef CONFIG_JUMP_LABEL
struct static_key opal_tracepoint_key = STATIC_KEY_INIT;
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index e5acc33b3b20..0ed95f753416 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -57,7 +57,7 @@ opal_return:
.long 0xa64b7b7d /* mthsrr1 r11 */
.long 0x2402004c /* hrfid */
#endif
- ld r2,PACATOC(r13)
+ LOAD_PACA_TOC()
ld r0,PPC_LR_STKOFF(r1)
mtlr r0
blr
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index fd510d961b8c..748c2b97fa53 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -14,11 +14,11 @@
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/debugfs.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/opal.h>
-#include <asm/debugfs.h>
#include <asm/prom.h>
static u64 opal_scom_unmangle(u64 addr)
@@ -165,10 +165,15 @@ static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
ent->chip = chip;
snprintf(ent->name, 16, "%08x", chip);
ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn);
+ if (!ent->path.data) {
+ kfree(ent);
+ return -ENOMEM;
+ }
+
ent->path.size = strlen((char *)ent->path.data);
dir = debugfs_create_dir(ent->name, root);
- if (!dir) {
+ if (IS_ERR(dir)) {
kfree(ent->path.data);
kfree(ent);
return -1;
@@ -189,8 +194,8 @@ static int scom_debug_init(void)
if (!firmware_has_feature(FW_FEATURE_OPAL))
return 0;
- root = debugfs_create_dir("scom", powerpc_debugfs_root);
- if (!root)
+ root = debugfs_create_dir("scom", arch_debugfs_dir);
+ if (IS_ERR(root))
return -1;
rc = 0;
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index a6ee08009f0f..45dd77e3ccf6 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -73,7 +73,7 @@ static struct task_struct *kopald_tsk;
static struct opal_msg *opal_msg;
static u32 opal_msg_size __ro_after_init;
-void opal_configure_cores(void)
+void __init opal_configure_cores(void)
{
u64 reinit_flags = 0;
@@ -424,7 +424,7 @@ static int __init opal_message_init(struct device_node *opal_node)
return 0;
}
-int opal_get_chars(uint32_t vtermno, char *buf, int count)
+ssize_t opal_get_chars(uint32_t vtermno, u8 *buf, size_t count)
{
s64 rc;
__be64 evt, len;
@@ -441,10 +441,11 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count)
return 0;
}
-static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic)
+static ssize_t __opal_put_chars(uint32_t vtermno, const u8 *data,
+ size_t total_len, bool atomic)
{
unsigned long flags = 0 /* shut up gcc */;
- int written;
+ ssize_t written;
__be64 olen;
s64 rc;
@@ -484,7 +485,7 @@ static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, b
if (atomic) {
/* Should not happen */
pr_warn("atomic console write returned partial "
- "len=%d written=%d\n", total_len, written);
+ "len=%zu written=%zd\n", total_len, written);
}
if (!written)
written = -EAGAIN;
@@ -497,7 +498,7 @@ out:
return written;
}
-int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
+ssize_t opal_put_chars(uint32_t vtermno, const u8 *data, size_t total_len)
{
return __opal_put_chars(vtermno, data, total_len, false);
}
@@ -508,7 +509,8 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
* true at the moment because console space can race with OPAL's console
* writes.
*/
-int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
+ssize_t opal_put_chars_atomic(uint32_t vtermno, const u8 *data,
+ size_t total_len)
{
return __opal_put_chars(vtermno, data, total_len, true);
}
@@ -588,7 +590,7 @@ static int opal_recover_mce(struct pt_regs *regs,
{
int recovered = 0;
- if (!(regs->msr & MSR_RI)) {
+ if (regs_is_unrecoverable(regs)) {
/* If MSR_RI isn't set, we cannot recover */
pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
recovered = 0;
@@ -624,7 +626,7 @@ static int opal_recover_mce(struct pt_regs *regs,
*/
recovered = 0;
} else {
- die("Machine check", regs, SIGBUS);
+ die_mce("Machine check", regs, SIGBUS);
recovered = 1;
}
}
@@ -731,7 +733,7 @@ int opal_hmi_exception_early2(struct pt_regs *regs)
return 1;
}
-/* HMI exception handler called in virtual mode during check_irq_replay. */
+/* HMI exception handler called in virtual mode when irqs are next enabled. */
int opal_handle_hmi_exception(struct pt_regs *regs)
{
/*
@@ -773,13 +775,13 @@ bool opal_mce_check_early_recovery(struct pt_regs *regs)
* Setup regs->nip to rfi into fixup address.
*/
if (recover_addr)
- regs->nip = recover_addr;
+ regs_set_return_ip(regs, recover_addr);
out:
return !!recover_addr;
}
-static int opal_sysfs_init(void)
+static int __init opal_sysfs_init(void)
{
opal_kobj = kobject_create_and_add("opal", firmware_kobj);
if (!opal_kobj) {
@@ -790,48 +792,85 @@ static int opal_sysfs_init(void)
return 0;
}
-static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t count)
+static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *buf,
+ loff_t off, size_t count)
{
return memory_read_from_buffer(buf, count, &off, bin_attr->private,
bin_attr->size);
}
-static struct bin_attribute symbol_map_attr = {
- .attr = {.name = "symbol_map", .mode = 0400},
- .read = symbol_map_read
-};
-
-static void opal_export_symmap(void)
+static int opal_add_one_export(struct kobject *parent, const char *export_name,
+ struct device_node *np, const char *prop_name)
{
- const __be64 *syms;
- unsigned int size;
- struct device_node *fw;
+ struct bin_attribute *attr = NULL;
+ const char *name = NULL;
+ u64 vals[2];
int rc;
- fw = of_find_node_by_path("/ibm,opal/firmware");
- if (!fw)
- return;
- syms = of_get_property(fw, "symbol-map", &size);
- if (!syms || size != 2 * sizeof(__be64))
- return;
+ rc = of_property_read_u64_array(np, prop_name, &vals[0], 2);
+ if (rc)
+ goto out;
- /* Setup attributes */
- symbol_map_attr.private = __va(be64_to_cpu(syms[0]));
- symbol_map_attr.size = be64_to_cpu(syms[1]);
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ name = kstrdup(export_name, GFP_KERNEL);
+ if (!name) {
+ rc = -ENOMEM;
+ goto out;
+ }
- rc = sysfs_create_bin_file(opal_kobj, &symbol_map_attr);
- if (rc)
- pr_warn("Error %d creating OPAL symbols file\n", rc);
+ sysfs_bin_attr_init(attr);
+ attr->attr.name = name;
+ attr->attr.mode = 0400;
+ attr->read = export_attr_read;
+ attr->private = __va(vals[0]);
+ attr->size = vals[1];
+
+ rc = sysfs_create_bin_file(parent, attr);
+out:
+ if (rc) {
+ kfree(name);
+ kfree(attr);
+ }
+
+ return rc;
}
-static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
- struct bin_attribute *bin_attr, char *buf,
- loff_t off, size_t count)
+static void opal_add_exported_attrs(struct device_node *np,
+ struct kobject *kobj)
{
- return memory_read_from_buffer(buf, count, &off, bin_attr->private,
- bin_attr->size);
+ struct device_node *child;
+ struct property *prop;
+
+ for_each_property_of_node(np, prop) {
+ int rc;
+
+ if (!strcmp(prop->name, "name") ||
+ !strcmp(prop->name, "phandle"))
+ continue;
+
+ rc = opal_add_one_export(kobj, prop->name, np, prop->name);
+ if (rc) {
+ pr_warn("Unable to add export %pOF/%s, rc = %d!\n",
+ np, prop->name, rc);
+ }
+ }
+
+ for_each_child_of_node(np, child) {
+ struct kobject *child_kobj;
+
+ child_kobj = kobject_create_and_add(child->name, kobj);
+ if (!child_kobj) {
+ pr_err("Unable to create export dir for %pOF\n", child);
+ continue;
+ }
+
+ opal_add_exported_attrs(child, child_kobj);
+ }
}
/*
@@ -843,11 +882,8 @@ static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
*/
static void opal_export_attrs(void)
{
- struct bin_attribute *attr;
struct device_node *np;
- struct property *prop;
struct kobject *kobj;
- u64 vals[2];
int rc;
np = of_find_node_by_path("/ibm,opal/firmware/exports");
@@ -858,44 +894,20 @@ static void opal_export_attrs(void)
kobj = kobject_create_and_add("exports", opal_kobj);
if (!kobj) {
pr_warn("kobject_create_and_add() of exports failed\n");
+ of_node_put(np);
return;
}
- for_each_property_of_node(np, prop) {
- if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle"))
- continue;
-
- if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
- continue;
-
- attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-
- if (attr == NULL) {
- pr_warn("Failed kmalloc for bin_attribute!");
- continue;
- }
-
- sysfs_bin_attr_init(attr);
- attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
- attr->attr.mode = 0400;
- attr->read = export_attr_read;
- attr->private = __va(vals[0]);
- attr->size = vals[1];
-
- if (attr->attr.name == NULL) {
- pr_warn("Failed kstrdup for bin_attribute attr.name");
- kfree(attr);
- continue;
- }
+ opal_add_exported_attrs(np, kobj);
- rc = sysfs_create_bin_file(kobj, attr);
- if (rc) {
- pr_warn("Error %d creating OPAL sysfs exports/%s file\n",
- rc, prop->name);
- kfree(attr->attr.name);
- kfree(attr);
- }
- }
+ /*
+ * NB: symbol_map existed before the generic export interface so it
+ * lives under the top level opal_kobj.
+ */
+ rc = opal_add_one_export(opal_kobj, "symbol_map",
+ np->parent, "symbol-map");
+ if (rc)
+ pr_warn("Error %d creating OPAL symbols file\n", rc);
of_node_put(np);
}
@@ -928,7 +940,7 @@ static void __init opal_dump_region_init(void)
"rc = %d\n", rc);
}
-static void opal_pdev_init(const char *compatible)
+static void __init opal_pdev_init(const char *compatible)
{
struct device_node *np;
@@ -943,6 +955,8 @@ static void __init opal_imc_init_dev(void)
np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT);
if (np)
of_platform_device_create(np, NULL, NULL);
+
+ of_node_put(np);
}
static int kopald(void *unused)
@@ -972,7 +986,7 @@ void opal_wake_poller(void)
wake_up_process(kopald_tsk);
}
-static void opal_init_heartbeat(void)
+static void __init opal_init_heartbeat(void)
{
/* Old firwmware, we assume the HVC heartbeat is sufficient */
if (of_property_read_u32(opal_node, "ibm,heartbeat-ms",
@@ -1042,8 +1056,6 @@ static int __init opal_init(void)
/* Create "opal" kobject under /sys/firmware */
rc = opal_sysfs_init();
if (rc == 0) {
- /* Export symbol map to userspace */
- opal_export_symmap();
/* Setup dump region interface */
opal_dump_region_init();
/* Setup error log interface */
@@ -1056,11 +1068,10 @@ static int __init opal_init(void)
opal_sys_param_init();
/* Setup message log sysfs interface. */
opal_msglog_sysfs_init();
+ /* Add all export properties*/
+ opal_export_attrs();
}
- /* Export all properties */
- opal_export_attrs();
-
/* Initialize platform devices: IPMI backend, PRD & flash interface */
opal_pdev_init("ibm,opal-ipmi");
opal_pdev_init("ibm,opal-flash");
diff --git a/arch/powerpc/platforms/powernv/pci-cxl.c b/arch/powerpc/platforms/powernv/pci-cxl.c
index 8c739c94ed28..7e419de71db8 100644
--- a/arch/powerpc/platforms/powernv/pci-cxl.c
+++ b/arch/powerpc/platforms/powernv/pci-cxl.c
@@ -4,6 +4,7 @@
*/
#include <linux/module.h>
+#include <misc/cxl-base.h>
#include <asm/pnv-pci.h>
#include <asm/opal.h>
@@ -150,25 +151,3 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
return 0;
}
EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup);
-
-#if IS_MODULE(CONFIG_CXL)
-static inline int get_cxl_module(void)
-{
- struct module *cxl_module;
-
- mutex_lock(&module_mutex);
-
- cxl_module = find_module("cxl");
- if (cxl_module)
- __module_get(cxl_module);
-
- mutex_unlock(&module_mutex);
-
- if (!cxl_module)
- return -ENODEV;
-
- return 0;
-}
-#else
-static inline int get_cxl_module(void) { return 0; }
-#endif
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 5dc6847d5f4c..e96324502db0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -17,6 +17,34 @@
#include <asm/tce.h>
#include "pci.h"
+unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
+{
+ struct pci_controller *hose = phb->hose;
+ struct device_node *dn = hose->dn;
+ unsigned long mask = 0;
+ int i, rc, count;
+ u32 val;
+
+ count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
+ if (count <= 0) {
+ mask = SZ_4K | SZ_64K;
+ /* Add 16M for POWER8 by default */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ !cpu_has_feature(CPU_FTR_ARCH_300))
+ mask |= SZ_16M | SZ_256M;
+ return mask;
+ }
+
+ for (i = 0; i < count; i++) {
+ rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
+ i, &val);
+ if (rc == 0)
+ mask |= 1ULL << val;
+ }
+
+ return mask;
+}
+
void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift)
@@ -117,8 +145,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
#ifdef CONFIG_IOMMU_API
int pnv_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction,
- bool alloc)
+ unsigned long *hpa, enum dma_data_direction *direction)
{
u64 proto_tce = iommu_direction_to_tce_perm(*direction);
unsigned long newtce = *hpa | proto_tce, oldtce;
@@ -136,9 +163,9 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index,
}
if (!ptce) {
- ptce = pnv_tce(tbl, false, idx, alloc);
+ ptce = pnv_tce(tbl, false, idx, true);
if (!ptce)
- return alloc ? H_HARDWARE : H_TOO_HARD;
+ return -ENOMEM;
}
if (newtce & TCE_PCI_WRITE)
@@ -352,6 +379,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
/* Remove link to a group from table's list of attached groups */
found = false;
+
+ rcu_read_lock();
list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
if (tgl->table_group == table_group) {
list_del_rcu(&tgl->next);
@@ -360,6 +389,8 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
break;
}
}
+ rcu_read_unlock();
+
if (WARN_ON(!found))
return;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index da1068a9c263..23f5b5093ec1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -20,10 +20,12 @@
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <linux/sizes.h>
+#include <linux/debugfs.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
#include <asm/sections.h>
#include <asm/io.h>
-#include <asm/prom.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/msi_bitmap.h>
@@ -32,10 +34,10 @@
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/xics.h>
-#include <asm/debugfs.h>
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>
+#include <asm/xive.h>
#include <misc/cxl-base.h>
@@ -43,14 +45,11 @@
#include "pci.h"
#include "../../../../drivers/pci/pci.h"
-#define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */
-#define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */
-#define PNV_IODA1_DMA32_SEGSIZE 0x10000000
-
-static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
- "NPU_OCAPI" };
+/* This array is indexed with enum pnv_phb_type */
+static const char * const pnv_phb_names[] = { "IODA2", "NPU_OCAPI" };
static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+static void pnv_pci_configure_bus(struct pci_bus *bus);
void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
@@ -65,7 +64,7 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
vaf.va = &args;
if (pe->flags & PNV_IODA_PE_DEV)
- strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
+ strscpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
sprintf(pfix, "%04x:%02x ",
pci_domain_nr(pe->pbus), pe->pbus->number);
@@ -114,32 +113,13 @@ static int __init pci_reset_phbs_setup(char *str)
early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
-static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
-{
- /*
- * WARNING: We cannot rely on the resource flags. The Linux PCI
- * allocation code sometimes decides to put a 64-bit prefetchable
- * BAR in the 32-bit window, so we have to compare the addresses.
- *
- * For simplicity we only test resource start.
- */
- return (r->start >= phb->ioda.m64_base &&
- r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
-}
-
-static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
-{
- unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
-
- return (resource_flags & flags) == flags;
-}
-
static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
{
s64 rc;
phb->ioda.pe_array[pe_no].phb = phb;
phb->ioda.pe_array[pe_no].pe_number = pe_no;
+ phb->ioda.pe_array[pe_no].dma_setup_done = false;
/*
* Clear the PE frozen state as it might be put into frozen state
@@ -163,35 +143,58 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
return;
}
+ mutex_lock(&phb->ioda.pe_alloc_mutex);
if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
pr_debug("%s: PE %x was reserved on PHB#%x\n",
__func__, pe_no, phb->hose->global_number);
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
pnv_ioda_init_pe(phb, pe_no);
}
-static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
{
- long pe;
+ struct pnv_ioda_pe *ret = NULL;
+ int run = 0, pe, i;
+
+ mutex_lock(&phb->ioda.pe_alloc_mutex);
+ /* scan backwards for a run of @count cleared bits */
for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
- if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
- return pnv_ioda_init_pe(phb, pe);
+ if (test_bit(pe, phb->ioda.pe_alloc)) {
+ run = 0;
+ continue;
+ }
+
+ run++;
+ if (run == count)
+ break;
+ }
+ if (run != count)
+ goto out;
+
+ for (i = pe; i < pe + count; i++) {
+ set_bit(i, phb->ioda.pe_alloc);
+ pnv_ioda_init_pe(phb, i);
}
+ ret = &phb->ioda.pe_array[pe];
- return NULL;
+out:
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
+ return ret;
}
-static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
+void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
{
struct pnv_phb *phb = pe->phb;
unsigned int pe_num = pe->pe_number;
WARN_ON(pe->pdev);
- WARN_ON(pe->npucomp); /* NPUs are not supposed to be freed */
- kfree(pe->npucomp);
memset(pe, 0, sizeof(struct pnv_ioda_pe));
+
+ mutex_lock(&phb->ioda.pe_alloc_mutex);
clear_bit(pe_num, phb->ioda.pe_alloc);
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
}
/* The default M64 BAR is shared by all PEs */
@@ -251,8 +254,7 @@ fail:
static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
unsigned long *pe_bitmap)
{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
struct resource *r;
resource_size_t base, sgsz, start, end;
int segno, i;
@@ -264,8 +266,8 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
if (!r->parent || !pnv_pci_is_m64(phb, r))
continue;
- start = _ALIGN_DOWN(r->start - base, sgsz);
- end = _ALIGN_UP(r->end - base, sgsz);
+ start = ALIGN_DOWN(r->start - base, sgsz);
+ end = ALIGN(r->end - base, sgsz);
for (segno = start / sgsz; segno < end / sgsz; segno++) {
if (pe_bitmap)
set_bit(segno, pe_bitmap);
@@ -275,64 +277,6 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
}
}
-static int pnv_ioda1_init_m64(struct pnv_phb *phb)
-{
- struct resource *r;
- int index;
-
- /*
- * There are 16 M64 BARs, each of which has 8 segments. So
- * there are as many M64 segments as the maximum number of
- * PEs, which is 128.
- */
- for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
- unsigned long base, segsz = phb->ioda.m64_segsize;
- int64_t rc;
-
- base = phb->ioda.m64_base +
- index * PNV_IODA1_M64_SEGS * segsz;
- rc = opal_pci_set_phb_mem_window(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, index, base, 0,
- PNV_IODA1_M64_SEGS * segsz);
- if (rc != OPAL_SUCCESS) {
- pr_warn(" Error %lld setting M64 PHB#%x-BAR#%d\n",
- rc, phb->hose->global_number, index);
- goto fail;
- }
-
- rc = opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, index,
- OPAL_ENABLE_M64_SPLIT);
- if (rc != OPAL_SUCCESS) {
- pr_warn(" Error %lld enabling M64 PHB#%x-BAR#%d\n",
- rc, phb->hose->global_number, index);
- goto fail;
- }
- }
-
- /*
- * Exclude the segments for reserved and root bus PE, which
- * are first or last two PEs.
- */
- r = &phb->hose->mem_resources[1];
- if (phb->ioda.reserved_pe_idx == 0)
- r->start += (2 * phb->ioda.m64_segsize);
- else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
- r->end -= (2 * phb->ioda.m64_segsize);
- else
- WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
- phb->ioda.reserved_pe_idx, phb->hose->global_number);
-
- return 0;
-
-fail:
- for ( ; index >= 0; index--)
- opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
-
- return -EIO;
-}
-
static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
unsigned long *pe_bitmap,
bool all)
@@ -350,8 +294,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
{
- struct pci_controller *hose = pci_bus_to_host(bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
struct pnv_ioda_pe *master_pe, *pe;
unsigned long size, *pe_alloc;
int i;
@@ -361,7 +304,7 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
return NULL;
/* Allocate bitmap */
- size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
+ size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
pe_alloc = kzalloc(size, GFP_KERNEL);
if (!pe_alloc) {
pr_warn("%s: Out of memory !\n",
@@ -402,26 +345,6 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
pe->master = master_pe;
list_add_tail(&pe->list, &master_pe->slaves);
}
-
- /*
- * P7IOC supports M64DT, which helps mapping M64 segment
- * to one particular PE#. However, PHB3 has fixed mapping
- * between M64 segment and PE#. In order to have same logic
- * for P7IOC and PHB3, we enforce fixed mapping between M64
- * segment and PE# on P7IOC.
- */
- if (phb->type == PNV_PHB_IODA1) {
- int64_t rc;
-
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- pe->pe_number, OPAL_M64_WINDOW_TYPE,
- pe->pe_number / PNV_IODA1_M64_SEGS,
- pe->pe_number % PNV_IODA1_M64_SEGS);
- if (rc != OPAL_SUCCESS)
- pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
- __func__, rc, phb->hose->global_number,
- pe->pe_number);
- }
}
kfree(pe_alloc);
@@ -437,7 +360,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
const __be32 *r;
u64 pci_addr;
- if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
+ if (phb->type != PNV_PHB_IODA2) {
pr_info(" Not support M64 window\n");
return;
}
@@ -512,10 +435,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
* Setup init functions for M64 based on IODA version, IODA3 uses
* the IODA2 code.
*/
- if (phb->type == PNV_PHB_IODA1)
- phb->init_m64 = pnv_ioda1_init_m64;
- else
- phb->init_m64 = pnv_ioda2_init_m64;
+ phb->init_m64 = pnv_ioda2_init_m64;
}
static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
@@ -660,10 +580,19 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
return state;
}
+struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
+{
+ int pe_number = phb->ioda.pe_rmap[bdfn];
+
+ if (pe_number == IODA_INVALID_PE)
+ return NULL;
+
+ return &phb->ioda.pe_array[pe_number];
+}
+
struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
struct pci_dn *pdn = pci_get_pdn(dev);
if (!pdn)
@@ -777,7 +706,35 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
return 0;
}
-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe,
+ struct pci_dev *parent)
+{
+ int64_t rc;
+
+ while (parent) {
+ struct pci_dn *pdn = pci_get_pdn(parent);
+
+ if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+ rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+ pe->pe_number,
+ OPAL_REMOVE_PE_FROM_DOMAIN);
+ /* XXX What to do in case of error ? */
+ }
+ parent = parent->bus->self;
+ }
+
+ opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+ /* Disassociate PE in PELT */
+ rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+ pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+ if (rc)
+ pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
+}
+
+int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
struct pci_dev *parent;
uint8_t bcomp, dcomp, fcomp;
@@ -792,7 +749,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
parent = pe->pbus->self;
if (pe->flags & PNV_IODA_PE_BUS_ALL)
- count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+ count = resource_size(&pe->pbus->busn_res);
else
count = 1;
@@ -827,25 +784,13 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
for (rid = pe->rid; rid < rid_end; rid++)
phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
- /* Release from all parents PELT-V */
- while (parent) {
- struct pci_dn *pdn = pci_get_pdn(parent);
- if (pdn && pdn->pe_number != IODA_INVALID_PE) {
- rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
- pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
- /* XXX What to do in case of error ? */
- }
- parent = parent->bus->self;
- }
-
- opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
- OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ /*
+ * Release from all parents PELT-V. NPUs don't have a PELTV
+ * table
+ */
+ if (phb->type != PNV_PHB_NPU_OCAPI)
+ pnv_ioda_unset_peltv(phb, pe, parent);
- /* Disassociate PE in PELT */
- rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
- pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
- if (rc)
- pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
if (rc)
@@ -860,9 +805,8 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
return 0;
}
-static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
- struct pci_dev *parent;
uint8_t bcomp, dcomp, fcomp;
long rc, rid_end, rid;
@@ -872,9 +816,8 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
- parent = pe->pbus->self;
if (pe->flags & PNV_IODA_PE_BUS_ALL)
- count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+ count = resource_size(&pe->pbus->busn_res);
else
count = 1;
@@ -893,12 +836,6 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
}
rid_end = pe->rid + (count << 8);
} else {
-#ifdef CONFIG_PCI_IOV
- if (pe->flags & PNV_IODA_PE_VF)
- parent = pe->parent_dev;
- else
-#endif /* CONFIG_PCI_IOV */
- parent = pe->pdev->bus->self;
bcomp = OpalPciBusAll;
dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
@@ -922,128 +859,21 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
* Configure PELTV. NPUs don't have a PELTV table so skip
* configuration on them.
*/
- if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+ if (phb->type != PNV_PHB_NPU_OCAPI)
pnv_ioda_set_peltv(phb, pe, true);
/* Setup reverse map */
for (rid = pe->rid; rid < rid_end; rid++)
phb->ioda.pe_rmap[rid] = pe->pe_number;
- /* Setup one MVTs on IODA1 */
- if (phb->type != PNV_PHB_IODA1) {
- pe->mve_number = 0;
- goto out;
- }
+ pe->mve_number = 0;
- pe->mve_number = pe->pe_number;
- rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
- if (rc != OPAL_SUCCESS) {
- pe_err(pe, "OPAL error %ld setting up MVE %x\n",
- rc, pe->mve_number);
- pe->mve_number = -1;
- } else {
- rc = opal_pci_set_mve_enable(phb->opal_id,
- pe->mve_number, OPAL_ENABLE_MVE);
- if (rc) {
- pe_err(pe, "OPAL error %ld enabling MVE %x\n",
- rc, pe->mve_number);
- pe->mve_number = -1;
- }
- }
-
-out:
return 0;
}
-#ifdef CONFIG_PCI_IOV
-static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
-{
- struct pci_dn *pdn = pci_get_pdn(dev);
- int i;
- struct resource *res, res2;
- resource_size_t size;
- u16 num_vfs;
-
- if (!dev->is_physfn)
- return -EINVAL;
-
- /*
- * "offset" is in VFs. The M64 windows are sized so that when they
- * are segmented, each segment is the same size as the IOV BAR.
- * Each segment is in a separate PE, and the high order bits of the
- * address are the PE number. Therefore, each VF's BAR is in a
- * separate PE, and changing the IOV BAR start address changes the
- * range of PEs the VFs are in.
- */
- num_vfs = pdn->num_vfs;
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &dev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || !res->parent)
- continue;
-
- /*
- * The actual IOV BAR range is determined by the start address
- * and the actual size for num_vfs VFs BAR. This check is to
- * make sure that after shifting, the range will not overlap
- * with another device.
- */
- size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
- res2.flags = res->flags;
- res2.start = res->start + (size * offset);
- res2.end = res2.start + (size * num_vfs) - 1;
-
- if (res2.end > res->end) {
- dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
- i, &res2, res, num_vfs, offset);
- return -EBUSY;
- }
- }
-
- /*
- * Since M64 BAR shares segments among all possible 256 PEs,
- * we have to shift the beginning of PF IOV BAR to make it start from
- * the segment which belongs to the PE number assigned to the first VF.
- * This creates a "hole" in the /proc/iomem which could be used for
- * allocating other resources so we reserve this area below and
- * release when IOV is released.
- */
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &dev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || !res->parent)
- continue;
-
- size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
- res2 = *res;
- res->start += size * offset;
-
- dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
- i, &res2, res, (offset > 0) ? "En" : "Dis",
- num_vfs, offset);
-
- if (offset < 0) {
- devm_release_resource(&dev->dev, &pdn->holes[i]);
- memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
- }
-
- pci_update_resource(dev, i + PCI_IOV_RESOURCES);
-
- if (offset > 0) {
- pdn->holes[i].start = res2.start;
- pdn->holes[i].end = res2.start + size * offset - 1;
- pdn->holes[i].flags = IORESOURCE_BUS;
- pdn->holes[i].name = "pnv_iov_reserved";
- devm_request_resource(&dev->dev, res->parent,
- &pdn->holes[i]);
- }
- }
- return 0;
-}
-#endif /* CONFIG_PCI_IOV */
-
static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
struct pci_dn *pdn = pci_get_pdn(dev);
struct pnv_ioda_pe *pe;
@@ -1055,27 +885,26 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
if (pdn->pe_number != IODA_INVALID_PE)
return NULL;
- pe = pnv_ioda_alloc_pe(phb);
+ pe = pnv_ioda_alloc_pe(phb, 1);
if (!pe) {
pr_warn("%s: Not enough PE# available, disabling device\n",
pci_name(dev));
return NULL;
}
- /* NOTE: We get only one ref to the pci_dev for the pdn, not for the
- * pointer in the PE data structure, both should be destroyed at the
- * same time. However, this needs to be looked at more closely again
- * once we actually start removing things (Hotplug, SR-IOV, ...)
+ /* NOTE: We don't get a reference for the pointer in the PE
+ * data structure, both the device and PE structures should be
+ * destroyed at the same time.
*
* At some point we want to remove the PDN completely anyways
*/
- pci_dev_get(dev);
pdn->pe_number = pe->pe_number;
pe->flags = PNV_IODA_PE_DEV;
pe->pdev = dev;
pe->pbus = NULL;
pe->mve_number = -1;
pe->rid = dev->bus->number << 8 | pdn->devfn;
+ pe->device_count++;
pe_info(pe, "Associated device to PE\n");
@@ -1084,44 +913,16 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
pnv_ioda_free_pe(pe);
pdn->pe_number = IODA_INVALID_PE;
pe->pdev = NULL;
- pci_dev_put(dev);
return NULL;
}
/* Put PE to the list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
list_add_tail(&pe->list, &phb->ioda.pe_list);
-
+ mutex_unlock(&phb->ioda.pe_list_mutex);
return pe;
}
-static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
-{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- struct pci_dn *pdn = pci_get_pdn(dev);
-
- if (pdn == NULL) {
- pr_warn("%s: No device node associated with device !\n",
- pci_name(dev));
- continue;
- }
-
- /*
- * In partial hotplug case, the PCI device might be still
- * associated with the PE and needn't attach it to the PE
- * again.
- */
- if (pdn->pe_number != IODA_INVALID_PE)
- continue;
-
- pe->device_count++;
- pdn->pe_number = pe->pe_number;
- if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
- pnv_ioda_setup_same_PE(dev->subordinate, pe);
- }
-}
-
/*
* There're 2 types of PCI bus sensitive PEs: One that is compromised of
* single PCI bus. Another one that contains the primary PCI bus and its
@@ -1130,8 +931,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
*/
static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
{
- struct pci_controller *hose = pci_bus_to_host(bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
struct pnv_ioda_pe *pe = NULL;
unsigned int pe_num;
@@ -1140,15 +940,13 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
* We should reuse it instead of allocating a new one.
*/
pe_num = phb->ioda.pe_rmap[bus->number << 8];
- if (pe_num != IODA_INVALID_PE) {
+ if (WARN_ON(pe_num != IODA_INVALID_PE)) {
pe = &phb->ioda.pe_array[pe_num];
- pnv_ioda_setup_same_PE(bus, pe);
return NULL;
}
/* PE number for root bus should have been reserved */
- if (pci_is_root_bus(bus) &&
- phb->ioda.root_pe_idx != IODA_INVALID_PE)
+ if (pci_is_root_bus(bus))
pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
/* Check if PE is determined by M64 */
@@ -1157,7 +955,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
/* The PE number isn't pinned by M64 */
if (!pe)
- pe = pnv_ioda_alloc_pe(phb);
+ pe = pnv_ioda_alloc_pe(phb, 1);
if (!pe) {
pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
@@ -1186,575 +984,66 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
return NULL;
}
- /* Associate it with all child devices */
- pnv_ioda_setup_same_PE(bus, pe);
-
/* Put PE to the list */
list_add_tail(&pe->list, &phb->ioda.pe_list);
return pe;
}
-static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
-{
- int pe_num, found_pe = false, rc;
- long rid;
- struct pnv_ioda_pe *pe;
- struct pci_dev *gpu_pdev;
- struct pci_dn *npu_pdn;
- struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
- struct pnv_phb *phb = hose->private_data;
-
- /*
- * Due to a hardware errata PE#0 on the NPU is reserved for
- * error handling. This means we only have three PEs remaining
- * which need to be assigned to four links, implying some
- * links must share PEs.
- *
- * To achieve this we assign PEs such that NPUs linking the
- * same GPU get assigned the same PE.
- */
- gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
- for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
- pe = &phb->ioda.pe_array[pe_num];
- if (!pe->pdev)
- continue;
-
- if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
- /*
- * This device has the same peer GPU so should
- * be assigned the same PE as the existing
- * peer NPU.
- */
- dev_info(&npu_pdev->dev,
- "Associating to existing PE %x\n", pe_num);
- pci_dev_get(npu_pdev);
- npu_pdn = pci_get_pdn(npu_pdev);
- rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
- npu_pdn->pe_number = pe_num;
- phb->ioda.pe_rmap[rid] = pe->pe_number;
-
- /* Map the PE to this link */
- rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
- OpalPciBusAll,
- OPAL_COMPARE_RID_DEVICE_NUMBER,
- OPAL_COMPARE_RID_FUNCTION_NUMBER,
- OPAL_MAP_PE);
- WARN_ON(rc != OPAL_SUCCESS);
- found_pe = true;
- break;
- }
- }
-
- if (!found_pe)
- /*
- * Could not find an existing PE so allocate a new
- * one.
- */
- return pnv_ioda_setup_dev_PE(npu_pdev);
- else
- return pe;
-}
-
-static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
-{
- struct pci_dev *pdev;
-
- list_for_each_entry(pdev, &bus->devices, bus_list)
- pnv_ioda_setup_npu_PE(pdev);
-}
-
-static void pnv_pci_ioda_setup_PEs(void)
+static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
{
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pci_bus *bus;
- struct pci_dev *pdev;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+ struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
- list_for_each_entry(hose, &hose_list, list_node) {
- phb = hose->private_data;
- if (phb->type == PNV_PHB_NPU_NVLINK) {
- /* PE#0 is needed for error reporting */
- pnv_ioda_reserve_pe(phb, 0);
- pnv_ioda_setup_npu_PEs(hose->bus);
- if (phb->model == PNV_PHB_MODEL_NPU2)
- WARN_ON_ONCE(pnv_npu2_init(hose));
- }
- if (phb->type == PNV_PHB_NPU_OCAPI) {
- bus = hose->bus;
- list_for_each_entry(pdev, &bus->devices, bus_list)
- pnv_ioda_setup_dev_PE(pdev);
- }
- }
- list_for_each_entry(hose, &hose_list, list_node) {
- phb = hose->private_data;
- if (phb->type != PNV_PHB_IODA2)
- continue;
-
- list_for_each_entry(pe, &phb->ioda.pe_list, list)
- pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
- }
-}
-
-#ifdef CONFIG_PCI_IOV
-static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pci_dn *pdn;
- int i, j;
- int m64_bars;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
-
- if (pdn->m64_single_mode)
- m64_bars = num_vfs;
- else
- m64_bars = 1;
-
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
- for (j = 0; j < m64_bars; j++) {
- if (pdn->m64_map[j][i] == IODA_INVALID_M64)
- continue;
- opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
- clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
- pdn->m64_map[j][i] = IODA_INVALID_M64;
- }
-
- kfree(pdn->m64_map);
- return 0;
-}
-
-static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pci_dn *pdn;
- unsigned int win;
- struct resource *res;
- int i, j;
- int64_t rc;
- int total_vfs;
- resource_size_t size, start;
- int pe_num;
- int m64_bars;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
- total_vfs = pci_sriov_get_totalvfs(pdev);
-
- if (pdn->m64_single_mode)
- m64_bars = num_vfs;
- else
- m64_bars = 1;
-
- pdn->m64_map = kmalloc_array(m64_bars,
- sizeof(*pdn->m64_map),
- GFP_KERNEL);
- if (!pdn->m64_map)
- return -ENOMEM;
- /* Initialize the m64_map to IODA_INVALID_M64 */
- for (i = 0; i < m64_bars ; i++)
- for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
- pdn->m64_map[i][j] = IODA_INVALID_M64;
-
-
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || !res->parent)
- continue;
-
- for (j = 0; j < m64_bars; j++) {
- do {
- win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
- phb->ioda.m64_bar_idx + 1, 0);
-
- if (win >= phb->ioda.m64_bar_idx + 1)
- goto m64_failed;
- } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
-
- pdn->m64_map[j][i] = win;
-
- if (pdn->m64_single_mode) {
- size = pci_iov_resource_size(pdev,
- PCI_IOV_RESOURCES + i);
- start = res->start + size * j;
- } else {
- size = resource_size(res);
- start = res->start;
- }
-
- /* Map the M64 here */
- if (pdn->m64_single_mode) {
- pe_num = pdn->pe_num_map[j];
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- pe_num, OPAL_M64_WINDOW_TYPE,
- pdn->m64_map[j][i], 0);
- }
-
- rc = opal_pci_set_phb_mem_window(phb->opal_id,
- OPAL_M64_WINDOW_TYPE,
- pdn->m64_map[j][i],
- start,
- 0, /* unused */
- size);
-
-
- if (rc != OPAL_SUCCESS) {
- dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
- win, rc);
- goto m64_failed;
- }
-
- if (pdn->m64_single_mode)
- rc = opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
- else
- rc = opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
-
- if (rc != OPAL_SUCCESS) {
- dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
- win, rc);
- goto m64_failed;
- }
- }
- }
- return 0;
-
-m64_failed:
- pnv_pci_vf_release_m64(pdev, num_vfs);
- return -EBUSY;
-}
-
-static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
- int num);
-
-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
-{
- struct iommu_table *tbl;
- int64_t rc;
-
- tbl = pe->table_group.tables[0];
- rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
- if (rc)
- pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
-
- pnv_pci_ioda2_set_bypass(pe, false);
- if (pe->table_group.group) {
- iommu_group_put(pe->table_group.group);
- BUG_ON(pe->table_group.group);
- }
- iommu_tce_table_put(tbl);
-}
-
-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe, *pe_n;
- struct pci_dn *pdn;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
-
- if (!pdev->is_physfn)
- return;
-
- list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
- if (pe->parent_dev != pdev)
- continue;
-
- pnv_pci_ioda2_release_dma_pe(pdev, pe);
-
- /* Remove from list */
- mutex_lock(&phb->ioda.pe_list_mutex);
- list_del(&pe->list);
- mutex_unlock(&phb->ioda.pe_list_mutex);
-
- pnv_ioda_deconfigure_pe(phb, pe);
-
- pnv_ioda_free_pe(pe);
- }
-}
-
-void pnv_pci_sriov_disable(struct pci_dev *pdev)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
- struct pci_dn *pdn;
- u16 num_vfs, i;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
- num_vfs = pdn->num_vfs;
-
- /* Release VF PEs */
- pnv_ioda_release_vf_PE(pdev);
-
- if (phb->type == PNV_PHB_IODA2) {
- if (!pdn->m64_single_mode)
- pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
-
- /* Release M64 windows */
- pnv_pci_vf_release_m64(pdev, num_vfs);
-
- /* Release PE numbers */
- if (pdn->m64_single_mode) {
- for (i = 0; i < num_vfs; i++) {
- if (pdn->pe_num_map[i] == IODA_INVALID_PE)
- continue;
-
- pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
- pnv_ioda_free_pe(pe);
- }
- } else
- bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
- /* Releasing pe_num_map */
- kfree(pdn->pe_num_map);
- }
-}
-
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
- struct pnv_ioda_pe *pe);
-#ifdef CONFIG_IOMMU_API
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
- struct iommu_table_group *table_group, struct pci_bus *bus);
-
-#endif
-static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
- int pe_num;
- u16 vf_index;
- struct pci_dn *pdn;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
-
- if (!pdev->is_physfn)
- return;
-
- /* Reserve PE for each VF */
- for (vf_index = 0; vf_index < num_vfs; vf_index++) {
- if (pdn->m64_single_mode)
- pe_num = pdn->pe_num_map[vf_index];
- else
- pe_num = *pdn->pe_num_map + vf_index;
-
- pe = &phb->ioda.pe_array[pe_num];
- pe->pe_number = pe_num;
- pe->phb = phb;
- pe->flags = PNV_IODA_PE_VF;
- pe->pbus = NULL;
- pe->parent_dev = pdev;
- pe->mve_number = -1;
- pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
- pci_iov_virtfn_devfn(pdev, vf_index);
-
- pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
- hose->global_number, pdev->bus->number,
- PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
- PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
-
- if (pnv_ioda_configure_pe(phb, pe)) {
- /* XXX What do we do here ? */
- pnv_ioda_free_pe(pe);
- pe->pdev = NULL;
- continue;
- }
-
- /* Put PE to the list */
- mutex_lock(&phb->ioda.pe_list_mutex);
- list_add_tail(&pe->list, &phb->ioda.pe_list);
- mutex_unlock(&phb->ioda.pe_list_mutex);
-
- pnv_pci_ioda2_setup_dma_pe(phb, pe);
-#ifdef CONFIG_IOMMU_API
- iommu_register_group(&pe->table_group,
- pe->phb->hose->global_number, pe->pe_number);
- pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
-#endif
- }
-}
-
-int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
- struct pci_dn *pdn;
- int ret;
- u16 i;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
-
- if (phb->type == PNV_PHB_IODA2) {
- if (!pdn->vfs_expanded) {
- dev_info(&pdev->dev, "don't support this SRIOV device"
- " with non 64bit-prefetchable IOV BAR\n");
- return -ENOSPC;
- }
-
- /*
- * When M64 BARs functions in Single PE mode, the number of VFs
- * could be enabled must be less than the number of M64 BARs.
- */
- if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
- dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
- return -EBUSY;
- }
-
- /* Allocating pe_num_map */
- if (pdn->m64_single_mode)
- pdn->pe_num_map = kmalloc_array(num_vfs,
- sizeof(*pdn->pe_num_map),
- GFP_KERNEL);
- else
- pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
-
- if (!pdn->pe_num_map)
- return -ENOMEM;
-
- if (pdn->m64_single_mode)
- for (i = 0; i < num_vfs; i++)
- pdn->pe_num_map[i] = IODA_INVALID_PE;
-
- /* Calculate available PE for required VFs */
- if (pdn->m64_single_mode) {
- for (i = 0; i < num_vfs; i++) {
- pe = pnv_ioda_alloc_pe(phb);
- if (!pe) {
- ret = -EBUSY;
- goto m64_failed;
- }
+ /* Check if the BDFN for this device is associated with a PE yet */
+ pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev));
+ if (!pe) {
+ /* VF PEs should be pre-configured in pnv_pci_sriov_enable() */
+ if (WARN_ON(pdev->is_virtfn))
+ return;
- pdn->pe_num_map[i] = pe->pe_number;
- }
- } else {
- mutex_lock(&phb->ioda.pe_alloc_mutex);
- *pdn->pe_num_map = bitmap_find_next_zero_area(
- phb->ioda.pe_alloc, phb->ioda.total_pe_num,
- 0, num_vfs, 0);
- if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
- mutex_unlock(&phb->ioda.pe_alloc_mutex);
- dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
- kfree(pdn->pe_num_map);
- return -EBUSY;
- }
- bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
- mutex_unlock(&phb->ioda.pe_alloc_mutex);
- }
- pdn->num_vfs = num_vfs;
+ pnv_pci_configure_bus(pdev->bus);
+ pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev));
+ pci_info(pdev, "Configured PE#%x\n", pe ? pe->pe_number : 0xfffff);
- /* Assign M64 window accordingly */
- ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
- if (ret) {
- dev_info(&pdev->dev, "Not enough M64 window resources\n");
- goto m64_failed;
- }
/*
- * When using one M64 BAR to map one IOV BAR, we need to shift
- * the IOV BAR according to the PE# allocated to the VFs.
- * Otherwise, the PE# for the VF will conflict with others.
+ * If we can't setup the IODA PE something has gone horribly
+ * wrong and we can't enable DMA for the device.
*/
- if (!pdn->m64_single_mode) {
- ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
- if (ret)
- goto m64_failed;
- }
+ if (WARN_ON(!pe))
+ return;
+ } else {
+ pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
}
- /* Setup VF PEs */
- pnv_ioda_setup_vf_PE(pdev, num_vfs);
-
- return 0;
-
-m64_failed:
- if (pdn->m64_single_mode) {
- for (i = 0; i < num_vfs; i++) {
- if (pdn->pe_num_map[i] == IODA_INVALID_PE)
- continue;
-
- pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
- pnv_ioda_free_pe(pe);
- }
- } else
- bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
-
- /* Releasing pe_num_map */
- kfree(pdn->pe_num_map);
-
- return ret;
-}
-
-int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
-{
- pnv_pci_sriov_disable(pdev);
-
- /* Release PCI data */
- remove_dev_pci_data(pdev);
- return 0;
-}
-
-int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
-{
- /* Allocate PCI data */
- add_dev_pci_data(pdev);
-
- return pnv_pci_sriov_enable(pdev, num_vfs);
-}
-#endif /* CONFIG_PCI_IOV */
-
-static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
-{
- struct pci_dn *pdn = pci_get_pdn(pdev);
- struct pnv_ioda_pe *pe;
-
/*
- * The function can be called while the PE#
- * hasn't been assigned. Do nothing for the
- * case.
+ * We assume that bridges *probably* don't need to do any DMA so we can
+ * skip allocating a TCE table, etc unless we get a non-bridge device.
*/
- if (!pdn || pdn->pe_number == IODA_INVALID_PE)
- return;
+ if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
+ switch (phb->type) {
+ case PNV_PHB_IODA2:
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
+ break;
+ default:
+ pr_warn("%s: No DMA for PHB#%x (type %d)\n",
+ __func__, phb->hose->global_number, phb->type);
+ }
+ }
+
+ if (pdn)
+ pdn->pe_number = pe->pe_number;
+ pe->device_count++;
- pe = &phb->ioda.pe_array[pdn->pe_number];
WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
- /*
- * Note: iommu_add_device() will fail here as
- * for physical PE: the device is already added by now;
- * for virtual PE: sysfs entries are not ready yet and
- * tce_iommu_bus_notifier will add the device to a group later.
- */
+
+ /* PEs with a DMA weight of zero won't have a group */
+ if (pe->table_group.group)
+ iommu_add_device(&pe->table_group, &pdev->dev);
}
/*
@@ -1829,8 +1118,7 @@ err:
static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
u64 dma_mask)
{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
@@ -1867,137 +1155,39 @@ static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
return false;
}
-static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb)
{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
- dev->dev.archdata.dma_offset = pe->tce_bypass_base;
-
- if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
- pnv_ioda_setup_bus_dma(pe, dev->subordinate);
- }
-}
-
-static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
- bool real_mode)
-{
- return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
- (phb->regs + 0x210);
-}
-
-static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
- unsigned long index, unsigned long npages, bool rm)
-{
- struct iommu_table_group_link *tgl = list_first_entry_or_null(
- &tbl->it_group_list, struct iommu_table_group_link,
- next);
- struct pnv_ioda_pe *pe = container_of(tgl->table_group,
- struct pnv_ioda_pe, table_group);
- __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
- unsigned long start, end, inc;
-
- start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
- end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
- npages - 1);
-
- /* p7ioc-style invalidation, 2 TCEs per write */
- start |= (1ull << 63);
- end |= (1ull << 63);
- inc = 16;
- end |= inc - 1; /* round up end to be different than start */
-
- mb(); /* Ensure above stores are visible */
- while (start <= end) {
- if (rm)
- __raw_rm_writeq_be(start, invalidate);
- else
- __raw_writeq_be(start, invalidate);
-
- start += inc;
- }
-
- /*
- * The iommu layer will do another mb() for us on build()
- * and we don't care on free()
- */
-}
-
-static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
- long npages, unsigned long uaddr,
- enum dma_data_direction direction,
- unsigned long attrs)
-{
- int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
- attrs);
-
- if (!ret)
- pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
-
- return ret;
+ return phb->regs + 0x210;
}
#ifdef CONFIG_IOMMU_API
/* Common for IODA1 and IODA2 */
static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction,
- bool realmode)
+ unsigned long *hpa, enum dma_data_direction *direction)
{
- return pnv_tce_xchg(tbl, index, hpa, direction, !realmode);
+ return pnv_tce_xchg(tbl, index, hpa, direction);
}
#endif
-static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
- long npages)
-{
- pnv_tce_free(tbl, index, npages);
-
- pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
-}
-
-static struct iommu_table_ops pnv_ioda1_iommu_ops = {
- .set = pnv_ioda1_tce_build,
-#ifdef CONFIG_IOMMU_API
- .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
- .tce_kill = pnv_pci_p7ioc_tce_invalidate,
- .useraddrptr = pnv_tce_useraddrptr,
-#endif
- .clear = pnv_ioda1_tce_free,
- .get = pnv_tce_get,
-};
-
#define PHB3_TCE_KILL_INVAL_ALL PPC_BIT(0)
#define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1)
#define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2)
-static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
-{
- __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
- const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
-
- mb(); /* Ensure previous TCE table stores are visible */
- if (rm)
- __raw_rm_writeq_be(val, invalidate);
- else
- __raw_writeq_be(val, invalidate);
-}
-
static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
{
/* 01xb - invalidate TCEs that match the specified PE# */
- __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
+ __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
mb(); /* Ensure above stores are visible */
__raw_writeq_be(val, invalidate);
}
-static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
+static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe,
unsigned shift, unsigned long index,
unsigned long npages)
{
- __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
+ __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
unsigned long start, end, inc;
/* We'll invalidate DMA address in PE scope */
@@ -2012,10 +1202,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
mb();
while (start <= end) {
- if (rm)
- __raw_rm_writeq_be(start, invalidate);
- else
- __raw_writeq_be(start, invalidate);
+ __raw_writeq_be(start, invalidate);
start += inc;
}
}
@@ -2032,7 +1219,7 @@ static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
}
static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
- unsigned long index, unsigned long npages, bool rm)
+ unsigned long index, unsigned long npages)
{
struct iommu_table_group_link *tgl;
@@ -2042,22 +1229,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
struct pnv_phb *phb = pe->phb;
unsigned int shift = tbl->it_page_shift;
- /*
- * NVLink1 can use the TCE kill register directly as
- * it's the same as PHB3. NVLink2 is different and
- * should go via the OPAL call.
- */
- if (phb->model == PNV_PHB_MODEL_NPU) {
- /*
- * The NVLink hardware does not support TCE kill
- * per TCE entry so we have to invalidate
- * the entire cache for it.
- */
- pnv_pci_phb3_tce_invalidate_entire(phb, rm);
- continue;
- }
if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
- pnv_pci_phb3_tce_invalidate(pe, rm, shift,
+ pnv_pci_phb3_tce_invalidate(pe, shift,
index, npages);
else
opal_pci_tce_kill(phb->opal_id,
@@ -2067,14 +1240,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
}
}
-void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
-{
- if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
- pnv_pci_phb3_tce_invalidate_entire(phb, rm);
- else
- opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
-}
-
static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
long npages, unsigned long uaddr,
enum dma_data_direction direction,
@@ -2084,7 +1249,7 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
attrs);
if (!ret)
- pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+ pnv_pci_ioda2_tce_invalidate(tbl, index, npages);
return ret;
}
@@ -2094,7 +1259,7 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
{
pnv_tce_free(tbl, index, npages);
- pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+ pnv_pci_ioda2_tce_invalidate(tbl, index, npages);
}
static struct iommu_table_ops pnv_ioda2_iommu_ops = {
@@ -2109,178 +1274,6 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.free = pnv_pci_ioda2_table_free_pages,
};
-static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
-{
- unsigned int *weight = (unsigned int *)data;
-
- /* This is quite simplistic. The "base" weight of a device
- * is 10. 0 means no DMA is to be accounted for it.
- */
- if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
- return 0;
-
- if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
- dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
- dev->class == PCI_CLASS_SERIAL_USB_EHCI)
- *weight += 3;
- else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
- *weight += 15;
- else
- *weight += 10;
-
- return 0;
-}
-
-static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
-{
- unsigned int weight = 0;
-
- /* SRIOV VF has same DMA32 weight as its PF */
-#ifdef CONFIG_PCI_IOV
- if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
- pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
- return weight;
- }
-#endif
-
- if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
- pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
- } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
- struct pci_dev *pdev;
-
- list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
- pnv_pci_ioda_dev_dma_weight(pdev, &weight);
- } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
- pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
- }
-
- return weight;
-}
-
-static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
- struct pnv_ioda_pe *pe)
-{
-
- struct page *tce_mem = NULL;
- struct iommu_table *tbl;
- unsigned int weight, total_weight = 0;
- unsigned int tce32_segsz, base, segs, avail, i;
- int64_t rc;
- void *addr;
-
- /* XXX FIXME: Handle 64-bit only DMA devices */
- /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
- /* XXX FIXME: Allocate multi-level tables on PHB3 */
- weight = pnv_pci_ioda_pe_dma_weight(pe);
- if (!weight)
- return;
-
- pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
- &total_weight);
- segs = (weight * phb->ioda.dma32_count) / total_weight;
- if (!segs)
- segs = 1;
-
- /*
- * Allocate contiguous DMA32 segments. We begin with the expected
- * number of segments. With one more attempt, the number of DMA32
- * segments to be allocated is decreased by one until one segment
- * is allocated successfully.
- */
- do {
- for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
- for (avail = 0, i = base; i < base + segs; i++) {
- if (phb->ioda.dma32_segmap[i] ==
- IODA_INVALID_PE)
- avail++;
- }
-
- if (avail == segs)
- goto found;
- }
- } while (--segs);
-
- if (!segs) {
- pe_warn(pe, "No available DMA32 segments\n");
- return;
- }
-
-found:
- tbl = pnv_pci_table_alloc(phb->hose->node);
- if (WARN_ON(!tbl))
- return;
-
- iommu_register_group(&pe->table_group, phb->hose->global_number,
- pe->pe_number);
- pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
-
- /* Grab a 32-bit TCE table */
- pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
- weight, total_weight, base, segs);
- pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
- base * PNV_IODA1_DMA32_SEGSIZE,
- (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
-
- /* XXX Currently, we allocate one big contiguous table for the
- * TCEs. We only really need one chunk per 256M of TCE space
- * (ie per segment) but that's an optimization for later, it
- * requires some added smarts with our get/put_tce implementation
- *
- * Each TCE page is 4KB in size and each TCE entry occupies 8
- * bytes
- */
- tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
- tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
- get_order(tce32_segsz * segs));
- if (!tce_mem) {
- pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
- goto fail;
- }
- addr = page_address(tce_mem);
- memset(addr, 0, tce32_segsz * segs);
-
- /* Configure HW */
- for (i = 0; i < segs; i++) {
- rc = opal_pci_map_pe_dma_window(phb->opal_id,
- pe->pe_number,
- base + i, 1,
- __pa(addr) + tce32_segsz * i,
- tce32_segsz, IOMMU_PAGE_SIZE_4K);
- if (rc) {
- pe_err(pe, " Failed to configure 32-bit TCE table, err %lld\n",
- rc);
- goto fail;
- }
- }
-
- /* Setup DMA32 segment mapping */
- for (i = base; i < base + segs; i++)
- phb->ioda.dma32_segmap[i] = pe->pe_number;
-
- /* Setup linux iommu table */
- pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
- base * PNV_IODA1_DMA32_SEGSIZE,
- IOMMU_PAGE_SHIFT_4K);
-
- tbl->it_ops = &pnv_ioda1_iommu_ops;
- pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
- pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
- iommu_init_table(tbl, phb->hose->node, 0, 0);
-
- if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
- pnv_ioda_setup_bus_dma(pe, pe->pbus);
-
- return;
- fail:
- /* XXX Failure: Try to fallback to 64-bit only ? */
- if (tce_mem)
- __free_pages(tce_mem, get_order(tce32_segsz * segs));
- if (tbl) {
- pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
- iommu_tce_table_put(tbl);
- }
-}
-
static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
int num, struct iommu_table *tbl)
{
@@ -2396,7 +1389,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
* DMA window can be larger than available memory, which will
* cause errors later.
*/
- const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_ORDER - 1);
+ const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_PAGE_ORDER);
/*
* We create the default window as big as we can. The constraint is
@@ -2435,16 +1428,17 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
}
- iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end);
- rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+ tbl->it_index = (pe->phb->hose->global_number << 16) | pe->pe_number;
+ if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end))
+ rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+ else
+ rc = -ENOMEM;
if (rc) {
- pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
- rc);
+ pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc);
iommu_tce_table_put(tbl);
- return rc;
+ tbl = NULL; /* This clears iommu_table_base below */
}
-
if (!pnv_iommu_bypass_disabled)
pnv_pci_ioda2_set_bypass(pe, true);
@@ -2459,7 +1453,6 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
return 0;
}
-#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
int num)
{
@@ -2483,7 +1476,6 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
return ret;
}
-#endif
#ifdef CONFIG_IOMMU_API
unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
@@ -2507,7 +1499,7 @@ unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
direct_table_size = 1UL << table_shift;
for ( ; levels; --levels) {
- bytes += _ALIGN_UP(tce_table_size, direct_table_size);
+ bytes += ALIGN(tce_table_size, direct_table_size);
tce_table_size /= direct_table_size;
tce_table_size <<= 3;
@@ -2532,13 +1524,33 @@ static long pnv_pci_ioda2_create_table_userspace(
return ret;
}
-static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
+ dev->dev.archdata.dma_offset = pe->tce_bypass_base;
+
+ if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+ pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+ }
+}
+
+static long pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
{
struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
table_group);
/* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
struct iommu_table *tbl = pe->table_group.tables[0];
+ /*
+ * iommu_ops transfers the ownership per a device and we mode
+ * the group ownership with the first device in the group.
+ */
+ if (!tbl)
+ return 0;
+
pnv_pci_ioda2_set_bypass(pe, false);
pnv_pci_ioda2_unset_window(&pe->table_group, 0);
if (pe->pbus)
@@ -2546,6 +1558,8 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
else if (pe->pdev)
set_iommu_table_base(&pe->pdev->dev, NULL);
iommu_tce_table_put(tbl);
+
+ return 0;
}
static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -2553,6 +1567,9 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
table_group);
+ /* See the comment about iommu_ops above */
+ if (pe->table_group.tables[0])
+ return;
pnv_pci_ioda2_setup_default_config(pe);
if (pe->pbus)
pnv_ioda_setup_bus_dma(pe, pe->pbus);
@@ -2566,145 +1583,13 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
.take_ownership = pnv_ioda2_take_ownership,
.release_ownership = pnv_ioda2_release_ownership,
};
-
-static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
- struct iommu_table_group *table_group,
- struct pci_bus *bus)
-{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- iommu_add_device(table_group, &dev->dev);
-
- if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
- pnv_ioda_setup_bus_iommu_group_add_devices(pe,
- table_group, dev->subordinate);
- }
-}
-
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
- struct iommu_table_group *table_group, struct pci_bus *bus)
-{
-
- if (pe->flags & PNV_IODA_PE_DEV)
- iommu_add_device(table_group, &pe->pdev->dev);
-
- if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
- pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
- bus);
-}
-
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
-
-static void pnv_pci_ioda_setup_iommu_api(void)
-{
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
-
- /*
- * There are 4 types of PEs:
- * - PNV_IODA_PE_BUS: a downstream port with an adapter,
- * created from pnv_pci_setup_bridge();
- * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
- * created from pnv_pci_setup_bridge();
- * - PNV_IODA_PE_VF: a SRIOV virtual function,
- * created from pnv_pcibios_sriov_enable();
- * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
- * created from pnv_pci_ioda_fixup().
- *
- * Normally a PE is represented by an IOMMU group, however for
- * devices with side channels the groups need to be more strict.
- */
- list_for_each_entry(hose, &hose_list, list_node) {
- phb = hose->private_data;
-
- if (phb->type == PNV_PHB_NPU_NVLINK ||
- phb->type == PNV_PHB_NPU_OCAPI)
- continue;
-
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- struct iommu_table_group *table_group;
-
- table_group = pnv_try_setup_npu_table_group(pe);
- if (!table_group) {
- if (!pnv_pci_ioda_pe_dma_weight(pe))
- continue;
-
- table_group = &pe->table_group;
- iommu_register_group(&pe->table_group,
- pe->phb->hose->global_number,
- pe->pe_number);
- }
- pnv_ioda_setup_bus_iommu_group(pe, table_group,
- pe->pbus);
- }
- }
-
- /*
- * Now we have all PHBs discovered, time to add NPU devices to
- * the corresponding IOMMU groups.
- */
- list_for_each_entry(hose, &hose_list, list_node) {
- unsigned long pgsizes;
-
- phb = hose->private_data;
-
- if (phb->type != PNV_PHB_NPU_NVLINK)
- continue;
-
- pgsizes = pnv_ioda_parse_tce_sizes(phb);
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- /*
- * IODA2 bridges get this set up from
- * pci_controller_ops::setup_bridge but NPU bridges
- * do not have this hook defined so we do it here.
- */
- pe->table_group.pgsizes = pgsizes;
- pnv_npu_compound_attach(pe);
- }
- }
-}
-#else /* !CONFIG_IOMMU_API */
-static void pnv_pci_ioda_setup_iommu_api(void) { };
#endif
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
-{
- struct pci_controller *hose = phb->hose;
- struct device_node *dn = hose->dn;
- unsigned long mask = 0;
- int i, rc, count;
- u32 val;
-
- count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
- if (count <= 0) {
- mask = SZ_4K | SZ_64K;
- /* Add 16M for POWER8 by default */
- if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
- !cpu_has_feature(CPU_FTR_ARCH_300))
- mask |= SZ_16M | SZ_256M;
- return mask;
- }
-
- for (i = 0; i < count; i++) {
- rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
- i, &val);
- if (rc == 0)
- mask |= 1ULL << val;
- }
-
- return mask;
-}
-
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
- struct pnv_ioda_pe *pe)
+void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
{
int64_t rc;
- if (!pnv_pci_ioda_pe_dma_weight(pe))
- return;
-
/* TVE #1 is selected by PCI address bit 59 */
pe->tce_bypass_base = 1ull << 59;
@@ -2719,39 +1604,53 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
IOMMU_TABLE_GROUP_MAX_TABLES;
pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
-#ifdef CONFIG_IOMMU_API
- pe->table_group.ops = &pnv_pci_ioda2_ops;
-#endif
rc = pnv_pci_ioda2_setup_default_config(pe);
if (rc)
return;
- if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
- pnv_ioda_setup_bus_dma(pe, pe->pbus);
+#ifdef CONFIG_IOMMU_API
+ pe->table_group.ops = &pnv_pci_ioda2_ops;
+ iommu_register_group(&pe->table_group, phb->hose->global_number,
+ pe->pe_number);
+#endif
+ pe->dma_setup_done = true;
}
-int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
+/*
+ * Called from KVM in real mode to EOI passthru interrupts. The ICP
+ * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru().
+ *
+ * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call
+ * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ
+ * numbers of the in-the-middle MSI domain are vector numbers and it's
+ * good enough for OPAL. Use that.
+ */
+int64_t pnv_opal_pci_msi_eoi(struct irq_data *d)
{
- struct pnv_phb *phb = container_of(chip, struct pnv_phb,
- ioda.irq_chip);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d->parent_data);
+ struct pnv_phb *phb = hose->private_data;
- return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+ return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq);
}
+/*
+ * The IRQ data is mapped in the XICS domain, with OPAL HW IRQ numbers
+ */
static void pnv_ioda2_msi_eoi(struct irq_data *d)
{
int64_t rc;
unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
- struct irq_chip *chip = irq_data_get_irq_chip(d);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
- rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
+ rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
WARN_ON_ONCE(rc);
icp_native_eoi(d);
}
-
+/* P8/CXL only */
void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
{
struct irq_data *idata;
@@ -2773,27 +1672,32 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
}
irq_set_chip(virq, &phb->ioda.irq_chip);
+ irq_set_chip_data(virq, phb->hose);
}
+static struct irq_chip pnv_pci_msi_irq_chip;
+
/*
* Returns true iff chip is something that we could call
* pnv_opal_pci_msi_eoi for.
*/
bool is_pnv_opal_msi(struct irq_chip *chip)
{
- return chip->irq_eoi == pnv_ioda2_msi_eoi;
+ return chip == &pnv_pci_msi_irq_chip;
}
EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
-static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg)
+static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int xive_num,
+ unsigned int is_64, struct msi_msg *msg)
{
struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
- unsigned int xive_num = hwirq - phb->msi_base;
__be32 data;
int rc;
+ dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__,
+ is_64 ? "64" : "32", xive_num);
+
/* No PE assigned ? bail out ... no MSI for you ! */
if (pe == NULL)
return -ENXIO;
@@ -2841,17 +1745,214 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
}
msg->data = be32_to_cpu(data);
- pnv_set_msi_irq_chip(phb, virq);
+ return 0;
+}
+
+/*
+ * The msi_free() op is called before irq_domain_free_irqs_top() when
+ * the handler data is still available. Use that to clear the XIVE
+ * controller.
+ */
+static void pnv_msi_ops_msi_free(struct irq_domain *domain,
+ struct msi_domain_info *info,
+ unsigned int irq)
+{
+ if (xive_enabled())
+ xive_irq_free_data(irq);
+}
- pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
- " address=%x_%08x data=%x PE# %x\n",
- pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
- msg->address_hi, msg->address_lo, data, pe->pe_number);
+static struct msi_domain_ops pnv_pci_msi_domain_ops = {
+ .msi_free = pnv_msi_ops_msi_free,
+};
+
+static void pnv_msi_shutdown(struct irq_data *d)
+{
+ d = d->parent_data;
+ if (d->chip->irq_shutdown)
+ d->chip->irq_shutdown(d);
+}
+
+static void pnv_msi_mask(struct irq_data *d)
+{
+ pci_msi_mask_irq(d);
+ irq_chip_mask_parent(d);
+}
+
+static void pnv_msi_unmask(struct irq_data *d)
+{
+ pci_msi_unmask_irq(d);
+ irq_chip_unmask_parent(d);
+}
+
+static struct irq_chip pnv_pci_msi_irq_chip = {
+ .name = "PNV-PCI-MSI",
+ .irq_shutdown = pnv_msi_shutdown,
+ .irq_mask = pnv_msi_mask,
+ .irq_unmask = pnv_msi_unmask,
+ .irq_eoi = irq_chip_eoi_parent,
+};
+
+static struct msi_domain_info pnv_msi_domain_info = {
+ .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+ MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX),
+ .ops = &pnv_pci_msi_domain_ops,
+ .chip = &pnv_pci_msi_irq_chip,
+};
+
+static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg)
+{
+ struct msi_desc *entry = irq_data_get_msi_desc(d);
+ struct pci_dev *pdev = msi_desc_to_pci_dev(entry);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
+ int rc;
+
+ rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq,
+ entry->pci.msi_attrib.is_64, msg);
+ if (rc)
+ dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n",
+ entry->pci.msi_attrib.is_64 ? "64" : "32", d->hwirq, rc);
+}
+
+/*
+ * The IRQ data is mapped in the MSI domain in which HW IRQ numbers
+ * correspond to vector numbers.
+ */
+static void pnv_msi_eoi(struct irq_data *d)
+{
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
+
+ if (phb->model == PNV_PHB_MODEL_PHB3) {
+ /*
+ * The EOI OPAL call takes an OPAL HW IRQ number but
+ * since it is translated into a vector number in
+ * OPAL, use that directly.
+ */
+ WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq));
+ }
+
+ irq_chip_eoi_parent(d);
+}
+
+static struct irq_chip pnv_msi_irq_chip = {
+ .name = "PNV-MSI",
+ .irq_shutdown = pnv_msi_shutdown,
+ .irq_mask = irq_chip_mask_parent,
+ .irq_unmask = irq_chip_unmask_parent,
+ .irq_eoi = pnv_msi_eoi,
+ .irq_set_affinity = irq_chip_set_affinity_parent,
+ .irq_compose_msi_msg = pnv_msi_compose_msg,
+};
+
+static int pnv_irq_parent_domain_alloc(struct irq_domain *domain,
+ unsigned int virq, int hwirq)
+{
+ struct irq_fwspec parent_fwspec;
+ int ret;
+
+ parent_fwspec.fwnode = domain->parent->fwnode;
+ parent_fwspec.param_count = 2;
+ parent_fwspec.param[0] = hwirq;
+ parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct pci_controller *hose = domain->host_data;
+ struct pnv_phb *phb = hose->private_data;
+ msi_alloc_info_t *info = arg;
+ struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc);
+ int hwirq;
+ int i, ret;
+
+ hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs);
+ if (hwirq < 0) {
+ dev_warn(&pdev->dev, "failed to find a free MSI\n");
+ return -ENOSPC;
+ }
+
+ dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__,
+ hose->dn, virq, hwirq, nr_irqs);
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = pnv_irq_parent_domain_alloc(domain, virq + i,
+ phb->msi_base + hwirq + i);
+ if (ret)
+ goto out;
+
+ irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+ &pnv_msi_irq_chip, hose);
+ }
return 0;
+
+out:
+ irq_domain_free_irqs_parent(domain, virq, i - 1);
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs);
+ return ret;
}
-static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
+static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
+
+ pr_debug("%s bridge %pOF %d/%lx #%d\n", __func__, hose->dn,
+ virq, d->hwirq, nr_irqs);
+
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs);
+ /* XIVE domain is cleared through ->msi_free() */
+}
+
+static const struct irq_domain_ops pnv_irq_domain_ops = {
+ .alloc = pnv_irq_domain_alloc,
+ .free = pnv_irq_domain_free,
+};
+
+static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count)
+{
+ struct pnv_phb *phb = hose->private_data;
+ struct irq_domain *parent = irq_get_default_host();
+
+ hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id);
+ if (!hose->fwnode)
+ return -ENOMEM;
+
+ hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count,
+ hose->fwnode,
+ &pnv_irq_domain_ops, hose);
+ if (!hose->dev_domain) {
+ pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n",
+ hose->dn, hose->global_number);
+ irq_domain_free_fwnode(hose->fwnode);
+ return -ENOMEM;
+ }
+
+ hose->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(hose->dn),
+ &pnv_msi_domain_info,
+ hose->dev_domain);
+ if (!hose->msi_domain) {
+ pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n",
+ hose->dn, hose->global_number);
+ irq_domain_free_fwnode(hose->fwnode);
+ irq_domain_remove(hose->dev_domain);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __init pnv_pci_init_ioda_msis(struct pnv_phb *phb)
{
unsigned int count;
const __be32 *prop = of_get_property(phb->hose->dn,
@@ -2871,102 +1972,11 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
return;
}
- phb->msi_setup = pnv_pci_ioda_msi_setup;
- phb->msi32_support = 1;
pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
count, phb->msi_base);
-}
-
-#ifdef CONFIG_PCI_IOV
-static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
- const resource_size_t gate = phb->ioda.m64_segsize >> 2;
- struct resource *res;
- int i;
- resource_size_t size, total_vf_bar_sz;
- struct pci_dn *pdn;
- int mul, total_vfs;
- if (!pdev->is_physfn || pci_dev_is_added(pdev))
- return;
-
- pdn = pci_get_pdn(pdev);
- pdn->vfs_expanded = 0;
- pdn->m64_single_mode = false;
-
- total_vfs = pci_sriov_get_totalvfs(pdev);
- mul = phb->ioda.total_pe_num;
- total_vf_bar_sz = 0;
-
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || res->parent)
- continue;
- if (!pnv_pci_is_m64_flags(res->flags)) {
- dev_warn(&pdev->dev, "Don't support SR-IOV with"
- " non M64 VF BAR%d: %pR. \n",
- i, res);
- goto truncate_iov;
- }
-
- total_vf_bar_sz += pci_iov_resource_size(pdev,
- i + PCI_IOV_RESOURCES);
-
- /*
- * If bigger than quarter of M64 segment size, just round up
- * power of two.
- *
- * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
- * with other devices, IOV BAR size is expanded to be
- * (total_pe * VF_BAR_size). When VF_BAR_size is half of M64
- * segment size , the expanded size would equal to half of the
- * whole M64 space size, which will exhaust the M64 Space and
- * limit the system flexibility. This is a design decision to
- * set the boundary to quarter of the M64 segment size.
- */
- if (total_vf_bar_sz > gate) {
- mul = roundup_pow_of_two(total_vfs);
- dev_info(&pdev->dev,
- "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
- total_vf_bar_sz, gate, mul);
- pdn->m64_single_mode = true;
- break;
- }
- }
-
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || res->parent)
- continue;
-
- size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
- /*
- * On PHB3, the minimum size alignment of M64 BAR in single
- * mode is 32MB.
- */
- if (pdn->m64_single_mode && (size < SZ_32M))
- goto truncate_iov;
- dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
- res->end = res->start + size * mul - 1;
- dev_dbg(&pdev->dev, " %pR\n", res);
- dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
- i, res, mul);
- }
- pdn->vfs_expanded = mul;
-
- return;
-
-truncate_iov:
- /* To save MMIO space, IOV BAR is truncated. */
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
- res->flags = 0;
- res->end = res->start - 1;
- }
+ pnv_msi_allocate_domains(phb->hose, count);
}
-#endif /* CONFIG_PCI_IOV */
static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
struct resource *res)
@@ -2976,7 +1986,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
int index;
int64_t rc;
- if (!res || !res->flags || res->start > res->end)
+ if (!res || !res->flags || res->start > res->end ||
+ res->flags & IORESOURCE_UNSET)
return;
if (res->flags & IORESOURCE_IO) {
@@ -3027,7 +2038,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
/*
* This function is supposed to be called on basis of PE from top
- * to bottom style. So the the I/O or MMIO segment assigned to
+ * to bottom style. So the I/O or MMIO segment assigned to
* parent PE could be overridden by its child PEs if necessary.
*/
static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
@@ -3062,19 +2073,9 @@ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
#ifdef CONFIG_DEBUG_FS
static int pnv_pci_diag_data_set(void *data, u64 val)
{
- struct pci_controller *hose;
- struct pnv_phb *phb;
+ struct pnv_phb *phb = data;
s64 ret;
- if (val != 1ULL)
- return -EINVAL;
-
- hose = (struct pci_controller *)data;
- if (!hose || !hose->private_data)
- return -ENODEV;
-
- phb = hose->private_data;
-
/* Retrieve the diag data from firmware */
ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
phb->diag_data_size);
@@ -3089,6 +2090,33 @@ static int pnv_pci_diag_data_set(void *data, u64 val)
DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set,
"%llu\n");
+static int pnv_pci_ioda_pe_dump(void *data, u64 val)
+{
+ struct pnv_phb *phb = data;
+ int pe_num;
+
+ for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
+ struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num];
+
+ if (!test_bit(pe_num, phb->ioda.pe_alloc))
+ continue;
+
+ pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n",
+ pe->rid, pe->device_count,
+ (pe->flags & PNV_IODA_PE_DEV) ? "dev " : "",
+ (pe->flags & PNV_IODA_PE_BUS) ? "bus " : "",
+ (pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "",
+ (pe->flags & PNV_IODA_PE_MASTER) ? "master " : "",
+ (pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "",
+ (pe->flags & PNV_IODA_PE_VF) ? "vf " : "");
+ }
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL,
+ pnv_pci_ioda_pe_dump, "%llu\n");
+
#endif /* CONFIG_DEBUG_FS */
static void pnv_pci_ioda_create_dbgfs(void)
@@ -3101,19 +2129,13 @@ static void pnv_pci_ioda_create_dbgfs(void)
list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
phb = hose->private_data;
- /* Notify initialization of PHB done */
- phb->initialized = 1;
-
sprintf(name, "PCI%04x", hose->global_number);
- phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
- if (!phb->dbgfs) {
- pr_warn("%s: Error on creating debugfs on PHB#%x\n",
- __func__, hose->global_number);
- continue;
- }
+ phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir);
debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs,
- hose, &pnv_pci_diag_data_fops);
+ phb, &pnv_pci_diag_data_fops);
+ debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs,
+ phb, &pnv_pci_ioda_pe_dump_fops);
}
#endif /* CONFIG_DEBUG_FS */
}
@@ -3155,8 +2177,6 @@ static void pnv_pci_enable_bridges(void)
static void pnv_pci_ioda_fixup(void)
{
- pnv_pci_ioda_setup_PEs();
- pnv_pci_ioda_setup_iommu_api();
pnv_pci_ioda_create_dbgfs();
pnv_pci_enable_bridges();
@@ -3181,10 +2201,9 @@ static void pnv_pci_ioda_fixup(void)
static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
unsigned long type)
{
- struct pci_dev *bridge;
- struct pci_controller *hose = pci_bus_to_host(bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
int num_pci_bridges = 0;
+ struct pci_dev *bridge;
bridge = bus->self;
while (bridge) {
@@ -3268,28 +2287,16 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
}
}
-static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
+static void pnv_pci_configure_bus(struct pci_bus *bus)
{
- struct pci_controller *hose = pci_bus_to_host(bus);
- struct pnv_phb *phb = hose->private_data;
struct pci_dev *bridge = bus->self;
struct pnv_ioda_pe *pe;
- bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
+ bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
- /* Extend bridge's windows if necessary */
- pnv_pci_fixup_bridge_resources(bus, type);
-
- /* The PE for root bus should be realized before any one else */
- if (!phb->ioda.root_pe_populated) {
- pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
- if (pe) {
- phb->ioda.root_pe_idx = pe->pe_number;
- phb->ioda.root_pe_populated = true;
- }
- }
+ dev_info(&bus->dev, "Configuring PE for bus\n");
/* Don't assign PE to PCI bus, which doesn't have subordinate devices */
- if (list_empty(&bus->devices))
+ if (WARN_ON(list_empty(&bus->devices)))
return;
/* Reserve PEs according to used M64 resources */
@@ -3305,17 +2312,6 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
return;
pnv_ioda_setup_pe_seg(pe);
- switch (phb->type) {
- case PNV_PHB_IODA1:
- pnv_pci_ioda1_setup_dma_pe(phb, pe);
- break;
- case PNV_PHB_IODA2:
- pnv_pci_ioda2_setup_dma_pe(phb, pe);
- break;
- default:
- pr_warn("%s: No DMA for PHB#%x (type %d)\n",
- __func__, phb->hose->global_number, phb->type);
- }
}
static resource_size_t pnv_pci_default_alignment(void)
@@ -3323,134 +2319,50 @@ static resource_size_t pnv_pci_default_alignment(void)
return PAGE_SIZE;
}
-#ifdef CONFIG_PCI_IOV
-static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
- int resno)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
- struct pci_dn *pdn = pci_get_pdn(pdev);
- resource_size_t align;
-
- /*
- * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
- * SR-IOV. While from hardware perspective, the range mapped by M64
- * BAR should be size aligned.
- *
- * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
- * powernv-specific hardware restriction is gone. But if just use the
- * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
- * in one segment of M64 #15, which introduces the PE conflict between
- * PF and VF. Based on this, the minimum alignment of an IOV BAR is
- * m64_segsize.
- *
- * This function returns the total IOV BAR size if M64 BAR is in
- * Shared PE mode or just VF BAR size if not.
- * If the M64 BAR is in Single PE mode, return the VF BAR size or
- * M64 segment size if IOV BAR size is less.
- */
- align = pci_iov_resource_size(pdev, resno);
- if (!pdn->vfs_expanded)
- return align;
- if (pdn->m64_single_mode)
- return max(align, (resource_size_t)phb->ioda.m64_segsize);
-
- return pdn->vfs_expanded * align;
-}
-#endif /* CONFIG_PCI_IOV */
-
/* Prevent enabling devices for which we couldn't properly
* assign a PE
*/
static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
struct pci_dn *pdn;
- /* The function is probably called while the PEs have
- * not be created yet. For example, resource reassignment
- * during PCI probe period. We just skip the check if
- * PEs isn't ready.
- */
- if (!phb->initialized)
- return true;
-
pdn = pci_get_pdn(dev);
- if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE) {
+ pci_err(dev, "pci_enable_device() blocked, no PE assigned.\n");
return false;
-
- return true;
-}
-
-static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
- int num)
-{
- struct pnv_ioda_pe *pe = container_of(table_group,
- struct pnv_ioda_pe, table_group);
- struct pnv_phb *phb = pe->phb;
- unsigned int idx;
- long rc;
-
- pe_info(pe, "Removing DMA window #%d\n", num);
- for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
- if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
- continue;
-
- rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
- idx, 0, 0ul, 0ul, 0ul);
- if (rc != OPAL_SUCCESS) {
- pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
- rc, idx);
- return rc;
- }
-
- phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
}
- pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
- return OPAL_SUCCESS;
+ return true;
}
-static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
+static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
{
- unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
- struct iommu_table *tbl = pe->table_group.tables[0];
- int64_t rc;
-
- if (!weight)
- return;
+ struct pci_dn *pdn;
+ struct pnv_ioda_pe *pe;
- rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
- if (rc != OPAL_SUCCESS)
- return;
+ pdn = pci_get_pdn(dev);
+ if (!pdn)
+ return false;
- pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
- if (pe->table_group.group) {
- iommu_group_put(pe->table_group.group);
- WARN_ON(pe->table_group.group);
+ if (pdn->pe_number == IODA_INVALID_PE) {
+ pe = pnv_ioda_setup_dev_PE(dev);
+ if (!pe)
+ return false;
}
-
- free_pages(tbl->it_base, get_order(tbl->it_size << 3));
- iommu_tce_table_put(tbl);
+ return true;
}
-static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
+void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
{
struct iommu_table *tbl = pe->table_group.tables[0];
- unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
-#ifdef CONFIG_IOMMU_API
int64_t rc;
-#endif
- if (!weight)
+ if (!pe->dma_setup_done)
return;
-#ifdef CONFIG_IOMMU_API
rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
if (rc)
pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
-#endif
pnv_pci_ioda2_set_bypass(pe, false);
if (pe->table_group.group) {
@@ -3473,14 +2385,8 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
if (map[idx] != pe->pe_number)
continue;
- if (win == OPAL_M64_WINDOW_TYPE)
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- phb->ioda.reserved_pe_idx, win,
- idx / PNV_IODA1_M64_SEGS,
- idx % PNV_IODA1_M64_SEGS);
- else
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- phb->ioda.reserved_pe_idx, win, 0, idx);
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ phb->ioda.reserved_pe_idx, win, 0, idx);
if (rc != OPAL_SUCCESS)
pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
@@ -3494,14 +2400,7 @@ static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
{
struct pnv_phb *phb = pe->phb;
- if (phb->type == PNV_PHB_IODA1) {
- pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
- phb->ioda.io_segmap);
- pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
- phb->ioda.m32_segmap);
- pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
- phb->ioda.m64_segmap);
- } else if (phb->type == PNV_PHB_IODA2) {
+ if (phb->type == PNV_PHB_IODA2) {
pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
phb->ioda.m32_segmap);
}
@@ -3512,14 +2411,18 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
struct pnv_phb *phb = pe->phb;
struct pnv_ioda_pe *slave, *tmp;
+ pe_info(pe, "Releasing PE\n");
+
+ mutex_lock(&phb->ioda.pe_list_mutex);
list_del(&pe->list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
switch (phb->type) {
- case PNV_PHB_IODA1:
- pnv_pci_ioda1_release_pe_dma(pe);
- break;
case PNV_PHB_IODA2:
pnv_pci_ioda2_release_pe_dma(pe);
break;
+ case PNV_PHB_NPU_OCAPI:
+ break;
default:
WARN_ON(1);
}
@@ -3541,26 +2444,35 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
* that it can be populated again in PCI hot add path. The PE
* shouldn't be destroyed as it's the global reserved resource.
*/
- if (phb->ioda.root_pe_populated &&
- phb->ioda.root_pe_idx == pe->pe_number)
- phb->ioda.root_pe_populated = false;
- else
- pnv_ioda_free_pe(pe);
+ if (phb->ioda.root_pe_idx == pe->pe_number)
+ return;
+
+ pnv_ioda_free_pe(pe);
}
static void pnv_pci_release_device(struct pci_dev *pdev)
{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
+ /* The VF PE state is torn down when sriov_disable() is called */
if (pdev->is_virtfn)
return;
if (!pdn || pdn->pe_number == IODA_INVALID_PE)
return;
+#ifdef CONFIG_PCI_IOV
+ /*
+ * FIXME: Try move this to sriov_disable(). It's here since we allocate
+ * the iov state at probe time since we need to fiddle with the IOV
+ * resources.
+ */
+ if (pdev->is_physfn)
+ kfree(pdev->dev.archdata.iov_data);
+#endif
+
/*
* PCI hotplug can happen as part of EEH error recovery. The @pdn
* isn't removed and added afterwards in this scenario. We should
@@ -3577,50 +2489,72 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
pnv_ioda_release_pe(pe);
}
-static void pnv_npu_disable_device(struct pci_dev *pdev)
+static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
+{
+ struct pnv_phb *phb = hose->private_data;
+
+ opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
+ OPAL_ASSERT_RESET);
+}
+
+static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
{
- struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
- struct eeh_pe *eehpe = edev ? edev->pe : NULL;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
+ struct pnv_ioda_pe *pe;
+
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+ continue;
+
+ if (!pe->pbus)
+ continue;
- if (eehpe && eeh_ops && eeh_ops->reset)
- eeh_ops->reset(eehpe, EEH_RESET_HOT);
+ if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+ pe->pbus = bus;
+ break;
+ }
+ }
}
-static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
+#ifdef CONFIG_IOMMU_API
+static struct iommu_group *pnv_pci_device_group(struct pci_controller *hose,
+ struct pci_dev *pdev)
{
struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *pe;
- opal_pci_reset(phb->opal_id, OPAL_RESET_PCI_IODA_TABLE,
- OPAL_ASSERT_RESET);
+ if (WARN_ON(!phb))
+ return ERR_PTR(-ENODEV);
+
+ pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev));
+ if (!pe)
+ return ERR_PTR(-ENODEV);
+
+ if (!pe->table_group.group)
+ return ERR_PTR(-ENODEV);
+
+ return iommu_group_ref_get(pe->table_group.group);
}
+#endif
static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
- .dma_dev_setup = pnv_pci_dma_dev_setup,
- .dma_bus_setup = pnv_pci_dma_bus_setup,
+ .dma_dev_setup = pnv_pci_ioda_dma_dev_setup,
+ .dma_bus_setup = pnv_pci_ioda_dma_bus_setup,
.iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
- .setup_msi_irqs = pnv_setup_msi_irqs,
- .teardown_msi_irqs = pnv_teardown_msi_irqs,
.enable_device_hook = pnv_pci_enable_device_hook,
.release_device = pnv_pci_release_device,
.window_alignment = pnv_pci_window_alignment,
- .setup_bridge = pnv_pci_setup_bridge,
- .reset_secondary_bus = pnv_pci_reset_secondary_bus,
- .shutdown = pnv_pci_ioda_shutdown,
-};
-
-static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
- .dma_dev_setup = pnv_pci_dma_dev_setup,
- .setup_msi_irqs = pnv_setup_msi_irqs,
- .teardown_msi_irqs = pnv_teardown_msi_irqs,
- .enable_device_hook = pnv_pci_enable_device_hook,
- .window_alignment = pnv_pci_window_alignment,
+ .setup_bridge = pnv_pci_fixup_bridge_resources,
.reset_secondary_bus = pnv_pci_reset_secondary_bus,
.shutdown = pnv_pci_ioda_shutdown,
- .disable_device = pnv_npu_disable_device,
+#ifdef CONFIG_IOMMU_API
+ .device_group = pnv_pci_device_group,
+#endif
};
static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
- .enable_device_hook = pnv_pci_enable_device_hook,
+ .enable_device_hook = pnv_ocapi_enable_device_hook,
+ .release_device = pnv_pci_release_device,
.window_alignment = pnv_pci_window_alignment,
.reset_secondary_bus = pnv_pci_reset_secondary_bus,
.shutdown = pnv_pci_ioda_shutdown,
@@ -3632,7 +2566,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
struct pci_controller *hose;
struct pnv_phb *phb;
unsigned long size, m64map_off, m32map_off, pemap_off;
- unsigned long iomap_off = 0, dma32map_off = 0;
+ struct pnv_ioda_pe *root_pe;
struct resource r;
const __be64 *prop64;
const __be32 *prop32;
@@ -3655,7 +2589,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb_id = be64_to_cpup(prop64);
pr_debug(" PHB-ID : 0x%016llx\n", phb_id);
- phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES);
+ phb = kzalloc(sizeof(*phb), GFP_KERNEL);
if (!phb)
panic("%s: Failed to allocate %zu bytes\n", __func__,
sizeof(*phb));
@@ -3665,7 +2599,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
if (!phb->hose) {
pr_err(" Can't allocate PCI controller for %pOF\n",
np);
- memblock_free(__pa(phb), sizeof(struct pnv_phb));
+ memblock_free(phb, sizeof(struct pnv_phb));
return;
}
@@ -3690,10 +2624,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->model = PNV_PHB_MODEL_P7IOC;
else if (of_device_is_compatible(np, "ibm,power8-pciex"))
phb->model = PNV_PHB_MODEL_PHB3;
- else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
- phb->model = PNV_PHB_MODEL_NPU;
- else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
- phb->model = PNV_PHB_MODEL_NPU2;
else
phb->model = PNV_PHB_MODEL_UNKNOWN;
@@ -3704,7 +2634,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
else
phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
- phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES);
+ phb->diag_data = kzalloc(phb->diag_data_size, GFP_KERNEL);
if (!phb->diag_data)
panic("%s: Failed to allocate %u bytes\n", __func__,
phb->diag_data_size);
@@ -3746,29 +2676,19 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
- /* Calculate how many 32-bit TCE segments we have */
- phb->ioda.dma32_count = phb->ioda.m32_pci_base /
- PNV_IODA1_DMA32_SEGSIZE;
-
/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
- size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
+ size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
sizeof(unsigned long));
m64map_off = size;
size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
m32map_off = size;
size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
- if (phb->type == PNV_PHB_IODA1) {
- iomap_off = size;
- size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
- dma32map_off = size;
- size += phb->ioda.dma32_count *
- sizeof(phb->ioda.dma32_segmap[0]);
- }
pemap_off = size;
size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
- aux = memblock_alloc(size, SMP_CACHE_BYTES);
+ aux = kzalloc(size, GFP_KERNEL);
if (!aux)
panic("%s: Failed to allocate %lu bytes\n", __func__, size);
+
phb->ioda.pe_alloc = aux;
phb->ioda.m64_segmap = aux + m64map_off;
phb->ioda.m32_segmap = aux + m32map_off;
@@ -3776,15 +2696,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
}
- if (phb->type == PNV_PHB_IODA1) {
- phb->ioda.io_segmap = aux + iomap_off;
- for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
- phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
-
- phb->ioda.dma32_segmap = aux + dma32map_off;
- for (segno = 0; segno < phb->ioda.dma32_count; segno++)
- phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
- }
phb->ioda.pe_array = aux + pemap_off;
/*
@@ -3800,16 +2711,14 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
} else {
- phb->ioda.root_pe_idx = IODA_INVALID_PE;
+ /* otherwise just allocate one */
+ root_pe = pnv_ioda_alloc_pe(phb, 1);
+ phb->ioda.root_pe_idx = root_pe->pe_number;
}
INIT_LIST_HEAD(&phb->ioda.pe_list);
mutex_init(&phb->ioda.pe_list_mutex);
- /* Calculate how many 32-bit TCE segments we have */
- phb->ioda.dma32_count = phb->ioda.m32_pci_base /
- PNV_IODA1_DMA32_SEGSIZE;
-
#if 0 /* We should really do that ... */
rc = opal_pci_set_phb_mem_window(opal->phb_id,
window_type,
@@ -3848,21 +2757,17 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
switch (phb->type) {
- case PNV_PHB_NPU_NVLINK:
- hose->controller_ops = pnv_npu_ioda_controller_ops;
- break;
case PNV_PHB_NPU_OCAPI:
hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
break;
default:
- phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
hose->controller_ops = pnv_pci_ioda_controller_ops;
}
ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
#ifdef CONFIG_PCI_IOV
- ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
+ ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
@@ -3894,6 +2799,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
/* Remove M64 resource if we can't configure it successfully */
if (!phb->init_m64 || phb->init_m64(phb))
hose->mem_resources[1].flags = 0;
+
+ /* create pci_dn's for DT nodes under this PHB */
+ pci_devs_phb_init_dynamic(hose);
}
void __init pnv_pci_init_ioda2_phb(struct device_node *np)
@@ -3901,11 +2809,6 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np)
pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
}
-void __init pnv_pci_init_npu_phb(struct device_node *np)
-{
- pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
-}
-
void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
{
pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
@@ -3913,8 +2816,7 @@ void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
if (!machine_is(powernv))
return;
@@ -3923,27 +2825,3 @@ static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
}
DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
-
-void __init pnv_pci_init_ioda_hub(struct device_node *np)
-{
- struct device_node *phbn;
- const __be64 *prop64;
- u64 hub_id;
-
- pr_info("Probing IODA IO-Hub %pOF\n", np);
-
- prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
- if (!prop64) {
- pr_err(" Missing \"ibm,opal-hubid\" property !\n");
- return;
- }
- hub_id = be64_to_cpup(prop64);
- pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
-
- /* Count child PHBs */
- for_each_child_of_node(np, phbn) {
- /* Look for IODA1 PHBs */
- if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
- pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
- }
-}
diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
new file mode 100644
index 000000000000..59882da3e742
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -0,0 +1,760 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/kernel.h>
+#include <linux/ioport.h>
+#include <linux/bitmap.h>
+#include <linux/pci.h>
+
+#include <asm/opal.h>
+
+#include "pci.h"
+
+/*
+ * The majority of the complexity in supporting SR-IOV on PowerNV comes from
+ * the need to put the MMIO space for each VF into a separate PE. Internally
+ * the PHB maps MMIO addresses to a specific PE using the "Memory BAR Table".
+ * The MBT historically only applied to the 64bit MMIO window of the PHB
+ * so it's common to see it referred to as the "M64BT".
+ *
+ * An MBT entry stores the mapped range as an <base>,<mask> pair. This forces
+ * the address range that we want to map to be power-of-two sized and aligned.
+ * For conventional PCI devices this isn't really an issue since PCI device BARs
+ * have the same requirement.
+ *
+ * For a SR-IOV BAR things are a little more awkward since size and alignment
+ * are not coupled. The alignment is set based on the per-VF BAR size, but
+ * the total BAR area is: number-of-vfs * per-vf-size. The number of VFs
+ * isn't necessarily a power of two, so neither is the total size. To fix that
+ * we need to finesse (read: hack) the Linux BAR allocator so that it will
+ * allocate the SR-IOV BARs in a way that lets us map them using the MBT.
+ *
+ * The changes to size and alignment that we need to do depend on the "mode"
+ * of MBT entry that we use. We only support SR-IOV on PHB3 (IODA2) and above,
+ * so as a baseline we can assume that we have the following BAR modes
+ * available:
+ *
+ * NB: $PE_COUNT is the number of PEs that the PHB supports.
+ *
+ * a) A segmented BAR that splits the mapped range into $PE_COUNT equally sized
+ * segments. The n'th segment is mapped to the n'th PE.
+ * b) An un-segmented BAR that maps the whole address range to a specific PE.
+ *
+ *
+ * We prefer to use mode a) since it only requires one MBT entry per SR-IOV BAR
+ * For comparison b) requires one entry per-VF per-BAR, or:
+ * (num-vfs * num-sriov-bars) in total. To use a) we need the size of each segment
+ * to equal the size of the per-VF BAR area. So:
+ *
+ * new_size = per-vf-size * number-of-PEs
+ *
+ * The alignment for the SR-IOV BAR also needs to be changed from per-vf-size
+ * to "new_size", calculated above. Implementing this is a convoluted process
+ * which requires several hooks in the PCI core:
+ *
+ * 1. In pcibios_device_add() we call pnv_pci_ioda_fixup_iov().
+ *
+ * At this point the device has been probed and the device's BARs are sized,
+ * but no resource allocations have been done. The SR-IOV BARs are sized
+ * based on the maximum number of VFs supported by the device and we need
+ * to increase that to new_size.
+ *
+ * 2. Later, when Linux actually assigns resources it tries to make the resource
+ * allocations for each PCI bus as compact as possible. As a part of that it
+ * sorts the BARs on a bus by their required alignment, which is calculated
+ * using pci_resource_alignment().
+ *
+ * For IOV resources this goes:
+ * pci_resource_alignment()
+ * pci_sriov_resource_alignment()
+ * pcibios_sriov_resource_alignment()
+ * pnv_pci_iov_resource_alignment()
+ *
+ * Our hook overrides the default alignment, equal to the per-vf-size, with
+ * new_size computed above.
+ *
+ * 3. When userspace enables VFs for a device:
+ *
+ * sriov_enable()
+ * pcibios_sriov_enable()
+ * pnv_pcibios_sriov_enable()
+ *
+ * This is where we actually allocate PE numbers for each VF and setup the
+ * MBT mapping for each SR-IOV BAR. In steps 1) and 2) we setup an "arena"
+ * where each MBT segment is equal in size to the VF BAR so we can shift
+ * around the actual SR-IOV BAR location within this arena. We need this
+ * ability because the PE space is shared by all devices on the same PHB.
+ * When using mode a) described above segment 0 in maps to PE#0 which might
+ * be already being used by another device on the PHB.
+ *
+ * As a result we need allocate a contigious range of PE numbers, then shift
+ * the address programmed into the SR-IOV BAR of the PF so that the address
+ * of VF0 matches up with the segment corresponding to the first allocated
+ * PE number. This is handled in pnv_pci_vf_resource_shift().
+ *
+ * Once all that is done we return to the PCI core which then enables VFs,
+ * scans them and creates pci_devs for each. The init process for a VF is
+ * largely the same as a normal device, but the VF is inserted into the IODA
+ * PE that we allocated for it rather than the PE associated with the bus.
+ *
+ * 4. When userspace disables VFs we unwind the above in
+ * pnv_pcibios_sriov_disable(). Fortunately this is relatively simple since
+ * we don't need to validate anything, just tear down the mappings and
+ * move SR-IOV resource back to its "proper" location.
+ *
+ * That's how mode a) works. In theory mode b) (single PE mapping) is less work
+ * since we can map each individual VF with a separate BAR. However, there's a
+ * few limitations:
+ *
+ * 1) For IODA2 mode b) has a minimum alignment requirement of 32MB. This makes
+ * it only usable for devices with very large per-VF BARs. Such devices are
+ * similar to Big Foot. They definitely exist, but I've never seen one.
+ *
+ * 2) The number of MBT entries that we have is limited. PHB3 and PHB4 only
+ * 16 total and some are needed for. Most SR-IOV capable network cards can support
+ * more than 16 VFs on each port.
+ *
+ * We use b) when using a) would use more than 1/4 of the entire 64 bit MMIO
+ * window of the PHB.
+ *
+ *
+ *
+ * PHB4 (IODA3) added a few new features that would be useful for SR-IOV. It
+ * allowed the MBT to map 32bit MMIO space in addition to 64bit which allows
+ * us to support SR-IOV BARs in the 32bit MMIO window. This is useful since
+ * the Linux BAR allocation will place any BAR marked as non-prefetchable into
+ * the non-prefetchable bridge window, which is 32bit only. It also added two
+ * new modes:
+ *
+ * c) A segmented BAR similar to a), but each segment can be individually
+ * mapped to any PE. This is matches how the 32bit MMIO window worked on
+ * IODA1&2.
+ *
+ * d) A segmented BAR with 8, 64, or 128 segments. This works similarly to a),
+ * but with fewer segments and configurable base PE.
+ *
+ * i.e. The n'th segment maps to the (n + base)'th PE.
+ *
+ * The base PE is also required to be a multiple of the window size.
+ *
+ * Unfortunately, the OPAL API doesn't currently (as of skiboot v6.6) allow us
+ * to exploit any of the IODA3 features.
+ */
+
+static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
+{
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+ struct resource *res;
+ int i;
+ resource_size_t vf_bar_sz;
+ struct pnv_iov_data *iov;
+ int mul;
+
+ iov = kzalloc(sizeof(*iov), GFP_KERNEL);
+ if (!iov)
+ goto disable_iov;
+ pdev->dev.archdata.iov_data = iov;
+ mul = phb->ioda.total_pe_num;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || res->parent)
+ continue;
+ if (!pnv_pci_is_m64_flags(res->flags)) {
+ dev_warn(&pdev->dev, "Don't support SR-IOV with non M64 VF BAR%d: %pR. \n",
+ i, res);
+ goto disable_iov;
+ }
+
+ vf_bar_sz = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+
+ /*
+ * Generally, one segmented M64 BAR maps one IOV BAR. However,
+ * if a VF BAR is too large we end up wasting a lot of space.
+ * If each VF needs more than 1/4 of the default m64 segment
+ * then each VF BAR should be mapped in single-PE mode to reduce
+ * the amount of space required. This does however limit the
+ * number of VFs we can support.
+ *
+ * The 1/4 limit is arbitrary and can be tweaked.
+ */
+ if (vf_bar_sz > (phb->ioda.m64_segsize >> 2)) {
+ /*
+ * On PHB3, the minimum size alignment of M64 BAR in
+ * single mode is 32MB. If this VF BAR is smaller than
+ * 32MB, but still too large for a segmented window
+ * then we can't map it and need to disable SR-IOV for
+ * this device.
+ */
+ if (vf_bar_sz < SZ_32M) {
+ pci_err(pdev, "VF BAR%d: %pR can't be mapped in single PE mode\n",
+ i, res);
+ goto disable_iov;
+ }
+
+ iov->m64_single_mode[i] = true;
+ continue;
+ }
+
+ /*
+ * This BAR can be mapped with one segmented window, so adjust
+ * te resource size to accommodate.
+ */
+ pci_dbg(pdev, " Fixing VF BAR%d: %pR to\n", i, res);
+ res->end = res->start + vf_bar_sz * mul - 1;
+ pci_dbg(pdev, " %pR\n", res);
+
+ pci_info(pdev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
+ i, res, mul);
+
+ iov->need_shift = true;
+ }
+
+ return;
+
+disable_iov:
+ /* Save ourselves some MMIO space by disabling the unusable BARs */
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ res->flags = 0;
+ res->end = res->start - 1;
+ }
+
+ pdev->dev.archdata.iov_data = NULL;
+ kfree(iov);
+}
+
+void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
+{
+ if (pdev->is_virtfn) {
+ struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
+
+ /*
+ * VF PEs are single-device PEs so their pdev pointer needs to
+ * be set. The pdev doesn't exist when the PE is allocated (in
+ * (pcibios_sriov_enable()) so we fix it up here.
+ */
+ pe->pdev = pdev;
+ WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
+ } else if (pdev->is_physfn) {
+ /*
+ * For PFs adjust their allocated IOV resources to match what
+ * the PHB can support using it's M64 BAR table.
+ */
+ pnv_pci_ioda_fixup_iov_resources(pdev);
+ }
+}
+
+resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
+ int resno)
+{
+ resource_size_t align = pci_iov_resource_size(pdev, resno);
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+ struct pnv_iov_data *iov = pnv_iov_get(pdev);
+
+ /*
+ * iov can be null if we have an SR-IOV device with IOV BAR that can't
+ * be placed in the m64 space (i.e. The BAR is 32bit or non-prefetch).
+ * In that case we don't allow VFs to be enabled since one of their
+ * BARs would not be placed in the correct PE.
+ */
+ if (!iov)
+ return align;
+
+ /*
+ * If we're using single mode then we can just use the native VF BAR
+ * alignment. We validated that it's possible to use a single PE
+ * window above when we did the fixup.
+ */
+ if (iov->m64_single_mode[resno - PCI_IOV_RESOURCES])
+ return align;
+
+ /*
+ * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
+ * SR-IOV. While from hardware perspective, the range mapped by M64
+ * BAR should be size aligned.
+ *
+ * This function returns the total IOV BAR size if M64 BAR is in
+ * Shared PE mode or just VF BAR size if not.
+ * If the M64 BAR is in Single PE mode, return the VF BAR size or
+ * M64 segment size if IOV BAR size is less.
+ */
+ return phb->ioda.total_pe_num * align;
+}
+
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pnv_iov_data *iov;
+ struct pnv_phb *phb;
+ int window_id;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+ iov = pnv_iov_get(pdev);
+
+ for_each_set_bit(window_id, iov->used_m64_bar_mask, MAX_M64_BARS) {
+ opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ 0);
+
+ clear_bit(window_id, &phb->ioda.m64_bar_alloc);
+ }
+
+ return 0;
+}
+
+
+/*
+ * PHB3 and beyond support segmented windows. The window's address range
+ * is subdivided into phb->ioda.total_pe_num segments and there's a 1-1
+ * mapping between PEs and segments.
+ */
+static int64_t pnv_ioda_map_m64_segmented(struct pnv_phb *phb,
+ int window_id,
+ resource_size_t start,
+ resource_size_t size)
+{
+ int64_t rc;
+
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ start,
+ 0, /* unused */
+ size);
+ if (rc)
+ goto out;
+
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ OPAL_ENABLE_M64_SPLIT);
+out:
+ if (rc)
+ pr_err("Failed to map M64 window #%d: %lld\n", window_id, rc);
+
+ return rc;
+}
+
+static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb,
+ int pe_num,
+ int window_id,
+ resource_size_t start,
+ resource_size_t size)
+{
+ int64_t rc;
+
+ /*
+ * The API for setting up m64 mmio windows seems to have been designed
+ * with P7-IOC in mind. For that chip each M64 BAR (window) had a fixed
+ * split of 8 equally sized segments each of which could individually
+ * assigned to a PE.
+ *
+ * The problem with this is that the API doesn't have any way to
+ * communicate the number of segments we want on a BAR. This wasn't
+ * a problem for p7-ioc since you didn't have a choice, but the
+ * single PE windows added in PHB3 don't map cleanly to this API.
+ *
+ * As a result we've got this slightly awkward process where we
+ * call opal_pci_map_pe_mmio_window() to put the single in single
+ * PE mode, and set the PE for the window before setting the address
+ * bounds. We need to do it this way because the single PE windows
+ * for PHB3 have different alignment requirements on PHB3.
+ */
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe_num,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ 0);
+ if (rc)
+ goto out;
+
+ /*
+ * NB: In single PE mode the window needs to be aligned to 32MB
+ */
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ start,
+ 0, /* ignored by FW, m64 is 1-1 */
+ size);
+ if (rc)
+ goto out;
+
+ /*
+ * Now actually enable it. We specified the BAR should be in "non-split"
+ * mode so FW will validate that the BAR is in single PE mode.
+ */
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ OPAL_ENABLE_M64_NON_SPLIT);
+out:
+ if (rc)
+ pr_err("Error mapping single PE BAR\n");
+
+ return rc;
+}
+
+static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov)
+{
+ int win;
+
+ do {
+ win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+ phb->ioda.m64_bar_idx + 1, 0);
+
+ if (win >= phb->ioda.m64_bar_idx + 1)
+ return -1;
+ } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+ set_bit(win, iov->used_m64_bar_mask);
+
+ return win;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pnv_iov_data *iov;
+ struct pnv_phb *phb;
+ int win;
+ struct resource *res;
+ int i, j;
+ int64_t rc;
+ resource_size_t size, start;
+ int base_pe_num;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+ iov = pnv_iov_get(pdev);
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+
+ /* don't need single mode? map everything in one go! */
+ if (!iov->m64_single_mode[i]) {
+ win = pnv_pci_alloc_m64_bar(phb, iov);
+ if (win < 0)
+ goto m64_failed;
+
+ size = resource_size(res);
+ start = res->start;
+
+ rc = pnv_ioda_map_m64_segmented(phb, win, start, size);
+ if (rc)
+ goto m64_failed;
+
+ continue;
+ }
+
+ /* otherwise map each VF with single PE BARs */
+ size = pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i);
+ base_pe_num = iov->vf_pe_arr[0].pe_number;
+
+ for (j = 0; j < num_vfs; j++) {
+ win = pnv_pci_alloc_m64_bar(phb, iov);
+ if (win < 0)
+ goto m64_failed;
+
+ start = res->start + size * j;
+ rc = pnv_ioda_map_m64_single(phb, win,
+ base_pe_num + j,
+ start,
+ size);
+ if (rc)
+ goto m64_failed;
+ }
+ }
+ return 0;
+
+m64_failed:
+ pnv_pci_vf_release_m64(pdev, num_vfs);
+ return -EBUSY;
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
+{
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe, *pe_n;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+
+ if (!pdev->is_physfn)
+ return;
+
+ /* FIXME: Use pnv_ioda_release_pe()? */
+ list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+ if (pe->parent_dev != pdev)
+ continue;
+
+ pnv_pci_ioda2_release_pe_dma(pe);
+
+ /* Remove from list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_del(&pe->list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+ pnv_ioda_deconfigure_pe(phb, pe);
+
+ pnv_ioda_free_pe(pe);
+ }
+}
+
+static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+ struct resource *res, res2;
+ struct pnv_iov_data *iov;
+ resource_size_t size;
+ u16 num_vfs;
+ int i;
+
+ if (!dev->is_physfn)
+ return -EINVAL;
+ iov = pnv_iov_get(dev);
+
+ /*
+ * "offset" is in VFs. The M64 windows are sized so that when they
+ * are segmented, each segment is the same size as the IOV BAR.
+ * Each segment is in a separate PE, and the high order bits of the
+ * address are the PE number. Therefore, each VF's BAR is in a
+ * separate PE, and changing the IOV BAR start address changes the
+ * range of PEs the VFs are in.
+ */
+ num_vfs = iov->num_vfs;
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+ if (iov->m64_single_mode[i])
+ continue;
+
+ /*
+ * The actual IOV BAR range is determined by the start address
+ * and the actual size for num_vfs VFs BAR. This check is to
+ * make sure that after shifting, the range will not overlap
+ * with another device.
+ */
+ size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+ res2.flags = res->flags;
+ res2.start = res->start + (size * offset);
+ res2.end = res2.start + (size * num_vfs) - 1;
+
+ if (res2.end > res->end) {
+ dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
+ i, &res2, res, num_vfs, offset);
+ return -EBUSY;
+ }
+ }
+
+ /*
+ * Since M64 BAR shares segments among all possible 256 PEs,
+ * we have to shift the beginning of PF IOV BAR to make it start from
+ * the segment which belongs to the PE number assigned to the first VF.
+ * This creates a "hole" in the /proc/iomem which could be used for
+ * allocating other resources so we reserve this area below and
+ * release when IOV is released.
+ */
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+ if (iov->m64_single_mode[i])
+ continue;
+
+ size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+ res2 = *res;
+ res->start += size * offset;
+
+ dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
+ i, &res2, res, (offset > 0) ? "En" : "Dis",
+ num_vfs, offset);
+
+ if (offset < 0) {
+ devm_release_resource(&dev->dev, &iov->holes[i]);
+ memset(&iov->holes[i], 0, sizeof(iov->holes[i]));
+ }
+
+ pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+
+ if (offset > 0) {
+ iov->holes[i].start = res2.start;
+ iov->holes[i].end = res2.start + size * offset - 1;
+ iov->holes[i].flags = IORESOURCE_BUS;
+ iov->holes[i].name = "pnv_iov_reserved";
+ devm_request_resource(&dev->dev, res->parent,
+ &iov->holes[i]);
+ }
+ }
+ return 0;
+}
+
+static void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+ u16 num_vfs, base_pe;
+ struct pnv_iov_data *iov;
+
+ iov = pnv_iov_get(pdev);
+ if (WARN_ON(!iov))
+ return;
+
+ num_vfs = iov->num_vfs;
+ base_pe = iov->vf_pe_arr[0].pe_number;
+
+ /* Release VF PEs */
+ pnv_ioda_release_vf_PE(pdev);
+
+ /* Un-shift the IOV BARs if we need to */
+ if (iov->need_shift)
+ pnv_pci_vf_resource_shift(pdev, -base_pe);
+
+ /* Release M64 windows */
+ pnv_pci_vf_release_m64(pdev, num_vfs);
+}
+
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+ int pe_num;
+ u16 vf_index;
+ struct pnv_iov_data *iov;
+ struct pci_dn *pdn;
+
+ if (!pdev->is_physfn)
+ return;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+ pdn = pci_get_pdn(pdev);
+ iov = pnv_iov_get(pdev);
+
+ /* Reserve PE for each VF */
+ for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+ int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
+ int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
+ struct pci_dn *vf_pdn;
+
+ pe = &iov->vf_pe_arr[vf_index];
+ pe->phb = phb;
+ pe->flags = PNV_IODA_PE_VF;
+ pe->pbus = NULL;
+ pe->parent_dev = pdev;
+ pe->mve_number = -1;
+ pe->rid = (vf_bus << 8) | vf_devfn;
+
+ pe_num = pe->pe_number;
+ pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
+ pci_domain_nr(pdev->bus), pdev->bus->number,
+ PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ pnv_ioda_free_pe(pe);
+ pe->pdev = NULL;
+ continue;
+ }
+
+ /* Put PE to the list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_add_tail(&pe->list, &phb->ioda.pe_list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+ /* associate this pe to it's pdn */
+ list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
+ if (vf_pdn->busno == vf_bus &&
+ vf_pdn->devfn == vf_devfn) {
+ vf_pdn->pe_number = pe_num;
+ break;
+ }
+ }
+
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
+ }
+}
+
+static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pnv_ioda_pe *base_pe;
+ struct pnv_iov_data *iov;
+ struct pnv_phb *phb;
+ int ret;
+ u16 i;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+ iov = pnv_iov_get(pdev);
+
+ /*
+ * There's a calls to IODA2 PE setup code littered throughout. We could
+ * probably fix that, but we'd still have problems due to the
+ * restriction inherent on IODA1 PHBs.
+ *
+ * NB: We class IODA3 as IODA2 since they're very similar.
+ */
+ if (phb->type != PNV_PHB_IODA2) {
+ pci_err(pdev, "SR-IOV is not supported on this PHB\n");
+ return -ENXIO;
+ }
+
+ if (!iov) {
+ dev_info(&pdev->dev, "don't support this SRIOV device with non 64bit-prefetchable IOV BAR\n");
+ return -ENOSPC;
+ }
+
+ /* allocate a contiguous block of PEs for our VFs */
+ base_pe = pnv_ioda_alloc_pe(phb, num_vfs);
+ if (!base_pe) {
+ pci_err(pdev, "Unable to allocate PEs for %d VFs\n", num_vfs);
+ return -EBUSY;
+ }
+
+ iov->vf_pe_arr = base_pe;
+ iov->num_vfs = num_vfs;
+
+ /* Assign M64 window accordingly */
+ ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
+ if (ret) {
+ dev_info(&pdev->dev, "Not enough M64 window resources\n");
+ goto m64_failed;
+ }
+
+ /*
+ * When using one M64 BAR to map one IOV BAR, we need to shift
+ * the IOV BAR according to the PE# allocated to the VFs.
+ * Otherwise, the PE# for the VF will conflict with others.
+ */
+ if (iov->need_shift) {
+ ret = pnv_pci_vf_resource_shift(pdev, base_pe->pe_number);
+ if (ret)
+ goto shift_failed;
+ }
+
+ /* Setup VF PEs */
+ pnv_ioda_setup_vf_PE(pdev, num_vfs);
+
+ return 0;
+
+shift_failed:
+ pnv_pci_vf_release_m64(pdev, num_vfs);
+
+m64_failed:
+ for (i = 0; i < num_vfs; i++)
+ pnv_ioda_free_pe(&iov->vf_pe_arr[i]);
+
+ return ret;
+}
+
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
+{
+ pnv_pci_sriov_disable(pdev);
+
+ /* Release PCI data */
+ remove_sriov_vf_pdns(pdev);
+ return 0;
+}
+
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ /* Allocate PCI data */
+ add_sriov_vf_pdns(pdev);
+
+ return pnv_pci_sriov_enable(pdev, num_vfs);
+}
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index c0bea75ac27b..35f566aa0424 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -18,7 +18,6 @@
#include <asm/sections.h>
#include <asm/io.h>
-#include <asm/prom.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/msi_bitmap.h>
@@ -38,7 +37,7 @@ static DEFINE_MUTEX(tunnel_mutex);
int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
{
- struct device_node *parent = np;
+ struct device_node *node = np;
u32 bdfn;
u64 phbid;
int ret;
@@ -48,25 +47,29 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
return -ENXIO;
bdfn = ((bdfn & 0x00ffff00) >> 8);
- while ((parent = of_get_parent(parent))) {
- if (!PCI_DN(parent)) {
- of_node_put(parent);
+ for (node = np; node; node = of_get_parent(node)) {
+ if (!PCI_DN(node)) {
+ of_node_put(node);
break;
}
- if (!of_device_is_compatible(parent, "ibm,ioda2-phb") &&
- !of_device_is_compatible(parent, "ibm,ioda3-phb")) {
- of_node_put(parent);
+ if (!of_device_is_compatible(node, "ibm,ioda2-phb") &&
+ !of_device_is_compatible(node, "ibm,ioda3-phb") &&
+ !of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb")) {
+ of_node_put(node);
continue;
}
- ret = of_property_read_u64(parent, "ibm,opal-phbid", &phbid);
+ ret = of_property_read_u64(node, "ibm,opal-phbid", &phbid);
if (ret) {
- of_node_put(parent);
+ of_node_put(node);
return -ENXIO;
}
- *id = PCI_SLOT_ID(phbid, bdfn);
+ if (of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb"))
+ *id = PCI_PHB_SLOT_ID(phbid);
+ else
+ *id = PCI_SLOT_ID(phbid, bdfn);
return 0;
}
@@ -156,75 +159,6 @@ exit:
}
EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
-int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
- struct msi_desc *entry;
- struct msi_msg msg;
- int hwirq;
- unsigned int virq;
- int rc;
-
- if (WARN_ON(!phb) || !phb->msi_bmp.bitmap)
- return -ENODEV;
-
- if (pdev->no_64bit_msi && !phb->msi32_support)
- return -ENODEV;
-
- for_each_pci_msi_entry(entry, pdev) {
- if (!entry->msi_attrib.is_64 && !phb->msi32_support) {
- pr_warn("%s: Supports only 64-bit MSIs\n",
- pci_name(pdev));
- return -ENXIO;
- }
- hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1);
- if (hwirq < 0) {
- pr_warn("%s: Failed to find a free MSI\n",
- pci_name(pdev));
- return -ENOSPC;
- }
- virq = irq_create_mapping(NULL, phb->msi_base + hwirq);
- if (!virq) {
- pr_warn("%s: Failed to map MSI to linux irq\n",
- pci_name(pdev));
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
- return -ENOMEM;
- }
- rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq,
- virq, entry->msi_attrib.is_64, &msg);
- if (rc) {
- pr_warn("%s: Failed to setup MSI\n", pci_name(pdev));
- irq_dispose_mapping(virq);
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
- return rc;
- }
- irq_set_msi_desc(virq, entry);
- pci_write_msi_msg(virq, &msg);
- }
- return 0;
-}
-
-void pnv_teardown_msi_irqs(struct pci_dev *pdev)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
- struct msi_desc *entry;
- irq_hw_number_t hwirq;
-
- if (WARN_ON(!phb))
- return;
-
- for_each_pci_msi_entry(entry, pdev) {
- if (!entry->irq)
- continue;
- hwirq = virq_to_hw(entry->irq);
- irq_set_msi_desc(entry->irq, NULL);
- irq_dispose_mapping(entry->irq);
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1);
- }
-}
-
/* Nicely print the contents of the PE State Tables (PEST). */
static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size)
{
@@ -709,7 +643,7 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
return PCIBIOS_SUCCESSFUL;
}
-#if CONFIG_EEH
+#ifdef CONFIG_EEH
static bool pnv_pci_cfg_check(struct pci_dn *pdn)
{
struct eeh_dev *edev = NULL;
@@ -810,53 +744,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
return tbl;
}
-void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
-#ifdef CONFIG_PCI_IOV
- struct pnv_ioda_pe *pe;
- struct pci_dn *pdn;
-
- /* Fix the VF pdn PE number */
- if (pdev->is_virtfn) {
- pdn = pci_get_pdn(pdev);
- WARN_ON(pdn->pe_number != IODA_INVALID_PE);
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- if (pe->rid == ((pdev->bus->number << 8) |
- (pdev->devfn & 0xff))) {
- pdn->pe_number = pe->pe_number;
- pe->pdev = pdev;
- break;
- }
- }
- }
-#endif /* CONFIG_PCI_IOV */
-
- if (phb && phb->dma_dev_setup)
- phb->dma_dev_setup(phb, pdev);
-}
-
-void pnv_pci_dma_bus_setup(struct pci_bus *bus)
-{
- struct pci_controller *hose = bus->sysdata;
- struct pnv_phb *phb = hose->private_data;
- struct pnv_ioda_pe *pe;
-
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
- continue;
-
- if (!pe->pbus)
- continue;
-
- if (bus->number == ((pe->rid >> 8) & 0xFF)) {
- pe->pbus = bus;
- break;
- }
- }
-}
-
struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
{
struct pci_controller *hose = pci_bus_to_host(dev->bus);
@@ -867,10 +754,9 @@ EXPORT_SYMBOL(pnv_pci_get_phb_node);
int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
{
- __be64 val;
- struct pci_controller *hose;
- struct pnv_phb *phb;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
u64 tunnel_bar;
+ __be64 val;
int rc;
if (!opal_check_token(OPAL_PCI_GET_PBCQ_TUNNEL_BAR))
@@ -878,9 +764,6 @@ int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
if (!opal_check_token(OPAL_PCI_SET_PBCQ_TUNNEL_BAR))
return -ENXIO;
- hose = pci_bus_to_host(dev->bus);
- phb = hose->private_data;
-
mutex_lock(&tunnel_mutex);
rc = opal_pci_get_pbcq_tunnel_bar(phb->opal_id, &val);
if (rc != OPAL_SUCCESS) {
@@ -931,7 +814,7 @@ void pnv_pci_shutdown(void)
/* Fixup wrong class code in p7ioc and p8 root complex */
static void pnv_p7ioc_rc_quirk(struct pci_dev *dev)
{
- dev->class = PCI_CLASS_BRIDGE_PCI << 8;
+ dev->class = PCI_CLASS_BRIDGE_PCI_NORMAL;
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk);
@@ -962,11 +845,6 @@ void __init pnv_pci_init(void)
pcie_ports_disabled = true;
#endif
- /* Look for IODA IO-Hubs. */
- for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
- pnv_pci_init_ioda_hub(np);
- }
-
/* Look for ioda2 built-in PHB3's */
for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
pnv_pci_init_ioda2_phb(np);
@@ -975,17 +853,6 @@ void __init pnv_pci_init(void)
for_each_compatible_node(np, NULL, "ibm,ioda3-phb")
pnv_pci_init_ioda2_phb(np);
- /* Look for NPU PHBs */
- for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb")
- pnv_pci_init_npu_phb(np);
-
- /*
- * Look for NPU2 PHBs which we treat mostly as NPU PHBs with
- * the exception of TCE kill which requires an OPAL call.
- */
- for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb")
- pnv_pci_init_npu_phb(np);
-
/* Look for NPU2 OpenCAPI PHBs */
for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-opencapi-phb")
pnv_pci_init_npu2_opencapi_phb(np);
@@ -993,48 +860,3 @@ void __init pnv_pci_init(void)
/* Configure IOMMU DMA hooks */
set_pci_dma_ops(&dma_iommu_ops);
}
-
-static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct device *dev = data;
- struct pci_dev *pdev;
- struct pci_dn *pdn;
- struct pnv_ioda_pe *pe;
- struct pci_controller *hose;
- struct pnv_phb *phb;
-
- switch (action) {
- case BUS_NOTIFY_ADD_DEVICE:
- pdev = to_pci_dev(dev);
- pdn = pci_get_pdn(pdev);
- hose = pci_bus_to_host(pdev->bus);
- phb = hose->private_data;
-
- WARN_ON_ONCE(!phb);
- if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb)
- return 0;
-
- pe = &phb->ioda.pe_array[pdn->pe_number];
- if (!pe->table_group.group)
- return 0;
- iommu_add_device(&pe->table_group, dev);
- return 0;
- case BUS_NOTIFY_DEL_DEVICE:
- iommu_del_device(dev);
- return 0;
- default:
- return 0;
- }
-}
-
-static struct notifier_block pnv_tce_iommu_bus_nb = {
- .notifier_call = pnv_tce_iommu_bus_notifier,
-};
-
-static int __init pnv_tce_iommu_bus_notifier_init(void)
-{
- bus_register_notifier(&pci_bus_type, &pnv_tce_iommu_bus_nb);
- return 0;
-}
-machine_subsys_initcall_sync(powernv, pnv_tce_iommu_bus_notifier_init);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index f914f0b14e4e..957f2b47a3c0 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -10,10 +10,8 @@
struct pci_dn;
enum pnv_phb_type {
- PNV_PHB_IODA1 = 0,
- PNV_PHB_IODA2 = 1,
- PNV_PHB_NPU_NVLINK = 2,
- PNV_PHB_NPU_OCAPI = 3,
+ PNV_PHB_IODA2,
+ PNV_PHB_NPU_OCAPI,
};
/* Precise PHB model for error management */
@@ -21,8 +19,6 @@ enum pnv_phb_model {
PNV_PHB_MODEL_UNKNOWN,
PNV_PHB_MODEL_P7IOC,
PNV_PHB_MODEL_PHB3,
- PNV_PHB_MODEL_NPU,
- PNV_PHB_MODEL_NPU2,
};
#define PNV_PCI_DIAG_BUF_SIZE 8192
@@ -33,6 +29,24 @@ enum pnv_phb_model {
#define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
+/*
+ * A brief note on PNV_IODA_PE_BUS_ALL
+ *
+ * This is needed because of the behaviour of PCIe-to-PCI bridges. The PHB uses
+ * the Requester ID field of the PCIe request header to determine the device
+ * (and PE) that initiated a DMA. In legacy PCI individual memory read/write
+ * requests aren't tagged with the RID. To work around this the PCIe-to-PCI
+ * bridge will use (secondary_bus_no << 8) | 0x00 as the RID on the PCIe side.
+ *
+ * PCIe-to-X bridges have a similar issue even though PCI-X requests also have
+ * a RID in the transaction header. The PCIe-to-X bridge is permitted to "take
+ * ownership" of a transaction by a PCI-X device when forwarding it to the PCIe
+ * side of the bridge.
+ *
+ * To work around these problems we use the BUS_ALL flag since every subordinate
+ * bus of the bridge should go into the same PE.
+ */
+
/* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */
#define PNV_IODA_STOPPED_STATE 0x8000000000000000
@@ -63,13 +77,19 @@ struct pnv_ioda_pe {
/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
struct iommu_table_group table_group;
- struct npu_comp *npucomp;
/* 64-bit TCE bypass region */
bool tce_bypass_enabled;
uint64_t tce_bypass_base;
- /* MSIs. MVE index is identical for for 32 and 64 bit MSI
+ /*
+ * Used to track whether we've done DMA setup for this PE or not. We
+ * want to defer allocating TCE tables, etc until we've added a
+ * non-bridge device to the PE.
+ */
+ bool dma_setup_done;
+
+ /* MSIs. MVE index is identical for 32 and 64 bit MSI
* and -1 if not supported. (It's actually identical to the
* PE number)
*/
@@ -94,7 +114,6 @@ struct pnv_phb {
int flags;
void __iomem *regs;
u64 regs_phys;
- int initialized;
spinlock_t lock;
#ifdef CONFIG_DEBUG_FS
@@ -103,12 +122,7 @@ struct pnv_phb {
#endif
unsigned int msi_base;
- unsigned int msi32_support;
struct msi_bitmap msi_bmp;
- int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg);
- void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
int (*init_m64)(struct pnv_phb *phb);
int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
@@ -119,7 +133,6 @@ struct pnv_phb {
unsigned int total_pe_num;
unsigned int reserved_pe_idx;
unsigned int root_pe_idx;
- bool root_pe_populated;
/* 32-bit MMIO window */
unsigned int m32_size;
@@ -131,6 +144,7 @@ struct pnv_phb {
unsigned long m64_size;
unsigned long m64_segsize;
unsigned long m64_base;
+#define MAX_M64_BARS 64
unsigned long m64_bar_alloc;
/* IO ports */
@@ -148,10 +162,6 @@ struct pnv_phb {
unsigned int *m32_segmap;
unsigned int *io_segmap;
- /* DMA32 segment maps - IODA1 only */
- unsigned int dma32_count;
- unsigned int *dma32_segmap;
-
/* IRQ chip */
int irq_chip_init;
struct irq_chip irq_chip;
@@ -171,6 +181,89 @@ struct pnv_phb {
u8 *diag_data;
};
+
+/* IODA PE management */
+
+static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
+{
+ /*
+ * WARNING: We cannot rely on the resource flags. The Linux PCI
+ * allocation code sometimes decides to put a 64-bit prefetchable
+ * BAR in the 32-bit window, so we have to compare the addresses.
+ *
+ * For simplicity we only test resource start.
+ */
+ return (r->start >= phb->ioda.m64_base &&
+ r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
+}
+
+static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
+{
+ unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
+
+ return (resource_flags & flags) == flags;
+}
+
+int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+
+void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
+
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count);
+void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
+
+#ifdef CONFIG_PCI_IOV
+/*
+ * For SR-IOV we want to put each VF's MMIO resource in to a separate PE.
+ * This requires a bit of acrobatics with the MMIO -> PE configuration
+ * and this structure is used to keep track of it all.
+ */
+struct pnv_iov_data {
+ /* number of VFs enabled */
+ u16 num_vfs;
+
+ /* pointer to the array of VF PEs. num_vfs long*/
+ struct pnv_ioda_pe *vf_pe_arr;
+
+ /* Did we map the VF BAR with single-PE IODA BARs? */
+ bool m64_single_mode[PCI_SRIOV_NUM_BARS];
+
+ /*
+ * True if we're using any segmented windows. In that case we need
+ * shift the start of the IOV resource the segment corresponding to
+ * the allocated PE.
+ */
+ bool need_shift;
+
+ /*
+ * Bit mask used to track which m64 windows are used to map the
+ * SR-IOV BARs for this device.
+ */
+ DECLARE_BITMAP(used_m64_bar_mask, MAX_M64_BARS);
+
+ /*
+ * If we map the SR-IOV BARs with a segmented window then
+ * parts of that window will be "claimed" by other PEs.
+ *
+ * "holes" here is used to reserve the leading portion
+ * of the window that is used by other (non VF) PEs.
+ */
+ struct resource holes[PCI_SRIOV_NUM_BARS];
+};
+
+static inline struct pnv_iov_data *pnv_iov_get(struct pci_dev *pdev)
+{
+ return pdev->dev.archdata.iov_data;
+}
+
+void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev);
+resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno);
+
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev);
+#endif /* CONFIG_PCI_IOV */
+
extern struct pci_ops pnv_pci_ops;
void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
@@ -183,16 +276,11 @@ extern struct iommu_table *pnv_pci_table_alloc(int nid);
extern void pnv_pci_init_ioda_hub(struct device_node *np);
extern void pnv_pci_init_ioda2_phb(struct device_node *np);
-extern void pnv_pci_init_npu_phb(struct device_node *np);
extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
-extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr);
extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
-extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
-extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
-extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
-extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn);
extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
@@ -209,15 +297,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
#define pe_info(pe, fmt, ...) \
pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
-/* Nvlink functions */
-extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
-extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
-extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
-extern struct iommu_table_group *pnv_try_setup_npu_table_group(
- struct pnv_ioda_pe *pe);
-extern struct iommu_table_group *pnv_npu_compound_attach(
- struct pnv_ioda_pe *pe);
-
/* pci-ioda-tce.c */
#define POWERNV_IOMMU_DEFAULT_LEVELS 2
#define POWERNV_IOMMU_MAX_LEVELS 5
@@ -227,8 +306,7 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
unsigned long attrs);
extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction,
- bool alloc);
+ unsigned long *hpa, enum dma_data_direction *direction);
extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index,
bool alloc);
extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
@@ -247,4 +325,16 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift);
+extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
+
+static inline struct pnv_phb *pci_bus_to_pnvhb(struct pci_bus *bus)
+{
+ struct pci_controller *hose = bus->sysdata;
+
+ if (hose)
+ return hose->private_data;
+
+ return NULL;
+}
+
#endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 1aa51c4fa904..866efdc103fd 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -2,6 +2,13 @@
#ifndef _POWERNV_H
#define _POWERNV_H
+/*
+ * There's various hacks scattered throughout the generic powerpc arch code
+ * that needs to call into powernv platform stuff. The prototypes for those
+ * functions are in asm/powernv.h
+ */
+#include <asm/powernv.h>
+
#ifdef CONFIG_SMP
extern void pnv_smp_init(void);
#else
@@ -32,7 +39,9 @@ bool cpu_core_split_required(void);
struct memcons;
ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count);
-u32 memcons_get_size(struct memcons *mc);
-struct memcons *memcons_init(struct device_node *node, const char *mc_prop_name);
+u32 __init memcons_get_size(struct memcons *mc);
+struct memcons *__init memcons_init(struct device_node *node, const char *mc_prop_name);
+
+void pnv_rng_init(void);
#endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 8035caf6e297..196aa70fe043 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -17,33 +17,28 @@
#include <asm/prom.h>
#include <asm/machdep.h>
#include <asm/smp.h>
+#include "powernv.h"
#define DARN_ERR 0xFFFFFFFFFFFFFFFFul
-struct powernv_rng {
+struct pnv_rng {
void __iomem *regs;
void __iomem *regs_real;
unsigned long mask;
};
-static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
+static DEFINE_PER_CPU(struct pnv_rng *, pnv_rng);
-
-int powernv_hwrng_present(void)
-{
- struct powernv_rng *rng;
-
- rng = get_cpu_var(powernv_rng);
- put_cpu_var(rng);
- return rng != NULL;
-}
-
-static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
+static unsigned long rng_whiten(struct pnv_rng *rng, unsigned long val)
{
unsigned long parity;
/* Calculate the parity of the value */
- asm ("popcntd %0,%1" : "=r" (parity) : "r" (val));
+ asm (".machine push; \
+ .machine power7; \
+ popcntd %0,%1; \
+ .machine pop;"
+ : "=r" (parity) : "r" (val));
/* xor our value with the previous mask */
val ^= rng->mask;
@@ -54,18 +49,7 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
return val;
}
-int powernv_get_random_real_mode(unsigned long *v)
-{
- struct powernv_rng *rng;
-
- rng = raw_cpu_read(powernv_rng);
-
- *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
-
- return 1;
-}
-
-int powernv_get_random_darn(unsigned long *v)
+static int pnv_get_random_darn(unsigned long *v)
{
unsigned long val;
@@ -80,7 +64,7 @@ int powernv_get_random_darn(unsigned long *v)
return 1;
}
-static int initialise_darn(void)
+static int __init initialise_darn(void)
{
unsigned long val;
int i;
@@ -89,32 +73,31 @@ static int initialise_darn(void)
return -ENODEV;
for (i = 0; i < 10; i++) {
- if (powernv_get_random_darn(&val)) {
- ppc_md.get_random_seed = powernv_get_random_darn;
+ if (pnv_get_random_darn(&val)) {
+ ppc_md.get_random_seed = pnv_get_random_darn;
return 0;
}
}
-
- pr_warn("Unable to use DARN for get_random_seed()\n");
-
return -EIO;
}
-int powernv_get_random_long(unsigned long *v)
+int pnv_get_random_long(unsigned long *v)
{
- struct powernv_rng *rng;
-
- rng = get_cpu_var(powernv_rng);
-
- *v = rng_whiten(rng, in_be64(rng->regs));
-
- put_cpu_var(rng);
-
+ struct pnv_rng *rng;
+
+ if (mfmsr() & MSR_DR) {
+ rng = get_cpu_var(pnv_rng);
+ *v = rng_whiten(rng, in_be64(rng->regs));
+ put_cpu_var(rng);
+ } else {
+ rng = raw_cpu_read(pnv_rng);
+ *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
+ }
return 1;
}
-EXPORT_SYMBOL_GPL(powernv_get_random_long);
+EXPORT_SYMBOL_GPL(pnv_get_random_long);
-static __init void rng_init_per_cpu(struct powernv_rng *rng,
+static __init void rng_init_per_cpu(struct pnv_rng *rng,
struct device_node *dn)
{
int chip_id, cpu;
@@ -124,16 +107,16 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng,
pr_warn("No ibm,chip-id found for %pOF.\n", dn);
for_each_possible_cpu(cpu) {
- if (per_cpu(powernv_rng, cpu) == NULL ||
+ if (per_cpu(pnv_rng, cpu) == NULL ||
cpu_to_chip_id(cpu) == chip_id) {
- per_cpu(powernv_rng, cpu) = rng;
+ per_cpu(pnv_rng, cpu) = rng;
}
}
}
static __init int rng_create(struct device_node *dn)
{
- struct powernv_rng *rng;
+ struct pnv_rng *rng;
struct resource res;
unsigned long val;
@@ -159,32 +142,59 @@ static __init int rng_create(struct device_node *dn)
rng_init_per_cpu(rng, dn);
- pr_info_once("Registering arch random hook.\n");
-
- ppc_md.get_random_seed = powernv_get_random_long;
+ ppc_md.get_random_seed = pnv_get_random_long;
return 0;
}
-static __init int rng_init(void)
+static int __init pnv_get_random_long_early(unsigned long *v)
{
struct device_node *dn;
- int rc;
-
- for_each_compatible_node(dn, NULL, "ibm,power-rng") {
- rc = rng_create(dn);
- if (rc) {
- pr_err("Failed creating rng for %pOF (%d).\n",
- dn, rc);
- continue;
- }
- /* Create devices for hwrng driver */
- of_platform_device_create(dn, NULL, NULL);
- }
+ if (!slab_is_available())
+ return 0;
+
+ if (cmpxchg(&ppc_md.get_random_seed, pnv_get_random_long_early,
+ NULL) != pnv_get_random_long_early)
+ return 0;
+
+ for_each_compatible_node(dn, NULL, "ibm,power-rng")
+ rng_create(dn);
+
+ if (!ppc_md.get_random_seed)
+ return 0;
+ return ppc_md.get_random_seed(v);
+}
+
+void __init pnv_rng_init(void)
+{
+ struct device_node *dn;
+
+ /* Prefer darn over the rest. */
+ if (!initialise_darn())
+ return;
+
+ dn = of_find_compatible_node(NULL, NULL, "ibm,power-rng");
+ if (dn)
+ ppc_md.get_random_seed = pnv_get_random_long_early;
+
+ of_node_put(dn);
+}
- initialise_darn();
+static int __init pnv_rng_late_init(void)
+{
+ struct device_node *dn;
+ unsigned long v;
+
+ /* In case it wasn't called during init for some other reason. */
+ if (ppc_md.get_random_seed == pnv_get_random_long_early)
+ pnv_get_random_long_early(&v);
+
+ if (ppc_md.get_random_seed == pnv_get_random_long) {
+ for_each_compatible_node(dn, NULL, "ibm,power-rng")
+ of_platform_device_create(dn, NULL, NULL);
+ }
return 0;
}
-machine_subsys_initcall(powernv, rng_init);
+machine_subsys_initcall(powernv, pnv_rng_late_init);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 83498604d322..4dbb47ddbdcc 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -17,6 +17,7 @@
#include <linux/console.h>
#include <linux/delay.h>
#include <linux/irq.h>
+#include <linux/seq_buf.h>
#include <linux/seq_file.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
@@ -40,7 +41,7 @@
#include "powernv.h"
-static bool fw_feature_is(const char *state, const char *name,
+static bool __init fw_feature_is(const char *state, const char *name,
struct device_node *fw_features)
{
struct device_node *np;
@@ -55,7 +56,7 @@ static bool fw_feature_is(const char *state, const char *name,
return rc;
}
-static void init_fw_feat_flags(struct device_node *np)
+static void __init init_fw_feat_flags(struct device_node *np)
{
if (fw_feature_is("enabled", "inst-spec-barrier-ori31,31,0", np))
security_ftr_set(SEC_FTR_SPEC_BAR_ORI31);
@@ -96,9 +97,18 @@ static void init_fw_feat_flags(struct device_node *np)
if (fw_feature_is("disabled", "needs-spec-barrier-for-bound-checks", np))
security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR);
+
+ if (fw_feature_is("enabled", "no-need-l1d-flush-msr-pr-1-to-0", np))
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY);
+
+ if (fw_feature_is("enabled", "no-need-l1d-flush-kernel-on-user-access", np))
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS);
+
+ if (fw_feature_is("enabled", "no-need-store-drain-on-priv-state-switch", np))
+ security_ftr_clear(SEC_FTR_STF_BARRIER);
}
-static void pnv_setup_rfi_flush(void)
+static void __init pnv_setup_security_mitigations(void)
{
struct device_node *np, *fw_features;
enum l1d_flush_type type;
@@ -122,27 +132,68 @@ static void pnv_setup_rfi_flush(void)
type = L1D_FLUSH_ORI;
}
+ /*
+ * The issues addressed by the entry and uaccess flush don't affect P7
+ * or P8, so on bare metal disable them explicitly in case firmware does
+ * not include the features to disable them. POWER9 and newer processors
+ * should have the appropriate firmware flags.
+ */
+ if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p) ||
+ pvr_version_is(PVR_POWER8E) || pvr_version_is(PVR_POWER8NVL) ||
+ pvr_version_is(PVR_POWER8)) {
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY);
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS);
+ }
+
enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && \
(security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR) || \
security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV));
setup_rfi_flush(type, enable);
setup_count_cache_flush();
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_ENTRY);
+ setup_entry_flush(enable);
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_UACCESS);
+ setup_uaccess_flush(enable);
+
+ setup_stf_barrier();
+}
+
+static void __init pnv_check_guarded_cores(void)
+{
+ struct device_node *dn;
+ int bad_count = 0;
+
+ for_each_node_by_type(dn, "cpu") {
+ if (of_property_match_string(dn, "status", "bad") >= 0)
+ bad_count++;
+ }
+
+ if (bad_count) {
+ printk(" _ _______________\n");
+ pr_cont(" | | / \\\n");
+ pr_cont(" | | | WARNING! |\n");
+ pr_cont(" | | | |\n");
+ pr_cont(" | | | It looks like |\n");
+ pr_cont(" |_| | you have %*d |\n", 3, bad_count);
+ pr_cont(" _ | guarded cores |\n");
+ pr_cont(" (_) \\_______________/\n");
+ }
}
static void __init pnv_setup_arch(void)
{
set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
- pnv_setup_rfi_flush();
- setup_stf_barrier();
+ pnv_setup_security_mitigations();
/* Initialize SMP */
pnv_smp_init();
- /* Setup PCI */
- pnv_pci_init();
-
/* Setup RTC and NVRAM callbacks */
if (firmware_has_feature(FW_FEATURE_OPAL))
opal_nvram_init();
@@ -150,11 +201,36 @@ static void __init pnv_setup_arch(void)
/* Enable NAP mode */
powersave_nap = 1;
+ pnv_check_guarded_cores();
+
/* XXX PMCS */
+
+ pnv_rng_init();
+}
+
+static void __init pnv_add_hw_description(void)
+{
+ struct device_node *dn;
+ const char *s;
+
+ dn = of_find_node_by_path("/ibm,opal/firmware");
+ if (!dn)
+ return;
+
+ if (of_property_read_string(dn, "version", &s) == 0 ||
+ of_property_read_string(dn, "git-id", &s) == 0)
+ seq_buf_printf(&ppc_hw_desc, "opal:%s ", s);
+
+ if (of_property_read_string(dn, "mi-version", &s) == 0)
+ seq_buf_printf(&ppc_hw_desc, "mi:%s ", s);
+
+ of_node_put(dn);
}
static void __init pnv_init(void)
{
+ pnv_add_hw_description();
+
/*
* Initialize the LPC bus now so that legacy serial
* ports can be found on it
@@ -168,13 +244,20 @@ static void __init pnv_init(void)
#endif
add_preferred_console("hvc", 0, NULL);
+#ifdef CONFIG_PPC_64S_HASH_MMU
if (!radix_enabled()) {
+ size_t size = sizeof(struct slb_entry) * mmu_slb_size;
int i;
/* Allocate per cpu area to save old slb contents during MCE */
- for_each_possible_cpu(i)
- paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i));
+ for_each_possible_cpu(i) {
+ paca_ptrs[i]->mce_faulty_slbs =
+ memblock_alloc_node(size,
+ __alignof__(struct slb_entry),
+ cpu_to_node(i));
+ }
}
+#endif
}
static void __init pnv_init_IRQ(void)
@@ -229,10 +312,16 @@ static void __noreturn pnv_restart(char *cmd)
pnv_prepare_going_down();
do {
- if (!cmd)
+ if (!cmd || !strlen(cmd))
rc = opal_cec_reboot();
else if (strcmp(cmd, "full") == 0)
rc = opal_cec_reboot2(OPAL_REBOOT_FULL_IPL, NULL);
+ else if (strcmp(cmd, "mpipl") == 0)
+ rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, NULL);
+ else if (strcmp(cmd, "error") == 0)
+ rc = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, NULL);
+ else if (strcmp(cmd, "fast") == 0)
+ rc = opal_cec_reboot2(OPAL_REBOOT_FAST, NULL);
else
rc = OPAL_UNSUPPORTED;
@@ -390,10 +479,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
}
#endif /* CONFIG_KEXEC_CORE */
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
static unsigned long pnv_memory_block_size(void)
{
- return 256UL * 1024 * 1024;
+ return memory_block_size;
}
#endif
@@ -415,9 +504,6 @@ static void __init pnv_setup_machdep_opal(void)
static int __init pnv_probe(void)
{
- if (!of_machine_is_compatible("ibm,powernv"))
- return 0;
-
if (firmware_has_feature(FW_FEATURE_OPAL))
pnv_setup_machdep_opal();
@@ -481,20 +567,21 @@ static long pnv_machine_check_early(struct pt_regs *regs)
define_machine(powernv) {
.name = "PowerNV",
+ .compatible = "ibm,powernv",
.probe = pnv_probe,
.setup_arch = pnv_setup_arch,
.init_IRQ = pnv_init_IRQ,
.show_cpuinfo = pnv_show_cpuinfo,
.get_proc_freq = pnv_get_proc_freq,
+ .discover_phbs = pnv_pci_init,
.progress = pnv_progress,
.machine_shutdown = pnv_shutdown,
.power_save = NULL,
- .calibrate_decr = generic_calibrate_decr,
.machine_check_early = pnv_machine_check_early,
#ifdef CONFIG_KEXEC_CORE
.kexec_cpu_down = pnv_kexec_cpu_down,
#endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
.memory_block_size = pnv_memory_block_size,
#endif
};
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 13e251699346..8f14f0581a21 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -43,7 +43,7 @@
#include <asm/udbg.h>
#define DBG(fmt...) udbg_printf(fmt)
#else
-#define DBG(fmt...)
+#define DBG(fmt...) do { } while (0)
#endif
static void pnv_smp_setup_cpu(int cpu)
@@ -143,6 +143,9 @@ static int pnv_smp_cpu_disable(void)
xive_smp_disable_cpu();
else
xics_migrate_irqs_away();
+
+ cleanup_cpu_mmu_context();
+
return 0;
}
@@ -158,7 +161,7 @@ static void pnv_flush_interrupts(void)
}
}
-static void pnv_smp_cpu_kill_self(void)
+static void pnv_cpu_offline_self(void)
{
unsigned long srr1, unexpected_mask, wmask;
unsigned int cpu;
@@ -167,7 +170,6 @@ static void pnv_smp_cpu_kill_self(void)
/* Standard hot unplug procedure */
idle_task_exit();
- current->active_mm = NULL; /* for sanity */
cpu = smp_processor_id();
DBG("CPU%d offline\n", cpu);
generic_set_cpu_dead(cpu);
@@ -343,7 +345,7 @@ static void __init pnv_smp_probe(void)
}
}
-static int pnv_system_reset_exception(struct pt_regs *regs)
+noinstr static int pnv_system_reset_exception(struct pt_regs *regs)
{
if (smp_handle_nmi_ipi(regs))
return 1;
@@ -418,6 +420,7 @@ static struct smp_ops_t pnv_smp_ops = {
#ifdef CONFIG_HOTPLUG_CPU
.cpu_disable = pnv_smp_cpu_disable,
.cpu_die = generic_cpu_die,
+ .cpu_offline_self = pnv_cpu_offline_self,
#endif /* CONFIG_HOTPLUG_CPU */
};
@@ -431,8 +434,7 @@ void __init pnv_smp_init(void)
smp_ops = &pnv_smp_ops;
#ifdef CONFIG_HOTPLUG_CPU
- ppc_md.cpu_die = pnv_smp_cpu_kill_self;
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
crash_wake_offline = 1;
#endif
#endif
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
index 73207b53dc2b..393e747541fb 100644
--- a/arch/powerpc/platforms/powernv/subcore.c
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -20,6 +20,8 @@
#include <asm/opal.h>
#include <asm/smp.h>
+#include <trace/events/ipi.h>
+
#include "subcore.h"
#include "powernv.h"
@@ -169,6 +171,16 @@ static void update_hid_in_slw(u64 hid0)
}
}
+static inline void update_power8_hid0(unsigned long hid0)
+{
+ /*
+ * The HID0 update on Power8 should at the very least be
+ * preceded by a SYNC instruction followed by an ISYNC
+ * instruction
+ */
+ asm volatile("sync; mtspr %0,%1; isync":: "i"(SPRN_HID0), "r"(hid0));
+}
+
static void unsplit_core(void)
{
u64 hid0, mask;
@@ -405,13 +417,16 @@ static DEVICE_ATTR(subcores_per_core, 0644,
static int subcore_init(void)
{
+ struct device *dev_root;
unsigned pvr_ver;
+ int rc = 0;
pvr_ver = PVR_VER(mfspr(SPRN_PVR));
if (pvr_ver != PVR_POWER8 &&
pvr_ver != PVR_POWER8E &&
- pvr_ver != PVR_POWER8NVL)
+ pvr_ver != PVR_POWER8NVL &&
+ pvr_ver != PVR_HX_C2000)
return 0;
/*
@@ -425,7 +440,11 @@ static int subcore_init(void)
set_subcores_per_core(1);
- return device_create_file(cpu_subsys.dev_root,
- &dev_attr_subcores_per_core);
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ rc = device_create_file(dev_root, &dev_attr_subcores_per_core);
+ put_device(dev_root);
+ }
+ return rc;
}
machine_device_initcall(powernv, subcore_init);
diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h
index c8f574d1c04a..77feee8436d4 100644
--- a/arch/powerpc/platforms/powernv/subcore.h
+++ b/arch/powerpc/platforms/powernv/subcore.h
@@ -15,7 +15,7 @@
void split_core_secondary_loop(u8 *state);
extern void update_subcore_sibling_mask(void);
#else
-static inline void update_subcore_sibling_mask(void) { };
+static inline void update_subcore_sibling_mask(void) { }
#endif /* CONFIG_SMP */
#endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c
index e4a00ad06f9d..67c8c4b2d8b1 100644
--- a/arch/powerpc/platforms/powernv/ultravisor.c
+++ b/arch/powerpc/platforms/powernv/ultravisor.c
@@ -55,6 +55,7 @@ static int __init uv_init(void)
return -ENODEV;
uv_memcons = memcons_init(node, "memcons");
+ of_node_put(node);
if (!uv_memcons)
return -ENOENT;
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c
index 09e63df53c30..3ce89a4b54be 100644
--- a/arch/powerpc/platforms/powernv/vas-debug.c
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -9,6 +9,7 @@
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <asm/vas.h>
#include "vas.h"
static struct dentry *vas_debugfs;
@@ -28,7 +29,7 @@ static char *cop_to_str(int cop)
static int info_show(struct seq_file *s, void *private)
{
- struct vas_window *window = s->private;
+ struct pnv_vas_window *window = s->private;
mutex_lock(&vas_mutex);
@@ -36,9 +37,9 @@ static int info_show(struct seq_file *s, void *private)
if (!window->hvwc_map)
goto unlock;
- seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
+ seq_printf(s, "Type: %s, %s\n", cop_to_str(window->vas_win.cop),
window->tx_win ? "Send" : "Receive");
- seq_printf(s, "Pid : %d\n", window->pid);
+ seq_printf(s, "Pid : %d\n", vas_window_pid(&window->vas_win));
unlock:
mutex_unlock(&vas_mutex);
@@ -47,7 +48,7 @@ unlock:
DEFINE_SHOW_ATTRIBUTE(info);
-static inline void print_reg(struct seq_file *s, struct vas_window *win,
+static inline void print_reg(struct seq_file *s, struct pnv_vas_window *win,
char *name, u32 reg)
{
seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name);
@@ -55,7 +56,7 @@ static inline void print_reg(struct seq_file *s, struct vas_window *win,
static int hvwc_show(struct seq_file *s, void *private)
{
- struct vas_window *window = s->private;
+ struct pnv_vas_window *window = s->private;
mutex_lock(&vas_mutex);
@@ -103,8 +104,10 @@ unlock:
DEFINE_SHOW_ATTRIBUTE(hvwc);
-void vas_window_free_dbgdir(struct vas_window *window)
+void vas_window_free_dbgdir(struct pnv_vas_window *pnv_win)
{
+ struct vas_window *window = &pnv_win->vas_win;
+
if (window->dbgdir) {
debugfs_remove_recursive(window->dbgdir);
kfree(window->dbgname);
@@ -113,42 +116,24 @@ void vas_window_free_dbgdir(struct vas_window *window)
}
}
-void vas_window_init_dbgdir(struct vas_window *window)
+void vas_window_init_dbgdir(struct pnv_vas_window *window)
{
- struct dentry *f, *d;
+ struct dentry *d;
if (!window->vinst->dbgdir)
return;
- window->dbgname = kzalloc(16, GFP_KERNEL);
- if (!window->dbgname)
+ window->vas_win.dbgname = kzalloc(16, GFP_KERNEL);
+ if (!window->vas_win.dbgname)
return;
- snprintf(window->dbgname, 16, "w%d", window->winid);
-
- d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir);
- if (IS_ERR(d))
- goto free_name;
-
- window->dbgdir = d;
-
- f = debugfs_create_file("info", 0444, d, window, &info_fops);
- if (IS_ERR(f))
- goto remove_dir;
+ snprintf(window->vas_win.dbgname, 16, "w%d", window->vas_win.winid);
- f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops);
- if (IS_ERR(f))
- goto remove_dir;
+ d = debugfs_create_dir(window->vas_win.dbgname, window->vinst->dbgdir);
+ window->vas_win.dbgdir = d;
- return;
-
-remove_dir:
- debugfs_remove_recursive(window->dbgdir);
- window->dbgdir = NULL;
-
-free_name:
- kfree(window->dbgname);
- window->dbgname = NULL;
+ debugfs_create_file("info", 0444, d, window, &info_fops);
+ debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops);
}
void vas_instance_init_dbgdir(struct vas_instance *vinst)
@@ -156,8 +141,6 @@ void vas_instance_init_dbgdir(struct vas_instance *vinst)
struct dentry *d;
vas_init_dbgdir();
- if (!vas_debugfs)
- return;
vinst->dbgname = kzalloc(16, GFP_KERNEL);
if (!vinst->dbgname)
@@ -166,16 +149,7 @@ void vas_instance_init_dbgdir(struct vas_instance *vinst)
snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id);
d = debugfs_create_dir(vinst->dbgname, vas_debugfs);
- if (IS_ERR(d))
- goto free_name;
-
vinst->dbgdir = d;
- return;
-
-free_name:
- kfree(vinst->dbgname);
- vinst->dbgname = NULL;
- vinst->dbgdir = NULL;
}
/*
@@ -191,6 +165,4 @@ void vas_init_dbgdir(void)
first_time = false;
vas_debugfs = debugfs_create_dir("vas", NULL);
- if (IS_ERR(vas_debugfs))
- vas_debugfs = NULL;
}
diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c
new file mode 100644
index 000000000000..2b47d5a86328
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VAS Fault handling.
+ * Copyright 2019, IBM Corporation
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/sched/signal.h>
+#include <linux/mmu_context.h>
+#include <asm/icswx.h>
+
+#include "vas.h"
+
+/*
+ * The maximum FIFO size for fault window can be 8MB
+ * (VAS_RX_FIFO_SIZE_MAX). Using 4MB FIFO since each VAS
+ * instance will be having fault window.
+ * 8MB FIFO can be used if expects more faults for each VAS
+ * instance.
+ */
+#define VAS_FAULT_WIN_FIFO_SIZE (4 << 20)
+
+static void dump_fifo(struct vas_instance *vinst, void *entry)
+{
+ unsigned long *end = vinst->fault_fifo + vinst->fault_fifo_size;
+ unsigned long *fifo = entry;
+ int i;
+
+ pr_err("Fault fifo size %d, Max crbs %d\n", vinst->fault_fifo_size,
+ vinst->fault_fifo_size / CRB_SIZE);
+
+ /* Dump 10 CRB entries or until end of FIFO */
+ pr_err("Fault FIFO Dump:\n");
+ for (i = 0; i < 10*(CRB_SIZE/8) && fifo < end; i += 4, fifo += 4) {
+ pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n",
+ i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3));
+ }
+}
+
+/*
+ * Process valid CRBs in fault FIFO.
+ * NX process user space requests, return credit and update the status
+ * in CRB. If it encounters transalation error when accessing CRB or
+ * request buffers, raises interrupt on the CPU to handle the fault.
+ * It takes credit on fault window, updates nx_fault_stamp in CRB with
+ * the following information and pastes CRB in fault FIFO.
+ *
+ * pswid - window ID of the window on which the request is sent.
+ * fault_storage_addr - fault address
+ *
+ * It can raise a single interrupt for multiple faults. Expects OS to
+ * process all valid faults and return credit for each fault on user
+ * space and fault windows. This fault FIFO control will be done with
+ * credit mechanism. NX can continuously paste CRBs until credits are not
+ * available on fault window. Otherwise, returns with RMA_reject.
+ *
+ * Total credits available on fault window: FIFO_SIZE(4MB)/CRBS_SIZE(128)
+ *
+ */
+irqreturn_t vas_fault_thread_fn(int irq, void *data)
+{
+ struct vas_instance *vinst = data;
+ struct coprocessor_request_block *crb, *entry;
+ struct coprocessor_request_block buf;
+ struct pnv_vas_window *window;
+ unsigned long flags;
+ void *fifo;
+
+ crb = &buf;
+
+ /*
+ * VAS can interrupt with multiple page faults. So process all
+ * valid CRBs within fault FIFO until reaches invalid CRB.
+ * We use CCW[0] and pswid to validate CRBs:
+ *
+ * CCW[0] Reserved bit. When NX pastes CRB, CCW[0]=0
+ * OS sets this bit to 1 after reading CRB.
+ * pswid NX assigns window ID. Set pswid to -1 after
+ * reading CRB from fault FIFO.
+ *
+ * We exit this function if no valid CRBs are available to process.
+ * So acquire fault_lock and reset fifo_in_progress to 0 before
+ * exit.
+ * In case kernel receives another interrupt with different page
+ * fault, interrupt handler returns with IRQ_HANDLED if
+ * fifo_in_progress is set. Means these new faults will be
+ * handled by the current thread. Otherwise set fifo_in_progress
+ * and return IRQ_WAKE_THREAD to wake up thread.
+ */
+ while (true) {
+ spin_lock_irqsave(&vinst->fault_lock, flags);
+ /*
+ * Advance the fault fifo pointer to next CRB.
+ * Use CRB_SIZE rather than sizeof(*crb) since the latter is
+ * aligned to CRB_ALIGN (256) but the CRB written to by VAS is
+ * only CRB_SIZE in len.
+ */
+ fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE);
+ entry = fifo;
+
+ if ((entry->stamp.nx.pswid == cpu_to_be32(FIFO_INVALID_ENTRY))
+ || (entry->ccw & cpu_to_be32(CCW0_INVALID))) {
+ vinst->fifo_in_progress = 0;
+ spin_unlock_irqrestore(&vinst->fault_lock, flags);
+ return IRQ_HANDLED;
+ }
+
+ spin_unlock_irqrestore(&vinst->fault_lock, flags);
+ vinst->fault_crbs++;
+ if (vinst->fault_crbs == (vinst->fault_fifo_size / CRB_SIZE))
+ vinst->fault_crbs = 0;
+
+ memcpy(crb, fifo, CRB_SIZE);
+ entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY);
+ entry->ccw |= cpu_to_be32(CCW0_INVALID);
+ /*
+ * Return credit for the fault window.
+ */
+ vas_return_credit(vinst->fault_win, false);
+
+ pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d\n",
+ vinst->vas_id, vinst->fault_fifo, fifo,
+ vinst->fault_crbs);
+
+ vas_dump_crb(crb);
+ window = vas_pswid_to_window(vinst,
+ be32_to_cpu(crb->stamp.nx.pswid));
+
+ if (IS_ERR(window)) {
+ /*
+ * We got an interrupt about a specific send
+ * window but we can't find that window and we can't
+ * even clean it up (return credit on user space
+ * window).
+ * But we should not get here.
+ * TODO: Disable IRQ.
+ */
+ dump_fifo(vinst, (void *)entry);
+ pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d bad CRB?\n",
+ vinst->vas_id, vinst->fault_fifo, fifo,
+ be32_to_cpu(crb->stamp.nx.pswid),
+ vinst->fault_crbs);
+
+ WARN_ON_ONCE(1);
+ } else {
+ /*
+ * NX sees faults only with user space windows.
+ */
+ if (window->user_win)
+ vas_update_csb(crb, &window->vas_win.task_ref);
+ else
+ WARN_ON_ONCE(!window->user_win);
+
+ /*
+ * Return credit for send window after processing
+ * fault CRB.
+ */
+ vas_return_credit(window, true);
+ }
+ }
+}
+
+irqreturn_t vas_fault_handler(int irq, void *dev_id)
+{
+ struct vas_instance *vinst = dev_id;
+ irqreturn_t ret = IRQ_WAKE_THREAD;
+ unsigned long flags;
+
+ /*
+ * NX can generate an interrupt for multiple faults. So the
+ * fault handler thread process all CRBs until finds invalid
+ * entry. In case if NX sees continuous faults, it is possible
+ * that the thread function entered with the first interrupt
+ * can execute and process all valid CRBs.
+ * So wake up thread only if the fault thread is not in progress.
+ */
+ spin_lock_irqsave(&vinst->fault_lock, flags);
+
+ if (vinst->fifo_in_progress)
+ ret = IRQ_HANDLED;
+ else
+ vinst->fifo_in_progress = 1;
+
+ spin_unlock_irqrestore(&vinst->fault_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Fault window is opened per VAS instance. NX pastes fault CRB in fault
+ * FIFO upon page faults.
+ */
+int vas_setup_fault_window(struct vas_instance *vinst)
+{
+ struct vas_rx_win_attr attr;
+ struct vas_window *win;
+
+ vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE;
+ vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL);
+ if (!vinst->fault_fifo) {
+ pr_err("Unable to alloc %d bytes for fault_fifo\n",
+ vinst->fault_fifo_size);
+ return -ENOMEM;
+ }
+
+ /*
+ * Invalidate all CRB entries. NX pastes valid entry for each fault.
+ */
+ memset(vinst->fault_fifo, FIFO_INVALID_ENTRY, vinst->fault_fifo_size);
+ vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT);
+
+ attr.rx_fifo_size = vinst->fault_fifo_size;
+ attr.rx_fifo = __pa(vinst->fault_fifo);
+
+ /*
+ * Max creds is based on number of CRBs can fit in the FIFO.
+ * (fault_fifo_size/CRB_SIZE). If 8MB FIFO is used, max creds
+ * will be 0xffff since the receive creds field is 16bits wide.
+ */
+ attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE;
+ attr.lnotify_lpid = 0;
+ attr.lnotify_pid = mfspr(SPRN_PID);
+ attr.lnotify_tid = mfspr(SPRN_PID);
+
+ win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT, &attr);
+ if (IS_ERR(win)) {
+ pr_err("VAS: Error %ld opening FaultWin\n", PTR_ERR(win));
+ kfree(vinst->fault_fifo);
+ return PTR_ERR(win);
+ }
+
+ vinst->fault_win = container_of(win, struct pnv_vas_window, vas_win);
+
+ pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n",
+ vinst->fault_win->vas_win.winid, attr.lnotify_lpid,
+ attr.lnotify_pid, attr.lnotify_tid);
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/vas-trace.h b/arch/powerpc/platforms/powernv/vas-trace.h
index a449b9f0c12e..ca2e08f2ddc0 100644
--- a/arch/powerpc/platforms/powernv/vas-trace.h
+++ b/arch/powerpc/platforms/powernv/vas-trace.h
@@ -80,7 +80,7 @@ TRACE_EVENT( vas_tx_win_open,
TRACE_EVENT( vas_paste_crb,
TP_PROTO(struct task_struct *tsk,
- struct vas_window *win),
+ struct pnv_vas_window *win),
TP_ARGS(tsk, win),
@@ -96,7 +96,7 @@ TRACE_EVENT( vas_paste_crb,
TP_fast_assign(
__entry->pid = tsk->pid;
__entry->vasid = win->vinst->vas_id;
- __entry->winid = win->winid;
+ __entry->winid = win->vas_win.winid;
__entry->paste_kaddr = (unsigned long)win->paste_kaddr
),
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 0c0d27d17976..b664838008c1 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -12,8 +12,11 @@
#include <linux/log2.h>
#include <linux/rcupdate.h>
#include <linux/cred.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
#include <asm/switch_to.h>
#include <asm/ppc-opcode.h>
+#include <asm/vas.h>
#include "vas.h"
#include "copy-paste.h"
@@ -24,14 +27,14 @@
* Compute the paste address region for the window @window using the
* ->paste_base_addr and ->paste_win_id_shift we got from device tree.
*/
-static void compute_paste_address(struct vas_window *window, u64 *addr, int *len)
+void vas_win_paste_addr(struct pnv_vas_window *window, u64 *addr, int *len)
{
int winid;
u64 base, shift;
base = window->vinst->paste_base_addr;
shift = window->vinst->paste_win_id_shift;
- winid = window->winid;
+ winid = window->vas_win.winid;
*addr = base + (winid << shift);
if (len)
@@ -40,23 +43,23 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len
pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
}
-static inline void get_hvwc_mmio_bar(struct vas_window *window,
+static inline void get_hvwc_mmio_bar(struct pnv_vas_window *window,
u64 *start, int *len)
{
u64 pbaddr;
pbaddr = window->vinst->hvwc_bar_start;
- *start = pbaddr + window->winid * VAS_HVWC_SIZE;
+ *start = pbaddr + window->vas_win.winid * VAS_HVWC_SIZE;
*len = VAS_HVWC_SIZE;
}
-static inline void get_uwc_mmio_bar(struct vas_window *window,
+static inline void get_uwc_mmio_bar(struct pnv_vas_window *window,
u64 *start, int *len)
{
u64 pbaddr;
pbaddr = window->vinst->uwc_bar_start;
- *start = pbaddr + window->winid * VAS_UWC_SIZE;
+ *start = pbaddr + window->vas_win.winid * VAS_UWC_SIZE;
*len = VAS_UWC_SIZE;
}
@@ -65,7 +68,7 @@ static inline void get_uwc_mmio_bar(struct vas_window *window,
* space. Unlike MMIO regions (map_mmio_region() below), paste region must
* be mapped cache-able and is only applicable to send windows.
*/
-static void *map_paste_region(struct vas_window *txwin)
+static void *map_paste_region(struct pnv_vas_window *txwin)
{
int len;
void *map;
@@ -73,12 +76,12 @@ static void *map_paste_region(struct vas_window *txwin)
u64 start;
name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id,
- txwin->winid);
+ txwin->vas_win.winid);
if (!name)
goto free_name;
txwin->paste_addr_name = name;
- compute_paste_address(txwin, &start, &len);
+ vas_win_paste_addr(txwin, &start, &len);
if (!request_mem_region(start, len, name)) {
pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
@@ -130,13 +133,13 @@ static void unmap_region(void *addr, u64 start, int len)
/*
* Unmap the paste address region for a window.
*/
-static void unmap_paste_region(struct vas_window *window)
+static void unmap_paste_region(struct pnv_vas_window *window)
{
int len;
u64 busaddr_start;
if (window->paste_kaddr) {
- compute_paste_address(window, &busaddr_start, &len);
+ vas_win_paste_addr(window, &busaddr_start, &len);
unmap_region(window->paste_kaddr, busaddr_start, len);
window->paste_kaddr = NULL;
kfree(window->paste_addr_name);
@@ -151,7 +154,7 @@ static void unmap_paste_region(struct vas_window *window)
* path, just minimize the time we hold the mutex for now. We can add
* a per-instance mutex later if necessary.
*/
-static void unmap_winctx_mmio_bars(struct vas_window *window)
+static void unmap_winctx_mmio_bars(struct pnv_vas_window *window)
{
int len;
void *uwc_map;
@@ -184,7 +187,7 @@ static void unmap_winctx_mmio_bars(struct vas_window *window)
* OS/User Window Context (UWC) MMIO Base Address Region for the given window.
* Map these bus addresses and save the mapped kernel addresses in @window.
*/
-int map_winctx_mmio_bars(struct vas_window *window)
+static int map_winctx_mmio_bars(struct pnv_vas_window *window)
{
int len;
u64 start;
@@ -212,7 +215,7 @@ int map_winctx_mmio_bars(struct vas_window *window)
* registers are not sequential. And, we can only write to offsets
* with valid registers.
*/
-void reset_window_regs(struct vas_window *window)
+static void reset_window_regs(struct pnv_vas_window *window)
{
write_hvwc_reg(window, VREG(LPID), 0ULL);
write_hvwc_reg(window, VREG(PID), 0ULL);
@@ -268,7 +271,7 @@ void reset_window_regs(struct vas_window *window)
* want to add fields to vas_winctx and move the initialization to
* init_vas_winctx_regs().
*/
-static void init_xlate_regs(struct vas_window *window, bool user_win)
+static void init_xlate_regs(struct pnv_vas_window *window, bool user_win)
{
u64 lpcr, val;
@@ -333,7 +336,7 @@ static void init_xlate_regs(struct vas_window *window, bool user_win)
*
* TODO: Reserved (aka dedicated) send buffers are not supported yet.
*/
-static void init_rsvd_tx_buf_count(struct vas_window *txwin,
+static void init_rsvd_tx_buf_count(struct pnv_vas_window *txwin,
struct vas_winctx *winctx)
{
write_hvwc_reg(txwin, VREG(TX_RSVD_BUF_COUNT), 0ULL);
@@ -355,7 +358,8 @@ static void init_rsvd_tx_buf_count(struct vas_window *txwin,
* as a one-time task? That could work for NX but what about other
* receivers? Let the receivers tell us the rx-fifo buffers for now.
*/
-int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
+static void init_winctx_regs(struct pnv_vas_window *window,
+ struct vas_winctx *winctx)
{
u64 val;
int fifo_size;
@@ -373,7 +377,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
init_xlate_regs(window, winctx->user_win);
val = 0ULL;
- val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0);
+ val = SET_FIELD(VAS_FAULT_TX_WIN, val, winctx->fault_win_id);
write_hvwc_reg(window, VREG(FAULT_TX_WIN), val);
/* In PowerNV, interrupts go to HV. */
@@ -400,7 +404,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
*
* See also: Design note in function header.
*/
- val = __pa(winctx->rx_fifo);
+ val = winctx->rx_fifo;
val = SET_FIELD(VAS_PAGE_MIGRATION_SELECT, val, 0);
write_hvwc_reg(window, VREG(LFIFO_BAR), val);
@@ -497,8 +501,6 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
val = SET_FIELD(VAS_WINCTL_NX_WIN, val, winctx->nx_win);
val = SET_FIELD(VAS_WINCTL_OPEN, val, 1);
write_hvwc_reg(window, VREG(WINCTL), val);
-
- return 0;
}
static void vas_release_window_id(struct ida *ida, int winid)
@@ -518,10 +520,10 @@ static int vas_assign_window_id(struct ida *ida)
return winid;
}
-static void vas_window_free(struct vas_window *window)
+static void vas_window_free(struct pnv_vas_window *window)
{
- int winid = window->winid;
struct vas_instance *vinst = window->vinst;
+ int winid = window->vas_win.winid;
unmap_winctx_mmio_bars(window);
@@ -532,10 +534,10 @@ static void vas_window_free(struct vas_window *window)
vas_release_window_id(&vinst->ida, winid);
}
-static struct vas_window *vas_window_alloc(struct vas_instance *vinst)
+static struct pnv_vas_window *vas_window_alloc(struct vas_instance *vinst)
{
int winid;
- struct vas_window *window;
+ struct pnv_vas_window *window;
winid = vas_assign_window_id(&vinst->ida);
if (winid < 0)
@@ -546,7 +548,7 @@ static struct vas_window *vas_window_alloc(struct vas_instance *vinst)
goto out_free;
window->vinst = vinst;
- window->winid = winid;
+ window->vas_win.winid = winid;
if (map_winctx_mmio_bars(window))
goto out_free;
@@ -561,7 +563,7 @@ out_free:
return ERR_PTR(-ENOMEM);
}
-static void put_rx_win(struct vas_window *rxwin)
+static void put_rx_win(struct pnv_vas_window *rxwin)
{
/* Better not be a send window! */
WARN_ON_ONCE(rxwin->tx_win);
@@ -577,10 +579,11 @@ static void put_rx_win(struct vas_window *rxwin)
*
* NOTE: We access ->windows[] table and assume that vinst->mutex is held.
*/
-static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
+static struct pnv_vas_window *get_user_rxwin(struct vas_instance *vinst,
+ u32 pswid)
{
int vasid, winid;
- struct vas_window *rxwin;
+ struct pnv_vas_window *rxwin;
decode_pswid(pswid, &vasid, &winid);
@@ -589,7 +592,7 @@ static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
rxwin = vinst->windows[winid];
- if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW)
+ if (!rxwin || rxwin->tx_win || rxwin->vas_win.cop != VAS_COP_TYPE_FTW)
return ERR_PTR(-EINVAL);
return rxwin;
@@ -601,10 +604,10 @@ static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
*
* See also function header of set_vinst_win().
*/
-static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
+static struct pnv_vas_window *get_vinst_rxwin(struct vas_instance *vinst,
enum vas_cop_type cop, u32 pswid)
{
- struct vas_window *rxwin;
+ struct pnv_vas_window *rxwin;
mutex_lock(&vinst->mutex);
@@ -637,9 +640,9 @@ static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
* window, we also save the window in the ->rxwin[] table.
*/
static void set_vinst_win(struct vas_instance *vinst,
- struct vas_window *window)
+ struct pnv_vas_window *window)
{
- int id = window->winid;
+ int id = window->vas_win.winid;
mutex_lock(&vinst->mutex);
@@ -648,8 +651,8 @@ static void set_vinst_win(struct vas_instance *vinst,
* unless its a user (FTW) window.
*/
if (!window->user_win && !window->tx_win) {
- WARN_ON_ONCE(vinst->rxwin[window->cop]);
- vinst->rxwin[window->cop] = window;
+ WARN_ON_ONCE(vinst->rxwin[window->vas_win.cop]);
+ vinst->rxwin[window->vas_win.cop] = window;
}
WARN_ON_ONCE(vinst->windows[id] != NULL);
@@ -662,16 +665,16 @@ static void set_vinst_win(struct vas_instance *vinst,
* Clear this window from the table(s) of windows for this VAS instance.
* See also function header of set_vinst_win().
*/
-static void clear_vinst_win(struct vas_window *window)
+static void clear_vinst_win(struct pnv_vas_window *window)
{
- int id = window->winid;
+ int id = window->vas_win.winid;
struct vas_instance *vinst = window->vinst;
mutex_lock(&vinst->mutex);
if (!window->user_win && !window->tx_win) {
- WARN_ON_ONCE(!vinst->rxwin[window->cop]);
- vinst->rxwin[window->cop] = NULL;
+ WARN_ON_ONCE(!vinst->rxwin[window->vas_win.cop]);
+ vinst->rxwin[window->vas_win.cop] = NULL;
}
WARN_ON_ONCE(vinst->windows[id] != window);
@@ -680,7 +683,7 @@ static void clear_vinst_win(struct vas_window *window)
mutex_unlock(&vinst->mutex);
}
-static void init_winctx_for_rxwin(struct vas_window *rxwin,
+static void init_winctx_for_rxwin(struct pnv_vas_window *rxwin,
struct vas_rx_win_attr *rxattr,
struct vas_winctx *winctx)
{
@@ -701,7 +704,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
winctx->rx_fifo = rxattr->rx_fifo;
winctx->rx_fifo_size = rxattr->rx_fifo_size;
- winctx->wcreds_max = rxwin->wcreds_max;
+ winctx->wcreds_max = rxwin->vas_win.wcreds_max;
winctx->pin_win = rxattr->pin_win;
winctx->nx_win = rxattr->nx_win;
@@ -736,7 +739,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
*/
winctx->fifo_disable = true;
winctx->intr_disable = true;
- winctx->rx_fifo = NULL;
+ winctx->rx_fifo = 0;
}
winctx->lnotify_lpid = rxattr->lnotify_lpid;
@@ -748,6 +751,8 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
winctx->min_scope = VAS_SCOPE_LOCAL;
winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+ if (rxwin->vinst->virq)
+ winctx->irq_port = rxwin->vinst->irq_port;
}
static bool rx_win_args_valid(enum vas_cop_type cop,
@@ -768,7 +773,7 @@ static bool rx_win_args_valid(enum vas_cop_type cop,
if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
return false;
- if (attr->wcreds_max > VAS_RX_WCREDS_MAX)
+ if (!attr->wcreds_max)
return false;
if (attr->nx_win) {
@@ -813,7 +818,8 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop)
{
memset(rxattr, 0, sizeof(*rxattr));
- if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+ if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+ cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
rxattr->pin_win = true;
rxattr->nx_win = true;
rxattr->fault_win = false;
@@ -827,9 +833,9 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop)
rxattr->fault_win = true;
rxattr->notify_disable = true;
rxattr->rx_wcred_mode = true;
- rxattr->tx_wcred_mode = true;
rxattr->rx_win_ord_mode = true;
- rxattr->tx_win_ord_mode = true;
+ rxattr->rej_no_credit = true;
+ rxattr->tc_mode = VAS_THRESH_DISABLED;
} else if (cop == VAS_COP_TYPE_FTW) {
rxattr->user_win = true;
rxattr->intr_disable = true;
@@ -847,7 +853,7 @@ EXPORT_SYMBOL_GPL(vas_init_rx_win_attr);
struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
struct vas_rx_win_attr *rxattr)
{
- struct vas_window *rxwin;
+ struct pnv_vas_window *rxwin;
struct vas_winctx winctx;
struct vas_instance *vinst;
@@ -866,23 +872,21 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
rxwin = vas_window_alloc(vinst);
if (IS_ERR(rxwin)) {
pr_devel("Unable to allocate memory for Rx window\n");
- return rxwin;
+ return (struct vas_window *)rxwin;
}
rxwin->tx_win = false;
rxwin->nx_win = rxattr->nx_win;
rxwin->user_win = rxattr->user_win;
- rxwin->cop = cop;
- rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
- if (rxattr->user_win)
- rxwin->pid = task_pid_vnr(current);
+ rxwin->vas_win.cop = cop;
+ rxwin->vas_win.wcreds_max = rxattr->wcreds_max;
init_winctx_for_rxwin(rxwin, rxattr, &winctx);
init_winctx_regs(rxwin, &winctx);
set_vinst_win(vinst, rxwin);
- return rxwin;
+ return &rxwin->vas_win;
}
EXPORT_SYMBOL_GPL(vas_rx_win_open);
@@ -890,7 +894,8 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop)
{
memset(txattr, 0, sizeof(*txattr));
- if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+ if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+ cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
txattr->rej_no_credit = false;
txattr->rx_wcred_mode = true;
txattr->tx_wcred_mode = true;
@@ -902,7 +907,7 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop)
}
EXPORT_SYMBOL_GPL(vas_init_tx_win_attr);
-static void init_winctx_for_txwin(struct vas_window *txwin,
+static void init_winctx_for_txwin(struct pnv_vas_window *txwin,
struct vas_tx_win_attr *txattr,
struct vas_winctx *winctx)
{
@@ -923,7 +928,7 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
*/
memset(winctx, 0, sizeof(struct vas_winctx));
- winctx->wcreds_max = txwin->wcreds_max;
+ winctx->wcreds_max = txwin->vas_win.wcreds_max;
winctx->user_win = txattr->user_win;
winctx->nx_win = txwin->rxwin->nx_win;
@@ -943,14 +948,24 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
winctx->lpid = txattr->lpid;
winctx->pidr = txattr->pidr;
- winctx->rx_win_id = txwin->rxwin->winid;
+ winctx->rx_win_id = txwin->rxwin->vas_win.winid;
+ /*
+ * IRQ and fault window setup is successful. Set fault window
+ * for the send window so that ready to handle faults.
+ */
+ if (txwin->vinst->virq)
+ winctx->fault_win_id = txwin->vinst->fault_win->vas_win.winid;
winctx->dma_type = VAS_DMA_TYPE_INJECT;
winctx->tc_mode = txattr->tc_mode;
winctx->min_scope = VAS_SCOPE_LOCAL;
winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+ if (txwin->vinst->virq)
+ winctx->irq_port = txwin->vinst->irq_port;
- winctx->pswid = 0;
+ winctx->pswid = txattr->pswid ? txattr->pswid :
+ encode_pswid(txwin->vinst->vas_id,
+ txwin->vas_win.winid);
}
static bool tx_win_args_valid(enum vas_cop_type cop,
@@ -965,9 +980,14 @@ static bool tx_win_args_valid(enum vas_cop_type cop,
if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
return false;
- if (attr->user_win &&
- (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
- return false;
+ if (attr->user_win) {
+ if (attr->rsvd_txbuf_count)
+ return false;
+
+ if (cop != VAS_COP_TYPE_FTW && cop != VAS_COP_TYPE_GZIP &&
+ cop != VAS_COP_TYPE_GZIP_HIPRI)
+ return false;
+ }
return true;
}
@@ -976,8 +996,8 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
struct vas_tx_win_attr *attr)
{
int rc;
- struct vas_window *txwin;
- struct vas_window *rxwin;
+ struct pnv_vas_window *txwin;
+ struct pnv_vas_window *rxwin;
struct vas_winctx winctx;
struct vas_instance *vinst;
@@ -1003,7 +1023,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
rxwin = get_vinst_rxwin(vinst, cop, attr->pswid);
if (IS_ERR(rxwin)) {
pr_devel("No RxWin for vasid %d, cop %d\n", vasid, cop);
- return rxwin;
+ return (struct vas_window *)rxwin;
}
txwin = vas_window_alloc(vinst);
@@ -1012,13 +1032,12 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
goto put_rxwin;
}
- txwin->cop = cop;
+ txwin->vas_win.cop = cop;
txwin->tx_win = 1;
txwin->rxwin = rxwin;
txwin->nx_win = txwin->rxwin->nx_win;
- txwin->pid = attr->pid;
txwin->user_win = attr->user_win;
- txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+ txwin->vas_win.wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
init_winctx_for_txwin(txwin, attr, &winctx);
@@ -1040,17 +1059,24 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
}
} else {
/*
- * A user mapping must ensure that context switch issues
- * CP_ABORT for this thread.
+ * Interrupt hanlder or fault window setup failed. Means
+ * NX can not generate fault for page fault. So not
+ * opening for user space tx window.
*/
- rc = set_thread_uses_vas();
+ if (!vinst->virq) {
+ rc = -ENODEV;
+ goto free_window;
+ }
+ rc = get_vas_user_win_ref(&txwin->vas_win.task_ref);
if (rc)
goto free_window;
+
+ vas_user_win_add_mm_context(&txwin->vas_win.task_ref);
}
set_vinst_win(vinst, txwin);
- return txwin;
+ return &txwin->vas_win;
free_window:
vas_window_free(txwin);
@@ -1069,12 +1095,14 @@ int vas_copy_crb(void *crb, int offset)
EXPORT_SYMBOL_GPL(vas_copy_crb);
#define RMA_LSMP_REPORT_ENABLE PPC_BIT(53)
-int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
+int vas_paste_crb(struct vas_window *vwin, int offset, bool re)
{
+ struct pnv_vas_window *txwin;
int rc;
void *addr;
uint64_t val;
+ txwin = container_of(vwin, struct pnv_vas_window, vas_win);
trace_vas_paste_crb(current, txwin);
/*
@@ -1104,7 +1132,7 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
else
rc = -EINVAL;
- pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid,
+ pr_debug("Txwin #%d: Msg count %llu\n", txwin->vas_win.winid,
read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
return rc;
@@ -1124,10 +1152,11 @@ EXPORT_SYMBOL_GPL(vas_paste_crb);
* user space. (NX-842 driver waits for CSB and Fast thread-wakeup
* doesn't use credit checking).
*/
-static void poll_window_credits(struct vas_window *window)
+static void poll_window_credits(struct pnv_vas_window *window)
{
u64 val;
int creds, mode;
+ int count = 0;
val = read_hvwc_reg(window, VREG(WINCTL));
if (window->tx_win)
@@ -1146,10 +1175,28 @@ retry:
creds = GET_FIELD(VAS_LRX_WCRED, val);
}
- if (creds < window->wcreds_max) {
+ /*
+ * Takes around few milliseconds to complete all pending requests
+ * and return credits.
+ * TODO: Scan fault FIFO and invalidate CRBs points to this window
+ * and issue CRB Kill to stop all pending requests. Need only
+ * if there is a bug in NX or fault handling in kernel.
+ */
+ if (creds < window->vas_win.wcreds_max) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(msecs_to_jiffies(10));
+ count++;
+ /*
+ * Process can not close send window until all credits are
+ * returned.
+ */
+ if (!(count % 1000))
+ pr_warn_ratelimited("VAS: pid %d stuck. Waiting for credits returned for Window(%d). creds %d, Retries %d\n",
+ vas_window_pid(&window->vas_win),
+ window->vas_win.winid,
+ creds, count);
+
goto retry;
}
}
@@ -1159,10 +1206,11 @@ retry:
* short time to queue a CRB, so window should not be busy for too long.
* Trying 5ms intervals.
*/
-static void poll_window_busy_state(struct vas_window *window)
+static void poll_window_busy_state(struct pnv_vas_window *window)
{
int busy;
u64 val;
+ int count = 0;
retry:
val = read_hvwc_reg(window, VREG(WIN_STATUS));
@@ -1170,7 +1218,17 @@ retry:
if (busy) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(5));
+ schedule_timeout(msecs_to_jiffies(10));
+ count++;
+ /*
+ * Takes around few milliseconds to process all pending
+ * requests.
+ */
+ if (!(count % 1000))
+ pr_warn_ratelimited("VAS: pid %d stuck. Window (ID=%d) is in busy state. Retries %d\n",
+ vas_window_pid(&window->vas_win),
+ window->vas_win.winid, count);
+
goto retry;
}
}
@@ -1191,7 +1249,7 @@ retry:
* casting out becomes necessary we should consider offloading the
* job to a worker thread, so the window close can proceed quickly.
*/
-static void poll_window_castout(struct vas_window *window)
+static void poll_window_castout(struct pnv_vas_window *window)
{
/* stub for now */
}
@@ -1200,7 +1258,7 @@ static void poll_window_castout(struct vas_window *window)
* Unpin and close a window so no new requests are accepted and the
* hardware can evict this window from cache if necessary.
*/
-static void unpin_close_window(struct vas_window *window)
+static void unpin_close_window(struct pnv_vas_window *window)
{
u64 val;
@@ -1222,11 +1280,15 @@ static void unpin_close_window(struct vas_window *window)
*
* Besides the hardware, kernel has some bookkeeping of course.
*/
-int vas_win_close(struct vas_window *window)
+int vas_win_close(struct vas_window *vwin)
{
- if (!window)
+ struct pnv_vas_window *window;
+
+ if (!vwin)
return 0;
+ window = container_of(vwin, struct pnv_vas_window, vas_win);
+
if (!window->tx_win && atomic_read(&window->num_txwins) != 0) {
pr_devel("Attempting to close an active Rx window!\n");
WARN_ON_ONCE(1);
@@ -1235,22 +1297,175 @@ int vas_win_close(struct vas_window *window)
unmap_paste_region(window);
- clear_vinst_win(window);
-
poll_window_busy_state(window);
unpin_close_window(window);
poll_window_credits(window);
+ clear_vinst_win(window);
+
poll_window_castout(window);
/* if send window, drop reference to matching receive window */
- if (window->tx_win)
+ if (window->tx_win) {
+ if (window->user_win) {
+ mm_context_remove_vas_window(vwin->task_ref.mm);
+ put_vas_user_win_ref(&vwin->task_ref);
+ }
put_rx_win(window->rxwin);
+ }
vas_window_free(window);
return 0;
}
EXPORT_SYMBOL_GPL(vas_win_close);
+
+/*
+ * Return credit for the given window.
+ * Send windows and fault window uses credit mechanism as follows:
+ *
+ * Send windows:
+ * - The default number of credits available for each send window is
+ * 1024. It means 1024 requests can be issued asynchronously at the
+ * same time. If the credit is not available, that request will be
+ * returned with RMA_Busy.
+ * - One credit is taken when NX request is issued.
+ * - This credit is returned after NX processed that request.
+ * - If NX encounters translation error, kernel will return the
+ * credit on the specific send window after processing the fault CRB.
+ *
+ * Fault window:
+ * - The total number credits available is FIFO_SIZE/CRB_SIZE.
+ * Means 4MB/128 in the current implementation. If credit is not
+ * available, RMA_Reject is returned.
+ * - A credit is taken when NX pastes CRB in fault FIFO.
+ * - The kernel with return credit on fault window after reading entry
+ * from fault FIFO.
+ */
+void vas_return_credit(struct pnv_vas_window *window, bool tx)
+{
+ uint64_t val;
+
+ val = 0ULL;
+ if (tx) { /* send window */
+ val = SET_FIELD(VAS_TX_WCRED, val, 1);
+ write_hvwc_reg(window, VREG(TX_WCRED_ADDER), val);
+ } else {
+ val = SET_FIELD(VAS_LRX_WCRED, val, 1);
+ write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), val);
+ }
+}
+
+struct pnv_vas_window *vas_pswid_to_window(struct vas_instance *vinst,
+ uint32_t pswid)
+{
+ struct pnv_vas_window *window;
+ int winid;
+
+ if (!pswid) {
+ pr_devel("%s: called for pswid 0!\n", __func__);
+ return ERR_PTR(-ESRCH);
+ }
+
+ decode_pswid(pswid, NULL, &winid);
+
+ if (winid >= VAS_WINDOWS_PER_CHIP)
+ return ERR_PTR(-ESRCH);
+
+ /*
+ * If application closes the window before the hardware
+ * returns the fault CRB, we should wait in vas_win_close()
+ * for the pending requests. so the window must be active
+ * and the process alive.
+ *
+ * If its a kernel process, we should not get any faults and
+ * should not get here.
+ */
+ window = vinst->windows[winid];
+
+ if (!window) {
+ pr_err("PSWID decode: Could not find window for winid %d pswid %d vinst 0x%p\n",
+ winid, pswid, vinst);
+ return NULL;
+ }
+
+ /*
+ * Do some sanity checks on the decoded window. Window should be
+ * NX GZIP user send window. FTW windows should not incur faults
+ * since their CRBs are ignored (not queued on FIFO or processed
+ * by NX).
+ */
+ if (!window->tx_win || !window->user_win || !window->nx_win ||
+ window->vas_win.cop == VAS_COP_TYPE_FAULT ||
+ window->vas_win.cop == VAS_COP_TYPE_FTW) {
+ pr_err("PSWID decode: id %d, tx %d, user %d, nx %d, cop %d\n",
+ winid, window->tx_win, window->user_win,
+ window->nx_win, window->vas_win.cop);
+ WARN_ON(1);
+ }
+
+ return window;
+}
+
+static struct vas_window *vas_user_win_open(int vas_id, u64 flags,
+ enum vas_cop_type cop_type)
+{
+ struct vas_tx_win_attr txattr = {};
+
+ vas_init_tx_win_attr(&txattr, cop_type);
+
+ txattr.lpid = mfspr(SPRN_LPID);
+ txattr.pidr = mfspr(SPRN_PID);
+ txattr.user_win = true;
+ txattr.rsvd_txbuf_count = false;
+ txattr.pswid = false;
+
+ pr_devel("Pid %d: Opening txwin, PIDR %ld\n", txattr.pidr,
+ mfspr(SPRN_PID));
+
+ return vas_tx_win_open(vas_id, cop_type, &txattr);
+}
+
+static u64 vas_user_win_paste_addr(struct vas_window *txwin)
+{
+ struct pnv_vas_window *win;
+ u64 paste_addr;
+
+ win = container_of(txwin, struct pnv_vas_window, vas_win);
+ vas_win_paste_addr(win, &paste_addr, NULL);
+
+ return paste_addr;
+}
+
+static int vas_user_win_close(struct vas_window *txwin)
+{
+ vas_win_close(txwin);
+
+ return 0;
+}
+
+static const struct vas_user_win_ops vops = {
+ .open_win = vas_user_win_open,
+ .paste_addr = vas_user_win_paste_addr,
+ .close_win = vas_user_win_close,
+};
+
+/*
+ * Supporting only nx-gzip coprocessor type now, but this API code
+ * extended to other coprocessor types later.
+ */
+int vas_register_api_powernv(struct module *mod, enum vas_cop_type cop_type,
+ const char *name)
+{
+
+ return vas_register_coproc_api(mod, cop_type, name, &vops);
+}
+EXPORT_SYMBOL_GPL(vas_register_api_powernv);
+
+void vas_unregister_api_powernv(void)
+{
+ vas_unregister_coproc_api();
+}
+EXPORT_SYMBOL_GPL(vas_unregister_api_powernv);
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index ed9cc6df329a..b65256a63e87 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -14,7 +14,10 @@
#include <linux/of_platform.h>
#include <linux/of_address.h>
#include <linux/of.h>
+#include <linux/irqdomain.h>
+#include <linux/interrupt.h>
#include <asm/prom.h>
+#include <asm/xive.h>
#include "vas.h"
@@ -23,12 +26,35 @@ static LIST_HEAD(vas_instances);
static DEFINE_PER_CPU(int, cpu_vas_id);
+static int vas_irq_fault_window_setup(struct vas_instance *vinst)
+{
+ int rc = 0;
+
+ rc = request_threaded_irq(vinst->virq, vas_fault_handler,
+ vas_fault_thread_fn, 0, vinst->name, vinst);
+
+ if (rc) {
+ pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n",
+ vinst->vas_id, vinst->virq, rc);
+ goto out;
+ }
+
+ rc = vas_setup_fault_window(vinst);
+ if (rc)
+ free_irq(vinst->virq, vinst);
+
+out:
+ return rc;
+}
+
static int init_vas_instance(struct platform_device *pdev)
{
- int rc, cpu, vasid;
- struct resource *res;
- struct vas_instance *vinst;
struct device_node *dn = pdev->dev.of_node;
+ struct vas_instance *vinst;
+ struct xive_irq_data *xd;
+ uint32_t chipid, hwirq;
+ struct resource *res;
+ int rc, cpu, vasid;
rc = of_property_read_u32(dn, "ibm,vas-id", &vasid);
if (rc) {
@@ -36,6 +62,12 @@ static int init_vas_instance(struct platform_device *pdev)
return -ENODEV;
}
+ rc = of_property_read_u32(dn, "ibm,chip-id", &chipid);
+ if (rc) {
+ pr_err("No ibm,chip-id property for %s?\n", pdev->name);
+ return -ENODEV;
+ }
+
if (pdev->num_resources != 4) {
pr_err("Unexpected DT configuration for [%s, %d]\n",
pdev->name, vasid);
@@ -46,6 +78,12 @@ static int init_vas_instance(struct platform_device *pdev)
if (!vinst)
return -ENOMEM;
+ vinst->name = kasprintf(GFP_KERNEL, "vas-%d", vasid);
+ if (!vinst->name) {
+ kfree(vinst);
+ return -ENOMEM;
+ }
+
INIT_LIST_HEAD(&vinst->node);
ida_init(&vinst->ida);
mutex_init(&vinst->mutex);
@@ -69,9 +107,32 @@ static int init_vas_instance(struct platform_device *pdev)
vinst->paste_win_id_shift = 63 - res->end;
- pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, "
- "paste_win_id_shift 0x%llx\n", pdev->name, vasid,
- vinst->paste_base_addr, vinst->paste_win_id_shift);
+ hwirq = xive_native_alloc_irq_on_chip(chipid);
+ if (!hwirq) {
+ pr_err("Inst%d: Unable to allocate global irq for chip %d\n",
+ vinst->vas_id, chipid);
+ return -ENOENT;
+ }
+
+ vinst->virq = irq_create_mapping(NULL, hwirq);
+ if (!vinst->virq) {
+ pr_err("Inst%d: Unable to map global irq %d\n",
+ vinst->vas_id, hwirq);
+ return -EINVAL;
+ }
+
+ xd = irq_get_handler_data(vinst->virq);
+ if (!xd) {
+ pr_err("Inst%d: Invalid virq %d\n",
+ vinst->vas_id, vinst->virq);
+ return -EINVAL;
+ }
+
+ vinst->irq_port = xd->trig_page;
+ pr_devel("Initialized instance [%s, %d] paste_base 0x%llx paste_win_id_shift 0x%llx IRQ %d Port 0x%llx\n",
+ pdev->name, vasid, vinst->paste_base_addr,
+ vinst->paste_win_id_shift, vinst->virq,
+ vinst->irq_port);
for_each_possible_cpu(cpu) {
if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
@@ -82,6 +143,22 @@ static int init_vas_instance(struct platform_device *pdev)
list_add(&vinst->node, &vas_instances);
mutex_unlock(&vas_mutex);
+ spin_lock_init(&vinst->fault_lock);
+ /*
+ * IRQ and fault handling setup is needed only for user space
+ * send windows.
+ */
+ if (vinst->virq) {
+ rc = vas_irq_fault_window_setup(vinst);
+ /*
+ * Fault window is used only for user space send windows.
+ * So if vinst->virq is NULL, tx_win_open returns -ENODEV
+ * for user space.
+ */
+ if (rc)
+ vinst->virq = 0;
+ }
+
vas_instance_init_dbgdir(vinst);
dev_set_drvdata(&pdev->dev, vinst);
@@ -89,6 +166,7 @@ static int init_vas_instance(struct platform_device *pdev)
return 0;
free_vinst:
+ kfree(vinst->name);
kfree(vinst);
return -ENODEV;
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 5574aec9ee88..08d9d3d5a22b 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -101,11 +101,9 @@
/*
* Initial per-process credits.
* Max send window credits: 4K-1 (12-bits in VAS_TX_WCRED)
- * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED)
*
* TODO: Needs tuning for per-process credits
*/
-#define VAS_RX_WCREDS_MAX ((64 << 10) - 1)
#define VAS_TX_WCREDS_MAX ((4 << 10) - 1)
#define VAS_WCREDS_DEFAULT (1 << 10)
@@ -296,6 +294,22 @@ enum vas_notify_after_count {
};
/*
+ * NX can generate an interrupt for multiple faults and expects kernel
+ * to process all of them. So read all valid CRB entries until find the
+ * invalid one. So use pswid which is pasted by NX and ccw[0] (reserved
+ * bit in BE) to check valid CRB. CCW[0] will not be touched by user
+ * space. Application gets CRB formt error if it updates this bit.
+ *
+ * Invalidate FIFO during allocation and process all entries from last
+ * successful read until finds invalid pswid and ccw[0] values.
+ * After reading each CRB entry from fault FIFO, the kernel invalidate
+ * it by updating pswid with FIFO_INVALID_ENTRY and CCW[0] with
+ * CCW0_INVALID.
+ */
+#define FIFO_INVALID_ENTRY 0xffffffff
+#define CCW0_INVALID 1
+
+/*
* One per instance of VAS. Each instance will have a separate set of
* receive windows, one per coprocessor type.
*
@@ -313,39 +327,43 @@ struct vas_instance {
u64 paste_base_addr;
u64 paste_win_id_shift;
+ u64 irq_port;
+ int virq;
+ int fault_crbs;
+ int fault_fifo_size;
+ int fifo_in_progress; /* To wake up thread or return IRQ_HANDLED */
+ spinlock_t fault_lock; /* Protects fifo_in_progress update */
+ void *fault_fifo;
+ struct pnv_vas_window *fault_win; /* Fault window */
+
struct mutex mutex;
- struct vas_window *rxwin[VAS_COP_TYPE_MAX];
- struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
+ struct pnv_vas_window *rxwin[VAS_COP_TYPE_MAX];
+ struct pnv_vas_window *windows[VAS_WINDOWS_PER_CHIP];
+ char *name;
char *dbgname;
struct dentry *dbgdir;
};
/*
- * In-kernel state a VAS window. One per window.
+ * In-kernel state a VAS window on PowerNV. One per window.
*/
-struct vas_window {
+struct pnv_vas_window {
+ struct vas_window vas_win;
/* Fields common to send and receive windows */
struct vas_instance *vinst;
- int winid;
bool tx_win; /* True if send window */
bool nx_win; /* True if NX window */
bool user_win; /* True if user space window */
void *hvwc_map; /* HV window context */
void *uwc_map; /* OS/User window context */
- pid_t pid; /* Linux process id of owner */
- int wcreds_max; /* Window credits */
-
- char *dbgname;
- struct dentry *dbgdir;
/* Fields applicable only to send windows */
void *paste_kaddr;
char *paste_addr_name;
- struct vas_window *rxwin;
+ struct pnv_vas_window *rxwin;
- /* Feilds applicable only to receive windows */
- enum vas_cop_type cop;
+ /* Fields applicable only to receive windows */
atomic_t num_txwins;
};
@@ -358,7 +376,7 @@ struct vas_window {
* is a container for the register fields in the window context.
*/
struct vas_winctx {
- void *rx_fifo;
+ u64 rx_fifo;
int rx_fifo_size;
int wcreds_max;
int rsvd_txbuf_count;
@@ -404,19 +422,32 @@ extern struct mutex vas_mutex;
extern struct vas_instance *find_vas_instance(int vasid);
extern void vas_init_dbgdir(void);
extern void vas_instance_init_dbgdir(struct vas_instance *vinst);
-extern void vas_window_init_dbgdir(struct vas_window *win);
-extern void vas_window_free_dbgdir(struct vas_window *win);
+extern void vas_window_init_dbgdir(struct pnv_vas_window *win);
+extern void vas_window_free_dbgdir(struct pnv_vas_window *win);
+extern int vas_setup_fault_window(struct vas_instance *vinst);
+extern irqreturn_t vas_fault_thread_fn(int irq, void *data);
+extern irqreturn_t vas_fault_handler(int irq, void *dev_id);
+extern void vas_return_credit(struct pnv_vas_window *window, bool tx);
+extern struct pnv_vas_window *vas_pswid_to_window(struct vas_instance *vinst,
+ uint32_t pswid);
+extern void vas_win_paste_addr(struct pnv_vas_window *window, u64 *addr,
+ int *len);
+
+static inline int vas_window_pid(struct vas_window *window)
+{
+ return pid_vnr(window->task_ref.pid);
+}
-static inline void vas_log_write(struct vas_window *win, char *name,
+static inline void vas_log_write(struct pnv_vas_window *win, char *name,
void *regptr, u64 val)
{
if (val)
pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n",
- win->tx_win ? "Tx" : "Rx", win->winid, name,
- regptr, val);
+ win->tx_win ? "Tx" : "Rx", win->vas_win.winid,
+ name, regptr, val);
}
-static inline void write_uwc_reg(struct vas_window *win, char *name,
+static inline void write_uwc_reg(struct pnv_vas_window *win, char *name,
s32 reg, u64 val)
{
void *regptr;
@@ -427,7 +458,7 @@ static inline void write_uwc_reg(struct vas_window *win, char *name,
out_be64(regptr, val);
}
-static inline void write_hvwc_reg(struct vas_window *win, char *name,
+static inline void write_hvwc_reg(struct pnv_vas_window *win, char *name,
s32 reg, u64 val)
{
void *regptr;
@@ -438,12 +469,27 @@ static inline void write_hvwc_reg(struct vas_window *win, char *name,
out_be64(regptr, val);
}
-static inline u64 read_hvwc_reg(struct vas_window *win,
+static inline u64 read_hvwc_reg(struct pnv_vas_window *win,
char *name __maybe_unused, s32 reg)
{
return in_be64(win->hvwc_map+reg);
}
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ * Bits Usage
+ * 0:7 VAS id (8 bits)
+ * 8:15 Unused, 0 (3 bits)
+ * 16:31 Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
+{
+ return ((u32)winid | (vasid << (31 - 7)));
+}
+
static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
{
if (vasid)