Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--  arch/powerpc/platforms/powernv/Kconfig  33
-rw-r--r--  arch/powerpc/platforms/powernv/Makefile  27
-rw-r--r--  arch/powerpc/platforms/powernv/copy-paste.h  6
-rw-r--r--  arch/powerpc/platforms/powernv/eeh-powernv.c  381
-rw-r--r--  arch/powerpc/platforms/powernv/idle.c  1113
-rw-r--r--  arch/powerpc/platforms/powernv/memtrace.c  235
-rw-r--r--  arch/powerpc/platforms/powernv/npu-dma.c  1291
-rw-r--r--  arch/powerpc/platforms/powernv/ocxl.c  151
-rw-r--r--  arch/powerpc/platforms/powernv/opal-async.c  8
-rw-r--r--  arch/powerpc/platforms/powernv/opal-call.c  295
-rw-r--r--  arch/powerpc/platforms/powernv/opal-core.c  663
-rw-r--r--  arch/powerpc/platforms/powernv/opal-dump.c  69
-rw-r--r--  arch/powerpc/platforms/powernv/opal-elog.c  61
-rw-r--r--  arch/powerpc/platforms/powernv/opal-fadump.c  719
-rw-r--r--  arch/powerpc/platforms/powernv/opal-fadump.h  146
-rw-r--r--  arch/powerpc/platforms/powernv/opal-flash.c  14
-rw-r--r--  arch/powerpc/platforms/powernv/opal-hmi.c  83
-rw-r--r--  arch/powerpc/platforms/powernv/opal-imc.c  109
-rw-r--r--  arch/powerpc/platforms/powernv/opal-irqchip.c  29
-rw-r--r--  arch/powerpc/platforms/powernv/opal-kmsg.c  10
-rw-r--r--  arch/powerpc/platforms/powernv/opal-lpc.c  17
-rw-r--r--  arch/powerpc/platforms/powernv/opal-memory-errors.c  17
-rw-r--r--  arch/powerpc/platforms/powernv/opal-msglog.c  71
-rw-r--r--  arch/powerpc/platforms/powernv/opal-nvram.c  6
-rw-r--r--  arch/powerpc/platforms/powernv/opal-power.c  8
-rw-r--r--  arch/powerpc/platforms/powernv/opal-powercap.c  22
-rw-r--r--  arch/powerpc/platforms/powernv/opal-prd.c  69
-rw-r--r--  arch/powerpc/platforms/powernv/opal-psr.c  16
-rw-r--r--  arch/powerpc/platforms/powernv/opal-rtc.c  11
-rw-r--r--  arch/powerpc/platforms/powernv/opal-secvar.c  182
-rw-r--r--  arch/powerpc/platforms/powernv/opal-sensor-groups.c  18
-rw-r--r--  arch/powerpc/platforms/powernv/opal-sensor.c  17
-rw-r--r--  arch/powerpc/platforms/powernv/opal-sysparam.c  15
-rw-r--r--  arch/powerpc/platforms/powernv/opal-tracepoints.c  1
-rw-r--r--  arch/powerpc/platforms/powernv/opal-wrappers.S  350
-rw-r--r--  arch/powerpc/platforms/powernv/opal-xscom.c  224
-rw-r--r--  arch/powerpc/platforms/powernv/opal.c  356
-rw-r--r--  arch/powerpc/platforms/powernv/pci-cxl.c  178
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda-tce.c  88
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda.c  2566
-rw-r--r--  arch/powerpc/platforms/powernv/pci-sriov.c  760
-rw-r--r--  arch/powerpc/platforms/powernv/pci.c  434
-rw-r--r--  arch/powerpc/platforms/powernv/pci.h  173
-rw-r--r--  arch/powerpc/platforms/powernv/powernv.h  14
-rw-r--r--  arch/powerpc/platforms/powernv/rng.c  144
-rw-r--r--  arch/powerpc/platforms/powernv/setup.c  139
-rw-r--r--  arch/powerpc/platforms/powernv/smp.c  105
-rw-r--r--  arch/powerpc/platforms/powernv/subcore-asm.S  6
-rw-r--r--  arch/powerpc/platforms/powernv/subcore.c  33
-rw-r--r--  arch/powerpc/platforms/powernv/subcore.h  12
-rw-r--r--  arch/powerpc/platforms/powernv/ultravisor.c  70
-rw-r--r--  arch/powerpc/platforms/powernv/vas-debug.c  70
-rw-r--r--  arch/powerpc/platforms/powernv/vas-fault.c  245
-rw-r--r--  arch/powerpc/platforms/powernv/vas-trace.h  4
-rw-r--r--  arch/powerpc/platforms/powernv/vas-window.c  400
-rw-r--r--  arch/powerpc/platforms/powernv/vas.c  96
-rw-r--r--  arch/powerpc/platforms/powernv/vas.h  92
57 files changed, 6832 insertions, 5640 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 850eee860cf2..b5ad7c173ef0 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -2,48 +2,39 @@
config PPC_POWERNV
depends on PPC64 && PPC_BOOK3S
bool "IBM PowerNV (Non-Virtualized) platform support"
- select PPC_NATIVE
+ select PPC_HASH_MMU_NATIVE if PPC_64S_HASH_MMU
select PPC_XICS
select PPC_ICP_NATIVE
select PPC_XIVE_NATIVE
select PPC_P7_NAP
select FORCE_PCI
select PCI_MSI
+ select IRQ_MSI_LIB
select EPAPR_BOOT
select PPC_INDIRECT_PIO
select PPC_UDBG_16550
- select PPC_SCOM
- select ARCH_RANDOM
select CPU_FREQ
select PPC_DOORBELL
select MMU_NOTIFIER
select FORCE_SMP
+ select ARCH_SUPPORTS_PER_VMA_LOCK
+ select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU
default y
config OPAL_PRD
- tristate 'OPAL PRD driver'
+ tristate "OPAL PRD driver"
depends on PPC_POWERNV
help
This enables the opal-prd driver, a facility to run processor
recovery diagnostics on OpenPower machines
config PPC_MEMTRACE
- bool "Enable removal of RAM from kernel mappings for tracing"
- depends on PPC_POWERNV && MEMORY_HOTREMOVE
+ bool "Enable runtime allocation of RAM for tracing"
+ depends on PPC_POWERNV && MEMORY_HOTPLUG && CONTIG_ALLOC
help
- Enabling this option allows for the removal of memory (RAM)
- from the kernel mappings to be used for hardware tracing.
+ Enabling this option allows for runtime allocation of memory (RAM)
+ for hardware tracing.
-config PPC_VAS
- bool "IBM Virtual Accelerator Switchboard (VAS)"
- depends on PPC_POWERNV && PPC_64K_PAGES
- default y
- help
- This enables support for IBM Virtual Accelerator Switchboard (VAS).
-
- VAS allows accelerators in co-processors like NX-GZIP and NX-842
- to be accessible to kernel subsystems and user processes.
-
- VAS adapters are found in POWER9 based systems.
-
- If unsure, say N.
+config SCOM_DEBUGFS
+ bool "Expose SCOM controllers via debugfs"
+ depends on DEBUG_FS
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index b540ce8eec55..9e5d0c847ee2 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,19 +1,32 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y += setup.o opal-wrappers.o opal.o opal-async.o idle.o
-obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
+
+# nothing that deals with real mode is safe to KASAN
+# in particular, idle code runs a bunch of things in real mode
+KASAN_SANITIZE_idle.o := n
+KASAN_SANITIZE_pci-ioda.o := n
+KASAN_SANITIZE_pci-ioda-tce.o := n
+# pnv_machine_check_early
+KASAN_SANITIZE_setup.o := n
+
+obj-y += setup.o opal-call.o opal-wrappers.o opal.o opal-async.o
+obj-y += idle.o opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o
obj-y += opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
obj-y += opal-kmsg.o opal-powercap.o opal-psr.o opal-sensor-groups.o
+obj-y += ultravisor.o
obj-$(CONFIG_SMP) += smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_PCI) += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
-obj-$(CONFIG_CXL_BASE) += pci-cxl.o
+obj-$(CONFIG_FA_DUMP) += opal-fadump.o
+obj-$(CONFIG_PRESERVE_FA_DUMP) += opal-fadump.o
+obj-$(CONFIG_OPAL_CORE) += opal-core.o
+obj-$(CONFIG_PCI) += pci.o pci-ioda.o pci-ioda-tce.o
+obj-$(CONFIG_PCI_IOV) += pci-sriov.o
obj-$(CONFIG_EEH) += eeh-powernv.o
-obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
-obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o
obj-$(CONFIG_OPAL_PRD) += opal-prd.o
obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
-obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o
+obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o vas-fault.o
obj-$(CONFIG_OCXL_BASE) += ocxl.o
+obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
+obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o
diff --git a/arch/powerpc/platforms/powernv/copy-paste.h b/arch/powerpc/platforms/powernv/copy-paste.h
index cb36f9fbcef3..f063807eda9a 100644
--- a/arch/powerpc/platforms/powernv/copy-paste.h
+++ b/arch/powerpc/platforms/powernv/copy-paste.h
@@ -1,10 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright 2016-17 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <asm/ppc-opcode.h>
#include <asm/reg.h>
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index f38078976c5d..db3370d1673c 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1,14 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * The file intends to implement the platform dependent EEH operations on
- * powernv platform. Actually, the powernv was created in order to fully
- * hypervisor support.
+ * PowerNV Platform dependent EEH operations
*
* Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2013.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/atomic.h>
@@ -17,6 +11,7 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
#include <linux/list.h>
#include <linux/msi.h>
#include <linux/of.h>
@@ -40,71 +35,14 @@
#include "powernv.h"
#include "pci.h"
+#include "../../../../drivers/pci/pci.h"
static int eeh_event_irq = -EINVAL;
-void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
+static void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
{
- struct pci_dn *pdn = pci_get_pdn(pdev);
-
- if (!pdev->is_virtfn)
- return;
-
- /*
- * The following operations will fail if VF's sysfs files
- * aren't created or its resources aren't finalized.
- */
- eeh_add_device_early(pdn);
- eeh_add_device_late(pdev);
- eeh_sysfs_add_device(pdev);
-}
-
-static int pnv_eeh_init(void)
-{
- struct pci_controller *hose;
- struct pnv_phb *phb;
- int max_diag_size = PNV_PCI_DIAG_BUF_SIZE;
-
- if (!firmware_has_feature(FW_FEATURE_OPAL)) {
- pr_warn("%s: OPAL is required !\n",
- __func__);
- return -EINVAL;
- }
-
- /* Set probe mode */
- eeh_add_flag(EEH_PROBE_MODE_DEV);
-
- /*
- * P7IOC blocks PCI config access to frozen PE, but PHB3
- * doesn't do that. So we have to selectively enable I/O
- * prior to collecting error log.
- */
- list_for_each_entry(hose, &hose_list, list_node) {
- phb = hose->private_data;
-
- if (phb->model == PNV_PHB_MODEL_P7IOC)
- eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
-
- if (phb->diag_data_size > max_diag_size)
- max_diag_size = phb->diag_data_size;
-
- /*
- * PE#0 should be regarded as valid by EEH core
- * if it's not the reserved one. Currently, we
- * have the reserved PE#255 and PE#127 for PHB3
- * and P7IOC separately. So we should regard
- * PE#0 as valid for PHB3 and P7IOC.
- */
- if (phb->ioda.reserved_pe_idx != 0)
- eeh_add_flag(EEH_VALID_PE_ZERO);
-
- break;
- }
-
- eeh_set_pe_aux_size(max_diag_size);
- ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device;
-
- return 0;
+ dev_dbg(&pdev->dev, "EEH: Setting up device\n");
+ eeh_probe_device(pdev);
}
static irqreturn_t pnv_eeh_event(int irq, void *data)
@@ -150,7 +88,7 @@ static ssize_t pnv_eeh_ei_write(struct file *filp,
return -EINVAL;
/* Retrieve PE */
- pe = eeh_pe_get(hose, pe_no, 0);
+ pe = eeh_pe_get(hose, pe_no);
if (!pe)
return -ENODEV;
@@ -161,7 +99,6 @@ static ssize_t pnv_eeh_ei_write(struct file *filp,
static const struct file_operations pnv_eeh_ei_fops = {
.open = simple_open,
- .llseek = no_llseek,
.write = pnv_eeh_ei_write,
};
@@ -205,6 +142,25 @@ PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
#endif /* CONFIG_DEBUG_FS */
+static void pnv_eeh_enable_phbs(void)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+ /*
+ * If EEH is enabled, we're going to rely on that.
+ * Otherwise, we restore to conventional mechanism
+ * to clear frozen PE during PCI config access.
+ */
+ if (eeh_enabled())
+ phb->flags |= PNV_PHB_FLAG_EEH;
+ else
+ phb->flags &= ~PNV_PHB_FLAG_EEH;
+ }
+}
+
/**
* pnv_eeh_post_init - EEH platform dependent post initialization
*
@@ -219,9 +175,7 @@ int pnv_eeh_post_init(void)
struct pnv_phb *phb;
int ret = 0;
- /* Probe devices & build address cache */
- eeh_probe_devices();
- eeh_addr_cache_build();
+ eeh_show_enabled();
/* Register OPAL event notifier */
eeh_event_irq = opal_event_request(ilog2(OPAL_EVENT_PCI_ERROR));
@@ -243,19 +197,11 @@ int pnv_eeh_post_init(void)
if (!eeh_enabled())
disable_irq(eeh_event_irq);
+ pnv_eeh_enable_phbs();
+
list_for_each_entry(hose, &hose_list, list_node) {
phb = hose->private_data;
- /*
- * If EEH is enabled, we're going to rely on that.
- * Otherwise, we restore to conventional mechanism
- * to clear frozen PE during PCI config access.
- */
- if (eeh_enabled())
- phb->flags |= PNV_PHB_FLAG_EEH;
- else
- phb->flags &= ~PNV_PHB_FLAG_EEH;
-
/* Create debugfs entries */
#ifdef CONFIG_DEBUG_FS
if (phb->has_dbgfs || !phb->dbgfs)
@@ -344,28 +290,41 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
return 0;
}
+static struct eeh_pe *pnv_eeh_get_upstream_pe(struct pci_dev *pdev)
+{
+ struct pci_controller *hose = pdev->bus->sysdata;
+ struct pnv_phb *phb = hose->private_data;
+ struct pci_dev *parent = pdev->bus->self;
+
+#ifdef CONFIG_PCI_IOV
+ /* for VFs we use the PF's PE as the upstream PE */
+ if (pdev->is_virtfn)
+ parent = pdev->physfn;
+#endif
+
+ /* otherwise use the PE of our parent bridge */
+ if (parent) {
+ struct pnv_ioda_pe *ioda_pe = pnv_ioda_get_pe(parent);
+
+ return eeh_pe_get(phb->hose, ioda_pe->pe_number);
+ }
+
+ return NULL;
+}
+
/**
* pnv_eeh_probe - Do probe on PCI device
- * @pdn: PCI device node
- * @data: unused
+ * @pdev: pci_dev to probe
*
- * When EEH module is installed during system boot, all PCI devices
- * are checked one by one to see if it supports EEH. The function
- * is introduced for the purpose. By default, EEH has been enabled
- * on all PCI devices. That's to say, we only need do necessary
- * initialization on the corresponding eeh device and create PE
- * accordingly.
- *
- * It's notable that's unsafe to retrieve the EEH device through
- * the corresponding PCI device. During the PCI device hotplug, which
- * was possiblly triggered by EEH core, the binding between EEH device
- * and the PCI device isn't built yet.
+ * Create, or find the existing, eeh_dev for this pci_dev.
*/
-static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
+static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev)
{
+ struct pci_dn *pdn = pci_get_pdn(pdev);
struct pci_controller *hose = pdn->phb;
struct pnv_phb *phb = hose->private_data;
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+ struct eeh_pe *upstream_pe;
uint32_t pcie_flags;
int ret;
int config_addr = (pdn->busno << 8) | (pdn->devfn);
@@ -379,18 +338,27 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
if (!edev || edev->pe)
return NULL;
+ /* already configured? */
+ if (edev->pdev) {
+ pr_debug("%s: found existing edev for %04x:%02x:%02x.%01x\n",
+ __func__, hose->global_number, config_addr >> 8,
+ PCI_SLOT(config_addr), PCI_FUNC(config_addr));
+ return edev;
+ }
+
/* Skip for PCI-ISA bridge */
- if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
+ if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
+ eeh_edev_dbg(edev, "Probing device\n");
+
/* Initialize eeh device */
- edev->class_code = pdn->class_code;
edev->mode &= 0xFFFFFF00;
edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
edev->af_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF);
edev->aer_cap = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
- if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
+ if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
edev->mode |= EEH_DEV_BRIDGE;
if (edev->pcie_cap) {
pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
@@ -405,12 +373,12 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
edev->pe_config_addr = phb->ioda.pe_rmap[config_addr];
+ upstream_pe = pnv_eeh_get_upstream_pe(pdev);
+
/* Create PE */
- ret = eeh_add_to_parent_pe(edev);
+ ret = eeh_pe_tree_insert(edev, upstream_pe);
if (ret) {
- pr_warn("%s: Can't add PCI dev %04x:%02x:%02x.%01x to parent PE (%x)\n",
- __func__, hose->global_number, pdn->busno,
- PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn), ret);
+ eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
return NULL;
}
@@ -422,7 +390,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
* should be blocked until PE reset. MMIO access is dropped
* by hardware certainly. In order to drop PCI config requests,
* one more flag (EEH_PE_CFG_RESTRICTED) is introduced, which
- * will be checked in the backend for PE state retrival. If
+ * will be checked in the backend for PE state retrieval. If
* the PE becomes frozen for the first time and the flag has
* been set for the PE, we will set EEH_PE_CFG_BLOCKED for
* that PE to block its config space.
@@ -459,12 +427,18 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
* Enable EEH explicitly so that we will do EEH check
* while accessing I/O stuff
*/
- eeh_add_flag(EEH_ENABLED);
+ if (!eeh_has_flag(EEH_ENABLED)) {
+ enable_irq(eeh_event_irq);
+ pnv_eeh_enable_phbs();
+ eeh_add_flag(EEH_ENABLED);
+ }
/* Save memory bars */
eeh_save_bars(edev);
- return NULL;
+ eeh_edev_dbg(edev, "EEH enabled on device\n");
+
+ return edev;
}
/**
@@ -537,18 +511,6 @@ static int pnv_eeh_set_option(struct eeh_pe *pe, int option)
return 0;
}
-/**
- * pnv_eeh_get_pe_addr - Retrieve PE address
- * @pe: EEH PE
- *
- * Retrieve the PE address according to the given tranditional
- * PCI BDF (Bus/Device/Function) address.
- */
-static int pnv_eeh_get_pe_addr(struct eeh_pe *pe)
-{
- return pe->addr;
-}
-
static void pnv_eeh_get_phb_diag(struct eeh_pe *pe)
{
struct pnv_phb *phb = pe->phb->private_data;
@@ -843,7 +805,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
int aer = edev ? edev->aer_cap : 0;
u32 ctrl;
- pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+ pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n",
__func__, pci_domain_nr(dev->bus),
dev->bus->number, option);
@@ -852,32 +814,32 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int option)
case EEH_RESET_HOT:
/* Don't report linkDown event */
if (aer) {
- eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK,
4, &ctrl);
ctrl |= PCI_ERR_UNC_SURPDN;
- eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK,
4, ctrl);
}
- eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+ eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl);
ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
- eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl);
+ eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl);
msleep(EEH_PE_RST_HOLD_TIME);
break;
case EEH_RESET_DEACTIVATE:
- eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, &ctrl);
+ eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, &ctrl);
ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
- eeh_ops->write_config(pdn, PCI_BRIDGE_CONTROL, 2, ctrl);
+ eeh_ops->write_config(edev, PCI_BRIDGE_CONTROL, 2, ctrl);
msleep(EEH_PE_RST_SETTLE_TIME);
/* Continue reporting linkDown event */
if (aer) {
- eeh_ops->read_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ eeh_ops->read_config(edev, aer + PCI_ERR_UNCOR_MASK,
4, &ctrl);
ctrl &= ~PCI_ERR_UNC_SURPDN;
- eeh_ops->write_config(pdn, aer + PCI_ERR_UNCOR_MASK,
+ eeh_ops->write_config(edev, aer + PCI_ERR_UNCOR_MASK,
4, ctrl);
}
@@ -892,15 +854,18 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int option)
struct pci_controller *hose = pci_bus_to_host(pdev->bus);
struct pnv_phb *phb = hose->private_data;
struct device_node *dn = pci_device_to_OF_node(pdev);
- uint64_t id = PCI_SLOT_ID(phb->opal_id,
- (pdev->bus->number << 8) | pdev->devfn);
+ uint64_t id = PCI_SLOT_ID(phb->opal_id, pci_dev_id(pdev));
uint8_t scope;
int64_t rc;
/* Hot reset to the bus if firmware cannot handle */
- if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL))
+ if (!dn || !of_property_present(dn, "ibm,reset-by-firmware"))
return __pnv_eeh_bridge_reset(pdev, option);
+ pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n",
+ __func__, pci_domain_nr(pdev->bus),
+ pdev->bus->number, option);
+
switch (option) {
case EEH_RESET_FUNDAMENTAL:
scope = OPAL_RESET_PCI_FUNDAMENTAL;
@@ -942,11 +907,12 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type,
int pos, u16 mask)
{
+ struct eeh_dev *edev = pdn->edev;
int i, status = 0;
/* Wait for Transaction Pending bit to be cleared */
for (i = 0; i < 4; i++) {
- eeh_ops->read_config(pdn, pos, 2, &status);
+ eeh_ops->read_config(edev, pos, 2, &status);
if (!(status & mask))
return;
@@ -967,7 +933,7 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option)
if (WARN_ON(!edev->pcie_cap))
return -ENOTTY;
- eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg);
+ eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg);
if (!(reg & PCI_EXP_DEVCAP_FLR))
return -ENOTTY;
@@ -977,18 +943,18 @@ static int pnv_eeh_do_flr(struct pci_dn *pdn, int option)
pnv_eeh_wait_for_pending(pdn, "",
edev->pcie_cap + PCI_EXP_DEVSTA,
PCI_EXP_DEVSTA_TRPND);
- eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
4, &reg);
reg |= PCI_EXP_DEVCTL_BCR_FLR;
- eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
4, reg);
msleep(EEH_PE_RST_HOLD_TIME);
break;
case EEH_RESET_DEACTIVATE:
- eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ eeh_ops->read_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
4, &reg);
reg &= ~PCI_EXP_DEVCTL_BCR_FLR;
- eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+ eeh_ops->write_config(edev, edev->pcie_cap + PCI_EXP_DEVCTL,
4, reg);
msleep(EEH_PE_RST_SETTLE_TIME);
break;
@@ -1005,7 +971,7 @@ static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option)
if (WARN_ON(!edev->af_cap))
return -ENOTTY;
- eeh_ops->read_config(pdn, edev->af_cap + PCI_AF_CAP, 1, &cap);
+ eeh_ops->read_config(edev, edev->af_cap + PCI_AF_CAP, 1, &cap);
if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR))
return -ENOTTY;
@@ -1014,18 +980,18 @@ static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option)
case EEH_RESET_FUNDAMENTAL:
/*
* Wait for Transaction Pending bit to clear. A word-aligned
- * test is used, so we use the conrol offset rather than status
+ * test is used, so we use the control offset rather than status
* and shift the test bit to match.
*/
pnv_eeh_wait_for_pending(pdn, "AF",
edev->af_cap + PCI_AF_CTRL,
PCI_AF_STATUS_TP << 8);
- eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL,
+ eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL,
1, PCI_AF_CTRL_FLR);
msleep(EEH_PE_RST_HOLD_TIME);
break;
case EEH_RESET_DEACTIVATE:
- eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, 1, 0);
+ eeh_ops->write_config(edev, edev->af_cap + PCI_AF_CTRL, 1, 0);
msleep(EEH_PE_RST_SETTLE_TIME);
break;
}
@@ -1081,7 +1047,7 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
* frozen state during PE reset. However, the good idea here from
* benh is to keep frozen state before we get PE reset done completely
* (until BAR restore). With the frozen state, HW drops illegal IO
- * or MMIO access, which can incur recrusive frozen PE during PE
+ * or MMIO access, which can incur recursive frozen PE during PE
* reset. The side effect is that EEH core has to clear the frozen
* state explicitly after BAR restore.
*/
@@ -1119,17 +1085,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
return -EIO;
}
+ if (pci_is_root_bus(bus))
+ return pnv_eeh_root_reset(hose, option);
+
/*
- * If dealing with the root bus (or the bus underneath the
- * root port), we reset the bus underneath the root port.
+ * For hot resets try use the generic PCI error recovery reset
+ * functions. These correctly handles the case where the secondary
+ * bus is behind a hotplug slot and it will use the slot provided
+ * reset methods to prevent spurious hotplug events during the reset.
*
- * The cxl driver depends on this behaviour for bi-modal card
- * switching.
+ * Fundamental resets need to be handled internally to EEH since the
+ * PCI core doesn't really have a concept of a fundamental reset,
+ * mainly because there's no standard way to generate one. Only a
+ * few devices require an FRESET so it should be fine.
*/
- if (pci_is_root_bus(bus) ||
- pci_is_root_bus(bus->parent))
- return pnv_eeh_root_reset(hose, option);
+ if (option != EEH_RESET_FUNDAMENTAL) {
+ /*
+ * NB: Skiboot and pnv_eeh_bridge_reset() also no-op the
+ * de-assert step. It's like the OPAL reset API was
+ * poorly designed or something...
+ */
+ if (option == EEH_RESET_DEACTIVATE)
+ return 0;
+ rc = pci_bus_error_reset(bus->self);
+ if (!rc)
+ return 0;
+ }
+
+ /* otherwise, use the generic bridge reset. this might call into FW */
+ if (pci_is_root_bus(bus->parent))
+ return pnv_eeh_root_reset(hose, option);
return pnv_eeh_bridge_reset(bus->self, option);
}
@@ -1239,9 +1225,11 @@ static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn)
return false;
}
-static int pnv_eeh_read_config(struct pci_dn *pdn,
+static int pnv_eeh_read_config(struct eeh_dev *edev,
int where, int size, u32 *val)
{
+ struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
if (!pdn)
return PCIBIOS_DEVICE_NOT_FOUND;
@@ -1253,9 +1241,11 @@ static int pnv_eeh_read_config(struct pci_dn *pdn,
return pnv_pci_cfg_read(pdn, where, size, val);
}
-static int pnv_eeh_write_config(struct pci_dn *pdn,
+static int pnv_eeh_write_config(struct eeh_dev *edev,
int where, int size, u32 val)
{
+ struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
if (!pdn)
return PCIBIOS_DEVICE_NOT_FOUND;
@@ -1367,7 +1357,7 @@ static int pnv_eeh_get_pe(struct pci_controller *hose,
}
/* Find the PE according to PE# */
- dev_pe = eeh_pe_get(hose, pe_no, 0);
+ dev_pe = eeh_pe_get(hose, pe_no);
if (!dev_pe)
return -EEXIST;
@@ -1609,34 +1599,24 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
return ret;
}
-static int pnv_eeh_restore_config(struct pci_dn *pdn)
+static int pnv_eeh_restore_config(struct eeh_dev *edev)
{
- struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
struct pnv_phb *phb;
s64 ret = 0;
- int config_addr = (pdn->busno << 8) | (pdn->devfn);
if (!edev)
return -EEXIST;
- /*
- * We have to restore the PCI config space after reset since the
- * firmware can't see SRIOV VFs.
- *
- * FIXME: The MPS, error routing rules, timeout setting are worthy
- * to be exported by firmware in extendible way.
- */
- if (edev->physfn) {
- ret = eeh_restore_vf_config(pdn);
- } else {
- phb = pdn->phb->private_data;
- ret = opal_pci_reinit(phb->opal_id,
- OPAL_REINIT_PCI_DEV, config_addr);
- }
+ if (edev->physfn)
+ return 0;
+
+ phb = edev->controller->private_data;
+ ret = opal_pci_reinit(phb->opal_id,
+ OPAL_REINIT_PCI_DEV, edev->bdfn);
if (ret) {
pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
- __func__, config_addr, ret);
+ __func__, edev->bdfn, ret);
return -EIO;
}
@@ -1645,10 +1625,8 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
static struct eeh_ops pnv_eeh_ops = {
.name = "powernv",
- .init = pnv_eeh_init,
.probe = pnv_eeh_probe,
.set_option = pnv_eeh_set_option,
- .get_pe_addr = pnv_eeh_get_pe_addr,
.get_state = pnv_eeh_get_state,
.reset = pnv_eeh_reset,
.get_log = pnv_eeh_get_log,
@@ -1661,24 +1639,6 @@ static struct eeh_ops pnv_eeh_ops = {
.notify_resume = NULL
};
-#ifdef CONFIG_PCI_IOV
-static void pnv_pci_fixup_vf_mps(struct pci_dev *pdev)
-{
- struct pci_dn *pdn = pci_get_pdn(pdev);
- int parent_mps;
-
- if (!pdev->is_virtfn)
- return;
-
- /* Synchronize MPS for VF and PF */
- parent_mps = pcie_get_mps(pdev->physfn);
- if ((128 << pdev->pcie_mpss) >= parent_mps)
- pcie_set_mps(pdev, parent_mps);
- pdn->mps = pcie_get_mps(pdev);
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps);
-#endif /* CONFIG_PCI_IOV */
-
/**
* eeh_powernv_init - Register platform dependent EEH operations
*
@@ -1687,9 +1647,44 @@ DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps);
*/
static int __init eeh_powernv_init(void)
{
+ int max_diag_size = PNV_PCI_DIAG_BUF_SIZE;
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
int ret = -EINVAL;
- ret = eeh_ops_register(&pnv_eeh_ops);
+ if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+ pr_warn("%s: OPAL is required !\n", __func__);
+ return -EINVAL;
+ }
+
+ /* Set probe mode */
+ eeh_add_flag(EEH_PROBE_MODE_DEV);
+
+ /*
+ * P7IOC blocks PCI config access to frozen PE, but PHB3
+ * doesn't do that. So we have to selectively enable I/O
+ * prior to collecting error log.
+ */
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ if (phb->model == PNV_PHB_MODEL_P7IOC)
+ eeh_add_flag(EEH_ENABLE_IO_FOR_LOG);
+
+ if (phb->diag_data_size > max_diag_size)
+ max_diag_size = phb->diag_data_size;
+
+ break;
+ }
+
+ /*
+ * eeh_init() allocates the eeh_pe and its aux data buf so the
+ * size needs to be set before calling eeh_init().
+ */
+ eeh_set_pe_aux_size(max_diag_size);
+ ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device;
+
+ ret = eeh_init(&pnv_eeh_ops);
if (!ret)
pr_info("EEH: PowerNV platform initialized\n");
else
@@ -1697,4 +1692,4 @@ static int __init eeh_powernv_init(void)
return ret;
}
-machine_early_initcall(powernv, eeh_powernv_init);
+machine_arch_initcall(powernv, eeh_powernv_init);
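
The eeh-powernv.c changes above drop hand-built config addresses such as (pdn->busno << 8) | pdn->devfn in favour of edev->bdfn and pci_dev_id(), and the same 16-bit bus/devfn value is what PCI_SLOT_ID() consumes in pnv_eeh_bridge_reset(). Below is a minimal standalone sketch of that packing; the helper macros mirror the kernel's PCI_DEVFN/PCI_SLOT/PCI_FUNC/PCI_DEVID definitions and are reproduced here purely for illustration, not taken from this patch.

#include <stdio.h>
#include <stdint.h>

/* Mirrors of the kernel's PCI helpers, reproduced for this example only. */
#define PCI_DEVFN(slot, func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))
#define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn)		((devfn) & 0x07)
#define PCI_DEVID(bus, devfn)	((((uint16_t)(bus)) << 8) | (devfn))

int main(void)
{
	uint8_t bus = 0x01, slot = 0x0a, func = 0x2;
	/* what pci_dev_id(pdev) returns for this device */
	uint16_t bdfn = PCI_DEVID(bus, PCI_DEVFN(slot, func));

	printf("bdfn=0x%04x -> bus=0x%02x slot=0x%02x func=0x%x\n",
	       bdfn, bdfn >> 8, PCI_SLOT(bdfn & 0xff), PCI_FUNC(bdfn & 0xff));
	return 0;
}
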
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 35f699ebb662..d98b933e4984 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV cpuidle code
*
* Copyright 2015 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
@@ -17,11 +13,12 @@
#include <linux/cpu.h>
#include <asm/firmware.h>
+#include <asm/interrupt.h>
#include <asm/machdep.h>
#include <asm/opal.h>
#include <asm/cputhreads.h>
#include <asm/cpuidle.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/smp.h>
#include <asm/runlatch.h>
#include <asm/dbell.h>
@@ -48,10 +45,10 @@ static u64 pnv_default_stop_mask;
static bool default_stop_found;
/*
- * First deep stop state. Used to figure out when to save/restore
- * hypervisor context.
+ * First stop state levels when SPR and TB loss can occur.
*/
-u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
+static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
+static u64 deep_spr_loss_state = MAX_STOP_STATE + 1;
/*
* psscr value and mask of the deepest stop idle state.
@@ -62,7 +59,9 @@ static u64 pnv_deepest_stop_psscr_mask;
static u64 pnv_deepest_stop_flag;
static bool deepest_stop_found;
-static int pnv_save_sprs_for_deep_states(void)
+static unsigned long power7_offline_type;
+
+static int __init pnv_save_sprs_for_deep_states(void)
{
int cpu;
int rc;
@@ -72,12 +71,9 @@ static int pnv_save_sprs_for_deep_states(void)
* all cpus at boot. Get these reg values of current cpu and use the
* same across all cpus.
*/
- uint64_t lpcr_val = mfspr(SPRN_LPCR);
- uint64_t hid0_val = mfspr(SPRN_HID0);
- uint64_t hid1_val = mfspr(SPRN_HID1);
- uint64_t hid4_val = mfspr(SPRN_HID4);
- uint64_t hid5_val = mfspr(SPRN_HID5);
- uint64_t hmeer_val = mfspr(SPRN_HMEER);
+ uint64_t lpcr_val = mfspr(SPRN_LPCR);
+ uint64_t hid0_val = mfspr(SPRN_HID0);
+ uint64_t hmeer_val = mfspr(SPRN_HMEER);
uint64_t msr_val = MSR_IDLE;
uint64_t psscr_val = pnv_deepest_stop_psscr_val;
@@ -116,8 +112,11 @@ static int pnv_save_sprs_for_deep_states(void)
if (rc != 0)
return rc;
- /* Only p8 needs to set extra HID regiters */
+ /* Only p8 needs to set extra HID registers */
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+ uint64_t hid1_val = mfspr(SPRN_HID1);
+ uint64_t hid4_val = mfspr(SPRN_HID4);
+ uint64_t hid5_val = mfspr(SPRN_HID5);
rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
if (rc != 0)
@@ -137,89 +136,6 @@ static int pnv_save_sprs_for_deep_states(void)
return 0;
}
-static void pnv_alloc_idle_core_states(void)
-{
- int i, j;
- int nr_cores = cpu_nr_cores();
- u32 *core_idle_state;
-
- /*
- * core_idle_state - The lower 8 bits track the idle state of
- * each thread of the core.
- *
- * The most significant bit is the lock bit.
- *
- * Initially all the bits corresponding to threads_per_core
- * are set. They are cleared when the thread enters deep idle
- * state like sleep and winkle/stop.
- *
- * Initially the lock bit is cleared. The lock bit has 2
- * purposes:
- * a. While the first thread in the core waking up from
- * idle is restoring core state, it prevents other
- * threads in the core from switching to process
- * context.
- * b. While the last thread in the core is saving the
- * core state, it prevents a different thread from
- * waking up.
- */
- for (i = 0; i < nr_cores; i++) {
- int first_cpu = i * threads_per_core;
- int node = cpu_to_node(first_cpu);
- size_t paca_ptr_array_size;
-
- core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
- *core_idle_state = (1 << threads_per_core) - 1;
- paca_ptr_array_size = (threads_per_core *
- sizeof(struct paca_struct *));
-
- for (j = 0; j < threads_per_core; j++) {
- int cpu = first_cpu + j;
-
- paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state;
- paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING;
- paca_ptrs[cpu]->thread_mask = 1 << j;
- }
- }
-
- update_subcore_sibling_mask();
-
- if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
- int rc = pnv_save_sprs_for_deep_states();
-
- if (likely(!rc))
- return;
-
- /*
- * The stop-api is unable to restore hypervisor
- * resources on wakeup from platform idle states which
- * lose full context. So disable such states.
- */
- supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
- pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
- pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
-
- if (cpu_has_feature(CPU_FTR_ARCH_300) &&
- (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
- /*
- * Use the default stop state for CPU-Hotplug
- * if available.
- */
- if (default_stop_found) {
- pnv_deepest_stop_psscr_val =
- pnv_default_stop_val;
- pnv_deepest_stop_psscr_mask =
- pnv_default_stop_mask;
- pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
- pnv_deepest_stop_psscr_val);
- } else { /* Fallback to snooze loop for CPU-Hotplug */
- deepest_stop_found = false;
- pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
- }
- }
- }
-}
-
u32 pnv_get_supported_cpuidle_states(void)
{
return supported_cpuidle_states;
@@ -229,15 +145,22 @@ EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
static void pnv_fastsleep_workaround_apply(void *info)
{
+ int cpu = smp_processor_id();
int rc;
int *err = info;
+ if (cpu_first_thread_sibling(cpu) != cpu)
+ return;
+
rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
OPAL_CONFIG_IDLE_APPLY);
if (rc)
*err = 1;
}
+static bool power7_fastsleep_workaround_entry = true;
+static bool power7_fastsleep_workaround_exit = true;
+
/*
* Used to store fastsleep workaround state
* 0 - Workaround applied/undone at fastsleep entry/exit path (Default)
@@ -255,7 +178,6 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
struct device_attribute *attr, const char *buf,
size_t count)
{
- cpumask_t primary_thread_mask;
int err;
u8 val;
@@ -269,40 +191,25 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
* fastsleep_workaround_applyonce = 1 implies
* fastsleep workaround needs to be left in 'applied' state on all
* the cores. Do this by-
- * 1. Patching out the call to 'undo' workaround in fastsleep exit path
- * 2. Sending ipi to all the cores which have at least one online thread
- * 3. Patching out the call to 'apply' workaround in fastsleep entry
- * path
+ * 1. Disable the 'undo' workaround in fastsleep exit path
+ * 2. Sendi IPIs to all the cores which have at least one online thread
+ * 3. Disable the 'apply' workaround in fastsleep entry path
+ *
* There is no need to send ipi to cores which have all threads
* offlined, as last thread of the core entering fastsleep or deeper
* state would have applied workaround.
*/
- err = patch_instruction(
- (unsigned int *)pnv_fastsleep_workaround_at_exit,
- PPC_INST_NOP);
- if (err) {
- pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit");
- goto fail;
- }
+ power7_fastsleep_workaround_exit = false;
- get_online_cpus();
- primary_thread_mask = cpu_online_cores_map();
- on_each_cpu_mask(&primary_thread_mask,
- pnv_fastsleep_workaround_apply,
- &err, 1);
- put_online_cpus();
+ cpus_read_lock();
+ on_each_cpu(pnv_fastsleep_workaround_apply, &err, 1);
+ cpus_read_unlock();
if (err) {
pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply");
goto fail;
}
- err = patch_instruction(
- (unsigned int *)pnv_fastsleep_workaround_at_entry,
- PPC_INST_NOP);
- if (err) {
- pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry");
- goto fail;
- }
+ power7_fastsleep_workaround_entry = false;
fastsleep_workaround_applyonce = 1;
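
With the patched-instruction toggles gone, the applyonce store above simply flips the two workaround flags and IPIs every online CPU; pnv_fastsleep_workaround_apply() itself bails out unless it runs on the first thread of a core, so the OPAL call is still issued once per core. A small sketch of that filter follows, assuming the contiguous power-of-two SMT numbering that cpu_first_thread_sibling() relies on; the masking is reproduced here only for illustration.

#include <stdio.h>

static int threads_per_core = 8;	/* SMT8, as on POWER9 */

/* mirrors cpu_first_thread_sibling(): the first CPU id in this core */
static int first_thread_sibling(int cpu)
{
	return cpu & ~(threads_per_core - 1);
}

int main(void)
{
	for (int cpu = 0; cpu < 16; cpu++)
		if (first_thread_sibling(cpu) == cpu)
			printf("cpu %d issues the per-core OPAL call\n", cpu);
	return 0;
}
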
@@ -315,31 +222,353 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
show_fastsleep_workaround_applyonce,
store_fastsleep_workaround_applyonce);
-static unsigned long __power7_idle_type(unsigned long type)
+static inline void atomic_start_thread_idle(void)
+{
+ int cpu = raw_smp_processor_id();
+ int first = cpu_first_thread_sibling(cpu);
+ int thread_nr = cpu_thread_in_core(cpu);
+ unsigned long *state = &paca_ptrs[first]->idle_state;
+
+ clear_bit(thread_nr, state);
+}
+
+static inline void atomic_stop_thread_idle(void)
+{
+ int cpu = raw_smp_processor_id();
+ int first = cpu_first_thread_sibling(cpu);
+ int thread_nr = cpu_thread_in_core(cpu);
+ unsigned long *state = &paca_ptrs[first]->idle_state;
+
+ set_bit(thread_nr, state);
+}
+
+static inline void atomic_lock_thread_idle(void)
{
+ int cpu = raw_smp_processor_id();
+ int first = cpu_first_thread_sibling(cpu);
+ unsigned long *lock = &paca_ptrs[first]->idle_lock;
+
+ while (unlikely(test_and_set_bit_lock(NR_PNV_CORE_IDLE_LOCK_BIT, lock)))
+ barrier();
+}
+
+static inline void atomic_unlock_and_stop_thread_idle(void)
+{
+ int cpu = raw_smp_processor_id();
+ int first = cpu_first_thread_sibling(cpu);
+ unsigned long thread = 1UL << cpu_thread_in_core(cpu);
+ unsigned long *state = &paca_ptrs[first]->idle_state;
+ unsigned long *lock = &paca_ptrs[first]->idle_lock;
+ u64 s = READ_ONCE(*state);
+ u64 new, tmp;
+
+ BUG_ON(!(READ_ONCE(*lock) & PNV_CORE_IDLE_LOCK_BIT));
+ BUG_ON(s & thread);
+
+again:
+ new = s | thread;
+ tmp = cmpxchg(state, s, new);
+ if (unlikely(tmp != s)) {
+ s = tmp;
+ goto again;
+ }
+ clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, lock);
+}
+
+static inline void atomic_unlock_thread_idle(void)
+{
+ int cpu = raw_smp_processor_id();
+ int first = cpu_first_thread_sibling(cpu);
+ unsigned long *lock = &paca_ptrs[first]->idle_lock;
+
+ BUG_ON(!test_bit(NR_PNV_CORE_IDLE_LOCK_BIT, lock));
+ clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, lock);
+}
+
+/* P7 and P8 */
+struct p7_sprs {
+ /* per core */
+ u64 tscr;
+ u64 worc;
+
+ /* per subcore */
+ u64 sdr1;
+ u64 rpr;
+
+ /* per thread */
+ u64 lpcr;
+ u64 hfscr;
+ u64 fscr;
+ u64 purr;
+ u64 spurr;
+ u64 dscr;
+ u64 wort;
+
+ /* per thread SPRs that get lost in shallow states */
+ u64 amr;
+ u64 iamr;
+ u64 uamor;
+ /* amor is restored to constant ~0 */
+};
+
+static unsigned long power7_idle_insn(unsigned long type)
+{
+ int cpu = raw_smp_processor_id();
+ int first = cpu_first_thread_sibling(cpu);
+ unsigned long *state = &paca_ptrs[first]->idle_state;
+ unsigned long thread = 1UL << cpu_thread_in_core(cpu);
+ unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
unsigned long srr1;
+ bool full_winkle;
+ struct p7_sprs sprs = {}; /* avoid false use-uninitialised */
+ bool sprs_saved = false;
+ int rc;
- if (!prep_irq_for_idle_irqsoff())
- return 0;
+ if (unlikely(type != PNV_THREAD_NAP)) {
+ atomic_lock_thread_idle();
+
+ BUG_ON(!(*state & thread));
+ *state &= ~thread;
+
+ if (power7_fastsleep_workaround_entry) {
+ if ((*state & core_thread_mask) == 0) {
+ rc = opal_config_cpu_idle_state(
+ OPAL_CONFIG_IDLE_FASTSLEEP,
+ OPAL_CONFIG_IDLE_APPLY);
+ BUG_ON(rc);
+ }
+ }
+
+ if (type == PNV_THREAD_WINKLE) {
+ sprs.tscr = mfspr(SPRN_TSCR);
+ sprs.worc = mfspr(SPRN_WORC);
+
+ sprs.sdr1 = mfspr(SPRN_SDR1);
+ sprs.rpr = mfspr(SPRN_RPR);
+
+ sprs.lpcr = mfspr(SPRN_LPCR);
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ sprs.hfscr = mfspr(SPRN_HFSCR);
+ sprs.fscr = mfspr(SPRN_FSCR);
+ }
+ sprs.purr = mfspr(SPRN_PURR);
+ sprs.spurr = mfspr(SPRN_SPURR);
+ sprs.dscr = mfspr(SPRN_DSCR);
+ sprs.wort = mfspr(SPRN_WORT);
+
+ sprs_saved = true;
+
+ /*
+ * Increment winkle counter and set all winkle bits if
+ * all threads are winkling. This allows wakeup side to
+ * distinguish between fast sleep and winkle state
+ * loss. Fast sleep still has to resync the timebase so
+ * this may not be a really big win.
+ */
+ *state += 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
+ if ((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS)
+ >> PNV_CORE_IDLE_WINKLE_COUNT_SHIFT
+ == threads_per_core)
+ *state |= PNV_CORE_IDLE_THREAD_WINKLE_BITS;
+ WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
+ }
+
+ atomic_unlock_thread_idle();
+ }
+
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ sprs.amr = mfspr(SPRN_AMR);
+ sprs.iamr = mfspr(SPRN_IAMR);
+ sprs.uamor = mfspr(SPRN_UAMOR);
+ }
+
+ local_paca->thread_idle_state = type;
+ srr1 = isa206_idle_insn_mayloss(type); /* go idle */
+ local_paca->thread_idle_state = PNV_THREAD_RUNNING;
+
+ WARN_ON_ONCE(!srr1);
+ WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
+
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
+ /*
+ * We don't need an isync after the mtsprs here because
+ * the upcoming mtmsrd is execution synchronizing.
+ */
+ mtspr(SPRN_AMR, sprs.amr);
+ mtspr(SPRN_IAMR, sprs.iamr);
+ mtspr(SPRN_AMOR, ~0);
+ mtspr(SPRN_UAMOR, sprs.uamor);
+ }
+ }
+
+ if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
+ hmi_exception_realmode(NULL);
+
+ if (likely((srr1 & SRR1_WAKESTATE) != SRR1_WS_HVLOSS)) {
+ if (unlikely(type != PNV_THREAD_NAP)) {
+ atomic_lock_thread_idle();
+ if (type == PNV_THREAD_WINKLE) {
+ WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
+ *state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
+ *state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT);
+ }
+ atomic_unlock_and_stop_thread_idle();
+ }
+ return srr1;
+ }
+
+ /* HV state loss */
+ BUG_ON(type == PNV_THREAD_NAP);
+
+ atomic_lock_thread_idle();
+
+ full_winkle = false;
+ if (type == PNV_THREAD_WINKLE) {
+ WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
+ *state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
+ if (*state & (thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT)) {
+ *state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT);
+ full_winkle = true;
+ BUG_ON(!sprs_saved);
+ }
+ }
+
+ WARN_ON(*state & thread);
+
+ if ((*state & core_thread_mask) != 0)
+ goto core_woken;
+
+ /* Per-core SPRs */
+ if (full_winkle) {
+ mtspr(SPRN_TSCR, sprs.tscr);
+ mtspr(SPRN_WORC, sprs.worc);
+ }
+
+ if (power7_fastsleep_workaround_exit) {
+ rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
+ OPAL_CONFIG_IDLE_UNDO);
+ BUG_ON(rc);
+ }
+
+ /* TB */
+ if (opal_resync_timebase() != OPAL_SUCCESS)
+ BUG();
+
+core_woken:
+ if (!full_winkle)
+ goto subcore_woken;
+
+ if ((*state & local_paca->subcore_sibling_mask) != 0)
+ goto subcore_woken;
+
+ /* Per-subcore SPRs */
+ mtspr(SPRN_SDR1, sprs.sdr1);
+ mtspr(SPRN_RPR, sprs.rpr);
+
+subcore_woken:
+ /*
+ * isync after restoring shared SPRs and before unlocking. Unlock
+ * only contains hwsync which does not necessarily do the right
+ * thing for SPRs.
+ */
+ isync();
+ atomic_unlock_and_stop_thread_idle();
+
+ /* Fast sleep does not lose SPRs */
+ if (!full_winkle)
+ return srr1;
+
+ /* Per-thread SPRs */
+ mtspr(SPRN_LPCR, sprs.lpcr);
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ mtspr(SPRN_HFSCR, sprs.hfscr);
+ mtspr(SPRN_FSCR, sprs.fscr);
+ }
+ mtspr(SPRN_PURR, sprs.purr);
+ mtspr(SPRN_SPURR, sprs.spurr);
+ mtspr(SPRN_DSCR, sprs.dscr);
+ mtspr(SPRN_WORT, sprs.wort);
+
+ mtspr(SPRN_SPRG3, local_paca->sprg_vdso);
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+ /*
+ * The SLB has to be restored here, but it sometimes still
+ * contains entries, so the __ variant must be used to prevent
+ * multi hits.
+ */
+ __slb_restore_bolted_realmode();
+#endif
+
+ return srr1;
+}
+
+extern unsigned long idle_kvm_start_guest(unsigned long srr1);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static unsigned long power7_offline(void)
+{
+ unsigned long srr1;
+
+ mtmsr(MSR_IDLE);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ /* Tell KVM we're entering idle. */
+ /******************************************************/
+ /* N O T E W E L L ! ! ! N O T E W E L L */
+ /* The following store to HSTATE_HWTHREAD_STATE(r13) */
+ /* MUST occur in real mode, i.e. with the MMU off, */
+ /* and the MMU must stay off until we clear this flag */
+ /* and test HSTATE_HWTHREAD_REQ(r13) in */
+ /* pnv_powersave_wakeup in this file. */
+ /* The reason is that another thread can switch the */
+ /* MMU to a guest context whenever this flag is set */
+ /* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */
+ /* that would potentially cause this thread to start */
+ /* executing instructions from guest memory in */
+ /* hypervisor mode, leading to a host crash or data */
+ /* corruption, or worse. */
+ /******************************************************/
+ local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
+#endif
__ppc64_runlatch_off();
- srr1 = power7_idle_insn(type);
+ srr1 = power7_idle_insn(power7_offline_type);
__ppc64_runlatch_on();
- fini_irq_for_idle_irqsoff();
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_KERNEL;
+ /* Order setting hwthread_state vs. testing hwthread_req */
+ smp_mb();
+ if (local_paca->kvm_hstate.hwthread_req)
+ srr1 = idle_kvm_start_guest(srr1);
+#endif
+
+ mtmsr(MSR_KERNEL);
return srr1;
}
+#endif
void power7_idle_type(unsigned long type)
{
unsigned long srr1;
- srr1 = __power7_idle_type(type);
+ if (!prep_irq_for_idle_irqsoff())
+ return;
+
+ mtmsr(MSR_IDLE);
+ __ppc64_runlatch_off();
+ srr1 = power7_idle_insn(type);
+ __ppc64_runlatch_on();
+ mtmsr(MSR_KERNEL);
+
+ fini_irq_for_idle_irqsoff();
irq_set_pending_from_srr1(srr1);
}
-void power7_idle(void)
+static void power7_idle(void)
{
if (!powersave_nap)
return;
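
The hunk above replaces the old assembly idle entry with C: each core keeps an idle_state word with one bit per thread plus a lock bit, and atomic_unlock_and_stop_thread_idle() republishes the caller's thread bit with a cmpxchg retry loop before releasing the lock. Below is a self-contained C11 sketch of the same pattern, illustrative only; the kernel code uses test_and_set_bit_lock() and cmpxchg() on fields in the paca rather than stdatomic, and the lock bit position here is arbitrary.

#include <stdatomic.h>
#include <stdio.h>

#define LOCK_BIT	(1UL << 28)	/* arbitrary bit chosen for this sketch */

static _Atomic unsigned long core_state;	/* one bit per thread, plus the lock bit */

static void lock_core(void)
{
	/* spin until the lock bit was observed clear and we set it */
	while (atomic_fetch_or_explicit(&core_state, LOCK_BIT,
					memory_order_acquire) & LOCK_BIT)
		;
}

static void unlock_and_mark_running(unsigned long thread_bit)
{
	unsigned long old = atomic_load_explicit(&core_state, memory_order_relaxed);
	unsigned long new;

	/* republish our thread bit, retrying if another thread raced us */
	do {
		new = old | thread_bit;
	} while (!atomic_compare_exchange_weak(&core_state, &old, new));

	/* drop the lock with release semantics */
	atomic_fetch_and_explicit(&core_state, ~LOCK_BIT, memory_order_release);
}

int main(void)
{
	lock_core();				/* what a waking thread does first */
	unlock_and_mark_running(1UL << 2);	/* thread 2 is running again */
	printf("core_state = %#lx\n", atomic_load(&core_state));
	return 0;
}
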
@@ -347,42 +576,232 @@ void power7_idle(void)
power7_idle_type(PNV_THREAD_NAP);
}
-static unsigned long __power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask)
+struct p9_sprs {
+ /* per core */
+ u64 ptcr;
+ u64 rpr;
+ u64 tscr;
+ u64 ldbar;
+
+ /* per thread */
+ u64 lpcr;
+ u64 hfscr;
+ u64 fscr;
+ u64 pid;
+ u64 purr;
+ u64 spurr;
+ u64 dscr;
+ u64 ciabr;
+
+ u64 mmcra;
+ u32 mmcr0;
+ u32 mmcr1;
+ u64 mmcr2;
+
+ /* per thread SPRs that get lost in shallow states */
+ u64 amr;
+ u64 iamr;
+ u64 amor;
+ u64 uamor;
+};
+
+static unsigned long power9_idle_stop(unsigned long psscr)
{
- unsigned long psscr;
+ int cpu = raw_smp_processor_id();
+ int first = cpu_first_thread_sibling(cpu);
+ unsigned long *state = &paca_ptrs[first]->idle_state;
+ unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
unsigned long srr1;
+ unsigned long pls;
+ unsigned long mmcr0 = 0;
+ unsigned long mmcra = 0;
+ struct p9_sprs sprs = {}; /* avoid false used-uninitialised */
+ bool sprs_saved = false;
- if (!prep_irq_for_idle_irqsoff())
- return 0;
+ if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
+ /* EC=ESL=0 case */
+
+ /*
+ * Wake synchronously. SRESET via xscom may still cause
+ * a 0x100 powersave wakeup with SRR1 reason!
+ */
+ srr1 = isa300_idle_stop_noloss(psscr); /* go idle */
+ if (likely(!srr1))
+ return 0;
+
+ /*
+ * Registers not saved, can't recover!
+ * This would be a hardware bug
+ */
+ BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS);
+
+ goto out;
+ }
+
+ /* EC=ESL=1 case */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ if (cpu_has_feature(CPU_FTR_P9_TM_XER_SO_BUG)) {
+ local_paca->requested_psscr = psscr;
+ /* order setting requested_psscr vs testing dont_stop */
+ smp_mb();
+ if (atomic_read(&local_paca->dont_stop)) {
+ local_paca->requested_psscr = 0;
+ return 0;
+ }
+ }
+#endif
+
+ if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
+ /*
+ * POWER9 DD2 can incorrectly set PMAO when waking up
+ * after a state-loss idle. Saving and restoring MMCR0
+ * over idle is a workaround.
+ */
+ mmcr0 = mfspr(SPRN_MMCR0);
+ }
+
+ if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) {
+ sprs.lpcr = mfspr(SPRN_LPCR);
+ sprs.hfscr = mfspr(SPRN_HFSCR);
+ sprs.fscr = mfspr(SPRN_FSCR);
+ sprs.pid = mfspr(SPRN_PID);
+ sprs.purr = mfspr(SPRN_PURR);
+ sprs.spurr = mfspr(SPRN_SPURR);
+ sprs.dscr = mfspr(SPRN_DSCR);
+ sprs.ciabr = mfspr(SPRN_CIABR);
+
+ sprs.mmcra = mfspr(SPRN_MMCRA);
+ sprs.mmcr0 = mfspr(SPRN_MMCR0);
+ sprs.mmcr1 = mfspr(SPRN_MMCR1);
+ sprs.mmcr2 = mfspr(SPRN_MMCR2);
+
+ sprs.ptcr = mfspr(SPRN_PTCR);
+ sprs.rpr = mfspr(SPRN_RPR);
+ sprs.tscr = mfspr(SPRN_TSCR);
+ if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+ sprs.ldbar = mfspr(SPRN_LDBAR);
+
+ sprs_saved = true;
+
+ atomic_start_thread_idle();
+ }
+
+ sprs.amr = mfspr(SPRN_AMR);
+ sprs.iamr = mfspr(SPRN_IAMR);
+ sprs.uamor = mfspr(SPRN_UAMOR);
+
+ srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ local_paca->requested_psscr = 0;
+#endif
psscr = mfspr(SPRN_PSSCR);
- psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
- __ppc64_runlatch_off();
- srr1 = power9_idle_stop(psscr);
- __ppc64_runlatch_on();
+ WARN_ON_ONCE(!srr1);
+ WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
- fini_irq_for_idle_irqsoff();
+ if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
+ /*
+ * We don't need an isync after the mtsprs here because the
+ * upcoming mtmsrd is execution synchronizing.
+ */
+ mtspr(SPRN_AMR, sprs.amr);
+ mtspr(SPRN_IAMR, sprs.iamr);
+ mtspr(SPRN_AMOR, ~0);
+ mtspr(SPRN_UAMOR, sprs.uamor);
- return srr1;
-}
+ /*
+ * Workaround for POWER9 DD2.0, if we lost resources, the ERAT
+ * might have been corrupted and needs flushing. We also need
+ * to reload MMCR0 (see mmcr0 comment above).
+ */
+ if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT);
+ mtspr(SPRN_MMCR0, mmcr0);
+ }
-void power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask)
-{
- unsigned long srr1;
+ /*
+ * DD2.2 and earlier need to set then clear bit 60 in MMCRA
+ * to ensure the PMU starts running.
+ */
+ mmcra = mfspr(SPRN_MMCRA);
+ mmcra |= PPC_BIT(60);
+ mtspr(SPRN_MMCRA, mmcra);
+ mmcra &= ~PPC_BIT(60);
+ mtspr(SPRN_MMCRA, mmcra);
+ }
- srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask);
- irq_set_pending_from_srr1(srr1);
-}
+ if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
+ hmi_exception_realmode(NULL);
-/*
- * Used for ppc_md.power_save which needs a function with no parameters
- */
-void power9_idle(void)
-{
- power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
+ /*
+ * On POWER9, SRR1 bits do not match exactly as expected.
+ * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so
+ * just always test PSSCR for SPR/TB state loss.
+ */
+ pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
+ if (likely(pls < deep_spr_loss_state)) {
+ if (sprs_saved)
+ atomic_stop_thread_idle();
+ goto out;
+ }
+
+ /* HV state loss */
+ BUG_ON(!sprs_saved);
+
+ atomic_lock_thread_idle();
+
+ if ((*state & core_thread_mask) != 0)
+ goto core_woken;
+
+ /* Per-core SPRs */
+ mtspr(SPRN_PTCR, sprs.ptcr);
+ mtspr(SPRN_RPR, sprs.rpr);
+ mtspr(SPRN_TSCR, sprs.tscr);
+
+ if (pls >= pnv_first_tb_loss_level) {
+ /* TB loss */
+ if (opal_resync_timebase() != OPAL_SUCCESS)
+ BUG();
+ }
+
+ /*
+ * isync after restoring shared SPRs and before unlocking. Unlock
+ * only contains hwsync which does not necessarily do the right
+ * thing for SPRs.
+ */
+ isync();
+
+core_woken:
+ atomic_unlock_and_stop_thread_idle();
+
+ /* Per-thread SPRs */
+ mtspr(SPRN_LPCR, sprs.lpcr);
+ mtspr(SPRN_HFSCR, sprs.hfscr);
+ mtspr(SPRN_FSCR, sprs.fscr);
+ mtspr(SPRN_PID, sprs.pid);
+ mtspr(SPRN_PURR, sprs.purr);
+ mtspr(SPRN_SPURR, sprs.spurr);
+ mtspr(SPRN_DSCR, sprs.dscr);
+ mtspr(SPRN_CIABR, sprs.ciabr);
+
+ mtspr(SPRN_MMCRA, sprs.mmcra);
+ mtspr(SPRN_MMCR0, sprs.mmcr0);
+ mtspr(SPRN_MMCR1, sprs.mmcr1);
+ mtspr(SPRN_MMCR2, sprs.mmcr2);
+ if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+ mtspr(SPRN_LDBAR, sprs.ldbar);
+
+ mtspr(SPRN_SPRG3, local_paca->sprg_vdso);
+
+ if (!radix_enabled())
+ __slb_restore_bolted_realmode();
+
+out:
+ mtmsr(MSR_KERNEL);
+
+ return srr1;
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
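
The callers of power9_idle_stop() merge the requested stop level into PSSCR with (psscr & ~mask) | val before executing stop, and on wakeup power9_idle_stop() above reads back the hardware-set Power-saving Level Status (PLS) field to decide whether SPRs and the timebase need restoring. A short sketch of that bit manipulation; the field constants are written out to mirror the kernel's asm/reg.h layout and should be treated as assumptions of this example.

#include <stdio.h>
#include <stdint.h>

/* Field layout assumed to mirror asm/reg.h, reproduced for illustration. */
#define PSSCR_RL_MASK	0x000000000000000FULL	/* Requested Level */
#define PSSCR_PLS	0xF000000000000000ULL	/* Power-saving Level Status */
#define PSSCR_PLS_SHIFT	60

int main(void)
{
	uint64_t psscr = 0;			/* current SPR value */
	uint64_t stop_val = 0x3, stop_mask = PSSCR_RL_MASK;
	uint64_t deep_spr_loss_state = 4;	/* first level that loses SPRs (illustrative) */

	/* what the callers do before executing stop */
	psscr = (psscr & ~stop_mask) | stop_val;

	/* pretend hardware reported on wakeup that the core reached level 5 */
	psscr |= 5ULL << PSSCR_PLS_SHIFT;

	/* what power9_idle_stop() checks after wakeup */
	uint64_t pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
	printf("requested RL=%llu, woke with PLS=%llu -> %s\n",
	       (unsigned long long)(psscr & PSSCR_RL_MASK),
	       (unsigned long long)pls,
	       pls < deep_spr_loss_state ? "no SPR restore needed" : "restore SPRs");
	return 0;
}
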
@@ -409,7 +828,7 @@ void pnv_power9_force_smt4_catch(void)
atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop);
}
/* order setting dont_stop vs testing requested_psscr */
- mb();
+ smp_mb();
for (thr = 0; thr < threads_per_core; ++thr) {
if (!paca_ptrs[cpu0+thr]->requested_psscr)
++awake_threads;
@@ -457,8 +876,168 @@ void pnv_power9_force_smt4_release(void)
EXPORT_SYMBOL_GPL(pnv_power9_force_smt4_release);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+struct p10_sprs {
+ /*
+ * SPRs that get lost in shallow states:
+ *
+ * P10 loses CR, LR, CTR, FPSCR, VSCR, XER, TAR, SPRG2, and HSPRG1
+ * isa300 idle routines restore CR, LR.
+ * CTR is volatile
+ * idle thread doesn't use FP or VEC
+ * kernel doesn't use TAR
+ * HSPRG1 is only live in HV interrupt entry
+ * SPRG2 is only live in KVM guests, KVM handles it.
+ */
+};
+
+static unsigned long power10_idle_stop(unsigned long psscr)
+{
+ int cpu = raw_smp_processor_id();
+ int first = cpu_first_thread_sibling(cpu);
+ unsigned long *state = &paca_ptrs[first]->idle_state;
+ unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
+ unsigned long srr1;
+ unsigned long pls;
+// struct p10_sprs sprs = {}; /* avoid false used-uninitialised */
+ bool sprs_saved = false;
+
+ if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
+ /* EC=ESL=0 case */
+
+ /*
+ * Wake synchronously. SRESET via xscom may still cause
+ * a 0x100 powersave wakeup with SRR1 reason!
+ */
+ srr1 = isa300_idle_stop_noloss(psscr); /* go idle */
+ if (likely(!srr1))
+ return 0;
+
+ /*
+ * Registers not saved, can't recover!
+ * This would be a hardware bug
+ */
+ BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS);
+
+ goto out;
+ }
+
+ /* EC=ESL=1 case */
+ if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) {
+ /* XXX: save SPRs for deep state loss here. */
+
+ sprs_saved = true;
+
+ atomic_start_thread_idle();
+ }
+
+ srr1 = isa300_idle_stop_mayloss(psscr); /* go idle */
+
+ psscr = mfspr(SPRN_PSSCR);
+
+ WARN_ON_ONCE(!srr1);
+ WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
+
+ if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
+ hmi_exception_realmode(NULL);
+
+ /*
+ * On POWER10, SRR1 bits do not match exactly as expected.
+ * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so
+ * just always test PSSCR for SPR/TB state loss.
+ */
+ pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
+ if (likely(pls < deep_spr_loss_state)) {
+ if (sprs_saved)
+ atomic_stop_thread_idle();
+ goto out;
+ }
+
+ /* HV state loss */
+ BUG_ON(!sprs_saved);
+
+ atomic_lock_thread_idle();
+
+ if ((*state & core_thread_mask) != 0)
+ goto core_woken;
+
+ /* XXX: restore per-core SPRs here */
+
+ if (pls >= pnv_first_tb_loss_level) {
+ /* TB loss */
+ if (opal_resync_timebase() != OPAL_SUCCESS)
+ BUG();
+ }
+
+ /*
+ * isync after restoring shared SPRs and before unlocking. Unlock
+ * only contains hwsync which does not necessarily do the right
+ * thing for SPRs.
+ */
+ isync();
+
+core_woken:
+ atomic_unlock_and_stop_thread_idle();
+
+ /* XXX: restore per-thread SPRs here */
+
+ if (!radix_enabled())
+ __slb_restore_bolted_realmode();
+
+out:
+ mtmsr(MSR_KERNEL);
+
+ return srr1;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static unsigned long arch300_offline_stop(unsigned long psscr)
+{
+ unsigned long srr1;
+
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ srr1 = power10_idle_stop(psscr);
+ else
+ srr1 = power9_idle_stop(psscr);
+
+ return srr1;
+}
+#endif
+
+void arch300_idle_type(unsigned long stop_psscr_val,
+ unsigned long stop_psscr_mask)
+{
+ unsigned long psscr;
+ unsigned long srr1;
+
+ if (!prep_irq_for_idle_irqsoff())
+ return;
+
+ psscr = mfspr(SPRN_PSSCR);
+ psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
+
+ __ppc64_runlatch_off();
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
+ srr1 = power10_idle_stop(psscr);
+ else
+ srr1 = power9_idle_stop(psscr);
+ __ppc64_runlatch_on();
+
+ fini_irq_for_idle_irqsoff();
+
+ irq_set_pending_from_srr1(srr1);
+}
+
+/*
+ * Used for ppc_md.power_save which needs a function with no parameters
+ */
+static void arch300_idle(void)
+{
+ arch300_idle_type(pnv_default_stop_val, pnv_default_stop_mask);
+}
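arch300_idle_type() builds the PSSCR to request by clearing the fields covered by the state's mask and OR-ing in the state's value, so only the fields the platform describes are touched. A worked example with made-up numbers (the real val/mask pairs come from the device tree):

/* Made-up numbers: request RL = 3 and leave every other PSSCR field alone. */
static unsigned long example_compose_psscr(unsigned long cur_psscr)
{
	unsigned long stop_psscr_val  = 0x3;           /* requested stop level */
	unsigned long stop_psscr_mask = PSSCR_RL_MASK; /* only the RL field    */

	return (cur_psscr & ~stop_psscr_mask) | stop_psscr_val;
}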
+
#ifdef CONFIG_HOTPLUG_CPU
-static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
+
+void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
{
u64 pir = get_hard_smp_processor_id(cpu);
@@ -480,21 +1059,6 @@ static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
unsigned long pnv_cpu_offline(unsigned int cpu)
{
unsigned long srr1;
- u32 idle_states = pnv_get_supported_cpuidle_states();
- u64 lpcr_val;
-
- /*
- * We don't want to take decrementer interrupts while we are
- * offline, so clear LPCR:PECE1. We keep PECE2 (and
- * LPCR_PECE_HVEE on P9) enabled as to let IPIs in.
- *
- * If the CPU gets woken up by a special wakeup, ensure that
- * the SLW engine sets LPCR with decrementer bit cleared, else
- * the CPU will come back to the kernel due to a spurious
- * wakeup.
- */
- lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
- pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
__ppc64_runlatch_off();
@@ -504,16 +1068,9 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
psscr = mfspr(SPRN_PSSCR);
psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
pnv_deepest_stop_psscr_val;
- srr1 = power9_offline_stop(psscr);
-
- } else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
- (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
- srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
- } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
- (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
- srr1 = power7_idle_insn(PNV_THREAD_SLEEP);
- } else if (idle_states & OPAL_PM_NAP_ENABLED) {
- srr1 = power7_idle_insn(PNV_THREAD_NAP);
+ srr1 = arch300_offline_stop(psscr);
+ } else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) {
+ srr1 = power7_offline();
} else {
/* This is the fallback method. We emulate snooze */
while (!generic_check_cpu_restart(cpu)) {
@@ -526,16 +1083,6 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
__ppc64_runlatch_on();
- /*
- * Re-enable decrementer interrupts in LPCR.
- *
- * Further, we want stop states to be woken up by decrementer
- * for non-hotplug cases. So program the LPCR via stop api as
- * well.
- */
- lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1;
- pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
-
return srr1;
}
#endif
@@ -577,7 +1124,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
* stop instruction
*/
-int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
+int __init validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
{
int err = 0;
@@ -619,33 +1166,53 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
* @dt_idle_states: Number of idle state entries
* Returns 0 on success
*/
-static int __init pnv_power9_idle_init(void)
+static void __init pnv_arch300_idle_init(void)
{
u64 max_residency_ns = 0;
int i;
+	/* stop is not really architected; we only have P9 and P10 drivers */
+ if (!pvr_version_is(PVR_POWER10) && !pvr_version_is(PVR_POWER9))
+ return;
+
/*
- * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
- * and the pnv_default_stop_{val,mask}.
- *
- * pnv_first_deep_stop_state should be set to the first stop
- * level to cause hypervisor state loss.
- *
* pnv_deepest_stop_{val,mask} should be set to values corresponding to
* the deepest stop state.
*
* pnv_default_stop_{val,mask} should be set to values corresponding to
- * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
+ * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
*/
- pnv_first_deep_stop_state = MAX_STOP_STATE;
+ pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
+ deep_spr_loss_state = MAX_STOP_STATE + 1;
for (i = 0; i < nr_pnv_idle_states; i++) {
int err;
struct pnv_idle_states_t *state = &pnv_idle_states[i];
u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;
+ /* No deep loss driver implemented for POWER10 yet */
+ if (pvr_version_is(PVR_POWER10) &&
+ state->flags & (OPAL_PM_TIMEBASE_STOP|OPAL_PM_LOSE_FULL_CONTEXT))
+ continue;
+
+ if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+ (pnv_first_tb_loss_level > psscr_rl))
+ pnv_first_tb_loss_level = psscr_rl;
+
if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
- pnv_first_deep_stop_state > psscr_rl)
- pnv_first_deep_stop_state = psscr_rl;
+ (deep_spr_loss_state > psscr_rl))
+ deep_spr_loss_state = psscr_rl;
+
+ /*
+ * The idle code does not deal with TB loss occurring
+ * in a shallower state than SPR loss, so force it to
+ * behave like SPRs are lost if TB is lost. POWER9 would
+ * never encounter this, but a POWER8 core would if it
+ * implemented the stop instruction. So this is for forward
+ * compatibility.
+ */
+ if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
+ (deep_spr_loss_state > psscr_rl))
+ deep_spr_loss_state = psscr_rl;
err = validate_psscr_val_mask(&state->psscr_val,
&state->psscr_mask,
@@ -670,13 +1237,14 @@ static int __init pnv_power9_idle_init(void)
pnv_default_stop_val = state->psscr_val;
pnv_default_stop_mask = state->psscr_mask;
default_stop_found = true;
+ WARN_ON(state->flags & OPAL_PM_LOSE_FULL_CONTEXT);
}
}
if (unlikely(!default_stop_found)) {
pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
} else {
- ppc_md.power_save = power9_idle;
+ ppc_md.power_save = arch300_idle;
pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
pnv_default_stop_val, pnv_default_stop_mask);
}
@@ -689,10 +1257,40 @@ static int __init pnv_power9_idle_init(void)
pnv_deepest_stop_psscr_mask);
}
- pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
- pnv_first_deep_stop_state);
+ pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n",
+ deep_spr_loss_state);
- return 0;
+ pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n",
+ pnv_first_tb_loss_level);
+}
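pnv_arch300_idle_init() boils the device-tree flags down to two thresholds, deep_spr_loss_state and pnv_first_tb_loss_level, and the wakeup path only ever compares a state's requested level (or the PSSCR PLS field) against them. A small sketch of that comparison, using hypothetical helper names:

/* Hypothetical helpers: classify a requested level against the thresholds
 * computed above. The wakeup code open-codes these comparisons. */
static bool example_level_may_lose_sprs(u64 psscr_rl)
{
	return psscr_rl >= deep_spr_loss_state;
}

static bool example_level_may_lose_tb(u64 psscr_rl)
{
	return psscr_rl >= pnv_first_tb_loss_level;
}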
+
+static void __init pnv_disable_deep_states(void)
+{
+ /*
+ * The stop-api is unable to restore hypervisor
+ * resources on wakeup from platform idle states which
+ * lose full context. So disable such states.
+ */
+ supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+ pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
+ pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+ (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+ /*
+ * Use the default stop state for CPU-Hotplug
+ * if available.
+ */
+ if (default_stop_found) {
+ pnv_deepest_stop_psscr_val = pnv_default_stop_val;
+ pnv_deepest_stop_psscr_mask = pnv_default_stop_mask;
+ pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
+ pnv_deepest_stop_psscr_val);
+ } else { /* Fallback to snooze loop for CPU-Hotplug */
+ deepest_stop_found = false;
+ pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
+ }
+ }
}
/*
@@ -707,10 +1305,8 @@ static void __init pnv_probe_idle_states(void)
return;
}
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- if (pnv_power9_idle_init())
- return;
- }
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ pnv_arch300_idle_init();
for (i = 0; i < nr_pnv_idle_states; i++)
supported_cpuidle_states |= pnv_idle_states[i].flags;
@@ -722,7 +1318,7 @@ static void __init pnv_probe_idle_states(void)
* which is the number of cpuidle states discovered through device-tree.
*/
-static int pnv_parse_cpuidle_dt(void)
+static int __init pnv_parse_cpuidle_dt(void)
{
struct device_node *np;
int nr_idle_states, i;
@@ -774,14 +1370,14 @@ static int pnv_parse_cpuidle_dt(void)
/* Read residencies */
if (of_property_read_u32_array(np, "ibm,cpu-idle-state-residency-ns",
temp_u32, nr_idle_states)) {
- pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-latencies-ns in DT\n");
+ pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n");
rc = -EINVAL;
goto out;
}
for (i = 0; i < nr_idle_states; i++)
pnv_idle_states[i].residency_ns = temp_u32[i];
- /* For power9 */
+ /* For power9 and later */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
/* Read pm_crtl_val */
if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr",
@@ -817,7 +1413,7 @@ static int pnv_parse_cpuidle_dt(void)
goto out;
}
for (i = 0; i < nr_idle_states; i++)
- strlcpy(pnv_idle_states[i].name, temp_string[i],
+ strscpy(pnv_idle_states[i].name, temp_string[i],
PNV_IDLE_NAME_LEN);
nr_pnv_idle_states = nr_idle_states;
rc = 0;
@@ -825,16 +1421,39 @@ out:
kfree(temp_u32);
kfree(temp_u64);
kfree(temp_string);
+ of_node_put(np);
return rc;
}
static int __init pnv_init_idle_states(void)
{
+ int cpu;
int rc = 0;
- supported_cpuidle_states = 0;
+
+ /* Set up PACA fields */
+ for_each_present_cpu(cpu) {
+ struct paca_struct *p = paca_ptrs[cpu];
+
+ p->idle_state = 0;
+ if (cpu == cpu_first_thread_sibling(cpu))
+ p->idle_state = (1 << threads_per_core) - 1;
+
+ if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+ /* P7/P8 nap */
+ p->thread_idle_state = PNV_THREAD_RUNNING;
+ } else if (pvr_version_is(PVR_POWER9)) {
+ /* P9 stop workarounds */
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ p->requested_psscr = 0;
+ atomic_set(&p->dont_stop, 0);
+#endif
+ }
+ }
/* In case we error out nr_pnv_idle_states will be zero */
nr_pnv_idle_states = 0;
+ supported_cpuidle_states = 0;
+
if (cpuidle_disable != IDLE_NO_OVERRIDE)
goto out;
rc = pnv_parse_cpuidle_dt();
@@ -842,27 +1461,45 @@ static int __init pnv_init_idle_states(void)
return rc;
pnv_probe_idle_states();
- if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
- patch_instruction(
- (unsigned int *)pnv_fastsleep_workaround_at_entry,
- PPC_INST_NOP);
- patch_instruction(
- (unsigned int *)pnv_fastsleep_workaround_at_exit,
- PPC_INST_NOP);
- } else {
- /*
- * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
- * workaround is needed to use fastsleep. Provide sysfs
- * control to choose how this workaround has to be applied.
- */
- device_create_file(cpu_subsys.dev_root,
- &dev_attr_fastsleep_workaround_applyonce);
- }
+ if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+ power7_fastsleep_workaround_entry = false;
+ power7_fastsleep_workaround_exit = false;
+ } else {
+ struct device *dev_root;
+ /*
+ * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
+ * workaround is needed to use fastsleep. Provide sysfs
+ * control to choose how this workaround has to be
+ * applied.
+ */
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ device_create_file(dev_root,
+ &dev_attr_fastsleep_workaround_applyonce);
+ put_device(dev_root);
+ }
+ }
+
+ update_subcore_sibling_mask();
- pnv_alloc_idle_core_states();
+ if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) {
+ ppc_md.power_save = power7_idle;
+ power7_offline_type = PNV_THREAD_NAP;
+ }
+
+ if ((supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) &&
+ (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT))
+ power7_offline_type = PNV_THREAD_WINKLE;
+ else if ((supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED) ||
+ (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1))
+ power7_offline_type = PNV_THREAD_SLEEP;
+ }
- if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
- ppc_md.power_save = power7_idle;
+ if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
+ if (pnv_save_sprs_for_deep_states())
+ pnv_disable_deep_states();
+ }
out:
return 0;
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 84d038ed3882..2ea30b343354 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) IBM Corporation, 2014, 2017
* Anton Blanchard, Rashmica Gupta.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#define pr_fmt(fmt) "memtrace: " fmt
@@ -20,8 +16,9 @@
#include <linux/slab.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
+#include <linux/numa.h>
#include <asm/machdep.h>
-#include <asm/debugfs.h>
+#include <asm/cacheflush.h>
/* This enables us to keep track of the memory removed from each node. */
struct memtrace_entry {
@@ -33,6 +30,7 @@ struct memtrace_entry {
char name[16];
};
+static DEFINE_MUTEX(memtrace_mutex);
static u64 memtrace_size;
static struct memtrace_entry *memtrace_array;
@@ -47,90 +45,87 @@ static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, count, ppos, ent->mem, ent->size);
}
-static const struct file_operations memtrace_fops = {
- .llseek = default_llseek,
- .read = memtrace_read,
- .open = simple_open,
-};
-
-static int check_memblock_online(struct memory_block *mem, void *arg)
+static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
{
- if (mem->state != MEM_ONLINE)
- return -1;
-
- return 0;
-}
+ struct memtrace_entry *ent = filp->private_data;
+ unsigned long ent_nrpages = ent->size >> PAGE_SHIFT;
+ unsigned long vma_nrpages = vma_pages(vma);
-static int change_memblock_state(struct memory_block *mem, void *arg)
-{
- unsigned long state = (unsigned long)arg;
+ /* The requested page offset should be within object's page count */
+ if (vma->vm_pgoff >= ent_nrpages)
+ return -EINVAL;
- mem->state = state;
+ /* The requested mapping range should remain within the bounds */
+ if (vma_nrpages > ent_nrpages - vma->vm_pgoff)
+ return -EINVAL;
- return 0;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ return remap_pfn_range(vma, vma->vm_start, PHYS_PFN(ent->start) + vma->vm_pgoff,
+ vma->vm_end - vma->vm_start, vma->vm_page_prot);
}
-/* called with device_hotplug_lock held */
-static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
-{
- u64 end_pfn = start_pfn + nr_pages - 1;
-
- if (walk_memory_range(start_pfn, end_pfn, NULL,
- check_memblock_online))
- return false;
+static const struct file_operations memtrace_fops = {
+ .llseek = default_llseek,
+ .read = memtrace_read,
+ .open = simple_open,
+ .mmap = memtrace_mmap,
+};
- walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
- change_memblock_state);
+#define FLUSH_CHUNK_SIZE SZ_1G
+/**
+ * flush_dcache_range_chunked(): Write any modified data cache blocks out to
+ * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE
+ * Does not invalidate the corresponding instruction cache blocks.
+ *
+ * @start: the start address
+ * @stop: the stop address (exclusive)
+ * @chunk: the max size of the chunks
+ */
+static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
+ unsigned long chunk)
+{
+ unsigned long i;
- if (offline_pages(start_pfn, nr_pages)) {
- walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
- change_memblock_state);
- return false;
+ for (i = start; i < stop; i += chunk) {
+ flush_dcache_range(i, min(stop, i + chunk));
+ cond_resched();
}
-
- walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
- change_memblock_state);
-
-
- return true;
}
static u64 memtrace_alloc_node(u32 nid, u64 size)
{
- u64 start_pfn, end_pfn, nr_pages, pfn;
- u64 base_pfn;
- u64 bytes = memory_block_size_bytes();
+ const unsigned long nr_pages = PHYS_PFN(size);
+ unsigned long pfn, start_pfn;
+ struct page *page;
- if (!node_spanned_pages(nid))
+ /*
+ * Trace memory needs to be aligned to the size, which is guaranteed
+ * by alloc_contig_pages().
+ */
+ page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE |
+ __GFP_NOWARN | __GFP_ZERO, nid, NULL);
+ if (!page)
return 0;
+ start_pfn = page_to_pfn(page);
- start_pfn = node_start_pfn(nid);
- end_pfn = node_end_pfn(nid);
- nr_pages = size >> PAGE_SHIFT;
-
- /* Trace memory needs to be aligned to the size */
- end_pfn = round_down(end_pfn - nr_pages, nr_pages);
-
- lock_device_hotplug();
- for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
- if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
- /*
- * Remove memory in memory block size chunks so that
- * iomem resources are always split to the same size and
- * we never try to remove memory that spans two iomem
- * resources.
- */
- end_pfn = base_pfn + nr_pages;
- for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) {
- __remove_memory(nid, pfn << PAGE_SHIFT, bytes);
- }
- unlock_device_hotplug();
- return base_pfn << PAGE_SHIFT;
- }
- }
- unlock_device_hotplug();
+ /*
+	 * Before we go ahead and use this range as a cache-inhibited range,
+ * flush the cache.
+ */
+ flush_dcache_range_chunked((unsigned long)pfn_to_kaddr(start_pfn),
+ (unsigned long)pfn_to_kaddr(start_pfn + nr_pages),
+ FLUSH_CHUNK_SIZE);
- return 0;
+ /*
+ * Set pages PageOffline(), to indicate that nobody (e.g., hibernation,
+ * dumping, ...) should be touching these pages.
+ */
+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
+ __SetPageOffline(pfn_to_page(pfn));
+
+ arch_remove_linear_mapping(PFN_PHYS(start_pfn), size);
+
+ return PFN_PHYS(start_pfn);
}
static int memtrace_init_regions_runtime(u64 size)
@@ -190,14 +185,9 @@ static int memtrace_init_debugfs(void)
snprintf(ent->name, 16, "%08x", ent->nid);
dir = debugfs_create_dir(ent->name, memtrace_debugfs_dir);
- if (!dir) {
- pr_err("Failed to create debugfs directory for node %d\n",
- ent->nid);
- return -1;
- }
ent->dir = dir;
- debugfs_create_file("trace", 0400, dir, ent, &memtrace_fops);
+ debugfs_create_file_unsafe("trace", 0600, dir, ent, &memtrace_fops);
debugfs_create_x64("start", 0400, dir, &ent->start);
debugfs_create_x64("size", 0400, dir, &ent->size);
}
@@ -205,16 +195,30 @@ static int memtrace_init_debugfs(void)
return ret;
}
-static int online_mem_block(struct memory_block *mem, void *arg)
+static int memtrace_free(int nid, u64 start, u64 size)
{
- return device_online(&mem->dev);
+ struct mhp_params params = { .pgprot = PAGE_KERNEL };
+ const unsigned long nr_pages = PHYS_PFN(size);
+ const unsigned long start_pfn = PHYS_PFN(start);
+ unsigned long pfn;
+ int ret;
+
+ ret = arch_create_linear_mapping(nid, start, size, &params);
+ if (ret)
+ return ret;
+
+ for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++)
+ __ClearPageOffline(pfn_to_page(pfn));
+
+ free_contig_range(start_pfn, nr_pages);
+ return 0;
}
/*
- * Iterate through the chunks of memory we have removed from the kernel
- * and attempt to add them back to the kernel.
+ * Iterate through the chunks of memory we allocated and attempt to expose
+ * them back to the kernel.
*/
-static int memtrace_online(void)
+static int memtrace_free_regions(void)
{
int i, ret = 0;
struct memtrace_entry *ent;
@@ -222,8 +226,8 @@ static int memtrace_online(void)
for (i = memtrace_array_nr - 1; i >= 0; i--) {
ent = &memtrace_array[i];
- /* We have onlined this chunk previously */
- if (ent->nid == -1)
+ /* We have freed this chunk previously */
+ if (ent->nid == NUMA_NO_NODE)
continue;
/* Remove from io mappings */
@@ -232,37 +236,25 @@ static int memtrace_online(void)
ent->mem = 0;
}
- if (add_memory(ent->nid, ent->start, ent->size)) {
- pr_err("Failed to add trace memory to node %d\n",
+ if (memtrace_free(ent->nid, ent->start, ent->size)) {
+ pr_err("Failed to free trace memory on node %d\n",
ent->nid);
ret += 1;
continue;
}
/*
- * If kernel isn't compiled with the auto online option
- * we need to online the memory ourselves.
- */
- if (!memhp_auto_online) {
- lock_device_hotplug();
- walk_memory_range(PFN_DOWN(ent->start),
- PFN_UP(ent->start + ent->size - 1),
- NULL, online_mem_block);
- unlock_device_hotplug();
- }
-
- /*
- * Memory was added successfully so clean up references to it
- * so on reentry we can tell that this chunk was added.
+ * Memory was freed successfully so clean up references to it
+ * so on reentry we can tell that this chunk was freed.
*/
debugfs_remove_recursive(ent->dir);
- pr_info("Added trace memory back to node %d\n", ent->nid);
- ent->size = ent->start = ent->nid = -1;
+ pr_info("Freed trace memory back on node %d\n", ent->nid);
+ ent->size = ent->start = ent->nid = NUMA_NO_NODE;
}
if (ret)
return ret;
- /* If all chunks of memory were added successfully, reset globals */
+ /* If all chunks of memory were freed successfully, reset globals */
kfree(memtrace_array);
memtrace_array = NULL;
memtrace_size = 0;
@@ -272,6 +264,7 @@ static int memtrace_online(void)
static int memtrace_enable_set(void *data, u64 val)
{
+ int rc = -EAGAIN;
u64 bytes;
/*
@@ -284,25 +277,29 @@ static int memtrace_enable_set(void *data, u64 val)
return -EINVAL;
}
- /* Re-add/online previously removed/offlined memory */
- if (memtrace_size) {
- if (memtrace_online())
- return -EAGAIN;
- }
+ mutex_lock(&memtrace_mutex);
- if (!val)
- return 0;
+ /* Free all previously allocated memory. */
+ if (memtrace_size && memtrace_free_regions())
+ goto out_unlock;
+
+ if (!val) {
+ rc = 0;
+ goto out_unlock;
+ }
- /* Offline and remove memory */
+ /* Allocate memory. */
if (memtrace_init_regions_runtime(val))
- return -EINVAL;
+ goto out_unlock;
if (memtrace_init_debugfs())
- return -EINVAL;
+ goto out_unlock;
memtrace_size = val;
-
- return 0;
+ rc = 0;
+out_unlock:
+ mutex_unlock(&memtrace_mutex);
+ return rc;
}
static int memtrace_enable_get(void *data, u64 *val)
@@ -317,9 +314,7 @@ DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get,
static int memtrace_init(void)
{
memtrace_debugfs_dir = debugfs_create_dir("memtrace",
- powerpc_debugfs_root);
- if (!memtrace_debugfs_dir)
- return -1;
+ arch_debugfs_dir);
debugfs_create_file("enable", 0600, memtrace_debugfs_dir,
NULL, &memtrace_init_fops);
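With the new .mmap handler and the enable control, a debugging tool can reserve trace memory and map it directly instead of read()ing it. A minimal userspace sketch, assuming the usual debugfs mount point, that arch_debugfs_dir resolves to /sys/kernel/debug/powerpc, and that 1 GiB satisfies the alignment check in memtrace_enable_set():

/* Userspace sketch; paths and sizes are assumptions, error handling trimmed. */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#define TRACE_SIZE 0x40000000UL	/* 1 GiB per node, assumed to be aligned */

int main(void)
{
	int en, fd;
	void *buf;

	/* Writing a size (in bytes) allocates trace memory on each node. */
	en = open("/sys/kernel/debug/powerpc/memtrace/enable", O_WRONLY);
	if (en < 0 || write(en, "0x40000000", 10) < 0)
		return 1;
	close(en);

	/* Map the buffer carved out for node 0 (directory name is the nid). */
	fd = open("/sys/kernel/debug/powerpc/memtrace/00000000/trace", O_RDONLY);
	if (fd < 0)
		return 1;

	buf = mmap(NULL, TRACE_SIZE, PROT_READ, MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* ... hand buf to the hardware-trace consumer ... */

	munmap(buf, TRACE_SIZE);
	close(fd);
	return 0;
}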
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
deleted file mode 100644
index 3f58c7dbd581..000000000000
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ /dev/null
@@ -1,1291 +0,0 @@
-/*
- * This file implements the DMA operations for NVLink devices. The NPU
- * devices all point to the same iommu table as the parent PCI device.
- *
- * Copyright Alistair Popple, IBM Corporation 2015.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- */
-
-#include <linux/mmu_notifier.h>
-#include <linux/mmu_context.h>
-#include <linux/of.h>
-#include <linux/pci.h>
-#include <linux/memblock.h>
-#include <linux/sizes.h>
-
-#include <asm/debugfs.h>
-#include <asm/powernv.h>
-#include <asm/opal.h>
-
-#include "pci.h"
-
-/*
- * spinlock to protect initialisation of an npu_context for a particular
- * mm_struct.
- */
-static DEFINE_SPINLOCK(npu_context_lock);
-
-/*
- * Other types of TCE cache invalidation are not functional in the
- * hardware.
- */
-static struct pci_dev *get_pci_dev(struct device_node *dn)
-{
- struct pci_dn *pdn = PCI_DN(dn);
-
- return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
- pdn->busno, pdn->devfn);
-}
-
-/* Given a NPU device get the associated PCI device. */
-struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
-{
- struct device_node *dn;
- struct pci_dev *gpdev;
-
- if (WARN_ON(!npdev))
- return NULL;
-
- if (WARN_ON(!npdev->dev.of_node))
- return NULL;
-
- /* Get assoicated PCI device */
- dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
- if (!dn)
- return NULL;
-
- gpdev = get_pci_dev(dn);
- of_node_put(dn);
-
- return gpdev;
-}
-EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
-
-/* Given the real PCI device get a linked NPU device. */
-struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
-{
- struct device_node *dn;
- struct pci_dev *npdev;
-
- if (WARN_ON(!gpdev))
- return NULL;
-
- /* Not all PCI devices have device-tree nodes */
- if (!gpdev->dev.of_node)
- return NULL;
-
- /* Get assoicated PCI device */
- dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
- if (!dn)
- return NULL;
-
- npdev = get_pci_dev(dn);
- of_node_put(dn);
-
- return npdev;
-}
-EXPORT_SYMBOL(pnv_pci_get_npu_dev);
-
-/*
- * Returns the PE assoicated with the PCI device of the given
- * NPU. Returns the linked pci device if pci_dev != NULL.
- */
-static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
- struct pci_dev **gpdev)
-{
- struct pnv_phb *phb;
- struct pci_controller *hose;
- struct pci_dev *pdev;
- struct pnv_ioda_pe *pe;
- struct pci_dn *pdn;
-
- pdev = pnv_pci_get_gpu_dev(npe->pdev);
- if (!pdev)
- return NULL;
-
- pdn = pci_get_pdn(pdev);
- if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return NULL;
-
- hose = pci_bus_to_host(pdev->bus);
- phb = hose->private_data;
- pe = &phb->ioda.pe_array[pdn->pe_number];
-
- if (gpdev)
- *gpdev = pdev;
-
- return pe;
-}
-
-static long pnv_npu_unset_window(struct iommu_table_group *table_group,
- int num);
-
-static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
- struct iommu_table *tbl)
-{
- struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
- table_group);
- struct pnv_phb *phb = npe->phb;
- int64_t rc;
- const unsigned long size = tbl->it_indirect_levels ?
- tbl->it_level_size : tbl->it_size;
- const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
- const __u64 win_size = tbl->it_size << tbl->it_page_shift;
- int num2 = (num == 0) ? 1 : 0;
-
- /* NPU has just one TVE so if there is another table, remove it first */
- if (npe->table_group.tables[num2])
- pnv_npu_unset_window(&npe->table_group, num2);
-
- pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
- start_addr, start_addr + win_size - 1,
- IOMMU_PAGE_SIZE(tbl));
-
- rc = opal_pci_map_pe_dma_window(phb->opal_id,
- npe->pe_number,
- npe->pe_number,
- tbl->it_indirect_levels + 1,
- __pa(tbl->it_base),
- size << 3,
- IOMMU_PAGE_SIZE(tbl));
- if (rc) {
- pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
- return rc;
- }
- pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
- /* Add the table to the list so its TCE cache will get invalidated */
- pnv_pci_link_table_and_group(phb->hose->node, num,
- tbl, &npe->table_group);
-
- return 0;
-}
-
-static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
-{
- struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
- table_group);
- struct pnv_phb *phb = npe->phb;
- int64_t rc;
-
- if (!npe->table_group.tables[num])
- return 0;
-
- pe_info(npe, "Removing DMA window\n");
-
- rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
- npe->pe_number,
- 0/* levels */, 0/* table address */,
- 0/* table size */, 0/* page size */);
- if (rc) {
- pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
- return rc;
- }
- pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
- pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
- &npe->table_group);
-
- return 0;
-}
-
-/*
- * Enables 32 bit DMA on NPU.
- */
-static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
-{
- struct pci_dev *gpdev;
- struct pnv_ioda_pe *gpe;
- int64_t rc;
-
- /*
- * Find the assoicated PCI devices and get the dma window
- * information from there.
- */
- if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
- return;
-
- gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (!gpe)
- return;
-
- rc = pnv_npu_set_window(&npe->table_group, 0,
- gpe->table_group.tables[0]);
-
- /*
- * NVLink devices use the same TCE table configuration as
- * their parent device so drivers shouldn't be doing DMA
- * operations directly on these devices.
- */
- set_dma_ops(&npe->pdev->dev, NULL);
-}
-
-/*
- * Enables bypass mode on the NPU. The NPU only supports one
- * window per link, so bypass needs to be explicitly enabled or
- * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
- * active at the same time.
- */
-static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
-{
- struct pnv_phb *phb = npe->phb;
- int64_t rc = 0;
- phys_addr_t top = memblock_end_of_DRAM();
-
- if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
- return -EINVAL;
-
- rc = pnv_npu_unset_window(&npe->table_group, 0);
- if (rc != OPAL_SUCCESS)
- return rc;
-
- /* Enable the bypass window */
-
- top = roundup_pow_of_two(top);
- dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
- npe->pe_number);
- rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
- npe->pe_number, npe->pe_number,
- 0 /* bypass base */, top);
-
- if (rc == OPAL_SUCCESS)
- pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
- return rc;
-}
-
-void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
-{
- int i;
- struct pnv_phb *phb;
- struct pci_dn *pdn;
- struct pnv_ioda_pe *npe;
- struct pci_dev *npdev;
-
- for (i = 0; ; ++i) {
- npdev = pnv_pci_get_npu_dev(gpdev, i);
-
- if (!npdev)
- break;
-
- pdn = pci_get_pdn(npdev);
- if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return;
-
- phb = pci_bus_to_host(npdev->bus)->private_data;
-
- /* We only do bypass if it's enabled on the linked device */
- npe = &phb->ioda.pe_array[pdn->pe_number];
-
- if (bypass) {
- dev_info(&npdev->dev,
- "Using 64-bit DMA iommu bypass\n");
- pnv_npu_dma_set_bypass(npe);
- } else {
- dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
- pnv_npu_dma_set_32(npe);
- }
- }
-}
-
-#ifdef CONFIG_IOMMU_API
-/* Switch ownership from platform code to external user (e.g. VFIO) */
-static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
-{
- struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
- table_group);
- struct pnv_phb *phb = npe->phb;
- int64_t rc;
- struct pci_dev *gpdev = NULL;
-
- /*
- * Note: NPU has just a single TVE in the hardware which means that
- * while used by the kernel, it can have either 32bit window or
- * DMA bypass but never both. So we deconfigure 32bit window only
- * if it was enabled at the moment of ownership change.
- */
- if (npe->table_group.tables[0]) {
- pnv_npu_unset_window(&npe->table_group, 0);
- return;
- }
-
- /* Disable bypass */
- rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
- npe->pe_number, npe->pe_number,
- 0 /* bypass base */, 0);
- if (rc) {
- pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
- return;
- }
- pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
-
- get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (gpdev)
- pnv_npu2_unmap_lpar_dev(gpdev);
-}
-
-static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
-{
- struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
- table_group);
- struct pci_dev *gpdev = NULL;
-
- get_gpu_pci_dev_and_pe(npe, &gpdev);
- if (gpdev)
- pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
-}
-
-static struct iommu_table_group_ops pnv_pci_npu_ops = {
- .set_window = pnv_npu_set_window,
- .unset_window = pnv_npu_unset_window,
- .take_ownership = pnv_npu_take_ownership,
- .release_ownership = pnv_npu_release_ownership,
-};
-#endif /* !CONFIG_IOMMU_API */
-
-/*
- * NPU2 ATS
- */
-/* Maximum possible number of ATSD MMIO registers per NPU */
-#define NV_NMMU_ATSD_REGS 8
-#define NV_NPU_MAX_PE_NUM 16
-
-/*
- * A compound NPU IOMMU group which might consist of 1 GPU + 2xNPUs (POWER8) or
- * up to 3 x (GPU + 2xNPUs) (POWER9).
- */
-struct npu_comp {
- struct iommu_table_group table_group;
- int pe_num;
- struct pnv_ioda_pe *pe[NV_NPU_MAX_PE_NUM];
-};
-
-/* An NPU descriptor, valid for POWER9 only */
-struct npu {
- int index;
- __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
- unsigned int mmio_atsd_count;
-
- /* Bitmask for MMIO register usage */
- unsigned long mmio_atsd_usage;
-
- /* Do we need to explicitly flush the nest mmu? */
- bool nmmu_flush;
-
- struct npu_comp npucomp;
-};
-
-#ifdef CONFIG_IOMMU_API
-static long pnv_npu_peers_create_table_userspace(
- struct iommu_table_group *table_group,
- int num, __u32 page_shift, __u64 window_size, __u32 levels,
- struct iommu_table **ptbl)
-{
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- if (!npucomp->pe_num || !npucomp->pe[0] ||
- !npucomp->pe[0]->table_group.ops ||
- !npucomp->pe[0]->table_group.ops->create_table)
- return -EFAULT;
-
- return npucomp->pe[0]->table_group.ops->create_table(
- &npucomp->pe[0]->table_group, num, page_shift,
- window_size, levels, ptbl);
-}
-
-static long pnv_npu_peers_set_window(struct iommu_table_group *table_group,
- int num, struct iommu_table *tbl)
-{
- int i, j;
- long ret = 0;
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- for (i = 0; i < npucomp->pe_num; ++i) {
- struct pnv_ioda_pe *pe = npucomp->pe[i];
-
- if (!pe->table_group.ops->set_window)
- continue;
-
- ret = pe->table_group.ops->set_window(&pe->table_group,
- num, tbl);
- if (ret)
- break;
- }
-
- if (ret) {
- for (j = 0; j < i; ++j) {
- struct pnv_ioda_pe *pe = npucomp->pe[j];
-
- if (!pe->table_group.ops->unset_window)
- continue;
-
- ret = pe->table_group.ops->unset_window(
- &pe->table_group, num);
- if (ret)
- break;
- }
- } else {
- table_group->tables[num] = iommu_tce_table_get(tbl);
- }
-
- return ret;
-}
-
-static long pnv_npu_peers_unset_window(struct iommu_table_group *table_group,
- int num)
-{
- int i, j;
- long ret = 0;
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- for (i = 0; i < npucomp->pe_num; ++i) {
- struct pnv_ioda_pe *pe = npucomp->pe[i];
-
- WARN_ON(npucomp->table_group.tables[num] !=
- table_group->tables[num]);
- if (!npucomp->table_group.tables[num])
- continue;
-
- if (!pe->table_group.ops->unset_window)
- continue;
-
- ret = pe->table_group.ops->unset_window(&pe->table_group, num);
- if (ret)
- break;
- }
-
- if (ret) {
- for (j = 0; j < i; ++j) {
- struct pnv_ioda_pe *pe = npucomp->pe[j];
-
- if (!npucomp->table_group.tables[num])
- continue;
-
- if (!pe->table_group.ops->set_window)
- continue;
-
- ret = pe->table_group.ops->set_window(&pe->table_group,
- num, table_group->tables[num]);
- if (ret)
- break;
- }
- } else if (table_group->tables[num]) {
- iommu_tce_table_put(table_group->tables[num]);
- table_group->tables[num] = NULL;
- }
-
- return ret;
-}
-
-static void pnv_npu_peers_take_ownership(struct iommu_table_group *table_group)
-{
- int i;
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- for (i = 0; i < npucomp->pe_num; ++i) {
- struct pnv_ioda_pe *pe = npucomp->pe[i];
-
- if (!pe->table_group.ops->take_ownership)
- continue;
- pe->table_group.ops->take_ownership(&pe->table_group);
- }
-}
-
-static void pnv_npu_peers_release_ownership(
- struct iommu_table_group *table_group)
-{
- int i;
- struct npu_comp *npucomp = container_of(table_group, struct npu_comp,
- table_group);
-
- for (i = 0; i < npucomp->pe_num; ++i) {
- struct pnv_ioda_pe *pe = npucomp->pe[i];
-
- if (!pe->table_group.ops->release_ownership)
- continue;
- pe->table_group.ops->release_ownership(&pe->table_group);
- }
-}
-
-static struct iommu_table_group_ops pnv_npu_peers_ops = {
- .get_table_size = pnv_pci_ioda2_get_table_size,
- .create_table = pnv_npu_peers_create_table_userspace,
- .set_window = pnv_npu_peers_set_window,
- .unset_window = pnv_npu_peers_unset_window,
- .take_ownership = pnv_npu_peers_take_ownership,
- .release_ownership = pnv_npu_peers_release_ownership,
-};
-
-static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
- struct pnv_ioda_pe *pe)
-{
- if (WARN_ON(npucomp->pe_num == NV_NPU_MAX_PE_NUM))
- return;
-
- npucomp->pe[npucomp->pe_num] = pe;
- ++npucomp->pe_num;
-}
-
-struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
-{
- struct iommu_table_group *table_group;
- struct npu_comp *npucomp;
- struct pci_dev *gpdev = NULL;
- struct pci_controller *hose;
- struct pci_dev *npdev = NULL;
-
- list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
- npdev = pnv_pci_get_npu_dev(gpdev, 0);
- if (npdev)
- break;
- }
-
- if (!npdev)
- /* It is not an NPU attached device, skip */
- return NULL;
-
- hose = pci_bus_to_host(npdev->bus);
-
- if (hose->npu) {
- table_group = &hose->npu->npucomp.table_group;
-
- if (!table_group->group) {
- table_group->ops = &pnv_npu_peers_ops;
- iommu_register_group(table_group,
- hose->global_number,
- pe->pe_number);
- }
- } else {
- /* Create a group for 1 GPU and attached NPUs for POWER8 */
- pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL);
- table_group = &pe->npucomp->table_group;
- table_group->ops = &pnv_npu_peers_ops;
- iommu_register_group(table_group, hose->global_number,
- pe->pe_number);
- }
-
- /* Steal capabilities from a GPU PE */
- table_group->max_dynamic_windows_supported =
- pe->table_group.max_dynamic_windows_supported;
- table_group->tce32_start = pe->table_group.tce32_start;
- table_group->tce32_size = pe->table_group.tce32_size;
- table_group->max_levels = pe->table_group.max_levels;
- if (!table_group->pgsizes)
- table_group->pgsizes = pe->table_group.pgsizes;
-
- npucomp = container_of(table_group, struct npu_comp, table_group);
- pnv_comp_attach_table_group(npucomp, pe);
-
- return table_group;
-}
-
-struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
-{
- struct iommu_table_group *table_group;
- struct npu_comp *npucomp;
- struct pci_dev *gpdev = NULL;
- struct pci_dev *npdev;
- struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(pe, &gpdev);
-
- WARN_ON(!(pe->flags & PNV_IODA_PE_DEV));
- if (!gpe)
- return NULL;
-
- /*
- * IODA2 bridges get this set up from pci_controller_ops::setup_bridge
- * but NPU bridges do not have this hook defined so we do it here.
- * We do not setup other table group parameters as they won't be used
- * anyway - NVLink bridges are subordinate PEs.
- */
- pe->table_group.ops = &pnv_pci_npu_ops;
-
- table_group = iommu_group_get_iommudata(
- iommu_group_get(&gpdev->dev));
-
- /*
- * On P9 NPU PHB and PCI PHB support different page sizes,
- * keep only matching. We expect here that NVLink bridge PE pgsizes is
- * initialized by the caller.
- */
- table_group->pgsizes &= pe->table_group.pgsizes;
- npucomp = container_of(table_group, struct npu_comp, table_group);
- pnv_comp_attach_table_group(npucomp, pe);
-
- list_for_each_entry(npdev, &pe->phb->hose->bus->devices, bus_list) {
- struct pci_dev *gpdevtmp = pnv_pci_get_gpu_dev(npdev);
-
- if (gpdevtmp != gpdev)
- continue;
-
- iommu_add_device(table_group, &npdev->dev);
- }
-
- return table_group;
-}
-#endif /* CONFIG_IOMMU_API */
-
-/* Maximum number of nvlinks per npu */
-#define NV_MAX_LINKS 6
-
-/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
-static int max_npu2_index;
-
-struct npu_context {
- struct mm_struct *mm;
- struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
- struct mmu_notifier mn;
- struct kref kref;
- bool nmmu_flush;
-
- /* Callback to stop translation requests on a given GPU */
- void (*release_cb)(struct npu_context *context, void *priv);
-
- /*
- * Private pointer passed to the above callback for usage by
- * device drivers.
- */
- void *priv;
-};
-
-struct mmio_atsd_reg {
- struct npu *npu;
- int reg;
-};
-
-/*
- * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
- * if none are available.
- */
-static int get_mmio_atsd_reg(struct npu *npu)
-{
- int i;
-
- for (i = 0; i < npu->mmio_atsd_count; i++) {
- if (!test_bit(i, &npu->mmio_atsd_usage))
- if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
- return i;
- }
-
- return -ENOSPC;
-}
-
-static void put_mmio_atsd_reg(struct npu *npu, int reg)
-{
- clear_bit_unlock(reg, &npu->mmio_atsd_usage);
-}
-
-/* MMIO ATSD register offsets */
-#define XTS_ATSD_LAUNCH 0
-#define XTS_ATSD_AVA 1
-#define XTS_ATSD_STAT 2
-
-static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
-{
- unsigned long launch = 0;
-
- if (psize == MMU_PAGE_COUNT) {
- /* IS set to invalidate entire matching PID */
- launch |= PPC_BIT(12);
- } else {
- /* AP set to invalidate region of psize */
- launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17);
- }
-
- /* PRS set to process-scoped */
- launch |= PPC_BIT(13);
-
- /* PID */
- launch |= pid << PPC_BITLSHIFT(38);
-
- /* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */
-
- return launch;
-}
-
-static void mmio_atsd_regs_write(struct mmio_atsd_reg
- mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset,
- unsigned long val)
-{
- struct npu *npu;
- int i, reg;
-
- for (i = 0; i <= max_npu2_index; i++) {
- reg = mmio_atsd_reg[i].reg;
- if (reg < 0)
- continue;
-
- npu = mmio_atsd_reg[i].npu;
- __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset);
- }
-}
-
-static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
- unsigned long pid)
-{
- unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);
-
- /* Invalidating the entire process doesn't use a va */
- mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
-}
-
-static void mmio_invalidate_range(struct mmio_atsd_reg
- mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
- unsigned long start, unsigned long psize)
-{
- unsigned long launch = get_atsd_launch_val(pid, psize);
-
- /* Write all VAs first */
- mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
-
- /* Issue one barrier for all address writes */
- eieio();
-
- /* Launch */
- mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
-}
-
-#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
-
-static void mmio_invalidate_wait(
- struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
- struct npu *npu;
- int i, reg;
-
- /* Wait for all invalidations to complete */
- for (i = 0; i <= max_npu2_index; i++) {
- if (mmio_atsd_reg[i].reg < 0)
- continue;
-
- /* Wait for completion */
- npu = mmio_atsd_reg[i].npu;
- reg = mmio_atsd_reg[i].reg;
- while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
- cpu_relax();
- }
-}
-
-/*
- * Acquires all the address translation shootdown (ATSD) registers required to
- * launch an ATSD on all links this npu_context is active on.
- */
-static void acquire_atsd_reg(struct npu_context *npu_context,
- struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
- int i, j;
- struct npu *npu;
- struct pci_dev *npdev;
-
- for (i = 0; i <= max_npu2_index; i++) {
- mmio_atsd_reg[i].reg = -1;
- for (j = 0; j < NV_MAX_LINKS; j++) {
- /*
- * There are no ordering requirements with respect to
- * the setup of struct npu_context, but to ensure
- * consistent behaviour we need to ensure npdev[][] is
- * only read once.
- */
- npdev = READ_ONCE(npu_context->npdev[i][j]);
- if (!npdev)
- continue;
-
- npu = pci_bus_to_host(npdev->bus)->npu;
- if (!npu)
- continue;
-
- mmio_atsd_reg[i].npu = npu;
- mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
- while (mmio_atsd_reg[i].reg < 0) {
- mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
- cpu_relax();
- }
- break;
- }
- }
-}
-
-/*
- * Release previously acquired ATSD registers. To avoid deadlocks the registers
- * must be released in the same order they were acquired above in
- * acquire_atsd_reg.
- */
-static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
-{
- int i;
-
- for (i = 0; i <= max_npu2_index; i++) {
- /*
- * We can't rely on npu_context->npdev[][] being the same here
- * as when acquire_atsd_reg() was called, hence we use the
- * values stored in mmio_atsd_reg during the acquire phase
- * rather than re-reading npdev[][].
- */
- if (mmio_atsd_reg[i].reg < 0)
- continue;
-
- put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
- }
-}
-
-/*
- * Invalidate a virtual address range
- */
-static void mmio_invalidate(struct npu_context *npu_context,
- unsigned long start, unsigned long size)
-{
- struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
- unsigned long pid = npu_context->mm->context.id;
- unsigned long atsd_start = 0;
- unsigned long end = start + size - 1;
- int atsd_psize = MMU_PAGE_COUNT;
-
- /*
- * Convert the input range into one of the supported sizes. If the range
- * doesn't fit, use the next larger supported size. Invalidation latency
- * is high, so over-invalidation is preferred to issuing multiple
- * invalidates.
- *
- * A 4K page size isn't supported by NPU/GPU ATS, so that case is
- * ignored.
- */
- if (size == SZ_64K) {
- atsd_start = start;
- atsd_psize = MMU_PAGE_64K;
- } else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
- atsd_start = ALIGN_DOWN(start, SZ_2M);
- atsd_psize = MMU_PAGE_2M;
- } else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
- atsd_start = ALIGN_DOWN(start, SZ_1G);
- atsd_psize = MMU_PAGE_1G;
- }
-
- if (npu_context->nmmu_flush)
- /*
- * Unfortunately the nest mmu does not support flushing specific
- * addresses so we have to flush the whole mm once before
- * shooting down the GPU translation.
- */
- flush_all_mm(npu_context->mm);
-
- /*
- * Loop over all the NPUs this process is active on and launch
- * an invalidate.
- */
- acquire_atsd_reg(npu_context, mmio_atsd_reg);
-
- if (atsd_psize == MMU_PAGE_COUNT)
- mmio_invalidate_pid(mmio_atsd_reg, pid);
- else
- mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
- atsd_psize);
-
- mmio_invalidate_wait(mmio_atsd_reg);
-
- /*
- * The GPU requires two flush ATSDs to ensure all entries have been
- * flushed. We use PID 0 as it will never be used for a process on the
- * GPU.
- */
- mmio_invalidate_pid(mmio_atsd_reg, 0);
- mmio_invalidate_wait(mmio_atsd_reg);
- mmio_invalidate_pid(mmio_atsd_reg, 0);
- mmio_invalidate_wait(mmio_atsd_reg);
-
- release_atsd_reg(mmio_atsd_reg);
-}
-
-static void pnv_npu2_mn_release(struct mmu_notifier *mn,
- struct mm_struct *mm)
-{
- struct npu_context *npu_context = mn_to_npu_context(mn);
-
- /* Call into device driver to stop requests to the NMMU */
- if (npu_context->release_cb)
- npu_context->release_cb(npu_context, npu_context->priv);
-
- /*
- * There should be no more translation requests for this PID, but we
- * need to ensure any entries for it are removed from the TLB.
- */
- mmio_invalidate(npu_context, 0, ~0UL);
-}
-
-static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long address,
- pte_t pte)
-{
- struct npu_context *npu_context = mn_to_npu_context(mn);
- mmio_invalidate(npu_context, address, PAGE_SIZE);
-}
-
-static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- struct npu_context *npu_context = mn_to_npu_context(mn);
- mmio_invalidate(npu_context, start, end - start);
-}
-
-static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
- .release = pnv_npu2_mn_release,
- .change_pte = pnv_npu2_mn_change_pte,
- .invalidate_range = pnv_npu2_mn_invalidate_range,
-};
-
-/*
- * Call into OPAL to setup the nmmu context for the current task in
- * the NPU. This must be called to setup the context tables before the
- * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
- *
- * A release callback should be registered to allow a device driver to
- * be notified that it should not launch any new translation requests
- * as the final TLB invalidate is about to occur.
- *
- * Returns an error if there no contexts are currently available or a
- * npu_context which should be passed to pnv_npu2_handle_fault().
- *
- * mmap_sem must be held in write mode and must not be called from interrupt
- * context.
- */
-struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
- unsigned long flags,
- void (*cb)(struct npu_context *, void *),
- void *priv)
-{
- int rc;
- u32 nvlink_index;
- struct device_node *nvlink_dn;
- struct mm_struct *mm = current->mm;
- struct npu *npu;
- struct npu_context *npu_context;
- struct pci_controller *hose;
-
- /*
- * At present we don't support GPUs connected to multiple NPUs and I'm
- * not sure the hardware does either.
- */
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
-
- if (!npdev)
- /* No nvlink associated with this GPU device */
- return ERR_PTR(-ENODEV);
-
- /* We only support DR/PR/HV in pnv_npu2_map_lpar_dev() */
- if (flags & ~(MSR_DR | MSR_PR | MSR_HV))
- return ERR_PTR(-EINVAL);
-
- nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
- if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
- &nvlink_index)))
- return ERR_PTR(-ENODEV);
-
- if (!mm || mm->context.id == 0) {
- /*
- * Kernel thread contexts are not supported and context id 0 is
- * reserved on the GPU.
- */
- return ERR_PTR(-EINVAL);
- }
-
- hose = pci_bus_to_host(npdev->bus);
- npu = hose->npu;
- if (!npu)
- return ERR_PTR(-ENODEV);
-
- /*
- * We store the npu pci device so we can more easily get at the
- * associated npus.
- */
- spin_lock(&npu_context_lock);
- npu_context = mm->context.npu_context;
- if (npu_context) {
- if (npu_context->release_cb != cb ||
- npu_context->priv != priv) {
- spin_unlock(&npu_context_lock);
- return ERR_PTR(-EINVAL);
- }
-
- WARN_ON(!kref_get_unless_zero(&npu_context->kref));
- }
- spin_unlock(&npu_context_lock);
-
- if (!npu_context) {
- /*
- * We can set up these fields without holding the
- * npu_context_lock as the npu_context hasn't been returned to
- * the caller meaning it can't be destroyed. Parallel allocation
- * is protected against by mmap_sem.
- */
- rc = -ENOMEM;
- npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
- if (npu_context) {
- kref_init(&npu_context->kref);
- npu_context->mm = mm;
- npu_context->mn.ops = &nv_nmmu_notifier_ops;
- rc = __mmu_notifier_register(&npu_context->mn, mm);
- }
-
- if (rc) {
- kfree(npu_context);
- return ERR_PTR(rc);
- }
-
- mm->context.npu_context = npu_context;
- }
-
- npu_context->release_cb = cb;
- npu_context->priv = priv;
-
- /*
- * npdev is a pci_dev pointer setup by the PCI code. We assign it to
- * npdev[][] to indicate to the mmu notifiers that an invalidation
- * should also be sent over this nvlink. The notifiers don't use any
- * other fields in npu_context, so we just need to ensure that when they
- * deference npu_context->npdev[][] it is either a valid pointer or
- * NULL.
- */
- WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
-
- if (!npu->nmmu_flush) {
- /*
- * If we're not explicitly flushing ourselves we need to mark
- * the thread for global flushes
- */
- npu_context->nmmu_flush = false;
- mm_context_add_copro(mm);
- } else
- npu_context->nmmu_flush = true;
-
- return npu_context;
-}
-EXPORT_SYMBOL(pnv_npu2_init_context);
-
-static void pnv_npu2_release_context(struct kref *kref)
-{
- struct npu_context *npu_context =
- container_of(kref, struct npu_context, kref);
-
- if (!npu_context->nmmu_flush)
- mm_context_remove_copro(npu_context->mm);
-
- npu_context->mm->context.npu_context = NULL;
-}
-
-/*
- * Destroy a context on the given GPU. May free the npu_context if it is no
- * longer active on any GPUs. Must not be called from interrupt context.
- */
-void pnv_npu2_destroy_context(struct npu_context *npu_context,
- struct pci_dev *gpdev)
-{
- int removed;
- struct npu *npu;
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
- struct device_node *nvlink_dn;
- u32 nvlink_index;
- struct pci_controller *hose;
-
- if (WARN_ON(!npdev))
- return;
-
- hose = pci_bus_to_host(npdev->bus);
- npu = hose->npu;
- if (!npu)
- return;
- nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
- if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
- &nvlink_index)))
- return;
- WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
- spin_lock(&npu_context_lock);
- removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
- spin_unlock(&npu_context_lock);
-
- /*
- * We need to do this outside of pnv_npu2_release_context so that it is
- * outside the spinlock as mmu_notifier_destroy uses SRCU.
- */
- if (removed) {
- mmu_notifier_unregister(&npu_context->mn,
- npu_context->mm);
-
- kfree(npu_context);
- }
-
-}
-EXPORT_SYMBOL(pnv_npu2_destroy_context);
-
-/*
- * Assumes mmap_sem is held for the contexts associated mm.
- */
-int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
- unsigned long *flags, unsigned long *status, int count)
-{
- u64 rc = 0, result = 0;
- int i, is_write;
- struct page *page[1];
- const char __user *u;
- char c;
-
- /* mmap_sem should be held so the struct_mm must be present */
- struct mm_struct *mm = context->mm;
-
- WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
-
- for (i = 0; i < count; i++) {
- is_write = flags[i] & NPU2_WRITE;
- rc = get_user_pages_remote(NULL, mm, ea[i], 1,
- is_write ? FOLL_WRITE : 0,
- page, NULL, NULL);
-
- if (rc != 1) {
- status[i] = rc;
- result = -EFAULT;
- continue;
- }
-
- /* Make sure partition scoped tree gets a pte */
- u = page_address(page[0]);
- if (__get_user(c, u))
- result = -EFAULT;
-
- status[i] = 0;
- put_page(page[0]);
- }
-
- return result;
-}
-EXPORT_SYMBOL(pnv_npu2_handle_fault);
-
-int pnv_npu2_init(struct pci_controller *hose)
-{
- unsigned int i;
- u64 mmio_atsd;
- static int npu_index;
- struct npu *npu;
- int ret;
-
- npu = kzalloc(sizeof(*npu), GFP_KERNEL);
- if (!npu)
- return -ENOMEM;
-
- npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush");
-
- for (i = 0; i < ARRAY_SIZE(npu->mmio_atsd_regs) &&
- !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
- i, &mmio_atsd); i++)
- npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
-
- pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i);
- npu->mmio_atsd_count = i;
- npu->mmio_atsd_usage = 0;
- npu_index++;
- if (WARN_ON(npu_index >= NV_MAX_NPUS)) {
- ret = -ENOSPC;
- goto fail_exit;
- }
- max_npu2_index = npu_index;
- npu->index = npu_index;
- hose->npu = npu;
-
- return 0;
-
-fail_exit:
- for (i = 0; i < npu->mmio_atsd_count; ++i)
- iounmap(npu->mmio_atsd_regs[i]);
-
- kfree(npu);
-
- return ret;
-}
-
-int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
- unsigned long msr)
-{
- int ret;
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
- struct pci_controller *hose;
- struct pnv_phb *nphb;
-
- if (!npdev)
- return -ENODEV;
-
- hose = pci_bus_to_host(npdev->bus);
- nphb = hose->private_data;
-
- dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=%u\n",
- nphb->opal_id, lparid);
- /*
- * Currently we only support radix and non-zero LPCR only makes sense
- * for hash tables so skiboot expects the LPCR parameter to be a zero.
- */
- ret = opal_npu_map_lpar(nphb->opal_id,
- PCI_DEVID(gpdev->bus->number, gpdev->devfn), lparid,
- 0 /* LPCR bits */);
- if (ret) {
- dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
- return ret;
- }
-
- dev_dbg(&gpdev->dev, "init context opalid=%llu msr=%lx\n",
- nphb->opal_id, msr);
- ret = opal_npu_init_context(nphb->opal_id, 0/*__unused*/, msr,
- PCI_DEVID(gpdev->bus->number, gpdev->devfn));
- if (ret < 0)
- dev_err(&gpdev->dev, "Failed to init context: %d\n", ret);
- else
- ret = 0;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_npu2_map_lpar_dev);
-
-void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr)
-{
- struct pci_dev *gpdev;
-
- list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
- pnv_npu2_map_lpar_dev(gpdev, 0, msr);
-}
-
-int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
-{
- int ret;
- struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
- struct pci_controller *hose;
- struct pnv_phb *nphb;
-
- if (!npdev)
- return -ENODEV;
-
- hose = pci_bus_to_host(npdev->bus);
- nphb = hose->private_data;
-
- dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
- nphb->opal_id);
- ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
- PCI_DEVID(gpdev->bus->number, gpdev->devfn));
- if (ret < 0) {
- dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
- return ret;
- }
-
- /* Set LPID to 0 anyway, just to be safe */
- dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
- ret = opal_npu_map_lpar(nphb->opal_id,
- PCI_DEVID(gpdev->bus->number, gpdev->devfn), 0 /*LPID*/,
- 0 /* LPCR bits */);
- if (ret)
- dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
index 8c65aacda9c8..f8139948348e 100644
--- a/arch/powerpc/platforms/powernv/ocxl.c
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -2,7 +2,6 @@
// Copyright 2017 IBM Corp.
#include <asm/pnv-ocxl.h>
#include <asm/opal.h>
-#include <asm/xive.h>
#include <misc/ocxl-config.h>
#include "pci.h"
@@ -108,7 +107,8 @@ static int get_max_afu_index(struct pci_dev *dev, int *afu_idx)
int pos;
u32 val;
- pos = find_dvsec_from_pos(dev, OCXL_DVSEC_FUNC_ID, 0);
+ pos = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_IBM,
+ OCXL_DVSEC_FUNC_ID);
if (!pos)
return -ESRCH;
@@ -172,12 +172,11 @@ static void pnv_ocxl_fixup_actag(struct pci_dev *dev)
if (phb->type != PNV_PHB_NPU_OCAPI)
return;
- mutex_lock(&links_list_lock);
+ guard(mutex)(&links_list_lock);
link = find_link(dev);
if (!link) {
dev_warn(&dev->dev, "couldn't update actag information\n");
- mutex_unlock(&links_list_lock);
return;
}
@@ -206,7 +205,6 @@ static void pnv_ocxl_fixup_actag(struct pci_dev *dev)
dev_dbg(&dev->dev, "total actags for function: %d\n",
link->fn_desired_actags[PCI_FUNC(dev->devfn)]);
- mutex_unlock(&links_list_lock);
}
DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_ocxl_fixup_actag);
@@ -253,12 +251,11 @@ int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled,
{
struct npu_link *link;
- mutex_lock(&links_list_lock);
+ guard(mutex)(&links_list_lock);
link = find_link(dev);
if (!link) {
dev_err(&dev->dev, "actag information not found\n");
- mutex_unlock(&links_list_lock);
return -ENODEV;
}
/*
@@ -274,7 +271,6 @@ int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled,
*enabled = link->fn_actags[PCI_FUNC(dev->devfn)].count;
*supported = link->fn_desired_actags[PCI_FUNC(dev->devfn)];
- mutex_unlock(&links_list_lock);
return 0;
}
EXPORT_SYMBOL_GPL(pnv_ocxl_get_actag);
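The guard(mutex) conversions in this file rely on the scope-based cleanup helpers from <linux/cleanup.h>: the guard acquires the lock and releases it automatically when the enclosing scope is left, which is why the explicit mutex_unlock() calls on the early-return paths could be dropped. A minimal sketch of the pattern, using a hypothetical helper that is not part of this patch:

static int example_find(struct pci_dev *dev)
{
	guard(mutex)(&links_list_lock);	/* released automatically at scope exit */

	if (!find_link(dev))
		return -ENODEV;		/* no mutex_unlock() needed here */

	return 0;			/* ...nor here */
}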
@@ -289,16 +285,15 @@ int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count)
* be used by a function depends on how many functions exist
* on the device. The NPU needs to be configured to know how
* many bits are available to PASIDs and how many are to be
- * used by the function BDF indentifier.
+ * used by the function BDF identifier.
*
* We only support one AFU-carrying function for now.
*/
- mutex_lock(&links_list_lock);
+ guard(mutex)(&links_list_lock);
link = find_link(dev);
if (!link) {
dev_err(&dev->dev, "actag information not found\n");
- mutex_unlock(&links_list_lock);
return -ENODEV;
}
@@ -309,7 +304,6 @@ int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count)
break;
}
- mutex_unlock(&links_list_lock);
dev_dbg(&dev->dev, "%d PASIDs available for function\n",
rc ? 0 : *count);
return rc;
@@ -449,7 +443,7 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask,
if (!data)
return -ENOMEM;
- bdfn = (dev->bus->number << 8) | dev->devfn;
+ bdfn = pci_dev_id(dev);
rc = opal_npu_spa_setup(phb->opal_id, bdfn, virt_to_phys(spa_mem),
PE_mask);
if (rc) {
@@ -478,38 +472,121 @@ EXPORT_SYMBOL_GPL(pnv_ocxl_spa_release);
int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle)
{
struct spa_data *data = (struct spa_data *) platform_data;
- int rc;
- rc = opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle);
- return rc;
+ return opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle);
}
EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache);
-int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr)
+int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid,
+ uint64_t lpcr, void __iomem **arva)
{
- __be64 flags, trigger_page;
- s64 rc;
- u32 hwirq;
-
- hwirq = xive_native_alloc_irq();
- if (!hwirq)
- return -ENOENT;
-
- rc = opal_xive_get_irq_info(hwirq, &flags, NULL, &trigger_page, NULL,
- NULL);
- if (rc || !trigger_page) {
- xive_native_free_irq(hwirq);
- return -ENOENT;
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ u64 mmio_atsd;
+ int rc;
+
+	/* ATSD physical address.
+	 * ATSD LAUNCH register: a write access triggers a shoot down,
+	 * starting the TLB Invalidate command.
+	 */
+ rc = of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
+ 0, &mmio_atsd);
+ if (rc) {
+ dev_info(&dev->dev, "No available ATSD found\n");
+ return rc;
+ }
+
+ /* Assign a register set to a Logical Partition and MMIO ATSD
+ * LPARID register to the required value.
+ */
+ rc = opal_npu_map_lpar(phb->opal_id, pci_dev_id(dev),
+ lparid, lpcr);
+ if (rc) {
+ dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc);
+ return rc;
+ }
+
+ *arva = ioremap(mmio_atsd, 24);
+ if (!(*arva)) {
+ dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", mmio_atsd);
+ rc = -ENOMEM;
}
- *irq = hwirq;
- *trigger_addr = be64_to_cpu(trigger_page);
- return 0;
+ return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar);
+
+void pnv_ocxl_unmap_lpar(void __iomem *arva)
+{
+ iounmap(arva);
}
-EXPORT_SYMBOL_GPL(pnv_ocxl_alloc_xive_irq);
+EXPORT_SYMBOL_GPL(pnv_ocxl_unmap_lpar);
-void pnv_ocxl_free_xive_irq(u32 irq)
+void pnv_ocxl_tlb_invalidate(void __iomem *arva,
+ unsigned long pid,
+ unsigned long addr,
+ unsigned long page_size)
{
- xive_native_free_irq(irq);
+ unsigned long timeout = jiffies + (HZ * PNV_OCXL_ATSD_TIMEOUT);
+ u64 val = 0ull;
+ int pend;
+ u8 size;
+
+ if (!(arva))
+ return;
+
+ if (addr) {
+ /* load Abbreviated Virtual Address register with
+ * the necessary value
+ */
+ val |= FIELD_PREP(PNV_OCXL_ATSD_AVA_AVA, addr >> (63-51));
+ out_be64(arva + PNV_OCXL_ATSD_AVA, val);
+ }
+
+	/* A write access triggers a shoot down, starting the
+	 * TLB Invalidate command
+ */
+ val = PNV_OCXL_ATSD_LNCH_R;
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_RIC, 0b10);
+ if (addr)
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_IS, 0b00);
+ else {
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_IS, 0b01);
+ val |= PNV_OCXL_ATSD_LNCH_OCAPI_SINGLETON;
+ }
+ val |= PNV_OCXL_ATSD_LNCH_PRS;
+ /* Actual Page Size to be invalidated
+ * 000 4KB
+ * 101 64KB
+ * 001 2MB
+ * 010 1GB
+ */
+ size = 0b101;
+ if (page_size == 0x1000)
+ size = 0b000;
+ if (page_size == 0x200000)
+ size = 0b001;
+ if (page_size == 0x40000000)
+ size = 0b010;
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_AP, size);
+ val |= FIELD_PREP(PNV_OCXL_ATSD_LNCH_PID, pid);
+ out_be64(arva + PNV_OCXL_ATSD_LNCH, val);
+
+ /* Poll the ATSD status register to determine when the
+ * TLB Invalidate has been completed.
+ */
+ val = in_be64(arva + PNV_OCXL_ATSD_STAT);
+ pend = val >> 63;
+
+ while (pend) {
+ if (time_after_eq(jiffies, timeout)) {
+ pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n",
+ __func__, val, pid);
+ return;
+ }
+ cpu_relax();
+ val = in_be64(arva + PNV_OCXL_ATSD_STAT);
+ pend = val >> 63;
+ }
}
-EXPORT_SYMBOL_GPL(pnv_ocxl_free_xive_irq);
+EXPORT_SYMBOL_GPL(pnv_ocxl_tlb_invalidate);
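For context, a minimal sketch (not part of this patch) of how an OpenCAPI driver might drive the new interface: map the device's MMIO ATSD register set to a logical partition, shoot down a single 64K translation for a PASID, then tear the mapping down again. The caller, the lparid/lpcr values and the flushed address are illustrative assumptions.

/* Assumes <asm/pnv-ocxl.h> for the API and <linux/sizes.h> for SZ_64K. */
static int example_atsd_flush(struct pci_dev *dev, unsigned long pasid,
			      unsigned long ea)
{
	void __iomem *arva;
	int rc;

	/* Assign the device's ATSD register set to LPAR 0 (placeholder id) */
	rc = pnv_ocxl_map_lpar(dev, 0 /* lparid */, 0 /* lpcr */, &arva);
	if (rc)
		return rc;

	/* Invalidate the 64K translation backing effective address "ea" */
	pnv_ocxl_tlb_invalidate(arva, pasid, ea, SZ_64K);

	pnv_ocxl_unmap_lpar(arva);
	return 0;
}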
diff --git a/arch/powerpc/platforms/powernv/opal-async.c b/arch/powerpc/platforms/powernv/opal-async.c
index 18a355fa15e8..c094fdf5825c 100644
--- a/arch/powerpc/platforms/powernv/opal-async.c
+++ b/arch/powerpc/platforms/powernv/opal-async.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV OPAL asynchronous completion interfaces
*
* Copyright 2013-2017 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#undef DEBUG
@@ -108,7 +104,7 @@ static int __opal_async_release_token(int token)
*/
case ASYNC_TOKEN_DISPATCHED:
opal_async_tokens[token].state = ASYNC_TOKEN_ABANDONED;
- /* Fall through */
+ fallthrough;
default:
rc = 1;
}
diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c
new file mode 100644
index 000000000000..021b0ec29e24
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-call.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/percpu.h>
+#include <linux/jump_label.h>
+#include <asm/interrupt.h>
+#include <asm/opal-api.h>
+#include <asm/trace.h>
+#include <asm/asm-prototypes.h>
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+static void __trace_opal_entry(s64 a0, s64 a1, s64 a2, s64 a3,
+ s64 a4, s64 a5, s64 a6, s64 a7,
+ unsigned long opcode)
+{
+ unsigned int *depth;
+ unsigned long args[8];
+
+ depth = this_cpu_ptr(&opal_trace_depth);
+
+ if (*depth)
+ return;
+
+ args[0] = a0;
+ args[1] = a1;
+ args[2] = a2;
+ args[3] = a3;
+ args[4] = a4;
+ args[5] = a5;
+ args[6] = a6;
+ args[7] = a7;
+
+ (*depth)++;
+ trace_opal_entry(opcode, &args[0]);
+ (*depth)--;
+}
+
+static void __trace_opal_exit(unsigned long opcode, unsigned long retval)
+{
+ unsigned int *depth;
+
+ depth = this_cpu_ptr(&opal_trace_depth);
+
+ if (*depth)
+ return;
+
+ (*depth)++;
+ trace_opal_exit(opcode, retval);
+ (*depth)--;
+}
+
+static DEFINE_STATIC_KEY_FALSE(opal_tracepoint_key);
+
+int opal_tracepoint_regfunc(void)
+{
+ static_branch_inc(&opal_tracepoint_key);
+ return 0;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+ static_branch_dec(&opal_tracepoint_key);
+}
+
+static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
+ s64 a4, s64 a5, s64 a6, s64 a7,
+ unsigned long opcode, unsigned long msr)
+{
+ s64 ret;
+
+ __trace_opal_entry(a0, a1, a2, a3, a4, a5, a6, a7, opcode);
+ ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+ __trace_opal_exit(opcode, ret);
+
+ return ret;
+}
+
+#define DO_TRACE (static_branch_unlikely(&opal_tracepoint_key))
+
+#else /* CONFIG_TRACEPOINTS */
+
+static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3,
+ s64 a4, s64 a5, s64 a6, s64 a7,
+ unsigned long opcode, unsigned long msr)
+{
+ return 0;
+}
+
+#define DO_TRACE false
+#endif /* CONFIG_TRACEPOINTS */
+
+static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
+ int64_t a4, int64_t a5, int64_t a6, int64_t a7, int64_t opcode)
+{
+ unsigned long flags;
+ unsigned long msr = mfmsr();
+ bool mmu = (msr & (MSR_IR|MSR_DR));
+ int64_t ret;
+
+ /* OPAL call / firmware may use SRR and/or HSRR */
+ srr_regs_clobbered();
+
+ msr &= ~MSR_EE;
+
+ if (unlikely(!mmu))
+ return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+
+ local_save_flags(flags);
+ hard_irq_disable();
+
+ if (DO_TRACE) {
+ ret = __opal_call_trace(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+ } else {
+ ret = __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
+ }
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+#define OPAL_CALL(name, opcode) \
+int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \
+ int64_t a4, int64_t a5, int64_t a6, int64_t a7); \
+int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3, \
+ int64_t a4, int64_t a5, int64_t a6, int64_t a7) \
+{ \
+ return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \
+}
+
+OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL);
+OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
+OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
+OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE);
+OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
+OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
+OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
+OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
+OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2);
+OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
+OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
+OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);
+OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS);
+OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY);
+OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY);
+OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE);
+OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD);
+OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD);
+OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
+OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
+OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
+OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
+OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
+OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
+OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
+OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR);
+OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET);
+OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT);
+OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC);
+OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE);
+OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW);
+OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
+OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
+OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
+OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
+OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
+OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
+OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE);
+OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32);
+OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64);
+OPAL_CALL(opal_start_cpu, OPAL_START_CPU);
+OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS);
+OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL);
+OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW);
+OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
+OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET);
+OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA);
+OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA);
+OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB);
+OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT);
+OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
+OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS);
+OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS);
+OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS);
+OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED);
+OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR);
+OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL);
+OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI);
+OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2);
+OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ);
+OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE);
+OPAL_CALL(opal_lpc_read, OPAL_LPC_READ);
+OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE);
+OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU);
+OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS);
+OPAL_CALL(opal_read_elog, OPAL_ELOG_READ);
+OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK);
+OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE);
+OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND);
+OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE);
+OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
+OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE);
+OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE);
+OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN);
+OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT);
+OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO);
+OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2);
+OPAL_CALL(opal_dump_read, OPAL_DUMP_READ);
+OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK);
+OPAL_CALL(opal_get_msg, OPAL_GET_MSG);
+OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC);
+OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION);
+OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND);
+OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT);
+OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
+OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
+OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
+OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI);
+OPAL_CALL(opal_handle_hmi2, OPAL_HANDLE_HMI2);
+OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
+OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG);
+OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION);
+OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION);
+OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE);
+OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO);
+OPAL_CALL(opal_tpo_read, OPAL_READ_TPO);
+OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND);
+OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV);
+OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST);
+OPAL_CALL(opal_flash_read, OPAL_FLASH_READ);
+OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE);
+OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE);
+OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG);
+OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR);
+OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR);
+OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH);
+OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE);
+OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE);
+OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE);
+OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE);
+OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR);
+OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR);
+OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
+OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
+OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
+OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
+OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
+OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
+OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
+OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
+OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
+OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
+OPAL_CALL(opal_xive_allocate_irq_raw, OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
+OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
+OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
+OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
+OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
+OPAL_CALL(opal_xive_get_queue_state, OPAL_XIVE_GET_QUEUE_STATE);
+OPAL_CALL(opal_xive_set_queue_state, OPAL_XIVE_SET_QUEUE_STATE);
+OPAL_CALL(opal_xive_get_vp_state, OPAL_XIVE_GET_VP_STATE);
+OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET);
+OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
+OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT);
+OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START);
+OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP);
+OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP);
+OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP);
+OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO);
+OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR);
+OPAL_CALL(opal_quiesce, OPAL_QUIESCE);
+OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP);
+OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE);
+OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET);
+OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR);
+OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
+OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64);
+OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE);
+OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
+OPAL_CALL(opal_mpipl_update, OPAL_MPIPL_UPDATE);
+OPAL_CALL(opal_mpipl_register_tag, OPAL_MPIPL_REGISTER_TAG);
+OPAL_CALL(opal_mpipl_query_tag, OPAL_MPIPL_QUERY_TAG);
+OPAL_CALL(opal_secvar_get, OPAL_SECVAR_GET);
+OPAL_CALL(opal_secvar_get_next, OPAL_SECVAR_GET_NEXT);
+OPAL_CALL(opal_secvar_enqueue_update, OPAL_SECVAR_ENQUEUE_UPDATE);
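For illustration, each OPAL_CALL() line above stamps out a small C wrapper around opal_call(); for example, OPAL_CALL(opal_rtc_read, OPAL_RTC_READ) expands to:

int64_t opal_rtc_read(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
		      int64_t a4, int64_t a5, int64_t a6, int64_t a7);
int64_t opal_rtc_read(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
		      int64_t a4, int64_t a5, int64_t a6, int64_t a7)
{
	return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, OPAL_RTC_READ);
}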
diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c
new file mode 100644
index 000000000000..784602a48afb
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -0,0 +1,663 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Interface for exporting the OPAL ELF core.
+ * Heavily inspired from fs/proc/vmcore.c
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal core: " fmt
+
+#include <linux/memblock.h>
+#include <linux/uaccess.h>
+#include <linux/proc_fs.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/slab.h>
+#include <linux/vmcore_info.h>
+#include <linux/of.h>
+
+#include <asm/page.h>
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+#include "opal-fadump.h"
+
+#define MAX_PT_LOAD_CNT 8
+
+/* NT_AUXV note related info */
+#define AUXV_CNT 1
+#define AUXV_DESC_SZ (((2 * AUXV_CNT) + 1) * sizeof(Elf64_Off))
+
+struct opalcore_config {
+ u32 num_cpus;
+ /* PIR value of crashing CPU */
+ u32 crashing_cpu;
+
+ /* CPU state data info from F/W */
+ u64 cpu_state_destination_vaddr;
+ u64 cpu_state_data_size;
+ u64 cpu_state_entry_size;
+
+ /* OPAL memory to be exported as PT_LOAD segments */
+ u64 ptload_addr[MAX_PT_LOAD_CNT];
+ u64 ptload_size[MAX_PT_LOAD_CNT];
+ u64 ptload_cnt;
+
+ /* Pointer to the first PT_LOAD in the ELF core file */
+ Elf64_Phdr *ptload_phdr;
+
+ /* Total size of opalcore file. */
+ size_t opalcore_size;
+
+ /* Buffer for all the ELF core headers and the PT_NOTE */
+ size_t opalcorebuf_sz;
+ char *opalcorebuf;
+
+ /* NT_AUXV buffer */
+ char auxv_buf[AUXV_DESC_SZ];
+};
+
+struct opalcore {
+ struct list_head list;
+ u64 paddr;
+ size_t size;
+ loff_t offset;
+};
+
+static LIST_HEAD(opalcore_list);
+static struct opalcore_config *oc_conf;
+static const struct opal_mpipl_fadump *opalc_metadata;
+static const struct opal_mpipl_fadump *opalc_cpu_metadata;
+static struct kobject *mpipl_kobj;
+
+/*
+ * Set the crashing CPU's signal to SIGUSR1 if the crash was initiated
+ * by the kernel, SIGTERM otherwise.
+ */
+bool kernel_initiated;
+
+static struct opalcore * __init get_new_element(void)
+{
+ return kzalloc(sizeof(struct opalcore), GFP_KERNEL);
+}
+
+static inline int is_opalcore_usable(void)
+{
+ return (oc_conf && oc_conf->opalcorebuf != NULL) ? 1 : 0;
+}
+
+static Elf64_Word *__init append_elf64_note(Elf64_Word *buf, char *name,
+ u32 type, void *data,
+ size_t data_len)
+{
+ Elf64_Nhdr *note = (Elf64_Nhdr *)buf;
+ Elf64_Word namesz = strlen(name) + 1;
+
+ note->n_namesz = cpu_to_be32(namesz);
+ note->n_descsz = cpu_to_be32(data_len);
+ note->n_type = cpu_to_be32(type);
+ buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word));
+ memcpy(buf, name, namesz);
+ buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word));
+ memcpy(buf, data, data_len);
+ buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word));
+
+ return buf;
+}
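append_elf64_note() lays out a standard ELF64 note: an Elf64_Nhdr, the NUL-terminated name, then the descriptor, with each piece padded to a 4-byte Elf64_Word boundary. A hypothetical helper (not part of this patch) that computes how much buffer space one such note consumes:

static size_t elf64_note_size(const char *name, size_t data_len)
{
	return ALIGN(sizeof(Elf64_Nhdr), sizeof(Elf64_Word)) +	/* header */
	       ALIGN(strlen(name) + 1, sizeof(Elf64_Word)) +	/* name + NUL */
	       ALIGN(data_len, sizeof(Elf64_Word));		/* descriptor */
}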
+
+static void __init fill_prstatus(struct elf_prstatus *prstatus, int pir,
+ struct pt_regs *regs)
+{
+ memset(prstatus, 0, sizeof(struct elf_prstatus));
+ elf_core_copy_regs(&(prstatus->pr_reg), regs);
+
+ /*
+ * Overload PID with PIR value.
+ * As a PIR value could also be '0', add an offset of '100'
+ * to every PIR to avoid misinterpretations in GDB.
+ */
+ prstatus->common.pr_pid = cpu_to_be32(100 + pir);
+ prstatus->common.pr_ppid = cpu_to_be32(1);
+
+ /*
+ * Indicate SIGUSR1 for crash initiated from kernel.
+ * SIGTERM otherwise.
+ */
+ if (pir == oc_conf->crashing_cpu) {
+ short sig;
+
+ sig = kernel_initiated ? SIGUSR1 : SIGTERM;
+ prstatus->common.pr_cursig = cpu_to_be16(sig);
+ }
+}
+
+static Elf64_Word *__init auxv_to_elf64_notes(Elf64_Word *buf,
+ u64 opal_boot_entry)
+{
+ Elf64_Off *bufp = (Elf64_Off *)oc_conf->auxv_buf;
+ int idx = 0;
+
+ memset(bufp, 0, AUXV_DESC_SZ);
+
+ /* Entry point of OPAL */
+ bufp[idx++] = cpu_to_be64(AT_ENTRY);
+ bufp[idx++] = cpu_to_be64(opal_boot_entry);
+
+ /* end of vector */
+ bufp[idx++] = cpu_to_be64(AT_NULL);
+
+ buf = append_elf64_note(buf, NN_AUXV, NT_AUXV,
+ oc_conf->auxv_buf, AUXV_DESC_SZ);
+ return buf;
+}
+
+/*
+ * Read from the ELF header and then the crash dump.
+ * Returns number of bytes read on success, -errno on failure.
+ */
+static ssize_t read_opalcore(struct file *file, struct kobject *kobj,
+ const struct bin_attribute *bin_attr, char *to,
+ loff_t pos, size_t count)
+{
+ struct opalcore *m;
+ ssize_t tsz, avail;
+ loff_t tpos = pos;
+
+ if (pos >= oc_conf->opalcore_size)
+ return 0;
+
+ /* Adjust count if it goes beyond opalcore size */
+ avail = oc_conf->opalcore_size - pos;
+ if (count > avail)
+ count = avail;
+
+ if (count == 0)
+ return 0;
+
+ /* Read ELF core header and/or PT_NOTE segment */
+ if (tpos < oc_conf->opalcorebuf_sz) {
+ tsz = min_t(size_t, oc_conf->opalcorebuf_sz - tpos, count);
+ memcpy(to, oc_conf->opalcorebuf + tpos, tsz);
+ to += tsz;
+ tpos += tsz;
+ count -= tsz;
+ }
+
+ list_for_each_entry(m, &opalcore_list, list) {
+ /* nothing more to read here */
+ if (count == 0)
+ break;
+
+ if (tpos < m->offset + m->size) {
+ void *addr;
+
+ tsz = min_t(size_t, m->offset + m->size - tpos, count);
+ addr = (void *)(m->paddr + tpos - m->offset);
+ memcpy(to, __va(addr), tsz);
+ to += tsz;
+ tpos += tsz;
+ count -= tsz;
+ }
+ }
+
+ return (tpos - pos);
+}
+
+static struct bin_attribute opal_core_attr __ro_after_init = {
+ .attr = {.name = "core", .mode = 0400},
+ .read = read_opalcore
+};
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ *
+ * Each register entry is 16 bytes: a numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes and the register value in the next
+ * 8 bytes. For more details refer to F/W documentation.
+ */
+static Elf64_Word * __init opalcore_append_cpu_notes(Elf64_Word *buf)
+{
+ u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+ struct hdat_fadump_thread_hdr *thdr;
+ struct elf_prstatus prstatus;
+ Elf64_Word *first_cpu_note;
+ struct pt_regs regs;
+ char *bufp;
+ int i;
+
+ size_per_thread = oc_conf->cpu_state_entry_size;
+ bufp = __va(oc_conf->cpu_state_destination_vaddr);
+
+ /*
+	 * Offset for register entries, entry size and registers count are
+ * duplicated in every thread header in keeping with HDAT format.
+ * Use these values from the first thread header.
+ */
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+ regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+ be32_to_cpu(thdr->offset));
+ reg_esize = be32_to_cpu(thdr->esize);
+ regs_cnt = be32_to_cpu(thdr->ecnt);
+
+ pr_debug("--------CPU State Data------------\n");
+ pr_debug("NumCpus : %u\n", oc_conf->num_cpus);
+ pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+ regs_offset, reg_esize, regs_cnt);
+
+ /*
+ * Skip past the first CPU note. Fill this note with the
+ * crashing CPU's prstatus.
+ */
+ first_cpu_note = buf;
+ buf = append_elf64_note(buf, NN_PRSTATUS, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+
+ for (i = 0; i < oc_conf->num_cpus; i++, bufp += size_per_thread) {
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+ thread_pir = be32_to_cpu(thdr->pir);
+
+ pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+ i, thread_pir, thdr->core_state);
+
+ /*
+ * Register state data of MAX cores is provided by firmware,
+		 * but some of these cores may not be active. So, while
+ * processing register state data, check core state and
+ * skip threads that belong to inactive cores.
+ */
+ if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+ continue;
+
+ opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+ reg_esize, false, &regs);
+
+ pr_debug("PIR 0x%x - R1 : 0x%llx, NIP : 0x%llx\n", thread_pir,
+ be64_to_cpu(regs.gpr[1]), be64_to_cpu(regs.nip));
+ fill_prstatus(&prstatus, thread_pir, &regs);
+
+ if (thread_pir != oc_conf->crashing_cpu) {
+ buf = append_elf64_note(buf, NN_PRSTATUS,
+ NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ } else {
+ /*
+ * Add crashing CPU as the first NT_PRSTATUS note for
+ * GDB to process the core file appropriately.
+ */
+ append_elf64_note(first_cpu_note, NN_PRSTATUS,
+ NT_PRSTATUS, &prstatus,
+ sizeof(prstatus));
+ }
+ }
+
+ return buf;
+}
+
+static int __init create_opalcore(void)
+{
+ u64 opal_boot_entry, opal_base_addr, paddr;
+ u32 hdr_size, cpu_notes_size, count;
+ struct device_node *dn;
+ struct opalcore *new;
+ loff_t opalcore_off;
+ struct page *page;
+ Elf64_Phdr *phdr;
+ Elf64_Ehdr *elf;
+ int i, ret;
+ char *bufp;
+
+ /* Get size of header & CPU notes for OPAL core */
+ hdr_size = (sizeof(Elf64_Ehdr) +
+ ((oc_conf->ptload_cnt + 1) * sizeof(Elf64_Phdr)));
+ cpu_notes_size = ((oc_conf->num_cpus * (CRASH_CORE_NOTE_HEAD_BYTES +
+ CRASH_CORE_NOTE_NAME_BYTES +
+ CRASH_CORE_NOTE_DESC_BYTES)) +
+ (CRASH_CORE_NOTE_HEAD_BYTES +
+ CRASH_CORE_NOTE_NAME_BYTES + AUXV_DESC_SZ));
+
+ /* Allocate buffer to setup OPAL core */
+ oc_conf->opalcorebuf_sz = PAGE_ALIGN(hdr_size + cpu_notes_size);
+ oc_conf->opalcorebuf = alloc_pages_exact(oc_conf->opalcorebuf_sz,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!oc_conf->opalcorebuf) {
+ pr_err("Not enough memory to setup OPAL core (size: %lu)\n",
+ oc_conf->opalcorebuf_sz);
+ oc_conf->opalcorebuf_sz = 0;
+ return -ENOMEM;
+ }
+ count = oc_conf->opalcorebuf_sz / PAGE_SIZE;
+ page = virt_to_page(oc_conf->opalcorebuf);
+ for (i = 0; i < count; i++)
+ mark_page_reserved(page + i);
+
+ pr_debug("opalcorebuf = 0x%llx\n", (u64)oc_conf->opalcorebuf);
+
+ /* Read OPAL related device-tree entries */
+ dn = of_find_node_by_name(NULL, "ibm,opal");
+ if (dn) {
+ ret = of_property_read_u64(dn, "opal-base-address",
+ &opal_base_addr);
+ pr_debug("opal-base-address: %llx\n", opal_base_addr);
+ ret |= of_property_read_u64(dn, "opal-boot-address",
+ &opal_boot_entry);
+ pr_debug("opal-boot-address: %llx\n", opal_boot_entry);
+ }
+ if (!dn || ret)
+ pr_warn("WARNING: Failed to read OPAL base & entry values\n");
+
+ of_node_put(dn);
+
+ /* Use count to keep track of the program headers */
+ count = 0;
+
+ bufp = oc_conf->opalcorebuf;
+ elf = (Elf64_Ehdr *)bufp;
+ bufp += sizeof(Elf64_Ehdr);
+ memcpy(elf->e_ident, ELFMAG, SELFMAG);
+ elf->e_ident[EI_CLASS] = ELF_CLASS;
+ elf->e_ident[EI_DATA] = ELFDATA2MSB;
+ elf->e_ident[EI_VERSION] = EV_CURRENT;
+ elf->e_ident[EI_OSABI] = ELF_OSABI;
+ memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+ elf->e_type = cpu_to_be16(ET_CORE);
+ elf->e_machine = cpu_to_be16(ELF_ARCH);
+ elf->e_version = cpu_to_be32(EV_CURRENT);
+ elf->e_entry = 0;
+ elf->e_phoff = cpu_to_be64(sizeof(Elf64_Ehdr));
+ elf->e_shoff = 0;
+ elf->e_flags = 0;
+
+ elf->e_ehsize = cpu_to_be16(sizeof(Elf64_Ehdr));
+ elf->e_phentsize = cpu_to_be16(sizeof(Elf64_Phdr));
+ elf->e_phnum = 0;
+ elf->e_shentsize = 0;
+ elf->e_shnum = 0;
+ elf->e_shstrndx = 0;
+
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = cpu_to_be32(PT_NOTE);
+ phdr->p_flags = 0;
+ phdr->p_align = 0;
+ phdr->p_paddr = phdr->p_vaddr = 0;
+ phdr->p_offset = cpu_to_be64(hdr_size);
+ phdr->p_filesz = phdr->p_memsz = cpu_to_be64(cpu_notes_size);
+ count++;
+
+ opalcore_off = oc_conf->opalcorebuf_sz;
+ oc_conf->ptload_phdr = (Elf64_Phdr *)bufp;
+ paddr = 0;
+ for (i = 0; i < oc_conf->ptload_cnt; i++) {
+ phdr = (Elf64_Phdr *)bufp;
+ bufp += sizeof(Elf64_Phdr);
+ phdr->p_type = cpu_to_be32(PT_LOAD);
+ phdr->p_flags = cpu_to_be32(PF_R|PF_W|PF_X);
+ phdr->p_align = 0;
+
+ new = get_new_element();
+ if (!new)
+ return -ENOMEM;
+ new->paddr = oc_conf->ptload_addr[i];
+ new->size = oc_conf->ptload_size[i];
+ new->offset = opalcore_off;
+ list_add_tail(&new->list, &opalcore_list);
+
+ phdr->p_paddr = cpu_to_be64(paddr);
+ phdr->p_vaddr = cpu_to_be64(opal_base_addr + paddr);
+ phdr->p_filesz = phdr->p_memsz =
+ cpu_to_be64(oc_conf->ptload_size[i]);
+ phdr->p_offset = cpu_to_be64(opalcore_off);
+
+ count++;
+ opalcore_off += oc_conf->ptload_size[i];
+ paddr += oc_conf->ptload_size[i];
+ }
+
+ elf->e_phnum = cpu_to_be16(count);
+
+ bufp = (char *)opalcore_append_cpu_notes((Elf64_Word *)bufp);
+ bufp = (char *)auxv_to_elf64_notes((Elf64_Word *)bufp, opal_boot_entry);
+
+ oc_conf->opalcore_size = opalcore_off;
+ return 0;
+}
+
+static void opalcore_cleanup(void)
+{
+ if (oc_conf == NULL)
+ return;
+
+ /* Remove OPAL core sysfs file */
+ sysfs_remove_bin_file(mpipl_kobj, &opal_core_attr);
+ oc_conf->ptload_phdr = NULL;
+ oc_conf->ptload_cnt = 0;
+
+ /* free the buffer used for setting up OPAL core */
+ if (oc_conf->opalcorebuf) {
+ void *end = (void *)((u64)oc_conf->opalcorebuf +
+ oc_conf->opalcorebuf_sz);
+
+ free_reserved_area(oc_conf->opalcorebuf, end, -1, NULL);
+ oc_conf->opalcorebuf = NULL;
+ oc_conf->opalcorebuf_sz = 0;
+ }
+
+ kfree(oc_conf);
+ oc_conf = NULL;
+}
+__exitcall(opalcore_cleanup);
+
+static void __init opalcore_config_init(void)
+{
+ u32 idx, cpu_data_version;
+ struct device_node *np;
+ const __be32 *prop;
+ u64 addr = 0;
+ int i, ret;
+
+ np = of_find_node_by_path("/ibm,opal/dump");
+ if (np == NULL)
+ return;
+
+ if (!of_device_is_compatible(np, "ibm,opal-dump")) {
+ pr_warn("Support missing for this f/w version!\n");
+ return;
+ }
+
+ /* Check if dump has been initiated on last reboot */
+ prop = of_get_property(np, "mpipl-boot", NULL);
+ if (!prop) {
+ of_node_put(np);
+ return;
+ }
+
+ /* Get OPAL metadata */
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_OPAL, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get OPAL metadata (%d)\n", ret);
+ goto error_out;
+ }
+
+ addr = be64_to_cpu(addr);
+ pr_debug("OPAL metadata addr: %llx\n", addr);
+ opalc_metadata = __va(addr);
+
+ /* Get OPAL CPU metadata */
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get OPAL CPU metadata (%d)\n", ret);
+ goto error_out;
+ }
+
+ addr = be64_to_cpu(addr);
+ pr_debug("CPU metadata addr: %llx\n", addr);
+ opalc_cpu_metadata = __va(addr);
+
+ /* Allocate memory for config buffer */
+ oc_conf = kzalloc(sizeof(struct opalcore_config), GFP_KERNEL);
+ if (oc_conf == NULL)
+ goto error_out;
+
+ /* Parse OPAL metadata */
+ if (opalc_metadata->version != OPAL_MPIPL_VERSION) {
+ pr_warn("Supported OPAL metadata version: %u, found: %u!\n",
+ OPAL_MPIPL_VERSION, opalc_metadata->version);
+ pr_warn("WARNING: F/W using newer OPAL metadata format!!\n");
+ }
+
+ oc_conf->ptload_cnt = 0;
+ idx = be32_to_cpu(opalc_metadata->region_cnt);
+ if (idx > MAX_PT_LOAD_CNT) {
+ pr_warn("WARNING: OPAL regions count (%d) adjusted to limit (%d)",
+ idx, MAX_PT_LOAD_CNT);
+ idx = MAX_PT_LOAD_CNT;
+ }
+ for (i = 0; i < idx; i++) {
+ oc_conf->ptload_addr[oc_conf->ptload_cnt] =
+ be64_to_cpu(opalc_metadata->region[i].dest);
+ oc_conf->ptload_size[oc_conf->ptload_cnt++] =
+ be64_to_cpu(opalc_metadata->region[i].size);
+ }
+ oc_conf->ptload_cnt = i;
+ oc_conf->crashing_cpu = be32_to_cpu(opalc_metadata->crashing_pir);
+
+ if (!oc_conf->ptload_cnt) {
+ pr_err("OPAL memory regions not found\n");
+ goto error_out;
+ }
+
+ /* Parse OPAL CPU metadata */
+ cpu_data_version = be32_to_cpu(opalc_cpu_metadata->cpu_data_version);
+ if (cpu_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+ pr_warn("Supported CPU data version: %u, found: %u!\n",
+ HDAT_FADUMP_CPU_DATA_VER, cpu_data_version);
+ pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+ }
+
+ addr = be64_to_cpu(opalc_cpu_metadata->region[0].dest);
+ if (!addr) {
+ pr_err("CPU state data not found!\n");
+ goto error_out;
+ }
+ oc_conf->cpu_state_destination_vaddr = (u64)__va(addr);
+
+ oc_conf->cpu_state_data_size =
+ be64_to_cpu(opalc_cpu_metadata->region[0].size);
+ oc_conf->cpu_state_entry_size =
+ be32_to_cpu(opalc_cpu_metadata->cpu_data_size);
+
+ if ((oc_conf->cpu_state_entry_size == 0) ||
+ (oc_conf->cpu_state_entry_size > oc_conf->cpu_state_data_size)) {
+ pr_err("CPU state data is invalid.\n");
+ goto error_out;
+ }
+ oc_conf->num_cpus = (oc_conf->cpu_state_data_size /
+ oc_conf->cpu_state_entry_size);
+
+ of_node_put(np);
+ return;
+
+error_out:
+ pr_err("Could not export /sys/firmware/opal/core\n");
+ opalcore_cleanup();
+ of_node_put(np);
+}
+
+static ssize_t release_core_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int input = -1;
+
+ if (kstrtoint(buf, 0, &input))
+ return -EINVAL;
+
+ if (input == 1) {
+ if (oc_conf == NULL) {
+ pr_err("'/sys/firmware/opal/core' file not accessible!\n");
+ return -EPERM;
+ }
+
+ /*
+ * Take away '/sys/firmware/opal/core' and release all memory
+ * used for exporting this file.
+ */
+ opalcore_cleanup();
+ } else
+ return -EINVAL;
+
+ return count;
+}
+
+static struct kobj_attribute opalcore_rel_attr = __ATTR_WO(release_core);
+
+static struct attribute *mpipl_attr[] = {
+ &opalcore_rel_attr.attr,
+ NULL,
+};
+
+static const struct bin_attribute *const mpipl_bin_attr[] = {
+ &opal_core_attr,
+ NULL,
+
+};
+
+static const struct attribute_group mpipl_group = {
+ .attrs = mpipl_attr,
+ .bin_attrs = mpipl_bin_attr,
+};
+
+static int __init opalcore_init(void)
+{
+ int rc = -1;
+
+ opalcore_config_init();
+
+ if (oc_conf == NULL)
+ return rc;
+
+ create_opalcore();
+
+ /*
+	 * The dump can only be exported if oc_conf->opalcorebuf was
+	 * set up successfully in the 2nd (capture) kernel.
+ */
+ if (!(is_opalcore_usable())) {
+ pr_err("Failed to export /sys/firmware/opal/mpipl/core\n");
+ opalcore_cleanup();
+ return rc;
+ }
+
+ /* Set OPAL core file size */
+ opal_core_attr.size = oc_conf->opalcore_size;
+
+ mpipl_kobj = kobject_create_and_add("mpipl", opal_kobj);
+ if (!mpipl_kobj) {
+ pr_err("unable to create mpipl kobject\n");
+ return -ENOMEM;
+ }
+
+ /* Export OPAL core sysfs file */
+ rc = sysfs_create_group(mpipl_kobj, &mpipl_group);
+ if (rc) {
+ pr_err("mpipl sysfs group creation failed (%d)", rc);
+ opalcore_cleanup();
+ return rc;
+ }
+	/* /sys/firmware/opal/core has moved to the /sys/firmware/opal/mpipl/
+	 * directory; create a symlink at the old location to maintain
+	 * backward compatibility.
+ */
+ rc = compat_only_sysfs_link_entry_to_kobj(opal_kobj, mpipl_kobj,
+ "core", NULL);
+ if (rc) {
+ pr_err("unable to create core symlink (%d)\n", rc);
+ return rc;
+ }
+
+ return 0;
+}
+fs_initcall(opalcore_init);
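Once the dump has been saved, userspace can free the memory backing the exported core by writing "1" to the release_core attribute created above. A hedged userspace sketch, assuming the sysfs layout set up by this file:

#include <fcntl.h>
#include <unistd.h>

static int release_opal_core(void)
{
	int fd = open("/sys/firmware/opal/mpipl/release_core", O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}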
diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
index 198143833f00..cc3cc9ddf9d1 100644
--- a/arch/powerpc/platforms/powernv/opal-dump.c
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV OPAL Dump Interface
*
* Copyright 2013,2014 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kobject.h>
@@ -92,9 +88,14 @@ static ssize_t dump_ack_store(struct dump_obj *dump_obj,
const char *buf,
size_t count)
{
- dump_send_ack(dump_obj->id);
- sysfs_remove_file_self(&dump_obj->kobj, &attr->attr);
- kobject_put(&dump_obj->kobj);
+ /*
+ * Try to self remove this attribute. If we are successful,
+ * delete the kobject itself.
+ */
+ if (sysfs_remove_file_self(&dump_obj->kobj, &attr->attr)) {
+ dump_send_ack(dump_obj->id);
+ kobject_put(&dump_obj->kobj);
+ }
return count;
}
@@ -149,7 +150,7 @@ static struct attribute *initiate_attrs[] = {
NULL,
};
-static struct attribute_group initiate_attr_group = {
+static const struct attribute_group initiate_attr_group = {
.attrs = initiate_attrs,
};
@@ -207,11 +208,12 @@ static struct attribute *dump_default_attrs[] = {
&ack_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(dump_default);
-static struct kobj_type dump_ktype = {
+static const struct kobj_type dump_ktype = {
.sysfs_ops = &dump_sysfs_ops,
.release = &dump_release,
- .default_attrs = dump_default_attrs,
+ .default_groups = dump_default_groups,
};
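For reference, the ATTRIBUTE_GROUPS(dump_default) helper added above expands to roughly the following, which is what the new .default_groups member points at:

static const struct attribute_group dump_default_group = {
	.attrs = dump_default_attrs,
};
static const struct attribute_group *dump_default_groups[] = {
	&dump_default_group,
	NULL,
};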
static int64_t dump_read_info(uint32_t *dump_id, uint32_t *dump_size, uint32_t *dump_type)
@@ -284,7 +286,7 @@ out:
}
static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buffer, loff_t pos, size_t count)
{
ssize_t rc;
@@ -322,15 +324,14 @@ static ssize_t dump_attr_read(struct file *filep, struct kobject *kobj,
return count;
}
-static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
- uint32_t type)
+static void create_dump_obj(uint32_t id, size_t size, uint32_t type)
{
struct dump_obj *dump;
int rc;
dump = kzalloc(sizeof(*dump), GFP_KERNEL);
if (!dump)
- return NULL;
+ return;
dump->kobj.kset = dump_kset;
@@ -350,21 +351,39 @@ static struct dump_obj *create_dump_obj(uint32_t id, size_t size,
rc = kobject_add(&dump->kobj, NULL, "0x%x-0x%x", type, id);
if (rc) {
kobject_put(&dump->kobj);
- return NULL;
+ return;
}
+ /*
+ * As soon as the sysfs file for this dump is created/activated there is
+ * a chance the opal_errd daemon (or any userspace) might read and
+ * acknowledge the dump before kobject_uevent() is called. If that
+ * happens then there is a potential race between
+ * dump_ack_store->kobject_put() and kobject_uevent() which leads to a
+ * use-after-free of a kernfs object resulting in a kernel crash.
+ *
+ * To avoid that, we need to take a reference on behalf of the bin file,
+ * so that our reference remains valid while we call kobject_uevent().
+ * We then drop our reference before exiting the function, leaving the
+ * bin file to drop the last reference (if it hasn't already).
+ */
+
+ /* Take a reference for the bin file */
+ kobject_get(&dump->kobj);
rc = sysfs_create_bin_file(&dump->kobj, &dump->dump_attr);
- if (rc) {
+ if (rc == 0) {
+ kobject_uevent(&dump->kobj, KOBJ_ADD);
+
+ pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
+ __func__, dump->id, dump->size);
+ } else {
+ /* Drop reference count taken for bin file */
kobject_put(&dump->kobj);
- return NULL;
}
- pr_info("%s: New platform dump. ID = 0x%x Size %u\n",
- __func__, dump->id, dump->size);
-
- kobject_uevent(&dump->kobj, KOBJ_ADD);
-
- return dump;
+ /* Drop our reference */
+ kobject_put(&dump->kobj);
+ return;
}
static irqreturn_t process_dump(int irq, void *data)
@@ -401,7 +420,7 @@ void __init opal_platform_dump_init(void)
int rc;
int dump_irq;
- /* ELOG not supported by firmware */
+ /* Dump not supported by firmware */
if (!opal_check_token(OPAL_DUMP_READ))
return;
diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c
index ba6e437abb4b..c3fc5d258146 100644
--- a/arch/powerpc/platforms/powernv/opal-elog.c
+++ b/arch/powerpc/platforms/powernv/opal-elog.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Error log support on PowerNV.
*
* Copyright 2013,2014 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/init.h>
@@ -76,9 +72,14 @@ static ssize_t elog_ack_store(struct elog_obj *elog_obj,
const char *buf,
size_t count)
{
- opal_send_ack_elog(elog_obj->id);
- sysfs_remove_file_self(&elog_obj->kobj, &attr->attr);
- kobject_put(&elog_obj->kobj);
+ /*
+ * Try to self remove this attribute. If we are successful,
+ * delete the kobject itself.
+ */
+ if (sysfs_remove_file_self(&elog_obj->kobj, &attr->attr)) {
+ opal_send_ack_elog(elog_obj->id);
+ kobject_put(&elog_obj->kobj);
+ }
return count;
}
@@ -143,18 +144,19 @@ static struct attribute *elog_default_attrs[] = {
&ack_attribute.attr,
NULL,
};
+ATTRIBUTE_GROUPS(elog_default);
-static struct kobj_type elog_ktype = {
+static const struct kobj_type elog_ktype = {
.sysfs_ops = &elog_sysfs_ops,
.release = &elog_release,
- .default_attrs = elog_default_attrs,
+ .default_groups = elog_default_groups,
};
/* Maximum size of a single log on FSP is 16KB */
#define OPAL_MAX_ERRLOG_SIZE 16384
static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buffer, loff_t pos, size_t count)
{
int opal_rc;
@@ -170,8 +172,8 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
opal_rc = opal_read_elog(__pa(elog->buffer),
elog->size, elog->id);
if (opal_rc != OPAL_SUCCESS) {
- pr_err("ELOG: log read failed for log-id=%llx\n",
- elog->id);
+ pr_err_ratelimited("ELOG: log read failed for log-id=%llx\n",
+ elog->id);
kfree(elog->buffer);
elog->buffer = NULL;
return -EIO;
@@ -183,14 +185,14 @@ static ssize_t raw_attr_read(struct file *filep, struct kobject *kobj,
return count;
}
-static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
+static void create_elog_obj(uint64_t id, size_t size, uint64_t type)
{
struct elog_obj *elog;
int rc;
elog = kzalloc(sizeof(*elog), GFP_KERNEL);
if (!elog)
- return NULL;
+ return;
elog->kobj.kset = elog_kset;
@@ -223,18 +225,37 @@ static struct elog_obj *create_elog_obj(uint64_t id, size_t size, uint64_t type)
rc = kobject_add(&elog->kobj, NULL, "0x%llx", id);
if (rc) {
kobject_put(&elog->kobj);
- return NULL;
+ return;
}
+ /*
+ * As soon as the sysfs file for this elog is created/activated there is
+ * a chance the opal_errd daemon (or any userspace) might read and
+ * acknowledge the elog before kobject_uevent() is called. If that
+ * happens then there is a potential race between
+ * elog_ack_store->kobject_put() and kobject_uevent() which leads to a
+ * use-after-free of a kernfs object resulting in a kernel crash.
+ *
+ * To avoid that, we need to take a reference on behalf of the bin file,
+ * so that our reference remains valid while we call kobject_uevent().
+ * We then drop our reference before exiting the function, leaving the
+ * bin file to drop the last reference (if it hasn't already).
+ */
+
+ /* Take a reference for the bin file */
+ kobject_get(&elog->kobj);
rc = sysfs_create_bin_file(&elog->kobj, &elog->raw_attr);
- if (rc) {
+ if (rc == 0) {
+ kobject_uevent(&elog->kobj, KOBJ_ADD);
+ } else {
+ /* Drop the reference taken for the bin file */
kobject_put(&elog->kobj);
- return NULL;
}
- kobject_uevent(&elog->kobj, KOBJ_ADD);
+ /* Drop our reference */
+ kobject_put(&elog->kobj);
- return elog;
+ return;
}
static irqreturn_t elog_event(int irq, void *data)
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
new file mode 100644
index 000000000000..c9c1dfb35464
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -0,0 +1,719 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "opal fadump: " fmt
+
+#include <linux/string.h>
+#include <linux/seq_file.h>
+#include <linux/of.h>
+#include <linux/of_fdt.h>
+#include <linux/libfdt.h>
+#include <linux/mm.h>
+#include <linux/crash_dump.h>
+
+#include <asm/page.h>
+#include <asm/opal.h>
+#include <asm/fadump-internal.h>
+
+#include "opal-fadump.h"
+
+
+#ifdef CONFIG_PRESERVE_FA_DUMP
+/*
+ * When dump is active but PRESERVE_FA_DUMP is enabled on the kernel,
+ * ensure the crash data is preserved in the hope that a subsequent
+ * memory-preserving kernel boot will process this crash data.
+ */
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+ const struct opal_fadump_mem_struct *opal_fdm_active;
+ const __be32 *prop;
+ unsigned long dn;
+ u64 addr = 0;
+ s64 ret;
+
+ dn = of_get_flat_dt_subnode_by_name(node, "dump");
+ if (dn == -FDT_ERR_NOTFOUND)
+ return;
+
+ /*
+ * Check if dump has been initiated on last reboot.
+ */
+ prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+ if (!prop)
+ return;
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_debug("Could not get Kernel metadata (%lld)\n", ret);
+ return;
+ }
+
+ /*
+ * Preserve memory only if kernel memory regions are registered
+ * with f/w for MPIPL.
+ */
+ addr = be64_to_cpu(addr);
+ pr_debug("Kernel metadata addr: %llx\n", addr);
+ opal_fdm_active = (void *)addr;
+ if (be16_to_cpu(opal_fdm_active->registered_regions) == 0)
+ return;
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_BOOT_MEM, &addr);
+ if ((ret != OPAL_SUCCESS) || !addr) {
+ pr_err("Failed to get boot memory tag (%lld)\n", ret);
+ return;
+ }
+
+ /*
+ * Memory below this address can be used for booting a
+ * capture kernel or petitboot kernel. Preserve everything
+ * above this address for processing crashdump.
+ */
+ fadump_conf->boot_mem_top = be64_to_cpu(addr);
+ pr_debug("Preserve everything above %llx\n", fadump_conf->boot_mem_top);
+
+ pr_info("Firmware-assisted dump is active.\n");
+ fadump_conf->dump_active = 1;
+}
+
+#else /* CONFIG_PRESERVE_FA_DUMP */
+static const struct opal_fadump_mem_struct *opal_fdm_active;
+static const struct opal_mpipl_fadump *opal_cpu_metadata;
+static struct opal_fadump_mem_struct *opal_fdm;
+
+#ifdef CONFIG_OPAL_CORE
+extern bool kernel_initiated;
+#endif
+
+static int opal_fadump_unregister(struct fw_dump *fadump_conf);
+
+static void opal_fadump_update_config(struct fw_dump *fadump_conf,
+ const struct opal_fadump_mem_struct *fdm)
+{
+ pr_debug("Boot memory regions count: %d\n", be16_to_cpu(fdm->region_cnt));
+
+ /*
+	 * The destination address of the first boot memory region marks the
+	 * start of the destination area for all boot memory regions.
+ */
+ fadump_conf->boot_mem_dest_addr = be64_to_cpu(fdm->rgn[0].dest);
+ pr_debug("Destination address of boot memory regions: %#016llx\n",
+ fadump_conf->boot_mem_dest_addr);
+
+ fadump_conf->fadumphdr_addr = be64_to_cpu(fdm->fadumphdr_addr);
+}
+
+/*
+ * This function is called in the capture kernel to get configuration details
+ * from metadata setup by the first kernel.
+ */
+static void __init opal_fadump_get_config(struct fw_dump *fadump_conf,
+ const struct opal_fadump_mem_struct *fdm)
+{
+ unsigned long base, size, last_end, hole_size;
+ int i;
+
+ if (!fadump_conf->dump_active)
+ return;
+
+ last_end = 0;
+ hole_size = 0;
+ fadump_conf->boot_memory_size = 0;
+
+ pr_debug("Boot memory regions:\n");
+ for (i = 0; i < be16_to_cpu(fdm->region_cnt); i++) {
+ base = be64_to_cpu(fdm->rgn[i].src);
+ size = be64_to_cpu(fdm->rgn[i].size);
+ pr_debug("\t[%03d] base: 0x%lx, size: 0x%lx\n", i, base, size);
+
+ fadump_conf->boot_mem_addr[i] = base;
+ fadump_conf->boot_mem_sz[i] = size;
+ fadump_conf->boot_memory_size += size;
+ hole_size += (base - last_end);
+
+ last_end = base + size;
+ }
+
+ /*
+ * Start address of reserve dump area (permanent reservation) for
+ * re-registering FADump after dump capture.
+ */
+ fadump_conf->reserve_dump_area_start = be64_to_cpu(fdm->rgn[0].dest);
+
+ /*
+	 * Rarely, the system can crash before all boot memory regions
+	 * are registered for MPIPL. In such
+ * cases, warn that the vmcore may not be accurate and proceed
+ * anyway as that is the best bet considering free pages, cache
+ * pages, user pages, etc are usually filtered out.
+ *
+	 * Hopefully, the memory that could not be preserved only contains
+	 * pages that are usually filtered out while saving the vmcore.
+ */
+ if (be16_to_cpu(fdm->region_cnt) > be16_to_cpu(fdm->registered_regions)) {
+ pr_warn("Not all memory regions were saved!!!\n");
+ pr_warn(" Unsaved memory regions:\n");
+ i = be16_to_cpu(fdm->registered_regions);
+ while (i < be16_to_cpu(fdm->region_cnt)) {
+ pr_warn("\t[%03d] base: 0x%llx, size: 0x%llx\n",
+ i, be64_to_cpu(fdm->rgn[i].src),
+ be64_to_cpu(fdm->rgn[i].size));
+ i++;
+ }
+
+ pr_warn("If the unsaved regions only contain pages that are filtered out (eg. free/user pages), the vmcore should still be usable.\n");
+ pr_warn("WARNING: If the unsaved regions contain kernel pages, the vmcore will be corrupted.\n");
+ }
+
+ fadump_conf->boot_mem_top = (fadump_conf->boot_memory_size + hole_size);
+ fadump_conf->boot_mem_regs_cnt = be16_to_cpu(fdm->region_cnt);
+ opal_fadump_update_config(fadump_conf, fdm);
+}
+
+/* Initialize kernel metadata */
+static void opal_fadump_init_metadata(struct opal_fadump_mem_struct *fdm)
+{
+ fdm->version = OPAL_FADUMP_VERSION;
+ fdm->region_cnt = cpu_to_be16(0);
+ fdm->registered_regions = cpu_to_be16(0);
+ fdm->fadumphdr_addr = cpu_to_be64(0);
+}
+
+static u64 opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
+{
+ u64 addr = fadump_conf->reserve_dump_area_start;
+ u16 reg_cnt;
+ int i;
+
+ opal_fdm = __va(fadump_conf->kernel_metadata);
+ opal_fadump_init_metadata(opal_fdm);
+
+ /* Boot memory regions */
+ reg_cnt = be16_to_cpu(opal_fdm->region_cnt);
+ for (i = 0; i < fadump_conf->boot_mem_regs_cnt; i++) {
+ opal_fdm->rgn[i].src = cpu_to_be64(fadump_conf->boot_mem_addr[i]);
+ opal_fdm->rgn[i].dest = cpu_to_be64(addr);
+ opal_fdm->rgn[i].size = cpu_to_be64(fadump_conf->boot_mem_sz[i]);
+
+ reg_cnt++;
+ addr += fadump_conf->boot_mem_sz[i];
+ }
+ opal_fdm->region_cnt = cpu_to_be16(reg_cnt);
+
+ /*
+ * Kernel metadata is passed to f/w and retrieved in capture kernel.
+ * So, use it to save fadump header address instead of calculating it.
+ */
+ opal_fdm->fadumphdr_addr = cpu_to_be64(be64_to_cpu(opal_fdm->rgn[0].dest) +
+ fadump_conf->boot_memory_size);
+
+ opal_fadump_update_config(fadump_conf, opal_fdm);
+
+ return addr;
+}
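A worked example of the destination layout computed above, assuming two boot memory regions of 1 GiB each and reserve_dump_area_start at 0x40000000 (all values illustrative):

/*
 *   rgn[0]: dest = 0x40000000, size = 0x40000000
 *   rgn[1]: dest = 0x80000000, size = 0x40000000
 *   fadumphdr_addr = rgn[0].dest + boot_memory_size
 *                  = 0x40000000 + 0x80000000 = 0xC0000000
 */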
+
+static u64 opal_fadump_get_metadata_size(void)
+{
+ return PAGE_ALIGN(sizeof(struct opal_fadump_mem_struct));
+}
+
+static int opal_fadump_setup_metadata(struct fw_dump *fadump_conf)
+{
+ int err = 0;
+ s64 ret;
+
+ /*
+ * Use the last page(s) in FADump memory reservation for
+ * kernel metadata.
+ */
+ fadump_conf->kernel_metadata = (fadump_conf->reserve_dump_area_start +
+ fadump_conf->reserve_dump_area_size -
+ opal_fadump_get_metadata_size());
+ pr_info("Kernel metadata addr: %llx\n", fadump_conf->kernel_metadata);
+
+ /* Initialize kernel metadata before registering the address with f/w */
+ opal_fdm = __va(fadump_conf->kernel_metadata);
+ opal_fadump_init_metadata(opal_fdm);
+
+ /*
+ * Register metadata address with f/w. Can be retrieved in
+ * the capture kernel.
+ */
+ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL,
+ fadump_conf->kernel_metadata);
+ if (ret != OPAL_SUCCESS) {
+ pr_err("Failed to set kernel metadata tag!\n");
+ err = -EPERM;
+ }
+
+ /*
+ * Register boot memory top address with f/w. Should be retrieved
+	 * by a kernel that intends to preserve the crashed kernel's memory.
+ */
+ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_BOOT_MEM,
+ fadump_conf->boot_mem_top);
+ if (ret != OPAL_SUCCESS) {
+ pr_err("Failed to set boot memory tag!\n");
+ err = -EPERM;
+ }
+
+ return err;
+}
+
+static u64 opal_fadump_get_bootmem_min(void)
+{
+ return OPAL_FADUMP_MIN_BOOT_MEM;
+}
+
+static int opal_fadump_register(struct fw_dump *fadump_conf)
+{
+ s64 rc = OPAL_PARAMETER;
+ u16 registered_regs;
+ int i, err = -EIO;
+
+ registered_regs = be16_to_cpu(opal_fdm->registered_regions);
+ for (i = 0; i < be16_to_cpu(opal_fdm->region_cnt); i++) {
+ rc = opal_mpipl_update(OPAL_MPIPL_ADD_RANGE,
+ be64_to_cpu(opal_fdm->rgn[i].src),
+ be64_to_cpu(opal_fdm->rgn[i].dest),
+ be64_to_cpu(opal_fdm->rgn[i].size));
+ if (rc != OPAL_SUCCESS)
+ break;
+
+ registered_regs++;
+ }
+ opal_fdm->registered_regions = cpu_to_be16(registered_regs);
+
+ switch (rc) {
+ case OPAL_SUCCESS:
+ pr_info("Registration is successful!\n");
+ fadump_conf->dump_registered = 1;
+ err = 0;
+ break;
+ case OPAL_RESOURCE:
+ /* If MAX regions limit in f/w is hit, warn and proceed. */
+ pr_warn("%d regions could not be registered for MPIPL as MAX limit is reached!\n",
+ (be16_to_cpu(opal_fdm->region_cnt) -
+ be16_to_cpu(opal_fdm->registered_regions)));
+ fadump_conf->dump_registered = 1;
+ err = 0;
+ break;
+ case OPAL_PARAMETER:
+ pr_err("Failed to register. Parameter Error(%lld).\n", rc);
+ break;
+ case OPAL_HARDWARE:
+ pr_err("Support not available.\n");
+ fadump_conf->fadump_supported = 0;
+ fadump_conf->fadump_enabled = 0;
+ break;
+ default:
+ pr_err("Failed to register. Unknown Error(%lld).\n", rc);
+ break;
+ }
+
+ /*
+ * If some regions were registered before OPAL_MPIPL_ADD_RANGE
+ * OPAL call failed, unregister all regions.
+ */
+ if ((err < 0) && (be16_to_cpu(opal_fdm->registered_regions) > 0))
+ opal_fadump_unregister(fadump_conf);
+
+ return err;
+}
+
+static int opal_fadump_unregister(struct fw_dump *fadump_conf)
+{
+ s64 rc;
+
+ rc = opal_mpipl_update(OPAL_MPIPL_REMOVE_ALL, 0, 0, 0);
+ if (rc) {
+ pr_err("Failed to un-register - unexpected Error(%lld).\n", rc);
+ return -EIO;
+ }
+
+ opal_fdm->registered_regions = cpu_to_be16(0);
+ fadump_conf->dump_registered = 0;
+ return 0;
+}
+
+static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
+{
+ s64 rc;
+
+ rc = opal_mpipl_update(OPAL_MPIPL_FREE_PRESERVED_MEMORY, 0, 0, 0);
+ if (rc) {
+ pr_err("Failed to invalidate - unexpected Error(%lld).\n", rc);
+ return -EIO;
+ }
+
+ fadump_conf->dump_active = 0;
+ opal_fdm_active = NULL;
+ return 0;
+}
+
+static void opal_fadump_cleanup(struct fw_dump *fadump_conf)
+{
+ s64 ret;
+
+ ret = opal_mpipl_register_tag(OPAL_MPIPL_TAG_KERNEL, 0);
+ if (ret != OPAL_SUCCESS)
+ pr_warn("Could not reset (%llu) kernel metadata tag!\n", ret);
+}
+
+/*
+ * Verify if CPU state data is available. If available, do a bit of sanity
+ * checking before processing this data.
+ */
+static bool __init is_opal_fadump_cpu_data_valid(struct fw_dump *fadump_conf)
+{
+ if (!opal_cpu_metadata)
+ return false;
+
+ fadump_conf->cpu_state_data_version =
+ be32_to_cpu(opal_cpu_metadata->cpu_data_version);
+ fadump_conf->cpu_state_entry_size =
+ be32_to_cpu(opal_cpu_metadata->cpu_data_size);
+ fadump_conf->cpu_state_dest_vaddr =
+ (u64)__va(be64_to_cpu(opal_cpu_metadata->region[0].dest));
+ fadump_conf->cpu_state_data_size =
+ be64_to_cpu(opal_cpu_metadata->region[0].size);
+
+ if (fadump_conf->cpu_state_data_version != HDAT_FADUMP_CPU_DATA_VER) {
+ pr_warn("Supported CPU state data version: %u, found: %d!\n",
+ HDAT_FADUMP_CPU_DATA_VER,
+ fadump_conf->cpu_state_data_version);
+ pr_warn("WARNING: F/W using newer CPU state data format!!\n");
+ }
+
+ if ((fadump_conf->cpu_state_dest_vaddr == 0) ||
+ (fadump_conf->cpu_state_entry_size == 0) ||
+ (fadump_conf->cpu_state_entry_size >
+ fadump_conf->cpu_state_data_size)) {
+ pr_err("CPU state data is invalid. Ignoring!\n");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Convert CPU state data saved at the time of crash into ELF notes.
+ *
+ * While the crashing CPU's register data is saved by the kernel, CPU state
+ * data for all CPUs is saved by f/w. In CPU state data provided by f/w,
+ * each register entry is of 16 bytes, a numerical identifier along with
+ * a GPR/SPR flag in the first 8 bytes and the register value in the next
+ * 8 bytes. For more details refer to F/W documentation. If this data is
+ * missing or in unsupported format, append crashing CPU's register data
+ * saved by the kernel in the PT_NOTE, to have something to work with in
+ * the vmcore file.
+ */
+static int __init
+opal_fadump_build_cpu_notes(struct fw_dump *fadump_conf,
+ struct fadump_crash_info_header *fdh)
+{
+ u32 thread_pir, size_per_thread, regs_offset, regs_cnt, reg_esize;
+ struct hdat_fadump_thread_hdr *thdr;
+ bool is_cpu_data_valid = false;
+ u32 num_cpus = 1, *note_buf;
+ struct pt_regs regs;
+ char *bufp;
+ int rc, i;
+
+ if (is_opal_fadump_cpu_data_valid(fadump_conf)) {
+ size_per_thread = fadump_conf->cpu_state_entry_size;
+ num_cpus = (fadump_conf->cpu_state_data_size / size_per_thread);
+ bufp = __va(fadump_conf->cpu_state_dest_vaddr);
+ is_cpu_data_valid = true;
+ }
+
+ rc = fadump_setup_cpu_notes_buf(num_cpus);
+ if (rc != 0)
+ return rc;
+
+ note_buf = (u32 *)fadump_conf->cpu_notes_buf_vaddr;
+ if (!is_cpu_data_valid)
+ goto out;
+
+ /*
+ * Offset for register entries, entry size and registers count is
+ * duplicated in every thread header in keeping with HDAT format.
+ * Use these values from the first thread header.
+ */
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+ regs_offset = (offsetof(struct hdat_fadump_thread_hdr, offset) +
+ be32_to_cpu(thdr->offset));
+ reg_esize = be32_to_cpu(thdr->esize);
+ regs_cnt = be32_to_cpu(thdr->ecnt);
+
+ pr_debug("--------CPU State Data------------\n");
+ pr_debug("NumCpus : %u\n", num_cpus);
+ pr_debug("\tOffset: %u, Entry size: %u, Cnt: %u\n",
+ regs_offset, reg_esize, regs_cnt);
+
+ for (i = 0; i < num_cpus; i++, bufp += size_per_thread) {
+ thdr = (struct hdat_fadump_thread_hdr *)bufp;
+
+ thread_pir = be32_to_cpu(thdr->pir);
+ pr_debug("[%04d] PIR: 0x%x, core state: 0x%02x\n",
+ i, thread_pir, thdr->core_state);
+
+ /*
+		 * If this is a kernel-initiated crash, crashing_cpu would be set
+ * appropriately and register data of the crashing CPU saved by
+ * crashing kernel. Add this saved register data of crashing CPU
+ * to elf notes and populate the pt_regs for the remaining CPUs
+ * from register state data provided by firmware.
+ */
+ if (fdh->crashing_cpu == thread_pir) {
+ note_buf = fadump_regs_to_elf_notes(note_buf,
+ &fdh->regs);
+ pr_debug("Crashing CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+ fdh->crashing_cpu, fdh->regs.gpr[1],
+ fdh->regs.nip);
+ continue;
+ }
+
+ /*
+ * Register state data of MAX cores is provided by firmware,
+		 * but some of these cores may not be active. So, while
+ * processing register state data, check core state and
+ * skip threads that belong to inactive cores.
+ */
+ if (thdr->core_state == HDAT_FADUMP_CORE_INACTIVE)
+ continue;
+
+ opal_fadump_read_regs((bufp + regs_offset), regs_cnt,
+ reg_esize, true, &regs);
+ note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+ pr_debug("CPU PIR: 0x%x - R1 : 0x%lx, NIP : 0x%lx\n",
+ thread_pir, regs.gpr[1], regs.nip);
+ }
+
+out:
+ /*
+ * CPU state data is invalid/unsupported. Try appending crashing CPU's
+ * register data, if it is saved by the kernel.
+ */
+ if (fadump_conf->cpu_notes_buf_vaddr == (u64)note_buf) {
+ if (fdh->crashing_cpu == FADUMP_CPU_UNKNOWN) {
+ fadump_free_cpu_notes_buf();
+ return -ENODEV;
+ }
+
+ pr_warn("WARNING: appending only crashing CPU's register data\n");
+ note_buf = fadump_regs_to_elf_notes(note_buf, &(fdh->regs));
+ }
+
+ final_note(note_buf);
+
+ pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+ fadump_conf->elfcorehdr_addr);
+ fadump_update_elfcore_header((char *)fadump_conf->elfcorehdr_addr);
+ return 0;
+}
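For readers unfamiliar with the HDAT layout walked above: each per-thread block begins with a hdat_fadump_thread_hdr whose offset field is relative to the offset member itself, followed by ecnt register entries of esize bytes each. A minimal sketch of how those fields locate entry i (the helper name is ours, not from the patch):

	/* Sketch: locate register entry 'i' inside one HDAT thread block. */
	static void *hdat_reg_entry(struct hdat_fadump_thread_hdr *thdr, unsigned int i)
	{
		/* The offset field is relative to itself, hence the offsetof() term. */
		u32 regs_offset = offsetof(struct hdat_fadump_thread_hdr, offset) +
				  be32_to_cpu(thdr->offset);

		return (char *)thdr + regs_offset + i * be32_to_cpu(thdr->esize);
	}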
+
+static int __init opal_fadump_process(struct fw_dump *fadump_conf)
+{
+ struct fadump_crash_info_header *fdh;
+ int rc = -EINVAL;
+
+ if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
+ return rc;
+
+ fdh = __va(fadump_conf->fadumphdr_addr);
+
+#ifdef CONFIG_OPAL_CORE
+ /*
+	 * If this is a kernel-initiated crash, crashing_cpu would be set
+ * appropriately and register data of the crashing CPU saved by
+ * crashing kernel. Add this saved register data of crashing CPU
+ * to elf notes and populate the pt_regs for the remaining CPUs
+ * from register state data provided by firmware.
+ */
+ if (fdh->crashing_cpu != FADUMP_CPU_UNKNOWN)
+ kernel_initiated = true;
+#endif
+
+ return opal_fadump_build_cpu_notes(fadump_conf, fdh);
+}
+
+static void opal_fadump_region_show(struct fw_dump *fadump_conf,
+ struct seq_file *m)
+{
+ const struct opal_fadump_mem_struct *fdm_ptr;
+ u64 dumped_bytes = 0;
+ int i;
+
+ if (fadump_conf->dump_active)
+ fdm_ptr = opal_fdm_active;
+ else
+ fdm_ptr = opal_fdm;
+
+ for (i = 0; i < be16_to_cpu(fdm_ptr->region_cnt); i++) {
+ /*
+ * Only regions that are registered for MPIPL
+ * would have dump data.
+ */
+ if ((fadump_conf->dump_active) &&
+ (i < be16_to_cpu(fdm_ptr->registered_regions)))
+ dumped_bytes = be64_to_cpu(fdm_ptr->rgn[i].size);
+
+ seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
+ be64_to_cpu(fdm_ptr->rgn[i].src),
+ be64_to_cpu(fdm_ptr->rgn[i].dest));
+ seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
+ be64_to_cpu(fdm_ptr->rgn[i].size), dumped_bytes);
+ }
+
+ /* Dump is active. Show preserved area start address. */
+ if (fadump_conf->dump_active) {
+ seq_printf(m, "\nMemory above %#016llx is reserved for saving crash dump\n",
+ fadump_conf->boot_mem_top);
+ }
+}
+
+static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
+ const char *msg)
+{
+ int rc;
+
+ /*
+	 * Unlike on the pSeries platform, the logical CPU number is not provided
+ * with architected register state data. So, store the crashing
+ * CPU's PIR instead to plug the appropriate register data for
+ * crashing CPU in the vmcore file.
+ */
+ fdh->crashing_cpu = (u32)mfspr(SPRN_PIR);
+
+ rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg);
+ if (rc == OPAL_UNSUPPORTED) {
+ pr_emerg("Reboot type %d not supported.\n",
+ OPAL_REBOOT_MPIPL);
+ } else if (rc == OPAL_HARDWARE)
+ pr_emerg("No backend support for MPIPL!\n");
+}
+
+/* FADUMP_MAX_MEM_REGS or lower */
+static int opal_fadump_max_boot_mem_rgns(void)
+{
+ return FADUMP_MAX_MEM_REGS;
+}
+
+static struct fadump_ops opal_fadump_ops = {
+ .fadump_init_mem_struct = opal_fadump_init_mem_struct,
+ .fadump_get_metadata_size = opal_fadump_get_metadata_size,
+ .fadump_setup_metadata = opal_fadump_setup_metadata,
+ .fadump_get_bootmem_min = opal_fadump_get_bootmem_min,
+ .fadump_register = opal_fadump_register,
+ .fadump_unregister = opal_fadump_unregister,
+ .fadump_invalidate = opal_fadump_invalidate,
+ .fadump_cleanup = opal_fadump_cleanup,
+ .fadump_process = opal_fadump_process,
+ .fadump_region_show = opal_fadump_region_show,
+ .fadump_trigger = opal_fadump_trigger,
+ .fadump_max_boot_mem_rgns = opal_fadump_max_boot_mem_rgns,
+};
+
+void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
+{
+ const __be32 *prop;
+ unsigned long dn;
+ __be64 be_addr;
+ u64 addr = 0;
+ int i, len;
+ s64 ret;
+
+ /*
+	 * Check if Firmware-Assisted Dump is supported. If yes, check
+ * if dump has been initiated on last reboot.
+ */
+ dn = of_get_flat_dt_subnode_by_name(node, "dump");
+ if (dn == -FDT_ERR_NOTFOUND) {
+ pr_debug("FADump support is missing!\n");
+ return;
+ }
+
+ if (!of_flat_dt_is_compatible(dn, "ibm,opal-dump")) {
+ pr_err("Support missing for this f/w version!\n");
+ return;
+ }
+
+ prop = of_get_flat_dt_prop(dn, "fw-load-area", &len);
+ if (prop) {
+ /*
+ * Each f/w load area is an (address,size) pair,
+ * 2 cells each, totalling 4 cells per range.
+ */
+ for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
+ u64 base, end;
+
+ base = of_read_number(prop + (i * 4) + 0, 2);
+ end = base;
+ end += of_read_number(prop + (i * 4) + 2, 2);
+ if (end > OPAL_FADUMP_MIN_BOOT_MEM) {
+ pr_err("F/W load area: 0x%llx-0x%llx\n",
+ base, end);
+ pr_err("F/W version not supported!\n");
+ return;
+ }
+ }
+ }
+
+ fadump_conf->ops = &opal_fadump_ops;
+ fadump_conf->fadump_supported = 1;
+ /* TODO: Add support to pass additional parameters */
+ fadump_conf->param_area_supported = 0;
+
+ /*
+ * Firmware supports 32-bit field for size. Align it to PAGE_SIZE
+ * and request firmware to copy multiple kernel boot memory regions.
+ */
+ fadump_conf->max_copy_size = ALIGN_DOWN(U32_MAX, PAGE_SIZE);
+
+ /*
+ * Check if dump has been initiated on last reboot.
+ */
+ prop = of_get_flat_dt_prop(dn, "mpipl-boot", NULL);
+ if (!prop)
+ return;
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_KERNEL, &be_addr);
+ if ((ret != OPAL_SUCCESS) || !be_addr) {
+ pr_err("Failed to get Kernel metadata (%lld)\n", ret);
+ return;
+ }
+
+ addr = be64_to_cpu(be_addr);
+ pr_debug("Kernel metadata addr: %llx\n", addr);
+
+ opal_fdm_active = __va(addr);
+ if (opal_fdm_active->version != OPAL_FADUMP_VERSION) {
+ pr_warn("Supported kernel metadata version: %u, found: %d!\n",
+ OPAL_FADUMP_VERSION, opal_fdm_active->version);
+ pr_warn("WARNING: Kernel metadata format mismatch identified! Core file maybe corrupted..\n");
+ }
+
+ /* Kernel regions not registered with f/w for MPIPL */
+ if (be16_to_cpu(opal_fdm_active->registered_regions) == 0) {
+ opal_fdm_active = NULL;
+ return;
+ }
+
+ ret = opal_mpipl_query_tag(OPAL_MPIPL_TAG_CPU, &be_addr);
+ if (be_addr) {
+ addr = be64_to_cpu(be_addr);
+ pr_debug("CPU metadata addr: %llx\n", addr);
+ opal_cpu_metadata = __va(addr);
+ }
+
+ pr_info("Firmware-assisted dump is active.\n");
+ fadump_conf->dump_active = 1;
+ opal_fadump_get_config(fadump_conf, opal_fdm_active);
+}
+#endif /* !CONFIG_PRESERVE_FA_DUMP */
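To make the "fw-load-area" check above concrete, here is a small worked example with hypothetical property values; the cell layout is exactly as described in the code comment (two 2-cell numbers per range, read with of_read_number()):

	/* Sketch: decoding one fw-load-area range; values are hypothetical. */
	static void __init fw_load_area_example(void)
	{
		const __be32 cells[4] = {
			cpu_to_be32(0), cpu_to_be32(0x20000000),	/* base = 512MB */
			cpu_to_be32(0), cpu_to_be32(0x10000000),	/* size = 256MB */
		};
		u64 base = of_read_number(cells, 2);			/* 0x20000000 */
		u64 end  = base + of_read_number(cells + 2, 2);		/* 0x30000000 */

		/* end == OPAL_FADUMP_MIN_BOOT_MEM, so this layout passes the check above. */
	}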
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h
new file mode 100644
index 000000000000..5eeb794b5eb1
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-fadump.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Firmware-Assisted Dump support on POWER platform (OPAL).
+ *
+ * Copyright 2019, Hari Bathini, IBM Corporation.
+ */
+
+#ifndef _POWERNV_OPAL_FADUMP_H
+#define _POWERNV_OPAL_FADUMP_H
+
+#include <asm/reg.h>
+
+/*
+ * With kernel & initrd loaded at 512MB (with 256MB size), enforce a minimum
+ * boot memory size of 768MB to ensure f/w loading kernel and initrd doesn't
+ * mess with crash'ed kernel's memory during MPIPL.
+ */
+#define OPAL_FADUMP_MIN_BOOT_MEM (0x30000000UL)
+
+/*
+ * OPAL FADump metadata structure format version
+ *
+ * The OPAL FADump kernel metadata structure stores the kernel metadata needed
+ * to register for / process a crash dump. The format version is used to keep
+ * track of changes in the structure format. Changes, if any, to the format
+ * are expected to be minimal and backward compatible.
+ */
+#define OPAL_FADUMP_VERSION 0x1
+
+/*
+ * OPAL FADump kernel metadata
+ *
+ * The address of this structure will be registered with f/w for retrieving
+ * in the capture kernel to process the crash dump.
+ */
+struct opal_fadump_mem_struct {
+ u8 version;
+ u8 reserved[3];
+ __be16 region_cnt; /* number of regions */
+ __be16 registered_regions; /* Regions registered for MPIPL */
+ __be64 fadumphdr_addr;
+ struct opal_mpipl_region rgn[FADUMP_MAX_MEM_REGS];
+} __packed;
+
+/*
+ * CPU state data
+ *
+ * CPU state data is provided by f/w. The format of this data is defined in
+ * the HDAT spec. The version is used to keep track of changes in this CPU
+ * state data format. Changes to this format are unlikely, but if there are
+ * any, please refer to the latest HDAT specification.
+ */
+#define HDAT_FADUMP_CPU_DATA_VER 1
+
+#define HDAT_FADUMP_CORE_INACTIVE (0x0F)
+
+/* HDAT thread header for register entries */
+struct hdat_fadump_thread_hdr {
+ __be32 pir;
+ /* 0x00 - 0x0F - The corresponding stop state of the core */
+ u8 core_state;
+ u8 reserved[3];
+
+ __be32 offset; /* Offset to Register Entries array */
+ __be32 ecnt; /* Number of entries */
+ __be32 esize; /* Alloc size of each array entry in bytes */
+ __be32 eactsz; /* Actual size of each array entry in bytes */
+} __packed;
+
+/* Register types populated by f/w */
+#define HDAT_FADUMP_REG_TYPE_GPR 0x01
+#define HDAT_FADUMP_REG_TYPE_SPR 0x02
+
+/* ID numbers used by f/w while populating certain registers */
+#define HDAT_FADUMP_REG_ID_NIP 0x7D0
+#define HDAT_FADUMP_REG_ID_MSR 0x7D1
+#define HDAT_FADUMP_REG_ID_CCR 0x7D2
+
+/* HDAT register entry. */
+struct hdat_fadump_reg_entry {
+ __be32 reg_type;
+ __be32 reg_num;
+ __be64 reg_val;
+} __packed;
+
+static inline void opal_fadump_set_regval_regnum(struct pt_regs *regs,
+ u32 reg_type, u32 reg_num,
+ u64 reg_val)
+{
+ if (reg_type == HDAT_FADUMP_REG_TYPE_GPR) {
+ if (reg_num < 32)
+ regs->gpr[reg_num] = reg_val;
+ return;
+ }
+
+ switch (reg_num) {
+ case SPRN_CTR:
+ regs->ctr = reg_val;
+ break;
+ case SPRN_LR:
+ regs->link = reg_val;
+ break;
+ case SPRN_XER:
+ regs->xer = reg_val;
+ break;
+ case SPRN_DAR:
+ regs->dar = reg_val;
+ break;
+ case SPRN_DSISR:
+ regs->dsisr = reg_val;
+ break;
+ case HDAT_FADUMP_REG_ID_NIP:
+ regs->nip = reg_val;
+ break;
+ case HDAT_FADUMP_REG_ID_MSR:
+ regs->msr = reg_val;
+ break;
+ case HDAT_FADUMP_REG_ID_CCR:
+ regs->ccr = reg_val;
+ break;
+ }
+}
+
+static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt,
+ unsigned int reg_entry_size,
+ bool cpu_endian,
+ struct pt_regs *regs)
+{
+ struct hdat_fadump_reg_entry *reg_entry;
+ u64 val;
+ int i;
+
+ memset(regs, 0, sizeof(struct pt_regs));
+
+ for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) {
+ reg_entry = (struct hdat_fadump_reg_entry *)bufp;
+ val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) :
+ (u64 __force)(reg_entry->reg_val));
+ opal_fadump_set_regval_regnum(regs,
+ be32_to_cpu(reg_entry->reg_type),
+ be32_to_cpu(reg_entry->reg_num),
+ val);
+ }
+}
+
+#endif /* _POWERNV_OPAL_FADUMP_H */
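A small usage sketch of the helpers above, decoding a single hypothetical 16-byte register entry (the NIP value is made up):

	/* Sketch: one SPR entry carrying NIP; reg_entry_size == sizeof(ent). */
	static void __init read_regs_example(void)
	{
		struct hdat_fadump_reg_entry ent = {
			.reg_type = cpu_to_be32(HDAT_FADUMP_REG_TYPE_SPR),
			.reg_num  = cpu_to_be32(HDAT_FADUMP_REG_ID_NIP),
			.reg_val  = cpu_to_be64(0xc000000000001234ULL),
		};
		struct pt_regs regs;

		opal_fadump_read_regs((char *)&ent, 1, sizeof(ent), true, &regs);
		/* regs.nip == 0xc000000000001234; all other fields remain zero. */
	}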
diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c
index b37015101bf6..a3f7a2928767 100644
--- a/arch/powerpc/platforms/powernv/opal-flash.c
+++ b/arch/powerpc/platforms/powernv/opal-flash.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV OPAL Firmware Update Interface
*
* Copyright 2013 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define DEBUG
@@ -436,7 +432,7 @@ static int alloc_image_buf(char *buffer, size_t count)
* and pre-allocate required memory.
*/
static ssize_t image_data_write(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buffer, loff_t pos, size_t count)
{
int rc;
@@ -516,7 +512,7 @@ static struct attribute *image_op_attrs[] = {
NULL /* need to NULL terminate the list of attributes */
};
-static struct attribute_group image_op_attr_group = {
+static const struct attribute_group image_op_attr_group = {
.attrs = image_op_attrs,
};
@@ -524,6 +520,10 @@ void __init opal_flash_update_init(void)
{
int ret;
+ /* Firmware update is not supported by firmware */
+ if (!opal_check_token(OPAL_FLASH_VALIDATE))
+ return;
+
/* Allocate validate image buffer */
validate_flash_data.buf = kzalloc(VALIDATE_BUF_SIZE, GFP_KERNEL);
if (!validate_flash_data.buf) {
diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c
index 586ec71a4e17..f0c1830deb51 100644
--- a/arch/powerpc/platforms/powernv/opal-hmi.c
+++ b/arch/powerpc/platforms/powernv/opal-hmi.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* OPAL hypervisor Maintenance interrupt handling support in PowerNV.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; If not, see <http://www.gnu.org/licenses/>.
- *
* Copyright 2014 IBM Corporation
* Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
*/
@@ -149,6 +137,43 @@ static void print_nx_checkstop_reason(const char *level,
xstop_reason[i].description);
}
+static void print_npu_checkstop_reason(const char *level,
+ struct OpalHMIEvent *hmi_evt)
+{
+ uint8_t reason, reason_count, i;
+
+ /*
+	 * We may not have a checkstop reason on some combinations of
+	 * hardware and/or skiboot versions.
+ */
+ if (!hmi_evt->u.xstop_error.xstop_reason) {
+ printk("%s NPU checkstop on chip %x\n", level,
+ be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
+ return;
+ }
+
+ /*
+ * NPU2 has 3 FIRs. Reason encoded on a byte as:
+ * 2 bits for the FIR number
+ * 6 bits for the bit number
+ * It may be possible to find several reasons.
+ *
+ * We don't display a specific message per FIR bit as there
+ * are too many and most are meaningless without the workbook
+ * and/or hw team help anyway.
+ */
+ reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
+ sizeof(reason);
+ for (i = 0; i < reason_count; i++) {
+ reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
+ if (reason)
+ printk("%s NPU checkstop on chip %x: FIR%d bit %d is set\n",
+ level,
+ be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
+ reason >> 6, reason & 0x3F);
+ }
+}
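The encoding described in the comment above packs one reason per byte of xstop_reason: the top two bits select the FIR, the low six bits select the FIR bit. A worked example on a hypothetical reason byte:

	/* Sketch: 0x85 == 0b10000101 -> FIR number 2, bit 5. */
	static void npu_reason_example(void)
	{
		uint8_t reason = 0x85;		/* hypothetical value */
		uint8_t fir    = reason >> 6;	/* 2 */
		uint8_t bit    = reason & 0x3F;	/* 5 */
	}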
+
static void print_checkstop_reason(const char *level,
struct OpalHMIEvent *hmi_evt)
{
@@ -160,6 +185,9 @@ static void print_checkstop_reason(const char *level,
case CHECKSTOP_TYPE_NX:
print_nx_checkstop_reason(level, hmi_evt);
break;
+ case CHECKSTOP_TYPE_NPU:
+ print_npu_checkstop_reason(level, hmi_evt);
+ break;
default:
printk("%s Unknown Malfunction Alert of type %d\n",
level, type);
@@ -185,6 +213,8 @@ static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
"A hypervisor resource error occurred",
"CAPP recovery process is in progress",
};
+ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
/* Print things out */
if (hmi_evt->version < OpalHMIEvt_V1) {
@@ -212,19 +242,22 @@ static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
break;
}
- printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
- level, sevstr,
- hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
- "Recovered" : "Not recovered");
- error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
- hmi_error_types[hmi_evt->type]
- : "Unknown";
- printk("%s Error detail: %s\n", level, error_info);
- printk("%s HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
- if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
- (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
- printk("%s TFMR: %016llx\n", level,
+ if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {
+ printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
+ level, sevstr,
+ hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
+ "Recovered" : "Not recovered");
+ error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
+ hmi_error_types[hmi_evt->type]
+ : "Unknown";
+ printk("%s Error detail: %s\n", level, error_info);
+ printk("%s HMER: %016llx\n", level,
+ be64_to_cpu(hmi_evt->hmer));
+ if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
+ (hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
+ printk("%s TFMR: %016llx\n", level,
be64_to_cpu(hmi_evt->tfmr));
+ }
if (hmi_evt->version < OpalHMIEvt_V2)
return;
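The hunk above only prints events of severity OpalHMI_SEV_NO_ERROR when the rate limit allows it, so a flood of benign HMIs cannot spam the log. The __ratelimit() idiom in isolation, with a hypothetical helper and message:

	#include <linux/ratelimit.h>

	static void report_benign_event(void)		/* hypothetical helper */
	{
		static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

		if (__ratelimit(&rs))
			pr_info("benign event (rate limited)\n");
	}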
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 58a07948c76e..828fc4d88471 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* OPAL IMC interface detection driver
* Supported on POWERNV platform
@@ -5,23 +6,17 @@
* Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation.
* (C) 2017 Anju T Sudhakar, IBM Corporation.
* (C) 2017 Hemant K Shaw, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or later version.
*/
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/of.h>
#include <linux/of_address.h>
-#include <linux/of_platform.h>
#include <linux/crash_dump.h>
+#include <linux/debugfs.h>
#include <asm/opal.h>
#include <asm/io.h>
#include <asm/imc-pmu.h>
#include <asm/cputhreads.h>
-#include <asm/debugfs.h>
static struct dentry *imc_debugfs_parent;
@@ -39,11 +34,10 @@ static int imc_mem_set(void *data, u64 val)
}
DEFINE_DEBUGFS_ATTRIBUTE(fops_imc_x64, imc_mem_get, imc_mem_set, "0x%016llx\n");
-static struct dentry *imc_debugfs_create_x64(const char *name, umode_t mode,
- struct dentry *parent, u64 *value)
+static void imc_debugfs_create_x64(const char *name, umode_t mode,
+ struct dentry *parent, u64 *value)
{
- return debugfs_create_file_unsafe(name, mode, parent,
- value, &fops_imc_x64);
+ debugfs_create_file_unsafe(name, mode, parent, value, &fops_imc_x64);
}
/*
@@ -57,41 +51,28 @@ static void export_imc_mode_and_cmd(struct device_node *node,
struct imc_pmu *pmu_ptr)
{
static u64 loc, *imc_mode_addr, *imc_cmd_addr;
- int chip = 0, nid;
char mode[16], cmd[16];
u32 cb_offset;
+ struct imc_mem_info *ptr = pmu_ptr->mem_info;
- imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
-
- /*
- * Return here, either because 'imc' directory already exists,
- * Or failed to create a new one.
- */
- if (!imc_debugfs_parent)
- return;
+ imc_debugfs_parent = debugfs_create_dir("imc", arch_debugfs_dir);
if (of_property_read_u32(node, "cb_offset", &cb_offset))
cb_offset = IMC_CNTL_BLK_OFFSET;
- for_each_node(nid) {
- loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset;
+ while (ptr->vbase != NULL) {
+ loc = (u64)(ptr->vbase) + cb_offset;
imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
- sprintf(mode, "imc_mode_%d", nid);
- if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
- imc_mode_addr))
- goto err;
+ sprintf(mode, "imc_mode_%d", (u32)(ptr->id));
+ imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
+ imc_mode_addr);
imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET);
- sprintf(cmd, "imc_cmd_%d", nid);
- if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
- imc_cmd_addr))
- goto err;
- chip++;
+ sprintf(cmd, "imc_cmd_%d", (u32)(ptr->id));
+ imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
+ imc_cmd_addr);
+ ptr++;
}
- return;
-
-err:
- debugfs_remove_recursive(imc_debugfs_parent);
}
/*
@@ -127,7 +108,7 @@ static int imc_get_mem_addr_nest(struct device_node *node,
nr_chips))
goto error;
- pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(*pmu_ptr->mem_info),
+ pmu_ptr->mem_info = kcalloc(nr_chips + 1, sizeof(*pmu_ptr->mem_info),
GFP_KERNEL);
if (!pmu_ptr->mem_info)
goto error;
@@ -139,7 +120,6 @@ static int imc_get_mem_addr_nest(struct device_node *node,
}
pmu_ptr->imc_counter_mmaped = true;
- export_imc_mode_and_cmd(node, pmu_ptr);
kfree(base_addr_arr);
kfree(chipid_arr);
return 0;
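The nr_chips + 1 allocation above leaves the last mem_info slot zeroed, and that zeroed slot is what lets the reworked export_imc_mode_and_cmd() walk the array without a separate count. The idiom in isolation (field names as used above):

	/* Sketch: walk a zero-terminated mem_info array. */
	static void walk_mem_info(struct imc_pmu *pmu_ptr)
	{
		struct imc_mem_info *ptr = pmu_ptr->mem_info;

		/* The last kcalloc'ed slot is all zeroes, so vbase == NULL ends the walk. */
		while (ptr->vbase != NULL) {
			pr_debug("chip %u, vbase %p\n", (u32)ptr->id, ptr->vbase);
			ptr++;
		}
	}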
@@ -155,31 +135,31 @@ error:
* and domain as the inputs.
* Allocates memory for the struct imc_pmu, sets up its domain, size and offsets
*/
-static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
+static struct imc_pmu *imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
{
int ret = 0;
struct imc_pmu *pmu_ptr;
u32 offset;
+ /* Return for unknown domain */
+ if (domain < 0)
+ return NULL;
+
/* memory for pmu */
pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL);
if (!pmu_ptr)
- return -ENOMEM;
+ return NULL;
/* Set the domain */
pmu_ptr->domain = domain;
ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size);
- if (ret) {
- ret = -EINVAL;
+ if (ret)
goto free_pmu;
- }
if (!of_property_read_u32(parent, "offset", &offset)) {
- if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) {
- ret = -EINVAL;
+ if (imc_get_mem_addr_nest(parent, pmu_ptr, offset))
goto free_pmu;
- }
}
/* Function to register IMC pmu */
@@ -190,14 +170,14 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain)
if (pmu_ptr->domain == IMC_DOMAIN_NEST)
kfree(pmu_ptr->mem_info);
kfree(pmu_ptr);
- return ret;
+ return NULL;
}
- return 0;
+ return pmu_ptr;
free_pmu:
kfree(pmu_ptr);
- return ret;
+ return NULL;
}
static void disable_nest_pmu_counters(void)
@@ -205,7 +185,7 @@ static void disable_nest_pmu_counters(void)
int nid, cpu;
const struct cpumask *l_cpumask;
- get_online_cpus();
+ cpus_read_lock();
for_each_node_with_cpus(nid) {
l_cpumask = cpumask_of_node(nid);
cpu = cpumask_first_and(l_cpumask, cpu_online_mask);
@@ -214,25 +194,25 @@ static void disable_nest_pmu_counters(void)
opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST,
get_hard_smp_processor_id(cpu));
}
- put_online_cpus();
+ cpus_read_unlock();
}
static void disable_core_pmu_counters(void)
{
- cpumask_t cores_map;
int cpu, rc;
- get_online_cpus();
+ cpus_read_lock();
/* Disable the IMC Core functions */
- cores_map = cpu_online_cores_map();
- for_each_cpu(cpu, &cores_map) {
+ for_each_online_cpu(cpu) {
+ if (cpu_first_thread_sibling(cpu) != cpu)
+ continue;
rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE,
get_hard_smp_processor_id(cpu));
if (rc)
pr_err("%s: Failed to stop Core (cpu = %d)\n",
- __FUNCTION__, cpu);
+ __func__, cpu);
}
- put_online_cpus();
+ cpus_read_unlock();
}
int get_max_nest_dev(void)
@@ -254,6 +234,7 @@ int get_max_nest_dev(void)
static int opal_imc_counters_probe(struct platform_device *pdev)
{
struct device_node *imc_dev = pdev->dev.of_node;
+ struct imc_pmu *pmu;
int pmu_count = 0, domain;
bool core_imc_reg = false, thread_imc_reg = false;
u32 type;
@@ -269,6 +250,7 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
}
for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) {
+ pmu = NULL;
if (of_property_read_u32(imc_dev, "type", &type)) {
pr_warn("IMC Device without type property\n");
continue;
@@ -284,15 +266,22 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
case IMC_TYPE_THREAD:
domain = IMC_DOMAIN_THREAD;
break;
+ case IMC_TYPE_TRACE:
+ domain = IMC_DOMAIN_TRACE;
+ break;
default:
pr_warn("IMC Unknown Device type \n");
domain = -1;
break;
}
- if (!imc_pmu_create(imc_dev, pmu_count, domain)) {
- if (domain == IMC_DOMAIN_NEST)
+ pmu = imc_pmu_create(imc_dev, pmu_count, domain);
+ if (pmu != NULL) {
+ if (domain == IMC_DOMAIN_NEST) {
+ if (!imc_debugfs_parent)
+ export_imc_mode_and_cmd(imc_dev, pmu);
pmu_count++;
+ }
if (domain == IMC_DOMAIN_CORE)
core_imc_reg = true;
if (domain == IMC_DOMAIN_THREAD)
@@ -300,10 +289,6 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
}
}
- /* If none of the nest units are registered, remove debugfs interface */
- if (pmu_count == 0)
- debugfs_remove_recursive(imc_debugfs_parent);
-
/* If core imc is not registered, unregister thread-imc */
if (!core_imc_reg && thread_imc_reg)
unregister_thread_imc();
diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c
index bc97770a67db..e180bd8e1400 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* This file implements an irqchip for OPAL events. Whenever there is
* an interrupt that is handled by OPAL we get passed a list of events
@@ -5,11 +6,6 @@
* interrupts to Linux so we implement an irqchip to handle them.
*
* Copyright Alistair Popple, IBM Corporation 2014.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
*/
#include <linux/bitops.h>
#include <linux/irq.h>
@@ -50,23 +46,20 @@ void opal_handle_events(void)
e = READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask;
again:
while (e) {
- int virq, hwirq;
+ int hwirq;
hwirq = fls64(e) - 1;
e &= ~BIT_ULL(hwirq);
local_irq_disable();
- virq = irq_find_mapping(opal_event_irqchip.domain, hwirq);
- if (virq) {
- irq_enter();
- generic_handle_irq(virq);
- irq_exit();
- }
+ irq_enter();
+ generic_handle_domain_irq(opal_event_irqchip.domain, hwirq);
+ irq_exit();
local_irq_enable();
cond_resched();
}
- last_outstanding_events = 0;
+ WRITE_ONCE(last_outstanding_events, 0);
if (opal_poll_events(&events) != OPAL_SUCCESS)
return;
e = be64_to_cpu(events) & opal_event_irqchip.mask;
@@ -76,7 +69,7 @@ again:
bool opal_have_pending_events(void)
{
- if (last_outstanding_events & opal_event_irqchip.mask)
+ if (READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask)
return true;
return false;
}
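last_outstanding_events is written from the OPAL interrupt handler and read from the event poller, so the patch wraps every access in READ_ONCE()/WRITE_ONCE() to keep the compiler from tearing or caching the value. The pattern in isolation, on a hypothetical shared word:

	static u64 pending;			/* hypothetical shared word */

	static void producer(void)		/* e.g. interrupt context */
	{
		WRITE_ONCE(pending, 0x1);
	}

	static bool consumer(void)		/* e.g. polling context */
	{
		return READ_ONCE(pending) != 0;
	}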
@@ -131,7 +124,7 @@ static irqreturn_t opal_interrupt(int irq, void *data)
__be64 events;
opal_handle_interrupt(virq_to_hw(irq), &events);
- last_outstanding_events = be64_to_cpu(events);
+ WRITE_ONCE(last_outstanding_events, be64_to_cpu(events));
if (opal_have_pending_events())
opal_wake_poller();
@@ -198,7 +191,8 @@ int __init opal_event_init(void)
* fall back to the legacy method (opal_event_request(...))
* anyway. */
dn = of_find_compatible_node(NULL, NULL, "ibm,opal-event");
- opal_event_irqchip.domain = irq_domain_add_linear(dn, MAX_NUM_EVENTS,
+ opal_event_irqchip.domain = irq_domain_create_linear(of_fwnode_handle(dn),
+ MAX_NUM_EVENTS,
&opal_event_domain_ops, &opal_event_irqchip);
of_node_put(dn);
if (!opal_event_irqchip.domain) {
@@ -282,11 +276,14 @@ int __init opal_event_init(void)
else
name = kasprintf(GFP_KERNEL, "opal");
+ if (!name)
+ continue;
/* Install interrupt handler */
rc = request_irq(r->start, opal_interrupt, r->flags & IRQD_TRIGGER_MASK,
name, NULL);
if (rc) {
pr_warn("Error %d requesting OPAL irq %d\n", rc, (int)r->start);
+ kfree(name);
continue;
}
}
diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c
index 55691950d981..bb4218fa796e 100644
--- a/arch/powerpc/platforms/powernv/opal-kmsg.c
+++ b/arch/powerpc/platforms/powernv/opal-kmsg.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* kmsg dumper that ensures the OPAL console fully flushes panic messages
*
* Author: Russell Currey <ruscur@russell.cc>
*
* Copyright 2015 IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
- * option) any later version.
*/
#include <linux/kmsg_dump.h>
@@ -24,13 +20,13 @@
* message, it just ensures that OPAL completely flushes the console buffer.
*/
static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper,
- enum kmsg_dump_reason reason)
+ struct kmsg_dump_detail *detail)
{
/*
* Outside of a panic context the pollers will continue to run,
* so we don't need to do any special flushing.
*/
- if (reason != KMSG_DUMP_PANIC)
+ if (detail->reason != KMSG_DUMP_PANIC)
return;
opal_flush_console(0);
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index 2623996a193a..8a7f39e106bd 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV LPC bus handling.
*
* Copyright 2013 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -14,13 +10,13 @@
#include <linux/bug.h>
#include <linux/io.h>
#include <linux/slab.h>
+#include <linux/debugfs.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/opal.h>
#include <asm/prom.h>
#include <linux/uaccess.h>
-#include <asm/debugfs.h>
#include <asm/isa-bridge.h>
static int opal_lpc_chip_id = -1;
@@ -201,7 +197,7 @@ static ssize_t lpc_debug_read(struct file *filp, char __user *ubuf,
/*
* Select access size based on count and alignment and
- * access type. IO and MEM only support byte acceses,
+ * access type. IO and MEM only support byte accesses,
* FW supports all 3.
*/
len = 1;
@@ -375,7 +371,7 @@ static int opal_lpc_init_debugfs(void)
if (opal_lpc_chip_id < 0)
return -ENODEV;
- root = debugfs_create_dir("lpc", powerpc_debugfs_root);
+ root = debugfs_create_dir("lpc", arch_debugfs_dir);
rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO);
rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM);
@@ -397,16 +393,17 @@ void __init opal_lpc_init(void)
for_each_compatible_node(np, NULL, "ibm,power8-lpc") {
if (!of_device_is_available(np))
continue;
- if (!of_get_property(np, "primary", NULL))
+ if (!of_property_present(np, "primary"))
continue;
opal_lpc_chip_id = of_get_ibm_chip_id(np);
+ of_node_put(np);
break;
}
if (opal_lpc_chip_id < 0)
return;
/* Does it support direct mapping ? */
- if (of_get_property(np, "ranges", NULL)) {
+ if (of_property_present(np, "ranges")) {
pr_info("OPAL: Found memory mapped LPC bus on chip %d\n",
opal_lpc_chip_id);
isa_bridge_init_non_pci(np);
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
index dcb42bcb5efa..a1754a28265d 100644
--- a/arch/powerpc/platforms/powernv/opal-memory-errors.c
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -1,20 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * OPAL asynchronous Memory error handling support in PowerNV.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright 2013 IBM Corporation
* Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
*/
@@ -95,7 +82,7 @@ static DECLARE_WORK(mem_error_work, mem_error_handler);
/*
* opal_memory_err_event - notifier handler that queues up the opal message
- * to be preocessed later.
+ * to be processed later.
*/
static int opal_memory_err_event(struct notifier_block *nb,
unsigned long msg_type, void *msg)
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index acd3206dfae3..992a6b379a66 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV OPAL in-memory console interface
*
* Copyright 2014 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <asm/io.h>
@@ -16,6 +12,8 @@
#include <linux/types.h>
#include <asm/barrier.h>
+#include "powernv.h"
+
/* OPAL in-memory console. Defined in OPAL source at core/console.c */
struct memcons {
__be64 magic;
@@ -33,23 +31,23 @@ struct memcons {
static struct memcons *opal_memcons = NULL;
-ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count)
{
const char *conbuf;
ssize_t ret;
size_t first_read = 0;
uint32_t out_pos, avail;
- if (!opal_memcons)
+ if (!mc)
return -ENODEV;
- out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos));
+ out_pos = be32_to_cpu(READ_ONCE(mc->out_pos));
/* Now we've read out_pos, put a barrier in before reading the new
* data it points to in conbuf. */
smp_rmb();
- conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys));
+ conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
/* When the buffer has wrapped, read from the out_pos marker to the end
* of the buffer, and then read the remaining data as in the un-wrapped
@@ -57,7 +55,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
if (out_pos & MEMCONS_OUT_POS_WRAP) {
out_pos &= MEMCONS_OUT_POS_MASK;
- avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos;
+ avail = be32_to_cpu(mc->obuf_size) - out_pos;
ret = memory_read_from_buffer(to, count, &pos,
conbuf + out_pos, avail);
@@ -75,7 +73,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
}
/* Sanity check. The firmware should not do this to us. */
- if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) {
+ if (out_pos > be32_to_cpu(mc->obuf_size)) {
pr_err("OPAL: memory console corruption. Aborting read.\n");
return -EINVAL;
}
@@ -90,44 +88,65 @@ out:
return ret;
}
+ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
+{
+ return memcons_copy(opal_memcons, to, pos, count);
+}
+
static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr, char *to,
+ const struct bin_attribute *bin_attr, char *to,
loff_t pos, size_t count)
{
return opal_msglog_copy(to, pos, count);
}
-static struct bin_attribute opal_msglog_attr = {
- .attr = {.name = "msglog", .mode = 0444},
+static struct bin_attribute opal_msglog_attr __ro_after_init = {
+ .attr = {.name = "msglog", .mode = 0400},
.read = opal_msglog_read
};
-void __init opal_msglog_init(void)
+struct memcons *__init memcons_init(struct device_node *node, const char *mc_prop_name)
{
u64 mcaddr;
struct memcons *mc;
- if (of_property_read_u64(opal_node, "ibm,opal-memcons", &mcaddr)) {
- pr_warn("OPAL: Property ibm,opal-memcons not found, no message log\n");
- return;
+ if (of_property_read_u64(node, mc_prop_name, &mcaddr)) {
+ pr_warn("%s property not found, no message log\n",
+ mc_prop_name);
+ goto out_err;
}
mc = phys_to_virt(mcaddr);
if (!mc) {
- pr_warn("OPAL: memory console address is invalid\n");
- return;
+ pr_warn("memory console address is invalid\n");
+ goto out_err;
}
if (be64_to_cpu(mc->magic) != MEMCONS_MAGIC) {
- pr_warn("OPAL: memory console version is invalid\n");
- return;
+ pr_warn("memory console version is invalid\n");
+ goto out_err;
}
- /* Report maximum size */
- opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) +
- be32_to_cpu(mc->obuf_size);
+ return mc;
+
+out_err:
+ return NULL;
+}
+
+u32 __init memcons_get_size(struct memcons *mc)
+{
+ return be32_to_cpu(mc->ibuf_size) + be32_to_cpu(mc->obuf_size);
+}
+
+void __init opal_msglog_init(void)
+{
+ opal_memcons = memcons_init(opal_node, "ibm,opal-memcons");
+ if (!opal_memcons) {
+ pr_warn("OPAL: memcons failed to load from ibm,opal-memcons\n");
+ return;
+ }
- opal_memcons = mc;
+ opal_msglog_attr.size = memcons_get_size(opal_memcons);
}
void __init opal_msglog_sysfs_init(void)
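For the wrapped case handled in memcons_copy() above: once MEMCONS_OUT_POS_WRAP is set, the oldest data starts at out_pos and runs to the end of the buffer, and the newest data occupies the start of the buffer up to out_pos. A compressed sketch of that two-step read (mc is a struct memcons pointer as in the code above):

	/* Sketch: two-step read of a wrapped memcons output buffer. */
	static void wrapped_read_example(struct memcons *mc)
	{
		u32 out_pos = be32_to_cpu(READ_ONCE(mc->out_pos));

		if (out_pos & MEMCONS_OUT_POS_WRAP) {
			out_pos &= MEMCONS_OUT_POS_MASK;
			/* chunk 1: oldest bytes, conbuf[out_pos .. obuf_size - 1] */
			/* chunk 2: newest bytes, conbuf[0 .. out_pos - 1]         */
		}
	}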
diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c
index 5584247f5029..380bc2d7ebbf 100644
--- a/arch/powerpc/platforms/powernv/opal-nvram.c
+++ b/arch/powerpc/platforms/powernv/opal-nvram.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV nvram code.
*
* Copyright 2011 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define DEBUG
diff --git a/arch/powerpc/platforms/powernv/opal-power.c b/arch/powerpc/platforms/powernv/opal-power.c
index 89ab1da57657..db99ffcb7b82 100644
--- a/arch/powerpc/platforms/powernv/opal-power.c
+++ b/arch/powerpc/platforms/powernv/opal-power.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV OPAL power control for graceful shutdown handling
*
* Copyright 2015 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "opal-power: " fmt
@@ -57,7 +53,7 @@ static bool detect_epow(void)
}
/* Check for existing EPOW, DPO events */
-static bool poweroff_pending(void)
+static bool __init poweroff_pending(void)
{
int rc;
__be64 opal_dpo_timeout;
diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c b/arch/powerpc/platforms/powernv/opal-powercap.c
index d90ee4fc2c6a..ea917266aa17 100644
--- a/arch/powerpc/platforms/powernv/opal-powercap.c
+++ b/arch/powerpc/platforms/powernv/opal-powercap.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV OPAL Powercap interface
*
* Copyright 2017 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "opal-powercap: " fmt
@@ -17,7 +13,7 @@
#include <asm/opal.h>
-DEFINE_MUTEX(powercap_mutex);
+static DEFINE_MUTEX(powercap_mutex);
static struct kobject *powercap_kobj;
@@ -133,7 +129,7 @@ out_token:
return ret;
}
-static void powercap_add_attr(int handle, const char *name,
+static void __init powercap_add_attr(int handle, const char *name,
struct powercap_attr *attr)
{
attr->handle = handle;
@@ -157,7 +153,7 @@ void __init opal_powercap_init(void)
pcaps = kcalloc(of_get_child_count(powercap), sizeof(*pcaps),
GFP_KERNEL);
if (!pcaps)
- return;
+ goto out_put_powercap;
powercap_kobj = kobject_create_and_add("powercap", opal_kobj);
if (!powercap_kobj) {
@@ -200,6 +196,12 @@ void __init opal_powercap_init(void)
j = 0;
pcaps[i].pg.name = kasprintf(GFP_KERNEL, "%pOFn", node);
+ if (!pcaps[i].pg.name) {
+ kfree(pcaps[i].pattrs);
+ kfree(pcaps[i].pg.attrs);
+ goto out_pcaps_pattrs;
+ }
+
if (has_min) {
powercap_add_attr(min, "powercap-min",
&pcaps[i].pattrs[j]);
@@ -230,6 +232,7 @@ void __init opal_powercap_init(void)
}
i++;
}
+ of_node_put(powercap);
return;
@@ -240,6 +243,9 @@ out_pcaps_pattrs:
kfree(pcaps[i].pg.name);
}
kobject_put(powercap_kobj);
+ of_node_put(node);
out_pcaps:
kfree(pcaps);
+out_put_powercap:
+ of_node_put(powercap);
}
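The new out_put_powercap label exists so the device-tree node reference taken at the top of opal_powercap_init() is dropped on every early exit, not just the success path. A minimal sketch of that idiom with hypothetical helpers and a made-up compatible string:

	static void __init example_init(void)			/* hypothetical */
	{
		struct device_node *np;

		np = of_find_compatible_node(NULL, NULL, "vendor,example");
		if (!np)
			return;

		if (example_setup(np))				/* hypothetical helper */
			goto out_put;

		example_register(np);				/* hypothetical helper */
	out_put:
		of_node_put(np);				/* balance the ref on every path */
	}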
diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c
index 4070bb4e9da4..dc246ed4b7b4 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* OPAL Runtime Diagnostics interface driver
* Supported on POWERNV platform
*
* Copyright IBM Corporation 2015
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#define pr_fmt(fmt) "opal-prd: " fmt
@@ -32,13 +24,20 @@
#include <linux/uaccess.h>
-/**
+struct opal_prd_msg {
+ union {
+ struct opal_prd_msg_header header;
+ DECLARE_FLEX_ARRAY(u8, data);
+ };
+};
+
+/*
* The msg member must be at the end of the struct, as it's followed by the
* message data.
*/
struct opal_prd_msg_queue_item {
- struct list_head list;
- struct opal_prd_msg_header msg;
+ struct list_head list;
+ struct opal_prd_msg msg;
};
static struct device_node *prd_node;
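The union added above gives the same allocation two views: a typed header and a raw byte stream, which is why the notifier later memcpy()s into item->msg.data while readers still use item->msg.header.size. The layout trick in isolation, with a hypothetical message type:

	#include <linux/stddef.h>	/* DECLARE_FLEX_ARRAY() */

	struct demo_hdr {		/* hypothetical header */
		__be16 type;
		__be16 size;
	};

	struct demo_msg {
		union {
			struct demo_hdr header;			/* typed view */
			DECLARE_FLEX_ARRAY(u8, data);		/* raw view   */
		};
	};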
@@ -67,6 +66,8 @@ static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size)
const char *label;
addrp = of_get_address(node, 0, &range_size, NULL);
+ if (!addrp)
+ continue;
range_addr = of_read_number(addrp, 2);
range_end = range_addr + range_size;
@@ -113,7 +114,6 @@ static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma)
{
size_t addr, size;
pgprot_t page_prot;
- int rc;
pr_devel("opal_prd_mmap(0x%016lx, 0x%016lx, 0x%lx, 0x%lx)\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
@@ -129,10 +129,8 @@ static int opal_prd_mmap(struct file *file, struct vm_area_struct *vma)
page_prot = phys_mem_access_prot(file, vma->vm_pgoff,
size, vma->vm_page_prot);
- rc = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
+ return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, size,
page_prot);
-
- return rc;
}
static bool opal_msg_queue_empty(void)
@@ -167,7 +165,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf,
int rc;
/* we need at least a header's worth of data */
- if (count < sizeof(item->msg))
+ if (count < sizeof(item->msg.header))
return -EINVAL;
if (*ppos)
@@ -197,7 +195,7 @@ static ssize_t opal_prd_read(struct file *file, char __user *buf,
return -EINTR;
}
- size = be16_to_cpu(item->msg.size);
+ size = be16_to_cpu(item->msg.header.size);
if (size > count) {
err = -EINVAL;
goto err_requeue;
@@ -225,8 +223,8 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct opal_prd_msg_header hdr;
+ struct opal_prd_msg *msg;
ssize_t size;
- void *msg;
int rc;
size = sizeof(hdr);
@@ -258,12 +256,12 @@ static ssize_t opal_prd_write(struct file *file, const char __user *buf,
static int opal_prd_release(struct inode *inode, struct file *file)
{
- struct opal_prd_msg_header msg;
+ struct opal_prd_msg msg;
- msg.size = cpu_to_be16(sizeof(msg));
- msg.type = OPAL_PRD_MSG_TYPE_FINI;
+ msg.header.size = cpu_to_be16(sizeof(msg));
+ msg.header.type = OPAL_PRD_MSG_TYPE_FINI;
- opal_prd_msg((struct opal_prd_msg *)&msg);
+ opal_prd_msg(&msg);
atomic_xchg(&prd_usage, 0);
@@ -350,7 +348,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
int msg_size, item_size;
unsigned long flags;
- if (msg_type != OPAL_MSG_PRD)
+ if (msg_type != OPAL_MSG_PRD && msg_type != OPAL_MSG_PRD2)
return 0;
/* Calculate total size of the message and item we need to store. The
@@ -363,7 +361,7 @@ static int opal_prd_msg_notifier(struct notifier_block *nb,
if (!item)
return -ENOMEM;
- memcpy(&item->msg, msg->params, msg_size);
+ memcpy(&item->msg.data, msg->params, msg_size);
spin_lock_irqsave(&opal_prd_msg_queue_lock, flags);
list_add_tail(&item->list, &opal_prd_msg_queue);
@@ -380,6 +378,12 @@ static struct notifier_block opal_prd_event_nb = {
.priority = 0,
};
+static struct notifier_block opal_prd_event_nb2 = {
+ .notifier_call = opal_prd_msg_notifier,
+ .next = NULL,
+ .priority = 0,
+};
+
static int opal_prd_probe(struct platform_device *pdev)
{
int rc;
@@ -401,22 +405,31 @@ static int opal_prd_probe(struct platform_device *pdev)
return rc;
}
+ rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb2);
+ if (rc) {
+ pr_err("Couldn't register PRD2 event notifier\n");
+ opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
+ return rc;
+ }
+
rc = misc_register(&opal_prd_dev);
if (rc) {
pr_err("failed to register miscdev\n");
opal_message_notifier_unregister(OPAL_MSG_PRD,
&opal_prd_event_nb);
+ opal_message_notifier_unregister(OPAL_MSG_PRD2,
+ &opal_prd_event_nb2);
return rc;
}
return 0;
}
-static int opal_prd_remove(struct platform_device *pdev)
+static void opal_prd_remove(struct platform_device *pdev)
{
misc_deregister(&opal_prd_dev);
opal_message_notifier_unregister(OPAL_MSG_PRD, &opal_prd_event_nb);
- return 0;
+ opal_message_notifier_unregister(OPAL_MSG_PRD2, &opal_prd_event_nb2);
}
static const struct of_device_id opal_prd_match[] = {
@@ -430,7 +443,7 @@ static struct platform_driver opal_prd_driver = {
.of_match_table = opal_prd_match,
},
.probe = opal_prd_probe,
- .remove = opal_prd_remove,
+ .remove = opal_prd_remove,
};
module_platform_driver(opal_prd_driver);
diff --git a/arch/powerpc/platforms/powernv/opal-psr.c b/arch/powerpc/platforms/powernv/opal-psr.c
index 74986b35cf77..6441e17b6996 100644
--- a/arch/powerpc/platforms/powernv/opal-psr.c
+++ b/arch/powerpc/platforms/powernv/opal-psr.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV OPAL Power-Shift-Ratio interface
*
* Copyright 2017 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "opal-psr: " fmt
@@ -17,11 +13,11 @@
#include <asm/opal.h>
-DEFINE_MUTEX(psr_mutex);
+static DEFINE_MUTEX(psr_mutex);
static struct kobject *psr_kobj;
-struct psr_attr {
+static struct psr_attr {
u32 handle;
struct kobj_attribute attr;
} *psr_attrs;
@@ -139,7 +135,7 @@ void __init opal_psr_init(void)
psr_attrs = kcalloc(of_get_child_count(psr), sizeof(*psr_attrs),
GFP_KERNEL);
if (!psr_attrs)
- return;
+ goto out_put_psr;
psr_kobj = kobject_create_and_add("psr", opal_kobj);
if (!psr_kobj) {
@@ -166,10 +162,14 @@ void __init opal_psr_init(void)
}
i++;
}
+ of_node_put(psr);
return;
out_kobj:
+ of_node_put(node);
kobject_put(psr_kobj);
out:
kfree(psr_attrs);
+out_put_psr:
+ of_node_put(psr);
}
diff --git a/arch/powerpc/platforms/powernv/opal-rtc.c b/arch/powerpc/platforms/powernv/opal-rtc.c
index 42ec642a3eba..79011a263aa6 100644
--- a/arch/powerpc/platforms/powernv/opal-rtc.c
+++ b/arch/powerpc/platforms/powernv/opal-rtc.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV Real Time Clock.
*
* Copyright 2011 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
@@ -15,14 +11,15 @@
#include <linux/bcd.h>
#include <linux/rtc.h>
#include <linux/delay.h>
-#include <linux/platform_device.h>
+#include <linux/of.h>
#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <asm/opal.h>
#include <asm/firmware.h>
#include <asm/machdep.h>
-static void opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
+static void __init opal_to_tm(u32 y_m_d, u64 h_m_s_ms, struct rtc_time *tm)
{
tm->tm_year = ((bcd2bin(y_m_d >> 24) * 100) +
bcd2bin((y_m_d >> 16) & 0xff)) - 1900;
diff --git a/arch/powerpc/platforms/powernv/opal-secvar.c b/arch/powerpc/platforms/powernv/opal-secvar.c
new file mode 100644
index 000000000000..6ac410f4d3c7
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-secvar.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PowerNV code for secure variables
+ *
+ * Copyright (C) 2019 IBM Corporation
+ * Author: Claudio Carvalho
+ * Nayna Jain
+ *
+ * APIs to access secure variables managed by OPAL.
+ */
+
+#define pr_fmt(fmt) "secvar: "fmt
+
+#include <linux/types.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <asm/opal.h>
+#include <asm/secvar.h>
+#include <asm/secure_boot.h>
+
+static int opal_status_to_err(int rc)
+{
+ int err;
+
+ switch (rc) {
+ case OPAL_SUCCESS:
+ err = 0;
+ break;
+ case OPAL_UNSUPPORTED:
+ err = -ENXIO;
+ break;
+ case OPAL_PARAMETER:
+ err = -EINVAL;
+ break;
+ case OPAL_RESOURCE:
+ err = -ENOSPC;
+ break;
+ case OPAL_HARDWARE:
+ err = -EIO;
+ break;
+ case OPAL_NO_MEM:
+ err = -ENOMEM;
+ break;
+ case OPAL_EMPTY:
+ err = -ENOENT;
+ break;
+ case OPAL_PARTIAL:
+ err = -EFBIG;
+ break;
+ default:
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static int opal_get_variable(const char *key, u64 ksize, u8 *data, u64 *dsize)
+{
+ int rc;
+
+ if (!key || !dsize)
+ return -EINVAL;
+
+ *dsize = cpu_to_be64(*dsize);
+
+ rc = opal_secvar_get(key, ksize, data, dsize);
+
+ *dsize = be64_to_cpu(*dsize);
+
+ return opal_status_to_err(rc);
+}
+
+static int opal_get_next_variable(const char *key, u64 *keylen, u64 keybufsize)
+{
+ int rc;
+
+ if (!key || !keylen)
+ return -EINVAL;
+
+ *keylen = cpu_to_be64(*keylen);
+
+ rc = opal_secvar_get_next(key, keylen, keybufsize);
+
+ *keylen = be64_to_cpu(*keylen);
+
+ return opal_status_to_err(rc);
+}
+
+static int opal_set_variable(const char *key, u64 ksize, u8 *data, u64 dsize)
+{
+ int rc;
+
+ if (!key || !data)
+ return -EINVAL;
+
+ rc = opal_secvar_enqueue_update(key, ksize, data, dsize);
+
+ return opal_status_to_err(rc);
+}
+
+static ssize_t opal_secvar_format(char *buf, size_t bufsize)
+{
+ ssize_t rc = 0;
+ struct device_node *node;
+ const char *format;
+
+ node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend");
+ if (!of_device_is_available(node)) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ rc = of_property_read_string(node, "format", &format);
+ if (rc)
+ goto out;
+
+ rc = snprintf(buf, bufsize, "%s", format);
+
+out:
+ of_node_put(node);
+
+ return rc;
+}
+
+static int opal_secvar_max_size(u64 *max_size)
+{
+ int rc;
+ struct device_node *node;
+
+ node = of_find_compatible_node(NULL, NULL, "ibm,secvar-backend");
+ if (!node)
+ return -ENODEV;
+
+ if (!of_device_is_available(node)) {
+ rc = -ENODEV;
+ goto out;
+ }
+
+ rc = of_property_read_u64(node, "max-var-size", max_size);
+
+out:
+ of_node_put(node);
+ return rc;
+}
+
+static const struct secvar_operations opal_secvar_ops = {
+ .get = opal_get_variable,
+ .get_next = opal_get_next_variable,
+ .set = opal_set_variable,
+ .format = opal_secvar_format,
+ .max_size = opal_secvar_max_size,
+};
+
+static int opal_secvar_probe(struct platform_device *pdev)
+{
+ if (!opal_check_token(OPAL_SECVAR_GET)
+ || !opal_check_token(OPAL_SECVAR_GET_NEXT)
+ || !opal_check_token(OPAL_SECVAR_ENQUEUE_UPDATE)) {
+ pr_err("OPAL doesn't support secure variables\n");
+ return -ENODEV;
+ }
+
+ return set_secvar_ops(&opal_secvar_ops);
+}
+
+static const struct of_device_id opal_secvar_match[] = {
+ { .compatible = "ibm,secvar-backend",},
+ {},
+};
+
+static struct platform_driver opal_secvar_driver = {
+ .driver = {
+ .name = "secvar",
+ .of_match_table = opal_secvar_match,
+ },
+};
+
+static int __init opal_secvar_init(void)
+{
+ return platform_driver_probe(&opal_secvar_driver, opal_secvar_probe);
+}
+device_initcall(opal_secvar_init);
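Once set_secvar_ops() has installed the table above, generic secvar code can read and queue variables without knowing anything about OPAL. A purely illustrative sketch of a consumer -- the variable name and buffer size are made up, and real callers go through the secvar core rather than the static ops table:

	/* Sketch only: reading a variable through the registered operations. */
	static void secvar_read_example(void)
	{
		u8 buf[1024];			/* made-up buffer size */
		u64 size = sizeof(buf);

		if (!opal_secvar_ops.get("PK", strlen("PK") + 1, buf, &size))
			pr_info("PK is %llu bytes\n", size);
	}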
diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
index 179609220e6f..9944376b115c 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor-groups.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV OPAL Sensor-groups interface
*
* Copyright 2017 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "opal-sensor-groups: " fmt
@@ -17,7 +13,7 @@
#include <asm/opal.h>
-DEFINE_MUTEX(sg_mutex);
+static DEFINE_MUTEX(sg_mutex);
static struct kobject *sg_kobj;
@@ -130,7 +126,7 @@ static void add_attr(int handle, struct sg_attr *attr, int index)
attr->attr.store = ops_info[index].store;
}
-static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg,
+static int __init add_attr_group(const __be32 *ops, int len, struct sensor_group *sg,
u32 handle)
{
int i, j;
@@ -148,7 +144,7 @@ static int add_attr_group(const __be32 *ops, int len, struct sensor_group *sg,
return sysfs_create_group(sg_kobj, &sg->sg);
}
-static int get_nr_attrs(const __be32 *ops, int len)
+static int __init get_nr_attrs(const __be32 *ops, int len)
{
int i, j;
int nr_attrs = 0;
@@ -174,7 +170,7 @@ void __init opal_sensor_groups_init(void)
sgs = kcalloc(of_get_child_count(sg), sizeof(*sgs), GFP_KERNEL);
if (!sgs)
- return;
+ goto out_sg_put;
sg_kobj = kobject_create_and_add("sensor_groups", opal_kobj);
if (!sg_kobj) {
@@ -226,6 +222,7 @@ void __init opal_sensor_groups_init(void)
}
i++;
}
+ of_node_put(sg);
return;
@@ -235,6 +232,9 @@ out_sgs_sgattrs:
kfree(sgs[i].sg.attrs);
}
kobject_put(sg_kobj);
+ of_node_put(node);
out_sgs:
kfree(sgs);
+out_sg_put:
+ of_node_put(sg);
}
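
The sensor-groups hunks above are reference-count fixes: a node returned by of_find_compatible_node() (or taken via the various for_each_* iterators) holds a reference that must be dropped with of_node_put() on every exit path, including the early error paths. A minimal sketch of the pattern being enforced; the compatible string and function name are placeholders, not taken from this file:

	static int example_count_children(void)
	{
		struct device_node *np;
		int n;

		/* of_find_compatible_node() returns the node with its refcount raised */
		np = of_find_compatible_node(NULL, NULL, "vendor,example-node");
		if (!np)
			return -ENODEV;

		n = of_get_child_count(np);

		of_node_put(np);	/* balance the reference on every path */
		return n;
	}
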
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index 35a5f4b9aeb5..8880a1c14573 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -1,25 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV sensor code
*
* Copyright (C) 2013 IBM
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/delay.h>
+#include <linux/of.h>
#include <linux/of_platform.h>
+#include <linux/platform_device.h>
#include <asm/opal.h>
#include <asm/machdep.h>
diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c
index 916a4b7b1bb5..a12312afe4ef 100644
--- a/arch/powerpc/platforms/powernv/opal-sysparam.c
+++ b/arch/powerpc/platforms/powernv/opal-sysparam.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV system parameter code
*
* Copyright (C) 2013 IBM
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/kobject.h>
diff --git a/arch/powerpc/platforms/powernv/opal-tracepoints.c b/arch/powerpc/platforms/powernv/opal-tracepoints.c
index f16a43540e30..91b36541b9e5 100644
--- a/arch/powerpc/platforms/powernv/opal-tracepoints.c
+++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c
@@ -2,7 +2,6 @@
#include <linux/percpu.h>
#include <linux/jump_label.h>
#include <asm/trace.h>
-#include <asm/asm-prototypes.h>
#ifdef CONFIG_JUMP_LABEL
struct static_key opal_tracepoint_key = STATIC_KEY_INIT;
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index f4875fe3f8ff..0ed95f753416 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* PowerNV OPAL API wrappers
*
* Copyright 2011 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/jump_label.h>
@@ -17,317 +13,51 @@
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>
- .section ".text"
-
-#ifdef CONFIG_TRACEPOINTS
-#ifdef CONFIG_JUMP_LABEL
-#define OPAL_BRANCH(LABEL) \
- ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
-#else
-
- .section ".toc","aw"
-
- .globl opal_tracepoint_refcount
-opal_tracepoint_refcount:
- .8byte 0
-
- .section ".text"
+ .section ".text"
/*
- * We branch around this in early init by using an unconditional cpu
- * feature.
+ * r3-r10 - OPAL call arguments
+ * STK_PARAM(R11) - OPAL opcode
+ * STK_PARAM(R12) - MSR to restore
*/
-#define OPAL_BRANCH(LABEL) \
-BEGIN_FTR_SECTION; \
- b 1f; \
-END_FTR_SECTION(0, 1); \
- ld r11,opal_tracepoint_refcount@toc(r2); \
- cmpdi r11,0; \
- bne- LABEL; \
-1:
-
-#endif
-
-#else
-#define OPAL_BRANCH(LABEL)
-#endif
-
-/*
- * DO_OPAL_CALL assumes:
- * r0 = opal call token
- * r12 = msr
- * LR has been saved
- */
-#define DO_OPAL_CALL() \
- mfcr r11; \
- stw r11,8(r1); \
- li r11,0; \
- ori r11,r11,MSR_EE; \
- std r12,PACASAVEDMSR(r13); \
- andc r12,r12,r11; \
- mtmsrd r12,1; \
- LOAD_REG_ADDR(r11,opal_return); \
- mtlr r11; \
- li r11,MSR_DR|MSR_IR|MSR_LE;\
- andc r12,r12,r11; \
- mtspr SPRN_HSRR1,r12; \
- LOAD_REG_ADDR(r11,opal); \
- ld r12,8(r11); \
- ld r2,0(r11); \
- mtspr SPRN_HSRR0,r12; \
+_GLOBAL_TOC(__opal_call)
+ mflr r0
+ std r0,PPC_LR_STKOFF(r1)
+ ld r12,STK_PARAM(R12)(r1)
+ li r0,MSR_IR|MSR_DR|MSR_LE
+ andc r12,r12,r0
+ LOAD_REG_ADDR(r11, opal_return)
+ mtlr r11
+ LOAD_REG_ADDR(r11, opal)
+ ld r2,0(r11)
+ ld r11,8(r11)
+ mtspr SPRN_HSRR0,r11
+ mtspr SPRN_HSRR1,r12
+ /* set token to r0 */
+ ld r0,STK_PARAM(R11)(r1)
hrfid
-
-#define OPAL_CALL(name, token) \
- _GLOBAL_TOC(name); \
- mfmsr r12; \
- mflr r0; \
- andi. r11,r12,MSR_IR|MSR_DR; \
- std r0,PPC_LR_STKOFF(r1); \
- li r0,token; \
- beq opal_real_call; \
- OPAL_BRANCH(opal_tracepoint_entry) \
- DO_OPAL_CALL()
-
-
opal_return:
/*
- * Fixup endian on OPAL return... we should be able to simplify
- * this by instead converting the below trampoline to a set of
- * bytes (always BE) since MSR:LE will end up fixed up as a side
- * effect of the rfid.
+ * Restore MSR on OPAL return. The MSR is set to big-endian.
*/
- FIXUP_ENDIAN_HV
- ld r2,PACATOC(r13);
- lwz r4,8(r1);
- ld r5,PPC_LR_STKOFF(r1);
- ld r6,PACASAVEDMSR(r13);
- mtcr r4;
- mtspr SPRN_HSRR0,r5;
- mtspr SPRN_HSRR1,r6;
- hrfid
-
-opal_real_call:
- mfcr r11
- stw r11,8(r1)
- /* Set opal return address */
- LOAD_REG_ADDR(r11, opal_return_realmode)
- mtlr r11
- li r11,MSR_LE
- andc r12,r12,r11
- mtspr SPRN_HSRR1,r12
- LOAD_REG_ADDR(r11,opal)
- ld r12,8(r11)
- ld r2,0(r11)
- mtspr SPRN_HSRR0,r12
- hrfid
-
-opal_return_realmode:
- FIXUP_ENDIAN_HV
- ld r2,PACATOC(r13);
- lwz r11,8(r1);
- ld r12,PPC_LR_STKOFF(r1)
- mtcr r11;
- mtlr r12
- blr
-
-#ifdef CONFIG_TRACEPOINTS
-opal_tracepoint_entry:
- stdu r1,-STACKFRAMESIZE(r1)
- std r0,STK_REG(R23)(r1)
- std r3,STK_REG(R24)(r1)
- std r4,STK_REG(R25)(r1)
- std r5,STK_REG(R26)(r1)
- std r6,STK_REG(R27)(r1)
- std r7,STK_REG(R28)(r1)
- std r8,STK_REG(R29)(r1)
- std r9,STK_REG(R30)(r1)
- std r10,STK_REG(R31)(r1)
- mr r3,r0
- addi r4,r1,STK_REG(R24)
- bl __trace_opal_entry
- ld r0,STK_REG(R23)(r1)
- ld r3,STK_REG(R24)(r1)
- ld r4,STK_REG(R25)(r1)
- ld r5,STK_REG(R26)(r1)
- ld r6,STK_REG(R27)(r1)
- ld r7,STK_REG(R28)(r1)
- ld r8,STK_REG(R29)(r1)
- ld r9,STK_REG(R30)(r1)
- ld r10,STK_REG(R31)(r1)
-
- /* setup LR so we return via tracepoint_return */
- LOAD_REG_ADDR(r11,opal_tracepoint_return)
- std r11,16(r1)
-
- mfmsr r12
- DO_OPAL_CALL()
-
-opal_tracepoint_return:
- std r3,STK_REG(R31)(r1)
- mr r4,r3
- ld r3,STK_REG(R23)(r1)
- bl __trace_opal_exit
- ld r3,STK_REG(R31)(r1)
- addi r1,r1,STACKFRAMESIZE
- ld r0,16(r1)
+#ifdef __BIG_ENDIAN__
+ ld r11,STK_PARAM(R12)(r1)
+ mtmsrd r11
+#else
+ /* Endian can only be switched with rfi, must byte reverse MSR load */
+ .short 0x4039 /* li r10,STK_PARAM(R12) */
+ .byte (STK_PARAM(R12) >> 8) & 0xff
+ .byte STK_PARAM(R12) & 0xff
+
+ .long 0x280c6a7d /* ldbrx r11,r10,r1 */
+ .long 0x05009f42 /* bcl 20,31,$+4 */
+ .long 0xa602487d /* mflr r10 */
+ .long 0x14004a39 /* addi r10,r10,20 */
+ .long 0xa64b5a7d /* mthsrr0 r10 */
+ .long 0xa64b7b7d /* mthsrr1 r11 */
+ .long 0x2402004c /* hrfid */
+#endif
+ LOAD_PACA_TOC()
+ ld r0,PPC_LR_STKOFF(r1)
mtlr r0
blr
-#endif
-
-
-OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL);
-OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
-OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
-OPAL_CALL(opal_console_write_buffer_space, OPAL_CONSOLE_WRITE_BUFFER_SPACE);
-OPAL_CALL(opal_rtc_read, OPAL_RTC_READ);
-OPAL_CALL(opal_rtc_write, OPAL_RTC_WRITE);
-OPAL_CALL(opal_cec_power_down, OPAL_CEC_POWER_DOWN);
-OPAL_CALL(opal_cec_reboot, OPAL_CEC_REBOOT);
-OPAL_CALL(opal_cec_reboot2, OPAL_CEC_REBOOT2);
-OPAL_CALL(opal_read_nvram, OPAL_READ_NVRAM);
-OPAL_CALL(opal_write_nvram, OPAL_WRITE_NVRAM);
-OPAL_CALL(opal_handle_interrupt, OPAL_HANDLE_INTERRUPT);
-OPAL_CALL(opal_poll_events, OPAL_POLL_EVENTS);
-OPAL_CALL(opal_pci_set_hub_tce_memory, OPAL_PCI_SET_HUB_TCE_MEMORY);
-OPAL_CALL(opal_pci_set_phb_tce_memory, OPAL_PCI_SET_PHB_TCE_MEMORY);
-OPAL_CALL(opal_pci_config_read_byte, OPAL_PCI_CONFIG_READ_BYTE);
-OPAL_CALL(opal_pci_config_read_half_word, OPAL_PCI_CONFIG_READ_HALF_WORD);
-OPAL_CALL(opal_pci_config_read_word, OPAL_PCI_CONFIG_READ_WORD);
-OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE);
-OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD);
-OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD);
-OPAL_CALL(opal_set_xive, OPAL_SET_XIVE);
-OPAL_CALL(opal_get_xive, OPAL_GET_XIVE);
-OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER);
-OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS);
-OPAL_CALL(opal_pci_eeh_freeze_clear, OPAL_PCI_EEH_FREEZE_CLEAR);
-OPAL_CALL(opal_pci_eeh_freeze_set, OPAL_PCI_EEH_FREEZE_SET);
-OPAL_CALL(opal_pci_err_inject, OPAL_PCI_ERR_INJECT);
-OPAL_CALL(opal_pci_shpc, OPAL_PCI_SHPC);
-OPAL_CALL(opal_pci_phb_mmio_enable, OPAL_PCI_PHB_MMIO_ENABLE);
-OPAL_CALL(opal_pci_set_phb_mem_window, OPAL_PCI_SET_PHB_MEM_WINDOW);
-OPAL_CALL(opal_pci_map_pe_mmio_window, OPAL_PCI_MAP_PE_MMIO_WINDOW);
-OPAL_CALL(opal_pci_set_phb_table_memory, OPAL_PCI_SET_PHB_TABLE_MEMORY);
-OPAL_CALL(opal_pci_set_pe, OPAL_PCI_SET_PE);
-OPAL_CALL(opal_pci_set_peltv, OPAL_PCI_SET_PELTV);
-OPAL_CALL(opal_pci_set_mve, OPAL_PCI_SET_MVE);
-OPAL_CALL(opal_pci_set_mve_enable, OPAL_PCI_SET_MVE_ENABLE);
-OPAL_CALL(opal_pci_get_xive_reissue, OPAL_PCI_GET_XIVE_REISSUE);
-OPAL_CALL(opal_pci_set_xive_reissue, OPAL_PCI_SET_XIVE_REISSUE);
-OPAL_CALL(opal_pci_set_xive_pe, OPAL_PCI_SET_XIVE_PE);
-OPAL_CALL(opal_get_xive_source, OPAL_GET_XIVE_SOURCE);
-OPAL_CALL(opal_get_msi_32, OPAL_GET_MSI_32);
-OPAL_CALL(opal_get_msi_64, OPAL_GET_MSI_64);
-OPAL_CALL(opal_start_cpu, OPAL_START_CPU);
-OPAL_CALL(opal_query_cpu_status, OPAL_QUERY_CPU_STATUS);
-OPAL_CALL(opal_write_oppanel, OPAL_WRITE_OPPANEL);
-OPAL_CALL(opal_pci_map_pe_dma_window, OPAL_PCI_MAP_PE_DMA_WINDOW);
-OPAL_CALL(opal_pci_map_pe_dma_window_real, OPAL_PCI_MAP_PE_DMA_WINDOW_REAL);
-OPAL_CALL(opal_pci_reset, OPAL_PCI_RESET);
-OPAL_CALL(opal_pci_get_hub_diag_data, OPAL_PCI_GET_HUB_DIAG_DATA);
-OPAL_CALL(opal_pci_get_phb_diag_data, OPAL_PCI_GET_PHB_DIAG_DATA);
-OPAL_CALL(opal_pci_fence_phb, OPAL_PCI_FENCE_PHB);
-OPAL_CALL(opal_pci_reinit, OPAL_PCI_REINIT);
-OPAL_CALL(opal_pci_mask_pe_error, OPAL_PCI_MASK_PE_ERROR);
-OPAL_CALL(opal_set_slot_led_status, OPAL_SET_SLOT_LED_STATUS);
-OPAL_CALL(opal_get_epow_status, OPAL_GET_EPOW_STATUS);
-OPAL_CALL(opal_get_dpo_status, OPAL_GET_DPO_STATUS);
-OPAL_CALL(opal_set_system_attention_led, OPAL_SET_SYSTEM_ATTENTION_LED);
-OPAL_CALL(opal_pci_next_error, OPAL_PCI_NEXT_ERROR);
-OPAL_CALL(opal_pci_poll, OPAL_PCI_POLL);
-OPAL_CALL(opal_pci_msi_eoi, OPAL_PCI_MSI_EOI);
-OPAL_CALL(opal_pci_get_phb_diag_data2, OPAL_PCI_GET_PHB_DIAG_DATA2);
-OPAL_CALL(opal_xscom_read, OPAL_XSCOM_READ);
-OPAL_CALL(opal_xscom_write, OPAL_XSCOM_WRITE);
-OPAL_CALL(opal_lpc_read, OPAL_LPC_READ);
-OPAL_CALL(opal_lpc_write, OPAL_LPC_WRITE);
-OPAL_CALL(opal_return_cpu, OPAL_RETURN_CPU);
-OPAL_CALL(opal_reinit_cpus, OPAL_REINIT_CPUS);
-OPAL_CALL(opal_read_elog, OPAL_ELOG_READ);
-OPAL_CALL(opal_send_ack_elog, OPAL_ELOG_ACK);
-OPAL_CALL(opal_get_elog_size, OPAL_ELOG_SIZE);
-OPAL_CALL(opal_resend_pending_logs, OPAL_ELOG_RESEND);
-OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE);
-OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
-OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE);
-OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE);
-OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE);
-OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN);
-OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT);
-OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO);
-OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2);
-OPAL_CALL(opal_dump_read, OPAL_DUMP_READ);
-OPAL_CALL(opal_dump_ack, OPAL_DUMP_ACK);
-OPAL_CALL(opal_get_msg, OPAL_GET_MSG);
-OPAL_CALL(opal_write_oppanel_async, OPAL_WRITE_OPPANEL_ASYNC);
-OPAL_CALL(opal_check_completion, OPAL_CHECK_ASYNC_COMPLETION);
-OPAL_CALL(opal_dump_resend_notification, OPAL_DUMP_RESEND);
-OPAL_CALL(opal_sync_host_reboot, OPAL_SYNC_HOST_REBOOT);
-OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ);
-OPAL_CALL(opal_get_param, OPAL_GET_PARAM);
-OPAL_CALL(opal_set_param, OPAL_SET_PARAM);
-OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI);
-OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE);
-OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG);
-OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION);
-OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION);
-OPAL_CALL(opal_pci_set_phb_cxl_mode, OPAL_PCI_SET_PHB_CAPI_MODE);
-OPAL_CALL(opal_tpo_write, OPAL_WRITE_TPO);
-OPAL_CALL(opal_tpo_read, OPAL_READ_TPO);
-OPAL_CALL(opal_ipmi_send, OPAL_IPMI_SEND);
-OPAL_CALL(opal_ipmi_recv, OPAL_IPMI_RECV);
-OPAL_CALL(opal_i2c_request, OPAL_I2C_REQUEST);
-OPAL_CALL(opal_flash_read, OPAL_FLASH_READ);
-OPAL_CALL(opal_flash_write, OPAL_FLASH_WRITE);
-OPAL_CALL(opal_flash_erase, OPAL_FLASH_ERASE);
-OPAL_CALL(opal_prd_msg, OPAL_PRD_MSG);
-OPAL_CALL(opal_leds_get_ind, OPAL_LEDS_GET_INDICATOR);
-OPAL_CALL(opal_leds_set_ind, OPAL_LEDS_SET_INDICATOR);
-OPAL_CALL(opal_console_flush, OPAL_CONSOLE_FLUSH);
-OPAL_CALL(opal_get_device_tree, OPAL_GET_DEVICE_TREE);
-OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE);
-OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE);
-OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE);
-OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR);
-OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR);
-OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
-OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
-OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
-OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
-OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
-OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
-OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
-OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
-OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
-OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
-OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
-OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
-OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
-OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
-OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
-OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
-OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
-OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
-OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
-OPAL_CALL(opal_signal_system_reset, OPAL_SIGNAL_SYSTEM_RESET);
-OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
-OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
-OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
-OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT);
-OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START);
-OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP);
-OPAL_CALL(opal_pci_set_p2p, OPAL_PCI_SET_P2P);
-OPAL_CALL(opal_get_powercap, OPAL_GET_POWERCAP);
-OPAL_CALL(opal_set_powercap, OPAL_SET_POWERCAP);
-OPAL_CALL(opal_get_power_shift_ratio, OPAL_GET_POWER_SHIFT_RATIO);
-OPAL_CALL(opal_set_power_shift_ratio, OPAL_SET_POWER_SHIFT_RATIO);
-OPAL_CALL(opal_sensor_group_clear, OPAL_SENSOR_GROUP_CLEAR);
-OPAL_CALL(opal_quiesce, OPAL_QUIESCE);
-OPAL_CALL(opal_npu_spa_setup, OPAL_NPU_SPA_SETUP);
-OPAL_CALL(opal_npu_spa_clear_cache, OPAL_NPU_SPA_CLEAR_CACHE);
-OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET);
-OPAL_CALL(opal_pci_get_pbcq_tunnel_bar, OPAL_PCI_GET_PBCQ_TUNNEL_BAR);
-OPAL_CALL(opal_pci_set_pbcq_tunnel_bar, OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
-OPAL_CALL(opal_sensor_read_u64, OPAL_SENSOR_READ_U64);
-OPAL_CALL(opal_sensor_group_enable, OPAL_SENSOR_GROUP_ENABLE);
-OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
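
With the per-token OPAL_CALL() assembler macros deleted, the opcode and saved MSR that __opal_call expects in STK_PARAM(R11) and STK_PARAM(R12) are simply its 9th and 10th C arguments, which the 64-bit ELF ABI places in the caller's parameter save area. The C-side wrappers now live in opal-call.c in this directory; the sketch below is one plausible shape for that dispatcher and is an assumption rather than the verbatim upstream code (tracepoint and real-mode handling are omitted):

	int64_t __opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
			    int64_t a4, int64_t a5, int64_t a6, int64_t a7,
			    int64_t opcode, uint64_t msr);

	static int64_t opal_call(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
				 int64_t a4, int64_t a5, int64_t a6, int64_t a7,
				 int64_t opcode)
	{
		uint64_t msr = mfmsr();		/* restored by opal_return above */

		return __opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode, msr);
	}

	#define OPAL_CALL(name, opcode)					\
	int64_t name(int64_t a0, int64_t a1, int64_t a2, int64_t a3,	\
		     int64_t a4, int64_t a5, int64_t a6, int64_t a7)	\
	{								\
		return opal_call(a0, a1, a2, a3, a4, a5, a6, a7, opcode); \
	}

	OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE);
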
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index 22d5e1110dbb..748c2b97fa53 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -1,12 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * PowerNV LPC bus handling.
+ * PowerNV SCOM bus debugfs interface
*
+ * Copyright 2010 Benjamin Herrenschmidt, IBM Corp
+ * <benh@kernel.crashing.org>
+ * and David Gibson, IBM Corporation.
* Copyright 2013 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -14,62 +13,13 @@
#include <linux/bug.h>
#include <linux/gfp.h>
#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/debugfs.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/opal.h>
-#include <asm/scom.h>
-
-/*
- * We could probably fit that inside the scom_map_t
- * which is a void* after all but it's really too ugly
- * so let's kmalloc it for now
- */
-struct opal_scom_map {
- uint32_t chip;
- uint64_t addr;
-};
-
-static scom_map_t opal_scom_map(struct device_node *dev, u64 reg, u64 count)
-{
- struct opal_scom_map *m;
- const __be32 *gcid;
-
- if (!of_get_property(dev, "scom-controller", NULL)) {
- pr_err("%s: device %pOF is not a SCOM controller\n",
- __func__, dev);
- return SCOM_MAP_INVALID;
- }
- gcid = of_get_property(dev, "ibm,chip-id", NULL);
- if (!gcid) {
- pr_err("%s: device %pOF has no ibm,chip-id\n",
- __func__, dev);
- return SCOM_MAP_INVALID;
- }
- m = kmalloc(sizeof(*m), GFP_KERNEL);
- if (!m)
- return NULL;
- m->chip = be32_to_cpup(gcid);
- m->addr = reg;
-
- return (scom_map_t)m;
-}
-
-static void opal_scom_unmap(scom_map_t map)
-{
- kfree(map);
-}
-
-static int opal_xscom_err_xlate(int64_t rc)
-{
- switch(rc) {
- case 0:
- return 0;
- /* Add more translations if necessary */
- default:
- return -EIO;
- }
-}
+#include <asm/prom.h>
static u64 opal_scom_unmangle(u64 addr)
{
@@ -102,39 +52,159 @@ static u64 opal_scom_unmangle(u64 addr)
return addr;
}
-static int opal_scom_read(scom_map_t map, u64 reg, u64 *value)
+static int opal_scom_read(uint32_t chip, uint64_t addr, u64 reg, u64 *value)
{
- struct opal_scom_map *m = map;
int64_t rc;
__be64 v;
- reg = opal_scom_unmangle(m->addr + reg);
- rc = opal_xscom_read(m->chip, reg, (__be64 *)__pa(&v));
+ reg = opal_scom_unmangle(addr + reg);
+ rc = opal_xscom_read(chip, reg, (__be64 *)__pa(&v));
+ if (rc) {
+ *value = 0xfffffffffffffffful;
+ return -EIO;
+ }
*value = be64_to_cpu(v);
- return opal_xscom_err_xlate(rc);
+ return 0;
}
-static int opal_scom_write(scom_map_t map, u64 reg, u64 value)
+static int opal_scom_write(uint32_t chip, uint64_t addr, u64 reg, u64 value)
{
- struct opal_scom_map *m = map;
int64_t rc;
- reg = opal_scom_unmangle(m->addr + reg);
- rc = opal_xscom_write(m->chip, reg, value);
- return opal_xscom_err_xlate(rc);
+ reg = opal_scom_unmangle(addr + reg);
+ rc = opal_xscom_write(chip, reg, value);
+ if (rc)
+ return -EIO;
+ return 0;
}
-static const struct scom_controller opal_scom_controller = {
- .map = opal_scom_map,
- .unmap = opal_scom_unmap,
- .read = opal_scom_read,
- .write = opal_scom_write
+struct scom_debug_entry {
+ u32 chip;
+ struct debugfs_blob_wrapper path;
+ char name[16];
};
-static int opal_xscom_init(void)
+static ssize_t scom_debug_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos)
{
- if (firmware_has_feature(FW_FEATURE_OPAL))
- scom_init(&opal_scom_controller);
+ struct scom_debug_entry *ent = filp->private_data;
+ u64 __user *ubuf64 = (u64 __user *)ubuf;
+ loff_t off = *ppos;
+ ssize_t done = 0;
+ u64 reg, reg_base, reg_cnt, val;
+ int rc;
+
+ if (off < 0 || (off & 7) || (count & 7))
+ return -EINVAL;
+ reg_base = off >> 3;
+ reg_cnt = count >> 3;
+
+ for (reg = 0; reg < reg_cnt; reg++) {
+ rc = opal_scom_read(ent->chip, reg_base, reg, &val);
+ if (!rc)
+ rc = put_user(val, ubuf64);
+ if (rc) {
+ if (!done)
+ done = rc;
+ break;
+ }
+ ubuf64++;
+ *ppos += 8;
+ done += 8;
+ }
+ return done;
+}
+
+static ssize_t scom_debug_write(struct file *filp, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct scom_debug_entry *ent = filp->private_data;
+ u64 __user *ubuf64 = (u64 __user *)ubuf;
+ loff_t off = *ppos;
+ ssize_t done = 0;
+ u64 reg, reg_base, reg_cnt, val;
+ int rc;
+
+ if (off < 0 || (off & 7) || (count & 7))
+ return -EINVAL;
+ reg_base = off >> 3;
+ reg_cnt = count >> 3;
+
+ for (reg = 0; reg < reg_cnt; reg++) {
+ rc = get_user(val, ubuf64);
+ if (!rc)
+ rc = opal_scom_write(ent->chip, reg_base, reg, val);
+ if (rc) {
+ if (!done)
+ done = rc;
+ break;
+ }
+ ubuf64++;
+ done += 8;
+ }
+ return done;
+}
+
+static const struct file_operations scom_debug_fops = {
+ .read = scom_debug_read,
+ .write = scom_debug_write,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static int scom_debug_init_one(struct dentry *root, struct device_node *dn,
+ int chip)
+{
+ struct scom_debug_entry *ent;
+ struct dentry *dir;
+
+ ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->chip = chip;
+ snprintf(ent->name, 16, "%08x", chip);
+ ent->path.data = (void *)kasprintf(GFP_KERNEL, "%pOF", dn);
+ if (!ent->path.data) {
+ kfree(ent);
+ return -ENOMEM;
+ }
+
+ ent->path.size = strlen((char *)ent->path.data);
+
+ dir = debugfs_create_dir(ent->name, root);
+ if (IS_ERR(dir)) {
+ kfree(ent->path.data);
+ kfree(ent);
+ return -1;
+ }
+
+ debugfs_create_blob("devspec", 0400, dir, &ent->path);
+ debugfs_create_file("access", 0600, dir, ent, &scom_debug_fops);
+
return 0;
}
-machine_arch_initcall(powernv, opal_xscom_init);
+
+static int scom_debug_init(void)
+{
+ struct device_node *dn;
+ struct dentry *root;
+ int chip, rc;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return 0;
+
+ root = debugfs_create_dir("scom", arch_debugfs_dir);
+ if (IS_ERR(root))
+ return -1;
+
+ rc = 0;
+ for_each_node_with_property(dn, "scom-controller") {
+ chip = of_get_ibm_chip_id(dn);
+ WARN_ON(chip == -1);
+ rc |= scom_debug_init_one(root, dn, chip);
+ }
+
+ return rc;
+}
+device_initcall(scom_debug_init);
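
The access file above exposes a chip's SCOM space as an array of 8-byte registers: the file offset is the SCOM address shifted left by 3, and transfers must be multiples of 8 bytes, matching the off/count checks in scom_debug_read()/scom_debug_write(). A hedged userspace sketch; the debugfs mount point, chip id and SCOM address are illustrative assumptions:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		uint64_t val, scom_addr = 0xf000f;	/* illustrative address */
		int fd = open("/sys/kernel/debug/powerpc/scom/00000000/access",
			      O_RDONLY);

		if (fd < 0)
			return 1;

		/* file offset = SCOM address in 8-byte units */
		if (pread(fd, &val, sizeof(val), scom_addr << 3) == sizeof(val))
			printf("scom 0x%llx = 0x%016llx\n",
			       (unsigned long long)scom_addr,
			       (unsigned long long)val);

		close(fd);
		return 0;
	}
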
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 79586f127521..09bd93464b4f 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV OPAL high level interfaces
*
* Copyright 2011 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "opal: " fmt
@@ -26,7 +22,6 @@
#include <linux/memblock.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
-#include <linux/printk.h>
#include <linux/kmsg_dump.h>
#include <linux/console.h>
#include <linux/sched/debug.h>
@@ -40,6 +35,16 @@
#include "powernv.h"
+#define OPAL_MSG_QUEUE_MAX 16
+
+struct opal_msg_node {
+ struct list_head list;
+ struct opal_msg msg;
+};
+
+static DEFINE_SPINLOCK(msg_list_lock);
+static LIST_HEAD(msg_list);
+
/* /sys/firmware/opal */
struct kobject *opal_kobj;
@@ -55,6 +60,8 @@ struct mcheck_recoverable_range {
u64 recover_addr;
};
+static int msg_list_size;
+
static struct mcheck_recoverable_range *mc_recoverable_range;
static int mc_recoverable_range_len;
@@ -63,8 +70,10 @@ static DEFINE_SPINLOCK(opal_write_lock);
static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
static uint32_t opal_heartbeat;
static struct task_struct *kopald_tsk;
+static struct opal_msg *opal_msg;
+static u32 opal_msg_size __ro_after_init;
-void opal_configure_cores(void)
+void __init opal_configure_cores(void)
{
u64 reinit_flags = 0;
@@ -171,8 +180,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node,
/*
* Allocate a buffer to hold the MC recoverable ranges.
*/
- mc_recoverable_range =__va(memblock_phys_alloc(size, __alignof__(u64)));
- memset(mc_recoverable_range, 0, size);
+ mc_recoverable_range = memblock_alloc_or_panic(size, __alignof__(u64));
for (i = 0; i < mc_recoverable_range_len; i++) {
mc_recoverable_range[i].start_addr =
@@ -205,16 +213,18 @@ static int __init opal_register_exception_handlers(void)
glue = 0x7000;
/*
- * Check if we are running on newer firmware that exports
- * OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to patch
- * the HMI interrupt and we catch it directly in Linux.
+ * Only ancient OPAL firmware requires this.
+ * Specifically, firmware from FW810.00 (released June 2014)
+	 * through FW810.20 (released October 2014).
*
- * For older firmware (i.e currently released POWER8 System Firmware
- * as of today <= SV810_087), we fallback to old behavior and let OPAL
- * patch the HMI vector and handle it inside OPAL firmware.
+ * Check if we are running on newer (post Oct 2014) firmware that
+ * exports the OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to
+ * patch the HMI interrupt and we catch it directly in Linux.
*
- * For newer firmware (in development/yet to be released) we will
- * start catching/handling HMI directly in Linux.
+	 * For older firmware (i.e. < FW810.20), we fall back to the old behavior and
+ * let OPAL patch the HMI vector and handle it inside OPAL firmware.
+ *
+ * For newer firmware we catch/handle the HMI directly in Linux.
*/
if (!opal_check_token(OPAL_HANDLE_HMI)) {
pr_info("Old firmware detected, OPAL handles HMIs.\n");
@@ -224,6 +234,11 @@ static int __init opal_register_exception_handlers(void)
glue += 128;
}
+ /*
+ * Only applicable to ancient firmware, all modern
+ * (post March 2015/skiboot 5.0) firmware will just return
+ * OPAL_UNSUPPORTED.
+ */
opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue);
#endif
@@ -231,6 +246,43 @@ static int __init opal_register_exception_handlers(void)
}
machine_early_initcall(powernv, opal_register_exception_handlers);
+static void queue_replay_msg(void *msg)
+{
+ struct opal_msg_node *msg_node;
+
+ if (msg_list_size < OPAL_MSG_QUEUE_MAX) {
+ msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+ if (msg_node) {
+ INIT_LIST_HEAD(&msg_node->list);
+ memcpy(&msg_node->msg, msg, sizeof(struct opal_msg));
+ list_add_tail(&msg_node->list, &msg_list);
+ msg_list_size++;
+ } else
+			pr_warn_once("message queue: no memory\n");
+
+ if (msg_list_size >= OPAL_MSG_QUEUE_MAX)
+ pr_warn_once("message queue full\n");
+ }
+}
+
+static void dequeue_replay_msg(enum opal_msg_type msg_type)
+{
+ struct opal_msg_node *msg_node, *tmp;
+
+ list_for_each_entry_safe(msg_node, tmp, &msg_list, list) {
+ if (be32_to_cpu(msg_node->msg.msg_type) != msg_type)
+ continue;
+
+ atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
+ msg_type,
+ &msg_node->msg);
+
+ list_del(&msg_node->list);
+ kfree(msg_node);
+ msg_list_size--;
+ }
+}
+
/*
* Opal message notifier based on message type. Allow subscribers to get
 * notified for a specific message type.
@@ -238,14 +290,30 @@ machine_early_initcall(powernv, opal_register_exception_handlers);
int opal_message_notifier_register(enum opal_msg_type msg_type,
struct notifier_block *nb)
{
+ int ret;
+ unsigned long flags;
+
if (!nb || msg_type >= OPAL_MSG_TYPE_MAX) {
pr_warn("%s: Invalid arguments, msg_type:%d\n",
__func__, msg_type);
return -EINVAL;
}
- return atomic_notifier_chain_register(
- &opal_msg_notifier_head[msg_type], nb);
+ spin_lock_irqsave(&msg_list_lock, flags);
+ ret = atomic_notifier_chain_register(
+ &opal_msg_notifier_head[msg_type], nb);
+
+ /*
+ * If the registration succeeded, replay any queued messages that came
+ * in prior to the notifier chain registration. msg_list_lock held here
+ * to ensure they're delivered prior to any subsequent messages.
+ */
+ if (ret == 0)
+ dequeue_replay_msg(msg_type);
+
+ spin_unlock_irqrestore(&msg_list_lock, flags);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(opal_message_notifier_register);
@@ -259,6 +327,23 @@ EXPORT_SYMBOL_GPL(opal_message_notifier_unregister);
static void opal_message_do_notify(uint32_t msg_type, void *msg)
{
+ unsigned long flags;
+ bool queued = false;
+
+ spin_lock_irqsave(&msg_list_lock, flags);
+ if (opal_msg_notifier_head[msg_type].head == NULL) {
+ /*
+ * Queue up the msg since no notifiers have registered
+ * yet for this msg_type.
+ */
+ queue_replay_msg(msg);
+ queued = true;
+ }
+ spin_unlock_irqrestore(&msg_list_lock, flags);
+
+ if (queued)
+ return;
+
/* notify subscribers */
atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
msg_type, msg);
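
Taken together, the two hunks above mean a subscriber no longer misses messages that OPAL delivered before it registered: opal_message_do_notify() queues up to OPAL_MSG_QUEUE_MAX of them, and opal_message_notifier_register() replays the matching ones under msg_list_lock. A minimal sketch of a subscriber; the handler name and the choice of OPAL_MSG_SHUTDOWN are illustrative:

	static int example_opal_msg_handler(struct notifier_block *nb,
					    unsigned long msg_type, void *msg)
	{
		pr_info("OPAL message received, type %lu\n", msg_type);
		return NOTIFY_OK;
	}

	static struct notifier_block example_opal_msg_nb = {
		.notifier_call = example_opal_msg_handler,
	};

	static int __init example_subscriber_init(void)
	{
		/* any queued OPAL_MSG_SHUTDOWN messages are replayed right here */
		return opal_message_notifier_register(OPAL_MSG_SHUTDOWN,
						      &example_opal_msg_nb);
	}
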
@@ -267,14 +352,9 @@ static void opal_message_do_notify(uint32_t msg_type, void *msg)
static void opal_handle_message(void)
{
s64 ret;
- /*
- * TODO: pre-allocate a message buffer depending on opal-msg-size
- * value in /proc/device-tree.
- */
- static struct opal_msg msg;
u32 type;
- ret = opal_get_msg(__pa(&msg), sizeof(msg));
+ ret = opal_get_msg(__pa(opal_msg), opal_msg_size);
/* No opal message pending. */
if (ret == OPAL_RESOURCE)
return;
@@ -286,14 +366,14 @@ static void opal_handle_message(void)
return;
}
- type = be32_to_cpu(msg.msg_type);
+ type = be32_to_cpu(opal_msg->msg_type);
/* Sanity check */
if (type >= OPAL_MSG_TYPE_MAX) {
pr_warn_once("%s: Unknown message type: %u\n", __func__, type);
return;
}
- opal_message_do_notify(type, (void *)&msg);
+ opal_message_do_notify(type, (void *)opal_msg);
}
static irqreturn_t opal_message_notify(int irq, void *data)
@@ -302,10 +382,24 @@ static irqreturn_t opal_message_notify(int irq, void *data)
return IRQ_HANDLED;
}
-static int __init opal_message_init(void)
+static int __init opal_message_init(struct device_node *opal_node)
{
int ret, i, irq;
+ ret = of_property_read_u32(opal_node, "opal-msg-size", &opal_msg_size);
+ if (ret) {
+ pr_notice("Failed to read opal-msg-size property\n");
+ opal_msg_size = sizeof(struct opal_msg);
+ }
+
+ opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+ if (!opal_msg) {
+ opal_msg_size = sizeof(struct opal_msg);
+ /* Try to allocate fixed message size */
+ opal_msg = kmalloc(opal_msg_size, GFP_KERNEL);
+ BUG_ON(opal_msg == NULL);
+ }
+
for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
@@ -327,7 +421,7 @@ static int __init opal_message_init(void)
return 0;
}
-int opal_get_chars(uint32_t vtermno, char *buf, int count)
+ssize_t opal_get_chars(uint32_t vtermno, u8 *buf, size_t count)
{
s64 rc;
__be64 evt, len;
@@ -344,10 +438,11 @@ int opal_get_chars(uint32_t vtermno, char *buf, int count)
return 0;
}
-static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, bool atomic)
+static ssize_t __opal_put_chars(uint32_t vtermno, const u8 *data,
+ size_t total_len, bool atomic)
{
unsigned long flags = 0 /* shut up gcc */;
- int written;
+ ssize_t written;
__be64 olen;
s64 rc;
@@ -387,7 +482,7 @@ static int __opal_put_chars(uint32_t vtermno, const char *data, int total_len, b
if (atomic) {
/* Should not happen */
pr_warn("atomic console write returned partial "
- "len=%d written=%d\n", total_len, written);
+ "len=%zu written=%zd\n", total_len, written);
}
if (!written)
written = -EAGAIN;
@@ -400,7 +495,7 @@ out:
return written;
}
-int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
+ssize_t opal_put_chars(uint32_t vtermno, const u8 *data, size_t total_len)
{
return __opal_put_chars(vtermno, data, total_len, false);
}
@@ -411,7 +506,8 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
* true at the moment because console space can race with OPAL's console
* writes.
*/
-int opal_put_chars_atomic(uint32_t vtermno, const char *data, int total_len)
+ssize_t opal_put_chars_atomic(uint32_t vtermno, const u8 *data,
+ size_t total_len)
{
return __opal_put_chars(vtermno, data, total_len, true);
}
@@ -491,7 +587,7 @@ static int opal_recover_mce(struct pt_regs *regs,
{
int recovered = 0;
- if (!(regs->msr & MSR_RI)) {
+ if (regs_is_unrecoverable(regs)) {
/* If MSR_RI isn't set, we cannot recover */
pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
recovered = 0;
@@ -504,7 +600,7 @@ static int opal_recover_mce(struct pt_regs *regs,
recovered = 0;
}
- if (!recovered && evt->severity == MCE_SEV_ERROR_SYNC) {
+ if (!recovered && evt->sync_error) {
/*
* Try to kill processes if we get a synchronous machine check
* (e.g., one caused by execution of this instruction). This
@@ -527,7 +623,7 @@ static int opal_recover_mce(struct pt_regs *regs,
*/
recovered = 0;
} else {
- die("Machine check", regs, SIGBUS);
+ die_mce("Machine check", regs, SIGBUS);
recovered = 1;
}
}
@@ -587,7 +683,7 @@ int opal_machine_check(struct pt_regs *regs)
evt.version);
return 0;
}
- machine_check_print_event_info(&evt, user_mode(regs));
+ machine_check_print_event_info(&evt, user_mode(regs), false);
if (opal_recover_mce(regs, &evt))
return 1;
@@ -613,7 +709,28 @@ int opal_hmi_exception_early(struct pt_regs *regs)
return 0;
}
-/* HMI exception handler called in virtual mode during check_irq_replay. */
+int opal_hmi_exception_early2(struct pt_regs *regs)
+{
+ s64 rc;
+ __be64 out_flags;
+
+ /*
+ * call opal hmi handler.
+ * Check 64-bit flag mask to find out if an event was generated,
+ * and whether TB is still valid or not etc.
+ */
+ rc = opal_handle_hmi2(&out_flags);
+ if (rc != OPAL_SUCCESS)
+ return 0;
+
+ if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_NEW_EVENT)
+ local_paca->hmi_event_available = 1;
+ if (be64_to_cpu(out_flags) & OPAL_HMI_FLAGS_TOD_TB_FAIL)
+ tb_invalid = true;
+ return 1;
+}
+
+/* HMI exception handler called in virtual mode when irqs are next enabled. */
int opal_handle_hmi_exception(struct pt_regs *regs)
{
/*
@@ -655,13 +772,13 @@ bool opal_mce_check_early_recovery(struct pt_regs *regs)
* Setup regs->nip to rfi into fixup address.
*/
if (recover_addr)
- regs->nip = recover_addr;
+ regs_set_return_ip(regs, recover_addr);
out:
return !!recover_addr;
}
-static int opal_sysfs_init(void)
+static int __init opal_sysfs_init(void)
{
opal_kobj = kobject_create_and_add("opal", firmware_kobj);
if (!opal_kobj) {
@@ -672,45 +789,77 @@ static int opal_sysfs_init(void)
return 0;
}
-static ssize_t symbol_map_read(struct file *fp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t count)
+static int opal_add_one_export(struct kobject *parent, const char *export_name,
+ struct device_node *np, const char *prop_name)
{
- return memory_read_from_buffer(buf, count, &off, bin_attr->private,
- bin_attr->size);
-}
+ struct bin_attribute *attr = NULL;
+ const char *name = NULL;
+ u64 vals[2];
+ int rc;
-static BIN_ATTR_RO(symbol_map, 0);
+ rc = of_property_read_u64_array(np, prop_name, &vals[0], 2);
+ if (rc)
+ goto out;
-static void opal_export_symmap(void)
-{
- const __be64 *syms;
- unsigned int size;
- struct device_node *fw;
- int rc;
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ name = kstrdup(export_name, GFP_KERNEL);
+ if (!name) {
+ rc = -ENOMEM;
+ goto out;
+ }
- fw = of_find_node_by_path("/ibm,opal/firmware");
- if (!fw)
- return;
- syms = of_get_property(fw, "symbol-map", &size);
- if (!syms || size != 2 * sizeof(__be64))
- return;
+ sysfs_bin_attr_init(attr);
+ attr->attr.name = name;
+ attr->attr.mode = 0400;
+ attr->read = sysfs_bin_attr_simple_read;
+ attr->private = __va(vals[0]);
+ attr->size = vals[1];
- /* Setup attributes */
- bin_attr_symbol_map.private = __va(be64_to_cpu(syms[0]));
- bin_attr_symbol_map.size = be64_to_cpu(syms[1]);
+ rc = sysfs_create_bin_file(parent, attr);
+out:
+ if (rc) {
+ kfree(name);
+ kfree(attr);
+ }
- rc = sysfs_create_bin_file(opal_kobj, &bin_attr_symbol_map);
- if (rc)
- pr_warn("Error %d creating OPAL symbols file\n", rc);
+ return rc;
}
-static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
- struct bin_attribute *bin_attr, char *buf,
- loff_t off, size_t count)
+static void opal_add_exported_attrs(struct device_node *np,
+ struct kobject *kobj)
{
- return memory_read_from_buffer(buf, count, &off, bin_attr->private,
- bin_attr->size);
+ struct device_node *child;
+ struct property *prop;
+
+ for_each_property_of_node(np, prop) {
+ int rc;
+
+ if (!strcmp(prop->name, "name") ||
+ !strcmp(prop->name, "phandle"))
+ continue;
+
+ rc = opal_add_one_export(kobj, prop->name, np, prop->name);
+ if (rc) {
+ pr_warn("Unable to add export %pOF/%s, rc = %d!\n",
+ np, prop->name, rc);
+ }
+ }
+
+ for_each_child_of_node(np, child) {
+ struct kobject *child_kobj;
+
+ child_kobj = kobject_create_and_add(child->name, kobj);
+ if (!child_kobj) {
+ pr_err("Unable to create export dir for %pOF\n", child);
+ continue;
+ }
+
+ opal_add_exported_attrs(child, child_kobj);
+ }
}
/*
@@ -722,11 +871,8 @@ static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
*/
static void opal_export_attrs(void)
{
- struct bin_attribute *attr;
struct device_node *np;
- struct property *prop;
struct kobject *kobj;
- u64 vals[2];
int rc;
np = of_find_node_by_path("/ibm,opal/firmware/exports");
@@ -737,44 +883,20 @@ static void opal_export_attrs(void)
kobj = kobject_create_and_add("exports", opal_kobj);
if (!kobj) {
pr_warn("kobject_create_and_add() of exports failed\n");
+ of_node_put(np);
return;
}
- for_each_property_of_node(np, prop) {
- if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle"))
- continue;
-
- if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
- continue;
-
- attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ opal_add_exported_attrs(np, kobj);
- if (attr == NULL) {
- pr_warn("Failed kmalloc for bin_attribute!");
- continue;
- }
-
- sysfs_bin_attr_init(attr);
- attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
- attr->attr.mode = 0400;
- attr->read = export_attr_read;
- attr->private = __va(vals[0]);
- attr->size = vals[1];
-
- if (attr->attr.name == NULL) {
- pr_warn("Failed kstrdup for bin_attribute attr.name");
- kfree(attr);
- continue;
- }
-
- rc = sysfs_create_bin_file(kobj, attr);
- if (rc) {
- pr_warn("Error %d creating OPAL sysfs exports/%s file\n",
- rc, prop->name);
- kfree(attr->attr.name);
- kfree(attr);
- }
- }
+ /*
+ * NB: symbol_map existed before the generic export interface so it
+ * lives under the top level opal_kobj.
+ */
+ rc = opal_add_one_export(opal_kobj, "symbol_map",
+ np->parent, "symbol-map");
+ if (rc)
+ pr_warn("Error %d creating OPAL symbols file\n", rc);
of_node_put(np);
}
@@ -807,7 +929,7 @@ static void __init opal_dump_region_init(void)
"rc = %d\n", rc);
}
-static void opal_pdev_init(const char *compatible)
+static void __init opal_pdev_init(const char *compatible)
{
struct device_node *np;
@@ -822,6 +944,8 @@ static void __init opal_imc_init_dev(void)
np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT);
if (np)
of_platform_device_create(np, NULL, NULL);
+
+ of_node_put(np);
}
static int kopald(void *unused)
@@ -851,7 +975,7 @@ void opal_wake_poller(void)
wake_up_process(kopald_tsk);
}
-static void opal_init_heartbeat(void)
+static void __init opal_init_heartbeat(void)
{
 	/* Old firmware, we assume the HVC heartbeat is sufficient */
if (of_property_read_u32(opal_node, "ibm,heartbeat-ms",
@@ -885,7 +1009,7 @@ static int __init opal_init(void)
}
/* Initialise OPAL messaging system */
- opal_message_init();
+ opal_message_init(opal_node);
/* Initialise OPAL asynchronous completion interface */
opal_async_comp_init();
@@ -921,8 +1045,6 @@ static int __init opal_init(void)
/* Create "opal" kobject under /sys/firmware */
rc = opal_sysfs_init();
if (rc == 0) {
- /* Export symbol map to userspace */
- opal_export_symmap();
/* Setup dump region interface */
opal_dump_region_init();
/* Setup error log interface */
@@ -935,11 +1057,10 @@ static int __init opal_init(void)
opal_sys_param_init();
/* Setup message log sysfs interface. */
opal_msglog_sysfs_init();
+	/* Add all export properties */
+ opal_export_attrs();
}
- /* Export all properties */
- opal_export_attrs();
-
/* Initialize platform devices: IPMI backend, PRD & flash interface */
opal_pdev_init("ibm,opal-ipmi");
opal_pdev_init("ibm,opal-flash");
@@ -963,6 +1084,9 @@ static int __init opal_init(void)
/* Initialise OPAL Power control interface */
opal_power_control_init();
+ /* Initialize OPAL secure variables */
+ opal_pdev_init("ibm,secvar-backend");
+
return 0;
}
machine_subsys_initcall(powernv, opal_init);
diff --git a/arch/powerpc/platforms/powernv/pci-cxl.c b/arch/powerpc/platforms/powernv/pci-cxl.c
deleted file mode 100644
index 1b18111453d7..000000000000
--- a/arch/powerpc/platforms/powernv/pci-cxl.c
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright 2014-2016 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <asm/pnv-pci.h>
-#include <asm/opal.h>
-
-#include "pci.h"
-
-int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t mode)
-{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
- struct pnv_ioda_pe *pe;
- int rc;
-
- pe = pnv_ioda_get_pe(dev);
- if (!pe)
- return -ENODEV;
-
- pe_info(pe, "Switching PHB to CXL\n");
-
- rc = opal_pci_set_phb_cxl_mode(phb->opal_id, mode, pe->pe_number);
- if (rc == OPAL_UNSUPPORTED)
- dev_err(&dev->dev, "Required cxl mode not supported by firmware - update skiboot\n");
- else if (rc)
- dev_err(&dev->dev, "opal_pci_set_phb_cxl_mode failed: %i\n", rc);
-
- return rc;
-}
-EXPORT_SYMBOL(pnv_phb_to_cxl_mode);
-
-/* Find PHB for cxl dev and allocate MSI hwirqs?
- * Returns the absolute hardware IRQ number
- */
-int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num)
-{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
- int hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, num);
-
- if (hwirq < 0) {
- dev_warn(&dev->dev, "Failed to find a free MSI\n");
- return -ENOSPC;
- }
-
- return phb->msi_base + hwirq;
-}
-EXPORT_SYMBOL(pnv_cxl_alloc_hwirqs);
-
-void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num)
-{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
-
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, num);
-}
-EXPORT_SYMBOL(pnv_cxl_release_hwirqs);
-
-void pnv_cxl_release_hwirq_ranges(struct cxl_irq_ranges *irqs,
- struct pci_dev *dev)
-{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
- int i, hwirq;
-
- for (i = 1; i < CXL_IRQ_RANGES; i++) {
- if (!irqs->range[i])
- continue;
- pr_devel("cxl release irq range 0x%x: offset: 0x%lx limit: %ld\n",
- i, irqs->offset[i],
- irqs->range[i]);
- hwirq = irqs->offset[i] - phb->msi_base;
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq,
- irqs->range[i]);
- }
-}
-EXPORT_SYMBOL(pnv_cxl_release_hwirq_ranges);
-
-int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
- struct pci_dev *dev, int num)
-{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
- int i, hwirq, try;
-
- memset(irqs, 0, sizeof(struct cxl_irq_ranges));
-
- /* 0 is reserved for the multiplexed PSL DSI interrupt */
- for (i = 1; i < CXL_IRQ_RANGES && num; i++) {
- try = num;
- while (try) {
- hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, try);
- if (hwirq >= 0)
- break;
- try /= 2;
- }
- if (!try)
- goto fail;
-
- irqs->offset[i] = phb->msi_base + hwirq;
- irqs->range[i] = try;
- pr_devel("cxl alloc irq range 0x%x: offset: 0x%lx limit: %li\n",
- i, irqs->offset[i], irqs->range[i]);
- num -= try;
- }
- if (num)
- goto fail;
-
- return 0;
-fail:
- pnv_cxl_release_hwirq_ranges(irqs, dev);
- return -ENOSPC;
-}
-EXPORT_SYMBOL(pnv_cxl_alloc_hwirq_ranges);
-
-int pnv_cxl_get_irq_count(struct pci_dev *dev)
-{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
-
- return phb->msi_bmp.irq_count;
-}
-EXPORT_SYMBOL(pnv_cxl_get_irq_count);
-
-int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, unsigned int hwirq,
- unsigned int virq)
-{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
- unsigned int xive_num = hwirq - phb->msi_base;
- struct pnv_ioda_pe *pe;
- int rc;
-
- if (!(pe = pnv_ioda_get_pe(dev)))
- return -ENODEV;
-
- /* Assign XIVE to PE */
- rc = opal_pci_set_xive_pe(phb->opal_id, pe->pe_number, xive_num);
- if (rc) {
- pe_warn(pe, "%s: OPAL error %d setting msi_base 0x%x "
- "hwirq 0x%x XIVE 0x%x PE\n",
- pci_name(dev), rc, phb->msi_base, hwirq, xive_num);
- return -EIO;
- }
- pnv_set_msi_irq_chip(phb, virq);
-
- return 0;
-}
-EXPORT_SYMBOL(pnv_cxl_ioda_msi_setup);
-
-#if IS_MODULE(CONFIG_CXL)
-static inline int get_cxl_module(void)
-{
- struct module *cxl_module;
-
- mutex_lock(&module_mutex);
-
- cxl_module = find_module("cxl");
- if (cxl_module)
- __module_get(cxl_module);
-
- mutex_unlock(&module_mutex);
-
- if (!cxl_module)
- return -ENODEV;
-
- return 0;
-}
-#else
-static inline int get_cxl_module(void) { return 0; }
-#endif
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 697449afb3f7..e96324502db0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -17,6 +17,34 @@
#include <asm/tce.h>
#include "pci.h"
+unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
+{
+ struct pci_controller *hose = phb->hose;
+ struct device_node *dn = hose->dn;
+ unsigned long mask = 0;
+ int i, rc, count;
+ u32 val;
+
+ count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
+ if (count <= 0) {
+ mask = SZ_4K | SZ_64K;
+ /* Add 16M for POWER8 by default */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ !cpu_has_feature(CPU_FTR_ARCH_300))
+ mask |= SZ_16M | SZ_256M;
+ return mask;
+ }
+
+ for (i = 0; i < count; i++) {
+ rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
+ i, &val);
+ if (rc == 0)
+ mask |= 1ULL << val;
+ }
+
+ return mask;
+}
+
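
Each cell of "ibm,supported-tce-sizes" is a page shift, so the resulting mask carries one bit per supported IOMMU page size. A short worked sketch with assumed cell values, which happens to reproduce the POWER8 fallback above:

	static unsigned long example_tce_mask(void)
	{
		static const u32 shifts[] = { 12, 16, 24, 28 };	/* assumed DT cells */
		unsigned long mask = 0;
		int i;

		for (i = 0; i < ARRAY_SIZE(shifts); i++)
			mask |= 1UL << shifts[i];

		/* 0x11011000 == SZ_4K | SZ_64K | SZ_16M | SZ_256M */
		return mask;
	}
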
void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift)
@@ -36,7 +64,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
struct page *tce_mem = NULL;
__be64 *addr;
- tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
+ tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
+ shift - PAGE_SHIFT);
if (!tce_mem) {
pr_err("Failed to allocate a TCE memory, level shift=%d\n",
shift);
@@ -48,6 +77,9 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
return addr;
}
+static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
+ unsigned long size, unsigned int levels);
+
static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
{
__be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base;
@@ -57,9 +89,9 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
while (level) {
int n = (idx & mask) >> (level * shift);
- unsigned long tce;
+ unsigned long oldtce, tce = be64_to_cpu(READ_ONCE(tmp[n]));
- if (tmp[n] == 0) {
+ if (!tce) {
__be64 *tmp2;
if (!alloc)
@@ -70,10 +102,15 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc)
if (!tmp2)
return NULL;
- tmp[n] = cpu_to_be64(__pa(tmp2) |
- TCE_PCI_READ | TCE_PCI_WRITE);
+ tce = __pa(tmp2) | TCE_PCI_READ | TCE_PCI_WRITE;
+ oldtce = be64_to_cpu(cmpxchg(&tmp[n], 0,
+ cpu_to_be64(tce)));
+ if (oldtce) {
+ pnv_pci_ioda2_table_do_free_pages(tmp2,
+ ilog2(tbl->it_level_size) + 3, 1);
+ tce = oldtce;
+ }
}
- tce = be64_to_cpu(tmp[n]);
tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
idx &= ~mask;
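
The hunk above makes the level walk safe against concurrent mappers: each racer allocates its own level page, tries to publish it with cmpxchg(), and the loser frees its page and adopts the winner's. A generic sketch of that allocate-then-publish pattern, with illustrative names rather than the helpers used in this file:

	static void *install_level(void **slot)
	{
		void *new = (void *)get_zeroed_page(GFP_ATOMIC);
		void *old;

		if (!new)
			return NULL;

		old = cmpxchg(slot, NULL, new);	/* returns the previous value */
		if (old) {
			/* somebody else won the race; use their level */
			free_page((unsigned long)new);
			return old;
		}
		return new;
	}
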
@@ -108,8 +145,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
#ifdef CONFIG_IOMMU_API
int pnv_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction,
- bool alloc)
+ unsigned long *hpa, enum dma_data_direction *direction)
{
u64 proto_tce = iommu_direction_to_tce_perm(*direction);
unsigned long newtce = *hpa | proto_tce, oldtce;
@@ -127,9 +163,9 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index,
}
if (!ptce) {
- ptce = pnv_tce(tbl, false, idx, alloc);
+ ptce = pnv_tce(tbl, false, idx, true);
if (!ptce)
- return alloc ? H_HARDWARE : H_TOO_HARD;
+ return -ENOMEM;
}
if (newtce & TCE_PCI_WRITE)
@@ -161,6 +197,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
if (ptce)
*ptce = cpu_to_be64(0);
+ else
+ /* Skip the rest of the level */
+ i |= tbl->it_level_size - 1;
}
}
@@ -260,7 +299,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
PAGE_SHIFT);
const unsigned long tce_table_size = 1UL << table_shift;
- unsigned int tmplevels = levels;
if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
return -EINVAL;
@@ -268,9 +306,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
if (!is_power_of_2(window_size))
return -EINVAL;
- if (alloc_userspace_copy && (window_size > (1ULL << 32)))
- tmplevels = 1;
-
/* Adjust direct table size from window_size and levels */
entries_shift = (entries_shift + levels - 1) / levels;
level_shift = entries_shift + 3;
@@ -281,7 +316,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
/* Allocate TCE table */
addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
- tmplevels, tce_table_size, &offset, &total_allocated);
+ 1, tce_table_size, &offset, &total_allocated);
/* addr==NULL means that the first level allocation failed */
if (!addr)
@@ -292,18 +327,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
* we did not allocate as much as we wanted,
* release partially allocated table.
*/
- if (tmplevels == levels && offset < tce_table_size)
+ if (levels == 1 && offset < tce_table_size)
goto free_tces_exit;
/* Allocate userspace view of the TCE table */
if (alloc_userspace_copy) {
offset = 0;
uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
- tmplevels, tce_table_size, &offset,
+ 1, tce_table_size, &offset,
&total_allocated_uas);
if (!uas)
goto free_tces_exit;
- if (tmplevels == levels && (offset < tce_table_size ||
+ if (levels == 1 && (offset < tce_table_size ||
total_allocated_uas != total_allocated))
goto free_uas_exit;
}
@@ -313,13 +348,12 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
page_shift);
tbl->it_level_size = 1ULL << (level_shift - 3);
tbl->it_indirect_levels = levels - 1;
- tbl->it_allocated_size = total_allocated;
tbl->it_userspace = uas;
tbl->it_nid = nid;
pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
window_size, tce_table_size, bus_offset, tbl->it_base,
- tbl->it_userspace, tmplevels, levels);
+ tbl->it_userspace, 1, levels);
return 0;
@@ -333,14 +367,6 @@ free_tces_exit:
return -ENOMEM;
}
-static void pnv_iommu_table_group_link_free(struct rcu_head *head)
-{
- struct iommu_table_group_link *tgl = container_of(head,
- struct iommu_table_group_link, rcu);
-
- kfree(tgl);
-}
-
void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
struct iommu_table_group *table_group)
{
@@ -353,14 +379,18 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
/* Remove link to a group from table's list of attached groups */
found = false;
+
+ rcu_read_lock();
list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
if (tgl->table_group == table_group) {
list_del_rcu(&tgl->next);
- call_rcu(&tgl->rcu, pnv_iommu_table_group_link_free);
+ kfree_rcu(tgl, rcu);
found = true;
break;
}
}
+ rcu_read_unlock();
+
if (WARN_ON(!found))
return;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 7db3119f8a5b..b0c1d9d16fb5 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Support PCI/PCIe on PowerNV platforms
*
* Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#undef DEBUG
@@ -19,15 +15,18 @@
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/irq.h>
+#include <linux/irqchip/irq-msi-lib.h>
#include <linux/io.h>
#include <linux/msi.h>
#include <linux/iommu.h>
#include <linux/rculist.h>
#include <linux/sizes.h>
+#include <linux/debugfs.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
#include <asm/sections.h>
#include <asm/io.h>
-#include <asm/prom.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/msi_bitmap.h>
@@ -36,23 +35,19 @@
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/xics.h>
-#include <asm/debugfs.h>
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>
-#include <misc/cxl-base.h>
-
#include "powernv.h"
#include "pci.h"
#include "../../../../drivers/pci/pci.h"
-#define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */
-#define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */
-#define PNV_IODA1_DMA32_SEGSIZE 0x10000000
+/* This array is indexed with enum pnv_phb_type */
+static const char * const pnv_phb_names[] = { "IODA2", "NPU_OCAPI" };
-static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
- "NPU_OCAPI" };
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+static void pnv_pci_configure_bus(struct pci_bus *bus);
void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
@@ -67,7 +62,7 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
vaf.va = &args;
if (pe->flags & PNV_IODA_PE_DEV)
- strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
+ strscpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix));
else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
sprintf(pfix, "%04x:%02x ",
pci_domain_nr(pe->pbus), pe->pbus->number);
@@ -116,32 +111,13 @@ static int __init pci_reset_phbs_setup(char *str)
early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
-static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
-{
- /*
- * WARNING: We cannot rely on the resource flags. The Linux PCI
- * allocation code sometimes decides to put a 64-bit prefetchable
- * BAR in the 32-bit window, so we have to compare the addresses.
- *
- * For simplicity we only test resource start.
- */
- return (r->start >= phb->ioda.m64_base &&
- r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
-}
-
-static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
-{
- unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
-
- return (resource_flags & flags) == flags;
-}
-
static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
{
s64 rc;
phb->ioda.pe_array[pe_no].phb = phb;
phb->ioda.pe_array[pe_no].pe_number = pe_no;
+ phb->ioda.pe_array[pe_no].dma_setup_done = false;
/*
* Clear the PE frozen state as it might be put into frozen state
@@ -165,35 +141,58 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
return;
}
+ mutex_lock(&phb->ioda.pe_alloc_mutex);
if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
pr_debug("%s: PE %x was reserved on PHB#%x\n",
__func__, pe_no, phb->hose->global_number);
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
pnv_ioda_init_pe(phb, pe_no);
}
-static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
{
- long pe;
+ struct pnv_ioda_pe *ret = NULL;
+ int run = 0, pe, i;
+ mutex_lock(&phb->ioda.pe_alloc_mutex);
+
+ /* scan backwards for a run of @count cleared bits */
for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
- if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
- return pnv_ioda_init_pe(phb, pe);
+ if (test_bit(pe, phb->ioda.pe_alloc)) {
+ run = 0;
+ continue;
+ }
+
+ run++;
+ if (run == count)
+ break;
+ }
+ if (run != count)
+ goto out;
+
+ for (i = pe; i < pe + count; i++) {
+ set_bit(i, phb->ioda.pe_alloc);
+ pnv_ioda_init_pe(phb, i);
}
+ ret = &phb->ioda.pe_array[pe];
- return NULL;
+out:
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
+ return ret;
}
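/*
 * Illustrative sketch, values assumed: with total_pe_num = 8 and
 * pe_alloc = 0b00110001 (PEs 0, 4 and 5 in use), pnv_ioda_alloc_pe(phb, 2)
 * scans down from bit 7, finds the free run {7, 6}, stops with pe == 6,
 * marks both bits and returns &phb->ioda.pe_array[6], i.e. the lowest PE
 * number of the run.  A hypothetical caller wanting a contiguous pair:
 *
 *	struct pnv_ioda_pe *base = pnv_ioda_alloc_pe(phb, 2);
 *
 *	if (base) {
 *		...use base[0] and base[1]...
 *		pnv_ioda_free_pe(&base[1]);
 *		pnv_ioda_free_pe(&base[0]);
 *	}
 *
 * pnv_ioda_free_pe() clears a single bit, so each PE of a run is
 * released individually.
 */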
-static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
+void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
{
struct pnv_phb *phb = pe->phb;
unsigned int pe_num = pe->pe_number;
WARN_ON(pe->pdev);
- WARN_ON(pe->npucomp); /* NPUs are not supposed to be freed */
- kfree(pe->npucomp);
memset(pe, 0, sizeof(struct pnv_ioda_pe));
+
+ mutex_lock(&phb->ioda.pe_alloc_mutex);
clear_bit(pe_num, phb->ioda.pe_alloc);
+ mutex_unlock(&phb->ioda.pe_alloc_mutex);
}
/* The default M64 BAR is shared by all PEs */
@@ -253,8 +252,7 @@ fail:
static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
unsigned long *pe_bitmap)
{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
struct resource *r;
resource_size_t base, sgsz, start, end;
int segno, i;
@@ -266,8 +264,8 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
if (!r->parent || !pnv_pci_is_m64(phb, r))
continue;
- start = _ALIGN_DOWN(r->start - base, sgsz);
- end = _ALIGN_UP(r->end - base, sgsz);
+ start = ALIGN_DOWN(r->start - base, sgsz);
+ end = ALIGN(r->end - base, sgsz);
for (segno = start / sgsz; segno < end / sgsz; segno++) {
if (pe_bitmap)
set_bit(segno, pe_bitmap);
@@ -277,64 +275,6 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
}
}
-static int pnv_ioda1_init_m64(struct pnv_phb *phb)
-{
- struct resource *r;
- int index;
-
- /*
- * There are 16 M64 BARs, each of which has 8 segments. So
- * there are as many M64 segments as the maximum number of
- * PEs, which is 128.
- */
- for (index = 0; index < PNV_IODA1_M64_NUM; index++) {
- unsigned long base, segsz = phb->ioda.m64_segsize;
- int64_t rc;
-
- base = phb->ioda.m64_base +
- index * PNV_IODA1_M64_SEGS * segsz;
- rc = opal_pci_set_phb_mem_window(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, index, base, 0,
- PNV_IODA1_M64_SEGS * segsz);
- if (rc != OPAL_SUCCESS) {
- pr_warn(" Error %lld setting M64 PHB#%x-BAR#%d\n",
- rc, phb->hose->global_number, index);
- goto fail;
- }
-
- rc = opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, index,
- OPAL_ENABLE_M64_SPLIT);
- if (rc != OPAL_SUCCESS) {
- pr_warn(" Error %lld enabling M64 PHB#%x-BAR#%d\n",
- rc, phb->hose->global_number, index);
- goto fail;
- }
- }
-
- /*
- * Exclude the segments for reserved and root bus PE, which
- * are first or last two PEs.
- */
- r = &phb->hose->mem_resources[1];
- if (phb->ioda.reserved_pe_idx == 0)
- r->start += (2 * phb->ioda.m64_segsize);
- else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1))
- r->end -= (2 * phb->ioda.m64_segsize);
- else
- WARN(1, "Wrong reserved PE#%x on PHB#%x\n",
- phb->ioda.reserved_pe_idx, phb->hose->global_number);
-
- return 0;
-
-fail:
- for ( ; index >= 0; index--)
- opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64);
-
- return -EIO;
-}
-
static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
unsigned long *pe_bitmap,
bool all)
@@ -352,8 +292,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus,
static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
{
- struct pci_controller *hose = pci_bus_to_host(bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
struct pnv_ioda_pe *master_pe, *pe;
unsigned long size, *pe_alloc;
int i;
@@ -363,7 +302,7 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
return NULL;
/* Allocate bitmap */
- size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
+ size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
pe_alloc = kzalloc(size, GFP_KERNEL);
if (!pe_alloc) {
pr_warn("%s: Out of memory !\n",
@@ -404,26 +343,6 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
pe->master = master_pe;
list_add_tail(&pe->list, &master_pe->slaves);
}
-
- /*
- * P7IOC supports M64DT, which helps mapping M64 segment
- * to one particular PE#. However, PHB3 has fixed mapping
- * between M64 segment and PE#. In order to have same logic
- * for P7IOC and PHB3, we enforce fixed mapping between M64
- * segment and PE# on P7IOC.
- */
- if (phb->type == PNV_PHB_IODA1) {
- int64_t rc;
-
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- pe->pe_number, OPAL_M64_WINDOW_TYPE,
- pe->pe_number / PNV_IODA1_M64_SEGS,
- pe->pe_number % PNV_IODA1_M64_SEGS);
- if (rc != OPAL_SUCCESS)
- pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n",
- __func__, rc, phb->hose->global_number,
- pe->pe_number);
- }
}
kfree(pe_alloc);
@@ -439,7 +358,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
const __be32 *r;
u64 pci_addr;
- if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) {
+ if (phb->type != PNV_PHB_IODA2) {
pr_info(" Not support M64 window\n");
return;
}
@@ -514,10 +433,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
* Setup init functions for M64 based on IODA version, IODA3 uses
* the IODA2 code.
*/
- if (phb->type == PNV_PHB_IODA1)
- phb->init_m64 = pnv_ioda1_init_m64;
- else
- phb->init_m64 = pnv_ioda2_init_m64;
+ phb->init_m64 = pnv_ioda2_init_m64;
}
static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no)
@@ -662,10 +578,19 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
return state;
}
+struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
+{
+ int pe_number = phb->ioda.pe_rmap[bdfn];
+
+ if (pe_number == IODA_INVALID_PE)
+ return NULL;
+
+ return &phb->ioda.pe_array[pe_number];
+}
+
struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
struct pci_dn *pdn = pci_get_pdn(dev);
if (!pdn)
@@ -779,7 +704,35 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb,
return 0;
}
-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+static void pnv_ioda_unset_peltv(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe,
+ struct pci_dev *parent)
+{
+ int64_t rc;
+
+ while (parent) {
+ struct pci_dn *pdn = pci_get_pdn(parent);
+
+ if (pdn && pdn->pe_number != IODA_INVALID_PE) {
+ rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
+ pe->pe_number,
+ OPAL_REMOVE_PE_FROM_DOMAIN);
+ /* XXX What to do in case of error ? */
+ }
+ parent = parent->bus->self;
+ }
+
+ opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
+ OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+
+ /* Disassociate PE in PELT */
+ rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
+ pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
+ if (rc)
+ pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc);
+}
+
+int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
struct pci_dev *parent;
uint8_t bcomp, dcomp, fcomp;
@@ -794,7 +747,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
parent = pe->pbus->self;
if (pe->flags & PNV_IODA_PE_BUS_ALL)
- count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+ count = resource_size(&pe->pbus->busn_res);
else
count = 1;
@@ -829,29 +782,17 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
for (rid = pe->rid; rid < rid_end; rid++)
phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
- /* Release from all parents PELT-V */
- while (parent) {
- struct pci_dn *pdn = pci_get_pdn(parent);
- if (pdn && pdn->pe_number != IODA_INVALID_PE) {
- rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
- pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
- /* XXX What to do in case of error ? */
- }
- parent = parent->bus->self;
- }
-
- opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
- OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
+ /*
+ * Release the PE from all parents' PELT-V. NPUs don't have a
+ * PELTV table.
+ */
+ if (phb->type != PNV_PHB_NPU_OCAPI)
+ pnv_ioda_unset_peltv(phb, pe, parent);
- /* Disassociate PE in PELT */
- rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
- pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
- if (rc)
- pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
if (rc)
- pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
+ pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc);
pe->pbus = NULL;
pe->pdev = NULL;
@@ -862,9 +803,8 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
return 0;
}
-static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
+int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
{
- struct pci_dev *parent;
uint8_t bcomp, dcomp, fcomp;
long rc, rid_end, rid;
@@ -874,9 +814,8 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
- parent = pe->pbus->self;
if (pe->flags & PNV_IODA_PE_BUS_ALL)
- count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1;
+ count = resource_size(&pe->pbus->busn_res);
else
count = 1;
@@ -895,12 +834,6 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
}
rid_end = pe->rid + (count << 8);
} else {
-#ifdef CONFIG_PCI_IOV
- if (pe->flags & PNV_IODA_PE_VF)
- parent = pe->parent_dev;
- else
-#endif /* CONFIG_PCI_IOV */
- parent = pe->pdev->bus->self;
bcomp = OpalPciBusAll;
dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
@@ -924,128 +857,21 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
* Configure PELTV. NPUs don't have a PELTV table so skip
* configuration on them.
*/
- if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
+ if (phb->type != PNV_PHB_NPU_OCAPI)
pnv_ioda_set_peltv(phb, pe, true);
/* Setup reverse map */
for (rid = pe->rid; rid < rid_end; rid++)
phb->ioda.pe_rmap[rid] = pe->pe_number;
- /* Setup one MVTs on IODA1 */
- if (phb->type != PNV_PHB_IODA1) {
- pe->mve_number = 0;
- goto out;
- }
-
- pe->mve_number = pe->pe_number;
- rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number);
- if (rc != OPAL_SUCCESS) {
- pe_err(pe, "OPAL error %ld setting up MVE %x\n",
- rc, pe->mve_number);
- pe->mve_number = -1;
- } else {
- rc = opal_pci_set_mve_enable(phb->opal_id,
- pe->mve_number, OPAL_ENABLE_MVE);
- if (rc) {
- pe_err(pe, "OPAL error %ld enabling MVE %x\n",
- rc, pe->mve_number);
- pe->mve_number = -1;
- }
- }
-
-out:
- return 0;
-}
-
-#ifdef CONFIG_PCI_IOV
-static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
-{
- struct pci_dn *pdn = pci_get_pdn(dev);
- int i;
- struct resource *res, res2;
- resource_size_t size;
- u16 num_vfs;
-
- if (!dev->is_physfn)
- return -EINVAL;
-
- /*
- * "offset" is in VFs. The M64 windows are sized so that when they
- * are segmented, each segment is the same size as the IOV BAR.
- * Each segment is in a separate PE, and the high order bits of the
- * address are the PE number. Therefore, each VF's BAR is in a
- * separate PE, and changing the IOV BAR start address changes the
- * range of PEs the VFs are in.
- */
- num_vfs = pdn->num_vfs;
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &dev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || !res->parent)
- continue;
-
- /*
- * The actual IOV BAR range is determined by the start address
- * and the actual size for num_vfs VFs BAR. This check is to
- * make sure that after shifting, the range will not overlap
- * with another device.
- */
- size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
- res2.flags = res->flags;
- res2.start = res->start + (size * offset);
- res2.end = res2.start + (size * num_vfs) - 1;
-
- if (res2.end > res->end) {
- dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
- i, &res2, res, num_vfs, offset);
- return -EBUSY;
- }
- }
-
- /*
- * Since M64 BAR shares segments among all possible 256 PEs,
- * we have to shift the beginning of PF IOV BAR to make it start from
- * the segment which belongs to the PE number assigned to the first VF.
- * This creates a "hole" in the /proc/iomem which could be used for
- * allocating other resources so we reserve this area below and
- * release when IOV is released.
- */
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &dev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || !res->parent)
- continue;
-
- size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
- res2 = *res;
- res->start += size * offset;
-
- dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
- i, &res2, res, (offset > 0) ? "En" : "Dis",
- num_vfs, offset);
-
- if (offset < 0) {
- devm_release_resource(&dev->dev, &pdn->holes[i]);
- memset(&pdn->holes[i], 0, sizeof(pdn->holes[i]));
- }
-
- pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+ pe->mve_number = 0;
- if (offset > 0) {
- pdn->holes[i].start = res2.start;
- pdn->holes[i].end = res2.start + size * offset - 1;
- pdn->holes[i].flags = IORESOURCE_BUS;
- pdn->holes[i].name = "pnv_iov_reserved";
- devm_request_resource(&dev->dev, res->parent,
- &pdn->holes[i]);
- }
- }
return 0;
}
-#endif /* CONFIG_PCI_IOV */
static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
struct pci_dn *pdn = pci_get_pdn(dev);
struct pnv_ioda_pe *pe;
@@ -1057,27 +883,26 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
if (pdn->pe_number != IODA_INVALID_PE)
return NULL;
- pe = pnv_ioda_alloc_pe(phb);
+ pe = pnv_ioda_alloc_pe(phb, 1);
if (!pe) {
pr_warn("%s: Not enough PE# available, disabling device\n",
pci_name(dev));
return NULL;
}
- /* NOTE: We get only one ref to the pci_dev for the pdn, not for the
- * pointer in the PE data structure, both should be destroyed at the
- * same time. However, this needs to be looked at more closely again
- * once we actually start removing things (Hotplug, SR-IOV, ...)
+ /* NOTE: We don't get a reference for the pointer in the PE
+ * data structure; both the device and PE structures should be
+ * destroyed at the same time.
*
* At some point we want to remove the PDN completely anyways
*/
- pci_dev_get(dev);
pdn->pe_number = pe->pe_number;
pe->flags = PNV_IODA_PE_DEV;
pe->pdev = dev;
pe->pbus = NULL;
pe->mve_number = -1;
pe->rid = dev->bus->number << 8 | pdn->devfn;
+ pe->device_count++;
pe_info(pe, "Associated device to PE\n");
@@ -1086,44 +911,16 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
pnv_ioda_free_pe(pe);
pdn->pe_number = IODA_INVALID_PE;
pe->pdev = NULL;
- pci_dev_put(dev);
return NULL;
}
/* Put PE to the list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
list_add_tail(&pe->list, &phb->ioda.pe_list);
-
+ mutex_unlock(&phb->ioda.pe_list_mutex);
return pe;
}
-static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
-{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- struct pci_dn *pdn = pci_get_pdn(dev);
-
- if (pdn == NULL) {
- pr_warn("%s: No device node associated with device !\n",
- pci_name(dev));
- continue;
- }
-
- /*
- * In partial hotplug case, the PCI device might be still
- * associated with the PE and needn't attach it to the PE
- * again.
- */
- if (pdn->pe_number != IODA_INVALID_PE)
- continue;
-
- pe->device_count++;
- pdn->pe_number = pe->pe_number;
- if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
- pnv_ioda_setup_same_PE(dev->subordinate, pe);
- }
-}
-
/*
* There're 2 types of PCI bus sensitive PEs: One that is comprised of a
* single PCI bus. Another one that contains the primary PCI bus and its
@@ -1132,8 +929,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
*/
static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
{
- struct pci_controller *hose = pci_bus_to_host(bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
struct pnv_ioda_pe *pe = NULL;
unsigned int pe_num;
@@ -1142,15 +938,13 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
* We should reuse it instead of allocating a new one.
*/
pe_num = phb->ioda.pe_rmap[bus->number << 8];
- if (pe_num != IODA_INVALID_PE) {
+ if (WARN_ON(pe_num != IODA_INVALID_PE)) {
pe = &phb->ioda.pe_array[pe_num];
- pnv_ioda_setup_same_PE(bus, pe);
return NULL;
}
/* PE number for root bus should have been reserved */
- if (pci_is_root_bus(bus) &&
- phb->ioda.root_pe_idx != IODA_INVALID_PE)
+ if (pci_is_root_bus(bus))
pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
/* Check if PE is determined by M64 */
@@ -1159,7 +953,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
/* The PE number isn't pinned by M64 */
if (!pe)
- pe = pnv_ioda_alloc_pe(phb);
+ pe = pnv_ioda_alloc_pe(phb, 1);
if (!pe) {
pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
@@ -1174,11 +968,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
pe->rid = bus->busn_res.start << 8;
if (all)
- pe_info(pe, "Secondary bus %d..%d associated with PE#%x\n",
- bus->busn_res.start, bus->busn_res.end, pe->pe_number);
+ pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n",
+ &bus->busn_res.start, &bus->busn_res.end,
+ pe->pe_number);
else
- pe_info(pe, "Secondary bus %d associated with PE#%x\n",
- bus->busn_res.start, pe->pe_number);
+ pe_info(pe, "Secondary bus %pad associated with PE#%x\n",
+ &bus->busn_res.start, pe->pe_number);
if (pnv_ioda_configure_pe(phb, pe)) {
/* XXX What do we do here ? */
@@ -1187,598 +982,66 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
return NULL;
}
- /* Associate it with all child devices */
- pnv_ioda_setup_same_PE(bus, pe);
-
/* Put PE to the list */
list_add_tail(&pe->list, &phb->ioda.pe_list);
return pe;
}
-static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
+static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
{
- int pe_num, found_pe = false, rc;
- long rid;
- struct pnv_ioda_pe *pe;
- struct pci_dev *gpu_pdev;
- struct pci_dn *npu_pdn;
- struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus);
- struct pnv_phb *phb = hose->private_data;
-
- /*
- * Due to a hardware errata PE#0 on the NPU is reserved for
- * error handling. This means we only have three PEs remaining
- * which need to be assigned to four links, implying some
- * links must share PEs.
- *
- * To achieve this we assign PEs such that NPUs linking the
- * same GPU get assigned the same PE.
- */
- gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev);
- for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
- pe = &phb->ioda.pe_array[pe_num];
- if (!pe->pdev)
- continue;
-
- if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) {
- /*
- * This device has the same peer GPU so should
- * be assigned the same PE as the existing
- * peer NPU.
- */
- dev_info(&npu_pdev->dev,
- "Associating to existing PE %x\n", pe_num);
- pci_dev_get(npu_pdev);
- npu_pdn = pci_get_pdn(npu_pdev);
- rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
- npu_pdn->pe_number = pe_num;
- phb->ioda.pe_rmap[rid] = pe->pe_number;
-
- /* Map the PE to this link */
- rc = opal_pci_set_pe(phb->opal_id, pe_num, rid,
- OpalPciBusAll,
- OPAL_COMPARE_RID_DEVICE_NUMBER,
- OPAL_COMPARE_RID_FUNCTION_NUMBER,
- OPAL_MAP_PE);
- WARN_ON(rc != OPAL_SUCCESS);
- found_pe = true;
- break;
- }
- }
-
- if (!found_pe)
- /*
- * Could not find an existing PE so allocate a new
- * one.
- */
- return pnv_ioda_setup_dev_PE(npu_pdev);
- else
- return pe;
-}
-
-static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
-{
- struct pci_dev *pdev;
-
- list_for_each_entry(pdev, &bus->devices, bus_list)
- pnv_ioda_setup_npu_PE(pdev);
-}
-
-static void pnv_pci_ioda_setup_PEs(void)
-{
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pci_bus *bus;
- struct pci_dev *pdev;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+ struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
- list_for_each_entry(hose, &hose_list, list_node) {
- phb = hose->private_data;
- if (phb->type == PNV_PHB_NPU_NVLINK) {
- /* PE#0 is needed for error reporting */
- pnv_ioda_reserve_pe(phb, 0);
- pnv_ioda_setup_npu_PEs(hose->bus);
- if (phb->model == PNV_PHB_MODEL_NPU2)
- WARN_ON_ONCE(pnv_npu2_init(hose));
- }
- if (phb->type == PNV_PHB_NPU_OCAPI) {
- bus = hose->bus;
- list_for_each_entry(pdev, &bus->devices, bus_list)
- pnv_ioda_setup_dev_PE(pdev);
- }
- }
- list_for_each_entry(hose, &hose_list, list_node) {
- phb = hose->private_data;
- if (phb->type != PNV_PHB_IODA2)
- continue;
-
- list_for_each_entry(pe, &phb->ioda.pe_list, list)
- pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
- }
-}
-
-#ifdef CONFIG_PCI_IOV
-static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pci_dn *pdn;
- int i, j;
- int m64_bars;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
-
- if (pdn->m64_single_mode)
- m64_bars = num_vfs;
- else
- m64_bars = 1;
-
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
- for (j = 0; j < m64_bars; j++) {
- if (pdn->m64_map[j][i] == IODA_INVALID_M64)
- continue;
- opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
- clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
- pdn->m64_map[j][i] = IODA_INVALID_M64;
- }
-
- kfree(pdn->m64_map);
- return 0;
-}
-
-static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pci_dn *pdn;
- unsigned int win;
- struct resource *res;
- int i, j;
- int64_t rc;
- int total_vfs;
- resource_size_t size, start;
- int pe_num;
- int m64_bars;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
- total_vfs = pci_sriov_get_totalvfs(pdev);
-
- if (pdn->m64_single_mode)
- m64_bars = num_vfs;
- else
- m64_bars = 1;
-
- pdn->m64_map = kmalloc_array(m64_bars,
- sizeof(*pdn->m64_map),
- GFP_KERNEL);
- if (!pdn->m64_map)
- return -ENOMEM;
- /* Initialize the m64_map to IODA_INVALID_M64 */
- for (i = 0; i < m64_bars ; i++)
- for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
- pdn->m64_map[i][j] = IODA_INVALID_M64;
-
-
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || !res->parent)
- continue;
-
- for (j = 0; j < m64_bars; j++) {
- do {
- win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
- phb->ioda.m64_bar_idx + 1, 0);
-
- if (win >= phb->ioda.m64_bar_idx + 1)
- goto m64_failed;
- } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
-
- pdn->m64_map[j][i] = win;
-
- if (pdn->m64_single_mode) {
- size = pci_iov_resource_size(pdev,
- PCI_IOV_RESOURCES + i);
- start = res->start + size * j;
- } else {
- size = resource_size(res);
- start = res->start;
- }
-
- /* Map the M64 here */
- if (pdn->m64_single_mode) {
- pe_num = pdn->pe_num_map[j];
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- pe_num, OPAL_M64_WINDOW_TYPE,
- pdn->m64_map[j][i], 0);
- }
-
- rc = opal_pci_set_phb_mem_window(phb->opal_id,
- OPAL_M64_WINDOW_TYPE,
- pdn->m64_map[j][i],
- start,
- 0, /* unused */
- size);
-
-
- if (rc != OPAL_SUCCESS) {
- dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
- win, rc);
- goto m64_failed;
- }
-
- if (pdn->m64_single_mode)
- rc = opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
- else
- rc = opal_pci_phb_mmio_enable(phb->opal_id,
- OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
-
- if (rc != OPAL_SUCCESS) {
- dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
- win, rc);
- goto m64_failed;
- }
- }
- }
- return 0;
-
-m64_failed:
- pnv_pci_vf_release_m64(pdev, num_vfs);
- return -EBUSY;
-}
-
-static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
- int num);
-
-static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe)
-{
- struct iommu_table *tbl;
- int64_t rc;
-
- tbl = pe->table_group.tables[0];
- rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
- if (rc)
- pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
-
- pnv_pci_ioda2_set_bypass(pe, false);
- if (pe->table_group.group) {
- iommu_group_put(pe->table_group.group);
- BUG_ON(pe->table_group.group);
- }
- iommu_tce_table_put(tbl);
-}
-
-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe, *pe_n;
- struct pci_dn *pdn;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
-
- if (!pdev->is_physfn)
- return;
-
- list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
- if (pe->parent_dev != pdev)
- continue;
-
- pnv_pci_ioda2_release_dma_pe(pdev, pe);
-
- /* Remove from list */
- mutex_lock(&phb->ioda.pe_list_mutex);
- list_del(&pe->list);
- mutex_unlock(&phb->ioda.pe_list_mutex);
-
- pnv_ioda_deconfigure_pe(phb, pe);
-
- pnv_ioda_free_pe(pe);
- }
-}
-
-void pnv_pci_sriov_disable(struct pci_dev *pdev)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
- struct pci_dn *pdn;
- u16 num_vfs, i;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
- num_vfs = pdn->num_vfs;
-
- /* Release VF PEs */
- pnv_ioda_release_vf_PE(pdev);
-
- if (phb->type == PNV_PHB_IODA2) {
- if (!pdn->m64_single_mode)
- pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
-
- /* Release M64 windows */
- pnv_pci_vf_release_m64(pdev, num_vfs);
-
- /* Release PE numbers */
- if (pdn->m64_single_mode) {
- for (i = 0; i < num_vfs; i++) {
- if (pdn->pe_num_map[i] == IODA_INVALID_PE)
- continue;
-
- pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
- pnv_ioda_free_pe(pe);
- }
- } else
- bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
- /* Releasing pe_num_map */
- kfree(pdn->pe_num_map);
- }
-}
-
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
- struct pnv_ioda_pe *pe);
-#ifdef CONFIG_IOMMU_API
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
- struct iommu_table_group *table_group, struct pci_bus *bus);
-
-#endif
-static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
- int pe_num;
- u16 vf_index;
- struct pci_dn *pdn;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
-
- if (!pdev->is_physfn)
- return;
-
- /* Reserve PE for each VF */
- for (vf_index = 0; vf_index < num_vfs; vf_index++) {
- if (pdn->m64_single_mode)
- pe_num = pdn->pe_num_map[vf_index];
- else
- pe_num = *pdn->pe_num_map + vf_index;
-
- pe = &phb->ioda.pe_array[pe_num];
- pe->pe_number = pe_num;
- pe->phb = phb;
- pe->flags = PNV_IODA_PE_VF;
- pe->pbus = NULL;
- pe->parent_dev = pdev;
- pe->mve_number = -1;
- pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) |
- pci_iov_virtfn_devfn(pdev, vf_index);
-
- pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
- hose->global_number, pdev->bus->number,
- PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)),
- PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num);
-
- if (pnv_ioda_configure_pe(phb, pe)) {
- /* XXX What do we do here ? */
- pnv_ioda_free_pe(pe);
- pe->pdev = NULL;
- continue;
- }
-
- /* Put PE to the list */
- mutex_lock(&phb->ioda.pe_list_mutex);
- list_add_tail(&pe->list, &phb->ioda.pe_list);
- mutex_unlock(&phb->ioda.pe_list_mutex);
-
- pnv_pci_ioda2_setup_dma_pe(phb, pe);
-#ifdef CONFIG_IOMMU_API
- pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
-#endif
- }
-}
-
-int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
-{
- struct pci_bus *bus;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
- struct pci_dn *pdn;
- int ret;
- u16 i;
-
- bus = pdev->bus;
- hose = pci_bus_to_host(bus);
- phb = hose->private_data;
- pdn = pci_get_pdn(pdev);
-
- if (phb->type == PNV_PHB_IODA2) {
- if (!pdn->vfs_expanded) {
- dev_info(&pdev->dev, "don't support this SRIOV device"
- " with non 64bit-prefetchable IOV BAR\n");
- return -ENOSPC;
- }
-
- /*
- * When M64 BARs functions in Single PE mode, the number of VFs
- * could be enabled must be less than the number of M64 BARs.
- */
- if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
- dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
- return -EBUSY;
- }
-
- /* Allocating pe_num_map */
- if (pdn->m64_single_mode)
- pdn->pe_num_map = kmalloc_array(num_vfs,
- sizeof(*pdn->pe_num_map),
- GFP_KERNEL);
- else
- pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
-
- if (!pdn->pe_num_map)
- return -ENOMEM;
-
- if (pdn->m64_single_mode)
- for (i = 0; i < num_vfs; i++)
- pdn->pe_num_map[i] = IODA_INVALID_PE;
-
- /* Calculate available PE for required VFs */
- if (pdn->m64_single_mode) {
- for (i = 0; i < num_vfs; i++) {
- pe = pnv_ioda_alloc_pe(phb);
- if (!pe) {
- ret = -EBUSY;
- goto m64_failed;
- }
+ /* Check if the BDFN for this device is associated with a PE yet */
+ pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev));
+ if (!pe) {
+ /* VF PEs should be pre-configured in pnv_pci_sriov_enable() */
+ if (WARN_ON(pdev->is_virtfn))
+ return;
- pdn->pe_num_map[i] = pe->pe_number;
- }
- } else {
- mutex_lock(&phb->ioda.pe_alloc_mutex);
- *pdn->pe_num_map = bitmap_find_next_zero_area(
- phb->ioda.pe_alloc, phb->ioda.total_pe_num,
- 0, num_vfs, 0);
- if (*pdn->pe_num_map >= phb->ioda.total_pe_num) {
- mutex_unlock(&phb->ioda.pe_alloc_mutex);
- dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
- kfree(pdn->pe_num_map);
- return -EBUSY;
- }
- bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
- mutex_unlock(&phb->ioda.pe_alloc_mutex);
- }
- pdn->num_vfs = num_vfs;
+ pnv_pci_configure_bus(pdev->bus);
+ pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev));
+ pci_info(pdev, "Configured PE#%x\n", pe ? pe->pe_number : 0xfffff);
- /* Assign M64 window accordingly */
- ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
- if (ret) {
- dev_info(&pdev->dev, "Not enough M64 window resources\n");
- goto m64_failed;
- }
/*
- * When using one M64 BAR to map one IOV BAR, we need to shift
- * the IOV BAR according to the PE# allocated to the VFs.
- * Otherwise, the PE# for the VF will conflict with others.
+ * If we can't set up the IODA PE, something has gone horribly
+ * wrong and we can't enable DMA for the device.
*/
- if (!pdn->m64_single_mode) {
- ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
- if (ret)
- goto m64_failed;
- }
+ if (WARN_ON(!pe))
+ return;
+ } else {
+ pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
}
- /* Setup VF PEs */
- pnv_ioda_setup_vf_PE(pdev, num_vfs);
-
- return 0;
-
-m64_failed:
- if (pdn->m64_single_mode) {
- for (i = 0; i < num_vfs; i++) {
- if (pdn->pe_num_map[i] == IODA_INVALID_PE)
- continue;
-
- pe = &phb->ioda.pe_array[pdn->pe_num_map[i]];
- pnv_ioda_free_pe(pe);
- }
- } else
- bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
-
- /* Releasing pe_num_map */
- kfree(pdn->pe_num_map);
-
- return ret;
-}
-
-int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
-{
- pnv_pci_sriov_disable(pdev);
-
- /* Release PCI data */
- remove_dev_pci_data(pdev);
- return 0;
-}
-
-int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
-{
- /* Allocate PCI data */
- add_dev_pci_data(pdev);
-
- return pnv_pci_sriov_enable(pdev, num_vfs);
-}
-#endif /* CONFIG_PCI_IOV */
-
-static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
-{
- struct pci_dn *pdn = pci_get_pdn(pdev);
- struct pnv_ioda_pe *pe;
-
/*
- * The function can be called while the PE#
- * hasn't been assigned. Do nothing for the
- * case.
+ * We assume that bridges *probably* don't need to do any DMA, so we can
+ * skip allocating a TCE table, etc. unless we get a non-bridge device.
*/
- if (!pdn || pdn->pe_number == IODA_INVALID_PE)
- return;
+ if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
+ switch (phb->type) {
+ case PNV_PHB_IODA2:
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
+ break;
+ default:
+ pr_warn("%s: No DMA for PHB#%x (type %d)\n",
+ __func__, phb->hose->global_number, phb->type);
+ }
+ }
+
+ if (pdn)
+ pdn->pe_number = pe->pe_number;
+ pe->device_count++;
- pe = &phb->ioda.pe_array[pdn->pe_number];
WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
- set_dma_offset(&pdev->dev, pe->tce_bypass_base);
+ pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
- /*
- * Note: iommu_add_device() will fail here as
- * for physical PE: the device is already added by now;
- * for virtual PE: sysfs entries are not ready yet and
- * tce_iommu_bus_notifier will add the device to a group later.
- */
-}
-
-static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe)
-{
- unsigned short vendor = 0;
- struct pci_dev *pdev;
-
- if (pe->device_count == 1)
- return true;
-
- /* pe->pdev should be set if it's a single device, pe->pbus if not */
- if (!pe->pbus)
- return true;
-
- list_for_each_entry(pdev, &pe->pbus->devices, bus_list) {
- if (!vendor) {
- vendor = pdev->vendor;
- continue;
- }
-
- if (pdev->vendor != vendor)
- return false;
- }
- return true;
+ /* PEs with a DMA weight of zero won't have a group */
+ if (pe->table_group.group)
+ iommu_add_device(&pe->table_group, &pdev->dev);
}
/*
@@ -1850,235 +1113,79 @@ err:
return -EIO;
}
-static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev,
+ u64 dma_mask)
{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
- uint64_t top;
- bool bypass = false;
- s64 rc;
if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return -ENODEV;
+ return false;
pe = &phb->ioda.pe_array[pdn->pe_number];
if (pe->tce_bypass_enabled) {
- top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
- bypass = (dma_mask >= top);
- }
-
- if (bypass) {
- dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
- set_dma_ops(&pdev->dev, &dma_nommu_ops);
- } else {
- /*
- * If the device can't set the TCE bypass bit but still wants
- * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
- * bypass the 32-bit region and be usable for 64-bit DMAs.
- * The device needs to be able to address all of this space.
- */
- if (dma_mask >> 32 &&
- dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
- pnv_pci_ioda_pe_single_vendor(pe) &&
- phb->model == PNV_PHB_MODEL_PHB3) {
- /* Configure the bypass mode */
- rc = pnv_pci_ioda_dma_64bit_bypass(pe);
- if (rc)
- return rc;
- /* 4GB offset bypasses 32-bit space */
- set_dma_offset(&pdev->dev, (1ULL << 32));
- set_dma_ops(&pdev->dev, &dma_nommu_ops);
- } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) {
- /*
- * Fail the request if a DMA mask between 32 and 64 bits
- * was requested but couldn't be fulfilled. Ideally we
- * would do this for 64-bits but historically we have
- * always fallen back to 32-bits.
- */
- return -ENOMEM;
- } else {
- dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
- set_dma_ops(&pdev->dev, &dma_iommu_ops);
- }
- }
- *pdev->dev.dma_mask = dma_mask;
-
- /* Update peer npu devices */
- pnv_npu_try_dma_set_bypass(pdev, bypass);
-
- return 0;
-}
-
-static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
- struct pci_dn *pdn = pci_get_pdn(pdev);
- struct pnv_ioda_pe *pe;
- u64 end, mask;
-
- if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
- return 0;
-
- pe = &phb->ioda.pe_array[pdn->pe_number];
- if (!pe->tce_bypass_enabled)
- return __dma_get_required_mask(&pdev->dev);
-
-
- end = pe->tce_bypass_base + memblock_end_of_DRAM();
- mask = 1ULL << (fls64(end) - 1);
- mask += mask - 1;
-
- return mask;
-}
-
-static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
-{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
- set_dma_offset(&dev->dev, pe->tce_bypass_base);
-
- if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
- pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+ u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+ if (dma_mask >= top)
+ return true;
}
-}
-
-static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb,
- bool real_mode)
-{
- return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) :
- (phb->regs + 0x210);
-}
-
-static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl,
- unsigned long index, unsigned long npages, bool rm)
-{
- struct iommu_table_group_link *tgl = list_first_entry_or_null(
- &tbl->it_group_list, struct iommu_table_group_link,
- next);
- struct pnv_ioda_pe *pe = container_of(tgl->table_group,
- struct pnv_ioda_pe, table_group);
- __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
- unsigned long start, end, inc;
-
- start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset);
- end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset +
- npages - 1);
-
- /* p7ioc-style invalidation, 2 TCEs per write */
- start |= (1ull << 63);
- end |= (1ull << 63);
- inc = 16;
- end |= inc - 1; /* round up end to be different than start */
-
- mb(); /* Ensure above stores are visible */
- while (start <= end) {
- if (rm)
- __raw_rm_writeq_be(start, invalidate);
- else
- __raw_writeq_be(start, invalidate);
-
- start += inc;
- }
/*
- * The iommu layer will do another mb() for us on build()
- * and we don't care on free()
+ * If the device can't set the TCE bypass bit but still wants
+ * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to
+ * bypass the 32-bit region and be usable for 64-bit DMAs.
+ * The device needs to be able to address all of this space.
*/
-}
-
-static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index,
- long npages, unsigned long uaddr,
- enum dma_data_direction direction,
- unsigned long attrs)
-{
- int ret = pnv_tce_build(tbl, index, npages, uaddr, direction,
- attrs);
-
- if (!ret)
- pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
+ if (dma_mask >> 32 &&
+ dma_mask > (memory_hotplug_max() + (1ULL << 32)) &&
+ /* pe->pdev should be set if it's a single device, pe->pbus if not */
+ (pe->device_count == 1 || !pe->pbus) &&
+ phb->model == PNV_PHB_MODEL_PHB3) {
+ /* Configure the bypass mode */
+ s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe);
+ if (rc)
+ return false;
+ /* 4GB offset bypasses 32-bit space */
+ pdev->dev.archdata.dma_offset = (1ULL << 32);
+ return true;
+ }
- return ret;
+ return false;
}
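/*
 * Illustrative sketch, addresses and sizes assumed: on a PHB3 with 32GB
 * of RAM and the usual tce_bypass_base of 1ull << 59,
 *
 *  - DMA_BIT_MASK(64) covers tce_bypass_base + end of DRAM, so the device
 *    direct-maps all memory through the 64-bit bypass window;
 *  - a 40-bit mask misses the high bypass window, but if it exceeds
 *    memory_hotplug_max() + 4GB (assumed well below 1TB here) and the PE
 *    holds a single device, TVE#0 is reconfigured for 64-bit bypass and
 *    dma_offset becomes 4GB;
 *  - a 32-bit mask returns false and the device stays behind the default
 *    32-bit TCE table.
 */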
-#ifdef CONFIG_IOMMU_API
-static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
+static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb)
{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
- if (!ret)
- pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false);
-
- return ret;
+ return phb->regs + 0x210;
}
-static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
+#ifdef CONFIG_IOMMU_API
+/* Common for IODA1 and IODA2 */
+static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index,
unsigned long *hpa, enum dma_data_direction *direction)
{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
- if (!ret)
- pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
-
- return ret;
-}
-#endif
-
-static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
- long npages)
-{
- pnv_tce_free(tbl, index, npages);
-
- pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false);
+ return pnv_tce_xchg(tbl, index, hpa, direction);
}
-
-static struct iommu_table_ops pnv_ioda1_iommu_ops = {
- .set = pnv_ioda1_tce_build,
-#ifdef CONFIG_IOMMU_API
- .exchange = pnv_ioda1_tce_xchg,
- .exchange_rm = pnv_ioda1_tce_xchg_rm,
- .useraddrptr = pnv_tce_useraddrptr,
#endif
- .clear = pnv_ioda1_tce_free,
- .get = pnv_tce_get,
-};
#define PHB3_TCE_KILL_INVAL_ALL PPC_BIT(0)
#define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1)
#define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2)
-static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
-{
- __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
- const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
-
- mb(); /* Ensure previous TCE table stores are visible */
- if (rm)
- __raw_rm_writeq_be(val, invalidate);
- else
- __raw_writeq_be(val, invalidate);
-}
-
static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe)
{
/* 01xb - invalidate TCEs that match the specified PE# */
- __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false);
+ __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF);
mb(); /* Ensure above stores are visible */
__raw_writeq_be(val, invalidate);
}
-static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
+static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe,
unsigned shift, unsigned long index,
unsigned long npages)
{
- __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm);
+ __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb);
unsigned long start, end, inc;
/* We'll invalidate DMA address in PE scope */
@@ -2093,10 +1200,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm,
mb();
while (start <= end) {
- if (rm)
- __raw_rm_writeq_be(start, invalidate);
- else
- __raw_writeq_be(start, invalidate);
+ __raw_writeq_be(start, invalidate);
start += inc;
}
}
@@ -2113,7 +1217,7 @@ static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe)
}
static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
- unsigned long index, unsigned long npages, bool rm)
+ unsigned long index, unsigned long npages)
{
struct iommu_table_group_link *tgl;
@@ -2123,22 +1227,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
struct pnv_phb *phb = pe->phb;
unsigned int shift = tbl->it_page_shift;
- /*
- * NVLink1 can use the TCE kill register directly as
- * it's the same as PHB3. NVLink2 is different and
- * should go via the OPAL call.
- */
- if (phb->model == PNV_PHB_MODEL_NPU) {
- /*
- * The NVLink hardware does not support TCE kill
- * per TCE entry so we have to invalidate
- * the entire cache for it.
- */
- pnv_pci_phb3_tce_invalidate_entire(phb, rm);
- continue;
- }
if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs)
- pnv_pci_phb3_tce_invalidate(pe, rm, shift,
+ pnv_pci_phb3_tce_invalidate(pe, shift,
index, npages);
else
opal_pci_tce_kill(phb->opal_id,
@@ -2148,14 +1238,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
}
}
-void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
-{
- if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
- pnv_pci_phb3_tce_invalidate_entire(phb, rm);
- else
- opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
-}
-
static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
long npages, unsigned long uaddr,
enum dma_data_direction direction,
@@ -2165,48 +1247,24 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
attrs);
if (!ret)
- pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+ pnv_pci_ioda2_tce_invalidate(tbl, index, npages);
return ret;
}
-#ifdef CONFIG_IOMMU_API
-static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
-{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, true);
-
- if (!ret)
- pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false);
-
- return ret;
-}
-
-static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction)
-{
- long ret = pnv_tce_xchg(tbl, index, hpa, direction, false);
-
- if (!ret)
- pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
-
- return ret;
-}
-#endif
-
static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
long npages)
{
pnv_tce_free(tbl, index, npages);
- pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false);
+ pnv_pci_ioda2_tce_invalidate(tbl, index, npages);
}
static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.set = pnv_ioda2_tce_build,
#ifdef CONFIG_IOMMU_API
- .exchange = pnv_ioda2_tce_xchg,
- .exchange_rm = pnv_ioda2_tce_xchg_rm,
+ .xchg_no_kill = pnv_ioda_tce_xchg_no_kill,
+ .tce_kill = pnv_pci_ioda2_tce_invalidate,
.useraddrptr = pnv_tce_useraddrptr,
#endif
.clear = pnv_ioda2_tce_free,
@@ -2214,178 +1272,6 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.free = pnv_pci_ioda2_table_free_pages,
};
-static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data)
-{
- unsigned int *weight = (unsigned int *)data;
-
- /* This is quite simplistic. The "base" weight of a device
- * is 10. 0 means no DMA is to be accounted for it.
- */
- if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL)
- return 0;
-
- if (dev->class == PCI_CLASS_SERIAL_USB_UHCI ||
- dev->class == PCI_CLASS_SERIAL_USB_OHCI ||
- dev->class == PCI_CLASS_SERIAL_USB_EHCI)
- *weight += 3;
- else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID)
- *weight += 15;
- else
- *weight += 10;
-
- return 0;
-}
-
-static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe)
-{
- unsigned int weight = 0;
-
- /* SRIOV VF has same DMA32 weight as its PF */
-#ifdef CONFIG_PCI_IOV
- if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) {
- pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight);
- return weight;
- }
-#endif
-
- if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) {
- pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight);
- } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) {
- struct pci_dev *pdev;
-
- list_for_each_entry(pdev, &pe->pbus->devices, bus_list)
- pnv_pci_ioda_dev_dma_weight(pdev, &weight);
- } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) {
- pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight);
- }
-
- return weight;
-}
-
-static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
- struct pnv_ioda_pe *pe)
-{
-
- struct page *tce_mem = NULL;
- struct iommu_table *tbl;
- unsigned int weight, total_weight = 0;
- unsigned int tce32_segsz, base, segs, avail, i;
- int64_t rc;
- void *addr;
-
- /* XXX FIXME: Handle 64-bit only DMA devices */
- /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */
- /* XXX FIXME: Allocate multi-level tables on PHB3 */
- weight = pnv_pci_ioda_pe_dma_weight(pe);
- if (!weight)
- return;
-
- pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight,
- &total_weight);
- segs = (weight * phb->ioda.dma32_count) / total_weight;
- if (!segs)
- segs = 1;
-
- /*
- * Allocate contiguous DMA32 segments. We begin with the expected
- * number of segments. With one more attempt, the number of DMA32
- * segments to be allocated is decreased by one until one segment
- * is allocated successfully.
- */
- do {
- for (base = 0; base <= phb->ioda.dma32_count - segs; base++) {
- for (avail = 0, i = base; i < base + segs; i++) {
- if (phb->ioda.dma32_segmap[i] ==
- IODA_INVALID_PE)
- avail++;
- }
-
- if (avail == segs)
- goto found;
- }
- } while (--segs);
-
- if (!segs) {
- pe_warn(pe, "No available DMA32 segments\n");
- return;
- }
-
-found:
- tbl = pnv_pci_table_alloc(phb->hose->node);
- if (WARN_ON(!tbl))
- return;
-
- iommu_register_group(&pe->table_group, phb->hose->global_number,
- pe->pe_number);
- pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
-
- /* Grab a 32-bit TCE table */
- pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n",
- weight, total_weight, base, segs);
- pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n",
- base * PNV_IODA1_DMA32_SEGSIZE,
- (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1);
-
- /* XXX Currently, we allocate one big contiguous table for the
- * TCEs. We only really need one chunk per 256M of TCE space
- * (ie per segment) but that's an optimization for later, it
- * requires some added smarts with our get/put_tce implementation
- *
- * Each TCE page is 4KB in size and each TCE entry occupies 8
- * bytes
- */
- tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3);
- tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
- get_order(tce32_segsz * segs));
- if (!tce_mem) {
- pe_err(pe, " Failed to allocate a 32-bit TCE memory\n");
- goto fail;
- }
- addr = page_address(tce_mem);
- memset(addr, 0, tce32_segsz * segs);
-
- /* Configure HW */
- for (i = 0; i < segs; i++) {
- rc = opal_pci_map_pe_dma_window(phb->opal_id,
- pe->pe_number,
- base + i, 1,
- __pa(addr) + tce32_segsz * i,
- tce32_segsz, IOMMU_PAGE_SIZE_4K);
- if (rc) {
- pe_err(pe, " Failed to configure 32-bit TCE table,"
- " err %ld\n", rc);
- goto fail;
- }
- }
-
- /* Setup DMA32 segment mapping */
- for (i = base; i < base + segs; i++)
- phb->ioda.dma32_segmap[i] = pe->pe_number;
-
- /* Setup linux iommu table */
- pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs,
- base * PNV_IODA1_DMA32_SEGSIZE,
- IOMMU_PAGE_SHIFT_4K);
-
- tbl->it_ops = &pnv_ioda1_iommu_ops;
- pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift;
- pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
- iommu_init_table(tbl, phb->hose->node);
-
- if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
- pnv_ioda_setup_bus_dma(pe, pe->pbus);
-
- return;
- fail:
- /* XXX Failure: Try to fallback to 64-bit only ? */
- if (tce_mem)
- __free_pages(tce_mem, get_order(tce32_segsz * segs));
- if (tbl) {
- pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
- iommu_tce_table_put(tbl);
- }
-}
-
static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
int num, struct iommu_table *tbl)
{
@@ -2398,9 +1284,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
const __u64 win_size = tbl->it_size << tbl->it_page_shift;
- pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num,
- start_addr, start_addr + win_size - 1,
- IOMMU_PAGE_SIZE(tbl));
+ pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n",
+ num, start_addr, start_addr + win_size - 1,
+ IOMMU_PAGE_SIZE(tbl));
/*
* Map TCE table through TVT. The TVE index is the PE number
@@ -2414,7 +1300,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
size << 3,
IOMMU_PAGE_SIZE(tbl));
if (rc) {
- pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+ pe_err(pe, "Failed to configure TCE table, err %lld\n", rc);
return rc;
}
@@ -2425,7 +1311,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
return 0;
}
-void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
+static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
{
uint16_t window_id = (pe->pe_number << 1 ) + 1;
int64_t rc;
@@ -2487,6 +1373,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
{
struct iommu_table *tbl = NULL;
long rc;
+ unsigned long res_start, res_end;
/*
* crashkernel= specifies the kdump kernel's maximum memory at
@@ -2500,35 +1387,70 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
* DMA window can be larger than available memory, which will
* cause errors later.
*/
- const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory);
+ const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_PAGE_ORDER);
- rc = pnv_pci_ioda2_create_table(&pe->table_group, 0,
- IOMMU_PAGE_SHIFT_4K,
- window_size,
- POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl);
+ /*
+ * We create the default window as big as we can. The constraint is
+ * the max order of allocation possible. The TCE table is likely to
+ * end up being multilevel and with on-demand allocation in place,
+ * the initial use is not going to be huge as the default window aims
+ * to support crippled devices (i.e. not fully 64bit DMAble) only.
+ */
+ /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */
+ const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory);
+ /* Each TCE level cannot exceed maxblock so go multilevel if needed */
+ unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT);
+ unsigned long tcelevel_order = ilog2(maxblock >> 3);
+ unsigned int levels = tces_order / tcelevel_order;
+
+ if (tces_order % tcelevel_order)
+ levels += 1;
+ /*
+ * We try to stick to default levels (which is >1 at the moment) in
+ * order to save memory by relying on on-demand TCE level allocation.
+ */
+ levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS);
+
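+ /*
+ * Worked example, all values assumed: with 64K pages (PAGE_SHIFT = 16),
+ * MAX_PAGE_ORDER = 8 and 2TB of installed memory:
+ *
+ *   maxblock       = 1 << (16 + 8)                  = 16MB per allocation
+ *   window_size    = min(16MB * 8 << 16, 2TB)       = 2TB
+ *   tces_order     = ilog2(2TB >> 16)               = 25  (32M TCEs)
+ *   tcelevel_order = ilog2(16MB >> 3)               = 21  (2M TCEs per block)
+ *   levels         = 25 / 21, +1 for the remainder  = 2
+ *
+ * Assuming POWERNV_IOMMU_DEFAULT_LEVELS is 2, the max_t() below leaves
+ * this at 2: a 16-entry top level pointing at up to sixteen 16MB leaf
+ * blocks, which are only allocated once the window is actually mapped.
+ */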
+ rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT,
+ window_size, levels, false, &tbl);
if (rc) {
pe_err(pe, "Failed to create 32-bit TCE table, err %ld",
rc);
return rc;
}
- iommu_init_table(tbl, pe->phb->hose->node);
+ /* We use top part of 32bit space for MMIO so exclude it from DMA */
+ res_start = 0;
+ res_end = 0;
+ if (window_size > pe->phb->ioda.m32_pci_base) {
+ res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift;
+ res_end = min(window_size, SZ_4G) >> tbl->it_page_shift;
+ }
- rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+ tbl->it_index = (pe->phb->hose->global_number << 16) | pe->pe_number;
+ if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end))
+ rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
+ else
+ rc = -ENOMEM;
if (rc) {
- pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
- rc);
+ pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc);
iommu_tce_table_put(tbl);
- return rc;
+ tbl = NULL; /* This clears iommu_table_base below */
}
-
if (!pnv_iommu_bypass_disabled)
pnv_pci_ioda2_set_bypass(pe, true);
+ /*
+ * Set table base for the case of IOMMU DMA use. Usually this is done
+ * from dma_dev_setup() which is not called when a device is returned
+ * from VFIO so do it here.
+ */
+ if (pe->pdev)
+ set_iommu_table_base(&pe->pdev->dev, tbl);
+
return 0;
}
-#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV)
static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
int num)
{
@@ -2552,7 +1474,6 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group,
return ret;
}
-#endif
#ifdef CONFIG_IOMMU_API
unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
@@ -2576,7 +1497,7 @@ unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
direct_table_size = 1UL << table_shift;
for ( ; levels; --levels) {
- bytes += _ALIGN_UP(tce_table_size, direct_table_size);
+ bytes += ALIGN(tce_table_size, direct_table_size);
tce_table_size /= direct_table_size;
tce_table_size <<= 3;
@@ -2592,29 +1513,63 @@ static long pnv_pci_ioda2_create_table_userspace(
int num, __u32 page_shift, __u64 window_size, __u32 levels,
struct iommu_table **ptbl)
{
- return pnv_pci_ioda2_create_table(table_group,
+ long ret = pnv_pci_ioda2_create_table(table_group,
num, page_shift, window_size, levels, true, ptbl);
+
+ if (!ret)
+ (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size(
+ page_shift, window_size, levels);
+ return ret;
}
-static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+{
+ struct pci_dev *dev;
+
+ list_for_each_entry(dev, &bus->devices, bus_list) {
+ set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
+ dev->dev.archdata.dma_offset = pe->tce_bypass_base;
+
+ if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+ pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+ }
+}
+
+static long pnv_ioda2_take_ownership(struct iommu_table_group *table_group,
+ struct device *dev __maybe_unused)
{
struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
table_group);
/* Store @tbl as pnv_pci_ioda2_unset_window() resets it */
struct iommu_table *tbl = pe->table_group.tables[0];
+ /*
+ * iommu_ops transfers ownership per device, but we only switch the
+ * group ownership once, with the first device in the group.
+ */
+ if (!tbl)
+ return 0;
+
pnv_pci_ioda2_set_bypass(pe, false);
pnv_pci_ioda2_unset_window(&pe->table_group, 0);
if (pe->pbus)
pnv_ioda_setup_bus_dma(pe, pe->pbus);
+ else if (pe->pdev)
+ set_iommu_table_base(&pe->pdev->dev, NULL);
iommu_tce_table_put(tbl);
+
+ return 0;
}
-static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
+static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group,
+ struct device *dev __maybe_unused)
{
struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
table_group);
+ /* See the comment about iommu_ops above */
+ if (pe->table_group.tables[0])
+ return;
pnv_pci_ioda2_setup_default_config(pe);
if (pe->pbus)
pnv_ioda_setup_bus_dma(pe, pe->pbus);
@@ -2628,145 +1583,13 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
.take_ownership = pnv_ioda2_take_ownership,
.release_ownership = pnv_ioda2_release_ownership,
};
-
-static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
- struct iommu_table_group *table_group,
- struct pci_bus *bus)
-{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- iommu_add_device(table_group, &dev->dev);
-
- if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
- pnv_ioda_setup_bus_iommu_group_add_devices(pe,
- table_group, dev->subordinate);
- }
-}
-
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
- struct iommu_table_group *table_group, struct pci_bus *bus)
-{
-
- if (pe->flags & PNV_IODA_PE_DEV)
- iommu_add_device(table_group, &pe->pdev->dev);
-
- if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
- pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
- bus);
-}
-
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
-
-static void pnv_pci_ioda_setup_iommu_api(void)
-{
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
-
- /*
- * There are 4 types of PEs:
- * - PNV_IODA_PE_BUS: a downstream port with an adapter,
- * created from pnv_pci_setup_bridge();
- * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
- * created from pnv_pci_setup_bridge();
- * - PNV_IODA_PE_VF: a SRIOV virtual function,
- * created from pnv_pcibios_sriov_enable();
- * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
- * created from pnv_pci_ioda_fixup().
- *
- * Normally a PE is represented by an IOMMU group, however for
- * devices with side channels the groups need to be more strict.
- */
- list_for_each_entry(hose, &hose_list, list_node) {
- phb = hose->private_data;
-
- if (phb->type == PNV_PHB_NPU_NVLINK ||
- phb->type == PNV_PHB_NPU_OCAPI)
- continue;
-
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- struct iommu_table_group *table_group;
-
- table_group = pnv_try_setup_npu_table_group(pe);
- if (!table_group) {
- if (!pnv_pci_ioda_pe_dma_weight(pe))
- continue;
-
- table_group = &pe->table_group;
- iommu_register_group(&pe->table_group,
- pe->phb->hose->global_number,
- pe->pe_number);
- }
- pnv_ioda_setup_bus_iommu_group(pe, table_group,
- pe->pbus);
- }
- }
-
- /*
- * Now we have all PHBs discovered, time to add NPU devices to
- * the corresponding IOMMU groups.
- */
- list_for_each_entry(hose, &hose_list, list_node) {
- unsigned long pgsizes;
-
- phb = hose->private_data;
-
- if (phb->type != PNV_PHB_NPU_NVLINK)
- continue;
-
- pgsizes = pnv_ioda_parse_tce_sizes(phb);
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- /*
- * IODA2 bridges get this set up from
- * pci_controller_ops::setup_bridge but NPU bridges
- * do not have this hook defined so we do it here.
- */
- pe->table_group.pgsizes = pgsizes;
- pnv_npu_compound_attach(pe);
- }
- }
-}
-#else /* !CONFIG_IOMMU_API */
-static void pnv_pci_ioda_setup_iommu_api(void) { };
#endif
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
-{
- struct pci_controller *hose = phb->hose;
- struct device_node *dn = hose->dn;
- unsigned long mask = 0;
- int i, rc, count;
- u32 val;
-
- count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
- if (count <= 0) {
- mask = SZ_4K | SZ_64K;
- /* Add 16M for POWER8 by default */
- if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
- !cpu_has_feature(CPU_FTR_ARCH_300))
- mask |= SZ_16M | SZ_256M;
- return mask;
- }
-
- for (i = 0; i < count; i++) {
- rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
- i, &val);
- if (rc == 0)
- mask |= 1ULL << val;
- }
-
- return mask;
-}
-
-static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
- struct pnv_ioda_pe *pe)
+void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+ struct pnv_ioda_pe *pe)
{
int64_t rc;
- if (!pnv_pci_ioda_pe_dma_weight(pe))
- return;
-
/* TVE #1 is selected by PCI address bit 59 */
pe->tce_bypass_base = 1ull << 59;
@@ -2781,61 +1604,37 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
IOMMU_TABLE_GROUP_MAX_TABLES;
pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
-#ifdef CONFIG_IOMMU_API
- pe->table_group.ops = &pnv_pci_ioda2_ops;
-#endif
rc = pnv_pci_ioda2_setup_default_config(pe);
if (rc)
return;
- if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
- pnv_ioda_setup_bus_dma(pe, pe->pbus);
-}
-
-int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
-{
- struct pnv_phb *phb = container_of(chip, struct pnv_phb,
- ioda.irq_chip);
-
- return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+#ifdef CONFIG_IOMMU_API
+ pe->table_group.ops = &pnv_pci_ioda2_ops;
+ iommu_register_group(&pe->table_group, phb->hose->global_number,
+ pe->pe_number);
+#endif
+ pe->dma_setup_done = true;
}
-static void pnv_ioda2_msi_eoi(struct irq_data *d)
+/*
+ * Called from KVM in real mode to EOI passthru interrupts. The ICP
+ * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru().
+ *
+ * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call
+ * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ
+ * numbers of the in-the-middle MSI domain are vector numbers, which is
+ * good enough for OPAL. Use that.
+ */
+int64_t pnv_opal_pci_msi_eoi(struct irq_data *d)
{
- int64_t rc;
- unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
- struct irq_chip *chip = irq_data_get_irq_chip(d);
-
- rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
- WARN_ON_ONCE(rc);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d->parent_data);
+ struct pnv_phb *phb = hose->private_data;
- icp_native_eoi(d);
+ return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq);
}
-
-void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
-{
- struct irq_data *idata;
- struct irq_chip *ichip;
-
- /* The MSI EOI OPAL call is only needed on PHB3 */
- if (phb->model != PNV_PHB_MODEL_PHB3)
- return;
-
- if (!phb->ioda.irq_chip_init) {
- /*
- * First time we setup an MSI IRQ, we need to setup the
- * corresponding IRQ chip to route correctly.
- */
- idata = irq_get_irq_data(virq);
- ichip = irq_data_get_irq_chip(idata);
- phb->ioda.irq_chip_init = 1;
- phb->ioda.irq_chip = *ichip;
- phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
- }
- irq_set_chip(virq, &phb->ioda.irq_chip);
-}
+static struct irq_chip pnv_pci_msi_irq_chip;
/*
* Returns true iff chip is something that we could call
@@ -2843,19 +1642,21 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq)
*/
bool is_pnv_opal_msi(struct irq_chip *chip)
{
- return chip->irq_eoi == pnv_ioda2_msi_eoi;
+ return chip == &pnv_pci_msi_irq_chip;
}
EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
-static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg)
+static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
+ unsigned int xive_num,
+ unsigned int is_64, struct msi_msg *msg)
{
struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
- unsigned int xive_num = hwirq - phb->msi_base;
__be32 data;
int rc;
+ dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__,
+ is_64 ? "64" : "32", xive_num);
+
/* No PE assigned ? bail out ... no MSI for you ! */
if (pe == NULL)
return -ENXIO;
@@ -2903,17 +1704,188 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
}
msg->data = be32_to_cpu(data);
- pnv_set_msi_irq_chip(phb, virq);
+ return 0;
+}
+
+static void pnv_msi_shutdown(struct irq_data *d)
+{
+ d = d->parent_data;
+ if (d->chip->irq_shutdown)
+ d->chip->irq_shutdown(d);
+}
+
+static bool pnv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+ struct irq_chip *chip = info->chip;
+
+ if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+ return false;
+
+ chip->irq_shutdown = pnv_msi_shutdown;
+ return true;
+}
+
+#define PNV_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \
+ MSI_FLAG_USE_DEF_CHIP_OPS | \
+ MSI_FLAG_PCI_MSI_MASK_PARENT)
+#define PNV_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \
+ MSI_FLAG_PCI_MSIX | \
+ MSI_FLAG_MULTI_PCI_MSI)
+
+static const struct msi_parent_ops pnv_msi_parent_ops = {
+ .required_flags = PNV_PCI_MSI_FLAGS_REQUIRED,
+ .supported_flags = PNV_PCI_MSI_FLAGS_SUPPORTED,
+ .chip_flags = MSI_CHIP_FLAG_SET_EOI,
+ .bus_select_token = DOMAIN_BUS_NEXUS,
+ .bus_select_mask = MATCH_PCI_MSI,
+ .prefix = "PNV-",
+ .init_dev_msi_info = pnv_init_dev_msi_info,
+};
+
+static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg)
+{
+ struct msi_desc *entry = irq_data_get_msi_desc(d);
+ struct pci_dev *pdev = msi_desc_to_pci_dev(entry);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
+ int rc;
- pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
- " address=%x_%08x data=%x PE# %x\n",
- pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
- msg->address_hi, msg->address_lo, data, pe->pe_number);
+ rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq,
+ entry->pci.msi_attrib.is_64, msg);
+ if (rc)
+ dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n",
+ entry->pci.msi_attrib.is_64 ? "64" : "32", d->hwirq, rc);
+}
+
+/*
+ * The IRQ data is mapped in the MSI domain in which HW IRQ numbers
+ * correspond to vector numbers.
+ */
+static void pnv_msi_eoi(struct irq_data *d)
+{
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
+
+ if (phb->model == PNV_PHB_MODEL_PHB3) {
+ /*
+ * The EOI OPAL call takes an OPAL HW IRQ number but
+ * since it is translated into a vector number in
+ * OPAL, use that directly.
+ */
+ WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq));
+ }
+
+ irq_chip_eoi_parent(d);
+}
+
+static struct irq_chip pnv_msi_irq_chip = {
+ .name = "PNV-MSI",
+ .irq_shutdown = pnv_msi_shutdown,
+ .irq_mask = irq_chip_mask_parent,
+ .irq_unmask = irq_chip_unmask_parent,
+ .irq_eoi = pnv_msi_eoi,
+ .irq_set_affinity = irq_chip_set_affinity_parent,
+ .irq_compose_msi_msg = pnv_msi_compose_msg,
+};
+
+static int pnv_irq_parent_domain_alloc(struct irq_domain *domain,
+ unsigned int virq, int hwirq)
+{
+ struct irq_fwspec parent_fwspec;
+ int ret;
+
+ parent_fwspec.fwnode = domain->parent->fwnode;
+ parent_fwspec.param_count = 2;
+ parent_fwspec.param[0] = hwirq;
+ parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING;
+
+ ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs, void *arg)
+{
+ struct pci_controller *hose = domain->host_data;
+ struct pnv_phb *phb = hose->private_data;
+ msi_alloc_info_t *info = arg;
+ struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc);
+ int hwirq;
+ int i, ret;
+
+ hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs);
+ if (hwirq < 0) {
+ dev_warn(&pdev->dev, "failed to find a free MSI\n");
+ return -ENOSPC;
+ }
+
+ dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__,
+ hose->dn, virq, hwirq, nr_irqs);
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = pnv_irq_parent_domain_alloc(domain, virq + i,
+ phb->msi_base + hwirq + i);
+ if (ret)
+ goto out;
+
+ irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i,
+ &pnv_msi_irq_chip, hose);
+ }
+
+ return 0;
+
+out:
+ irq_domain_free_irqs_parent(domain, virq, i);
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs);
+ return ret;
+}
+
+static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq,
+ unsigned int nr_irqs)
+{
+ struct irq_data *d = irq_domain_get_irq_data(domain, virq);
+ struct pci_controller *hose = irq_data_get_irq_chip_data(d);
+ struct pnv_phb *phb = hose->private_data;
+
+ pr_debug("%s bridge %pOF %d/%lx #%d\n", __func__, hose->dn,
+ virq, d->hwirq, nr_irqs);
+
+ msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs);
+ irq_domain_free_irqs_parent(domain, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops pnv_irq_domain_ops = {
+ .select = msi_lib_irq_domain_select,
+ .alloc = pnv_irq_domain_alloc,
+ .free = pnv_irq_domain_free,
+};
+
+static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count)
+{
+ struct irq_domain *parent = irq_get_default_domain();
+ struct irq_domain_info info = {
+ .fwnode = of_fwnode_handle(hose->dn),
+ .ops = &pnv_irq_domain_ops,
+ .host_data = hose,
+ .size = count,
+ .parent = parent,
+ };
+
+ hose->dev_domain = msi_create_parent_irq_domain(&info, &pnv_msi_parent_ops);
+ if (!hose->dev_domain) {
+ pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n",
+ hose->dn, hose->global_number);
+ return -ENOMEM;
+ }
return 0;
}
-static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
+static void __init pnv_pci_init_ioda_msis(struct pnv_phb *phb)
{
unsigned int count;
const __be32 *prop = of_get_property(phb->hose->dn,
@@ -2933,102 +1905,11 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
return;
}
- phb->msi_setup = pnv_pci_ioda_msi_setup;
- phb->msi32_support = 1;
pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
count, phb->msi_base);
-}
-
-#ifdef CONFIG_PCI_IOV
-static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
- const resource_size_t gate = phb->ioda.m64_segsize >> 2;
- struct resource *res;
- int i;
- resource_size_t size, total_vf_bar_sz;
- struct pci_dn *pdn;
- int mul, total_vfs;
-
- if (!pdev->is_physfn || pci_dev_is_added(pdev))
- return;
-
- pdn = pci_get_pdn(pdev);
- pdn->vfs_expanded = 0;
- pdn->m64_single_mode = false;
-
- total_vfs = pci_sriov_get_totalvfs(pdev);
- mul = phb->ioda.total_pe_num;
- total_vf_bar_sz = 0;
-
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || res->parent)
- continue;
- if (!pnv_pci_is_m64_flags(res->flags)) {
- dev_warn(&pdev->dev, "Don't support SR-IOV with"
- " non M64 VF BAR%d: %pR. \n",
- i, res);
- goto truncate_iov;
- }
-
- total_vf_bar_sz += pci_iov_resource_size(pdev,
- i + PCI_IOV_RESOURCES);
-
- /*
- * If bigger than quarter of M64 segment size, just round up
- * power of two.
- *
- * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
- * with other devices, IOV BAR size is expanded to be
- * (total_pe * VF_BAR_size). When VF_BAR_size is half of M64
- * segment size , the expanded size would equal to half of the
- * whole M64 space size, which will exhaust the M64 Space and
- * limit the system flexibility. This is a design decision to
- * set the boundary to quarter of the M64 segment size.
- */
- if (total_vf_bar_sz > gate) {
- mul = roundup_pow_of_two(total_vfs);
- dev_info(&pdev->dev,
- "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
- total_vf_bar_sz, gate, mul);
- pdn->m64_single_mode = true;
- break;
- }
- }
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
- if (!res->flags || res->parent)
- continue;
-
- size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
- /*
- * On PHB3, the minimum size alignment of M64 BAR in single
- * mode is 32MB.
- */
- if (pdn->m64_single_mode && (size < SZ_32M))
- goto truncate_iov;
- dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
- res->end = res->start + size * mul - 1;
- dev_dbg(&pdev->dev, " %pR\n", res);
- dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
- i, res, mul);
- }
- pdn->vfs_expanded = mul;
-
- return;
-
-truncate_iov:
- /* To save MMIO space, IOV BAR is truncated. */
- for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
- res = &pdev->resource[i + PCI_IOV_RESOURCES];
- res->flags = 0;
- res->end = res->start - 1;
- }
+ pnv_msi_allocate_domains(phb->hose, count);
}
-#endif /* CONFIG_PCI_IOV */
static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
struct resource *res)
@@ -3038,7 +1919,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
int index;
int64_t rc;
- if (!res || !res->flags || res->start > res->end)
+ if (!res || !res->flags || res->start > res->end ||
+ res->flags & IORESOURCE_UNSET)
return;
if (res->flags & IORESOURCE_IO) {
@@ -3089,7 +1971,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe,
/*
* This function is supposed to be called on basis of PE from top
- * to bottom style. So the the I/O or MMIO segment assigned to
+ * to bottom style. So the I/O or MMIO segment assigned to
* parent PE could be overridden by its child PEs if necessary.
*/
static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
@@ -3124,19 +2006,9 @@ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe)
#ifdef CONFIG_DEBUG_FS
static int pnv_pci_diag_data_set(void *data, u64 val)
{
- struct pci_controller *hose;
- struct pnv_phb *phb;
+ struct pnv_phb *phb = data;
s64 ret;
- if (val != 1ULL)
- return -EINVAL;
-
- hose = (struct pci_controller *)data;
- if (!hose || !hose->private_data)
- return -ENODEV;
-
- phb = hose->private_data;
-
/* Retrieve the diag data from firmware */
ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data,
phb->diag_data_size);
@@ -3148,8 +2020,35 @@ static int pnv_pci_diag_data_set(void *data, u64 val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL,
- pnv_pci_diag_data_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set,
+ "%llu\n");
+
+static int pnv_pci_ioda_pe_dump(void *data, u64 val)
+{
+ struct pnv_phb *phb = data;
+ int pe_num;
+
+ for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) {
+ struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num];
+
+ if (!test_bit(pe_num, phb->ioda.pe_alloc))
+ continue;
+
+ pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n",
+ pe->rid, pe->device_count,
+ (pe->flags & PNV_IODA_PE_DEV) ? "dev " : "",
+ (pe->flags & PNV_IODA_PE_BUS) ? "bus " : "",
+ (pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "",
+ (pe->flags & PNV_IODA_PE_MASTER) ? "master " : "",
+ (pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "",
+ (pe->flags & PNV_IODA_PE_VF) ? "vf " : "");
+ }
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL,
+ pnv_pci_ioda_pe_dump, "%llu\n");
#endif /* CONFIG_DEBUG_FS */
@@ -3163,19 +2062,13 @@ static void pnv_pci_ioda_create_dbgfs(void)
list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
phb = hose->private_data;
- /* Notify initialization of PHB done */
- phb->initialized = 1;
-
sprintf(name, "PCI%04x", hose->global_number);
- phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
- if (!phb->dbgfs) {
- pr_warn("%s: Error on creating debugfs on PHB#%x\n",
- __func__, hose->global_number);
- continue;
- }
+ phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir);
- debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose,
- &pnv_pci_diag_data_fops);
+ debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs,
+ phb, &pnv_pci_diag_data_fops);
+ debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs,
+ phb, &pnv_pci_ioda_pe_dump_fops);
}
#endif /* CONFIG_DEBUG_FS */
}
@@ -3217,8 +2110,6 @@ static void pnv_pci_enable_bridges(void)
static void pnv_pci_ioda_fixup(void)
{
- pnv_pci_ioda_setup_PEs();
- pnv_pci_ioda_setup_iommu_api();
pnv_pci_ioda_create_dbgfs();
pnv_pci_enable_bridges();
@@ -3243,10 +2134,9 @@ static void pnv_pci_ioda_fixup(void)
static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
unsigned long type)
{
- struct pci_dev *bridge;
- struct pci_controller *hose = pci_bus_to_host(bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
int num_pci_bridges = 0;
+ struct pci_dev *bridge;
bridge = bus->self;
while (bridge) {
@@ -3330,28 +2220,16 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
}
}
-static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
+static void pnv_pci_configure_bus(struct pci_bus *bus)
{
- struct pci_controller *hose = pci_bus_to_host(bus);
- struct pnv_phb *phb = hose->private_data;
struct pci_dev *bridge = bus->self;
struct pnv_ioda_pe *pe;
- bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
-
- /* Extend bridge's windows if necessary */
- pnv_pci_fixup_bridge_resources(bus, type);
+ bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
- /* The PE for root bus should be realized before any one else */
- if (!phb->ioda.root_pe_populated) {
- pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
- if (pe) {
- phb->ioda.root_pe_idx = pe->pe_number;
- phb->ioda.root_pe_populated = true;
- }
- }
+ dev_info(&bus->dev, "Configuring PE for bus\n");
/* Don't assign PE to PCI bus, which doesn't have subordinate devices */
- if (list_empty(&bus->devices))
+ if (WARN_ON(list_empty(&bus->devices)))
return;
/* Reserve PEs according to used M64 resources */
@@ -3367,17 +2245,6 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
return;
pnv_ioda_setup_pe_seg(pe);
- switch (phb->type) {
- case PNV_PHB_IODA1:
- pnv_pci_ioda1_setup_dma_pe(phb, pe);
- break;
- case PNV_PHB_IODA2:
- pnv_pci_ioda2_setup_dma_pe(phb, pe);
- break;
- default:
- pr_warn("%s: No DMA for PHB#%x (type %d)\n",
- __func__, phb->hose->global_number, phb->type);
- }
}
static resource_size_t pnv_pci_default_alignment(void)
@@ -3385,134 +2252,50 @@ static resource_size_t pnv_pci_default_alignment(void)
return PAGE_SIZE;
}
-#ifdef CONFIG_PCI_IOV
-static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
- int resno)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
- struct pci_dn *pdn = pci_get_pdn(pdev);
- resource_size_t align;
-
- /*
- * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
- * SR-IOV. While from hardware perspective, the range mapped by M64
- * BAR should be size aligned.
- *
- * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
- * powernv-specific hardware restriction is gone. But if just use the
- * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
- * in one segment of M64 #15, which introduces the PE conflict between
- * PF and VF. Based on this, the minimum alignment of an IOV BAR is
- * m64_segsize.
- *
- * This function returns the total IOV BAR size if M64 BAR is in
- * Shared PE mode or just VF BAR size if not.
- * If the M64 BAR is in Single PE mode, return the VF BAR size or
- * M64 segment size if IOV BAR size is less.
- */
- align = pci_iov_resource_size(pdev, resno);
- if (!pdn->vfs_expanded)
- return align;
- if (pdn->m64_single_mode)
- return max(align, (resource_size_t)phb->ioda.m64_segsize);
-
- return pdn->vfs_expanded * align;
-}
-#endif /* CONFIG_PCI_IOV */
-
/* Prevent enabling devices for which we couldn't properly
* assign a PE
*/
static bool pnv_pci_enable_device_hook(struct pci_dev *dev)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
struct pci_dn *pdn;
- /* The function is probably called while the PEs have
- * not be created yet. For example, resource reassignment
- * during PCI probe period. We just skip the check if
- * PEs isn't ready.
- */
- if (!phb->initialized)
- return true;
-
pdn = pci_get_pdn(dev);
- if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+ if (!pdn || pdn->pe_number == IODA_INVALID_PE) {
+ pci_err(dev, "pci_enable_device() blocked, no PE assigned.\n");
return false;
-
- return true;
-}
-
-static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
- int num)
-{
- struct pnv_ioda_pe *pe = container_of(table_group,
- struct pnv_ioda_pe, table_group);
- struct pnv_phb *phb = pe->phb;
- unsigned int idx;
- long rc;
-
- pe_info(pe, "Removing DMA window #%d\n", num);
- for (idx = 0; idx < phb->ioda.dma32_count; idx++) {
- if (phb->ioda.dma32_segmap[idx] != pe->pe_number)
- continue;
-
- rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
- idx, 0, 0ul, 0ul, 0ul);
- if (rc != OPAL_SUCCESS) {
- pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n",
- rc, idx);
- return rc;
- }
-
- phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE;
}
- pnv_pci_unlink_table_and_group(table_group->tables[num], table_group);
- return OPAL_SUCCESS;
+ return true;
}
-static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
+static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
{
- unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
- struct iommu_table *tbl = pe->table_group.tables[0];
- int64_t rc;
-
- if (!weight)
- return;
+ struct pci_dn *pdn;
+ struct pnv_ioda_pe *pe;
- rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
- if (rc != OPAL_SUCCESS)
- return;
+ pdn = pci_get_pdn(dev);
+ if (!pdn)
+ return false;
- pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false);
- if (pe->table_group.group) {
- iommu_group_put(pe->table_group.group);
- WARN_ON(pe->table_group.group);
+ if (pdn->pe_number == IODA_INVALID_PE) {
+ pe = pnv_ioda_setup_dev_PE(dev);
+ if (!pe)
+ return false;
}
-
- free_pages(tbl->it_base, get_order(tbl->it_size << 3));
- iommu_tce_table_put(tbl);
+ return true;
}
-static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
+void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
{
struct iommu_table *tbl = pe->table_group.tables[0];
- unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
-#ifdef CONFIG_IOMMU_API
int64_t rc;
-#endif
- if (!weight)
+ if (!pe->dma_setup_done)
return;
-#ifdef CONFIG_IOMMU_API
rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
if (rc)
- pe_warn(pe, "OPAL error %ld release DMA window\n", rc);
-#endif
+ pe_warn(pe, "OPAL error %lld release DMA window\n", rc);
pnv_pci_ioda2_set_bypass(pe, false);
if (pe->table_group.group) {
@@ -3535,17 +2318,11 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
if (map[idx] != pe->pe_number)
continue;
- if (win == OPAL_M64_WINDOW_TYPE)
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- phb->ioda.reserved_pe_idx, win,
- idx / PNV_IODA1_M64_SEGS,
- idx % PNV_IODA1_M64_SEGS);
- else
- rc = opal_pci_map_pe_mmio_window(phb->opal_id,
- phb->ioda.reserved_pe_idx, win, 0, idx);
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ phb->ioda.reserved_pe_idx, win, 0, idx);
if (rc != OPAL_SUCCESS)
- pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n",
+ pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n",
rc, win, idx);
map[idx] = IODA_INVALID_PE;
@@ -3556,14 +2333,7 @@ static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
{
struct pnv_phb *phb = pe->phb;
- if (phb->type == PNV_PHB_IODA1) {
- pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE,
- phb->ioda.io_segmap);
- pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
- phb->ioda.m32_segmap);
- pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE,
- phb->ioda.m64_segmap);
- } else if (phb->type == PNV_PHB_IODA2) {
+ if (phb->type == PNV_PHB_IODA2) {
pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE,
phb->ioda.m32_segmap);
}
@@ -3574,14 +2344,18 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
struct pnv_phb *phb = pe->phb;
struct pnv_ioda_pe *slave, *tmp;
+ pe_info(pe, "Releasing PE\n");
+
+ mutex_lock(&phb->ioda.pe_list_mutex);
list_del(&pe->list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
switch (phb->type) {
- case PNV_PHB_IODA1:
- pnv_pci_ioda1_release_pe_dma(pe);
- break;
case PNV_PHB_IODA2:
pnv_pci_ioda2_release_pe_dma(pe);
break;
+ case PNV_PHB_NPU_OCAPI:
+ break;
default:
WARN_ON(1);
}
@@ -3603,26 +2377,35 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
* that it can be populated again in PCI hot add path. The PE
* shouldn't be destroyed as it's the global reserved resource.
*/
- if (phb->ioda.root_pe_populated &&
- phb->ioda.root_pe_idx == pe->pe_number)
- phb->ioda.root_pe_populated = false;
- else
- pnv_ioda_free_pe(pe);
+ if (phb->ioda.root_pe_idx == pe->pe_number)
+ return;
+
+ pnv_ioda_free_pe(pe);
}
static void pnv_pci_release_device(struct pci_dev *pdev)
{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
+ /* The VF PE state is torn down when sriov_disable() is called */
if (pdev->is_virtfn)
return;
if (!pdn || pdn->pe_number == IODA_INVALID_PE)
return;
+#ifdef CONFIG_PCI_IOV
+ /*
+	 * FIXME: Try to move this to sriov_disable(). It's here because we
+	 * allocate the iov state at probe time, since we need to fiddle with
+	 * the IOV resources.
+ */
+ if (pdev->is_physfn)
+ kfree(pdev->dev.archdata.iov_data);
+#endif
+
/*
* PCI hotplug can happen as part of EEH error recovery. The @pdn
* isn't removed and added afterwards in this scenario. We should
@@ -3639,15 +2422,6 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
pnv_ioda_release_pe(pe);
}
-static void pnv_npu_disable_device(struct pci_dev *pdev)
-{
- struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
- struct eeh_pe *eehpe = edev ? edev->pe : NULL;
-
- if (eehpe && eeh_ops && eeh_ops->reset)
- eeh_ops->reset(eehpe, EEH_RESET_HOT);
-}
-
static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
{
struct pnv_phb *phb = hose->private_data;
@@ -3656,43 +2430,64 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
OPAL_ASSERT_RESET);
}
-static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
- .dma_dev_setup = pnv_pci_dma_dev_setup,
- .dma_bus_setup = pnv_pci_dma_bus_setup,
- .setup_msi_irqs = pnv_setup_msi_irqs,
- .teardown_msi_irqs = pnv_teardown_msi_irqs,
- .enable_device_hook = pnv_pci_enable_device_hook,
- .release_device = pnv_pci_release_device,
- .window_alignment = pnv_pci_window_alignment,
- .setup_bridge = pnv_pci_setup_bridge,
- .reset_secondary_bus = pnv_pci_reset_secondary_bus,
- .dma_set_mask = pnv_pci_ioda_dma_set_mask,
- .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
- .shutdown = pnv_pci_ioda_shutdown,
-};
+static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus)
+{
+ struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
+ struct pnv_ioda_pe *pe;
+
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+ continue;
-static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
+ if (!pe->pbus)
+ continue;
+
+ if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+ pe->pbus = bus;
+ break;
+ }
+ }
+}
+
+#ifdef CONFIG_IOMMU_API
+static struct iommu_group *pnv_pci_device_group(struct pci_controller *hose,
+ struct pci_dev *pdev)
{
- dev_err_once(&npdev->dev,
- "%s operation unsupported for NVLink devices\n",
- __func__);
- return -EPERM;
+ struct pnv_phb *phb = hose->private_data;
+ struct pnv_ioda_pe *pe;
+
+ if (WARN_ON(!phb))
+ return ERR_PTR(-ENODEV);
+
+ pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev));
+ if (!pe)
+ return ERR_PTR(-ENODEV);
+
+ if (!pe->table_group.group)
+ return ERR_PTR(-ENODEV);
+
+ return iommu_group_ref_get(pe->table_group.group);
}
+#endif
-static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
- .dma_dev_setup = pnv_pci_dma_dev_setup,
- .setup_msi_irqs = pnv_setup_msi_irqs,
- .teardown_msi_irqs = pnv_teardown_msi_irqs,
+static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
+ .dma_dev_setup = pnv_pci_ioda_dma_dev_setup,
+ .dma_bus_setup = pnv_pci_ioda_dma_bus_setup,
+ .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
.enable_device_hook = pnv_pci_enable_device_hook,
+ .release_device = pnv_pci_release_device,
.window_alignment = pnv_pci_window_alignment,
+ .setup_bridge = pnv_pci_fixup_bridge_resources,
.reset_secondary_bus = pnv_pci_reset_secondary_bus,
- .dma_set_mask = pnv_npu_dma_set_mask,
.shutdown = pnv_pci_ioda_shutdown,
- .disable_device = pnv_npu_disable_device,
+#ifdef CONFIG_IOMMU_API
+ .device_group = pnv_pci_device_group,
+#endif
};
static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
- .enable_device_hook = pnv_pci_enable_device_hook,
+ .enable_device_hook = pnv_ocapi_enable_device_hook,
+ .release_device = pnv_pci_release_device,
.window_alignment = pnv_pci_window_alignment,
.reset_secondary_bus = pnv_pci_reset_secondary_bus,
.shutdown = pnv_pci_ioda_shutdown,
@@ -3704,7 +2499,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
struct pci_controller *hose;
struct pnv_phb *phb;
unsigned long size, m64map_off, m32map_off, pemap_off;
- unsigned long iomap_off = 0, dma32map_off = 0;
+ struct pnv_ioda_pe *root_pe;
struct resource r;
const __be64 *prop64;
const __be32 *prop32;
@@ -3727,14 +2522,17 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb_id = be64_to_cpup(prop64);
pr_debug(" PHB-ID : 0x%016llx\n", phb_id);
- phb = memblock_alloc(sizeof(*phb), SMP_CACHE_BYTES);
+ phb = kzalloc(sizeof(*phb), GFP_KERNEL);
+ if (!phb)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(*phb));
/* Allocate PCI controller */
phb->hose = hose = pcibios_alloc_controller(np);
if (!phb->hose) {
pr_err(" Can't allocate PCI controller for %pOF\n",
np);
- memblock_free(__pa(phb), sizeof(struct pnv_phb));
+ memblock_free(phb, sizeof(struct pnv_phb));
return;
}
@@ -3759,10 +2557,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->model = PNV_PHB_MODEL_P7IOC;
else if (of_device_is_compatible(np, "ibm,power8-pciex"))
phb->model = PNV_PHB_MODEL_PHB3;
- else if (of_device_is_compatible(np, "ibm,power8-npu-pciex"))
- phb->model = PNV_PHB_MODEL_NPU;
- else if (of_device_is_compatible(np, "ibm,power9-npu-pciex"))
- phb->model = PNV_PHB_MODEL_NPU2;
else
phb->model = PNV_PHB_MODEL_UNKNOWN;
@@ -3773,7 +2567,10 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
else
phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE;
- phb->diag_data = memblock_alloc(phb->diag_data_size, SMP_CACHE_BYTES);
+ phb->diag_data = kzalloc(phb->diag_data_size, GFP_KERNEL);
+ if (!phb->diag_data)
+ panic("%s: Failed to allocate %u bytes\n", __func__,
+ phb->diag_data_size);
/* Parse 32-bit and IO ranges (if any) */
pci_process_bridge_OF_ranges(hose, np, !hose->global_number);
@@ -3812,27 +2609,19 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
- /* Calculate how many 32-bit TCE segments we have */
- phb->ioda.dma32_count = phb->ioda.m32_pci_base /
- PNV_IODA1_DMA32_SEGSIZE;
-
/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
- size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
+ size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
sizeof(unsigned long));
m64map_off = size;
size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
m32map_off = size;
size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
- if (phb->type == PNV_PHB_IODA1) {
- iomap_off = size;
- size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
- dma32map_off = size;
- size += phb->ioda.dma32_count *
- sizeof(phb->ioda.dma32_segmap[0]);
- }
pemap_off = size;
size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
- aux = memblock_alloc(size, SMP_CACHE_BYTES);
+ aux = kzalloc(size, GFP_KERNEL);
+ if (!aux)
+ panic("%s: Failed to allocate %lu bytes\n", __func__, size);
+
phb->ioda.pe_alloc = aux;
phb->ioda.m64_segmap = aux + m64map_off;
phb->ioda.m32_segmap = aux + m32map_off;
@@ -3840,15 +2629,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
}
- if (phb->type == PNV_PHB_IODA1) {
- phb->ioda.io_segmap = aux + iomap_off;
- for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
- phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
-
- phb->ioda.dma32_segmap = aux + dma32map_off;
- for (segno = 0; segno < phb->ioda.dma32_count; segno++)
- phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
- }
phb->ioda.pe_array = aux + pemap_off;
/*
@@ -3864,16 +2644,14 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
} else {
- phb->ioda.root_pe_idx = IODA_INVALID_PE;
+ /* otherwise just allocate one */
+ root_pe = pnv_ioda_alloc_pe(phb, 1);
+ phb->ioda.root_pe_idx = root_pe->pe_number;
}
INIT_LIST_HEAD(&phb->ioda.pe_list);
mutex_init(&phb->ioda.pe_list_mutex);
- /* Calculate how many 32-bit TCE segments we have */
- phb->ioda.dma32_count = phb->ioda.m32_pci_base /
- PNV_IODA1_DMA32_SEGSIZE;
-
#if 0 /* We should really do that ... */
rc = opal_pci_set_phb_mem_window(opal->phb_id,
window_type,
@@ -3912,21 +2690,17 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
switch (phb->type) {
- case PNV_PHB_NPU_NVLINK:
- hose->controller_ops = pnv_npu_ioda_controller_ops;
- break;
case PNV_PHB_NPU_OCAPI:
hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
break;
default:
- phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
hose->controller_ops = pnv_pci_ioda_controller_ops;
}
ppc_md.pcibios_default_alignment = pnv_pci_default_alignment;
#ifdef CONFIG_PCI_IOV
- ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
+ ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov;
ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
@@ -3944,9 +2718,12 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
* shutdown PCI devices correctly. We already got IODA table
* cleaned out. So we have to issue PHB reset to stop all PCI
* transactions from previous kernel. The ppc_pci_reset_phbs
- * kernel parameter will force this reset too.
+ * kernel parameter will force this reset too. Additionally,
+ * if the IODA reset above failed then use a bigger hammer.
+ * This can happen if we get a PHB fatal error in very early
+ * boot.
*/
- if (is_kdump_kernel() || pci_reset_phbs) {
+ if (is_kdump_kernel() || pci_reset_phbs || rc) {
pr_info(" Issue PHB reset ...\n");
pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
@@ -3955,6 +2732,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
/* Remove M64 resource if we can't configure it successfully */
if (!phb->init_m64 || phb->init_m64(phb))
hose->mem_resources[1].flags = 0;
+
+ /* create pci_dn's for DT nodes under this PHB */
+ pci_devs_phb_init_dynamic(hose);
}
void __init pnv_pci_init_ioda2_phb(struct device_node *np)
@@ -3962,11 +2742,6 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np)
pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2);
}
-void __init pnv_pci_init_npu_phb(struct device_node *np)
-{
- pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
-}
-
void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
{
pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
@@ -3974,8 +2749,7 @@ void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- struct pnv_phb *phb = hose->private_data;
+ struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus);
if (!machine_is(powernv))
return;
@@ -3984,27 +2758,3 @@ static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
}
DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
-
-void __init pnv_pci_init_ioda_hub(struct device_node *np)
-{
- struct device_node *phbn;
- const __be64 *prop64;
- u64 hub_id;
-
- pr_info("Probing IODA IO-Hub %pOF\n", np);
-
- prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
- if (!prop64) {
- pr_err(" Missing \"ibm,opal-hubid\" property !\n");
- return;
- }
- hub_id = be64_to_cpup(prop64);
- pr_devel(" HUB-ID : 0x%016llx\n", hub_id);
-
- /* Count child PHBs */
- for_each_child_of_node(np, phbn) {
- /* Look for IODA1 PHBs */
- if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
- pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1);
- }
-}
diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
new file mode 100644
index 000000000000..cc7b1dd54ac6
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/pci-sriov.c
@@ -0,0 +1,760 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/kernel.h>
+#include <linux/ioport.h>
+#include <linux/bitmap.h>
+#include <linux/pci.h>
+
+#include <asm/opal.h>
+
+#include "pci.h"
+
+/*
+ * The majority of the complexity in supporting SR-IOV on PowerNV comes from
+ * the need to put the MMIO space for each VF into a separate PE. Internally
+ * the PHB maps MMIO addresses to a specific PE using the "Memory BAR Table".
+ * The MBT historically only applied to the 64bit MMIO window of the PHB
+ * so it's common to see it referred to as the "M64BT".
+ *
+ * An MBT entry stores the mapped range as an <base>,<mask> pair. This forces
+ * the address range that we want to map to be power-of-two sized and aligned.
+ * For conventional PCI devices this isn't really an issue since PCI device BARs
+ * have the same requirement.
+ *
+ * For a SR-IOV BAR things are a little more awkward since size and alignment
+ * are not coupled. The alignment is set based on the per-VF BAR size, but
+ * the total BAR area is: number-of-vfs * per-vf-size. The number of VFs
+ * isn't necessarily a power of two, so neither is the total size. To fix that
+ * we need to finesse (read: hack) the Linux BAR allocator so that it will
+ * allocate the SR-IOV BARs in a way that lets us map them using the MBT.
+ *
+ * The changes to size and alignment that we need to do depend on the "mode"
+ * of MBT entry that we use. We only support SR-IOV on PHB3 (IODA2) and above,
+ * so as a baseline we can assume that we have the following BAR modes
+ * available:
+ *
+ * NB: $PE_COUNT is the number of PEs that the PHB supports.
+ *
+ * a) A segmented BAR that splits the mapped range into $PE_COUNT equally sized
+ * segments. The n'th segment is mapped to the n'th PE.
+ * b) An un-segmented BAR that maps the whole address range to a specific PE.
+ *
+ *
+ * We prefer to use mode a) since it only requires one MBT entry per SR-IOV BAR.
+ * For comparison, b) requires one entry per-VF per-BAR, or:
+ * (num-vfs * num-sriov-bars) in total. To use a) we need the size of each segment
+ * to equal the size of the per-VF BAR area. So:
+ *
+ * new_size = per-vf-size * number-of-PEs
+ *
+ * The alignment for the SR-IOV BAR also needs to be changed from per-vf-size
+ * to "new_size", calculated above. Implementing this is a convoluted process
+ * which requires several hooks in the PCI core:
+ *
+ * 1. In pcibios_device_add() we call pnv_pci_ioda_fixup_iov().
+ *
+ * At this point the device has been probed and the device's BARs are sized,
+ * but no resource allocations have been done. The SR-IOV BARs are sized
+ * based on the maximum number of VFs supported by the device and we need
+ * to increase that to new_size.
+ *
+ * 2. Later, when Linux actually assigns resources it tries to make the resource
+ * allocations for each PCI bus as compact as possible. As a part of that it
+ * sorts the BARs on a bus by their required alignment, which is calculated
+ * using pci_resource_alignment().
+ *
+ * For IOV resources this goes:
+ * pci_resource_alignment()
+ * pci_sriov_resource_alignment()
+ * pcibios_sriov_resource_alignment()
+ * pnv_pci_iov_resource_alignment()
+ *
+ * Our hook overrides the default alignment, equal to the per-vf-size, with
+ * new_size computed above.
+ *
+ * 3. When userspace enables VFs for a device:
+ *
+ * sriov_enable()
+ * pcibios_sriov_enable()
+ * pnv_pcibios_sriov_enable()
+ *
+ * This is where we actually allocate PE numbers for each VF and setup the
+ * MBT mapping for each SR-IOV BAR. In steps 1) and 2) we setup an "arena"
+ * where each MBT segment is equal in size to the VF BAR so we can shift
+ * around the actual SR-IOV BAR location within this arena. We need this
+ * ability because the PE space is shared by all devices on the same PHB.
+ *    When using mode a) described above, segment 0 maps to PE#0, which might
+ *    already be in use by another device on the PHB.
+ *
+ *    As a result we need to allocate a contiguous range of PE numbers, then shift
+ * the address programmed into the SR-IOV BAR of the PF so that the address
+ * of VF0 matches up with the segment corresponding to the first allocated
+ * PE number. This is handled in pnv_pci_vf_resource_shift().
+ *
+ * Once all that is done we return to the PCI core which then enables VFs,
+ * scans them and creates pci_devs for each. The init process for a VF is
+ * largely the same as a normal device, but the VF is inserted into the IODA
+ * PE that we allocated for it rather than the PE associated with the bus.
+ *
+ * 4. When userspace disables VFs we unwind the above in
+ * pnv_pcibios_sriov_disable(). Fortunately this is relatively simple since
+ * we don't need to validate anything, just tear down the mappings and
+ *    move the SR-IOV resources back to their "proper" location.
+ *
+ * That's how mode a) works. In theory mode b) (single PE mapping) is less work
+ * since we can map each individual VF with a separate BAR. However, there are
+ * a few limitations:
+ *
+ * 1) For IODA2 mode b) has a minimum alignment requirement of 32MB. This makes
+ * it only usable for devices with very large per-VF BARs. Such devices are
+ * similar to Big Foot. They definitely exist, but I've never seen one.
+ *
+ * 2) The number of MBT entries that we have is limited. PHB3 and PHB4 only
+ *    have 16 in total and some are needed for other uses. Most SR-IOV capable
+ *    network cards can support more than 16 VFs on each port.
+ *
+ * We use b) when using a) would use more than 1/4 of the entire 64 bit MMIO
+ * window of the PHB.
+ *
+ *
+ *
+ * PHB4 (IODA3) added a few new features that would be useful for SR-IOV. It
+ * allowed the MBT to map 32bit MMIO space in addition to 64bit which allows
+ * us to support SR-IOV BARs in the 32bit MMIO window. This is useful since
+ * the Linux BAR allocation will place any BAR marked as non-prefetchable into
+ * the non-prefetchable bridge window, which is 32bit only. It also added two
+ * new modes:
+ *
+ * c) A segmented BAR similar to a), but each segment can be individually
+ *    mapped to any PE. This matches how the 32bit MMIO window worked on
+ * IODA1&2.
+ *
+ * d) A segmented BAR with 8, 64, or 128 segments. This works similarly to a),
+ * but with fewer segments and configurable base PE.
+ *
+ * i.e. The n'th segment maps to the (n + base)'th PE.
+ *
+ * The base PE is also required to be a multiple of the window size.
+ *
+ * Unfortunately, the OPAL API doesn't currently (as of skiboot v6.6) allow us
+ * to exploit any of the IODA3 features.
+ */
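
To make the sizing rule above concrete, here is a minimal standalone sketch of the mode a) vs. mode b) decision. It is illustrative only and not part of the patch; the helper name and the example values (2MB per-VF BAR, 256 PEs, 256MB M64 segments) are invented for the example.

#include <stdint.h>

/*
 * Illustration only: with a 2MB per-VF BAR and 256 PEs, mode a) expands
 * the IOV BAR to 2MB * 256 = 512MB so one segmented M64 window covers it.
 * A per-VF BAR larger than 1/4 of the M64 segment size (e.g. 128MB with
 * 256MB segments) would instead be mapped with single PE windows.
 */
static uint64_t example_iov_bar_size(uint64_t per_vf_size,
				     unsigned int total_pe_num,
				     uint64_t m64_segsize)
{
	if (per_vf_size > m64_segsize / 4)
		return per_vf_size;		/* mode b): keep native size */

	return per_vf_size * total_pe_num;	/* mode a): one segment per PE */
}
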
+
+static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
+{
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+ struct resource *res;
+ int i;
+ resource_size_t vf_bar_sz;
+ struct pnv_iov_data *iov;
+ int mul;
+
+ iov = kzalloc(sizeof(*iov), GFP_KERNEL);
+ if (!iov)
+ goto disable_iov;
+ pdev->dev.archdata.iov_data = iov;
+ mul = phb->ioda.total_pe_num;
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || res->parent)
+ continue;
+ if (!pnv_pci_is_m64_flags(res->flags)) {
+ dev_warn(&pdev->dev, "Don't support SR-IOV with non M64 VF BAR%d: %pR. \n",
+ i, res);
+ goto disable_iov;
+ }
+
+ vf_bar_sz = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+
+ /*
+ * Generally, one segmented M64 BAR maps one IOV BAR. However,
+ * if a VF BAR is too large we end up wasting a lot of space.
+ * If each VF needs more than 1/4 of the default m64 segment
+ * then each VF BAR should be mapped in single-PE mode to reduce
+ * the amount of space required. This does however limit the
+ * number of VFs we can support.
+ *
+ * The 1/4 limit is arbitrary and can be tweaked.
+ */
+ if (vf_bar_sz > (phb->ioda.m64_segsize >> 2)) {
+ /*
+ * On PHB3, the minimum size alignment of M64 BAR in
+ * single mode is 32MB. If this VF BAR is smaller than
+ * 32MB, but still too large for a segmented window
+ * then we can't map it and need to disable SR-IOV for
+ * this device.
+ */
+ if (vf_bar_sz < SZ_32M) {
+ pci_err(pdev, "VF BAR%d: %pR can't be mapped in single PE mode\n",
+ i, res);
+ goto disable_iov;
+ }
+
+ iov->m64_single_mode[i] = true;
+ continue;
+ }
+
+ /*
+ * This BAR can be mapped with one segmented window, so adjust
+		 * the resource size to accommodate.
+ */
+ pci_dbg(pdev, " Fixing VF BAR%d: %pR to\n", i, res);
+ res->end = res->start + vf_bar_sz * mul - 1;
+ pci_dbg(pdev, " %pR\n", res);
+
+ pci_info(pdev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
+ i, res, mul);
+
+ iov->need_shift = true;
+ }
+
+ return;
+
+disable_iov:
+ /* Save ourselves some MMIO space by disabling the unusable BARs */
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ res->flags = 0;
+ res->end = res->start - 1;
+ }
+
+ pdev->dev.archdata.iov_data = NULL;
+ kfree(iov);
+}
+
+void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev)
+{
+ if (pdev->is_virtfn) {
+ struct pnv_ioda_pe *pe = pnv_ioda_get_pe(pdev);
+
+ /*
+ * VF PEs are single-device PEs so their pdev pointer needs to
+ * be set. The pdev doesn't exist when the PE is allocated (in
+		 * pcibios_sriov_enable()) so we fix it up here.
+ */
+ pe->pdev = pdev;
+ WARN_ON(!(pe->flags & PNV_IODA_PE_VF));
+ } else if (pdev->is_physfn) {
+ /*
+ * For PFs adjust their allocated IOV resources to match what
+ * the PHB can support using its M64 BAR table.
+ */
+ pnv_pci_ioda_fixup_iov_resources(pdev);
+ }
+}
+
+resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
+ int resno)
+{
+ resource_size_t align = pci_iov_resource_size(pdev, resno);
+ struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
+ struct pnv_iov_data *iov = pnv_iov_get(pdev);
+
+ /*
+	 * iov can be NULL if we have an SR-IOV device with an IOV BAR that can't
+	 * be placed in the m64 space (i.e. the BAR is 32bit or non-prefetch).
+ * In that case we don't allow VFs to be enabled since one of their
+ * BARs would not be placed in the correct PE.
+ */
+ if (!iov)
+ return align;
+
+ /*
+ * If we're using single mode then we can just use the native VF BAR
+ * alignment. We validated that it's possible to use a single PE
+ * window above when we did the fixup.
+ */
+ if (iov->m64_single_mode[resno - PCI_IOV_RESOURCES])
+ return align;
+
+ /*
+	 * On the PowerNV platform, the IOV BAR is mapped by an M64 BAR to
+	 * enable SR-IOV, and from the hardware's perspective the range mapped
+	 * by an M64 BAR must be size aligned.
+	 *
+	 * The single PE case was handled above, so here the IOV BAR is mapped
+	 * by a shared, segmented M64 window and the required alignment is the
+	 * total IOV BAR size, i.e. total_pe_num * per-VF BAR size.
+ */
+ return phb->ioda.total_pe_num * align;
+}
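
As a worked example of the effect of this alignment hook: for a hypothetical PF with a 1MB per-VF BAR on a PHB with 256 PEs, the shared (segmented) case reports an alignment of 256 * 1MB = 256MB, so the PCI core places the PF's IOV BAR on a 256MB boundary and one segmented M64 window, with one segment per PE, can map it. In the single PE case the native per-VF alignment is reported unchanged.
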
+
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pnv_iov_data *iov;
+ struct pnv_phb *phb;
+ int window_id;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+ iov = pnv_iov_get(pdev);
+
+ for_each_set_bit(window_id, iov->used_m64_bar_mask, MAX_M64_BARS) {
+ opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ 0);
+
+ clear_bit(window_id, &phb->ioda.m64_bar_alloc);
+ }
+
+ return 0;
+}
+
+
+/*
+ * PHB3 and beyond support segmented windows. The window's address range
+ * is subdivided into phb->ioda.total_pe_num segments and there's a 1-1
+ * mapping between PEs and segments.
+ */
+static int64_t pnv_ioda_map_m64_segmented(struct pnv_phb *phb,
+ int window_id,
+ resource_size_t start,
+ resource_size_t size)
+{
+ int64_t rc;
+
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ start,
+ 0, /* unused */
+ size);
+ if (rc)
+ goto out;
+
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ OPAL_ENABLE_M64_SPLIT);
+out:
+ if (rc)
+ pr_err("Failed to map M64 window #%d: %lld\n", window_id, rc);
+
+ return rc;
+}
+
+static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb,
+ int pe_num,
+ int window_id,
+ resource_size_t start,
+ resource_size_t size)
+{
+ int64_t rc;
+
+ /*
+ * The API for setting up m64 mmio windows seems to have been designed
+ * with P7-IOC in mind. For that chip each M64 BAR (window) had a fixed
+	 * split of 8 equally sized segments, each of which could be
+	 * individually assigned to a PE.
+ *
+ * The problem with this is that the API doesn't have any way to
+ * communicate the number of segments we want on a BAR. This wasn't
+ * a problem for p7-ioc since you didn't have a choice, but the
+ * single PE windows added in PHB3 don't map cleanly to this API.
+ *
+ * As a result we've got this slightly awkward process where we
+	 * call opal_pci_map_pe_mmio_window() to put the window into single
+	 * PE mode and to set the PE for the window before setting the address
+	 * bounds. We need to do it this way because the single PE windows
+	 * on PHB3 have different alignment requirements.
+ */
+ rc = opal_pci_map_pe_mmio_window(phb->opal_id,
+ pe_num,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ 0);
+ if (rc)
+ goto out;
+
+ /*
+ * NB: In single PE mode the window needs to be aligned to 32MB
+ */
+ rc = opal_pci_set_phb_mem_window(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ start,
+ 0, /* ignored by FW, m64 is 1-1 */
+ size);
+ if (rc)
+ goto out;
+
+ /*
+ * Now actually enable it. We specified the BAR should be in "non-split"
+ * mode so FW will validate that the BAR is in single PE mode.
+ */
+ rc = opal_pci_phb_mmio_enable(phb->opal_id,
+ OPAL_M64_WINDOW_TYPE,
+ window_id,
+ OPAL_ENABLE_M64_NON_SPLIT);
+out:
+ if (rc)
+ pr_err("Error mapping single PE BAR\n");
+
+ return rc;
+}
+
+static int pnv_pci_alloc_m64_bar(struct pnv_phb *phb, struct pnv_iov_data *iov)
+{
+ int win;
+
+ do {
+ win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
+ phb->ioda.m64_bar_idx + 1, 0);
+
+ if (win >= phb->ioda.m64_bar_idx + 1)
+ return -1;
+ } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
+
+ set_bit(win, iov->used_m64_bar_mask);
+
+ return win;
+}
+
+static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pnv_iov_data *iov;
+ struct pnv_phb *phb;
+ int win;
+ struct resource *res;
+ int i, j;
+ int64_t rc;
+ resource_size_t size, start;
+ int base_pe_num;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+ iov = pnv_iov_get(pdev);
+
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &pdev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+
+ /* don't need single mode? map everything in one go! */
+ if (!iov->m64_single_mode[i]) {
+ win = pnv_pci_alloc_m64_bar(phb, iov);
+ if (win < 0)
+ goto m64_failed;
+
+ size = resource_size(res);
+ start = res->start;
+
+ rc = pnv_ioda_map_m64_segmented(phb, win, start, size);
+ if (rc)
+ goto m64_failed;
+
+ continue;
+ }
+
+ /* otherwise map each VF with single PE BARs */
+ size = pci_iov_resource_size(pdev, PCI_IOV_RESOURCES + i);
+ base_pe_num = iov->vf_pe_arr[0].pe_number;
+
+ for (j = 0; j < num_vfs; j++) {
+ win = pnv_pci_alloc_m64_bar(phb, iov);
+ if (win < 0)
+ goto m64_failed;
+
+ start = res->start + size * j;
+ rc = pnv_ioda_map_m64_single(phb, win,
+ base_pe_num + j,
+ start,
+ size);
+ if (rc)
+ goto m64_failed;
+ }
+ }
+ return 0;
+
+m64_failed:
+ pnv_pci_vf_release_m64(pdev, num_vfs);
+ return -EBUSY;
+}
+
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
+{
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe, *pe_n;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+
+ if (!pdev->is_physfn)
+ return;
+
+ /* FIXME: Use pnv_ioda_release_pe()? */
+ list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
+ if (pe->parent_dev != pdev)
+ continue;
+
+ pnv_pci_ioda2_release_pe_dma(pe);
+
+ /* Remove from list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_del(&pe->list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+ pnv_ioda_deconfigure_pe(phb, pe);
+
+ pnv_ioda_free_pe(pe);
+ }
+}
+
+static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
+{
+ struct resource *res, res2;
+ struct pnv_iov_data *iov;
+ resource_size_t size;
+ u16 num_vfs;
+ int i;
+
+ if (!dev->is_physfn)
+ return -EINVAL;
+ iov = pnv_iov_get(dev);
+
+ /*
+ * "offset" is in VFs. The M64 windows are sized so that when they
+ * are segmented, each segment is the same size as the IOV BAR.
+ * Each segment is in a separate PE, and the high order bits of the
+ * address are the PE number. Therefore, each VF's BAR is in a
+ * separate PE, and changing the IOV BAR start address changes the
+ * range of PEs the VFs are in.
+ */
+ num_vfs = iov->num_vfs;
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+ if (iov->m64_single_mode[i])
+ continue;
+
+ /*
+ * The actual IOV BAR range is determined by the start address
+ * and the size needed for num_vfs VF BARs. This check is to
+ * make sure that after shifting, the range will not overlap
+ * with another device.
+ */
+ size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+ res2.flags = res->flags;
+ res2.start = res->start + (size * offset);
+ res2.end = res2.start + (size * num_vfs) - 1;
+
+ if (res2.end > res->end) {
+ dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n",
+ i, &res2, res, num_vfs, offset);
+ return -EBUSY;
+ }
+ }
+
+ /*
+ * Since the M64 BAR shares segments among all possible 256 PEs,
+ * we have to shift the beginning of the PF IOV BAR to make it start from
+ * the segment which belongs to the PE number assigned to the first VF.
+ * This creates a "hole" in /proc/iomem which could be used for
+ * allocating other resources, so we reserve this area below and
+ * release it when IOV is released.
+ */
+ for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+ res = &dev->resource[i + PCI_IOV_RESOURCES];
+ if (!res->flags || !res->parent)
+ continue;
+ if (iov->m64_single_mode[i])
+ continue;
+
+ size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
+ res2 = *res;
+ res->start += size * offset;
+
+ dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n",
+ i, &res2, res, (offset > 0) ? "En" : "Dis",
+ num_vfs, offset);
+
+ if (offset < 0) {
+ devm_release_resource(&dev->dev, &iov->holes[i]);
+ memset(&iov->holes[i], 0, sizeof(iov->holes[i]));
+ }
+
+ pci_update_resource(dev, i + PCI_IOV_RESOURCES);
+
+ if (offset > 0) {
+ iov->holes[i].start = res2.start;
+ iov->holes[i].end = res2.start + size * offset - 1;
+ iov->holes[i].flags = IORESOURCE_BUS;
+ iov->holes[i].name = "pnv_iov_reserved";
+ devm_request_resource(&dev->dev, res->parent,
+ &iov->holes[i]);
+ }
+ }
+ return 0;
+}
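For clarity, the shift performed above boils down to the following arithmetic (an illustrative sketch only; the helper name is hypothetical):

	#include <linux/ioport.h>

	/*
	 * Shifting by "offset" VF-sized segments moves the IOV BAR forward and
	 * leaves a reserved hole over the skipped leading segments.
	 */
	static void iov_shift_and_hole(struct resource *res, resource_size_t seg_size,
				       int offset, struct resource *hole)
	{
		hole->start = res->start;
		hole->end   = res->start + seg_size * offset - 1;
		res->start += seg_size * offset;
	}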
+
+static void pnv_pci_sriov_disable(struct pci_dev *pdev)
+{
+ u16 num_vfs, base_pe;
+ struct pnv_iov_data *iov;
+
+ iov = pnv_iov_get(pdev);
+ if (WARN_ON(!iov))
+ return;
+
+ num_vfs = iov->num_vfs;
+ base_pe = iov->vf_pe_arr[0].pe_number;
+
+ /* Release VF PEs */
+ pnv_ioda_release_vf_PE(pdev);
+
+ /* Un-shift the IOV BARs if we need to */
+ if (iov->need_shift)
+ pnv_pci_vf_resource_shift(pdev, -base_pe);
+
+ /* Release M64 windows */
+ pnv_pci_vf_release_m64(pdev, num_vfs);
+}
+
+static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+ int pe_num;
+ u16 vf_index;
+ struct pnv_iov_data *iov;
+ struct pci_dn *pdn;
+
+ if (!pdev->is_physfn)
+ return;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+ pdn = pci_get_pdn(pdev);
+ iov = pnv_iov_get(pdev);
+
+ /* Reserve PE for each VF */
+ for (vf_index = 0; vf_index < num_vfs; vf_index++) {
+ int vf_devfn = pci_iov_virtfn_devfn(pdev, vf_index);
+ int vf_bus = pci_iov_virtfn_bus(pdev, vf_index);
+ struct pci_dn *vf_pdn;
+
+ pe = &iov->vf_pe_arr[vf_index];
+ pe->phb = phb;
+ pe->flags = PNV_IODA_PE_VF;
+ pe->pbus = NULL;
+ pe->parent_dev = pdev;
+ pe->mve_number = -1;
+ pe->rid = (vf_bus << 8) | vf_devfn;
+
+ pe_num = pe->pe_number;
+ pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n",
+ pci_domain_nr(pdev->bus), pdev->bus->number,
+ PCI_SLOT(vf_devfn), PCI_FUNC(vf_devfn), pe_num);
+
+ if (pnv_ioda_configure_pe(phb, pe)) {
+ /* XXX What do we do here ? */
+ pnv_ioda_free_pe(pe);
+ pe->pdev = NULL;
+ continue;
+ }
+
+ /* Put PE to the list */
+ mutex_lock(&phb->ioda.pe_list_mutex);
+ list_add_tail(&pe->list, &phb->ioda.pe_list);
+ mutex_unlock(&phb->ioda.pe_list_mutex);
+
+ /* associate this pe to its pdn */
+ list_for_each_entry(vf_pdn, &pdn->parent->child_list, list) {
+ if (vf_pdn->busno == vf_bus &&
+ vf_pdn->devfn == vf_devfn) {
+ vf_pdn->pe_number = pe_num;
+ break;
+ }
+ }
+
+ pnv_pci_ioda2_setup_dma_pe(phb, pe);
+ }
+}
+
+static int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ struct pnv_ioda_pe *base_pe;
+ struct pnv_iov_data *iov;
+ struct pnv_phb *phb;
+ int ret;
+ u16 i;
+
+ phb = pci_bus_to_pnvhb(pdev->bus);
+ iov = pnv_iov_get(pdev);
+
+ /*
+ * There are calls to the IODA2 PE setup code littered throughout. We could
+ * probably fix that, but we'd still have problems due to the
+ * restrictions inherent in IODA1 PHBs.
+ *
+ * NB: We class IODA3 as IODA2 since they're very similar.
+ */
+ if (phb->type != PNV_PHB_IODA2) {
+ pci_err(pdev, "SR-IOV is not supported on this PHB\n");
+ return -ENXIO;
+ }
+
+ if (!iov) {
+		dev_info(&pdev->dev, "SR-IOV is not supported on devices with a non-64bit-prefetchable IOV BAR\n");
+ return -ENOSPC;
+ }
+
+ /* allocate a contiguous block of PEs for our VFs */
+ base_pe = pnv_ioda_alloc_pe(phb, num_vfs);
+ if (!base_pe) {
+ pci_err(pdev, "Unable to allocate PEs for %d VFs\n", num_vfs);
+ return -EBUSY;
+ }
+
+ iov->vf_pe_arr = base_pe;
+ iov->num_vfs = num_vfs;
+
+ /* Assign M64 window accordingly */
+ ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
+ if (ret) {
+ dev_info(&pdev->dev, "Not enough M64 window resources\n");
+ goto m64_failed;
+ }
+
+ /*
+ * When using one M64 BAR to map one IOV BAR, we need to shift
+ * the IOV BAR according to the PE# allocated to the VFs.
+ * Otherwise, the PE# for the VF will conflict with others.
+ */
+ if (iov->need_shift) {
+ ret = pnv_pci_vf_resource_shift(pdev, base_pe->pe_number);
+ if (ret)
+ goto shift_failed;
+ }
+
+ /* Setup VF PEs */
+ pnv_ioda_setup_vf_PE(pdev, num_vfs);
+
+ return 0;
+
+shift_failed:
+ pnv_pci_vf_release_m64(pdev, num_vfs);
+
+m64_failed:
+ for (i = 0; i < num_vfs; i++)
+ pnv_ioda_free_pe(&iov->vf_pe_arr[i]);
+
+ return ret;
+}
+
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
+{
+ pnv_pci_sriov_disable(pdev);
+
+ /* Release PCI data */
+ remove_sriov_vf_pdns(pdev);
+ return 0;
+}
+
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+{
+ /* Allocate PCI data */
+ add_sriov_vf_pdns(pdev);
+
+ return pnv_pci_sriov_enable(pdev, num_vfs);
+}
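For context (not part of the patch): these entry points are reached from the generic SR-IOV code when VFs are enabled, for example by writing to the standard sriov_numvfs sysfs attribute of the PF. A minimal userspace sketch, with a placeholder device address:

	#include <stdio.h>

	int main(void)
	{
		/* 0000:01:00.0 is a placeholder PF address */
		FILE *f = fopen("/sys/bus/pci/devices/0000:01:00.0/sriov_numvfs", "w");

		if (!f)
			return 1;
		fprintf(f, "4\n");	/* request 4 VFs */
		return fclose(f) ? 1 : 0;
	}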
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 45fb70b4bfa7..b2c1da025410 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Support PCI/PCIe on PowerNV platforms
*
* Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -18,11 +14,9 @@
#include <linux/io.h>
#include <linux/msi.h>
#include <linux/iommu.h>
-#include <linux/sched/mm.h>
#include <asm/sections.h>
#include <asm/io.h>
-#include <asm/prom.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/msi_bitmap.h>
@@ -38,12 +32,9 @@
#include "powernv.h"
#include "pci.h"
-static DEFINE_MUTEX(p2p_mutex);
-static DEFINE_MUTEX(tunnel_mutex);
-
int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
{
- struct device_node *parent = np;
+ struct device_node *node = np;
u32 bdfn;
u64 phbid;
int ret;
@@ -53,24 +44,29 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
return -ENXIO;
bdfn = ((bdfn & 0x00ffff00) >> 8);
- while ((parent = of_get_parent(parent))) {
- if (!PCI_DN(parent)) {
- of_node_put(parent);
+ for (node = np; node; node = of_get_parent(node)) {
+ if (!PCI_DN(node)) {
+ of_node_put(node);
break;
}
- if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) {
- of_node_put(parent);
+ if (!of_device_is_compatible(node, "ibm,ioda2-phb") &&
+ !of_device_is_compatible(node, "ibm,ioda3-phb") &&
+ !of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb")) {
+ of_node_put(node);
continue;
}
- ret = of_property_read_u64(parent, "ibm,opal-phbid", &phbid);
+ ret = of_property_read_u64(node, "ibm,opal-phbid", &phbid);
if (ret) {
- of_node_put(parent);
+ of_node_put(node);
return -ENXIO;
}
- *id = PCI_SLOT_ID(phbid, bdfn);
+ if (of_device_is_compatible(node, "ibm,ioda2-npu2-opencapi-phb"))
+ *id = PCI_PHB_SLOT_ID(phbid);
+ else
+ *id = PCI_SLOT_ID(phbid, bdfn);
return 0;
}
@@ -160,75 +156,6 @@ exit:
}
EXPORT_SYMBOL_GPL(pnv_pci_set_power_state);
-int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
- struct msi_desc *entry;
- struct msi_msg msg;
- int hwirq;
- unsigned int virq;
- int rc;
-
- if (WARN_ON(!phb) || !phb->msi_bmp.bitmap)
- return -ENODEV;
-
- if (pdev->no_64bit_msi && !phb->msi32_support)
- return -ENODEV;
-
- for_each_pci_msi_entry(entry, pdev) {
- if (!entry->msi_attrib.is_64 && !phb->msi32_support) {
- pr_warn("%s: Supports only 64-bit MSIs\n",
- pci_name(pdev));
- return -ENXIO;
- }
- hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1);
- if (hwirq < 0) {
- pr_warn("%s: Failed to find a free MSI\n",
- pci_name(pdev));
- return -ENOSPC;
- }
- virq = irq_create_mapping(NULL, phb->msi_base + hwirq);
- if (!virq) {
- pr_warn("%s: Failed to map MSI to linux irq\n",
- pci_name(pdev));
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
- return -ENOMEM;
- }
- rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq,
- virq, entry->msi_attrib.is_64, &msg);
- if (rc) {
- pr_warn("%s: Failed to setup MSI\n", pci_name(pdev));
- irq_dispose_mapping(virq);
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1);
- return rc;
- }
- irq_set_msi_desc(virq, entry);
- pci_write_msi_msg(virq, &msg);
- }
- return 0;
-}
-
-void pnv_teardown_msi_irqs(struct pci_dev *pdev)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
- struct msi_desc *entry;
- irq_hw_number_t hwirq;
-
- if (WARN_ON(!phb))
- return;
-
- for_each_pci_msi_entry(entry, pdev) {
- if (!entry->irq)
- continue;
- hwirq = virq_to_hw(entry->irq);
- irq_set_msi_desc(entry->irq, NULL);
- irq_dispose_mapping(entry->irq);
- msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1);
- }
-}
-
/* Nicely print the contents of the PE State Tables (PEST). */
static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size)
{
@@ -713,7 +640,7 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
return PCIBIOS_SUCCESSFUL;
}
-#if CONFIG_EEH
+#ifdef CONFIG_EEH
static bool pnv_pci_cfg_check(struct pci_dn *pdn)
{
struct eeh_dev *edev = NULL;
@@ -814,259 +741,6 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
return tbl;
}
-void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
-{
- struct pci_controller *hose = pci_bus_to_host(pdev->bus);
- struct pnv_phb *phb = hose->private_data;
-#ifdef CONFIG_PCI_IOV
- struct pnv_ioda_pe *pe;
- struct pci_dn *pdn;
-
- /* Fix the VF pdn PE number */
- if (pdev->is_virtfn) {
- pdn = pci_get_pdn(pdev);
- WARN_ON(pdn->pe_number != IODA_INVALID_PE);
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- if (pe->rid == ((pdev->bus->number << 8) |
- (pdev->devfn & 0xff))) {
- pdn->pe_number = pe->pe_number;
- pe->pdev = pdev;
- break;
- }
- }
- }
-#endif /* CONFIG_PCI_IOV */
-
- if (phb && phb->dma_dev_setup)
- phb->dma_dev_setup(phb, pdev);
-}
-
-void pnv_pci_dma_bus_setup(struct pci_bus *bus)
-{
- struct pci_controller *hose = bus->sysdata;
- struct pnv_phb *phb = hose->private_data;
- struct pnv_ioda_pe *pe;
-
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
- continue;
-
- if (!pe->pbus)
- continue;
-
- if (bus->number == ((pe->rid >> 8) & 0xFF)) {
- pe->pbus = bus;
- break;
- }
- }
-}
-
-int pnv_pci_set_p2p(struct pci_dev *initiator, struct pci_dev *target, u64 desc)
-{
- struct pci_controller *hose;
- struct pnv_phb *phb_init, *phb_target;
- struct pnv_ioda_pe *pe_init;
- int rc;
-
- if (!opal_check_token(OPAL_PCI_SET_P2P))
- return -ENXIO;
-
- hose = pci_bus_to_host(initiator->bus);
- phb_init = hose->private_data;
-
- hose = pci_bus_to_host(target->bus);
- phb_target = hose->private_data;
-
- pe_init = pnv_ioda_get_pe(initiator);
- if (!pe_init)
- return -ENODEV;
-
- /*
- * Configuring the initiator's PHB requires to adjust its
- * TVE#1 setting. Since the same device can be an initiator
- * several times for different target devices, we need to keep
- * a reference count to know when we can restore the default
- * bypass setting on its TVE#1 when disabling. Opal is not
- * tracking PE states, so we add a reference count on the PE
- * in linux.
- *
- * For the target, the configuration is per PHB, so we keep a
- * target reference count on the PHB.
- */
- mutex_lock(&p2p_mutex);
-
- if (desc & OPAL_PCI_P2P_ENABLE) {
- /* always go to opal to validate the configuration */
- rc = opal_pci_set_p2p(phb_init->opal_id, phb_target->opal_id,
- desc, pe_init->pe_number);
-
- if (rc != OPAL_SUCCESS) {
- rc = -EIO;
- goto out;
- }
-
- pe_init->p2p_initiator_count++;
- phb_target->p2p_target_count++;
- } else {
- if (!pe_init->p2p_initiator_count ||
- !phb_target->p2p_target_count) {
- rc = -EINVAL;
- goto out;
- }
-
- if (--pe_init->p2p_initiator_count == 0)
- pnv_pci_ioda2_set_bypass(pe_init, true);
-
- if (--phb_target->p2p_target_count == 0) {
- rc = opal_pci_set_p2p(phb_init->opal_id,
- phb_target->opal_id, desc,
- pe_init->pe_number);
- if (rc != OPAL_SUCCESS) {
- rc = -EIO;
- goto out;
- }
- }
- }
- rc = 0;
-out:
- mutex_unlock(&p2p_mutex);
- return rc;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_set_p2p);
-
-struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev)
-{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
-
- return of_node_get(hose->dn);
-}
-EXPORT_SYMBOL(pnv_pci_get_phb_node);
-
-int pnv_pci_enable_tunnel(struct pci_dev *dev, u64 *asnind)
-{
- struct device_node *np;
- const __be32 *prop;
- struct pnv_ioda_pe *pe;
- uint16_t window_id;
- int rc;
-
- if (!radix_enabled())
- return -ENXIO;
-
- if (!(np = pnv_pci_get_phb_node(dev)))
- return -ENXIO;
-
- prop = of_get_property(np, "ibm,phb-indications", NULL);
- of_node_put(np);
-
- if (!prop || !prop[1])
- return -ENXIO;
-
- *asnind = (u64)be32_to_cpu(prop[1]);
- pe = pnv_ioda_get_pe(dev);
- if (!pe)
- return -ENODEV;
-
- /* Increase real window size to accept as_notify messages. */
- window_id = (pe->pe_number << 1 ) + 1;
- rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id, pe->pe_number,
- window_id, pe->tce_bypass_base,
- (uint64_t)1 << 48);
- return opal_error_code(rc);
-}
-EXPORT_SYMBOL_GPL(pnv_pci_enable_tunnel);
-
-int pnv_pci_disable_tunnel(struct pci_dev *dev)
-{
- struct pnv_ioda_pe *pe;
-
- pe = pnv_ioda_get_pe(dev);
- if (!pe)
- return -ENODEV;
-
- /* Restore default real window size. */
- pnv_pci_ioda2_set_bypass(pe, true);
- return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_disable_tunnel);
-
-int pnv_pci_set_tunnel_bar(struct pci_dev *dev, u64 addr, int enable)
-{
- __be64 val;
- struct pci_controller *hose;
- struct pnv_phb *phb;
- u64 tunnel_bar;
- int rc;
-
- if (!opal_check_token(OPAL_PCI_GET_PBCQ_TUNNEL_BAR))
- return -ENXIO;
- if (!opal_check_token(OPAL_PCI_SET_PBCQ_TUNNEL_BAR))
- return -ENXIO;
-
- hose = pci_bus_to_host(dev->bus);
- phb = hose->private_data;
-
- mutex_lock(&tunnel_mutex);
- rc = opal_pci_get_pbcq_tunnel_bar(phb->opal_id, &val);
- if (rc != OPAL_SUCCESS) {
- rc = -EIO;
- goto out;
- }
- tunnel_bar = be64_to_cpu(val);
- if (enable) {
- /*
- * Only one device per PHB can use atomics.
- * Our policy is first-come, first-served.
- */
- if (tunnel_bar) {
- if (tunnel_bar != addr)
- rc = -EBUSY;
- else
- rc = 0; /* Setting same address twice is ok */
- goto out;
- }
- } else {
- /*
- * The device that owns atomics and wants to release
- * them must pass the same address with enable == 0.
- */
- if (tunnel_bar != addr) {
- rc = -EPERM;
- goto out;
- }
- addr = 0x0ULL;
- }
- rc = opal_pci_set_pbcq_tunnel_bar(phb->opal_id, addr);
- rc = opal_error_code(rc);
-out:
- mutex_unlock(&tunnel_mutex);
- return rc;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_set_tunnel_bar);
-
-#ifdef CONFIG_PPC64 /* for thread.tidr */
-int pnv_pci_get_as_notify_info(struct task_struct *task, u32 *lpid, u32 *pid,
- u32 *tid)
-{
- struct mm_struct *mm = NULL;
-
- if (task == NULL)
- return -EINVAL;
-
- mm = get_task_mm(task);
- if (mm == NULL)
- return -EINVAL;
-
- *pid = mm->context.id;
- mmput(mm);
-
- *tid = task->thread.tidr;
- *lpid = mfspr(SPRN_LPID);
- return 0;
-}
-EXPORT_SYMBOL_GPL(pnv_pci_get_as_notify_info);
-#endif
-
void pnv_pci_shutdown(void)
{
struct pci_controller *hose;
@@ -1079,7 +753,7 @@ void pnv_pci_shutdown(void)
/* Fixup wrong class code in p7ioc and p8 root complex */
static void pnv_p7ioc_rc_quirk(struct pci_dev *dev)
{
- dev->class = PCI_CLASS_BRIDGE_PCI << 8;
+ dev->class = PCI_CLASS_BRIDGE_PCI_NORMAL;
}
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk);
@@ -1093,10 +767,22 @@ void __init pnv_pci_init(void)
if (!firmware_has_feature(FW_FEATURE_OPAL))
return;
- /* Look for IODA IO-Hubs. */
- for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
- pnv_pci_init_ioda_hub(np);
- }
+#ifdef CONFIG_PCIEPORTBUS
+ /*
+ * On PowerNV PCIe devices are (currently) managed in cooperation
+ * with firmware. This isn't *strictly* required, but there are enough
+ * assumptions baked into both firmware and the platform code that
+ * it's unwise to allow the portbus services to be used.
+ *
+ * We need to fix this eventually, but for now set this flag to disable
+ * the portbus driver. The AER service isn't required since AER
+ * events are handled via EEH. The pciehp hotplug driver can't work
+ * without kernel changes (and portbus binding breaks pnv_php). The
+ * other services also require some thinking about how we're going
+ * to integrate them.
+ */
+ pcie_ports_disabled = true;
+#endif
/* Look for ioda2 built-in PHB3's */
for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
@@ -1106,17 +792,6 @@ void __init pnv_pci_init(void)
for_each_compatible_node(np, NULL, "ibm,ioda3-phb")
pnv_pci_init_ioda2_phb(np);
- /* Look for NPU PHBs */
- for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb")
- pnv_pci_init_npu_phb(np);
-
- /*
- * Look for NPU2 PHBs which we treat mostly as NPU PHBs with
- * the exception of TCE kill which requires an OPAL call.
- */
- for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb")
- pnv_pci_init_npu_phb(np);
-
/* Look for NPU2 OpenCAPI PHBs */
for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-opencapi-phb")
pnv_pci_init_npu2_opencapi_phb(np);
@@ -1124,46 +799,3 @@ void __init pnv_pci_init(void)
/* Configure IOMMU DMA hooks */
set_pci_dma_ops(&dma_iommu_ops);
}
-
-static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct device *dev = data;
- struct pci_dev *pdev;
- struct pci_dn *pdn;
- struct pnv_ioda_pe *pe;
- struct pci_controller *hose;
- struct pnv_phb *phb;
-
- switch (action) {
- case BUS_NOTIFY_ADD_DEVICE:
- pdev = to_pci_dev(dev);
- pdn = pci_get_pdn(pdev);
- hose = pci_bus_to_host(pdev->bus);
- phb = hose->private_data;
-
- WARN_ON_ONCE(!phb);
- if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb)
- return 0;
-
- pe = &phb->ioda.pe_array[pdn->pe_number];
- iommu_add_device(&pe->table_group, dev);
- return 0;
- case BUS_NOTIFY_DEL_DEVICE:
- iommu_del_device(dev);
- return 0;
- default:
- return 0;
- }
-}
-
-static struct notifier_block pnv_tce_iommu_bus_nb = {
- .notifier_call = pnv_tce_iommu_bus_notifier,
-};
-
-static int __init pnv_tce_iommu_bus_notifier_init(void)
-{
- bus_register_notifier(&pci_bus_type, &pnv_tce_iommu_bus_nb);
- return 0;
-}
-machine_subsys_initcall_sync(powernv, pnv_tce_iommu_bus_notifier_init);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 8e36da379252..42075501663b 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -2,6 +2,7 @@
#ifndef __POWERNV_PCI_H
#define __POWERNV_PCI_H
+#include <linux/compiler.h> /* for __printf */
#include <linux/iommu.h>
#include <asm/iommu.h>
#include <asm/msi_bitmap.h>
@@ -9,10 +10,8 @@
struct pci_dn;
enum pnv_phb_type {
- PNV_PHB_IODA1 = 0,
- PNV_PHB_IODA2 = 1,
- PNV_PHB_NPU_NVLINK = 2,
- PNV_PHB_NPU_OCAPI = 3,
+ PNV_PHB_IODA2,
+ PNV_PHB_NPU_OCAPI,
};
/* Precise PHB model for error management */
@@ -20,8 +19,6 @@ enum pnv_phb_model {
PNV_PHB_MODEL_UNKNOWN,
PNV_PHB_MODEL_P7IOC,
PNV_PHB_MODEL_PHB3,
- PNV_PHB_MODEL_NPU,
- PNV_PHB_MODEL_NPU2,
};
#define PNV_PCI_DIAG_BUF_SIZE 8192
@@ -32,6 +29,24 @@ enum pnv_phb_model {
#define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
+/*
+ * A brief note on PNV_IODA_PE_BUS_ALL
+ *
+ * This is needed because of the behaviour of PCIe-to-PCI bridges. The PHB uses
+ * the Requester ID field of the PCIe request header to determine the device
+ * (and PE) that initiated a DMA. In legacy PCI individual memory read/write
+ * requests aren't tagged with the RID. To work around this the PCIe-to-PCI
+ * bridge will use (secondary_bus_no << 8) | 0x00 as the RID on the PCIe side.
+ *
+ * PCIe-to-X bridges have a similar issue even though PCI-X requests also have
+ * a RID in the transaction header. The PCIe-to-X bridge is permitted to "take
+ * ownership" of a transaction issued by a PCI-X device when forwarding it to
+ * the PCIe side of the bridge.
+ *
+ * To work around these problems we use the BUS_ALL flag since every subordinate
+ * bus of the bridge should go into the same PE.
+ */
+
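A small sketch of the Requester ID composition described above (illustrative only; helper names are hypothetical):

	#include <linux/types.h>

	static inline u16 pci_rid(u8 bus, u8 devfn)
	{
		return (bus << 8) | devfn;
	}

	/* A PCIe-to-PCI bridge forwards legacy DMA as (secondary_bus_no << 8) | 0x00 */
	static inline u16 bridge_rid(u8 secondary_bus_no)
	{
		return pci_rid(secondary_bus_no, 0x00);
	}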
/* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */
#define PNV_IODA_STOPPED_STATE 0x8000000000000000
@@ -62,13 +77,19 @@ struct pnv_ioda_pe {
/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
struct iommu_table_group table_group;
- struct npu_comp *npucomp;
/* 64-bit TCE bypass region */
bool tce_bypass_enabled;
uint64_t tce_bypass_base;
- /* MSIs. MVE index is identical for for 32 and 64 bit MSI
+ /*
+ * Used to track whether we've done DMA setup for this PE or not. We
+ * want to defer allocating TCE tables, etc until we've added a
+ * non-bridge device to the PE.
+ */
+ bool dma_setup_done;
+
+ /* MSIs. MVE index is identical for 32 and 64 bit MSI
* and -1 if not supported. (It's actually identical to the
* PE number)
*/
@@ -78,9 +99,6 @@ struct pnv_ioda_pe {
struct pnv_ioda_pe *master;
struct list_head slaves;
- /* PCI peer-to-peer*/
- int p2p_initiator_count;
-
/* Link in list of PE#s */
struct list_head list;
};
@@ -96,7 +114,6 @@ struct pnv_phb {
int flags;
void __iomem *regs;
u64 regs_phys;
- int initialized;
spinlock_t lock;
#ifdef CONFIG_DEBUG_FS
@@ -105,12 +122,7 @@ struct pnv_phb {
#endif
unsigned int msi_base;
- unsigned int msi32_support;
struct msi_bitmap msi_bmp;
- int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev,
- unsigned int hwirq, unsigned int virq,
- unsigned int is_64, struct msi_msg *msg);
- void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
int (*init_m64)(struct pnv_phb *phb);
int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
@@ -121,7 +133,6 @@ struct pnv_phb {
unsigned int total_pe_num;
unsigned int reserved_pe_idx;
unsigned int root_pe_idx;
- bool root_pe_populated;
/* 32-bit MMIO window */
unsigned int m32_size;
@@ -133,6 +144,7 @@ struct pnv_phb {
unsigned long m64_size;
unsigned long m64_segsize;
unsigned long m64_base;
+#define MAX_M64_BARS 64
unsigned long m64_bar_alloc;
/* IO ports */
@@ -150,12 +162,7 @@ struct pnv_phb {
unsigned int *m32_segmap;
unsigned int *io_segmap;
- /* DMA32 segment maps - IODA1 only */
- unsigned int dma32_count;
- unsigned int *dma32_segmap;
-
/* IRQ chip */
- int irq_chip_init;
struct irq_chip irq_chip;
/* Sorted list of used PE's based
@@ -171,10 +178,91 @@ struct pnv_phb {
/* PHB and hub diagnostics */
unsigned int diag_data_size;
u8 *diag_data;
+};
+
+
+/* IODA PE management */
+
+static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
+{
+ /*
+ * WARNING: We cannot rely on the resource flags. The Linux PCI
+ * allocation code sometimes decides to put a 64-bit prefetchable
+ * BAR in the 32-bit window, so we have to compare the addresses.
+ *
+ * For simplicity we only test resource start.
+ */
+ return (r->start >= phb->ioda.m64_base &&
+ r->start < (phb->ioda.m64_base + phb->ioda.m64_size));
+}
+
+static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags)
+{
+ unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH);
+
+ return (resource_flags & flags) == flags;
+}
+
+int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+
+void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
+void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
+
+struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count);
+void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
- int p2p_target_count;
+#ifdef CONFIG_PCI_IOV
+/*
+ * For SR-IOV we want to put each VF's MMIO resource into a separate PE.
+ * This requires a bit of acrobatics with the MMIO -> PE configuration
+ * and this structure is used to keep track of it all.
+ */
+struct pnv_iov_data {
+ /* number of VFs enabled */
+ u16 num_vfs;
+
+	/* pointer to the array of VF PEs. num_vfs long */
+ struct pnv_ioda_pe *vf_pe_arr;
+
+ /* Did we map the VF BAR with single-PE IODA BARs? */
+ bool m64_single_mode[PCI_SRIOV_NUM_BARS];
+
+ /*
+	 * True if we're using any segmented windows. In that case we need to
+	 * shift the start of the IOV resource to the segment corresponding to
+	 * the allocated PE.
+ */
+ bool need_shift;
+
+ /*
+ * Bit mask used to track which m64 windows are used to map the
+ * SR-IOV BARs for this device.
+ */
+ DECLARE_BITMAP(used_m64_bar_mask, MAX_M64_BARS);
+
+ /*
+ * If we map the SR-IOV BARs with a segmented window then
+ * parts of that window will be "claimed" by other PEs.
+ *
+ * "holes" here is used to reserve the leading portion
+ * of the window that is used by other (non VF) PEs.
+ */
+ struct resource holes[PCI_SRIOV_NUM_BARS];
};
+static inline struct pnv_iov_data *pnv_iov_get(struct pci_dev *pdev)
+{
+ return pdev->dev.archdata.iov_data;
+}
+
+void pnv_pci_ioda_fixup_iov(struct pci_dev *pdev);
+resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, int resno);
+
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs);
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev);
+#endif /* CONFIG_PCI_IOV */
+
extern struct pci_ops pnv_pci_ops;
void pnv_pci_dump_phb_diag_data(struct pci_controller *hose,
@@ -185,25 +273,18 @@ int pnv_pci_cfg_write(struct pci_dn *pdn,
int where, int size, u32 val);
extern struct iommu_table *pnv_pci_table_alloc(int nid);
-extern void pnv_pci_init_ioda_hub(struct device_node *np);
extern void pnv_pci_init_ioda2_phb(struct device_node *np);
-extern void pnv_pci_init_npu_phb(struct device_node *np);
extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
-extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr);
extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
-extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
-extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
-extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
-extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn);
extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
-extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
-extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
__u64 window_size, __u32 levels);
extern int pnv_eeh_post_init(void);
+__printf(3, 4)
extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...);
#define pe_err(pe, fmt, ...) \
@@ -213,17 +294,8 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
#define pe_info(pe, fmt, ...) \
pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
-/* Nvlink functions */
-extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
-extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
-extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
-extern struct iommu_table_group *pnv_try_setup_npu_table_group(
- struct pnv_ioda_pe *pe);
-extern struct iommu_table_group *pnv_npu_compound_attach(
- struct pnv_ioda_pe *pe);
-
/* pci-ioda-tce.c */
-#define POWERNV_IOMMU_DEFAULT_LEVELS 1
+#define POWERNV_IOMMU_DEFAULT_LEVELS 2
#define POWERNV_IOMMU_MAX_LEVELS 5
extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
@@ -231,8 +303,7 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
unsigned long attrs);
extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages);
extern int pnv_tce_xchg(struct iommu_table *tbl, long index,
- unsigned long *hpa, enum dma_data_direction *direction,
- bool alloc);
+ unsigned long *hpa, enum dma_data_direction *direction);
extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index,
bool alloc);
extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index);
@@ -251,4 +322,16 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift);
+extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
+
+static inline struct pnv_phb *pci_bus_to_pnvhb(struct pci_bus *bus)
+{
+ struct pci_controller *hose = bus->sysdata;
+
+ if (hose)
+ return hose->private_data;
+
+ return NULL;
+}
+
#endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index fd4a1c5a6369..866efdc103fd 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -2,6 +2,13 @@
#ifndef _POWERNV_H
#define _POWERNV_H
+/*
+ * There are various hacks scattered throughout the generic powerpc arch code
+ * that need to call into powernv platform stuff. The prototypes for those
+ * functions are in asm/powernv.h
+ */
+#include <asm/powernv.h>
+
#ifdef CONFIG_SMP
extern void pnv_smp_init(void);
#else
@@ -30,4 +37,11 @@ extern void opal_event_shutdown(void);
bool cpu_core_split_required(void);
+struct memcons;
+ssize_t memcons_copy(struct memcons *mc, char *to, loff_t pos, size_t count);
+u32 __init memcons_get_size(struct memcons *mc);
+struct memcons *__init memcons_init(struct device_node *node, const char *mc_prop_name);
+
+void pnv_rng_init(void);
+
#endif /* _POWERNV_H */
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 718f50ed22f1..196aa70fe043 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2013, Michael Ellerman, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "powernv-rng: " fmt
@@ -21,33 +17,28 @@
#include <asm/prom.h>
#include <asm/machdep.h>
#include <asm/smp.h>
+#include "powernv.h"
#define DARN_ERR 0xFFFFFFFFFFFFFFFFul
-struct powernv_rng {
+struct pnv_rng {
void __iomem *regs;
void __iomem *regs_real;
unsigned long mask;
};
-static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
+static DEFINE_PER_CPU(struct pnv_rng *, pnv_rng);
-
-int powernv_hwrng_present(void)
-{
- struct powernv_rng *rng;
-
- rng = get_cpu_var(powernv_rng);
- put_cpu_var(rng);
- return rng != NULL;
-}
-
-static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
+static unsigned long rng_whiten(struct pnv_rng *rng, unsigned long val)
{
unsigned long parity;
/* Calculate the parity of the value */
- asm ("popcntd %0,%1" : "=r" (parity) : "r" (val));
+ asm (".machine push; \
+ .machine power7; \
+ popcntd %0,%1; \
+ .machine pop;"
+ : "=r" (parity) : "r" (val));
/* xor our value with the previous mask */
val ^= rng->mask;
@@ -58,18 +49,7 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
return val;
}
-int powernv_get_random_real_mode(unsigned long *v)
-{
- struct powernv_rng *rng;
-
- rng = raw_cpu_read(powernv_rng);
-
- *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
-
- return 1;
-}
-
-int powernv_get_random_darn(unsigned long *v)
+static int pnv_get_random_darn(unsigned long *v)
{
unsigned long val;
@@ -84,7 +64,7 @@ int powernv_get_random_darn(unsigned long *v)
return 1;
}
-static int initialise_darn(void)
+static int __init initialise_darn(void)
{
unsigned long val;
int i;
@@ -93,32 +73,31 @@ static int initialise_darn(void)
return -ENODEV;
for (i = 0; i < 10; i++) {
- if (powernv_get_random_darn(&val)) {
- ppc_md.get_random_seed = powernv_get_random_darn;
+ if (pnv_get_random_darn(&val)) {
+ ppc_md.get_random_seed = pnv_get_random_darn;
return 0;
}
}
-
- pr_warn("Unable to use DARN for get_random_seed()\n");
-
return -EIO;
}
-int powernv_get_random_long(unsigned long *v)
+int pnv_get_random_long(unsigned long *v)
{
- struct powernv_rng *rng;
-
- rng = get_cpu_var(powernv_rng);
-
- *v = rng_whiten(rng, in_be64(rng->regs));
-
- put_cpu_var(rng);
-
+ struct pnv_rng *rng;
+
+ if (mfmsr() & MSR_DR) {
+ rng = get_cpu_var(pnv_rng);
+ *v = rng_whiten(rng, in_be64(rng->regs));
+ put_cpu_var(rng);
+ } else {
+ rng = raw_cpu_read(pnv_rng);
+ *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
+ }
return 1;
}
-EXPORT_SYMBOL_GPL(powernv_get_random_long);
+EXPORT_SYMBOL_GPL(pnv_get_random_long);
-static __init void rng_init_per_cpu(struct powernv_rng *rng,
+static __init void rng_init_per_cpu(struct pnv_rng *rng,
struct device_node *dn)
{
int chip_id, cpu;
@@ -128,16 +107,16 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng,
pr_warn("No ibm,chip-id found for %pOF.\n", dn);
for_each_possible_cpu(cpu) {
- if (per_cpu(powernv_rng, cpu) == NULL ||
+ if (per_cpu(pnv_rng, cpu) == NULL ||
cpu_to_chip_id(cpu) == chip_id) {
- per_cpu(powernv_rng, cpu) = rng;
+ per_cpu(pnv_rng, cpu) = rng;
}
}
}
static __init int rng_create(struct device_node *dn)
{
- struct powernv_rng *rng;
+ struct pnv_rng *rng;
struct resource res;
unsigned long val;
@@ -163,32 +142,59 @@ static __init int rng_create(struct device_node *dn)
rng_init_per_cpu(rng, dn);
- pr_info_once("Registering arch random hook.\n");
-
- ppc_md.get_random_seed = powernv_get_random_long;
+ ppc_md.get_random_seed = pnv_get_random_long;
return 0;
}
-static __init int rng_init(void)
+static int __init pnv_get_random_long_early(unsigned long *v)
{
struct device_node *dn;
- int rc;
-
- for_each_compatible_node(dn, NULL, "ibm,power-rng") {
- rc = rng_create(dn);
- if (rc) {
- pr_err("Failed creating rng for %pOF (%d).\n",
- dn, rc);
- continue;
- }
- /* Create devices for hwrng driver */
- of_platform_device_create(dn, NULL, NULL);
- }
+ if (!slab_is_available())
+ return 0;
+
+ if (cmpxchg(&ppc_md.get_random_seed, pnv_get_random_long_early,
+ NULL) != pnv_get_random_long_early)
+ return 0;
+
+ for_each_compatible_node(dn, NULL, "ibm,power-rng")
+ rng_create(dn);
+
+ if (!ppc_md.get_random_seed)
+ return 0;
+ return ppc_md.get_random_seed(v);
+}
+
+void __init pnv_rng_init(void)
+{
+ struct device_node *dn;
+
+ /* Prefer darn over the rest. */
+ if (!initialise_darn())
+ return;
+
+ dn = of_find_compatible_node(NULL, NULL, "ibm,power-rng");
+ if (dn)
+ ppc_md.get_random_seed = pnv_get_random_long_early;
+
+ of_node_put(dn);
+}
- initialise_darn();
+static int __init pnv_rng_late_init(void)
+{
+ struct device_node *dn;
+ unsigned long v;
+
+ /* In case it wasn't called during init for some other reason. */
+ if (ppc_md.get_random_seed == pnv_get_random_long_early)
+ pnv_get_random_long_early(&v);
+
+ if (ppc_md.get_random_seed == pnv_get_random_long) {
+ for_each_compatible_node(dn, NULL, "ibm,power-rng")
+ of_platform_device_create(dn, NULL, NULL);
+ }
return 0;
}
-machine_subsys_initcall(powernv, rng_init);
+machine_subsys_initcall(powernv, pnv_rng_late_init);
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 14befee4b3f1..4dbb47ddbdcc 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PowerNV setup code.
*
* Copyright 2011 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#undef DEBUG
@@ -21,6 +17,7 @@
#include <linux/console.h>
#include <linux/delay.h>
#include <linux/irq.h>
+#include <linux/seq_buf.h>
#include <linux/seq_file.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
@@ -28,6 +25,7 @@
#include <linux/bug.h>
#include <linux/pci.h>
#include <linux/cpufreq.h>
+#include <linux/memblock.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
@@ -43,7 +41,7 @@
#include "powernv.h"
-static bool fw_feature_is(const char *state, const char *name,
+static bool __init fw_feature_is(const char *state, const char *name,
struct device_node *fw_features)
{
struct device_node *np;
@@ -58,7 +56,7 @@ static bool fw_feature_is(const char *state, const char *name,
return rc;
}
-static void init_fw_feat_flags(struct device_node *np)
+static void __init init_fw_feat_flags(struct device_node *np)
{
if (fw_feature_is("enabled", "inst-spec-barrier-ori31,31,0", np))
security_ftr_set(SEC_FTR_SPEC_BAR_ORI31);
@@ -99,9 +97,18 @@ static void init_fw_feat_flags(struct device_node *np)
if (fw_feature_is("disabled", "needs-spec-barrier-for-bound-checks", np))
security_ftr_clear(SEC_FTR_BNDS_CHK_SPEC_BAR);
+
+ if (fw_feature_is("enabled", "no-need-l1d-flush-msr-pr-1-to-0", np))
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY);
+
+ if (fw_feature_is("enabled", "no-need-l1d-flush-kernel-on-user-access", np))
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS);
+
+ if (fw_feature_is("enabled", "no-need-store-drain-on-priv-state-switch", np))
+ security_ftr_clear(SEC_FTR_STF_BARRIER);
}
-static void pnv_setup_rfi_flush(void)
+static void __init pnv_setup_security_mitigations(void)
{
struct device_node *np, *fw_features;
enum l1d_flush_type type;
@@ -125,27 +132,68 @@ static void pnv_setup_rfi_flush(void)
type = L1D_FLUSH_ORI;
}
+ /*
+ * The issues addressed by the entry and uaccess flush don't affect P7
+ * or P8, so on bare metal disable them explicitly in case firmware does
+ * not include the features to disable them. POWER9 and newer processors
+ * should have the appropriate firmware flags.
+ */
+ if (pvr_version_is(PVR_POWER7) || pvr_version_is(PVR_POWER7p) ||
+ pvr_version_is(PVR_POWER8E) || pvr_version_is(PVR_POWER8NVL) ||
+ pvr_version_is(PVR_POWER8)) {
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_ENTRY);
+ security_ftr_clear(SEC_FTR_L1D_FLUSH_UACCESS);
+ }
+
enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && \
(security_ftr_enabled(SEC_FTR_L1D_FLUSH_PR) || \
security_ftr_enabled(SEC_FTR_L1D_FLUSH_HV));
setup_rfi_flush(type, enable);
setup_count_cache_flush();
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_ENTRY);
+ setup_entry_flush(enable);
+
+ enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) &&
+ security_ftr_enabled(SEC_FTR_L1D_FLUSH_UACCESS);
+ setup_uaccess_flush(enable);
+
+ setup_stf_barrier();
+}
+
+static void __init pnv_check_guarded_cores(void)
+{
+ struct device_node *dn;
+ int bad_count = 0;
+
+ for_each_node_by_type(dn, "cpu") {
+ if (of_property_match_string(dn, "status", "bad") >= 0)
+ bad_count++;
+ }
+
+ if (bad_count) {
+ printk(" _ _______________\n");
+ pr_cont(" | | / \\\n");
+ pr_cont(" | | | WARNING! |\n");
+ pr_cont(" | | | |\n");
+ pr_cont(" | | | It looks like |\n");
+ pr_cont(" |_| | you have %*d |\n", 3, bad_count);
+ pr_cont(" _ | guarded cores |\n");
+ pr_cont(" (_) \\_______________/\n");
+ }
}
static void __init pnv_setup_arch(void)
{
set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
- pnv_setup_rfi_flush();
- setup_stf_barrier();
+ pnv_setup_security_mitigations();
/* Initialize SMP */
pnv_smp_init();
- /* Setup PCI */
- pnv_pci_init();
-
/* Setup RTC and NVRAM callbacks */
if (firmware_has_feature(FW_FEATURE_OPAL))
opal_nvram_init();
@@ -153,11 +201,36 @@ static void __init pnv_setup_arch(void)
/* Enable NAP mode */
powersave_nap = 1;
+ pnv_check_guarded_cores();
+
/* XXX PMCS */
+
+ pnv_rng_init();
+}
+
+static void __init pnv_add_hw_description(void)
+{
+ struct device_node *dn;
+ const char *s;
+
+ dn = of_find_node_by_path("/ibm,opal/firmware");
+ if (!dn)
+ return;
+
+ if (of_property_read_string(dn, "version", &s) == 0 ||
+ of_property_read_string(dn, "git-id", &s) == 0)
+ seq_buf_printf(&ppc_hw_desc, "opal:%s ", s);
+
+ if (of_property_read_string(dn, "mi-version", &s) == 0)
+ seq_buf_printf(&ppc_hw_desc, "mi:%s ", s);
+
+ of_node_put(dn);
}
static void __init pnv_init(void)
{
+ pnv_add_hw_description();
+
/*
* Initialize the LPC bus now so that legacy serial
* ports can be found on it
@@ -170,6 +243,21 @@ static void __init pnv_init(void)
else
#endif
add_preferred_console("hvc", 0, NULL);
+
+#ifdef CONFIG_PPC_64S_HASH_MMU
+ if (!radix_enabled()) {
+ size_t size = sizeof(struct slb_entry) * mmu_slb_size;
+ int i;
+
+ /* Allocate per cpu area to save old slb contents during MCE */
+ for_each_possible_cpu(i) {
+ paca_ptrs[i]->mce_faulty_slbs =
+ memblock_alloc_node(size,
+ __alignof__(struct slb_entry),
+ cpu_to_node(i));
+ }
+ }
+#endif
}
static void __init pnv_init_IRQ(void)
@@ -224,10 +312,16 @@ static void __noreturn pnv_restart(char *cmd)
pnv_prepare_going_down();
do {
- if (!cmd)
+ if (!cmd || !strlen(cmd))
rc = opal_cec_reboot();
else if (strcmp(cmd, "full") == 0)
rc = opal_cec_reboot2(OPAL_REBOOT_FULL_IPL, NULL);
+ else if (strcmp(cmd, "mpipl") == 0)
+ rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, NULL);
+ else if (strcmp(cmd, "error") == 0)
+ rc = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, NULL);
+ else if (strcmp(cmd, "fast") == 0)
+ rc = opal_cec_reboot2(OPAL_REBOOT_FAST, NULL);
else
rc = OPAL_UNSUPPORTED;
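For reference (a hedged userspace sketch, not part of the patch): the command strings handled above can be supplied via the reboot() syscall's RESTART2 variant, which is how the string eventually reaches pnv_restart():

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/reboot.h>

	/* Request an MPIPL (memory preserving) reboot; requires CAP_SYS_BOOT. */
	static int request_mpipl_reboot(void)
	{
		return syscall(SYS_reboot, LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2,
			       LINUX_REBOOT_CMD_RESTART2, "mpipl");
	}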
@@ -385,10 +479,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
}
#endif /* CONFIG_KEXEC_CORE */
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
static unsigned long pnv_memory_block_size(void)
{
- return 256UL * 1024 * 1024;
+ return memory_block_size;
}
#endif
@@ -401,15 +495,15 @@ static void __init pnv_setup_machdep_opal(void)
/* ppc_md.system_reset_exception gets filled in by pnv_smp_init() */
ppc_md.machine_check_exception = opal_machine_check;
ppc_md.mce_check_early_recovery = opal_mce_check_early_recovery;
- ppc_md.hmi_exception_early = opal_hmi_exception_early;
+ if (opal_check_token(OPAL_HANDLE_HMI2))
+ ppc_md.hmi_exception_early = opal_hmi_exception_early2;
+ else
+ ppc_md.hmi_exception_early = opal_hmi_exception_early;
ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
}
static int __init pnv_probe(void)
{
- if (!of_machine_is_compatible("ibm,powernv"))
- return 0;
-
if (firmware_has_feature(FW_FEATURE_OPAL))
pnv_setup_machdep_opal();
@@ -473,20 +567,21 @@ static long pnv_machine_check_early(struct pt_regs *regs)
define_machine(powernv) {
.name = "PowerNV",
+ .compatible = "ibm,powernv",
.probe = pnv_probe,
.setup_arch = pnv_setup_arch,
.init_IRQ = pnv_init_IRQ,
.show_cpuinfo = pnv_show_cpuinfo,
.get_proc_freq = pnv_get_proc_freq,
+ .discover_phbs = pnv_pci_init,
.progress = pnv_progress,
.machine_shutdown = pnv_shutdown,
.power_save = NULL,
- .calibrate_decr = generic_calibrate_decr,
.machine_check_early = pnv_machine_check_early,
#ifdef CONFIG_KEXEC_CORE
.kexec_cpu_down = pnv_kexec_cpu_down,
#endif
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
+#ifdef CONFIG_MEMORY_HOTPLUG
.memory_block_size = pnv_memory_block_size,
#endif
};
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 0d354e19ef92..8f41ef364fc6 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SMP support for PowerNV machines.
*
* Copyright 2011 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -32,13 +28,15 @@
#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/runlatch.h>
-#include <asm/code-patching.h>
+#include <asm/text-patching.h>
#include <asm/dbell.h>
#include <asm/kvm_ppc.h>
#include <asm/ppc-opcode.h>
#include <asm/cpuidle.h>
#include <asm/kexec.h>
#include <asm/reg.h>
+#include <asm/powernv.h>
+#include <asm/systemcfg.h>
#include "powernv.h"
@@ -46,7 +44,7 @@
#include <asm/udbg.h>
#define DBG(fmt...) udbg_printf(fmt)
#else
-#define DBG(fmt...)
+#define DBG(fmt...) do { } while (0)
#endif
static void pnv_smp_setup_cpu(int cpu)
@@ -139,32 +137,42 @@ static int pnv_smp_cpu_disable(void)
* the generic fixup_irqs. --BenH.
*/
set_cpu_online(cpu, false);
- vdso_data->processorCount--;
+#ifdef CONFIG_PPC64_PROC_SYSTEMCFG
+ systemcfg->processorCount--;
+#endif
if (cpu == boot_cpuid)
boot_cpuid = cpumask_any(cpu_online_mask);
if (xive_enabled())
xive_smp_disable_cpu();
else
xics_migrate_irqs_away();
+
+ cleanup_cpu_mmu_context();
+
return 0;
}
-static void pnv_smp_cpu_kill_self(void)
+static void pnv_flush_interrupts(void)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (xive_enabled())
+ xive_flush_interrupt();
+ else
+ icp_opal_flush_interrupt();
+ } else {
+ icp_native_flush_interrupt();
+ }
+}
+
+static void pnv_cpu_offline_self(void)
{
+ unsigned long srr1, unexpected_mask, wmask;
unsigned int cpu;
- unsigned long srr1, wmask;
+ u64 lpcr_val;
/* Standard hot unplug procedure */
- /*
- * This hard disables local interurpts, ensuring we have no lazy
- * irqs pending.
- */
- WARN_ON(irqs_disabled());
- hard_irq_disable();
- WARN_ON(lazy_irq_pending());
idle_task_exit();
- current->active_mm = NULL; /* for sanity */
cpu = smp_processor_id();
DBG("CPU%d offline\n", cpu);
generic_set_cpu_dead(cpu);
@@ -174,6 +182,40 @@ static void pnv_smp_cpu_kill_self(void)
if (cpu_has_feature(CPU_FTR_ARCH_207S))
wmask = SRR1_WAKEMASK_P8;
+ /*
+ * This turns the irq soft-disabled state we're called with, into a
+ * hard-disabled state with pending irq_happened interrupts cleared.
+ *
+ * PACA_IRQ_DEC - Decrementer should be ignored.
+ * PACA_IRQ_HMI - Can be ignored, processing is done in real mode.
+ * PACA_IRQ_DBELL, EE, PMI - Unexpected.
+ */
+ hard_irq_disable();
+ if (generic_check_cpu_restart(cpu))
+ goto out;
+
+ unexpected_mask = ~(PACA_IRQ_DEC | PACA_IRQ_HMI | PACA_IRQ_HARD_DIS);
+ if (local_paca->irq_happened & unexpected_mask) {
+ if (local_paca->irq_happened & PACA_IRQ_EE)
+ pnv_flush_interrupts();
+ DBG("CPU%d Unexpected exit while offline irq_happened=%lx!\n",
+ cpu, local_paca->irq_happened);
+ }
+ local_paca->irq_happened = PACA_IRQ_HARD_DIS;
+
+ /*
+ * We don't want to take decrementer interrupts while we are
+ * offline, so clear LPCR:PECE1. We keep PECE2 (and
+ * LPCR_PECE_HVEE on P9) enabled so as to let IPIs in.
+ *
+ * If the CPU gets woken up by a special wakeup, ensure that
+ * the SLW engine sets LPCR with decrementer bit cleared, else
+ * the CPU will come back to the kernel due to a spurious
+ * wakeup.
+ */
+ lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
+ pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
+
while (!generic_check_cpu_restart(cpu)) {
/*
* Clear IPI flag, since we don't handle IPIs while
@@ -182,10 +224,11 @@ static void pnv_smp_cpu_kill_self(void)
* for coming online, which are handled via
* generic_check_cpu_restart() calls.
*/
- kvmppc_set_host_ipi(cpu, 0);
+ kvmppc_clear_host_ipi(cpu);
srr1 = pnv_cpu_offline(cpu);
+ WARN_ON_ONCE(!irqs_disabled());
WARN_ON(lazy_irq_pending());
/*
@@ -201,13 +244,7 @@ static void pnv_smp_cpu_kill_self(void)
*/
if (((srr1 & wmask) == SRR1_WAKEEE) ||
((srr1 & wmask) == SRR1_WAKEHVI)) {
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- if (xive_enabled())
- xive_flush_interrupt();
- else
- icp_opal_flush_interrupt();
- } else
- icp_native_flush_interrupt();
+ pnv_flush_interrupts();
} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
asm volatile(PPC_MSGCLR(%0) : : "r" (msg));
@@ -246,6 +283,16 @@ static void pnv_smp_cpu_kill_self(void)
}
+ /*
+ * Re-enable decrementer interrupts in LPCR.
+ *
+ * Further, we want stop states to be woken up by decrementer
+ * for non-hotplug cases. So program the LPCR via stop api as
+ * well.
+ */
+ lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1;
+ pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val);
+out:
DBG("CPU%d coming online...\n", cpu);
}
@@ -301,7 +348,7 @@ static void __init pnv_smp_probe(void)
}
}
-static int pnv_system_reset_exception(struct pt_regs *regs)
+noinstr static int pnv_system_reset_exception(struct pt_regs *regs)
{
if (smp_handle_nmi_ipi(regs))
return 1;
@@ -376,6 +423,7 @@ static struct smp_ops_t pnv_smp_ops = {
#ifdef CONFIG_HOTPLUG_CPU
.cpu_disable = pnv_smp_cpu_disable,
.cpu_die = generic_cpu_die,
+ .cpu_offline_self = pnv_cpu_offline_self,
#endif /* CONFIG_HOTPLUG_CPU */
};
@@ -389,8 +437,7 @@ void __init pnv_smp_init(void)
smp_ops = &pnv_smp_ops;
#ifdef CONFIG_HOTPLUG_CPU
- ppc_md.cpu_die = pnv_smp_cpu_kill_self;
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_CRASH_DUMP
crash_wake_offline = 1;
#endif
#endif
diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S b/arch/powerpc/platforms/powernv/subcore-asm.S
index 39bb24aa8f34..e038f6761790 100644
--- a/arch/powerpc/platforms/powernv/subcore-asm.S
+++ b/arch/powerpc/platforms/powernv/subcore-asm.S
@@ -1,10 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <asm/asm-offsets.h>
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
index 45563004feda..393e747541fb 100644
--- a/arch/powerpc/platforms/powernv/subcore.c
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2013, Michael (Ellerman|Neuling), IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "powernv: " fmt
@@ -24,6 +20,8 @@
#include <asm/opal.h>
#include <asm/smp.h>
+#include <trace/events/ipi.h>
+
#include "subcore.h"
#include "powernv.h"
@@ -173,6 +171,16 @@ static void update_hid_in_slw(u64 hid0)
}
}
+static inline void update_power8_hid0(unsigned long hid0)
+{
+ /*
+	 * The HID0 update on Power8 should at the very least be
+	 * preceded by a SYNC instruction and followed by an ISYNC
+	 * instruction.
+ */
+ asm volatile("sync; mtspr %0,%1; isync":: "i"(SPRN_HID0), "r"(hid0));
+}
+
static void unsplit_core(void)
{
u64 hid0, mask;
@@ -183,7 +191,7 @@ static void unsplit_core(void)
cpu = smp_processor_id();
if (cpu_thread_in_core(cpu) != 0) {
while (mfspr(SPRN_HID0) & mask)
- power7_idle_insn(PNV_THREAD_NAP);
+ power7_idle_type(PNV_THREAD_NAP);
per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
return;
@@ -409,13 +417,16 @@ static DEVICE_ATTR(subcores_per_core, 0644,
static int subcore_init(void)
{
+ struct device *dev_root;
unsigned pvr_ver;
+ int rc = 0;
pvr_ver = PVR_VER(mfspr(SPRN_PVR));
if (pvr_ver != PVR_POWER8 &&
pvr_ver != PVR_POWER8E &&
- pvr_ver != PVR_POWER8NVL)
+ pvr_ver != PVR_POWER8NVL &&
+ pvr_ver != PVR_HX_C2000)
return 0;
/*
@@ -429,7 +440,11 @@ static int subcore_init(void)
set_subcores_per_core(1);
- return device_create_file(cpu_subsys.dev_root,
- &dev_attr_subcores_per_core);
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ rc = device_create_file(dev_root, &dev_attr_subcores_per_core);
+ put_device(dev_root);
+ }
+ return rc;
}
machine_device_initcall(powernv, subcore_init);
diff --git a/arch/powerpc/platforms/powernv/subcore.h b/arch/powerpc/platforms/powernv/subcore.h
index 84e02ae52895..413fd85d9bc2 100644
--- a/arch/powerpc/platforms/powernv/subcore.h
+++ b/arch/powerpc/platforms/powernv/subcore.h
@@ -1,10 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright 2013, Michael Ellerman, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/* These are ordered and tested with <= */
@@ -13,13 +9,13 @@
#define SYNC_STEP_REAL_MODE 2 /* Set by secondary when in real mode */
#define SYNC_STEP_FINISHED 3 /* Set by secondary when split/unsplit is done */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
#ifdef CONFIG_SMP
void split_core_secondary_loop(u8 *state);
extern void update_subcore_sibling_mask(void);
#else
-static inline void update_subcore_sibling_mask(void) { };
+static inline void update_subcore_sibling_mask(void) { }
#endif /* CONFIG_SMP */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
diff --git a/arch/powerpc/platforms/powernv/ultravisor.c b/arch/powerpc/platforms/powernv/ultravisor.c
new file mode 100644
index 000000000000..c526871a1229
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ultravisor.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ultravisor high level interfaces
+ *
+ * Copyright 2019, IBM Corporation.
+ *
+ */
+#include <linux/init.h>
+#include <linux/printk.h>
+#include <linux/of_fdt.h>
+#include <linux/of.h>
+
+#include <asm/ultravisor.h>
+#include <asm/firmware.h>
+#include <asm/machdep.h>
+
+#include "powernv.h"
+
+static struct kobject *ultravisor_kobj;
+
+int __init early_init_dt_scan_ultravisor(unsigned long node, const char *uname,
+ int depth, void *data)
+{
+ if (!of_flat_dt_is_compatible(node, "ibm,ultravisor"))
+ return 0;
+
+ powerpc_firmware_features |= FW_FEATURE_ULTRAVISOR;
+ pr_debug("Ultravisor detected!\n");
+ return 1;
+}
+
+static struct memcons *uv_memcons;
+
+static ssize_t uv_msglog_read(struct file *file, struct kobject *kobj,
+ const struct bin_attribute *bin_attr, char *to,
+ loff_t pos, size_t count)
+{
+ return memcons_copy(uv_memcons, to, pos, count);
+}
+
+static struct bin_attribute uv_msglog_attr __ro_after_init = {
+ .attr = {.name = "msglog", .mode = 0400},
+ .read = uv_msglog_read
+};
+
+static int __init uv_init(void)
+{
+ struct device_node *node;
+
+ if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR))
+ return 0;
+
+ node = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
+ if (!node)
+ return -ENODEV;
+
+ uv_memcons = memcons_init(node, "memcons");
+ of_node_put(node);
+ if (!uv_memcons)
+ return -ENOENT;
+
+ uv_msglog_attr.size = memcons_get_size(uv_memcons);
+
+ ultravisor_kobj = kobject_create_and_add("ultravisor", firmware_kobj);
+ if (!ultravisor_kobj)
+ return -ENOMEM;
+
+ return sysfs_create_bin_file(ultravisor_kobj, &uv_msglog_attr);
+}
+machine_subsys_initcall(powernv, uv_init);
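Since firmware_kobj corresponds to /sys/firmware, the message log created above should appear as /sys/firmware/ultravisor/msglog. A minimal userspace reader, for illustration only:

	#include <stdio.h>

	int main(void)
	{
		char buf[4096];
		size_t n;
		FILE *f = fopen("/sys/firmware/ultravisor/msglog", "r");

		if (!f)
			return 1;
		while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
			fwrite(buf, 1, n, stdout);
		fclose(f);
		return 0;
	}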
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c
index 4d3929fbc08f..3ce89a4b54be 100644
--- a/arch/powerpc/platforms/powernv/vas-debug.c
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2016-17 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "vas: " fmt
@@ -13,6 +9,7 @@
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <asm/vas.h>
#include "vas.h"
static struct dentry *vas_debugfs;
@@ -32,7 +29,7 @@ static char *cop_to_str(int cop)
static int info_show(struct seq_file *s, void *private)
{
- struct vas_window *window = s->private;
+ struct pnv_vas_window *window = s->private;
mutex_lock(&vas_mutex);
@@ -40,9 +37,9 @@ static int info_show(struct seq_file *s, void *private)
if (!window->hvwc_map)
goto unlock;
- seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
+ seq_printf(s, "Type: %s, %s\n", cop_to_str(window->vas_win.cop),
window->tx_win ? "Send" : "Receive");
- seq_printf(s, "Pid : %d\n", window->pid);
+ seq_printf(s, "Pid : %d\n", vas_window_pid(&window->vas_win));
unlock:
mutex_unlock(&vas_mutex);
@@ -51,7 +48,7 @@ unlock:
DEFINE_SHOW_ATTRIBUTE(info);
-static inline void print_reg(struct seq_file *s, struct vas_window *win,
+static inline void print_reg(struct seq_file *s, struct pnv_vas_window *win,
char *name, u32 reg)
{
seq_printf(s, "0x%016llx %s\n", read_hvwc_reg(win, name, reg), name);
@@ -59,7 +56,7 @@ static inline void print_reg(struct seq_file *s, struct vas_window *win,
static int hvwc_show(struct seq_file *s, void *private)
{
- struct vas_window *window = s->private;
+ struct pnv_vas_window *window = s->private;
mutex_lock(&vas_mutex);
@@ -107,8 +104,10 @@ unlock:
DEFINE_SHOW_ATTRIBUTE(hvwc);
-void vas_window_free_dbgdir(struct vas_window *window)
+void vas_window_free_dbgdir(struct pnv_vas_window *pnv_win)
{
+ struct vas_window *window = &pnv_win->vas_win;
+
if (window->dbgdir) {
debugfs_remove_recursive(window->dbgdir);
kfree(window->dbgname);
@@ -117,42 +116,24 @@ void vas_window_free_dbgdir(struct vas_window *window)
}
}
-void vas_window_init_dbgdir(struct vas_window *window)
+void vas_window_init_dbgdir(struct pnv_vas_window *window)
{
- struct dentry *f, *d;
+ struct dentry *d;
if (!window->vinst->dbgdir)
return;
- window->dbgname = kzalloc(16, GFP_KERNEL);
- if (!window->dbgname)
+ window->vas_win.dbgname = kzalloc(16, GFP_KERNEL);
+ if (!window->vas_win.dbgname)
return;
- snprintf(window->dbgname, 16, "w%d", window->winid);
-
- d = debugfs_create_dir(window->dbgname, window->vinst->dbgdir);
- if (IS_ERR(d))
- goto free_name;
-
- window->dbgdir = d;
-
- f = debugfs_create_file("info", 0444, d, window, &info_fops);
- if (IS_ERR(f))
- goto remove_dir;
+ snprintf(window->vas_win.dbgname, 16, "w%d", window->vas_win.winid);
- f = debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops);
- if (IS_ERR(f))
- goto remove_dir;
+ d = debugfs_create_dir(window->vas_win.dbgname, window->vinst->dbgdir);
+ window->vas_win.dbgdir = d;
- return;
-
-remove_dir:
- debugfs_remove_recursive(window->dbgdir);
- window->dbgdir = NULL;
-
-free_name:
- kfree(window->dbgname);
- window->dbgname = NULL;
+ debugfs_create_file("info", 0444, d, window, &info_fops);
+ debugfs_create_file("hvwc", 0444, d, window, &hvwc_fops);
}
void vas_instance_init_dbgdir(struct vas_instance *vinst)
@@ -160,8 +141,6 @@ void vas_instance_init_dbgdir(struct vas_instance *vinst)
struct dentry *d;
vas_init_dbgdir();
- if (!vas_debugfs)
- return;
vinst->dbgname = kzalloc(16, GFP_KERNEL);
if (!vinst->dbgname)
@@ -170,16 +149,7 @@ void vas_instance_init_dbgdir(struct vas_instance *vinst)
snprintf(vinst->dbgname, 16, "v%d", vinst->vas_id);
d = debugfs_create_dir(vinst->dbgname, vas_debugfs);
- if (IS_ERR(d))
- goto free_name;
-
vinst->dbgdir = d;
- return;
-
-free_name:
- kfree(vinst->dbgname);
- vinst->dbgname = NULL;
- vinst->dbgdir = NULL;
}
/*
@@ -195,6 +165,4 @@ void vas_init_dbgdir(void)
first_time = false;
vas_debugfs = debugfs_create_dir("vas", NULL);
- if (IS_ERR(vas_debugfs))
- vas_debugfs = NULL;
}
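The removed error paths above rely on the debugfs convention that the creation helpers never return NULL and accept ERR_PTR parents, so failures can simply be ignored. A minimal sketch of that idiom with hypothetical names:

/* Simplified debugfs setup: no IS_ERR()/NULL checks or unwind paths. */
#include <linux/debugfs.h>

static struct dentry *example_dir;

static void example_debugfs_init(void *priv, const struct file_operations *fops)
{
	example_dir = debugfs_create_dir("example", NULL);

	/* If the directory creation failed, example_dir is an ERR_PTR and
	 * the call below quietly becomes a no-op returning another ERR_PTR.
	 */
	debugfs_create_file("info", 0444, example_dir, priv, fops);
}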
diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c
new file mode 100644
index 000000000000..2b47d5a86328
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VAS Fault handling.
+ * Copyright 2019, IBM Corporation
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/sched/signal.h>
+#include <linux/mmu_context.h>
+#include <asm/icswx.h>
+
+#include "vas.h"
+
+/*
+ * The maximum FIFO size for a fault window is 8MB
+ * (VAS_RX_FIFO_SIZE_MAX). Use a 4MB FIFO since each VAS
+ * instance has its own fault window.
+ * An 8MB FIFO can be used if more faults are expected per
+ * VAS instance.
+ */
+#define VAS_FAULT_WIN_FIFO_SIZE (4 << 20)
+
+static void dump_fifo(struct vas_instance *vinst, void *entry)
+{
+ unsigned long *end = vinst->fault_fifo + vinst->fault_fifo_size;
+ unsigned long *fifo = entry;
+ int i;
+
+ pr_err("Fault fifo size %d, Max crbs %d\n", vinst->fault_fifo_size,
+ vinst->fault_fifo_size / CRB_SIZE);
+
+ /* Dump 10 CRB entries or until end of FIFO */
+ pr_err("Fault FIFO Dump:\n");
+ for (i = 0; i < 10*(CRB_SIZE/8) && fifo < end; i += 4, fifo += 4) {
+ pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n",
+ i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3));
+ }
+}
+
+/*
+ * Process valid CRBs in fault FIFO.
+ * NX processes user space requests, returns the credit and updates the
+ * status in the CRB. If it encounters a translation error when accessing
+ * the CRB or request buffers, it raises an interrupt on the CPU to handle
+ * the fault. It takes a credit on the fault window, updates nx_fault_stamp
+ * in the CRB with the following information and pastes the CRB in the fault FIFO.
+ *
+ * pswid - window ID of the window on which the request is sent.
+ * fault_storage_addr - fault address
+ *
+ * NX can raise a single interrupt for multiple faults and expects the OS
+ * to process all valid faults and return a credit for each fault on both
+ * the user space and fault windows. The fault FIFO is flow-controlled with
+ * this credit mechanism: NX can continue pasting CRBs as long as credits
+ * are available on the fault window; otherwise it returns RMA_Reject.
+ *
+ * Total credits available on the fault window: FIFO_SIZE(4MB)/CRB_SIZE(128)
+ *
+ */
+irqreturn_t vas_fault_thread_fn(int irq, void *data)
+{
+ struct vas_instance *vinst = data;
+ struct coprocessor_request_block *crb, *entry;
+ struct coprocessor_request_block buf;
+ struct pnv_vas_window *window;
+ unsigned long flags;
+ void *fifo;
+
+ crb = &buf;
+
+ /*
+	 * VAS can interrupt with multiple page faults. So process all
+	 * valid CRBs within the fault FIFO until an invalid CRB is reached.
+	 * We use CCW[0] and pswid to validate CRBs:
+	 *
+	 * CCW[0]	Reserved bit. When NX pastes a CRB, CCW[0]=0.
+	 *		The OS sets this bit to 1 after reading the CRB.
+	 * pswid	NX assigns the window ID. Set pswid to -1 after
+	 *		reading the CRB from the fault FIFO.
+	 *
+	 * We exit this function when no valid CRBs are left to process,
+	 * so acquire fault_lock and reset fifo_in_progress to 0 before
+	 * exiting.
+	 * If the kernel receives another interrupt for a different page
+	 * fault, the interrupt handler returns IRQ_HANDLED if
+	 * fifo_in_progress is set, meaning the new faults will be
+	 * handled by the current thread. Otherwise it sets fifo_in_progress
+	 * and returns IRQ_WAKE_THREAD to wake up the thread.
+ */
+ while (true) {
+ spin_lock_irqsave(&vinst->fault_lock, flags);
+ /*
+ * Advance the fault fifo pointer to next CRB.
+ * Use CRB_SIZE rather than sizeof(*crb) since the latter is
+ * aligned to CRB_ALIGN (256) but the CRB written to by VAS is
+ * only CRB_SIZE in len.
+ */
+ fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE);
+ entry = fifo;
+
+ if ((entry->stamp.nx.pswid == cpu_to_be32(FIFO_INVALID_ENTRY))
+ || (entry->ccw & cpu_to_be32(CCW0_INVALID))) {
+ vinst->fifo_in_progress = 0;
+ spin_unlock_irqrestore(&vinst->fault_lock, flags);
+ return IRQ_HANDLED;
+ }
+
+ spin_unlock_irqrestore(&vinst->fault_lock, flags);
+ vinst->fault_crbs++;
+ if (vinst->fault_crbs == (vinst->fault_fifo_size / CRB_SIZE))
+ vinst->fault_crbs = 0;
+
+ memcpy(crb, fifo, CRB_SIZE);
+ entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY);
+ entry->ccw |= cpu_to_be32(CCW0_INVALID);
+ /*
+ * Return credit for the fault window.
+ */
+ vas_return_credit(vinst->fault_win, false);
+
+ pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d\n",
+ vinst->vas_id, vinst->fault_fifo, fifo,
+ vinst->fault_crbs);
+
+ vas_dump_crb(crb);
+ window = vas_pswid_to_window(vinst,
+ be32_to_cpu(crb->stamp.nx.pswid));
+
+ if (IS_ERR(window)) {
+ /*
+ * We got an interrupt about a specific send
+ * window but we can't find that window and we can't
+ * even clean it up (return credit on user space
+ * window).
+ * But we should not get here.
+ * TODO: Disable IRQ.
+ */
+ dump_fifo(vinst, (void *)entry);
+ pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d bad CRB?\n",
+ vinst->vas_id, vinst->fault_fifo, fifo,
+ be32_to_cpu(crb->stamp.nx.pswid),
+ vinst->fault_crbs);
+
+ WARN_ON_ONCE(1);
+ } else {
+ /*
+ * NX sees faults only with user space windows.
+ */
+ if (window->user_win)
+ vas_update_csb(crb, &window->vas_win.task_ref);
+ else
+ WARN_ON_ONCE(!window->user_win);
+
+ /*
+ * Return credit for send window after processing
+ * fault CRB.
+ */
+ vas_return_credit(window, true);
+ }
+ }
+}
+
+irqreturn_t vas_fault_handler(int irq, void *dev_id)
+{
+ struct vas_instance *vinst = dev_id;
+ irqreturn_t ret = IRQ_WAKE_THREAD;
+ unsigned long flags;
+
+ /*
+	 * NX can generate an interrupt for multiple faults, so the
+	 * fault handler thread processes all CRBs until it finds an
+	 * invalid entry. If NX sees continuous faults, it is possible
+	 * that the thread function entered on the first interrupt
+	 * processes all of the valid CRBs.
+	 * So wake up the thread only if the fault thread is not already running.
+ */
+ spin_lock_irqsave(&vinst->fault_lock, flags);
+
+ if (vinst->fifo_in_progress)
+ ret = IRQ_HANDLED;
+ else
+ vinst->fifo_in_progress = 1;
+
+ spin_unlock_irqrestore(&vinst->fault_lock, flags);
+
+ return ret;
+}
+
+/*
+ * A fault window is opened per VAS instance. NX pastes a fault CRB in
+ * the fault FIFO upon page faults.
+ */
+int vas_setup_fault_window(struct vas_instance *vinst)
+{
+ struct vas_rx_win_attr attr;
+ struct vas_window *win;
+
+ vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE;
+ vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL);
+ if (!vinst->fault_fifo) {
+ pr_err("Unable to alloc %d bytes for fault_fifo\n",
+ vinst->fault_fifo_size);
+ return -ENOMEM;
+ }
+
+ /*
+	 * Invalidate all CRB entries. NX pastes a valid entry for each fault.
+ */
+ memset(vinst->fault_fifo, FIFO_INVALID_ENTRY, vinst->fault_fifo_size);
+ vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT);
+
+ attr.rx_fifo_size = vinst->fault_fifo_size;
+ attr.rx_fifo = __pa(vinst->fault_fifo);
+
+ /*
+	 * Max creds is based on the number of CRBs that fit in the FIFO
+	 * (fault_fifo_size/CRB_SIZE). If an 8MB FIFO is used, max creds
+	 * will be 0xffff since the receive creds field is 16 bits wide.
+ */
+ attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE;
+ attr.lnotify_lpid = 0;
+ attr.lnotify_pid = mfspr(SPRN_PID);
+ attr.lnotify_tid = mfspr(SPRN_PID);
+
+ win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT, &attr);
+ if (IS_ERR(win)) {
+ pr_err("VAS: Error %ld opening FaultWin\n", PTR_ERR(win));
+ kfree(vinst->fault_fifo);
+ return PTR_ERR(win);
+ }
+
+ vinst->fault_win = container_of(win, struct pnv_vas_window, vas_win);
+
+ pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n",
+ vinst->fault_win->vas_win.winid, attr.lnotify_lpid,
+ attr.lnotify_pid, attr.lnotify_tid);
+
+ return 0;
+}
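As a worked example of the credit arithmetic described in the comments above, a standalone sketch (illustrative only, reusing the 4MB FIFO and 128-byte CRB figures from this file) of the fault-window credit count and the FIFO index wrap-around:

#include <stdio.h>

#define CRB_SIZE			128
#define VAS_FAULT_WIN_FIFO_SIZE		(4 << 20)	/* 4MB */

int main(void)
{
	int max_creds = VAS_FAULT_WIN_FIFO_SIZE / CRB_SIZE;
	int fault_crbs = max_creds - 1;	/* last CRB slot in the FIFO */

	/* One credit per CRB slot: 4MB / 128 = 32768 credits */
	printf("wcreds_max = %d\n", max_creds);

	/* The read index wraps back to the start of the FIFO, as in
	 * vas_fault_thread_fn() above.
	 */
	fault_crbs = (fault_crbs + 1) % max_creds;
	printf("next slot after wrap = %d\n", fault_crbs);

	return 0;
}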
diff --git a/arch/powerpc/platforms/powernv/vas-trace.h b/arch/powerpc/platforms/powernv/vas-trace.h
index a449b9f0c12e..ca2e08f2ddc0 100644
--- a/arch/powerpc/platforms/powernv/vas-trace.h
+++ b/arch/powerpc/platforms/powernv/vas-trace.h
@@ -80,7 +80,7 @@ TRACE_EVENT( vas_tx_win_open,
TRACE_EVENT( vas_paste_crb,
TP_PROTO(struct task_struct *tsk,
- struct vas_window *win),
+ struct pnv_vas_window *win),
TP_ARGS(tsk, win),
@@ -96,7 +96,7 @@ TRACE_EVENT( vas_paste_crb,
TP_fast_assign(
__entry->pid = tsk->pid;
__entry->vasid = win->vinst->vas_id;
- __entry->winid = win->winid;
+ __entry->winid = win->vas_win.winid;
__entry->paste_kaddr = (unsigned long)win->paste_kaddr
),
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index e59e0e60e5b5..5147df3a18ac 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2016-17 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "vas: " fmt
@@ -16,8 +12,11 @@
#include <linux/log2.h>
#include <linux/rcupdate.h>
#include <linux/cred.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
#include <asm/switch_to.h>
#include <asm/ppc-opcode.h>
+#include <asm/vas.h>
#include "vas.h"
#include "copy-paste.h"
@@ -28,14 +27,14 @@
* Compute the paste address region for the window @window using the
* ->paste_base_addr and ->paste_win_id_shift we got from device tree.
*/
-static void compute_paste_address(struct vas_window *window, u64 *addr, int *len)
+void vas_win_paste_addr(struct pnv_vas_window *window, u64 *addr, int *len)
{
int winid;
u64 base, shift;
base = window->vinst->paste_base_addr;
shift = window->vinst->paste_win_id_shift;
- winid = window->winid;
+ winid = window->vas_win.winid;
*addr = base + (winid << shift);
if (len)
@@ -44,33 +43,23 @@ static void compute_paste_address(struct vas_window *window, u64 *addr, int *len
pr_debug("Txwin #%d: Paste addr 0x%llx\n", winid, *addr);
}
-u64 vas_win_paste_addr(struct vas_window *win)
-{
- u64 addr;
-
- compute_paste_address(win, &addr, NULL);
-
- return addr;
-}
-EXPORT_SYMBOL(vas_win_paste_addr);
-
-static inline void get_hvwc_mmio_bar(struct vas_window *window,
+static inline void get_hvwc_mmio_bar(struct pnv_vas_window *window,
u64 *start, int *len)
{
u64 pbaddr;
pbaddr = window->vinst->hvwc_bar_start;
- *start = pbaddr + window->winid * VAS_HVWC_SIZE;
+ *start = pbaddr + window->vas_win.winid * VAS_HVWC_SIZE;
*len = VAS_HVWC_SIZE;
}
-static inline void get_uwc_mmio_bar(struct vas_window *window,
+static inline void get_uwc_mmio_bar(struct pnv_vas_window *window,
u64 *start, int *len)
{
u64 pbaddr;
pbaddr = window->vinst->uwc_bar_start;
- *start = pbaddr + window->winid * VAS_UWC_SIZE;
+ *start = pbaddr + window->vas_win.winid * VAS_UWC_SIZE;
*len = VAS_UWC_SIZE;
}
@@ -79,7 +68,7 @@ static inline void get_uwc_mmio_bar(struct vas_window *window,
* space. Unlike MMIO regions (map_mmio_region() below), paste region must
* be mapped cache-able and is only applicable to send windows.
*/
-static void *map_paste_region(struct vas_window *txwin)
+static void *map_paste_region(struct pnv_vas_window *txwin)
{
int len;
void *map;
@@ -87,12 +76,12 @@ static void *map_paste_region(struct vas_window *txwin)
u64 start;
name = kasprintf(GFP_KERNEL, "window-v%d-w%d", txwin->vinst->vas_id,
- txwin->winid);
+ txwin->vas_win.winid);
if (!name)
goto free_name;
txwin->paste_addr_name = name;
- compute_paste_address(txwin, &start, &len);
+ vas_win_paste_addr(txwin, &start, &len);
if (!request_mem_region(start, len, name)) {
pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
@@ -144,13 +133,13 @@ static void unmap_region(void *addr, u64 start, int len)
/*
* Unmap the paste address region for a window.
*/
-static void unmap_paste_region(struct vas_window *window)
+static void unmap_paste_region(struct pnv_vas_window *window)
{
int len;
u64 busaddr_start;
if (window->paste_kaddr) {
- compute_paste_address(window, &busaddr_start, &len);
+ vas_win_paste_addr(window, &busaddr_start, &len);
unmap_region(window->paste_kaddr, busaddr_start, len);
window->paste_kaddr = NULL;
kfree(window->paste_addr_name);
@@ -165,7 +154,7 @@ static void unmap_paste_region(struct vas_window *window)
* path, just minimize the time we hold the mutex for now. We can add
* a per-instance mutex later if necessary.
*/
-static void unmap_winctx_mmio_bars(struct vas_window *window)
+static void unmap_winctx_mmio_bars(struct pnv_vas_window *window)
{
int len;
void *uwc_map;
@@ -198,7 +187,7 @@ static void unmap_winctx_mmio_bars(struct vas_window *window)
* OS/User Window Context (UWC) MMIO Base Address Region for the given window.
* Map these bus addresses and save the mapped kernel addresses in @window.
*/
-int map_winctx_mmio_bars(struct vas_window *window)
+static int map_winctx_mmio_bars(struct pnv_vas_window *window)
{
int len;
u64 start;
@@ -226,7 +215,7 @@ int map_winctx_mmio_bars(struct vas_window *window)
* registers are not sequential. And, we can only write to offsets
* with valid registers.
*/
-void reset_window_regs(struct vas_window *window)
+static void reset_window_regs(struct pnv_vas_window *window)
{
write_hvwc_reg(window, VREG(LPID), 0ULL);
write_hvwc_reg(window, VREG(PID), 0ULL);
@@ -282,7 +271,7 @@ void reset_window_regs(struct vas_window *window)
* want to add fields to vas_winctx and move the initialization to
* init_vas_winctx_regs().
*/
-static void init_xlate_regs(struct vas_window *window, bool user_win)
+static void init_xlate_regs(struct pnv_vas_window *window, bool user_win)
{
u64 lpcr, val;
@@ -347,7 +336,7 @@ static void init_xlate_regs(struct vas_window *window, bool user_win)
*
* TODO: Reserved (aka dedicated) send buffers are not supported yet.
*/
-static void init_rsvd_tx_buf_count(struct vas_window *txwin,
+static void init_rsvd_tx_buf_count(struct pnv_vas_window *txwin,
struct vas_winctx *winctx)
{
write_hvwc_reg(txwin, VREG(TX_RSVD_BUF_COUNT), 0ULL);
@@ -369,7 +358,8 @@ static void init_rsvd_tx_buf_count(struct vas_window *txwin,
* as a one-time task? That could work for NX but what about other
* receivers? Let the receivers tell us the rx-fifo buffers for now.
*/
-int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
+static void init_winctx_regs(struct pnv_vas_window *window,
+ struct vas_winctx *winctx)
{
u64 val;
int fifo_size;
@@ -387,7 +377,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
init_xlate_regs(window, winctx->user_win);
val = 0ULL;
- val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0);
+ val = SET_FIELD(VAS_FAULT_TX_WIN, val, winctx->fault_win_id);
write_hvwc_reg(window, VREG(FAULT_TX_WIN), val);
/* In PowerNV, interrupts go to HV. */
@@ -414,7 +404,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
*
* See also: Design note in function header.
*/
- val = __pa(winctx->rx_fifo);
+ val = winctx->rx_fifo;
val = SET_FIELD(VAS_PAGE_MIGRATION_SELECT, val, 0);
write_hvwc_reg(window, VREG(LFIFO_BAR), val);
@@ -511,8 +501,6 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
val = SET_FIELD(VAS_WINCTL_NX_WIN, val, winctx->nx_win);
val = SET_FIELD(VAS_WINCTL_OPEN, val, 1);
write_hvwc_reg(window, VREG(WINCTL), val);
-
- return 0;
}
static void vas_release_window_id(struct ida *ida, int winid)
@@ -532,10 +520,10 @@ static int vas_assign_window_id(struct ida *ida)
return winid;
}
-static void vas_window_free(struct vas_window *window)
+static void vas_window_free(struct pnv_vas_window *window)
{
- int winid = window->winid;
struct vas_instance *vinst = window->vinst;
+ int winid = window->vas_win.winid;
unmap_winctx_mmio_bars(window);
@@ -546,10 +534,10 @@ static void vas_window_free(struct vas_window *window)
vas_release_window_id(&vinst->ida, winid);
}
-static struct vas_window *vas_window_alloc(struct vas_instance *vinst)
+static struct pnv_vas_window *vas_window_alloc(struct vas_instance *vinst)
{
int winid;
- struct vas_window *window;
+ struct pnv_vas_window *window;
winid = vas_assign_window_id(&vinst->ida);
if (winid < 0)
@@ -560,7 +548,7 @@ static struct vas_window *vas_window_alloc(struct vas_instance *vinst)
goto out_free;
window->vinst = vinst;
- window->winid = winid;
+ window->vas_win.winid = winid;
if (map_winctx_mmio_bars(window))
goto out_free;
@@ -575,7 +563,7 @@ out_free:
return ERR_PTR(-ENOMEM);
}
-static void put_rx_win(struct vas_window *rxwin)
+static void put_rx_win(struct pnv_vas_window *rxwin)
{
/* Better not be a send window! */
WARN_ON_ONCE(rxwin->tx_win);
@@ -591,10 +579,11 @@ static void put_rx_win(struct vas_window *rxwin)
*
* NOTE: We access ->windows[] table and assume that vinst->mutex is held.
*/
-static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
+static struct pnv_vas_window *get_user_rxwin(struct vas_instance *vinst,
+ u32 pswid)
{
int vasid, winid;
- struct vas_window *rxwin;
+ struct pnv_vas_window *rxwin;
decode_pswid(pswid, &vasid, &winid);
@@ -603,7 +592,7 @@ static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
rxwin = vinst->windows[winid];
- if (!rxwin || rxwin->tx_win || rxwin->cop != VAS_COP_TYPE_FTW)
+ if (!rxwin || rxwin->tx_win || rxwin->vas_win.cop != VAS_COP_TYPE_FTW)
return ERR_PTR(-EINVAL);
return rxwin;
@@ -615,10 +604,10 @@ static struct vas_window *get_user_rxwin(struct vas_instance *vinst, u32 pswid)
*
* See also function header of set_vinst_win().
*/
-static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
+static struct pnv_vas_window *get_vinst_rxwin(struct vas_instance *vinst,
enum vas_cop_type cop, u32 pswid)
{
- struct vas_window *rxwin;
+ struct pnv_vas_window *rxwin;
mutex_lock(&vinst->mutex);
@@ -651,9 +640,9 @@ static struct vas_window *get_vinst_rxwin(struct vas_instance *vinst,
* window, we also save the window in the ->rxwin[] table.
*/
static void set_vinst_win(struct vas_instance *vinst,
- struct vas_window *window)
+ struct pnv_vas_window *window)
{
- int id = window->winid;
+ int id = window->vas_win.winid;
mutex_lock(&vinst->mutex);
@@ -662,8 +651,8 @@ static void set_vinst_win(struct vas_instance *vinst,
* unless its a user (FTW) window.
*/
if (!window->user_win && !window->tx_win) {
- WARN_ON_ONCE(vinst->rxwin[window->cop]);
- vinst->rxwin[window->cop] = window;
+ WARN_ON_ONCE(vinst->rxwin[window->vas_win.cop]);
+ vinst->rxwin[window->vas_win.cop] = window;
}
WARN_ON_ONCE(vinst->windows[id] != NULL);
@@ -676,16 +665,16 @@ static void set_vinst_win(struct vas_instance *vinst,
* Clear this window from the table(s) of windows for this VAS instance.
* See also function header of set_vinst_win().
*/
-static void clear_vinst_win(struct vas_window *window)
+static void clear_vinst_win(struct pnv_vas_window *window)
{
- int id = window->winid;
+ int id = window->vas_win.winid;
struct vas_instance *vinst = window->vinst;
mutex_lock(&vinst->mutex);
if (!window->user_win && !window->tx_win) {
- WARN_ON_ONCE(!vinst->rxwin[window->cop]);
- vinst->rxwin[window->cop] = NULL;
+ WARN_ON_ONCE(!vinst->rxwin[window->vas_win.cop]);
+ vinst->rxwin[window->vas_win.cop] = NULL;
}
WARN_ON_ONCE(vinst->windows[id] != window);
@@ -694,7 +683,7 @@ static void clear_vinst_win(struct vas_window *window)
mutex_unlock(&vinst->mutex);
}
-static void init_winctx_for_rxwin(struct vas_window *rxwin,
+static void init_winctx_for_rxwin(struct pnv_vas_window *rxwin,
struct vas_rx_win_attr *rxattr,
struct vas_winctx *winctx)
{
@@ -715,7 +704,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
winctx->rx_fifo = rxattr->rx_fifo;
winctx->rx_fifo_size = rxattr->rx_fifo_size;
- winctx->wcreds_max = rxwin->wcreds_max;
+ winctx->wcreds_max = rxwin->vas_win.wcreds_max;
winctx->pin_win = rxattr->pin_win;
winctx->nx_win = rxattr->nx_win;
@@ -750,7 +739,7 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
*/
winctx->fifo_disable = true;
winctx->intr_disable = true;
- winctx->rx_fifo = NULL;
+ winctx->rx_fifo = 0;
}
winctx->lnotify_lpid = rxattr->lnotify_lpid;
@@ -762,6 +751,8 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
winctx->min_scope = VAS_SCOPE_LOCAL;
winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+ if (rxwin->vinst->virq)
+ winctx->irq_port = rxwin->vinst->irq_port;
}
static bool rx_win_args_valid(enum vas_cop_type cop,
@@ -782,7 +773,7 @@ static bool rx_win_args_valid(enum vas_cop_type cop,
if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
return false;
- if (attr->wcreds_max > VAS_RX_WCREDS_MAX)
+ if (!attr->wcreds_max)
return false;
if (attr->nx_win) {
@@ -827,7 +818,8 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop)
{
memset(rxattr, 0, sizeof(*rxattr));
- if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+ if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+ cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
rxattr->pin_win = true;
rxattr->nx_win = true;
rxattr->fault_win = false;
@@ -841,9 +833,9 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop)
rxattr->fault_win = true;
rxattr->notify_disable = true;
rxattr->rx_wcred_mode = true;
- rxattr->tx_wcred_mode = true;
rxattr->rx_win_ord_mode = true;
- rxattr->tx_win_ord_mode = true;
+ rxattr->rej_no_credit = true;
+ rxattr->tc_mode = VAS_THRESH_DISABLED;
} else if (cop == VAS_COP_TYPE_FTW) {
rxattr->user_win = true;
rxattr->intr_disable = true;
@@ -861,7 +853,7 @@ EXPORT_SYMBOL_GPL(vas_init_rx_win_attr);
struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
struct vas_rx_win_attr *rxattr)
{
- struct vas_window *rxwin;
+ struct pnv_vas_window *rxwin;
struct vas_winctx winctx;
struct vas_instance *vinst;
@@ -880,23 +872,21 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
rxwin = vas_window_alloc(vinst);
if (IS_ERR(rxwin)) {
pr_devel("Unable to allocate memory for Rx window\n");
- return rxwin;
+ return (struct vas_window *)rxwin;
}
rxwin->tx_win = false;
rxwin->nx_win = rxattr->nx_win;
rxwin->user_win = rxattr->user_win;
- rxwin->cop = cop;
- rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
- if (rxattr->user_win)
- rxwin->pid = task_pid_vnr(current);
+ rxwin->vas_win.cop = cop;
+ rxwin->vas_win.wcreds_max = rxattr->wcreds_max;
init_winctx_for_rxwin(rxwin, rxattr, &winctx);
init_winctx_regs(rxwin, &winctx);
set_vinst_win(vinst, rxwin);
- return rxwin;
+ return &rxwin->vas_win;
}
EXPORT_SYMBOL_GPL(vas_rx_win_open);
@@ -904,7 +894,8 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop)
{
memset(txattr, 0, sizeof(*txattr));
- if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+ if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+ cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
txattr->rej_no_credit = false;
txattr->rx_wcred_mode = true;
txattr->tx_wcred_mode = true;
@@ -916,7 +907,7 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop)
}
EXPORT_SYMBOL_GPL(vas_init_tx_win_attr);
-static void init_winctx_for_txwin(struct vas_window *txwin,
+static void init_winctx_for_txwin(struct pnv_vas_window *txwin,
struct vas_tx_win_attr *txattr,
struct vas_winctx *winctx)
{
@@ -937,7 +928,7 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
*/
memset(winctx, 0, sizeof(struct vas_winctx));
- winctx->wcreds_max = txwin->wcreds_max;
+ winctx->wcreds_max = txwin->vas_win.wcreds_max;
winctx->user_win = txattr->user_win;
winctx->nx_win = txwin->rxwin->nx_win;
@@ -957,14 +948,24 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
winctx->lpid = txattr->lpid;
winctx->pidr = txattr->pidr;
- winctx->rx_win_id = txwin->rxwin->winid;
+ winctx->rx_win_id = txwin->rxwin->vas_win.winid;
+ /*
+	 * IRQ and fault window setup was successful. Set the fault window
+	 * for the send window so that it is ready to handle faults.
+ */
+ if (txwin->vinst->virq)
+ winctx->fault_win_id = txwin->vinst->fault_win->vas_win.winid;
winctx->dma_type = VAS_DMA_TYPE_INJECT;
winctx->tc_mode = txattr->tc_mode;
winctx->min_scope = VAS_SCOPE_LOCAL;
winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+ if (txwin->vinst->virq)
+ winctx->irq_port = txwin->vinst->irq_port;
- winctx->pswid = 0;
+ winctx->pswid = txattr->pswid ? txattr->pswid :
+ encode_pswid(txwin->vinst->vas_id,
+ txwin->vas_win.winid);
}
static bool tx_win_args_valid(enum vas_cop_type cop,
@@ -979,9 +980,14 @@ static bool tx_win_args_valid(enum vas_cop_type cop,
if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
return false;
- if (attr->user_win &&
- (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
- return false;
+ if (attr->user_win) {
+ if (attr->rsvd_txbuf_count)
+ return false;
+
+ if (cop != VAS_COP_TYPE_FTW && cop != VAS_COP_TYPE_GZIP &&
+ cop != VAS_COP_TYPE_GZIP_HIPRI)
+ return false;
+ }
return true;
}
@@ -990,8 +996,8 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
struct vas_tx_win_attr *attr)
{
int rc;
- struct vas_window *txwin;
- struct vas_window *rxwin;
+ struct pnv_vas_window *txwin;
+ struct pnv_vas_window *rxwin;
struct vas_winctx winctx;
struct vas_instance *vinst;
@@ -1017,7 +1023,7 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
rxwin = get_vinst_rxwin(vinst, cop, attr->pswid);
if (IS_ERR(rxwin)) {
pr_devel("No RxWin for vasid %d, cop %d\n", vasid, cop);
- return rxwin;
+ return (struct vas_window *)rxwin;
}
txwin = vas_window_alloc(vinst);
@@ -1026,13 +1032,12 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
goto put_rxwin;
}
- txwin->cop = cop;
+ txwin->vas_win.cop = cop;
txwin->tx_win = 1;
txwin->rxwin = rxwin;
txwin->nx_win = txwin->rxwin->nx_win;
- txwin->pid = attr->pid;
txwin->user_win = attr->user_win;
- txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
+ txwin->vas_win.wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
init_winctx_for_txwin(txwin, attr, &winctx);
@@ -1054,17 +1059,24 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
}
} else {
/*
- * A user mapping must ensure that context switch issues
- * CP_ABORT for this thread.
+		 * Interrupt handler or fault window setup failed, which
+		 * means faults raised by NX cannot be handled. So do not
+		 * open a tx window for user space.
*/
- rc = set_thread_uses_vas();
+ if (!vinst->virq) {
+ rc = -ENODEV;
+ goto free_window;
+ }
+ rc = get_vas_user_win_ref(&txwin->vas_win.task_ref);
if (rc)
goto free_window;
+
+ vas_user_win_add_mm_context(&txwin->vas_win.task_ref);
}
set_vinst_win(vinst, txwin);
- return txwin;
+ return &txwin->vas_win;
free_window:
vas_window_free(txwin);
@@ -1083,12 +1095,14 @@ int vas_copy_crb(void *crb, int offset)
EXPORT_SYMBOL_GPL(vas_copy_crb);
#define RMA_LSMP_REPORT_ENABLE PPC_BIT(53)
-int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
+int vas_paste_crb(struct vas_window *vwin, int offset, bool re)
{
+ struct pnv_vas_window *txwin;
int rc;
void *addr;
uint64_t val;
+ txwin = container_of(vwin, struct pnv_vas_window, vas_win);
trace_vas_paste_crb(current, txwin);
/*
@@ -1118,7 +1132,7 @@ int vas_paste_crb(struct vas_window *txwin, int offset, bool re)
else
rc = -EINVAL;
- pr_debug("Txwin #%d: Msg count %llu\n", txwin->winid,
+ pr_debug("Txwin #%d: Msg count %llu\n", txwin->vas_win.winid,
read_hvwc_reg(txwin, VREG(LRFIFO_PUSH)));
return rc;
@@ -1138,10 +1152,11 @@ EXPORT_SYMBOL_GPL(vas_paste_crb);
* user space. (NX-842 driver waits for CSB and Fast thread-wakeup
* doesn't use credit checking).
*/
-static void poll_window_credits(struct vas_window *window)
+static void poll_window_credits(struct pnv_vas_window *window)
{
u64 val;
int creds, mode;
+ int count = 0;
val = read_hvwc_reg(window, VREG(WINCTL));
if (window->tx_win)
@@ -1160,10 +1175,28 @@ retry:
creds = GET_FIELD(VAS_LRX_WCRED, val);
}
- if (creds < window->wcreds_max) {
+ /*
+	 * It takes a few milliseconds to complete all pending requests
+	 * and return the credits.
+	 * TODO: Scan the fault FIFO, invalidate CRBs that point to this
+	 * window and issue CRB Kill to stop all pending requests. Needed
+	 * only if there is a bug in NX or in kernel fault handling.
+ */
+ if (creds < window->vas_win.wcreds_max) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(msecs_to_jiffies(10));
+ count++;
+ /*
+		 * A process cannot close its send window until all credits
+		 * are returned.
+ */
+ if (!(count % 1000))
+ pr_warn_ratelimited("VAS: pid %d stuck. Waiting for credits returned for Window(%d). creds %d, Retries %d\n",
+ vas_window_pid(&window->vas_win),
+ window->vas_win.winid,
+ creds, count);
+
goto retry;
}
}
@@ -1173,10 +1206,11 @@ retry:
* short time to queue a CRB, so window should not be busy for too long.
* Trying 5ms intervals.
*/
-static void poll_window_busy_state(struct vas_window *window)
+static void poll_window_busy_state(struct pnv_vas_window *window)
{
int busy;
u64 val;
+ int count = 0;
retry:
val = read_hvwc_reg(window, VREG(WIN_STATUS));
@@ -1184,7 +1218,17 @@ retry:
if (busy) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(5));
+ schedule_timeout(msecs_to_jiffies(10));
+ count++;
+ /*
+		 * It takes a few milliseconds to process all pending
+		 * requests.
+ */
+ if (!(count % 1000))
+ pr_warn_ratelimited("VAS: pid %d stuck. Window (ID=%d) is in busy state. Retries %d\n",
+ vas_window_pid(&window->vas_win),
+ window->vas_win.winid, count);
+
goto retry;
}
}
@@ -1205,7 +1249,7 @@ retry:
* casting out becomes necessary we should consider offloading the
* job to a worker thread, so the window close can proceed quickly.
*/
-static void poll_window_castout(struct vas_window *window)
+static void poll_window_castout(struct pnv_vas_window *window)
{
/* stub for now */
}
@@ -1214,7 +1258,7 @@ static void poll_window_castout(struct vas_window *window)
* Unpin and close a window so no new requests are accepted and the
* hardware can evict this window from cache if necessary.
*/
-static void unpin_close_window(struct vas_window *window)
+static void unpin_close_window(struct pnv_vas_window *window)
{
u64 val;
@@ -1236,11 +1280,15 @@ static void unpin_close_window(struct vas_window *window)
*
* Besides the hardware, kernel has some bookkeeping of course.
*/
-int vas_win_close(struct vas_window *window)
+int vas_win_close(struct vas_window *vwin)
{
- if (!window)
+ struct pnv_vas_window *window;
+
+ if (!vwin)
return 0;
+ window = container_of(vwin, struct pnv_vas_window, vas_win);
+
if (!window->tx_win && atomic_read(&window->num_txwins) != 0) {
pr_devel("Attempting to close an active Rx window!\n");
WARN_ON_ONCE(1);
@@ -1249,19 +1297,24 @@ int vas_win_close(struct vas_window *window)
unmap_paste_region(window);
- clear_vinst_win(window);
-
poll_window_busy_state(window);
unpin_close_window(window);
poll_window_credits(window);
+ clear_vinst_win(window);
+
poll_window_castout(window);
/* if send window, drop reference to matching receive window */
- if (window->tx_win)
+ if (window->tx_win) {
+ if (window->user_win) {
+ mm_context_remove_vas_window(vwin->task_ref.mm);
+ put_vas_user_win_ref(&vwin->task_ref);
+ }
put_rx_win(window->rxwin);
+ }
vas_window_free(window);
@@ -1270,10 +1323,149 @@ int vas_win_close(struct vas_window *window)
EXPORT_SYMBOL_GPL(vas_win_close);
/*
- * Return a system-wide unique window id for the window @win.
+ * Return a credit for the given window.
+ * Send windows and the fault window use the credit mechanism as follows:
+ *
+ * Send windows:
+ * - The default number of credits available for each send window is
+ *   1024, meaning 1024 requests can be issued asynchronously at the
+ *   same time. If no credit is available, the request is returned
+ *   with RMA_Busy.
+ * - One credit is taken when an NX request is issued.
+ * - This credit is returned after NX has processed that request.
+ * - If NX encounters a translation error, the kernel returns the
+ *   credit on the specific send window after processing the fault CRB.
+ *
+ * Fault window:
+ * - The total number of credits available is FIFO_SIZE/CRB_SIZE,
+ *   i.e. 4MB/128 in the current implementation. If no credit is
+ *   available, RMA_Reject is returned.
+ * - A credit is taken when NX pastes a CRB in the fault FIFO.
+ * - The kernel returns the credit on the fault window after reading
+ *   an entry from the fault FIFO.
+ */
+void vas_return_credit(struct pnv_vas_window *window, bool tx)
+{
+ uint64_t val;
+
+ val = 0ULL;
+ if (tx) { /* send window */
+ val = SET_FIELD(VAS_TX_WCRED, val, 1);
+ write_hvwc_reg(window, VREG(TX_WCRED_ADDER), val);
+ } else {
+ val = SET_FIELD(VAS_LRX_WCRED, val, 1);
+ write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), val);
+ }
+}
+
+struct pnv_vas_window *vas_pswid_to_window(struct vas_instance *vinst,
+ uint32_t pswid)
+{
+ struct pnv_vas_window *window;
+ int winid;
+
+ if (!pswid) {
+ pr_devel("%s: called for pswid 0!\n", __func__);
+ return ERR_PTR(-ESRCH);
+ }
+
+ decode_pswid(pswid, NULL, &winid);
+
+ if (winid >= VAS_WINDOWS_PER_CHIP)
+ return ERR_PTR(-ESRCH);
+
+ /*
+	 * If the application closes the window before the hardware
+	 * returns the fault CRB, we wait in vas_win_close() for the
+	 * pending requests, so the window must be active and the
+	 * process alive.
+	 *
+	 * If it is a kernel process, we should not get any faults and
+	 * should not get here.
+ */
+ window = vinst->windows[winid];
+
+ if (!window) {
+ pr_err("PSWID decode: Could not find window for winid %d pswid %d vinst 0x%p\n",
+ winid, pswid, vinst);
+ return NULL;
+ }
+
+ /*
+ * Do some sanity checks on the decoded window. Window should be
+ * NX GZIP user send window. FTW windows should not incur faults
+ * since their CRBs are ignored (not queued on FIFO or processed
+ * by NX).
+ */
+ if (!window->tx_win || !window->user_win || !window->nx_win ||
+ window->vas_win.cop == VAS_COP_TYPE_FAULT ||
+ window->vas_win.cop == VAS_COP_TYPE_FTW) {
+ pr_err("PSWID decode: id %d, tx %d, user %d, nx %d, cop %d\n",
+ winid, window->tx_win, window->user_win,
+ window->nx_win, window->vas_win.cop);
+ WARN_ON(1);
+ }
+
+ return window;
+}
+
+static struct vas_window *vas_user_win_open(int vas_id, u64 flags,
+ enum vas_cop_type cop_type)
+{
+ struct vas_tx_win_attr txattr = {};
+
+ vas_init_tx_win_attr(&txattr, cop_type);
+
+ txattr.lpid = mfspr(SPRN_LPID);
+ txattr.pidr = mfspr(SPRN_PID);
+ txattr.user_win = true;
+ txattr.rsvd_txbuf_count = false;
+ txattr.pswid = false;
+
+ pr_devel("Pid %d: Opening txwin, PIDR %ld\n", txattr.pidr,
+ mfspr(SPRN_PID));
+
+ return vas_tx_win_open(vas_id, cop_type, &txattr);
+}
+
+static u64 vas_user_win_paste_addr(struct vas_window *txwin)
+{
+ struct pnv_vas_window *win;
+ u64 paste_addr;
+
+ win = container_of(txwin, struct pnv_vas_window, vas_win);
+ vas_win_paste_addr(win, &paste_addr, NULL);
+
+ return paste_addr;
+}
+
+static int vas_user_win_close(struct vas_window *txwin)
+{
+ vas_win_close(txwin);
+
+ return 0;
+}
+
+static const struct vas_user_win_ops vops = {
+ .open_win = vas_user_win_open,
+ .paste_addr = vas_user_win_paste_addr,
+ .close_win = vas_user_win_close,
+};
+
+/*
+ * Only the nx-gzip coprocessor type is supported for now, but this API
+ * can be extended to other coprocessor types later.
*/
-u32 vas_win_id(struct vas_window *win)
+int vas_register_api_powernv(struct module *mod, enum vas_cop_type cop_type,
+ const char *name)
+{
+
+ return vas_register_coproc_api(mod, cop_type, name, &vops);
+}
+EXPORT_SYMBOL_GPL(vas_register_api_powernv);
+
+void vas_unregister_api_powernv(void)
{
- return encode_pswid(win->vinst->vas_id, win->winid);
+ vas_unregister_coproc_api();
}
-EXPORT_SYMBOL_GPL(vas_win_id);
+EXPORT_SYMBOL_GPL(vas_unregister_api_powernv);
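As context for the conversions above, a standalone sketch (simplified, hypothetical struct layouts) of the embed-and-container_of pattern: exported APIs now pass around the generic struct vas_window, and platform code recovers its pnv_vas_window wrapper:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct vas_window {		/* generic, platform-independent state */
	int winid;
};

struct pnv_vas_window {		/* PowerNV-specific wrapper */
	struct vas_window vas_win;
	int tx_win;
};

static int api_takes_generic(struct vas_window *vwin)
{
	struct pnv_vas_window *win =
		container_of(vwin, struct pnv_vas_window, vas_win);

	return win->tx_win;	/* platform fields reachable again */
}

int main(void)
{
	struct pnv_vas_window w = { .vas_win = { .winid = 3 }, .tx_win = 1 };

	/* Exported APIs hand out &w.vas_win; internals convert back. */
	printf("winid %d tx %d\n", w.vas_win.winid,
	       api_takes_generic(&w.vas_win));
	return 0;
}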
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index 5a2b24cbbc88..9c9650319f3b 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2016-17 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "vas: " fmt
@@ -18,7 +14,10 @@
#include <linux/of_platform.h>
#include <linux/of_address.h>
#include <linux/of.h>
+#include <linux/irqdomain.h>
+#include <linux/interrupt.h>
#include <asm/prom.h>
+#include <asm/xive.h>
#include "vas.h"
@@ -27,12 +26,35 @@ static LIST_HEAD(vas_instances);
static DEFINE_PER_CPU(int, cpu_vas_id);
+static int vas_irq_fault_window_setup(struct vas_instance *vinst)
+{
+ int rc = 0;
+
+ rc = request_threaded_irq(vinst->virq, vas_fault_handler,
+ vas_fault_thread_fn, 0, vinst->name, vinst);
+
+ if (rc) {
+ pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n",
+ vinst->vas_id, vinst->virq, rc);
+ goto out;
+ }
+
+ rc = vas_setup_fault_window(vinst);
+ if (rc)
+ free_irq(vinst->virq, vinst);
+
+out:
+ return rc;
+}
+
static int init_vas_instance(struct platform_device *pdev)
{
- int rc, cpu, vasid;
- struct resource *res;
- struct vas_instance *vinst;
struct device_node *dn = pdev->dev.of_node;
+ struct vas_instance *vinst;
+ struct xive_irq_data *xd;
+ uint32_t chipid, hwirq;
+ struct resource *res;
+ int rc, cpu, vasid;
rc = of_property_read_u32(dn, "ibm,vas-id", &vasid);
if (rc) {
@@ -40,6 +62,12 @@ static int init_vas_instance(struct platform_device *pdev)
return -ENODEV;
}
+ rc = of_property_read_u32(dn, "ibm,chip-id", &chipid);
+ if (rc) {
+ pr_err("No ibm,chip-id property for %s?\n", pdev->name);
+ return -ENODEV;
+ }
+
if (pdev->num_resources != 4) {
pr_err("Unexpected DT configuration for [%s, %d]\n",
pdev->name, vasid);
@@ -50,6 +78,12 @@ static int init_vas_instance(struct platform_device *pdev)
if (!vinst)
return -ENOMEM;
+ vinst->name = kasprintf(GFP_KERNEL, "vas-%d", vasid);
+ if (!vinst->name) {
+ kfree(vinst);
+ return -ENOMEM;
+ }
+
INIT_LIST_HEAD(&vinst->node);
ida_init(&vinst->ida);
mutex_init(&vinst->mutex);
@@ -73,9 +107,32 @@ static int init_vas_instance(struct platform_device *pdev)
vinst->paste_win_id_shift = 63 - res->end;
- pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, "
- "paste_win_id_shift 0x%llx\n", pdev->name, vasid,
- vinst->paste_base_addr, vinst->paste_win_id_shift);
+ hwirq = xive_native_alloc_irq_on_chip(chipid);
+ if (!hwirq) {
+ pr_err("Inst%d: Unable to allocate global irq for chip %d\n",
+ vinst->vas_id, chipid);
+ return -ENOENT;
+ }
+
+ vinst->virq = irq_create_mapping(NULL, hwirq);
+ if (!vinst->virq) {
+ pr_err("Inst%d: Unable to map global irq %d\n",
+ vinst->vas_id, hwirq);
+ return -EINVAL;
+ }
+
+ xd = irq_get_chip_data(vinst->virq);
+ if (!xd) {
+ pr_err("Inst%d: Invalid virq %d\n",
+ vinst->vas_id, vinst->virq);
+ return -EINVAL;
+ }
+
+ vinst->irq_port = xd->trig_page;
+ pr_devel("Initialized instance [%s, %d] paste_base 0x%llx paste_win_id_shift 0x%llx IRQ %d Port 0x%llx\n",
+ pdev->name, vasid, vinst->paste_base_addr,
+ vinst->paste_win_id_shift, vinst->virq,
+ vinst->irq_port);
for_each_possible_cpu(cpu) {
if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
@@ -86,6 +143,22 @@ static int init_vas_instance(struct platform_device *pdev)
list_add(&vinst->node, &vas_instances);
mutex_unlock(&vas_mutex);
+ spin_lock_init(&vinst->fault_lock);
+ /*
+ * IRQ and fault handling setup is needed only for user space
+ * send windows.
+ */
+ if (vinst->virq) {
+ rc = vas_irq_fault_window_setup(vinst);
+ /*
+		 * The fault window is used only for user space send windows.
+		 * So if vinst->virq is 0, vas_tx_win_open() returns -ENODEV
+		 * for user space windows.
+ */
+ if (rc)
+ vinst->virq = 0;
+ }
+
vas_instance_init_dbgdir(vinst);
dev_set_drvdata(&pdev->dev, vinst);
@@ -93,6 +166,7 @@ static int init_vas_instance(struct platform_device *pdev)
return 0;
free_vinst:
+ kfree(vinst->name);
kfree(vinst);
return -ENODEV;
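As context for vas_irq_fault_window_setup() above, a minimal sketch (hypothetical device and handler names) of the threaded-IRQ split: the hard handler only decides whether the thread needs waking, and the thread function drains the work in process context:

#include <linux/interrupt.h>
#include <linux/spinlock.h>

struct example_dev {
	spinlock_t lock;
	int in_progress;
};

static irqreturn_t example_hard_handler(int irq, void *data)
{
	struct example_dev *dev = data;
	irqreturn_t ret = IRQ_WAKE_THREAD;
	unsigned long flags;

	spin_lock_irqsave(&dev->lock, flags);
	if (dev->in_progress)
		ret = IRQ_HANDLED;	/* thread is already draining */
	else
		dev->in_progress = 1;
	spin_unlock_irqrestore(&dev->lock, flags);

	return ret;
}

static irqreturn_t example_thread_fn(int irq, void *data)
{
	/* drain the FIFO here, then clear in_progress under the lock */
	return IRQ_HANDLED;
}

static int example_setup(struct example_dev *dev, int virq)
{
	spin_lock_init(&dev->lock);
	return request_threaded_irq(virq, example_hard_handler,
				    example_thread_fn, 0, "example", dev);
}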
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index f5493dbdd7ff..08d9d3d5a22b 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -1,10 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright 2016-17 IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef _VAS_H
@@ -105,11 +101,9 @@
/*
* Initial per-process credits.
* Max send window credits: 4K-1 (12-bits in VAS_TX_WCRED)
- * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED)
*
* TODO: Needs tuning for per-process credits
*/
-#define VAS_RX_WCREDS_MAX ((64 << 10) - 1)
#define VAS_TX_WCREDS_MAX ((4 << 10) - 1)
#define VAS_WCREDS_DEFAULT (1 << 10)
@@ -300,6 +294,22 @@ enum vas_notify_after_count {
};
/*
+ * NX can generate an interrupt for multiple faults and expects the kernel
+ * to process all of them. So read valid CRB entries until the invalid one
+ * is found, using the pswid pasted by NX and ccw[0] (a reserved bit in BE)
+ * to check whether a CRB is valid. CCW[0] will not be touched by user
+ * space; the application gets a CRB format error if it sets this bit.
+ *
+ * Invalidate the FIFO during allocation and process all entries from the
+ * last successful read until invalid pswid and ccw[0] values are found.
+ * After reading each CRB entry from the fault FIFO, the kernel invalidates
+ * it by updating pswid with FIFO_INVALID_ENTRY and CCW[0] with
+ * CCW0_INVALID.
+ */
+#define FIFO_INVALID_ENTRY 0xffffffff
+#define CCW0_INVALID 1
+
+/*
* One per instance of VAS. Each instance will have a separate set of
* receive windows, one per coprocessor type.
*
@@ -317,39 +327,43 @@ struct vas_instance {
u64 paste_base_addr;
u64 paste_win_id_shift;
+ u64 irq_port;
+ int virq;
+ int fault_crbs;
+ int fault_fifo_size;
+ int fifo_in_progress; /* To wake up thread or return IRQ_HANDLED */
+ spinlock_t fault_lock; /* Protects fifo_in_progress update */
+ void *fault_fifo;
+ struct pnv_vas_window *fault_win; /* Fault window */
+
struct mutex mutex;
- struct vas_window *rxwin[VAS_COP_TYPE_MAX];
- struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
+ struct pnv_vas_window *rxwin[VAS_COP_TYPE_MAX];
+ struct pnv_vas_window *windows[VAS_WINDOWS_PER_CHIP];
+ char *name;
char *dbgname;
struct dentry *dbgdir;
};
/*
- * In-kernel state a VAS window. One per window.
+ * In-kernel state of a VAS window on PowerNV. One per window.
*/
-struct vas_window {
+struct pnv_vas_window {
+ struct vas_window vas_win;
/* Fields common to send and receive windows */
struct vas_instance *vinst;
- int winid;
bool tx_win; /* True if send window */
bool nx_win; /* True if NX window */
bool user_win; /* True if user space window */
void *hvwc_map; /* HV window context */
void *uwc_map; /* OS/User window context */
- pid_t pid; /* Linux process id of owner */
- int wcreds_max; /* Window credits */
-
- char *dbgname;
- struct dentry *dbgdir;
/* Fields applicable only to send windows */
void *paste_kaddr;
char *paste_addr_name;
- struct vas_window *rxwin;
+ struct pnv_vas_window *rxwin;
- /* Feilds applicable only to receive windows */
- enum vas_cop_type cop;
+ /* Fields applicable only to receive windows */
atomic_t num_txwins;
};
@@ -362,7 +376,7 @@ struct vas_window {
* is a container for the register fields in the window context.
*/
struct vas_winctx {
- void *rx_fifo;
+ u64 rx_fifo;
int rx_fifo_size;
int wcreds_max;
int rsvd_txbuf_count;
@@ -408,19 +422,32 @@ extern struct mutex vas_mutex;
extern struct vas_instance *find_vas_instance(int vasid);
extern void vas_init_dbgdir(void);
extern void vas_instance_init_dbgdir(struct vas_instance *vinst);
-extern void vas_window_init_dbgdir(struct vas_window *win);
-extern void vas_window_free_dbgdir(struct vas_window *win);
+extern void vas_window_init_dbgdir(struct pnv_vas_window *win);
+extern void vas_window_free_dbgdir(struct pnv_vas_window *win);
+extern int vas_setup_fault_window(struct vas_instance *vinst);
+extern irqreturn_t vas_fault_thread_fn(int irq, void *data);
+extern irqreturn_t vas_fault_handler(int irq, void *dev_id);
+extern void vas_return_credit(struct pnv_vas_window *window, bool tx);
+extern struct pnv_vas_window *vas_pswid_to_window(struct vas_instance *vinst,
+ uint32_t pswid);
+extern void vas_win_paste_addr(struct pnv_vas_window *window, u64 *addr,
+ int *len);
+
+static inline int vas_window_pid(struct vas_window *window)
+{
+ return pid_vnr(window->task_ref.pid);
+}
-static inline void vas_log_write(struct vas_window *win, char *name,
+static inline void vas_log_write(struct pnv_vas_window *win, char *name,
void *regptr, u64 val)
{
if (val)
pr_debug("%swin #%d: %s reg %p, val 0x%016llx\n",
- win->tx_win ? "Tx" : "Rx", win->winid, name,
- regptr, val);
+ win->tx_win ? "Tx" : "Rx", win->vas_win.winid,
+ name, regptr, val);
}
-static inline void write_uwc_reg(struct vas_window *win, char *name,
+static inline void write_uwc_reg(struct pnv_vas_window *win, char *name,
s32 reg, u64 val)
{
void *regptr;
@@ -431,7 +458,7 @@ static inline void write_uwc_reg(struct vas_window *win, char *name,
out_be64(regptr, val);
}
-static inline void write_hvwc_reg(struct vas_window *win, char *name,
+static inline void write_hvwc_reg(struct pnv_vas_window *win, char *name,
s32 reg, u64 val)
{
void *regptr;
@@ -442,7 +469,7 @@ static inline void write_hvwc_reg(struct vas_window *win, char *name,
out_be64(regptr, val);
}
-static inline u64 read_hvwc_reg(struct vas_window *win,
+static inline u64 read_hvwc_reg(struct pnv_vas_window *win,
char *name __maybe_unused, s32 reg)
{
return in_be64(win->hvwc_map+reg);
@@ -460,12 +487,7 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
*/
static inline u32 encode_pswid(int vasid, int winid)
{
- u32 pswid = 0;
-
- pswid |= vasid << (31 - 7);
- pswid |= winid;
-
- return pswid;
+ return ((u32)winid | (vasid << (31 - 7)));
}
static inline void decode_pswid(u32 pswid, int *vasid, int *winid)