14 files changed, 743 insertions, 131 deletions
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 3732118a0482..6c9d5199a7e2 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -17,3 +17,4 @@ obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
 obj-$(CONFIG_PPC_MEMTRACE)	+= memtrace.o
 obj-$(CONFIG_PPC_VAS)	+= vas.o vas-window.o vas-debug.o
 obj-$(CONFIG_PPC_FTW)	+= nx-ftw.o
+obj-$(CONFIG_OCXL_BASE)	+= ocxl.o
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 4650fb294e7a..33c86c1a1720 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -43,6 +43,22 @@
 
 static int eeh_event_irq = -EINVAL;
 
+void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+
+	if (!pdev->is_virtfn)
+		return;
+
+	/*
+	 * The following operations will fail if VF's sysfs files
+	 * aren't created or its resources aren't finalized.
+	 */
+	eeh_add_device_early(pdn);
+	eeh_add_device_late(pdev);
+	eeh_sysfs_add_device(pdev);
+}
+
 static int pnv_eeh_init(void)
 {
 	struct pci_controller *hose;
@@ -86,6 +102,7 @@ static int pnv_eeh_init(void)
 	}
 
 	eeh_set_pe_aux_size(max_diag_size);
+	ppc_md.pcibios_bus_add_device = pnv_pcibios_bus_add_device;
 
 	return 0;
 }
@@ -1638,70 +1655,11 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 	return ret;
 }
 
-static int pnv_eeh_restore_vf_config(struct pci_dn *pdn)
-{
-	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
-	u32 devctl, cmd, cap2, aer_capctl;
-	int old_mps;
-
-	if (edev->pcie_cap) {
-		/* Restore MPS */
-		old_mps = (ffs(pdn->mps) - 8) << 5;
-		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
-				     2, &devctl);
-		devctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
-		devctl |= old_mps;
-		eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
-				      2, devctl);
-
-		/* Disable Completion Timeout */
-		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2,
-				     4, &cap2);
-		if (cap2 & 0x10) {
-			eeh_ops->read_config(pdn,
-					     edev->pcie_cap + PCI_EXP_DEVCTL2,
-					     4, &cap2);
-			cap2 |= 0x10;
-			eeh_ops->write_config(pdn,
-					      edev->pcie_cap + PCI_EXP_DEVCTL2,
-					      4, cap2);
-		}
-	}
-
-	/* Enable SERR and parity checking */
-	eeh_ops->read_config(pdn, PCI_COMMAND, 2, &cmd);
-	cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
-	eeh_ops->write_config(pdn, PCI_COMMAND, 2, cmd);
-
-	/* Enable report various errors */
-	if (edev->pcie_cap) {
-		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
-				     2, &devctl);
-		devctl &= ~PCI_EXP_DEVCTL_CERE;
-		devctl |= (PCI_EXP_DEVCTL_NFERE |
-			   PCI_EXP_DEVCTL_FERE |
-			   PCI_EXP_DEVCTL_URRE);
-		eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
-				      2, devctl);
-	}
-
-	/* Enable ECRC generation and check */
-	if (edev->pcie_cap && edev->aer_cap) {
-		eeh_ops->read_config(pdn, edev->aer_cap + PCI_ERR_CAP,
-				     4, &aer_capctl);
-		aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
-		eeh_ops->write_config(pdn, edev->aer_cap + PCI_ERR_CAP,
-				      4, aer_capctl);
-	}
-
-	return 0;
-}
-
 static int pnv_eeh_restore_config(struct pci_dn *pdn)
 {
 	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
 	struct pnv_phb *phb;
-	s64 ret;
+	s64 ret = 0;
 	int config_addr = (pdn->busno << 8) | (pdn->devfn);
 
 	if (!edev)
@@ -1715,7 +1673,7 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
 	 * to be exported by firmware in extendible way.
 	 */
 	if (edev->physfn) {
-		ret = pnv_eeh_restore_vf_config(pdn);
+		ret = eeh_restore_vf_config(pdn);
 	} else {
 		phb = pdn->phb->private_data;
 		ret = opal_pci_reinit(phb->opal_id,
@@ -1728,7 +1686,7 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
 		return -EIO;
 	}
 
-	return 0;
+	return ret;
 }
 
 static struct eeh_ops pnv_eeh_ops = {
@@ -1746,25 +1704,10 @@ static struct eeh_ops pnv_eeh_ops = {
 	.read_config            = pnv_eeh_read_config,
 	.write_config           = pnv_eeh_write_config,
 	.next_error		= pnv_eeh_next_error,
-	.restore_config		= pnv_eeh_restore_config
+	.restore_config		= pnv_eeh_restore_config,
+	.notify_resume		= NULL
 };
 
-void pcibios_bus_add_device(struct pci_dev *pdev)
-{
-	struct pci_dn *pdn = pci_get_pdn(pdev);
-
-	if (!pdev->is_virtfn)
-		return;
-
-	/*
-	 * The following operations will fail if VF's sysfs files
-	 * aren't created or its resources aren't finalized.
-	 */
-	eeh_add_device_early(pdn);
-	eeh_add_device_late(pdev);
-	eeh_sysfs_add_device(pdev);
-}
-
 #ifdef CONFIG_PCI_IOV
 static void pnv_pci_fixup_vf_mps(struct pci_dev *pdev)
 {
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index f6cbc1a71472..0a253b64ac5f 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -39,7 +39,10 @@
  */
 static struct pci_dev *get_pci_dev(struct device_node *dn)
 {
-	return PCI_DN(dn)->pcidev;
+	struct pci_dn *pdn = PCI_DN(dn);
+
+	return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
+					   pdn->busno, pdn->devfn);
 }
 
 /* Given a NPU device get the associated PCI device. */
@@ -277,7 +280,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
 	int64_t rc = 0;
 	phys_addr_t top = memblock_end_of_DRAM();
 
-	if (phb->type != PNV_PHB_NPU || !npe->pdev)
+	if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
 		return -EINVAL;
 
 	rc = pnv_npu_unset_window(npe, 0);
diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
new file mode 100644
index 000000000000..fa9b53af3c7b
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright 2017 IBM Corp.
+#include <asm/pnv-ocxl.h>
+#include <asm/opal.h>
+#include <asm/xive.h>
+#include <misc/ocxl-config.h>
+#include "pci.h"
+
+#define PNV_OCXL_TL_P9_RECV_CAP		0x000000000000000Full
+#define PNV_OCXL_ACTAG_MAX		64
+/* PASIDs are 20-bit, but on P9, NPU can only handle 15 bits */
+#define PNV_OCXL_PASID_BITS		15
+#define PNV_OCXL_PASID_MAX		((1 << PNV_OCXL_PASID_BITS) - 1)
+
+#define AFU_PRESENT (1 << 31)
+#define AFU_INDEX_MASK 0x3F000000
+#define AFU_INDEX_SHIFT 24
+#define ACTAG_MASK 0xFFF
+
+
+struct actag_range {
+	u16 start;
+	u16 count;
+};
+
+struct npu_link {
+	struct list_head list;
+	int domain;
+	int bus;
+	int dev;
+	u16 fn_desired_actags[8];
+	struct actag_range fn_actags[8];
+	bool assignment_done;
+};
+static struct list_head links_list = LIST_HEAD_INIT(links_list);
+static DEFINE_MUTEX(links_list_lock);
+
+
+/*
+ * opencapi actags handling:
+ *
+ * When sending commands, the opencapi device references the memory
+ * context it's targeting with an 'actag', which is really an alias
+ * for a (BDF, pasid) combination. When it receives a command, the NPU
+ * must do a lookup of the actag to identify the memory context. The
+ * hardware supports a finite number of actags per link (64 for
+ * POWER9).
+ *
+ * The device can carry multiple functions, and each function can have
+ * multiple AFUs. Each AFU advertises in its config space the number
+ * of desired actags. The host must configure in the config space of
+ * the AFU how many actags the AFU is really allowed to use (which can
+ * be less than what the AFU desires).
+ *
+ * When a PCI function is probed by the driver, it has no visibility
+ * about the other PCI functions and how many actags they'd like,
+ * which makes it impossible to distribute actags fairly among AFUs.
+ *
+ * Unfortunately, the only way to know how many actags a function
+ * desires is by looking at the data for each AFU in the config space
+ * and add them up. Similarly, the only way to know how many actags
+ * all the functions of the physical device desire is by adding the
+ * previously computed function counts. Then we can match that against
+ * what the hardware supports.
+ *
+ * To get a comprehensive view, we use a 'pci fixup': at the end of
+ * PCI enumeration, each function counts how many actags its AFUs
+ * desire and we save it in a 'npu_link' structure, shared between all
+ * the PCI functions of a same device. Therefore, when the first
+ * function is probed by the driver, we can get an idea of the total
+ * count of desired actags for the device, and assign the actags to
+ * the AFUs, by pro-rating if needed.
+ */
+
+static int find_dvsec_from_pos(struct pci_dev *dev, int dvsec_id, int pos)
+{
+	int vsec = pos;
+	u16 vendor, id;
+
+	while ((vsec = pci_find_next_ext_capability(dev, vsec,
+						    OCXL_EXT_CAP_ID_DVSEC))) {
+		pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
+				&vendor);
+		pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, &id);
+		if (vendor == PCI_VENDOR_ID_IBM && id == dvsec_id)
+			return vsec;
+	}
+	return 0;
+}
+
+static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)
+{
+	int vsec = 0;
+	u8 idx;
+
+	while ((vsec = find_dvsec_from_pos(dev, OCXL_DVSEC_AFU_CTRL_ID,
+					   vsec))) {
+		pci_read_config_byte(dev, vsec + OCXL_DVSEC_AFU_CTRL_AFU_IDX,
+				&idx);
+		if (idx == afu_idx)
+			return vsec;
+	}
+	return 0;
+}
+
+static int get_max_afu_index(struct pci_dev *dev, int *afu_idx)
+{
+	int pos;
+	u32 val;
+
+	pos = find_dvsec_from_pos(dev, OCXL_DVSEC_FUNC_ID, 0);
+	if (!pos)
+		return -ESRCH;
+
+	pci_read_config_dword(dev, pos + OCXL_DVSEC_FUNC_OFF_INDEX, &val);
+	if (val & AFU_PRESENT)
+		*afu_idx = (val & AFU_INDEX_MASK) >> AFU_INDEX_SHIFT;
+	else
+		*afu_idx = -1;
+	return 0;
+}
+
+static int get_actag_count(struct pci_dev *dev, int afu_idx, int *actag)
+{
+	int pos;
+	u16 actag_sup;
+
+	pos = find_dvsec_afu_ctrl(dev, afu_idx);
+	if (!pos)
+		return -ESRCH;
+
+	pci_read_config_word(dev, pos + OCXL_DVSEC_AFU_CTRL_ACTAG_SUP,
+			&actag_sup);
+	*actag = actag_sup & ACTAG_MASK;
+	return 0;
+}
+
+static struct npu_link *find_link(struct pci_dev *dev)
+{
+	struct npu_link *link;
+
+	list_for_each_entry(link, &links_list, list) {
+		/* The functions of a device all share the same link */
+		if (link->domain == pci_domain_nr(dev->bus) &&
+			link->bus == dev->bus->number &&
+			link->dev == PCI_SLOT(dev->devfn)) {
+			return link;
+		}
+	}
+
+	/* link doesn't exist yet. Allocate one */
+	link = kzalloc(sizeof(struct npu_link), GFP_KERNEL);
+	if (!link)
+		return NULL;
+	link->domain = pci_domain_nr(dev->bus);
+	link->bus = dev->bus->number;
+	link->dev = PCI_SLOT(dev->devfn);
+	list_add(&link->list, &links_list);
+	return link;
+}
+
+static void pnv_ocxl_fixup_actag(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct npu_link *link;
+	int rc, afu_idx = -1, i, actag;
+
+	if (!machine_is(powernv))
+		return;
+
+	if (phb->type != PNV_PHB_NPU_OCAPI)
+		return;
+
+	mutex_lock(&links_list_lock);
+
+	link = find_link(dev);
+	if (!link) {
+		dev_warn(&dev->dev, "couldn't update actag information\n");
+		mutex_unlock(&links_list_lock);
+		return;
+	}
+
+	/*
+	 * Check how many actags are desired for the AFUs under that
+	 * function and add it to the count for the link
+	 */
+	rc = get_max_afu_index(dev, &afu_idx);
+	if (rc) {
+		/* Most likely an invalid config space */
+		dev_dbg(&dev->dev, "couldn't find AFU information\n");
+		afu_idx = -1;
+	}
+
+	link->fn_desired_actags[PCI_FUNC(dev->devfn)] = 0;
+	for (i = 0; i <= afu_idx; i++) {
+		/*
+		 * AFU index 'holes' are allowed. So don't fail if we
+		 * can't read the actag info for an index
+		 */
+		rc = get_actag_count(dev, i, &actag);
+		if (rc)
+			continue;
+		link->fn_desired_actags[PCI_FUNC(dev->devfn)] += actag;
+	}
+	dev_dbg(&dev->dev, "total actags for function: %d\n",
+		link->fn_desired_actags[PCI_FUNC(dev->devfn)]);
+
+	mutex_unlock(&links_list_lock);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_ocxl_fixup_actag);
+
+static u16 assign_fn_actags(u16 desired, u16 total)
+{
+	u16 count;
+
+	if (total <= PNV_OCXL_ACTAG_MAX)
+		count = desired;
+	else
+		count = PNV_OCXL_ACTAG_MAX * desired / total;
+
+	return count;
+}
+
+static void assign_actags(struct npu_link *link)
+{
+	u16 actag_count, range_start = 0, total_desired = 0;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		total_desired += link->fn_desired_actags[i];
+
+	for (i = 0; i < 8; i++) {
+		if (link->fn_desired_actags[i]) {
+			actag_count = assign_fn_actags(
+				link->fn_desired_actags[i],
+				total_desired);
+			link->fn_actags[i].start = range_start;
+			link->fn_actags[i].count = actag_count;
+			range_start += actag_count;
+			WARN_ON(range_start >= PNV_OCXL_ACTAG_MAX);
+		}
+		pr_debug("link %x:%x:%x fct %d actags: start=%d count=%d (desired=%d)\n",
+			link->domain, link->bus, link->dev, i,
+			link->fn_actags[i].start, link->fn_actags[i].count,
+			link->fn_desired_actags[i]);
+	}
+	link->assignment_done = true;
+}
+
+int pnv_ocxl_get_actag(struct pci_dev *dev, u16 *base, u16 *enabled,
+		u16 *supported)
+{
+	struct npu_link *link;
+
+	mutex_lock(&links_list_lock);
+
+	link = find_link(dev);
+	if (!link) {
+		dev_err(&dev->dev, "actag information not found\n");
+		mutex_unlock(&links_list_lock);
+		return -ENODEV;
+	}
+	/*
+	 * On p9, we only have 64 actags per link, so they must be
+	 * shared by all the functions of the same adapter. We counted
+	 * the desired actag counts during PCI enumeration, so that we
+	 * can allocate a pro-rated number of actags to each function.
+	 */
+	if (!link->assignment_done)
+		assign_actags(link);
+
+	*base      = link->fn_actags[PCI_FUNC(dev->devfn)].start;
+	*enabled   = link->fn_actags[PCI_FUNC(dev->devfn)].count;
+	*supported = link->fn_desired_actags[PCI_FUNC(dev->devfn)];
+
+	mutex_unlock(&links_list_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_actag);
+
+int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count)
+{
+	struct npu_link *link;
+	int i, rc = -EINVAL;
+
+	/*
+	 * The number of PASIDs (process address space ID) which can
+	 * be used by a function depends on how many functions exist
+	 * on the device. The NPU needs to be configured to know how
+	 * many bits are available to PASIDs and how many are to be
+	 * used by the function BDF indentifier.
+	 *
+	 * We only support one AFU-carrying function for now.
+	 */
+	mutex_lock(&links_list_lock);
+
+	link = find_link(dev);
+	if (!link) {
+		dev_err(&dev->dev, "actag information not found\n");
+		mutex_unlock(&links_list_lock);
+		return -ENODEV;
+	}
+
+	for (i = 0; i < 8; i++)
+		if (link->fn_desired_actags[i] && (i == PCI_FUNC(dev->devfn))) {
+			*count = PNV_OCXL_PASID_MAX;
+			rc = 0;
+			break;
+		}
+
+	mutex_unlock(&links_list_lock);
+	dev_dbg(&dev->dev, "%d PASIDs available for function\n",
+		rc ? 0 : *count);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_pasid_count);
+
+static void set_templ_rate(unsigned int templ, unsigned int rate, char *buf)
+{
+	int shift, idx;
+
+	WARN_ON(templ > PNV_OCXL_TL_MAX_TEMPLATE);
+	idx = (PNV_OCXL_TL_MAX_TEMPLATE - templ) / 2;
+	shift = 4 * (1 - ((PNV_OCXL_TL_MAX_TEMPLATE - templ) % 2));
+	buf[idx] |= rate << shift;
+}
+
+int pnv_ocxl_get_tl_cap(struct pci_dev *dev, long *cap,
+			char *rate_buf, int rate_buf_size)
+{
+	if (rate_buf_size != PNV_OCXL_TL_RATE_BUF_SIZE)
+		return -EINVAL;
+	/*
+	 * The TL capabilities are a characteristic of the NPU, so
+	 * we go with hard-coded values.
+	 *
+	 * The receiving rate of each template is encoded on 4 bits.
+	 *
+	 * On P9:
+	 * - templates 0 -> 3 are supported
+	 * - templates 0, 1 and 3 have a 0 receiving rate
+	 * - template 2 has receiving rate of 1 (extra cycle)
+	 */
+	memset(rate_buf, 0, rate_buf_size);
+	set_templ_rate(2, 1, rate_buf);
+	*cap = PNV_OCXL_TL_P9_RECV_CAP;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_tl_cap);
+
+int pnv_ocxl_set_tl_conf(struct pci_dev *dev, long cap,
+			uint64_t rate_buf_phys, int rate_buf_size)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	int rc;
+
+	if (rate_buf_size != PNV_OCXL_TL_RATE_BUF_SIZE)
+		return -EINVAL;
+
+	rc = opal_npu_tl_set(phb->opal_id, dev->devfn, cap,
+			rate_buf_phys, rate_buf_size);
+	if (rc) {
+		dev_err(&dev->dev, "Can't configure host TL: %d\n", rc);
+		return -EINVAL;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_set_tl_conf);
+
+int pnv_ocxl_get_xsl_irq(struct pci_dev *dev, int *hwirq)
+{
+	int rc;
+
+	rc = of_property_read_u32(dev->dev.of_node, "ibm,opal-xsl-irq", hwirq);
+	if (rc) {
+		dev_err(&dev->dev,
+			"Can't get translation interrupt for device\n");
+		return rc;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_get_xsl_irq);
+
+void pnv_ocxl_unmap_xsl_regs(void __iomem *dsisr, void __iomem *dar,
+			void __iomem *tfc, void __iomem *pe_handle)
+{
+	iounmap(dsisr);
+	iounmap(dar);
+	iounmap(tfc);
+	iounmap(pe_handle);
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_unmap_xsl_regs);
+
+int pnv_ocxl_map_xsl_regs(struct pci_dev *dev, void __iomem **dsisr,
+			void __iomem **dar, void __iomem **tfc,
+			void __iomem **pe_handle)
+{
+	u64 reg;
+	int i, j, rc = 0;
+	void __iomem *regs[4];
+
+	/*
+	 * opal stores the mmio addresses of the DSISR, DAR, TFC and
+	 * PE_HANDLE registers in a device tree property, in that
+	 * order
+	 */
+	for (i = 0; i < 4; i++) {
+		rc = of_property_read_u64_index(dev->dev.of_node,
+						"ibm,opal-xsl-mmio", i, &reg);
+		if (rc)
+			break;
+		regs[i] = ioremap(reg, 8);
+		if (!regs[i]) {
+			rc = -EINVAL;
+			break;
+		}
+	}
+	if (rc) {
+		dev_err(&dev->dev, "Can't map translation mmio registers\n");
+		for (j = i - 1; j >= 0; j--)
+			iounmap(regs[j]);
+	} else {
+		*dsisr = regs[0];
+		*dar = regs[1];
+		*tfc = regs[2];
+		*pe_handle = regs[3];
+	}
+	return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_map_xsl_regs);
+
+struct spa_data {
+	u64 phb_opal_id;
+	u32 bdfn;
+};
+
+int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask,
+		void **platform_data)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	struct spa_data *data;
+	u32 bdfn;
+	int rc;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	bdfn = (dev->bus->number << 8) | dev->devfn;
+	rc = opal_npu_spa_setup(phb->opal_id, bdfn, virt_to_phys(spa_mem),
+				PE_mask);
+	if (rc) {
+		dev_err(&dev->dev, "Can't setup Shared Process Area: %d\n", rc);
+		kfree(data);
+		return rc;
+	}
+	data->phb_opal_id = phb->opal_id;
+	data->bdfn = bdfn;
+	*platform_data = (void *) data;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_setup);
+
+void pnv_ocxl_spa_release(void *platform_data)
+{
+	struct spa_data *data = (struct spa_data *) platform_data;
+	int rc;
+
+	rc = opal_npu_spa_setup(data->phb_opal_id, data->bdfn, 0, 0);
+	WARN_ON(rc);
+	kfree(data);
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_release);
+
+int pnv_ocxl_spa_remove_pe(void *platform_data, int pe_handle)
+{
+	struct spa_data *data = (struct spa_data *) platform_data;
+	int rc;
+
+	rc = opal_npu_spa_clear_cache(data->phb_opal_id, data->bdfn, pe_handle);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe);
+
+int pnv_ocxl_alloc_xive_irq(u32 *irq, u64 *trigger_addr)
+{
+	__be64 flags, trigger_page;
+	s64 rc;
+	u32 hwirq;
+
+	hwirq = xive_native_alloc_irq();
+	if (!hwirq)
+		return -ENOENT;
+
+	rc = opal_xive_get_irq_info(hwirq, &flags, NULL, &trigger_page, NULL,
+				NULL);
+	if (rc || !trigger_page) {
+		xive_native_free_irq(hwirq);
+		return -ENOENT;
+	}
+	*irq = hwirq;
+	*trigger_addr = be64_to_cpu(trigger_page);
+	return 0;
+
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_alloc_xive_irq);
+
+void pnv_ocxl_free_xive_irq(u32 irq)
+{
+	xive_native_free_irq(irq);
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_free_xive_irq);
diff --git a/arch/powerpc/platforms/powernv/opal-dump.c b/arch/powerpc/platforms/powernv/opal-dump.c
index 4c827826c05e..0dc8fa4e0af2 100644
--- a/arch/powerpc/platforms/powernv/opal-dump.c
+++ b/arch/powerpc/platforms/powernv/opal-dump.c
@@ -103,9 +103,9 @@ static ssize_t dump_ack_store(struct dump_obj *dump_obj,
  * due to the dynamic size of the dump
  */
 static struct dump_attribute id_attribute =
-	__ATTR(id, S_IRUGO, dump_id_show, NULL);
+	__ATTR(id, 0444, dump_id_show, NULL);
 static struct dump_attribute type_attribute =
-	__ATTR(type, S_IRUGO, dump_type_show, NULL);
+	__ATTR(type, 0444, dump_type_show, NULL);
 static struct dump_attribute ack_attribute =
 	__ATTR(acknowledge, 0660, dump_ack_show, dump_ack_store);
 
diff --git a/arch/powerpc/platforms/powernv/opal-elog.c b/arch/powerpc/platforms/powernv/opal-elog.c
index ecd6d9177d13..ba6e437abb4b 100644
--- a/arch/powerpc/platforms/powernv/opal-elog.c
+++ b/arch/powerpc/platforms/powernv/opal-elog.c
@@ -83,9 +83,9 @@ static ssize_t elog_ack_store(struct elog_obj *elog_obj,
 }
 
 static struct elog_attribute id_attribute =
-	__ATTR(id, S_IRUGO, elog_id_show, NULL);
+	__ATTR(id, 0444, elog_id_show, NULL);
 static struct elog_attribute type_attribute =
-	__ATTR(type, S_IRUGO, elog_type_show, NULL);
+	__ATTR(type, 0444, elog_type_show, NULL);
 static struct elog_attribute ack_attribute =
 	__ATTR(acknowledge, 0660, elog_ack_show, elog_ack_store);
 
diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c
index 465ea105b771..dd4c9b8b8a81 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -21,6 +21,78 @@
 #include <asm/io.h>
 #include <asm/imc-pmu.h>
 #include <asm/cputhreads.h>
+#include <asm/debugfs.h>
+
+static struct dentry *imc_debugfs_parent;
+
+/* Helpers to export imc command and mode via debugfs */
+static int imc_mem_get(void *data, u64 *val)
+{
+	*val = cpu_to_be64(*(u64 *)data);
+	return 0;
+}
+
+static int imc_mem_set(void *data, u64 val)
+{
+	*(u64 *)data = cpu_to_be64(val);
+	return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(fops_imc_x64, imc_mem_get, imc_mem_set, "0x%016llx\n");
+
+static struct dentry *imc_debugfs_create_x64(const char *name, umode_t mode,
+					     struct dentry *parent, u64  *value)
+{
+	return debugfs_create_file_unsafe(name, mode, parent,
+					  value, &fops_imc_x64);
+}
+
+/*
+ * export_imc_mode_and_cmd: Create a debugfs interface
+ *                     for imc_cmd and imc_mode
+ *                     for each node in the system.
+ *  imc_mode and imc_cmd can be changed by echo into
+ *  this interface.
+ */
+static void export_imc_mode_and_cmd(struct device_node *node,
+				    struct imc_pmu *pmu_ptr)
+{
+	static u64 loc, *imc_mode_addr, *imc_cmd_addr;
+	int chip = 0, nid;
+	char mode[16], cmd[16];
+	u32 cb_offset;
+
+	imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
+
+	/*
+	 * Return here, either because 'imc' directory already exists,
+	 * Or failed to create a new one.
+	 */
+	if (!imc_debugfs_parent)
+		return;
+
+	if (of_property_read_u32(node, "cb_offset", &cb_offset))
+		cb_offset = IMC_CNTL_BLK_OFFSET;
+
+	for_each_node(nid) {
+		loc = (u64)(pmu_ptr->mem_info[chip].vbase) + cb_offset;
+		imc_mode_addr = (u64 *)(loc + IMC_CNTL_BLK_MODE_OFFSET);
+		sprintf(mode, "imc_mode_%d", nid);
+		if (!imc_debugfs_create_x64(mode, 0600, imc_debugfs_parent,
+					    imc_mode_addr))
+			goto err;
+
+		imc_cmd_addr = (u64 *)(loc + IMC_CNTL_BLK_CMD_OFFSET);
+		sprintf(cmd, "imc_cmd_%d", nid);
+		if (!imc_debugfs_create_x64(cmd, 0600, imc_debugfs_parent,
+					    imc_cmd_addr))
+			goto err;
+		chip++;
+	}
+	return;
+
+err:
+	debugfs_remove_recursive(imc_debugfs_parent);
+}
 
 /*
  * imc_get_mem_addr_nest: Function to get nest counter memory region
@@ -65,6 +137,7 @@ static int imc_get_mem_addr_nest(struct device_node *node,
 	}
 
 	pmu_ptr->imc_counter_mmaped = true;
+	export_imc_mode_and_cmd(node, pmu_ptr);
 	kfree(base_addr_arr);
 	kfree(chipid_arr);
 	return 0;
@@ -213,6 +286,10 @@ static int opal_imc_counters_probe(struct platform_device *pdev)
 		}
 	}
 
+	/* If none of the nest units are registered, remove debugfs interface */
+	if (pmu_count == 0)
+		debugfs_remove_recursive(imc_debugfs_parent);
+
 	return 0;
 }
 
diff --git a/arch/powerpc/platforms/powernv/opal-sysparam.c b/arch/powerpc/platforms/powernv/opal-sysparam.c
index 23fb6647dced..6fd4092798d5 100644
--- a/arch/powerpc/platforms/powernv/opal-sysparam.c
+++ b/arch/powerpc/platforms/powernv/opal-sysparam.c
@@ -260,13 +260,13 @@ void __init opal_sys_param_init(void)
 		/* If the parameter is read-only or read-write */
 		switch (perm[i] & 3) {
 		case OPAL_SYSPARAM_READ:
-			attr[i].kobj_attr.attr.mode = S_IRUGO;
+			attr[i].kobj_attr.attr.mode = 0444;
 			break;
 		case OPAL_SYSPARAM_WRITE:
-			attr[i].kobj_attr.attr.mode = S_IWUSR;
+			attr[i].kobj_attr.attr.mode = 0200;
 			break;
 		case OPAL_SYSPARAM_RW:
-			attr[i].kobj_attr.attr.mode = S_IRUGO | S_IWUSR;
+			attr[i].kobj_attr.attr.mode = 0644;
 			break;
 		default:
 			break;
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 6f4b00a2ac46..1b2936ba6040 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -320,3 +320,6 @@ OPAL_CALL(opal_set_powercap,			OPAL_SET_POWERCAP);
 OPAL_CALL(opal_get_power_shift_ratio,		OPAL_GET_POWER_SHIFT_RATIO);
 OPAL_CALL(opal_set_power_shift_ratio,		OPAL_SET_POWER_SHIFT_RATIO);
 OPAL_CALL(opal_sensor_group_clear,		OPAL_SENSOR_GROUP_CLEAR);
+OPAL_CALL(opal_npu_spa_setup,			OPAL_NPU_SPA_SETUP);
+OPAL_CALL(opal_npu_spa_clear_cache,		OPAL_NPU_SPA_CLEAR_CACHE);
+OPAL_CALL(opal_npu_tl_set,			OPAL_NPU_TL_SET);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 041ddbd1fc57..c15182765ff5 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -127,7 +127,7 @@ int __init early_init_dt_scan_opal(unsigned long node,
 
 	if (of_flat_dt_is_compatible(node, "ibm,opal-v3")) {
 		powerpc_firmware_features |= FW_FEATURE_OPAL;
-		pr_info("OPAL detected !\n");
+		pr_debug("OPAL detected !\n");
 	} else {
 		panic("OPAL != V3 detected, no longer supported.\n");
 	}
@@ -239,8 +239,8 @@ int opal_message_notifier_register(enum opal_msg_type msg_type,
 					struct notifier_block *nb)
 {
 	if (!nb || msg_type >= OPAL_MSG_TYPE_MAX) {
-		pr_warning("%s: Invalid arguments, msg_type:%d\n",
-			   __func__, msg_type);
+		pr_warn("%s: Invalid arguments, msg_type:%d\n",
+			__func__, msg_type);
 		return -EINVAL;
 	}
 
@@ -281,8 +281,8 @@ static void opal_handle_message(void)
 
 	/* check for errors. */
 	if (ret) {
-		pr_warning("%s: Failed to retrieve opal message, err=%lld\n",
-				__func__, ret);
+		pr_warn("%s: Failed to retrieve opal message, err=%lld\n",
+			__func__, ret);
 		return;
 	}
 
@@ -461,24 +461,14 @@ static int opal_recover_mce(struct pt_regs *regs,
 
 void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg)
 {
-	/*
-	 * This is mostly taken from kernel/panic.c, but tries to do
-	 * relatively minimal work. Don't use delay functions (TB may
-	 * be broken), don't crash dump (need to set a firmware log),
-	 * don't run notifiers. We do want to get some information to
-	 * Linux console.
-	 */
-	console_verbose();
-	bust_spinlocks(1);
+	panic_flush_kmsg_start();
+
 	pr_emerg("Hardware platform error: %s\n", msg);
 	if (regs)
 		show_regs(regs);
 	smp_send_stop();
-	printk_safe_flush_on_panic();
-	kmsg_dump(KMSG_DUMP_PANIC);
-	bust_spinlocks(0);
-	debug_locks_off();
-	console_flush_on_panic();
+
+	panic_flush_kmsg_end();
 
 	/*
 	 * Don't bother to shut things down because this will
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9582aeb1fe4c..496e47696ed0 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -54,7 +54,8 @@
 #define POWERNV_IOMMU_DEFAULT_LEVELS	1
 #define POWERNV_IOMMU_MAX_LEVELS	5
 
-static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU" };
+static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
+					      "NPU_OCAPI" };
 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
 
 void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -89,6 +90,7 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
 }
 
 static bool pnv_iommu_bypass_disabled __read_mostly;
+static bool pci_reset_phbs __read_mostly;
 
 static int __init iommu_setup(char *str)
 {
@@ -110,6 +112,14 @@ static int __init iommu_setup(char *str)
 }
 early_param("iommu", iommu_setup);
 
+static int __init pci_reset_phbs_setup(char *str)
+{
+	pci_reset_phbs = true;
+	return 0;
+}
+
+early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup);
+
 static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r)
 {
 	/*
@@ -924,7 +934,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe)
 	 * Configure PELTV. NPUs don't have a PELTV table so skip
 	 * configuration on them.
 	 */
-	if (phb->type != PNV_PHB_NPU)
+	if (phb->type != PNV_PHB_NPU_NVLINK && phb->type != PNV_PHB_NPU_OCAPI)
 		pnv_ioda_set_peltv(phb, pe, true);
 
 	/* Setup reverse map */
@@ -1059,8 +1069,8 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 
 	pe = pnv_ioda_alloc_pe(phb);
 	if (!pe) {
-		pr_warning("%s: Not enough PE# available, disabling device\n",
-			   pci_name(dev));
+		pr_warn("%s: Not enough PE# available, disabling device\n",
+			pci_name(dev));
 		return NULL;
 	}
 
@@ -1072,7 +1082,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
 	 * At some point we want to remove the PDN completely anyways
 	 */
 	pci_dev_get(dev);
-	pdn->pcidev = dev;
 	pdn->pe_number = pe->pe_number;
 	pe->flags = PNV_IODA_PE_DEV;
 	pe->pdev = dev;
@@ -1119,7 +1128,6 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
 			continue;
 
 		pe->device_count++;
-		pdn->pcidev = dev;
 		pdn->pe_number = pe->pe_number;
 		if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
 			pnv_ioda_setup_same_PE(dev->subordinate, pe);
@@ -1164,7 +1172,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
 		pe = pnv_ioda_alloc_pe(phb);
 
 	if (!pe) {
-		pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n",
+		pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
 			__func__, pci_domain_nr(bus), bus->number);
 		return NULL;
 	}
@@ -1234,7 +1242,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
 			pci_dev_get(npu_pdev);
 			npu_pdn = pci_get_pdn(npu_pdev);
 			rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
-			npu_pdn->pcidev = npu_pdev;
 			npu_pdn->pe_number = pe_num;
 			phb->ioda.pe_rmap[rid] = pe->pe_number;
 
@@ -1272,16 +1279,23 @@ static void pnv_pci_ioda_setup_PEs(void)
 {
 	struct pci_controller *hose, *tmp;
 	struct pnv_phb *phb;
+	struct pci_bus *bus;
+	struct pci_dev *pdev;
 
 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
 		phb = hose->private_data;
-		if (phb->type == PNV_PHB_NPU) {
+		if (phb->type == PNV_PHB_NPU_NVLINK) {
 			/* PE#0 is needed for error reporting */
 			pnv_ioda_reserve_pe(phb, 0);
 			pnv_ioda_setup_npu_PEs(hose->bus);
 			if (phb->model == PNV_PHB_MODEL_NPU2)
 				pnv_npu2_init(phb);
 		}
+		if (phb->type == PNV_PHB_NPU_OCAPI) {
+			bus = hose->bus;
+			list_for_each_entry(pdev, &bus->devices, bus_list)
+				pnv_ioda_setup_dev_PE(pdev);
+		}
 	}
 }
 
@@ -1692,7 +1706,7 @@ m64_failed:
 	return ret;
 }
 
-int pcibios_sriov_disable(struct pci_dev *pdev)
+int pnv_pcibios_sriov_disable(struct pci_dev *pdev)
 {
 	pnv_pci_sriov_disable(pdev);
 
@@ -1701,7 +1715,7 @@ int pcibios_sriov_disable(struct pci_dev *pdev)
 	return 0;
 }
 
-int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
+int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 {
 	/* Allocate PCI data */
 	add_dev_pci_data(pdev);
@@ -2572,7 +2586,6 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
 	unsigned long direct_table_size;
 
 	if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) ||
-			(window_size > memory_hotplug_max()) ||
 			!is_power_of_2(window_size))
 		return 0;
 
@@ -2640,7 +2653,7 @@ static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque)
 
 	hose = pci_bus_to_host(pdev->bus);
 	phb = hose->private_data;
-	if (phb->type != PNV_PHB_NPU)
+	if (phb->type != PNV_PHB_NPU_NVLINK)
 		return 0;
 
 	*ptmppe = &phb->ioda.pe_array[pdn->pe_number];
@@ -2724,7 +2737,7 @@ static void pnv_pci_ioda_setup_iommu_api(void)
 	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
 		phb = hose->private_data;
 
-		if (phb->type != PNV_PHB_NPU)
+		if (phb->type != PNV_PHB_NPU_NVLINK)
 			continue;
 
 		list_for_each_entry(pe, &phb->ioda.pe_list, list) {
@@ -3293,7 +3306,7 @@ static void pnv_pci_ioda_create_dbgfs(void)
 		sprintf(name, "PCI%04x", hose->global_number);
 		phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root);
 		if (!phb->dbgfs) {
-			pr_warning("%s: Error on creating debugfs on PHB#%x\n",
+			pr_warn("%s: Error on creating debugfs on PHB#%x\n",
 				__func__, hose->global_number);
 			continue;
 		}
@@ -3774,6 +3787,13 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
 	.shutdown		= pnv_pci_ioda_shutdown,
 };
 
+static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
+	.enable_device_hook	= pnv_pci_enable_device_hook,
+	.window_alignment	= pnv_pci_window_alignment,
+	.reset_secondary_bus	= pnv_pci_reset_secondary_bus,
+	.shutdown		= pnv_pci_ioda_shutdown,
+};
+
 #ifdef CONFIG_CXL_BASE
 const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops = {
 	.dma_dev_setup		= pnv_pci_dma_dev_setup,
@@ -4007,9 +4027,14 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	 */
 	ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
 
-	if (phb->type == PNV_PHB_NPU) {
+	switch (phb->type) {
+	case PNV_PHB_NPU_NVLINK:
 		hose->controller_ops = pnv_npu_ioda_controller_ops;
-	} else {
+		break;
+	case PNV_PHB_NPU_OCAPI:
+		hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops;
+		break;
+	default:
 		phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
 		hose->controller_ops = pnv_pci_ioda_controller_ops;
 	}
@@ -4019,6 +4044,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 #ifdef CONFIG_PCI_IOV
 	ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
 	ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment;
+	ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable;
+	ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable;
 #endif
 
 	pci_add_flags(PCI_REASSIGN_ALL_RSRC);
@@ -4026,15 +4053,16 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
 	/* Reset IODA tables to a clean state */
 	rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET);
 	if (rc)
-		pr_warning("  OPAL Error %ld performing IODA table reset !\n", rc);
+		pr_warn("  OPAL Error %ld performing IODA table reset !\n", rc);
 
 	/*
 	 * If we're running in kdump kernel, the previous kernel never
 	 * shutdown PCI devices correctly. We already got IODA table
 	 * cleaned out. So we have to issue PHB reset to stop all PCI
-	 * transactions from previous kernel.
+	 * transactions from previous kernel. The ppc_pci_reset_phbs
+	 * kernel parameter will force this reset too.
 	 */
-	if (is_kdump_kernel()) {
+	if (is_kdump_kernel() || pci_reset_phbs) {
 		pr_info("  Issue PHB reset ...\n");
 		pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL);
 		pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE);
@@ -4052,8 +4080,26 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np)
 
 void __init pnv_pci_init_npu_phb(struct device_node *np)
 {
-	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU);
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_NVLINK);
+}
+
+void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np)
+{
+	pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI);
+}
+
+static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	struct pnv_phb *phb = hose->private_data;
+
+	if (!machine_is(powernv))
+		return;
+
+	if (phb->type == PNV_PHB_NPU_OCAPI)
+		dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE;
 }
+DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup);
 
 void __init pnv_pci_init_ioda_hub(struct device_node *np)
 {
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 5422f4a6317c..69d102cbf48f 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -1142,6 +1142,10 @@ void __init pnv_pci_init(void)
 	for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb")
 		pnv_pci_init_npu_phb(np);
 
+	/* Look for NPU2 OpenCAPI PHBs */
+	for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-opencapi-phb")
+		pnv_pci_init_npu2_opencapi_phb(np);
+
 	/* Configure IOMMU DMA hooks */
 	set_pci_dma_ops(&dma_iommu_ops);
 }
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index b772d7473896..eada4b6068cb 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -12,9 +12,10 @@ struct pci_dn;
 #define NV_NMMU_ATSD_REGS 8
 
 enum pnv_phb_type {
-	PNV_PHB_IODA1	= 0,
-	PNV_PHB_IODA2	= 1,
-	PNV_PHB_NPU	= 2,
+	PNV_PHB_IODA1		= 0,
+	PNV_PHB_IODA2		= 1,
+	PNV_PHB_NPU_NVLINK	= 2,
+	PNV_PHB_NPU_OCAPI	= 3,
 };
 
 /* Precise PHB model for error management */
@@ -227,6 +228,7 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
 extern void pnv_pci_init_npu_phb(struct device_node *np);
+extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index ba030669eca1..9664c8461f03 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -37,6 +37,8 @@
 #include <asm/kvm_ppc.h>
 #include <asm/ppc-opcode.h>
 #include <asm/cpuidle.h>
+#include <asm/kexec.h>
+#include <asm/reg.h>
 
 #include "powernv.h"
 
@@ -209,9 +211,32 @@ static void pnv_smp_cpu_kill_self(void)
 		} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
 			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 			asm volatile(PPC_MSGCLR(%0) : : "r" (msg));
+		} else if ((srr1 & wmask) == SRR1_WAKERESET) {
+			irq_set_pending_from_srr1(srr1);
+			/* Does not return */
 		}
+
 		smp_mb();
 
+		/*
+		 * For kdump kernels, we process the ipi and jump to
+		 * crash_ipi_callback
+		 */
+		if (kdump_in_progress()) {
+			/*
+			 * If we got to this point, we've not used
+			 * NMI's, otherwise we would have gone
+			 * via the SRR1_WAKERESET path. We are
+			 * using regular IPI's for waking up offline
+			 * threads.
+			 */
+			struct pt_regs regs;
+
+			ppc_save_regs(&regs);
+			crash_ipi_callback(&regs);
+			/* Does not return */
+		}
+
 		if (cpu_core_split_required())
 			continue;
 
@@ -371,5 +396,8 @@ void __init pnv_smp_init(void)
 
 #ifdef CONFIG_HOTPLUG_CPU
 	ppc_md.cpu_die	= pnv_smp_cpu_kill_self;
+#ifdef CONFIG_KEXEC_CORE
+	crash_wake_offline = 1;
+#endif
 #endif
 }