summaryrefslogtreecommitdiff
path: root/arch/powerpc/platforms/powernv
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig3
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c7
-rw-r--r--arch/powerpc/platforms/powernv/idle.c109
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c470
-rw-r--r--arch/powerpc/platforms/powernv/opal-lpc.c3
-rw-r--r--arch/powerpc/platforms/powernv/opal-sensor.c4
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S71
-rw-r--r--arch/powerpc/platforms/powernv/opal-xscom.c27
-rw-r--r--arch/powerpc/platforms/powernv/opal.c79
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c25
-rw-r--r--arch/powerpc/platforms/powernv/pci.c5
-rw-r--r--arch/powerpc/platforms/powernv/pci.h17
-rw-r--r--arch/powerpc/platforms/powernv/powernv.h2
-rw-r--r--arch/powerpc/platforms/powernv/rng.c2
-rw-r--r--arch/powerpc/platforms/powernv/setup.c19
-rw-r--r--arch/powerpc/platforms/powernv/smp.c107
16 files changed, 828 insertions, 122 deletions
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 3a07e4dcf97c..6a6f4ef46b9e 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -4,6 +4,7 @@ config PPC_POWERNV
select PPC_NATIVE
select PPC_XICS
select PPC_ICP_NATIVE
+ select PPC_XIVE_NATIVE
select PPC_P7_NAP
select PCI
select PCI_MSI
@@ -19,6 +20,8 @@ config PPC_POWERNV
select CPU_FREQ_GOV_ONDEMAND
select CPU_FREQ_GOV_CONSERVATIVE
select PPC_DOORBELL
+ select MMU_NOTIFIER
+ select FORCE_SMP
default y
config OPAL_PRD
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6fb5522acd70..d2f19821d71d 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1102,6 +1102,13 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
return -EIO;
}
+ /*
+ * If dealing with the root bus (or the bus underneath the
+ * root port), we reset the bus underneath the root port.
+ *
+ * The cxl driver depends on this behaviour for bi-modal card
+ * switching.
+ */
if (pci_is_root_bus(bus) ||
pci_is_root_bus(bus->parent))
return pnv_eeh_root_reset(hose, option);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 4ee837e6391a..445f30a2c5ef 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -53,19 +53,6 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t pir = get_hard_smp_processor_id(cpu);
uint64_t hsprg0_val = (uint64_t)&paca[cpu];
- if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
- /*
- * HSPRG0 is used to store the cpu's pointer to paca.
- * Hence last 3 bits are guaranteed to be 0. Program
- * slw to restore HSPRG0 with 63rd bit set, so that
- * when a thread wakes up at 0x100 we can use this bit
- * to distinguish between fastsleep and deep winkle.
- * This is not necessary with stop/psscr since PLS
- * field of psscr indicates which state we are waking
- * up from.
- */
- hsprg0_val |= 1;
- }
rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
if (rc != 0)
return rc;
@@ -122,9 +109,12 @@ static void pnv_alloc_idle_core_states(void)
for (i = 0; i < nr_cores; i++) {
int first_cpu = i * threads_per_core;
int node = cpu_to_node(first_cpu);
+ size_t paca_ptr_array_size;
core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
*core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
+ paca_ptr_array_size = (threads_per_core *
+ sizeof(struct paca_struct *));
for (j = 0; j < threads_per_core; j++) {
int cpu = first_cpu + j;
@@ -132,6 +122,11 @@ static void pnv_alloc_idle_core_states(void)
paca[cpu].core_idle_state_ptr = core_idle_state;
paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
paca[cpu].thread_mask = 1 << j;
+ if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
+ continue;
+ paca[cpu].thread_sibling_pacas =
+ kmalloc_node(paca_ptr_array_size,
+ GFP_KERNEL, node);
}
}
@@ -147,7 +142,6 @@ u32 pnv_get_supported_cpuidle_states(void)
}
EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
-
static void pnv_fastsleep_workaround_apply(void *info)
{
@@ -241,8 +235,9 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
* The default stop state that will be used by ppc_md.power_save
* function on platforms that support stop instruction.
*/
-u64 pnv_default_stop_val;
-u64 pnv_default_stop_mask;
+static u64 pnv_default_stop_val;
+static u64 pnv_default_stop_mask;
+static bool default_stop_found;
/*
* Used for ppc_md.power_save which needs a function with no parameters
@@ -262,8 +257,42 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
* psscr value and mask of the deepest stop idle state.
* Used when a cpu is offlined.
*/
-u64 pnv_deepest_stop_psscr_val;
-u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_psscr_val;
+static u64 pnv_deepest_stop_psscr_mask;
+static bool deepest_stop_found;
+
+/*
+ * pnv_cpu_offline: A function that puts the CPU into the deepest
+ * available platform idle state on a CPU-Offline.
+ */
+unsigned long pnv_cpu_offline(unsigned int cpu)
+{
+ unsigned long srr1;
+
+ u32 idle_states = pnv_get_supported_cpuidle_states();
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
+ srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
+ pnv_deepest_stop_psscr_mask);
+ } else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+ srr1 = power7_winkle();
+ } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
+ (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+ srr1 = power7_sleep();
+ } else if (idle_states & OPAL_PM_NAP_ENABLED) {
+ srr1 = power7_nap(1);
+ } else {
+ /* This is the fallback method. We emulate snooze */
+ while (!generic_check_cpu_restart(cpu)) {
+ HMT_low();
+ HMT_very_low();
+ }
+ srr1 = 0;
+ HMT_medium();
+ }
+
+ return srr1;
+}
/*
* Power ISA 3.0 idle initialization.
@@ -352,7 +381,6 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
u32 *residency_ns = NULL;
u64 max_residency_ns = 0;
int rc = 0, i;
- bool default_stop_found = false, deepest_stop_found = false;
psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
@@ -432,21 +460,24 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
}
}
- if (!default_stop_found) {
- pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL;
- pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK;
- pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+ if (unlikely(!default_stop_found)) {
+ pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
+ } else {
+ ppc_md.power_save = power9_idle;
+ pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
pnv_default_stop_val, pnv_default_stop_mask);
}
- if (!deepest_stop_found) {
- pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL;
- pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK;
- pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+ if (unlikely(!deepest_stop_found)) {
+ pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait");
+ } else {
+ pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n",
pnv_deepest_stop_psscr_val,
pnv_deepest_stop_psscr_mask);
}
+ pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
+ pnv_first_deep_stop_state);
out:
kfree(psscr_val);
kfree(psscr_mask);
@@ -524,10 +555,30 @@ static int __init pnv_init_idle_states(void)
pnv_alloc_idle_core_states();
+ /*
+ * For each CPU, record its PACA address in each of it's
+ * sibling thread's PACA at the slot corresponding to this
+ * CPU's index in the core.
+ */
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+ int cpu;
+
+ pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n");
+ for_each_possible_cpu(cpu) {
+ int base_cpu = cpu_first_thread_sibling(cpu);
+ int idx = cpu_thread_in_core(cpu);
+ int i;
+
+ for (i = 0; i < threads_per_core; i++) {
+ int j = base_cpu + i;
+
+ paca[j].thread_sibling_pacas[idx] = &paca[cpu];
+ }
+ }
+ }
+
if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
ppc_md.power_save = power7_idle;
- else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST)
- ppc_md.power_save = power9_idle;
out:
return 0;
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 1c383f38031d..067defeea691 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -9,11 +9,20 @@
* License as published by the Free Software Foundation.
*/
+#include <linux/slab.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
+#include <linux/of.h>
#include <linux/export.h>
#include <linux/pci.h>
#include <linux/memblock.h>
#include <linux/iommu.h>
+#include <asm/tlb.h>
+#include <asm/powernv.h>
+#include <asm/reg.h>
+#include <asm/opal.h>
+#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/pnv-pci.h>
#include <asm/msi_bitmap.h>
@@ -22,6 +31,8 @@
#include "powernv.h"
#include "pci.h"
+#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
+
/*
* Other types of TCE cache invalidation are not functional in the
* hardware.
@@ -37,6 +48,12 @@ struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
struct device_node *dn;
struct pci_dev *gpdev;
+ if (WARN_ON(!npdev))
+ return NULL;
+
+ if (WARN_ON(!npdev->dev.of_node))
+ return NULL;
+
/* Get assoicated PCI device */
dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
if (!dn)
@@ -55,6 +72,12 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
struct device_node *dn;
struct pci_dev *npdev;
+ if (WARN_ON(!gpdev))
+ return NULL;
+
+ if (WARN_ON(!gpdev->dev.of_node))
+ return NULL;
+
/* Get assoicated PCI device */
dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
if (!dn)
@@ -180,7 +203,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
return rc;
}
- pnv_pci_phb3_tce_invalidate_entire(phb, false);
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
/* Add the table to the list so its TCE cache will get invalidated */
pnv_pci_link_table_and_group(phb->hose->node, num,
@@ -204,7 +227,7 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
return rc;
}
- pnv_pci_phb3_tce_invalidate_entire(phb, false);
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
&npe->table_group);
@@ -270,7 +293,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
0 /* bypass base */, top);
if (rc == OPAL_SUCCESS)
- pnv_pci_phb3_tce_invalidate_entire(phb, false);
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
return rc;
}
@@ -334,7 +357,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
return;
}
- pnv_pci_phb3_tce_invalidate_entire(npe->phb, false);
+ pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
}
struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
@@ -359,3 +382,442 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
return gpe;
}
+
+/* Maximum number of nvlinks per npu */
+#define NV_MAX_LINKS 6
+
+/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
+static int max_npu2_index;
+
+struct npu_context {
+ struct mm_struct *mm;
+ struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
+ struct mmu_notifier mn;
+ struct kref kref;
+
+ /* Callback to stop translation requests on a given GPU */
+ struct npu_context *(*release_cb)(struct npu_context *, void *);
+
+ /*
+ * Private pointer passed to the above callback for usage by
+ * device drivers.
+ */
+ void *priv;
+};
+
+/*
+ * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
+ * if none are available.
+ */
+static int get_mmio_atsd_reg(struct npu *npu)
+{
+ int i;
+
+ for (i = 0; i < npu->mmio_atsd_count; i++) {
+ if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
+ return i;
+ }
+
+ return -ENOSPC;
+}
+
+static void put_mmio_atsd_reg(struct npu *npu, int reg)
+{
+ clear_bit(reg, &npu->mmio_atsd_usage);
+}
+
+/* MMIO ATSD register offsets */
+#define XTS_ATSD_AVA 1
+#define XTS_ATSD_STAT 2
+
+static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
+ unsigned long va)
+{
+ int mmio_atsd_reg;
+
+ do {
+ mmio_atsd_reg = get_mmio_atsd_reg(npu);
+ cpu_relax();
+ } while (mmio_atsd_reg < 0);
+
+ __raw_writeq(cpu_to_be64(va),
+ npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
+ eieio();
+ __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
+
+ return mmio_atsd_reg;
+}
+
+static int mmio_invalidate_pid(struct npu *npu, unsigned long pid)
+{
+ unsigned long launch;
+
+ /* IS set to invalidate matching PID */
+ launch = PPC_BIT(12);
+
+ /* PRS set to process-scoped */
+ launch |= PPC_BIT(13);
+
+ /* AP */
+ launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+ /* PID */
+ launch |= pid << PPC_BITLSHIFT(38);
+
+ /* Invalidating the entire process doesn't use a va */
+ return mmio_launch_invalidate(npu, launch, 0);
+}
+
+static int mmio_invalidate_va(struct npu *npu, unsigned long va,
+ unsigned long pid)
+{
+ unsigned long launch;
+
+ /* IS set to invalidate target VA */
+ launch = 0;
+
+ /* PRS set to process scoped */
+ launch |= PPC_BIT(13);
+
+ /* AP */
+ launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+ /* PID */
+ launch |= pid << PPC_BITLSHIFT(38);
+
+ return mmio_launch_invalidate(npu, launch, va);
+}
+
+#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
+
+/*
+ * Invalidate either a single address or an entire PID depending on
+ * the value of va.
+ */
+static void mmio_invalidate(struct npu_context *npu_context, int va,
+ unsigned long address)
+{
+ int i, j, reg;
+ struct npu *npu;
+ struct pnv_phb *nphb;
+ struct pci_dev *npdev;
+ struct {
+ struct npu *npu;
+ int reg;
+ } mmio_atsd_reg[NV_MAX_NPUS];
+ unsigned long pid = npu_context->mm->context.id;
+
+ /*
+ * Loop over all the NPUs this process is active on and launch
+ * an invalidate.
+ */
+ for (i = 0; i <= max_npu2_index; i++) {
+ mmio_atsd_reg[i].reg = -1;
+ for (j = 0; j < NV_MAX_LINKS; j++) {
+ npdev = npu_context->npdev[i][j];
+ if (!npdev)
+ continue;
+
+ nphb = pci_bus_to_host(npdev->bus)->private_data;
+ npu = &nphb->npu;
+ mmio_atsd_reg[i].npu = npu;
+
+ if (va)
+ mmio_atsd_reg[i].reg =
+ mmio_invalidate_va(npu, address, pid);
+ else
+ mmio_atsd_reg[i].reg =
+ mmio_invalidate_pid(npu, pid);
+
+ /*
+ * The NPU hardware forwards the shootdown to all GPUs
+ * so we only have to launch one shootdown per NPU.
+ */
+ break;
+ }
+ }
+
+ /*
+ * Unfortunately the nest mmu does not support flushing specific
+ * addresses so we have to flush the whole mm.
+ */
+ flush_tlb_mm(npu_context->mm);
+
+ /* Wait for all invalidations to complete */
+ for (i = 0; i <= max_npu2_index; i++) {
+ if (mmio_atsd_reg[i].reg < 0)
+ continue;
+
+ /* Wait for completion */
+ npu = mmio_atsd_reg[i].npu;
+ reg = mmio_atsd_reg[i].reg;
+ while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
+ cpu_relax();
+ put_mmio_atsd_reg(npu, reg);
+ }
+}
+
+static void pnv_npu2_mn_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+
+ /* Call into device driver to stop requests to the NMMU */
+ if (npu_context->release_cb)
+ npu_context->release_cb(npu_context, npu_context->priv);
+
+ /*
+ * There should be no more translation requests for this PID, but we
+ * need to ensure any entries for it are removed from the TLB.
+ */
+ mmio_invalidate(npu_context, 0, 0);
+}
+
+static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+
+ mmio_invalidate(npu_context, 1, address);
+}
+
+static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+
+ mmio_invalidate(npu_context, 1, address);
+}
+
+static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+ unsigned long address;
+
+ for (address = start; address <= end; address += PAGE_SIZE)
+ mmio_invalidate(npu_context, 1, address);
+}
+
+static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
+ .release = pnv_npu2_mn_release,
+ .change_pte = pnv_npu2_mn_change_pte,
+ .invalidate_page = pnv_npu2_mn_invalidate_page,
+ .invalidate_range = pnv_npu2_mn_invalidate_range,
+};
+
+/*
+ * Call into OPAL to setup the nmmu context for the current task in
+ * the NPU. This must be called to setup the context tables before the
+ * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
+ *
+ * A release callback should be registered to allow a device driver to
+ * be notified that it should not launch any new translation requests
+ * as the final TLB invalidate is about to occur.
+ *
+ * Returns an error if there no contexts are currently available or a
+ * npu_context which should be passed to pnv_npu2_handle_fault().
+ *
+ * mmap_sem must be held in write mode.
+ */
+struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+ unsigned long flags,
+ struct npu_context *(*cb)(struct npu_context *, void *),
+ void *priv)
+{
+ int rc;
+ u32 nvlink_index;
+ struct device_node *nvlink_dn;
+ struct mm_struct *mm = current->mm;
+ struct pnv_phb *nphb;
+ struct npu *npu;
+ struct npu_context *npu_context;
+
+ /*
+ * At present we don't support GPUs connected to multiple NPUs and I'm
+ * not sure the hardware does either.
+ */
+ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return ERR_PTR(-ENODEV);
+
+ if (!npdev)
+ /* No nvlink associated with this GPU device */
+ return ERR_PTR(-ENODEV);
+
+ if (!mm) {
+ /* kernel thread contexts are not supported */
+ return ERR_PTR(-EINVAL);
+ }
+
+ nphb = pci_bus_to_host(npdev->bus)->private_data;
+ npu = &nphb->npu;
+
+ /*
+ * Setup the NPU context table for a particular GPU. These need to be
+ * per-GPU as we need the tables to filter ATSDs when there are no
+ * active contexts on a particular GPU.
+ */
+ rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
+ PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+ if (rc < 0)
+ return ERR_PTR(-ENOSPC);
+
+ /*
+ * We store the npu pci device so we can more easily get at the
+ * associated npus.
+ */
+ npu_context = mm->context.npu_context;
+ if (!npu_context) {
+ npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
+ if (!npu_context)
+ return ERR_PTR(-ENOMEM);
+
+ mm->context.npu_context = npu_context;
+ npu_context->mm = mm;
+ npu_context->mn.ops = &nv_nmmu_notifier_ops;
+ __mmu_notifier_register(&npu_context->mn, mm);
+ kref_init(&npu_context->kref);
+ } else {
+ kref_get(&npu_context->kref);
+ }
+
+ npu_context->release_cb = cb;
+ npu_context->priv = priv;
+ nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+ if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+ &nvlink_index)))
+ return ERR_PTR(-ENODEV);
+ npu_context->npdev[npu->index][nvlink_index] = npdev;
+
+ return npu_context;
+}
+EXPORT_SYMBOL(pnv_npu2_init_context);
+
+static void pnv_npu2_release_context(struct kref *kref)
+{
+ struct npu_context *npu_context =
+ container_of(kref, struct npu_context, kref);
+
+ npu_context->mm->context.npu_context = NULL;
+ mmu_notifier_unregister(&npu_context->mn,
+ npu_context->mm);
+
+ kfree(npu_context);
+}
+
+void pnv_npu2_destroy_context(struct npu_context *npu_context,
+ struct pci_dev *gpdev)
+{
+ struct pnv_phb *nphb, *phb;
+ struct npu *npu;
+ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+ struct device_node *nvlink_dn;
+ u32 nvlink_index;
+
+ if (WARN_ON(!npdev))
+ return;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return;
+
+ nphb = pci_bus_to_host(npdev->bus)->private_data;
+ npu = &nphb->npu;
+ phb = pci_bus_to_host(gpdev->bus)->private_data;
+ nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+ if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+ &nvlink_index)))
+ return;
+ npu_context->npdev[npu->index][nvlink_index] = NULL;
+ opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id,
+ PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+ kref_put(&npu_context->kref, pnv_npu2_release_context);
+}
+EXPORT_SYMBOL(pnv_npu2_destroy_context);
+
+/*
+ * Assumes mmap_sem is held for the contexts associated mm.
+ */
+int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
+ unsigned long *flags, unsigned long *status, int count)
+{
+ u64 rc = 0, result = 0;
+ int i, is_write;
+ struct page *page[1];
+
+ /* mmap_sem should be held so the struct_mm must be present */
+ struct mm_struct *mm = context->mm;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return -ENODEV;
+
+ WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ for (i = 0; i < count; i++) {
+ is_write = flags[i] & NPU2_WRITE;
+ rc = get_user_pages_remote(NULL, mm, ea[i], 1,
+ is_write ? FOLL_WRITE : 0,
+ page, NULL, NULL);
+
+ /*
+ * To support virtualised environments we will have to do an
+ * access to the page to ensure it gets faulted into the
+ * hypervisor. For the moment virtualisation is not supported in
+ * other areas so leave the access out.
+ */
+ if (rc != 1) {
+ status[i] = rc;
+ result = -EFAULT;
+ continue;
+ }
+
+ status[i] = 0;
+ put_page(page[0]);
+ }
+
+ return result;
+}
+EXPORT_SYMBOL(pnv_npu2_handle_fault);
+
+int pnv_npu2_init(struct pnv_phb *phb)
+{
+ unsigned int i;
+ u64 mmio_atsd;
+ struct device_node *dn;
+ struct pci_dev *gpdev;
+ static int npu_index;
+ uint64_t rc = 0;
+
+ for_each_child_of_node(phb->hose->dn, dn) {
+ gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
+ if (gpdev) {
+ rc = opal_npu_map_lpar(phb->opal_id,
+ PCI_DEVID(gpdev->bus->number, gpdev->devfn),
+ 0, 0);
+ if (rc)
+ dev_err(&gpdev->dev,
+ "Error %lld mapping device to LPAR\n",
+ rc);
+ }
+ }
+
+ for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
+ i, &mmio_atsd); i++)
+ phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
+
+ pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
+ phb->npu.mmio_atsd_count = i;
+ phb->npu.mmio_atsd_usage = 0;
+ npu_index++;
+ if (WARN_ON(npu_index >= NV_MAX_NPUS))
+ return -ENOSPC;
+ max_npu2_index = npu_index;
+ phb->npu.index = npu_index;
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index a91d7876fae2..6c7ad1d8b32e 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -12,7 +12,6 @@
#include <linux/kernel.h>
#include <linux/of.h>
#include <linux/bug.h>
-#include <linux/debugfs.h>
#include <linux/io.h>
#include <linux/slab.h>
@@ -21,7 +20,7 @@
#include <asm/opal.h>
#include <asm/prom.h>
#include <linux/uaccess.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
#include <asm/isa-bridge.h>
static int opal_lpc_chip_id = -1;
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index 308efd170c27..aa267f120033 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -64,6 +64,10 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
*sensor_data = be32_to_cpu(data);
break;
+ case OPAL_WRONG_STATE:
+ ret = -EIO;
+ break;
+
default:
ret = opal_error_code(ret);
break;
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index da8a0f7a035c..f620572f891f 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1); \
#define OPAL_BRANCH(LABEL)
#endif
-/* TODO:
- *
- * - Trace irqs in/off (needs saving/restoring all args, argh...)
- * - Get r11 feed up by Dave so I can have better register usage
+/*
+ * DO_OPAL_CALL assumes:
+ * r0 = opal call token
+ * r12 = msr
+ * LR has been saved
*/
-
-#define OPAL_CALL(name, token) \
- _GLOBAL_TOC(name); \
- mfmsr r12; \
- mflr r0; \
- andi. r11,r12,MSR_IR|MSR_DR; \
- std r0,PPC_LR_STKOFF(r1); \
- li r0,token; \
- beq opal_real_call; \
- OPAL_BRANCH(opal_tracepoint_entry) \
+#define DO_OPAL_CALL() \
mfcr r11; \
stw r11,8(r1); \
li r11,0; \
@@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1); \
mtspr SPRN_HSRR0,r12; \
hrfid
+#define OPAL_CALL(name, token) \
+ _GLOBAL_TOC(name); \
+ mfmsr r12; \
+ mflr r0; \
+ andi. r11,r12,MSR_IR|MSR_DR; \
+ std r0,PPC_LR_STKOFF(r1); \
+ li r0,token; \
+ beq opal_real_call; \
+ OPAL_BRANCH(opal_tracepoint_entry) \
+ DO_OPAL_CALL()
+
+
opal_return:
/*
* Fixup endian on OPAL return... we should be able to simplify
@@ -148,26 +152,13 @@ opal_tracepoint_entry:
ld r8,STK_REG(R29)(r1)
ld r9,STK_REG(R30)(r1)
ld r10,STK_REG(R31)(r1)
+
+ /* setup LR so we return via tracepoint_return */
LOAD_REG_ADDR(r11,opal_tracepoint_return)
- mfcr r12
std r11,16(r1)
- stw r12,8(r1)
- li r11,0
+
mfmsr r12
- ori r11,r11,MSR_EE
- std r12,PACASAVEDMSR(r13)
- andc r12,r12,r11
- mtmsrd r12,1
- LOAD_REG_ADDR(r11,opal_return)
- mtlr r11
- li r11,MSR_DR|MSR_IR|MSR_LE
- andc r12,r12,r11
- mtspr SPRN_HSRR1,r12
- LOAD_REG_ADDR(r11,opal)
- ld r12,8(r11)
- ld r2,0(r11)
- mtspr SPRN_HSRR0,r12
- hrfid
+ DO_OPAL_CALL()
opal_tracepoint_return:
std r3,STK_REG(R31)(r1)
@@ -301,3 +292,21 @@ OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
+OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
+OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
+OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
+OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
+OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
+OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
+OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
+OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
+OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
+OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
+OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
+OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
+OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
+OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index d0ac535cf5d7..28651fb25417 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -73,25 +73,32 @@ static int opal_xscom_err_xlate(int64_t rc)
static u64 opal_scom_unmangle(u64 addr)
{
+ u64 tmp;
+
/*
- * XSCOM indirect addresses have the top bit set. Additionally
- * the rest of the top 3 nibbles is always 0.
+ * XSCOM addresses use the top nibble to set indirect mode and
+ * its form. Bits 4-11 are always 0.
*
* Because the debugfs interface uses signed offsets and shifts
* the address left by 3, we basically cannot use the top 4 bits
* of the 64-bit address, and thus cannot use the indirect bit.
*
- * To deal with that, we support the indirect bit being in bit
- * 4 (IBM notation) instead of bit 0 in this API, we do the
- * conversion here. To leave room for further xscom address
- * expansion, we only clear out the top byte
+ * To deal with that, we support the indirect bits being in
+ * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we
+ * do the conversion here.
*
- * For in-kernel use, we also support the real indirect bit, so
- * we test for any of the top 5 bits
+ * For in-kernel use, we don't need to do this mangling. In
+ * kernel won't have bits 4-7 set.
*
+ * So:
+ * debugfs will always set 0-3 = 0 and clear 4-7
+ * kernel will always clear 0-3 = 0 and set 4-7
*/
- if (addr & (0x1full << 59))
- addr = (addr & ~(0xffull << 56)) | (1ull << 63);
+ tmp = addr;
+ tmp &= 0x0f00000000000000;
+ addr &= 0xf0ffffffffffffff;
+ addr |= tmp << 4;
+
return addr;
}
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index e0f856bfbfe8..7925a9d72cca 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -435,7 +435,7 @@ int opal_machine_check(struct pt_regs *regs)
evt.version);
return 0;
}
- machine_check_print_event_info(&evt);
+ machine_check_print_event_info(&evt, user_mode(regs));
if (opal_recover_mce(regs, &evt))
return 1;
@@ -595,6 +595,80 @@ static void opal_export_symmap(void)
pr_warn("Error %d creating OPAL symbols file\n", rc);
}
+static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *buf,
+ loff_t off, size_t count)
+{
+ return memory_read_from_buffer(buf, count, &off, bin_attr->private,
+ bin_attr->size);
+}
+
+/*
+ * opal_export_attrs: creates a sysfs node for each property listed in
+ * the device-tree under /ibm,opal/firmware/exports/
+ * All new sysfs nodes are created under /opal/exports/.
+ * This allows for reserved memory regions (e.g. HDAT) to be read.
+ * The new sysfs nodes are only readable by root.
+ */
+static void opal_export_attrs(void)
+{
+ struct bin_attribute *attr;
+ struct device_node *np;
+ struct property *prop;
+ struct kobject *kobj;
+ u64 vals[2];
+ int rc;
+
+ np = of_find_node_by_path("/ibm,opal/firmware/exports");
+ if (!np)
+ return;
+
+ /* Create new 'exports' directory - /sys/firmware/opal/exports */
+ kobj = kobject_create_and_add("exports", opal_kobj);
+ if (!kobj) {
+ pr_warn("kobject_create_and_add() of exports failed\n");
+ return;
+ }
+
+ for_each_property_of_node(np, prop) {
+ if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle"))
+ continue;
+
+ if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
+ continue;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+
+ if (attr == NULL) {
+ pr_warn("Failed kmalloc for bin_attribute!");
+ continue;
+ }
+
+ sysfs_bin_attr_init(attr);
+ attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
+ attr->attr.mode = 0400;
+ attr->read = export_attr_read;
+ attr->private = __va(vals[0]);
+ attr->size = vals[1];
+
+ if (attr->attr.name == NULL) {
+ pr_warn("Failed kstrdup for bin_attribute attr.name");
+ kfree(attr);
+ continue;
+ }
+
+ rc = sysfs_create_bin_file(kobj, attr);
+ if (rc) {
+ pr_warn("Error %d creating OPAL sysfs exports/%s file\n",
+ rc, prop->name);
+ kfree(attr->attr.name);
+ kfree(attr);
+ }
+ }
+
+ of_node_put(np);
+}
+
static void __init opal_dump_region_init(void)
{
void *addr;
@@ -733,6 +807,9 @@ static int __init opal_init(void)
opal_msglog_sysfs_init();
}
+ /* Export all properties */
+ opal_export_attrs();
+
/* Initialize platform devices: IPMI backend, PRD & flash interface */
opal_pdev_init("ibm,opal-ipmi");
opal_pdev_init("ibm,opal-flash");
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index ee4cdb5b893f..6fdbd383f676 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -14,7 +14,6 @@
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/crash_dump.h>
-#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/string.h>
#include <linux/init.h>
@@ -38,7 +37,7 @@
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/xics.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>
@@ -1262,6 +1261,8 @@ static void pnv_pci_ioda_setup_PEs(void)
/* PE#0 is needed for error reporting */
pnv_ioda_reserve_pe(phb, 0);
pnv_ioda_setup_npu_PEs(hose->bus);
+ if (phb->model == PNV_PHB_MODEL_NPU2)
+ pnv_npu2_init(phb);
}
}
}
@@ -1894,7 +1895,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
#define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1)
#define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2)
-void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
{
__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
@@ -1990,6 +1991,14 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
}
}
+void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+ if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
+ pnv_pci_phb3_tce_invalidate_entire(phb, rm);
+ else
+ opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
+}
+
static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
long npages, unsigned long uaddr,
enum dma_data_direction direction,
@@ -2150,6 +2159,9 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
found:
tbl = pnv_pci_table_alloc(phb->hose->node);
+ if (WARN_ON(!tbl))
+ return;
+
iommu_register_group(&pe->table_group, phb->hose->global_number,
pe->pe_number);
pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
@@ -2436,7 +2448,8 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
tce_table_size /= direct_table_size;
tce_table_size <<= 3;
- tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
+ tce_table_size = max_t(unsigned long,
+ tce_table_size, direct_table_size);
}
return bytes;
@@ -2757,9 +2770,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
if (rc)
return;
- if (pe->flags & PNV_IODA_PE_DEV)
- iommu_add_device(&pe->pdev->dev);
- else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+ if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
}
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 204a829ff506..935ccb249a8a 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -758,7 +758,7 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
{
- return *(pnv_tce(tbl, index - tbl->it_offset));
+ return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset)));
}
struct iommu_table *pnv_pci_table_alloc(int nid)
@@ -766,6 +766,9 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
struct iommu_table *tbl;
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
+ if (!tbl)
+ return NULL;
+
INIT_LIST_HEAD_RCU(&tbl->it_group_list);
kref_init(&tbl->it_kref);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index e1d3e5526b54..18c8a2fa03b8 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -7,6 +7,9 @@
struct pci_dn;
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+
enum pnv_phb_type {
PNV_PHB_IODA1 = 0,
PNV_PHB_IODA2 = 1,
@@ -174,6 +177,16 @@ struct pnv_phb {
struct OpalIoP7IOCErrorData hub_diag;
} diag;
+ /* Nvlink2 data */
+ struct npu {
+ int index;
+ __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
+ unsigned int mmio_atsd_count;
+
+ /* Bitmask for MMIO register usage */
+ unsigned long mmio_atsd_usage;
+ } npu;
+
#ifdef CONFIG_CXL_BASE
struct cxl_afu *cxl_afu;
#endif
@@ -229,14 +242,14 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
/* Nvlink functions */
extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
-extern void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
struct iommu_table *tbl);
extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
-
+extern int pnv_npu2_init(struct pnv_phb *phb);
/* cxl functions */
extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 613052232475..6dbc0a1da1f6 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -18,8 +18,6 @@ static inline void pnv_pci_shutdown(void) { }
#endif
extern u32 pnv_get_supported_cpuidle_states(void);
-extern u64 pnv_deepest_stop_psscr_val;
-extern u64 pnv_deepest_stop_psscr_mask;
extern void pnv_lpc_init(void);
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 5dcbdea1afac..1a9d84371a4d 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -62,7 +62,7 @@ int powernv_get_random_real_mode(unsigned long *v)
rng = raw_cpu_read(powernv_rng);
- *v = rng_whiten(rng, in_rm64(rng->regs_real));
+ *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
return 1;
}
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index d50c7d99baaf..2dc7e5fb86c3 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -32,6 +32,7 @@
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/xics.h>
+#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/kexec.h>
#include <asm/smp.h>
@@ -76,7 +77,9 @@ static void __init pnv_init(void)
static void __init pnv_init_IRQ(void)
{
- xics_init();
+ /* Try using a XIVE if available, otherwise use a XICS */
+ if (!xive_native_init())
+ xics_init();
WARN_ON(!ppc_md.get_irq);
}
@@ -95,6 +98,10 @@ static void pnv_show_cpuinfo(struct seq_file *m)
else
seq_printf(m, "firmware\t: BML\n");
of_node_put(root);
+ if (radix_enabled())
+ seq_printf(m, "MMU\t\t: Radix\n");
+ else
+ seq_printf(m, "MMU\t\t: Hash\n");
}
static void pnv_prepare_going_down(void)
@@ -218,10 +225,12 @@ static void pnv_kexec_wait_secondaries_down(void)
static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
{
- xics_kexec_teardown_cpu(secondary);
+ if (xive_enabled())
+ xive_kexec_teardown_cpu(secondary);
+ else
+ xics_kexec_teardown_cpu(secondary);
/* On OPAL, we return all CPUs to firmware */
-
if (!firmware_has_feature(FW_FEATURE_OPAL))
return;
@@ -237,6 +246,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
/* Primary waits for the secondaries to have reached OPAL */
pnv_kexec_wait_secondaries_down();
+ /* Switch XIVE back to emulation mode */
+ if (xive_enabled())
+ xive_shutdown();
+
/*
* We might be running as little-endian - now that interrupts
* are disabled, reset the HILE bit to big-endian so we don't
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 8b67e1eefb5c..4aff754b6f2c 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -29,12 +29,14 @@
#include <asm/vdso_datapage.h>
#include <asm/cputhreads.h>
#include <asm/xics.h>
+#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/runlatch.h>
#include <asm/code-patching.h>
#include <asm/dbell.h>
#include <asm/kvm_ppc.h>
#include <asm/ppc-opcode.h>
+#include <asm/cpuidle.h>
#include "powernv.h"
@@ -47,13 +49,10 @@
static void pnv_smp_setup_cpu(int cpu)
{
- if (cpu != boot_cpuid)
+ if (xive_enabled())
+ xive_smp_setup_cpu();
+ else if (cpu != boot_cpuid)
xics_setup_cpu();
-
-#ifdef CONFIG_PPC_DOORBELL
- if (cpu_has_feature(CPU_FTR_DBELL))
- doorbell_setup_this_cpu();
-#endif
}
static int pnv_smp_kick_cpu(int nr)
@@ -132,7 +131,10 @@ static int pnv_smp_cpu_disable(void)
vdso_data->processorCount--;
if (cpu == boot_cpuid)
boot_cpuid = cpumask_any(cpu_online_mask);
- xics_migrate_irqs_away();
+ if (xive_enabled())
+ xive_smp_disable_cpu();
+ else
+ xics_migrate_irqs_away();
return 0;
}
@@ -140,7 +142,6 @@ static void pnv_smp_cpu_kill_self(void)
{
unsigned int cpu;
unsigned long srr1, wmask;
- u32 idle_states;
/* Standard hot unplug procedure */
local_irq_disable();
@@ -155,8 +156,6 @@ static void pnv_smp_cpu_kill_self(void)
if (cpu_has_feature(CPU_FTR_ARCH_207S))
wmask = SRR1_WAKEMASK_P8;
- idle_states = pnv_get_supported_cpuidle_states();
-
/* We don't want to take decrementer interrupts while we are offline,
* so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9)
* enabled as to let IPIs in.
@@ -184,19 +183,7 @@ static void pnv_smp_cpu_kill_self(void)
kvmppc_set_host_ipi(cpu, 0);
ppc64_runlatch_off();
-
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
- pnv_deepest_stop_psscr_mask);
- } else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
- srr1 = power7_winkle();
- } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
- (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
- srr1 = power7_sleep();
- } else {
- srr1 = power7_nap(1);
- }
-
+ srr1 = pnv_cpu_offline(cpu);
ppc64_runlatch_on();
/*
@@ -213,9 +200,12 @@ static void pnv_smp_cpu_kill_self(void)
if (((srr1 & wmask) == SRR1_WAKEEE) ||
((srr1 & wmask) == SRR1_WAKEHVI) ||
(local_paca->irq_happened & PACA_IRQ_EE)) {
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- icp_opal_flush_interrupt();
- else
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (xive_enabled())
+ xive_flush_interrupt();
+ else
+ icp_opal_flush_interrupt();
+ } else
icp_native_flush_interrupt();
} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
@@ -252,10 +242,69 @@ static int pnv_cpu_bootable(unsigned int nr)
return smp_generic_cpu_bootable(nr);
}
+static int pnv_smp_prepare_cpu(int cpu)
+{
+ if (xive_enabled())
+ return xive_smp_prepare_cpu(cpu);
+ return 0;
+}
+
+/* Cause IPI as setup by the interrupt controller (xics or xive) */
+static void (*ic_cause_ipi)(int cpu);
+
+static void pnv_cause_ipi(int cpu)
+{
+ if (doorbell_try_core_ipi(cpu))
+ return;
+
+ ic_cause_ipi(cpu);
+}
+
+static void pnv_p9_dd1_cause_ipi(int cpu)
+{
+ int this_cpu = get_cpu();
+
+ /*
+ * POWER9 DD1 has a global addressed msgsnd, but for now we restrict
+ * IPIs to same core, because it requires additional synchronization
+ * for inter-core doorbells which we do not implement.
+ */
+ if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu)))
+ doorbell_global_ipi(cpu);
+ else
+ ic_cause_ipi(cpu);
+
+ put_cpu();
+}
+
+static void __init pnv_smp_probe(void)
+{
+ if (xive_enabled())
+ xive_smp_probe();
+ else
+ xics_smp_probe();
+
+ if (cpu_has_feature(CPU_FTR_DBELL)) {
+ ic_cause_ipi = smp_ops->cause_ipi;
+ WARN_ON(!ic_cause_ipi);
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+ smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi;
+ else
+ smp_ops->cause_ipi = doorbell_global_ipi;
+ } else {
+ smp_ops->cause_ipi = pnv_cause_ipi;
+ }
+ }
+}
+
static struct smp_ops_t pnv_smp_ops = {
- .message_pass = smp_muxed_ipi_message_pass,
- .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */
- .probe = xics_smp_probe,
+ .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
+ .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */
+ .cause_nmi_ipi = NULL,
+ .probe = pnv_smp_probe,
+ .prepare_cpu = pnv_smp_prepare_cpu,
.kick_cpu = pnv_smp_kick_cpu,
.setup_cpu = pnv_smp_setup_cpu,
.cpu_bootable = pnv_cpu_bootable,