Diffstat (limited to 'arch/powerpc/platforms/powernv/pci-ioda.c')
| Mode | File | Lines changed |
| -rw-r--r-- | arch/powerpc/platforms/powernv/pci-ioda.c | 2801 |
1 file changed, 748 insertions, 2053 deletions
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b900eb1d5e17..b0c1d9d16fb5 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1,12 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Support PCI/PCIe on PowerNV platforms * * Copyright 2011 Benjamin Herrenschmidt, IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. */ #undef DEBUG @@ -17,18 +13,20 @@ #include <linux/delay.h> #include <linux/string.h> #include <linux/init.h> -#include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/irq.h> +#include <linux/irqchip/irq-msi-lib.h> #include <linux/io.h> #include <linux/msi.h> -#include <linux/memblock.h> #include <linux/iommu.h> #include <linux/rculist.h> #include <linux/sizes.h> +#include <linux/debugfs.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> #include <asm/sections.h> #include <asm/io.h> -#include <asm/prom.h> #include <asm/pci-bridge.h> #include <asm/machdep.h> #include <asm/msi_bitmap.h> @@ -37,25 +35,19 @@ #include <asm/iommu.h> #include <asm/tce.h> #include <asm/xics.h> -#include <asm/debugfs.h> #include <asm/firmware.h> #include <asm/pnv-pci.h> #include <asm/mmzone.h> -#include <misc/cxl-base.h> - #include "powernv.h" #include "pci.h" +#include "../../../../drivers/pci/pci.h" -#define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ -#define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ -#define PNV_IODA1_DMA32_SEGSIZE 0x10000000 - -#define POWERNV_IOMMU_DEFAULT_LEVELS 1 -#define POWERNV_IOMMU_MAX_LEVELS 5 +/* This array is indexed with enum pnv_phb_type */ +static const char * const pnv_phb_names[] = { "IODA2", "NPU_OCAPI" }; -static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU" }; -static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); +static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); +static void pnv_pci_configure_bus(struct pci_bus *bus); void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, const char *fmt, ...) @@ -70,7 +62,7 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, vaf.va = &args; if (pe->flags & PNV_IODA_PE_DEV) - strlcpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); + strscpy(pfix, dev_name(&pe->pdev->dev), sizeof(pfix)); else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) sprintf(pfix, "%04x:%02x ", pci_domain_nr(pe->pbus), pe->pbus->number); @@ -89,6 +81,7 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, } static bool pnv_iommu_bypass_disabled __read_mostly; +static bool pci_reset_phbs __read_mostly; static int __init iommu_setup(char *str) { @@ -110,25 +103,13 @@ static int __init iommu_setup(char *str) } early_param("iommu", iommu_setup); -static inline bool pnv_pci_is_m64(struct pnv_phb *phb, struct resource *r) +static int __init pci_reset_phbs_setup(char *str) { - /* - * WARNING: We cannot rely on the resource flags. The Linux PCI - * allocation code sometimes decides to put a 64-bit prefetchable - * BAR in the 32-bit window, so we have to compare the addresses. - * - * For simplicity we only test resource start. 
- */ - return (r->start >= phb->ioda.m64_base && - r->start < (phb->ioda.m64_base + phb->ioda.m64_size)); + pci_reset_phbs = true; + return 0; } -static inline bool pnv_pci_is_m64_flags(unsigned long resource_flags) -{ - unsigned long flags = (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH); - - return (resource_flags & flags) == flags; -} +early_param("ppc_pci_reset_phbs", pci_reset_phbs_setup); static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) { @@ -136,6 +117,7 @@ static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) phb->ioda.pe_array[pe_no].phb = phb; phb->ioda.pe_array[pe_no].pe_number = pe_no; + phb->ioda.pe_array[pe_no].dma_setup_done = false; /* * Clear the PE frozen state as it might be put into frozen state @@ -159,34 +141,58 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) return; } + mutex_lock(&phb->ioda.pe_alloc_mutex); if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) pr_debug("%s: PE %x was reserved on PHB#%x\n", __func__, pe_no, phb->hose->global_number); + mutex_unlock(&phb->ioda.pe_alloc_mutex); pnv_ioda_init_pe(phb, pe_no); } -static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count) { - long pe; + struct pnv_ioda_pe *ret = NULL; + int run = 0, pe, i; + mutex_lock(&phb->ioda.pe_alloc_mutex); + + /* scan backwards for a run of @count cleared bits */ for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) { - if (!test_and_set_bit(pe, phb->ioda.pe_alloc)) - return pnv_ioda_init_pe(phb, pe); + if (test_bit(pe, phb->ioda.pe_alloc)) { + run = 0; + continue; + } + + run++; + if (run == count) + break; + } + if (run != count) + goto out; + + for (i = pe; i < pe + count; i++) { + set_bit(i, phb->ioda.pe_alloc); + pnv_ioda_init_pe(phb, i); } + ret = &phb->ioda.pe_array[pe]; - return NULL; +out: + mutex_unlock(&phb->ioda.pe_alloc_mutex); + return ret; } -static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) +void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) { struct pnv_phb *phb = pe->phb; unsigned int pe_num = pe->pe_number; WARN_ON(pe->pdev); - memset(pe, 0, sizeof(struct pnv_ioda_pe)); + + mutex_lock(&phb->ioda.pe_alloc_mutex); clear_bit(pe_num, phb->ioda.pe_alloc); + mutex_unlock(&phb->ioda.pe_alloc_mutex); } /* The default M64 BAR is shared by all PEs */ @@ -246,8 +252,7 @@ fail: static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, unsigned long *pe_bitmap) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct resource *r; resource_size_t base, sgsz, start, end; int segno, i; @@ -259,8 +264,8 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, if (!r->parent || !pnv_pci_is_m64(phb, r)) continue; - start = _ALIGN_DOWN(r->start - base, sgsz); - end = _ALIGN_UP(r->end - base, sgsz); + start = ALIGN_DOWN(r->start - base, sgsz); + end = ALIGN(r->end - base, sgsz); for (segno = start / sgsz; segno < end / sgsz; segno++) { if (pe_bitmap) set_bit(segno, pe_bitmap); @@ -270,64 +275,6 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, } } -static int pnv_ioda1_init_m64(struct pnv_phb *phb) -{ - struct resource *r; - int index; - - /* - * There are 16 M64 BARs, each of which has 8 segments. So - * there are as many M64 segments as the maximum number of - * PEs, which is 128. 
- */ - for (index = 0; index < PNV_IODA1_M64_NUM; index++) { - unsigned long base, segsz = phb->ioda.m64_segsize; - int64_t rc; - - base = phb->ioda.m64_base + - index * PNV_IODA1_M64_SEGS * segsz; - rc = opal_pci_set_phb_mem_window(phb->opal_id, - OPAL_M64_WINDOW_TYPE, index, base, 0, - PNV_IODA1_M64_SEGS * segsz); - if (rc != OPAL_SUCCESS) { - pr_warn(" Error %lld setting M64 PHB#%x-BAR#%d\n", - rc, phb->hose->global_number, index); - goto fail; - } - - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, index, - OPAL_ENABLE_M64_SPLIT); - if (rc != OPAL_SUCCESS) { - pr_warn(" Error %lld enabling M64 PHB#%x-BAR#%d\n", - rc, phb->hose->global_number, index); - goto fail; - } - } - - /* - * Exclude the segments for reserved and root bus PE, which - * are first or last two PEs. - */ - r = &phb->hose->mem_resources[1]; - if (phb->ioda.reserved_pe_idx == 0) - r->start += (2 * phb->ioda.m64_segsize); - else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) - r->end -= (2 * phb->ioda.m64_segsize); - else - WARN(1, "Wrong reserved PE#%x on PHB#%x\n", - phb->ioda.reserved_pe_idx, phb->hose->global_number); - - return 0; - -fail: - for ( ; index >= 0; index--) - opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64); - - return -EIO; -} - static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, unsigned long *pe_bitmap, bool all) @@ -345,8 +292,7 @@ static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *master_pe, *pe; unsigned long size, *pe_alloc; int i; @@ -356,7 +302,7 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) return NULL; /* Allocate bitmap */ - size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); + size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); pe_alloc = kzalloc(size, GFP_KERNEL); if (!pe_alloc) { pr_warn("%s: Out of memory !\n", @@ -397,26 +343,6 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) pe->master = master_pe; list_add_tail(&pe->list, &master_pe->slaves); } - - /* - * P7IOC supports M64DT, which helps mapping M64 segment - * to one particular PE#. However, PHB3 has fixed mapping - * between M64 segment and PE#. In order to have same logic - * for P7IOC and PHB3, we enforce fixed mapping between M64 - * segment and PE# on P7IOC. 
- */ - if (phb->type == PNV_PHB_IODA1) { - int64_t rc; - - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe->pe_number, OPAL_M64_WINDOW_TYPE, - pe->pe_number / PNV_IODA1_M64_SEGS, - pe->pe_number % PNV_IODA1_M64_SEGS); - if (rc != OPAL_SUCCESS) - pr_warn("%s: Error %lld mapping M64 for PHB#%x-PE#%x\n", - __func__, rc, phb->hose->global_number, - pe->pe_number); - } } kfree(pe_alloc); @@ -432,7 +358,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) const __be32 *r; u64 pci_addr; - if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) { + if (phb->type != PNV_PHB_IODA2) { pr_info(" Not support M64 window\n"); return; } @@ -444,8 +370,8 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) r = of_get_property(dn, "ibm,opal-m64-window", NULL); if (!r) { - pr_info(" No <ibm,opal-m64-window> on %s\n", - dn->full_name); + pr_info(" No <ibm,opal-m64-window> on %pOF\n", + dn); return; } @@ -507,12 +433,7 @@ static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) * Setup init functions for M64 based on IODA version, IODA3 uses * the IODA2 code. */ - if (phb->type == PNV_PHB_IODA1) - phb->init_m64 = pnv_ioda1_init_m64; - else - phb->init_m64 = pnv_ioda2_init_m64; - phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe; - phb->pick_m64_pe = pnv_ioda_pick_m64_pe; + phb->init_m64 = pnv_ioda2_init_m64; } static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) @@ -598,8 +519,8 @@ static int pnv_ioda_unfreeze_pe(struct pnv_phb *phb, int pe_no, int opt) static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) { struct pnv_ioda_pe *slave, *pe; - u8 fstate, state; - __be16 pcierr; + u8 fstate = 0, state; + __be16 pcierr = 0; s64 rc; /* Sanity check on PE number */ @@ -657,14 +578,19 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no) return state; } -/* Currently those 2 are only used when MSIs are enabled, this will change - * but in the meantime, we need to protect them to avoid warnings - */ -#ifdef CONFIG_PCI_MSI +struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn) +{ + int pe_number = phb->ioda.pe_rmap[bdfn]; + + if (pe_number == IODA_INVALID_PE) + return NULL; + + return &phb->ioda.pe_array[pe_number]; +} + struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); if (!pdn) @@ -673,7 +599,6 @@ struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev) return NULL; return &phb->ioda.pe_array[pdn->pe_number]; } -#endif /* CONFIG_PCI_MSI */ static int pnv_ioda_set_one_peltv(struct pnv_phb *phb, struct pnv_ioda_pe *parent, @@ -779,7 +704,35 @@ static int pnv_ioda_set_peltv(struct pnv_phb *phb, return 0; } -static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +static void pnv_ioda_unset_peltv(struct pnv_phb *phb, + struct pnv_ioda_pe *pe, + struct pci_dev *parent) +{ + int64_t rc; + + while (parent) { + struct pci_dn *pdn = pci_get_pdn(parent); + + if (pdn && pdn->pe_number != IODA_INVALID_PE) { + rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, + pe->pe_number, + OPAL_REMOVE_PE_FROM_DOMAIN); + /* XXX What to do in case of error ? 
*/ + } + parent = parent->bus->self; + } + + opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, + OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + /* Disassociate PE in PELT */ + rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, + pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); + if (rc) + pe_warn(pe, "OPAL error %lld remove self from PELTV\n", rc); +} + +int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { struct pci_dev *parent; uint8_t bcomp, dcomp, fcomp; @@ -794,7 +747,7 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; parent = pe->pbus->self; if (pe->flags & PNV_IODA_PE_BUS_ALL) - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + count = resource_size(&pe->pbus->busn_res); else count = 1; @@ -829,29 +782,17 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) for (rid = pe->rid; rid < rid_end; rid++) phb->ioda.pe_rmap[rid] = IODA_INVALID_PE; - /* Release from all parents PELT-V */ - while (parent) { - struct pci_dn *pdn = pci_get_pdn(parent); - if (pdn && pdn->pe_number != IODA_INVALID_PE) { - rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number, - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); - /* XXX What to do in case of error ? */ - } - parent = parent->bus->self; - } - - opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number, - OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + /* + * Release from all parents PELT-V. NPUs don't have a PELTV + * table + */ + if (phb->type != PNV_PHB_NPU_OCAPI) + pnv_ioda_unset_peltv(phb, pe, parent); - /* Disassociate PE in PELT */ - rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number, - pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN); - if (rc) - pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc); rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid, bcomp, dcomp, fcomp, OPAL_UNMAP_PE); if (rc) - pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc); + pe_err(pe, "OPAL error %lld trying to setup PELT table\n", rc); pe->pbus = NULL; pe->pdev = NULL; @@ -862,9 +803,8 @@ static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) return 0; } -static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) +int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) { - struct pci_dev *parent; uint8_t bcomp, dcomp, fcomp; long rc, rid_end, rid; @@ -874,9 +814,8 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER; fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER; - parent = pe->pbus->self; if (pe->flags & PNV_IODA_PE_BUS_ALL) - count = pe->pbus->busn_res.end - pe->pbus->busn_res.start + 1; + count = resource_size(&pe->pbus->busn_res); else count = 1; @@ -895,12 +834,6 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) } rid_end = pe->rid + (count << 8); } else { -#ifdef CONFIG_PCI_IOV - if (pe->flags & PNV_IODA_PE_VF) - parent = pe->parent_dev; - else -#endif /* CONFIG_PCI_IOV */ - parent = pe->pdev->bus->self; bcomp = OpalPciBusAll; dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER; fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER; @@ -924,110 +857,21 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe) * Configure PELTV. NPUs don't have a PELTV table so skip * configuration on them. 
*/ - if (phb->type != PNV_PHB_NPU) + if (phb->type != PNV_PHB_NPU_OCAPI) pnv_ioda_set_peltv(phb, pe, true); /* Setup reverse map */ for (rid = pe->rid; rid < rid_end; rid++) phb->ioda.pe_rmap[rid] = pe->pe_number; - /* Setup one MVTs on IODA1 */ - if (phb->type != PNV_PHB_IODA1) { - pe->mve_number = 0; - goto out; - } - - pe->mve_number = pe->pe_number; - rc = opal_pci_set_mve(phb->opal_id, pe->mve_number, pe->pe_number); - if (rc != OPAL_SUCCESS) { - pe_err(pe, "OPAL error %ld setting up MVE %x\n", - rc, pe->mve_number); - pe->mve_number = -1; - } else { - rc = opal_pci_set_mve_enable(phb->opal_id, - pe->mve_number, OPAL_ENABLE_MVE); - if (rc) { - pe_err(pe, "OPAL error %ld enabling MVE %x\n", - rc, pe->mve_number); - pe->mve_number = -1; - } - } + pe->mve_number = 0; -out: return 0; } -#ifdef CONFIG_PCI_IOV -static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) -{ - struct pci_dn *pdn = pci_get_pdn(dev); - int i; - struct resource *res, res2; - resource_size_t size; - u16 num_vfs; - - if (!dev->is_physfn) - return -EINVAL; - - /* - * "offset" is in VFs. The M64 windows are sized so that when they - * are segmented, each segment is the same size as the IOV BAR. - * Each segment is in a separate PE, and the high order bits of the - * address are the PE number. Therefore, each VF's BAR is in a - * separate PE, and changing the IOV BAR start address changes the - * range of PEs the VFs are in. - */ - num_vfs = pdn->num_vfs; - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &dev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - /* - * The actual IOV BAR range is determined by the start address - * and the actual size for num_vfs VFs BAR. This check is to - * make sure that after shifting, the range will not overlap - * with another device. - */ - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); - res2.flags = res->flags; - res2.start = res->start + (size * offset); - res2.end = res2.start + (size * num_vfs) - 1; - - if (res2.end > res->end) { - dev_err(&dev->dev, "VF BAR%d: %pR would extend past %pR (trying to enable %d VFs shifted by %d)\n", - i, &res2, res, num_vfs, offset); - return -EBUSY; - } - } - - /* - * After doing so, there would be a "hole" in the /proc/iomem when - * offset is a positive value. It looks like the device return some - * mmio back to the system, which actually no one could use it. - */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &dev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES); - res2 = *res; - res->start += size * offset; - - dev_info(&dev->dev, "VF BAR%d: %pR shifted to %pR (%sabling %d VFs shifted by %d)\n", - i, &res2, res, (offset > 0) ? 
"En" : "Dis", - num_vfs, offset); - pci_update_resource(dev, i + PCI_IOV_RESOURCES); - } - return 0; -} -#endif /* CONFIG_PCI_IOV */ - static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); struct pci_dn *pdn = pci_get_pdn(dev); struct pnv_ioda_pe *pe; @@ -1039,28 +883,26 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) if (pdn->pe_number != IODA_INVALID_PE) return NULL; - pe = pnv_ioda_alloc_pe(phb); + pe = pnv_ioda_alloc_pe(phb, 1); if (!pe) { - pr_warning("%s: Not enough PE# available, disabling device\n", - pci_name(dev)); + pr_warn("%s: Not enough PE# available, disabling device\n", + pci_name(dev)); return NULL; } - /* NOTE: We get only one ref to the pci_dev for the pdn, not for the - * pointer in the PE data structure, both should be destroyed at the - * same time. However, this needs to be looked at more closely again - * once we actually start removing things (Hotplug, SR-IOV, ...) + /* NOTE: We don't get a reference for the pointer in the PE + * data structure, both the device and PE structures should be + * destroyed at the same time. * * At some point we want to remove the PDN completely anyways */ - pci_dev_get(dev); - pdn->pcidev = dev; pdn->pe_number = pe->pe_number; pe->flags = PNV_IODA_PE_DEV; pe->pdev = dev; pe->pbus = NULL; pe->mve_number = -1; pe->rid = dev->bus->number << 8 | pdn->devfn; + pe->device_count++; pe_info(pe, "Associated device to PE\n"); @@ -1069,45 +911,16 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev) pnv_ioda_free_pe(pe); pdn->pe_number = IODA_INVALID_PE; pe->pdev = NULL; - pci_dev_put(dev); return NULL; } /* Put PE to the list */ + mutex_lock(&phb->ioda.pe_list_mutex); list_add_tail(&pe->list, &phb->ioda.pe_list); - + mutex_unlock(&phb->ioda.pe_list_mutex); return pe; } -static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - struct pci_dn *pdn = pci_get_pdn(dev); - - if (pdn == NULL) { - pr_warn("%s: No device node associated with device !\n", - pci_name(dev)); - continue; - } - - /* - * In partial hotplug case, the PCI device might be still - * associated with the PE and needn't attach it to the PE - * again. - */ - if (pdn->pe_number != IODA_INVALID_PE) - continue; - - pe->device_count++; - pdn->pcidev = dev; - pdn->pe_number = pe->pe_number; - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_same_PE(dev->subordinate, pe); - } -} - /* * There're 2 types of PCI bus sensitive PEs: One that is compromised of * single PCI bus. Another one that contains the primary PCI bus and its @@ -1116,8 +929,7 @@ static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe) */ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); struct pnv_ioda_pe *pe = NULL; unsigned int pe_num; @@ -1126,27 +938,25 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) * We should reuse it instead of allocating a new one. 
*/ pe_num = phb->ioda.pe_rmap[bus->number << 8]; - if (pe_num != IODA_INVALID_PE) { + if (WARN_ON(pe_num != IODA_INVALID_PE)) { pe = &phb->ioda.pe_array[pe_num]; - pnv_ioda_setup_same_PE(bus, pe); return NULL; } /* PE number for root bus should have been reserved */ - if (pci_is_root_bus(bus) && - phb->ioda.root_pe_idx != IODA_INVALID_PE) + if (pci_is_root_bus(bus)) pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx]; /* Check if PE is determined by M64 */ - if (!pe && phb->pick_m64_pe) - pe = phb->pick_m64_pe(bus, all); + if (!pe) + pe = pnv_ioda_pick_m64_pe(bus, all); /* The PE number isn't pinned by M64 */ if (!pe) - pe = pnv_ioda_alloc_pe(phb); + pe = pnv_ioda_alloc_pe(phb, 1); if (!pe) { - pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", + pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n", __func__, pci_domain_nr(bus), bus->number); return NULL; } @@ -1158,11 +968,12 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) pe->rid = bus->busn_res.start << 8; if (all) - pe_info(pe, "Secondary bus %d..%d associated with PE#%x\n", - bus->busn_res.start, bus->busn_res.end, pe->pe_number); + pe_info(pe, "Secondary bus %pad..%pad associated with PE#%x\n", + &bus->busn_res.start, &bus->busn_res.end, + pe->pe_number); else - pe_info(pe, "Secondary bus %d associated with PE#%x\n", - bus->busn_res.start, pe->pe_number); + pe_info(pe, "Secondary bus %pad associated with PE#%x\n", + &bus->busn_res.start, pe->pe_number); if (pnv_ioda_configure_pe(phb, pe)) { /* XXX What do we do here ? */ @@ -1171,576 +982,66 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) return NULL; } - /* Associate it with all child devices */ - pnv_ioda_setup_same_PE(bus, pe); - /* Put PE to the list */ list_add_tail(&pe->list, &phb->ioda.pe_list); return pe; } -static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) +static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev) { - int pe_num, found_pe = false, rc; - long rid; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); + struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; - struct pci_dev *gpu_pdev; - struct pci_dn *npu_pdn; - struct pci_controller *hose = pci_bus_to_host(npu_pdev->bus); - struct pnv_phb *phb = hose->private_data; - - /* - * Due to a hardware errata PE#0 on the NPU is reserved for - * error handling. This means we only have three PEs remaining - * which need to be assigned to four links, implying some - * links must share PEs. - * - * To achieve this we assign PEs such that NPUs linking the - * same GPU get assigned the same PE. - */ - gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev); - for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { - pe = &phb->ioda.pe_array[pe_num]; - if (!pe->pdev) - continue; - - if (pnv_pci_get_gpu_dev(pe->pdev) == gpu_pdev) { - /* - * This device has the same peer GPU so should - * be assigned the same PE as the existing - * peer NPU. 
- */ - dev_info(&npu_pdev->dev, - "Associating to existing PE %x\n", pe_num); - pci_dev_get(npu_pdev); - npu_pdn = pci_get_pdn(npu_pdev); - rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; - npu_pdn->pcidev = npu_pdev; - npu_pdn->pe_number = pe_num; - phb->ioda.pe_rmap[rid] = pe->pe_number; - - /* Map the PE to this link */ - rc = opal_pci_set_pe(phb->opal_id, pe_num, rid, - OpalPciBusAll, - OPAL_COMPARE_RID_DEVICE_NUMBER, - OPAL_COMPARE_RID_FUNCTION_NUMBER, - OPAL_MAP_PE); - WARN_ON(rc != OPAL_SUCCESS); - found_pe = true; - break; - } - } - - if (!found_pe) - /* - * Could not find an existing PE so allocate a new - * one. - */ - return pnv_ioda_setup_dev_PE(npu_pdev); - else - return pe; -} - -static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus) -{ - struct pci_dev *pdev; - - list_for_each_entry(pdev, &bus->devices, bus_list) - pnv_ioda_setup_npu_PE(pdev); -} - -static void pnv_pci_ioda_setup_PEs(void) -{ - struct pci_controller *hose, *tmp; - struct pnv_phb *phb; - - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - phb = hose->private_data; - if (phb->type == PNV_PHB_NPU) { - /* PE#0 is needed for error reporting */ - pnv_ioda_reserve_pe(phb, 0); - pnv_ioda_setup_npu_PEs(hose->bus); - if (phb->model == PNV_PHB_MODEL_NPU2) - pnv_npu2_init(phb); - } - } -} - -#ifdef CONFIG_PCI_IOV -static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pci_dn *pdn; - int i, j; - int m64_bars; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (pdn->m64_single_mode) - m64_bars = num_vfs; - else - m64_bars = 1; - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) - for (j = 0; j < m64_bars; j++) { - if (pdn->m64_map[j][i] == IODA_INVALID_M64) - continue; - opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0); - clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc); - pdn->m64_map[j][i] = IODA_INVALID_M64; - } - - kfree(pdn->m64_map); - return 0; -} - -static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pci_dn *pdn; - unsigned int win; - struct resource *res; - int i, j; - int64_t rc; - int total_vfs; - resource_size_t size, start; - int pe_num; - int m64_bars; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - total_vfs = pci_sriov_get_totalvfs(pdev); - - if (pdn->m64_single_mode) - m64_bars = num_vfs; - else - m64_bars = 1; - - pdn->m64_map = kmalloc_array(m64_bars, - sizeof(*pdn->m64_map), - GFP_KERNEL); - if (!pdn->m64_map) - return -ENOMEM; - /* Initialize the m64_map to IODA_INVALID_M64 */ - for (i = 0; i < m64_bars ; i++) - for (j = 0; j < PCI_SRIOV_NUM_BARS; j++) - pdn->m64_map[i][j] = IODA_INVALID_M64; - - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || !res->parent) - continue; - - for (j = 0; j < m64_bars; j++) { - do { - win = find_next_zero_bit(&phb->ioda.m64_bar_alloc, - phb->ioda.m64_bar_idx + 1, 0); - - if (win >= phb->ioda.m64_bar_idx + 1) - goto m64_failed; - } while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc)); - - pdn->m64_map[j][i] = win; - - if (pdn->m64_single_mode) { - size = pci_iov_resource_size(pdev, - PCI_IOV_RESOURCES + i); - start = res->start + size * j; - } else { - size = resource_size(res); - start = res->start; - } - - /* Map the M64 
here */ - if (pdn->m64_single_mode) { - pe_num = pdn->pe_num_map[j]; - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - pe_num, OPAL_M64_WINDOW_TYPE, - pdn->m64_map[j][i], 0); - } - - rc = opal_pci_set_phb_mem_window(phb->opal_id, - OPAL_M64_WINDOW_TYPE, - pdn->m64_map[j][i], - start, - 0, /* unused */ - size); - - - if (rc != OPAL_SUCCESS) { - dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n", - win, rc); - goto m64_failed; - } - - if (pdn->m64_single_mode) - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2); - else - rc = opal_pci_phb_mmio_enable(phb->opal_id, - OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1); - - if (rc != OPAL_SUCCESS) { - dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n", - win, rc); - goto m64_failed; - } - } - } - return 0; - -m64_failed: - pnv_pci_vf_release_m64(pdev, num_vfs); - return -EBUSY; -} - -static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, - int num); -static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable); - -static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe *pe) -{ - struct iommu_table *tbl; - int64_t rc; - - tbl = pe->table_group.tables[0]; - rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); - if (rc) - pe_warn(pe, "OPAL error %ld release DMA window\n", rc); - - pnv_pci_ioda2_set_bypass(pe, false); - if (pe->table_group.group) { - iommu_group_put(pe->table_group.group); - BUG_ON(pe->table_group.group); - } - iommu_tce_table_put(tbl); -} - -static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe, *pe_n; - struct pci_dn *pdn; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (!pdev->is_physfn) - return; - - list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) { - if (pe->parent_dev != pdev) - continue; - - pnv_pci_ioda2_release_dma_pe(pdev, pe); - - /* Remove from list */ - mutex_lock(&phb->ioda.pe_list_mutex); - list_del(&pe->list); - mutex_unlock(&phb->ioda.pe_list_mutex); - - pnv_ioda_deconfigure_pe(phb, pe); - - pnv_ioda_free_pe(pe); - } -} - -void pnv_pci_sriov_disable(struct pci_dev *pdev) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - u16 num_vfs, i; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - num_vfs = pdn->num_vfs; - - /* Release VF PEs */ - pnv_ioda_release_vf_PE(pdev); - - if (phb->type == PNV_PHB_IODA2) { - if (!pdn->m64_single_mode) - pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map); - - /* Release M64 windows */ - pnv_pci_vf_release_m64(pdev, num_vfs); - - /* Release PE numbers */ - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] == IODA_INVALID_PE) - continue; - - pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; - pnv_ioda_free_pe(pe); - } - } else - bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - /* Releasing pe_num_map */ - kfree(pdn->pe_num_map); - } -} - -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe); -static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - int pe_num; - u16 vf_index; - struct pci_dn *pdn; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = 
hose->private_data; - pdn = pci_get_pdn(pdev); - - if (!pdev->is_physfn) - return; - - /* Reserve PE for each VF */ - for (vf_index = 0; vf_index < num_vfs; vf_index++) { - if (pdn->m64_single_mode) - pe_num = pdn->pe_num_map[vf_index]; - else - pe_num = *pdn->pe_num_map + vf_index; - - pe = &phb->ioda.pe_array[pe_num]; - pe->pe_number = pe_num; - pe->phb = phb; - pe->flags = PNV_IODA_PE_VF; - pe->pbus = NULL; - pe->parent_dev = pdev; - pe->mve_number = -1; - pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | - pci_iov_virtfn_devfn(pdev, vf_index); - - pe_info(pe, "VF %04d:%02d:%02d.%d associated with PE#%x\n", - hose->global_number, pdev->bus->number, - PCI_SLOT(pci_iov_virtfn_devfn(pdev, vf_index)), - PCI_FUNC(pci_iov_virtfn_devfn(pdev, vf_index)), pe_num); - - if (pnv_ioda_configure_pe(phb, pe)) { - /* XXX What do we do here ? */ - pnv_ioda_free_pe(pe); - pe->pdev = NULL; - continue; - } - - /* Put PE to the list */ - mutex_lock(&phb->ioda.pe_list_mutex); - list_add_tail(&pe->list, &phb->ioda.pe_list); - mutex_unlock(&phb->ioda.pe_list_mutex); - - pnv_pci_ioda2_setup_dma_pe(phb, pe); - } -} - -int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) -{ - struct pci_bus *bus; - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe; - struct pci_dn *pdn; - int ret; - u16 i; - - bus = pdev->bus; - hose = pci_bus_to_host(bus); - phb = hose->private_data; - pdn = pci_get_pdn(pdev); - - if (phb->type == PNV_PHB_IODA2) { - if (!pdn->vfs_expanded) { - dev_info(&pdev->dev, "don't support this SRIOV device" - " with non 64bit-prefetchable IOV BAR\n"); - return -ENOSPC; - } - - /* - * When M64 BARs functions in Single PE mode, the number of VFs - * could be enabled must be less than the number of M64 BARs. - */ - if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) { - dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n"); - return -EBUSY; - } - - /* Allocating pe_num_map */ - if (pdn->m64_single_mode) - pdn->pe_num_map = kmalloc_array(num_vfs, - sizeof(*pdn->pe_num_map), - GFP_KERNEL); - else - pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL); - - if (!pdn->pe_num_map) - return -ENOMEM; - - if (pdn->m64_single_mode) - for (i = 0; i < num_vfs; i++) - pdn->pe_num_map[i] = IODA_INVALID_PE; - /* Calculate available PE for required VFs */ - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - pe = pnv_ioda_alloc_pe(phb); - if (!pe) { - ret = -EBUSY; - goto m64_failed; - } + /* Check if the BDFN for this device is associated with a PE yet */ + pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev)); + if (!pe) { + /* VF PEs should be pre-configured in pnv_pci_sriov_enable() */ + if (WARN_ON(pdev->is_virtfn)) + return; - pdn->pe_num_map[i] = pe->pe_number; - } - } else { - mutex_lock(&phb->ioda.pe_alloc_mutex); - *pdn->pe_num_map = bitmap_find_next_zero_area( - phb->ioda.pe_alloc, phb->ioda.total_pe_num, - 0, num_vfs, 0); - if (*pdn->pe_num_map >= phb->ioda.total_pe_num) { - mutex_unlock(&phb->ioda.pe_alloc_mutex); - dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); - kfree(pdn->pe_num_map); - return -EBUSY; - } - bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - mutex_unlock(&phb->ioda.pe_alloc_mutex); - } - pdn->num_vfs = num_vfs; + pnv_pci_configure_bus(pdev->bus); + pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev)); + pci_info(pdev, "Configured PE#%x\n", pe ? 
pe->pe_number : 0xfffff); - /* Assign M64 window accordingly */ - ret = pnv_pci_vf_assign_m64(pdev, num_vfs); - if (ret) { - dev_info(&pdev->dev, "Not enough M64 window resources\n"); - goto m64_failed; - } /* - * When using one M64 BAR to map one IOV BAR, we need to shift - * the IOV BAR according to the PE# allocated to the VFs. - * Otherwise, the PE# for the VF will conflict with others. + * If we can't setup the IODA PE something has gone horribly + * wrong and we can't enable DMA for the device. */ - if (!pdn->m64_single_mode) { - ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map); - if (ret) - goto m64_failed; - } + if (WARN_ON(!pe)) + return; + } else { + pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number); } - /* Setup VF PEs */ - pnv_ioda_setup_vf_PE(pdev, num_vfs); - - return 0; - -m64_failed: - if (pdn->m64_single_mode) { - for (i = 0; i < num_vfs; i++) { - if (pdn->pe_num_map[i] == IODA_INVALID_PE) - continue; - - pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; - pnv_ioda_free_pe(pe); - } - } else - bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); - - /* Releasing pe_num_map */ - kfree(pdn->pe_num_map); - - return ret; -} - -int pcibios_sriov_disable(struct pci_dev *pdev) -{ - pnv_pci_sriov_disable(pdev); - - /* Release PCI data */ - remove_dev_pci_data(pdev); - return 0; -} - -int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) -{ - /* Allocate PCI data */ - add_dev_pci_data(pdev); - - return pnv_pci_sriov_enable(pdev, num_vfs); -} -#endif /* CONFIG_PCI_IOV */ - -static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev) -{ - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *pe; - /* - * The function can be called while the PE# - * hasn't been assigned. Do nothing for the - * case. + * We assume that bridges *probably* don't need to do any DMA so we can + * skip allocating a TCE table, etc unless we get a non-bridge device. */ - if (!pdn || pdn->pe_number == IODA_INVALID_PE) - return; + if (!pe->dma_setup_done && !pci_is_bridge(pdev)) { + switch (phb->type) { + case PNV_PHB_IODA2: + pnv_pci_ioda2_setup_dma_pe(phb, pe); + break; + default: + pr_warn("%s: No DMA for PHB#%x (type %d)\n", + __func__, phb->hose->global_number, phb->type); + } + } + + if (pdn) + pdn->pe_number = pe->pe_number; + pe->device_count++; - pe = &phb->ioda.pe_array[pdn->pe_number]; WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops); - set_dma_offset(&pdev->dev, pe->tce_bypass_base); + pdev->dev.archdata.dma_offset = pe->tce_bypass_base; set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]); - /* - * Note: iommu_add_device() will fail here as - * for physical PE: the device is already added by now; - * for virtual PE: sysfs entries are not ready yet and - * tce_iommu_bus_notifier will add the device to a group later. 
- */ -} - -static bool pnv_pci_ioda_pe_single_vendor(struct pnv_ioda_pe *pe) -{ - unsigned short vendor = 0; - struct pci_dev *pdev; - if (pe->device_count == 1) - return true; - - /* pe->pdev should be set if it's a single device, pe->pbus if not */ - if (!pe->pbus) - return true; - - list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { - if (!vendor) { - vendor = pdev->vendor; - continue; - } - - if (pdev->vendor != vendor) - return false; - } - - return true; + /* PEs with a DMA weight of zero won't have a group */ + if (pe->table_group.group) + iommu_add_device(&pe->table_group, &pdev->dev); } /* @@ -1812,238 +1113,79 @@ err: return -EIO; } -static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) +static bool pnv_pci_ioda_iommu_bypass_supported(struct pci_dev *pdev, + u64 dma_mask) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; - uint64_t top; - bool bypass = false; - s64 rc; if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return -ENODEV;; + return false; pe = &phb->ioda.pe_array[pdn->pe_number]; if (pe->tce_bypass_enabled) { - top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; - bypass = (dma_mask >= top); + u64 top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1; + if (dma_mask >= top) + return true; } - if (bypass) { - dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n"); - set_dma_ops(&pdev->dev, &dma_direct_ops); - } else { - /* - * If the device can't set the TCE bypass bit but still wants - * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to - * bypass the 32-bit region and be usable for 64-bit DMAs. - * The device needs to be able to address all of this space. - */ - if (dma_mask >> 32 && - dma_mask > (memory_hotplug_max() + (1ULL << 32)) && - pnv_pci_ioda_pe_single_vendor(pe) && - phb->model == PNV_PHB_MODEL_PHB3) { - /* Configure the bypass mode */ - rc = pnv_pci_ioda_dma_64bit_bypass(pe); - if (rc) - return rc; - /* 4GB offset bypasses 32-bit space */ - set_dma_offset(&pdev->dev, (1ULL << 32)); - set_dma_ops(&pdev->dev, &dma_direct_ops); - } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) { - /* - * Fail the request if a DMA mask between 32 and 64 bits - * was requested but couldn't be fulfilled. Ideally we - * would do this for 64-bits but historically we have - * always fallen back to 32-bits. 
- */ - return -ENOMEM; - } else { - dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); - set_dma_ops(&pdev->dev, &dma_iommu_ops); - } - } - *pdev->dev.dma_mask = dma_mask; - - /* Update peer npu devices */ - pnv_npu_try_dma_set_bypass(pdev, bypass); - - return 0; -} - -static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(pdev); - struct pnv_ioda_pe *pe; - u64 end, mask; - - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) - return 0; - - pe = &phb->ioda.pe_array[pdn->pe_number]; - if (!pe->tce_bypass_enabled) - return __dma_get_required_mask(&pdev->dev); - - - end = pe->tce_bypass_base + memblock_end_of_DRAM(); - mask = 1ULL << (fls64(end) - 1); - mask += mask - 1; - - return mask; -} - -static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, - struct pci_bus *bus, - bool add_to_group) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); - set_dma_offset(&dev->dev, pe->tce_bypass_base); - if (add_to_group) - iommu_add_device(&dev->dev); - - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) - pnv_ioda_setup_bus_dma(pe, dev->subordinate, - add_to_group); - } -} - -static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb, - bool real_mode) -{ - return real_mode ? (__be64 __iomem *)(phb->regs_phys + 0x210) : - (phb->regs + 0x210); -} - -static void pnv_pci_p7ioc_tce_invalidate(struct iommu_table *tbl, - unsigned long index, unsigned long npages, bool rm) -{ - struct iommu_table_group_link *tgl = list_first_entry_or_null( - &tbl->it_group_list, struct iommu_table_group_link, - next); - struct pnv_ioda_pe *pe = container_of(tgl->table_group, - struct pnv_ioda_pe, table_group); - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm); - unsigned long start, end, inc; - - start = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset); - end = __pa(((__be64 *)tbl->it_base) + index - tbl->it_offset + - npages - 1); - - /* p7ioc-style invalidation, 2 TCEs per write */ - start |= (1ull << 63); - end |= (1ull << 63); - inc = 16; - end |= inc - 1; /* round up end to be different than start */ - - mb(); /* Ensure above stores are visible */ - while (start <= end) { - if (rm) - __raw_rm_writeq(cpu_to_be64(start), invalidate); - else - __raw_writeq(cpu_to_be64(start), invalidate); - start += inc; - } - /* - * The iommu layer will do another mb() for us on build() - * and we don't care on free() + * If the device can't set the TCE bypass bit but still wants + * to access 4GB or more, on PHB3 we can reconfigure TVE#0 to + * bypass the 32-bit region and be usable for 64-bit DMAs. + * The device needs to be able to address all of this space. 
*/ + if (dma_mask >> 32 && + dma_mask > (memory_hotplug_max() + (1ULL << 32)) && + /* pe->pdev should be set if it's a single device, pe->pbus if not */ + (pe->device_count == 1 || !pe->pbus) && + phb->model == PNV_PHB_MODEL_PHB3) { + /* Configure the bypass mode */ + s64 rc = pnv_pci_ioda_dma_64bit_bypass(pe); + if (rc) + return false; + /* 4GB offset bypasses 32-bit space */ + pdev->dev.archdata.dma_offset = (1ULL << 32); + return true; + } + + return false; } -static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, - long npages, unsigned long uaddr, - enum dma_data_direction direction, - unsigned long attrs) +static inline __be64 __iomem *pnv_ioda_get_inval_reg(struct pnv_phb *phb) { - int ret = pnv_tce_build(tbl, index, npages, uaddr, direction, - attrs); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false); - - return ret; + return phb->regs + 0x210; } #ifdef CONFIG_IOMMU_API -static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, +/* Common for IODA1 and IODA2 */ +static int pnv_ioda_tce_xchg_no_kill(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false); - - return ret; -} - -static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction); - - if (!ret) - pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); - - return ret; + return pnv_tce_xchg(tbl, index, hpa, direction); } #endif -static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, - long npages) -{ - pnv_tce_free(tbl, index, npages); - - pnv_pci_p7ioc_tce_invalidate(tbl, index, npages, false); -} - -static struct iommu_table_ops pnv_ioda1_iommu_ops = { - .set = pnv_ioda1_tce_build, -#ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda1_tce_xchg, - .exchange_rm = pnv_ioda1_tce_xchg_rm, -#endif - .clear = pnv_ioda1_tce_free, - .get = pnv_tce_get, -}; - #define PHB3_TCE_KILL_INVAL_ALL PPC_BIT(0) #define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1) #define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2) -static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) -{ - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm); - const unsigned long val = PHB3_TCE_KILL_INVAL_ALL; - - mb(); /* Ensure previous TCE table stores are visible */ - if (rm) - __raw_rm_writeq(cpu_to_be64(val), invalidate); - else - __raw_writeq(cpu_to_be64(val), invalidate); -} - static inline void pnv_pci_phb3_tce_invalidate_pe(struct pnv_ioda_pe *pe) { /* 01xb - invalidate TCEs that match the specified PE# */ - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, false); + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb); unsigned long val = PHB3_TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); mb(); /* Ensure above stores are visible */ - __raw_writeq(cpu_to_be64(val), invalidate); + __raw_writeq_be(val, invalidate); } -static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm, +static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, unsigned shift, unsigned long index, unsigned long npages) { - __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb, rm); + __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(pe->phb); unsigned long start, end, inc; /* We'll invalidate DMA address in PE scope */ @@ -2058,10 +1200,7 @@ static void pnv_pci_phb3_tce_invalidate(struct pnv_ioda_pe *pe, bool rm, 
mb(); while (start <= end) { - if (rm) - __raw_rm_writeq(cpu_to_be64(start), invalidate); - else - __raw_writeq(cpu_to_be64(start), invalidate); + __raw_writeq_be(start, invalidate); start += inc; } } @@ -2078,7 +1217,7 @@ static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) } static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, - unsigned long index, unsigned long npages, bool rm) + unsigned long index, unsigned long npages) { struct iommu_table_group_link *tgl; @@ -2088,22 +1227,8 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, struct pnv_phb *phb = pe->phb; unsigned int shift = tbl->it_page_shift; - /* - * NVLink1 can use the TCE kill register directly as - * it's the same as PHB3. NVLink2 is different and - * should go via the OPAL call. - */ - if (phb->model == PNV_PHB_MODEL_NPU) { - /* - * The NVLink hardware does not support TCE kill - * per TCE entry so we have to invalidate - * the entire cache for it. - */ - pnv_pci_phb3_tce_invalidate_entire(phb, rm); - continue; - } if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs) - pnv_pci_phb3_tce_invalidate(pe, rm, shift, + pnv_pci_phb3_tce_invalidate(pe, shift, index, npages); else opal_pci_tce_kill(phb->opal_id, @@ -2113,14 +1238,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, } } -void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) -{ - if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3) - pnv_pci_phb3_tce_invalidate_entire(phb, rm); - else - opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0); -} - static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, @@ -2130,239 +1247,31 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, attrs); if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); + pnv_pci_ioda2_tce_invalidate(tbl, index, npages); return ret; } -#ifdef CONFIG_IOMMU_API -static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); - - return ret; -} - -static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) -{ - long ret = pnv_tce_xchg(tbl, index, hpa, direction); - - if (!ret) - pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); - - return ret; -} -#endif - static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, long npages) { pnv_tce_free(tbl, index, npages); - pnv_pci_ioda2_tce_invalidate(tbl, index, npages, false); -} - -static void pnv_ioda2_table_free(struct iommu_table *tbl) -{ - pnv_pci_ioda2_table_free_pages(tbl); + pnv_pci_ioda2_tce_invalidate(tbl, index, npages); } static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API - .exchange = pnv_ioda2_tce_xchg, - .exchange_rm = pnv_ioda2_tce_xchg_rm, + .xchg_no_kill = pnv_ioda_tce_xchg_no_kill, + .tce_kill = pnv_pci_ioda2_tce_invalidate, + .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, - .free = pnv_ioda2_table_free, + .free = pnv_pci_ioda2_table_free_pages, }; -static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data) -{ - unsigned int *weight = (unsigned int *)data; - - /* This is quite simplistic. The "base" weight of a device - * is 10. 
0 means no DMA is to be accounted for it. - */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) - return 0; - - if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || - dev->class == PCI_CLASS_SERIAL_USB_OHCI || - dev->class == PCI_CLASS_SERIAL_USB_EHCI) - *weight += 3; - else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) - *weight += 15; - else - *weight += 10; - - return 0; -} - -static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe) -{ - unsigned int weight = 0; - - /* SRIOV VF has same DMA32 weight as its PF */ -#ifdef CONFIG_PCI_IOV - if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) { - pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight); - return weight; - } -#endif - - if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) { - pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight); - } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) { - struct pci_dev *pdev; - - list_for_each_entry(pdev, &pe->pbus->devices, bus_list) - pnv_pci_ioda_dev_dma_weight(pdev, &weight); - } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) { - pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight); - } - - return weight; -} - -static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) -{ - - struct page *tce_mem = NULL; - struct iommu_table *tbl; - unsigned int weight, total_weight = 0; - unsigned int tce32_segsz, base, segs, avail, i; - int64_t rc; - void *addr; - - /* XXX FIXME: Handle 64-bit only DMA devices */ - /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. */ - /* XXX FIXME: Allocate multi-level tables on PHB3 */ - weight = pnv_pci_ioda_pe_dma_weight(pe); - if (!weight) - return; - - pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight, - &total_weight); - segs = (weight * phb->ioda.dma32_count) / total_weight; - if (!segs) - segs = 1; - - /* - * Allocate contiguous DMA32 segments. We begin with the expected - * number of segments. With one more attempt, the number of DMA32 - * segments to be allocated is decreased by one until one segment - * is allocated successfully. - */ - do { - for (base = 0; base <= phb->ioda.dma32_count - segs; base++) { - for (avail = 0, i = base; i < base + segs; i++) { - if (phb->ioda.dma32_segmap[i] == - IODA_INVALID_PE) - avail++; - } - - if (avail == segs) - goto found; - } - } while (--segs); - - if (!segs) { - pe_warn(pe, "No available DMA32 segments\n"); - return; - } - -found: - tbl = pnv_pci_table_alloc(phb->hose->node); - if (WARN_ON(!tbl)) - return; - - iommu_register_group(&pe->table_group, phb->hose->global_number, - pe->pe_number); - pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); - - /* Grab a 32-bit TCE table */ - pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n", - weight, total_weight, base, segs); - pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", - base * PNV_IODA1_DMA32_SEGSIZE, - (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1); - - /* XXX Currently, we allocate one big contiguous table for the - * TCEs. 
We only really need one chunk per 256M of TCE space - * (ie per segment) but that's an optimization for later, it - * requires some added smarts with our get/put_tce implementation - * - * Each TCE page is 4KB in size and each TCE entry occupies 8 - * bytes - */ - tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3); - tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, - get_order(tce32_segsz * segs)); - if (!tce_mem) { - pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); - goto fail; - } - addr = page_address(tce_mem); - memset(addr, 0, tce32_segsz * segs); - - /* Configure HW */ - for (i = 0; i < segs; i++) { - rc = opal_pci_map_pe_dma_window(phb->opal_id, - pe->pe_number, - base + i, 1, - __pa(addr) + tce32_segsz * i, - tce32_segsz, IOMMU_PAGE_SIZE_4K); - if (rc) { - pe_err(pe, " Failed to configure 32-bit TCE table," - " err %ld\n", rc); - goto fail; - } - } - - /* Setup DMA32 segment mapping */ - for (i = base; i < base + segs; i++) - phb->ioda.dma32_segmap[i] = pe->pe_number; - - /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs, - base * PNV_IODA1_DMA32_SEGSIZE, - IOMMU_PAGE_SHIFT_4K); - - tbl->it_ops = &pnv_ioda1_iommu_ops; - pe->table_group.tce32_start = tbl->it_offset << tbl->it_page_shift; - pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift; - iommu_init_table(tbl, phb->hose->node); - - if (pe->flags & PNV_IODA_PE_DEV) { - /* - * Setting table base here only for carrying iommu_group - * further down to let iommu_add_device() do the job. - * pnv_pci_ioda_dma_dev_setup will override it later anyway. - */ - set_iommu_table_base(&pe->pdev->dev, tbl); - iommu_add_device(&pe->pdev->dev); - } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus, true); - - return; - fail: - /* XXX Failure: Try to fallback to 64-bit only ? */ - if (tce_mem) - __free_pages(tce_mem, get_order(tce32_segsz * segs)); - if (tbl) { - pnv_pci_unlink_table_and_group(tbl, &pe->table_group); - iommu_tce_table_put(tbl); - } -} - static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, int num, struct iommu_table *tbl) { @@ -2375,9 +1284,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; const __u64 win_size = tbl->it_size << tbl->it_page_shift; - pe_info(pe, "Setting up window#%d %llx..%llx pg=%x\n", num, - start_addr, start_addr + win_size - 1, - IOMMU_PAGE_SIZE(tbl)); + pe_info(pe, "Setting up window#%d %llx..%llx pg=%lx\n", + num, start_addr, start_addr + win_size - 1, + IOMMU_PAGE_SIZE(tbl)); /* * Map TCE table through TVT. 
The TVE index is the PE number @@ -2391,7 +1300,7 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group, size << 3, IOMMU_PAGE_SIZE(tbl)); if (rc) { - pe_err(pe, "Failed to configure TCE table, err %ld\n", rc); + pe_err(pe, "Failed to configure TCE table, err %lld\n", rc); return rc; } @@ -2430,13 +1339,9 @@ static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) pe->tce_bypass_enabled = enable; } -static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, - __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl); - static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, int num, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table **ptbl) + bool alloc_userspace_copy, struct iommu_table **ptbl) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); @@ -2453,7 +1358,7 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, ret = pnv_pci_ioda2_table_alloc_pages(nid, bus_offset, page_shift, window_size, - levels, tbl); + levels, alloc_userspace_copy, tbl); if (ret) { iommu_tce_table_put(tbl); return ret; @@ -2468,6 +1373,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) { struct iommu_table *tbl = NULL; long rc; + unsigned long res_start, res_end; /* * crashkernel= specifies the kdump kernel's maximum memory at @@ -2481,43 +1387,70 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) * DMA window can be larger than available memory, which will * cause errors later. */ - const u64 window_size = min((u64)pe->table_group.tce32_size, max_memory); + const u64 maxblock = 1UL << (PAGE_SHIFT + MAX_PAGE_ORDER); - rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, - IOMMU_PAGE_SHIFT_4K, - window_size, - POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); + /* + * We create the default window as big as we can. The constraint is + * the max order of allocation possible. The TCE table is likely to + * end up being multilevel and with on-demand allocation in place, + * the initial use is not going to be huge as the default window aims + * to support crippled devices (i.e. not fully 64bit DMAble) only. + */ + /* iommu_table::it_map uses 1 bit per IOMMU page, hence 8 */ + const u64 window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory); + /* Each TCE level cannot exceed maxblock so go multilevel if needed */ + unsigned long tces_order = ilog2(window_size >> PAGE_SHIFT); + unsigned long tcelevel_order = ilog2(maxblock >> 3); + unsigned int levels = tces_order / tcelevel_order; + + if (tces_order % tcelevel_order) + levels += 1; + /* + * We try to stick to default levels (which is >1 at the moment) in + * order to save memory by relying on on-demain TCE level allocation. 
+ */ + levels = max_t(unsigned int, levels, POWERNV_IOMMU_DEFAULT_LEVELS); + + rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, PAGE_SHIFT, + window_size, levels, false, &tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); return rc; } - iommu_init_table(tbl, pe->phb->hose->node); + /* We use top part of 32bit space for MMIO so exclude it from DMA */ + res_start = 0; + res_end = 0; + if (window_size > pe->phb->ioda.m32_pci_base) { + res_start = pe->phb->ioda.m32_pci_base >> tbl->it_page_shift; + res_end = min(window_size, SZ_4G) >> tbl->it_page_shift; + } - rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + tbl->it_index = (pe->phb->hose->global_number << 16) | pe->pe_number; + if (iommu_init_table(tbl, pe->phb->hose->node, res_start, res_end)) + rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl); + else + rc = -ENOMEM; if (rc) { - pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", - rc); + pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc); iommu_tce_table_put(tbl); - return rc; + tbl = NULL; /* This clears iommu_table_base below */ } - if (!pnv_iommu_bypass_disabled) pnv_pci_ioda2_set_bypass(pe, true); /* - * Setting table base here only for carrying iommu_group - * further down to let iommu_add_device() do the job. - * pnv_pci_ioda_dma_dev_setup will override it later anyway. + * Set table base for the case of IOMMU DMA use. Usually this is done + * from dma_dev_setup() which is not called when a device is returned + * from VFIO so do it here. */ - if (pe->flags & PNV_IODA_PE_DEV) + if (pe->pdev) set_iommu_table_base(&pe->pdev->dev, tbl); return 0; } -#if defined(CONFIG_IOMMU_API) || defined(CONFIG_PCI_IOV) static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, int num) { @@ -2541,10 +1474,9 @@ static long pnv_pci_ioda2_unset_window(struct iommu_table_group *table_group, return ret; } -#endif #ifdef CONFIG_IOMMU_API -static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, +unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, __u64 window_size, __u32 levels) { unsigned long bytes = 0; @@ -2555,7 +1487,6 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, unsigned long direct_table_size; if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS) || - (window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) return 0; @@ -2566,7 +1497,7 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, direct_table_size = 1UL << table_shift; for ( ; levels; --levels) { - bytes += _ALIGN_UP(tce_table_size, direct_table_size); + bytes += ALIGN(tce_table_size, direct_table_size); tce_table_size /= direct_table_size; tce_table_size <<= 3; @@ -2574,305 +1505,94 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, tce_table_size, direct_table_size); } - return bytes; + return bytes + bytes; /* one for HW table, one for userspace copy */ +} + +static long pnv_pci_ioda2_create_table_userspace( + struct iommu_table_group *table_group, + int num, __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) +{ + long ret = pnv_pci_ioda2_create_table(table_group, + num, page_shift, window_size, levels, true, ptbl); + + if (!ret) + (*ptbl)->it_allocated_size = pnv_pci_ioda2_get_table_size( + page_shift, window_size, levels); + return ret; } -static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) +static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus) +{ + struct pci_dev *dev; + + 
list_for_each_entry(dev, &bus->devices, bus_list) { + set_iommu_table_base(&dev->dev, pe->table_group.tables[0]); + dev->dev.archdata.dma_offset = pe->tce_bypass_base; + + if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) + pnv_ioda_setup_bus_dma(pe, dev->subordinate); + } +} + +static long pnv_ioda2_take_ownership(struct iommu_table_group *table_group, + struct device *dev __maybe_unused) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); /* Store @tbl as pnv_pci_ioda2_unset_window() resets it */ struct iommu_table *tbl = pe->table_group.tables[0]; + /* + * iommu_ops transfers the ownership per a device and we mode + * the group ownership with the first device in the group. + */ + if (!tbl) + return 0; + pnv_pci_ioda2_set_bypass(pe, false); pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (pe->pbus) - pnv_ioda_setup_bus_dma(pe, pe->pbus, false); + pnv_ioda_setup_bus_dma(pe, pe->pbus); + else if (pe->pdev) + set_iommu_table_base(&pe->pdev->dev, NULL); iommu_tce_table_put(tbl); + + return 0; } -static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) +static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group, + struct device *dev __maybe_unused) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); + /* See the comment about iommu_ops above */ + if (pe->table_group.tables[0]) + return; pnv_pci_ioda2_setup_default_config(pe); if (pe->pbus) - pnv_ioda_setup_bus_dma(pe, pe->pbus, false); + pnv_ioda_setup_bus_dma(pe, pe->pbus); } static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table, + .create_table = pnv_pci_ioda2_create_table_userspace, .set_window = pnv_pci_ioda2_set_window, .unset_window = pnv_pci_ioda2_unset_window, .take_ownership = pnv_ioda2_take_ownership, .release_ownership = pnv_ioda2_release_ownership, }; - -static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque) -{ - struct pci_controller *hose; - struct pnv_phb *phb; - struct pnv_ioda_pe **ptmppe = opaque; - struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); - struct pci_dn *pdn = pci_get_pdn(pdev); - - if (!pdn || pdn->pe_number == IODA_INVALID_PE) - return 0; - - hose = pci_bus_to_host(pdev->bus); - phb = hose->private_data; - if (phb->type != PNV_PHB_NPU) - return 0; - - *ptmppe = &phb->ioda.pe_array[pdn->pe_number]; - - return 1; -} - -/* - * This returns PE of associated NPU. - * This assumes that NPU is in the same IOMMU group with GPU and there is - * no other PEs. 
- */ -static struct pnv_ioda_pe *gpe_table_group_to_npe( - struct iommu_table_group *table_group) -{ - struct pnv_ioda_pe *npe = NULL; - int ret = iommu_group_for_each_dev(table_group->group, &npe, - gpe_table_group_to_npe_cb); - - BUG_ON(!ret || !npe); - - return npe; -} - -static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group, - int num, struct iommu_table *tbl) -{ - long ret = pnv_pci_ioda2_set_window(table_group, num, tbl); - - if (ret) - return ret; - - ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl); - if (ret) - pnv_pci_ioda2_unset_window(table_group, num); - - return ret; -} - -static long pnv_pci_ioda2_npu_unset_window( - struct iommu_table_group *table_group, - int num) -{ - long ret = pnv_pci_ioda2_unset_window(table_group, num); - - if (ret) - return ret; - - return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num); -} - -static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) -{ - /* - * Detach NPU first as pnv_ioda2_take_ownership() will destroy - * the iommu_table if 32bit DMA is enabled. - */ - pnv_npu_take_ownership(gpe_table_group_to_npe(table_group)); - pnv_ioda2_take_ownership(table_group); -} - -static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { - .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table, - .set_window = pnv_pci_ioda2_npu_set_window, - .unset_window = pnv_pci_ioda2_npu_unset_window, - .take_ownership = pnv_ioda2_npu_take_ownership, - .release_ownership = pnv_ioda2_release_ownership, -}; - -static void pnv_pci_ioda_setup_iommu_api(void) -{ - struct pci_controller *hose, *tmp; - struct pnv_phb *phb; - struct pnv_ioda_pe *pe, *gpe; - - /* - * Now we have all PHBs discovered, time to add NPU devices to - * the corresponding IOMMU groups. 
- */ - list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { - phb = hose->private_data; - - if (phb->type != PNV_PHB_NPU) - continue; - - list_for_each_entry(pe, &phb->ioda.pe_list, list) { - gpe = pnv_pci_npu_setup_iommu(pe); - if (gpe) - gpe->table_group.ops = &pnv_pci_ioda2_npu_ops; - } - } -} -#else /* !CONFIG_IOMMU_API */ -static void pnv_pci_ioda_setup_iommu_api(void) { }; #endif -static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift, - unsigned levels, unsigned long limit, - unsigned long *current_offset, unsigned long *total_allocated) -{ - struct page *tce_mem = NULL; - __be64 *addr, *tmp; - unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT; - unsigned long allocated = 1UL << (order + PAGE_SHIFT); - unsigned entries = 1UL << (shift - 3); - long i; - - tce_mem = alloc_pages_node(nid, GFP_KERNEL, order); - if (!tce_mem) { - pr_err("Failed to allocate a TCE memory, order=%d\n", order); - return NULL; - } - addr = page_address(tce_mem); - memset(addr, 0, allocated); - *total_allocated += allocated; - - --levels; - if (!levels) { - *current_offset += allocated; - return addr; - } - - for (i = 0; i < entries; ++i) { - tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift, - levels, limit, current_offset, total_allocated); - if (!tmp) - break; - - addr[i] = cpu_to_be64(__pa(tmp) | - TCE_PCI_READ | TCE_PCI_WRITE); - - if (*current_offset >= limit) - break; - } - - return addr; -} - -static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, - unsigned long size, unsigned level); - -static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, - __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl) -{ - void *addr; - unsigned long offset = 0, level_shift, total_allocated = 0; - const unsigned window_shift = ilog2(window_size); - unsigned entries_shift = window_shift - page_shift; - unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT); - const unsigned long tce_table_size = 1UL << table_shift; - - if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) - return -EINVAL; - - if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) - return -EINVAL; - - /* Adjust direct table size from window_size and levels */ - entries_shift = (entries_shift + levels - 1) / levels; - level_shift = entries_shift + 3; - level_shift = max_t(unsigned, level_shift, PAGE_SHIFT); - - if ((level_shift - 3) * levels + page_shift >= 60) - return -EINVAL; - - /* Allocate TCE table */ - addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - levels, tce_table_size, &offset, &total_allocated); - - /* addr==NULL means that the first level allocation failed */ - if (!addr) - return -ENOMEM; - - /* - * First level was allocated but some lower level failed as - * we did not allocate as much as we wanted, - * release partially allocated table. 
- */ - if (offset < tce_table_size) { - pnv_pci_ioda2_table_do_free_pages(addr, - 1ULL << (level_shift - 3), levels - 1); - return -ENOMEM; - } - - /* Setup linux iommu table */ - pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset, - page_shift); - tbl->it_level_size = 1ULL << (level_shift - 3); - tbl->it_indirect_levels = levels - 1; - tbl->it_allocated_size = total_allocated; - - pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", - window_size, tce_table_size, bus_offset); - - return 0; -} - -static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, - unsigned long size, unsigned level) -{ - const unsigned long addr_ul = (unsigned long) addr & - ~(TCE_PCI_READ | TCE_PCI_WRITE); - - if (level) { - long i; - u64 *tmp = (u64 *) addr_ul; - - for (i = 0; i < size; ++i) { - unsigned long hpa = be64_to_cpu(tmp[i]); - - if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE))) - continue; - - pnv_pci_ioda2_table_do_free_pages(__va(hpa), size, - level - 1); - } - } - - free_pages(addr_ul, get_order(size << 3)); -} - -static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) -{ - const unsigned long size = tbl->it_indirect_levels ? - tbl->it_level_size : tbl->it_size; - - if (!tbl->it_size) - return; - - pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size, - tbl->it_indirect_levels); -} - -static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, - struct pnv_ioda_pe *pe) +void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, + struct pnv_ioda_pe *pe) { int64_t rc; - if (!pnv_pci_ioda_pe_dma_weight(pe)) - return; - /* TVE #1 is selected by PCI address bit 59 */ pe->tce_bypass_base = 1ull << 59; - iommu_register_group(&pe->table_group, phb->hose->global_number, - pe->pe_number); - /* The PE will reserve all possible 32-bits space */ pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", phb->ioda.m32_pci_base); @@ -2883,63 +1603,38 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->table_group.max_dynamic_windows_supported = IOMMU_TABLE_GROUP_MAX_TABLES; pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS; - pe->table_group.pgsizes = SZ_4K | SZ_64K | SZ_16M; -#ifdef CONFIG_IOMMU_API - pe->table_group.ops = &pnv_pci_ioda2_ops; -#endif + pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb); rc = pnv_pci_ioda2_setup_default_config(pe); if (rc) return; - if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) - pnv_ioda_setup_bus_dma(pe, pe->pbus, true); -} - -#ifdef CONFIG_PCI_MSI -int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) -{ - struct pnv_phb *phb = container_of(chip, struct pnv_phb, - ioda.irq_chip); - - return opal_pci_msi_eoi(phb->opal_id, hw_irq); +#ifdef CONFIG_IOMMU_API + pe->table_group.ops = &pnv_pci_ioda2_ops; + iommu_register_group(&pe->table_group, phb->hose->global_number, + pe->pe_number); +#endif + pe->dma_setup_done = true; } -static void pnv_ioda2_msi_eoi(struct irq_data *d) +/* + * Called from KVM in real mode to EOI passthru interrupts. The ICP + * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru(). + * + * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call + * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ + * numbers of the in-the-middle MSI domain are vector numbers and it's + * good enough for OPAL. Use that. 
+ */ +int64_t pnv_opal_pci_msi_eoi(struct irq_data *d) { - int64_t rc; - unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); - struct irq_chip *chip = irq_data_get_irq_chip(d); - - rc = pnv_opal_pci_msi_eoi(chip, hw_irq); - WARN_ON_ONCE(rc); + struct pci_controller *hose = irq_data_get_irq_chip_data(d->parent_data); + struct pnv_phb *phb = hose->private_data; - icp_native_eoi(d); + return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq); } - -void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) -{ - struct irq_data *idata; - struct irq_chip *ichip; - - /* The MSI EOI OPAL call is only needed on PHB3 */ - if (phb->model != PNV_PHB_MODEL_PHB3) - return; - - if (!phb->ioda.irq_chip_init) { - /* - * First time we setup an MSI IRQ, we need to setup the - * corresponding IRQ chip to route correctly. - */ - idata = irq_get_irq_data(virq); - ichip = irq_data_get_irq_chip(idata); - phb->ioda.irq_chip_init = 1; - phb->ioda.irq_chip = *ichip; - phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; - } - irq_set_chip(virq, &phb->ioda.irq_chip); -} +static struct irq_chip pnv_pci_msi_irq_chip; /* * Returns true iff chip is something that we could call @@ -2947,19 +1642,21 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) */ bool is_pnv_opal_msi(struct irq_chip *chip) { - return chip->irq_eoi == pnv_ioda2_msi_eoi; + return chip == &pnv_pci_msi_irq_chip; } EXPORT_SYMBOL_GPL(is_pnv_opal_msi); -static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg) +static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int xive_num, + unsigned int is_64, struct msi_msg *msg) { struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); - unsigned int xive_num = hwirq - phb->msi_base; __be32 data; int rc; + dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__, + is_64 ? "64" : "32", xive_num); + /* No PE assigned ? bail out ... no MSI for you ! 
*/ if (pe == NULL) return -ENXIO; @@ -3007,17 +1704,188 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, } msg->data = be32_to_cpu(data); - pnv_set_msi_irq_chip(phb, virq); + return 0; +} + +static void pnv_msi_shutdown(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_shutdown) + d->chip->irq_shutdown(d); +} + +static bool pnv_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *real_parent, struct msi_domain_info *info) +{ + struct irq_chip *chip = info->chip; + + if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info)) + return false; + + chip->irq_shutdown = pnv_msi_shutdown; + return true; +} + +#define PNV_PCI_MSI_FLAGS_REQUIRED (MSI_FLAG_USE_DEF_DOM_OPS | \ + MSI_FLAG_USE_DEF_CHIP_OPS | \ + MSI_FLAG_PCI_MSI_MASK_PARENT) +#define PNV_PCI_MSI_FLAGS_SUPPORTED (MSI_GENERIC_FLAGS_MASK | \ + MSI_FLAG_PCI_MSIX | \ + MSI_FLAG_MULTI_PCI_MSI) + +static const struct msi_parent_ops pnv_msi_parent_ops = { + .required_flags = PNV_PCI_MSI_FLAGS_REQUIRED, + .supported_flags = PNV_PCI_MSI_FLAGS_SUPPORTED, + .chip_flags = MSI_CHIP_FLAG_SET_EOI, + .bus_select_token = DOMAIN_BUS_NEXUS, + .bus_select_mask = MATCH_PCI_MSI, + .prefix = "PNV-", + .init_dev_msi_info = pnv_init_dev_msi_info, +}; + +static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_data_get_msi_desc(d); + struct pci_dev *pdev = msi_desc_to_pci_dev(entry); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + int rc; + + rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq, + entry->pci.msi_attrib.is_64, msg); + if (rc) + dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n", + entry->pci.msi_attrib.is_64 ? "64" : "32", d->hwirq, rc); +} + +/* + * The IRQ data is mapped in the MSI domain in which HW IRQ numbers + * correspond to vector numbers. + */ +static void pnv_msi_eoi(struct irq_data *d) +{ + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_PHB3) { + /* + * The EOI OPAL call takes an OPAL HW IRQ number but + * since it is translated into a vector number in + * OPAL, use that directly. 
+ */ + WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq)); + } + + irq_chip_eoi_parent(d); +} + +static struct irq_chip pnv_msi_irq_chip = { + .name = "PNV-MSI", + .irq_shutdown = pnv_msi_shutdown, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = pnv_msi_eoi, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = pnv_msi_compose_msg, +}; + +static int pnv_irq_parent_domain_alloc(struct irq_domain *domain, + unsigned int virq, int hwirq) +{ + struct irq_fwspec parent_fwspec; + int ret; + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 2; + parent_fwspec.param[0] = hwirq; + parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); + if (ret) + return ret; + + return 0; +} + +static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct pci_controller *hose = domain->host_data; + struct pnv_phb *phb = hose->private_data; + msi_alloc_info_t *info = arg; + struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc); + int hwirq; + int i, ret; + + hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs); + if (hwirq < 0) { + dev_warn(&pdev->dev, "failed to find a free MSI\n"); + return -ENOSPC; + } + + dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__, + hose->dn, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + ret = pnv_irq_parent_domain_alloc(domain, virq + i, + phb->msi_base + hwirq + i); + if (ret) + goto out; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &pnv_msi_irq_chip, hose); + } + + return 0; - pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," - " address=%x_%08x data=%x PE# %x\n", - pci_name(dev), is_64 ? 
"64" : "32", hwirq, xive_num, - msg->address_hi, msg->address_lo, data, pe->pe_number); +out: + irq_domain_free_irqs_parent(domain, virq, i); + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs); + return ret; +} + +static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + pr_debug("%s bridge %pOF %d/%lx #%d\n", __func__, hose->dn, + virq, d->hwirq, nr_irqs); + + msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs); + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +} + +static const struct irq_domain_ops pnv_irq_domain_ops = { + .select = msi_lib_irq_domain_select, + .alloc = pnv_irq_domain_alloc, + .free = pnv_irq_domain_free, +}; + +static int __init pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count) +{ + struct irq_domain *parent = irq_get_default_domain(); + struct irq_domain_info info = { + .fwnode = of_fwnode_handle(hose->dn), + .ops = &pnv_irq_domain_ops, + .host_data = hose, + .size = count, + .parent = parent, + }; + + hose->dev_domain = msi_create_parent_irq_domain(&info, &pnv_msi_parent_ops); + if (!hose->dev_domain) { + pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); + return -ENOMEM; + } return 0; } -static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) +static void __init pnv_pci_init_ioda_msis(struct pnv_phb *phb) { unsigned int count; const __be32 *prop = of_get_property(phb->hose->dn, @@ -3037,105 +1905,11 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) return; } - phb->msi_setup = pnv_pci_ioda_msi_setup; - phb->msi32_support = 1; pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); -} -#else -static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { } -#endif /* CONFIG_PCI_MSI */ - -#ifdef CONFIG_PCI_IOV -static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - const resource_size_t gate = phb->ioda.m64_segsize >> 2; - struct resource *res; - int i; - resource_size_t size, total_vf_bar_sz; - struct pci_dn *pdn; - int mul, total_vfs; - - if (!pdev->is_physfn || pdev->is_added) - return; - - pdn = pci_get_pdn(pdev); - pdn->vfs_expanded = 0; - pdn->m64_single_mode = false; - - total_vfs = pci_sriov_get_totalvfs(pdev); - mul = phb->ioda.total_pe_num; - total_vf_bar_sz = 0; - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || res->parent) - continue; - if (!pnv_pci_is_m64_flags(res->flags)) { - dev_warn(&pdev->dev, "Don't support SR-IOV with" - " non M64 VF BAR%d: %pR. \n", - i, res); - goto truncate_iov; - } - - total_vf_bar_sz += pci_iov_resource_size(pdev, - i + PCI_IOV_RESOURCES); - /* - * If bigger than quarter of M64 segment size, just round up - * power of two. - * - * Generally, one M64 BAR maps one IOV BAR. To avoid conflict - * with other devices, IOV BAR size is expanded to be - * (total_pe * VF_BAR_size). When VF_BAR_size is half of M64 - * segment size , the expanded size would equal to half of the - * whole M64 space size, which will exhaust the M64 Space and - * limit the system flexibility. This is a design decision to - * set the boundary to quarter of the M64 segment size. 
- */ - if (total_vf_bar_sz > gate) { - mul = roundup_pow_of_two(total_vfs); - dev_info(&pdev->dev, - "VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n", - total_vf_bar_sz, gate, mul); - pdn->m64_single_mode = true; - break; - } - } - - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - if (!res->flags || res->parent) - continue; - - size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES); - /* - * On PHB3, the minimum size alignment of M64 BAR in single - * mode is 32MB. - */ - if (pdn->m64_single_mode && (size < SZ_32M)) - goto truncate_iov; - dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res); - res->end = res->start + size * mul - 1; - dev_dbg(&pdev->dev, " %pR\n", res); - dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)", - i, res, mul); - } - pdn->vfs_expanded = mul; - - return; - -truncate_iov: - /* To save MMIO space, IOV BAR is truncated. */ - for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { - res = &pdev->resource[i + PCI_IOV_RESOURCES]; - res->flags = 0; - res->end = res->start - 1; - } + pnv_msi_allocate_domains(phb->hose, count); } -#endif /* CONFIG_PCI_IOV */ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, struct resource *res) @@ -3145,7 +1919,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, int index; int64_t rc; - if (!res || !res->flags || res->start > res->end) + if (!res || !res->flags || res->start > res->end || + res->flags & IORESOURCE_UNSET) return; if (res->flags & IORESOURCE_IO) { @@ -3196,7 +1971,7 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, /* * This function is supposed to be called on basis of PE from top - * to bottom style. So the the I/O or MMIO segment assigned to + * to bottom style. So the I/O or MMIO segment assigned to * parent PE could be overridden by its child PEs if necessary. */ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) @@ -3231,19 +2006,9 @@ static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) #ifdef CONFIG_DEBUG_FS static int pnv_pci_diag_data_set(void *data, u64 val) { - struct pci_controller *hose; - struct pnv_phb *phb; + struct pnv_phb *phb = data; s64 ret; - if (val != 1ULL) - return -EINVAL; - - hose = (struct pci_controller *)data; - if (!hose || !hose->private_data) - return -ENODEV; - - phb = hose->private_data; - /* Retrieve the diag data from firmware */ ret = opal_pci_get_phb_diag_data2(phb->opal_id, phb->diag_data, phb->diag_data_size); @@ -3255,8 +2020,35 @@ static int pnv_pci_diag_data_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, - pnv_pci_diag_data_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_diag_data_fops, NULL, pnv_pci_diag_data_set, + "%llu\n"); + +static int pnv_pci_ioda_pe_dump(void *data, u64 val) +{ + struct pnv_phb *phb = data; + int pe_num; + + for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { + struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_num]; + + if (!test_bit(pe_num, phb->ioda.pe_alloc)) + continue; + + pe_warn(pe, "rid: %04x dev count: %2d flags: %s%s%s%s%s%s\n", + pe->rid, pe->device_count, + (pe->flags & PNV_IODA_PE_DEV) ? "dev " : "", + (pe->flags & PNV_IODA_PE_BUS) ? "bus " : "", + (pe->flags & PNV_IODA_PE_BUS_ALL) ? "all " : "", + (pe->flags & PNV_IODA_PE_MASTER) ? "master " : "", + (pe->flags & PNV_IODA_PE_SLAVE) ? "slave " : "", + (pe->flags & PNV_IODA_PE_VF) ? 
"vf " : ""); + } + + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(pnv_pci_ioda_pe_dump_fops, NULL, + pnv_pci_ioda_pe_dump, "%llu\n"); #endif /* CONFIG_DEBUG_FS */ @@ -3270,32 +2062,60 @@ static void pnv_pci_ioda_create_dbgfs(void) list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { phb = hose->private_data; - /* Notify initialization of PHB done */ - phb->initialized = 1; - sprintf(name, "PCI%04x", hose->global_number); - phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); - if (!phb->dbgfs) { - pr_warning("%s: Error on creating debugfs on PHB#%x\n", - __func__, hose->global_number); - continue; - } + phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir); - debugfs_create_file("dump_diag_regs", 0200, phb->dbgfs, hose, - &pnv_pci_diag_data_fops); + debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs, + phb, &pnv_pci_diag_data_fops); + debugfs_create_file_unsafe("dump_ioda_pe_state", 0200, phb->dbgfs, + phb, &pnv_pci_ioda_pe_dump_fops); } #endif /* CONFIG_DEBUG_FS */ } +static void pnv_pci_enable_bridge(struct pci_bus *bus) +{ + struct pci_dev *dev = bus->self; + struct pci_bus *child; + + /* Empty bus ? bail */ + if (list_empty(&bus->devices)) + return; + + /* + * If there's a bridge associated with that bus enable it. This works + * around races in the generic code if the enabling is done during + * parallel probing. This can be removed once those races have been + * fixed. + */ + if (dev) { + int rc = pci_enable_device(dev); + if (rc) + pci_err(dev, "Error enabling bridge (%d)\n", rc); + pci_set_master(dev); + } + + /* Perform the same to child busses */ + list_for_each_entry(child, &bus->children, node) + pnv_pci_enable_bridge(child); +} + +static void pnv_pci_enable_bridges(void) +{ + struct pci_controller *hose; + + list_for_each_entry(hose, &hose_list, list_node) + pnv_pci_enable_bridge(hose->bus); +} + static void pnv_pci_ioda_fixup(void) { - pnv_pci_ioda_setup_PEs(); - pnv_pci_ioda_setup_iommu_api(); pnv_pci_ioda_create_dbgfs(); + pnv_pci_enable_bridges(); + #ifdef CONFIG_EEH - eeh_init(); - eeh_addr_cache_build(); + pnv_eeh_post_init(); #endif } @@ -3314,10 +2134,9 @@ static void pnv_pci_ioda_fixup(void) static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus, unsigned long type) { - struct pci_dev *bridge; - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); int num_pci_bridges = 0; + struct pci_dev *bridge; bridge = bus->self; while (bridge) { @@ -3401,33 +2220,20 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus, } } -static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type) +static void pnv_pci_configure_bus(struct pci_bus *bus) { - struct pci_controller *hose = pci_bus_to_host(bus); - struct pnv_phb *phb = hose->private_data; struct pci_dev *bridge = bus->self; struct pnv_ioda_pe *pe; - bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE); + bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE); - /* Extend bridge's windows if necessary */ - pnv_pci_fixup_bridge_resources(bus, type); - - /* The PE for root bus should be realized before any one else */ - if (!phb->ioda.root_pe_populated) { - pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false); - if (pe) { - phb->ioda.root_pe_idx = pe->pe_number; - phb->ioda.root_pe_populated = true; - } - } + dev_info(&bus->dev, "Configuring PE for bus\n"); /* Don't assign PE to PCI bus, which doesn't have subordinate devices */ - if (list_empty(&bus->devices)) 
+ if (WARN_ON(list_empty(&bus->devices))) return; /* Reserve PEs according to used M64 resources */ - if (phb->reserve_m64_pe) - phb->reserve_m64_pe(bus, NULL, all); + pnv_ioda_reserve_m64_pe(bus, NULL, all); /* * Assign PE. We might run here because of partial hotplug. @@ -3439,17 +2245,6 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type) return; pnv_ioda_setup_pe_seg(pe); - switch (phb->type) { - case PNV_PHB_IODA1: - pnv_pci_ioda1_setup_dma_pe(phb, pe); - break; - case PNV_PHB_IODA2: - pnv_pci_ioda2_setup_dma_pe(phb, pe); - break; - default: - pr_warn("%s: No DMA for PHB#%x (type %d)\n", - __func__, phb->hose->global_number, phb->type); - } } static resource_size_t pnv_pci_default_alignment(void) @@ -3457,134 +2252,50 @@ static resource_size_t pnv_pci_default_alignment(void) return PAGE_SIZE; } -#ifdef CONFIG_PCI_IOV -static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev, - int resno) -{ - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; - struct pci_dn *pdn = pci_get_pdn(pdev); - resource_size_t align; - - /* - * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the - * SR-IOV. While from hardware perspective, the range mapped by M64 - * BAR should be size aligned. - * - * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra - * powernv-specific hardware restriction is gone. But if just use the - * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with - * in one segment of M64 #15, which introduces the PE conflict between - * PF and VF. Based on this, the minimum alignment of an IOV BAR is - * m64_segsize. - * - * This function returns the total IOV BAR size if M64 BAR is in - * Shared PE mode or just VF BAR size if not. - * If the M64 BAR is in Single PE mode, return the VF BAR size or - * M64 segment size if IOV BAR size is less. - */ - align = pci_iov_resource_size(pdev, resno); - if (!pdn->vfs_expanded) - return align; - if (pdn->m64_single_mode) - return max(align, (resource_size_t)phb->ioda.m64_segsize); - - return pdn->vfs_expanded * align; -} -#endif /* CONFIG_PCI_IOV */ - /* Prevent enabling devices for which we couldn't properly * assign a PE */ -bool pnv_pci_enable_device_hook(struct pci_dev *dev) +static bool pnv_pci_enable_device_hook(struct pci_dev *dev) { - struct pci_controller *hose = pci_bus_to_host(dev->bus); - struct pnv_phb *phb = hose->private_data; struct pci_dn *pdn; - /* The function is probably called while the PEs have - * not be created yet. For example, resource reassignment - * during PCI probe period. We just skip the check if - * PEs isn't ready. 
- */ - if (!phb->initialized) - return true; - pdn = pci_get_pdn(dev); - if (!pdn || pdn->pe_number == IODA_INVALID_PE) + if (!pdn || pdn->pe_number == IODA_INVALID_PE) { + pci_err(dev, "pci_enable_device() blocked, no PE assigned.\n"); return false; - - return true; -} - -static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group, - int num) -{ - struct pnv_ioda_pe *pe = container_of(table_group, - struct pnv_ioda_pe, table_group); - struct pnv_phb *phb = pe->phb; - unsigned int idx; - long rc; - - pe_info(pe, "Removing DMA window #%d\n", num); - for (idx = 0; idx < phb->ioda.dma32_count; idx++) { - if (phb->ioda.dma32_segmap[idx] != pe->pe_number) - continue; - - rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number, - idx, 0, 0ul, 0ul, 0ul); - if (rc != OPAL_SUCCESS) { - pe_warn(pe, "Failure %ld unmapping DMA32 segment#%d\n", - rc, idx); - return rc; - } - - phb->ioda.dma32_segmap[idx] = IODA_INVALID_PE; } - pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); - return OPAL_SUCCESS; + return true; } -static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe) +static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev) { - unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); - struct iommu_table *tbl = pe->table_group.tables[0]; - int64_t rc; - - if (!weight) - return; + struct pci_dn *pdn; + struct pnv_ioda_pe *pe; - rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0); - if (rc != OPAL_SUCCESS) - return; + pdn = pci_get_pdn(dev); + if (!pdn) + return false; - pnv_pci_p7ioc_tce_invalidate(tbl, tbl->it_offset, tbl->it_size, false); - if (pe->table_group.group) { - iommu_group_put(pe->table_group.group); - WARN_ON(pe->table_group.group); + if (pdn->pe_number == IODA_INVALID_PE) { + pe = pnv_ioda_setup_dev_PE(dev); + if (!pe) + return false; } - - free_pages(tbl->it_base, get_order(tbl->it_size << 3)); - iommu_tce_table_put(tbl); + return true; } -static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) +void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) { struct iommu_table *tbl = pe->table_group.tables[0]; - unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe); -#ifdef CONFIG_IOMMU_API int64_t rc; -#endif - if (!weight) + if (!pe->dma_setup_done) return; -#ifdef CONFIG_IOMMU_API rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (rc) - pe_warn(pe, "OPAL error %ld release DMA window\n", rc); -#endif + pe_warn(pe, "OPAL error %lld release DMA window\n", rc); pnv_pci_ioda2_set_bypass(pe, false); if (pe->table_group.group) { @@ -3592,7 +2303,6 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) WARN_ON(pe->table_group.group); } - pnv_pci_ioda2_table_free_pages(tbl); iommu_tce_table_put(tbl); } @@ -3608,17 +2318,11 @@ static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, if (map[idx] != pe->pe_number) continue; - if (win == OPAL_M64_WINDOW_TYPE) - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - phb->ioda.reserved_pe_idx, win, - idx / PNV_IODA1_M64_SEGS, - idx % PNV_IODA1_M64_SEGS); - else - rc = opal_pci_map_pe_mmio_window(phb->opal_id, - phb->ioda.reserved_pe_idx, win, 0, idx); + rc = opal_pci_map_pe_mmio_window(phb->opal_id, + phb->ioda.reserved_pe_idx, win, 0, idx); if (rc != OPAL_SUCCESS) - pe_warn(pe, "Error %ld unmapping (%d) segment#%d\n", + pe_warn(pe, "Error %lld unmapping (%d) segment#%d\n", rc, win, idx); map[idx] = IODA_INVALID_PE; @@ -3629,14 +2333,7 @@ static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe) { struct pnv_phb *phb = pe->phb; - if (phb->type == PNV_PHB_IODA1) { - 
pnv_ioda_free_pe_seg(pe, OPAL_IO_WINDOW_TYPE, - phb->ioda.io_segmap); - pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, - phb->ioda.m32_segmap); - pnv_ioda_free_pe_seg(pe, OPAL_M64_WINDOW_TYPE, - phb->ioda.m64_segmap); - } else if (phb->type == PNV_PHB_IODA2) { + if (phb->type == PNV_PHB_IODA2) { pnv_ioda_free_pe_seg(pe, OPAL_M32_WINDOW_TYPE, phb->ioda.m32_segmap); } @@ -3647,14 +2344,18 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) struct pnv_phb *phb = pe->phb; struct pnv_ioda_pe *slave, *tmp; + pe_info(pe, "Releasing PE\n"); + + mutex_lock(&phb->ioda.pe_list_mutex); list_del(&pe->list); + mutex_unlock(&phb->ioda.pe_list_mutex); + switch (phb->type) { - case PNV_PHB_IODA1: - pnv_pci_ioda1_release_pe_dma(pe); - break; case PNV_PHB_IODA2: pnv_pci_ioda2_release_pe_dma(pe); break; + case PNV_PHB_NPU_OCAPI: + break; default: WARN_ON(1); } @@ -3676,26 +2377,35 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe) * that it can be populated again in PCI hot add path. The PE * shouldn't be destroyed as it's the global reserved resource. */ - if (phb->ioda.root_pe_populated && - phb->ioda.root_pe_idx == pe->pe_number) - phb->ioda.root_pe_populated = false; - else - pnv_ioda_free_pe(pe); + if (phb->ioda.root_pe_idx == pe->pe_number) + return; + + pnv_ioda_free_pe(pe); } static void pnv_pci_release_device(struct pci_dev *pdev) { - struct pci_controller *hose = pci_bus_to_host(pdev->bus); - struct pnv_phb *phb = hose->private_data; + struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); struct pci_dn *pdn = pci_get_pdn(pdev); struct pnv_ioda_pe *pe; + /* The VF PE state is torn down when sriov_disable() is called */ if (pdev->is_virtfn) return; if (!pdn || pdn->pe_number == IODA_INVALID_PE) return; +#ifdef CONFIG_PCI_IOV + /* + * FIXME: Try move this to sriov_disable(). It's here since we allocate + * the iov state at probe time since we need to fiddle with the IOV + * resources. + */ + if (pdev->is_physfn) + kfree(pdev->dev.archdata.iov_data); +#endif + /* * PCI hotplug can happen as part of EEH error recovery. The @pdn * isn't removed and added afterwards in this scenario. 
We should @@ -3720,63 +2430,68 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose) OPAL_ASSERT_RESET); } -static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, - .dma_bus_setup = pnv_pci_dma_bus_setup, -#ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, -#endif - .enable_device_hook = pnv_pci_enable_device_hook, - .release_device = pnv_pci_release_device, - .window_alignment = pnv_pci_window_alignment, - .setup_bridge = pnv_pci_setup_bridge, - .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, - .shutdown = pnv_pci_ioda_shutdown, -}; - -static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) +static void pnv_pci_ioda_dma_bus_setup(struct pci_bus *bus) { - dev_err_once(&npdev->dev, - "%s operation unsupported for NVLink devices\n", - __func__); - return -EPERM; + struct pnv_phb *phb = pci_bus_to_pnvhb(bus); + struct pnv_ioda_pe *pe; + + list_for_each_entry(pe, &phb->ioda.pe_list, list) { + if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))) + continue; + + if (!pe->pbus) + continue; + + if (bus->number == ((pe->rid >> 8) & 0xFF)) { + pe->pbus = bus; + break; + } + } } -static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, -#ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, +#ifdef CONFIG_IOMMU_API +static struct iommu_group *pnv_pci_device_group(struct pci_controller *hose, + struct pci_dev *pdev) +{ + struct pnv_phb *phb = hose->private_data; + struct pnv_ioda_pe *pe; + + if (WARN_ON(!phb)) + return ERR_PTR(-ENODEV); + + pe = pnv_pci_bdfn_to_pe(phb, pci_dev_id(pdev)); + if (!pe) + return ERR_PTR(-ENODEV); + + if (!pe->table_group.group) + return ERR_PTR(-ENODEV); + + return iommu_group_ref_get(pe->table_group.group); +} #endif + +static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { + .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, + .dma_bus_setup = pnv_pci_ioda_dma_bus_setup, + .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, .enable_device_hook = pnv_pci_enable_device_hook, + .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, + .setup_bridge = pnv_pci_fixup_bridge_resources, .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_npu_dma_set_mask, .shutdown = pnv_pci_ioda_shutdown, +#ifdef CONFIG_IOMMU_API + .device_group = pnv_pci_device_group, +#endif }; -#ifdef CONFIG_CXL_BASE -const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops = { - .dma_dev_setup = pnv_pci_dma_dev_setup, - .dma_bus_setup = pnv_pci_dma_bus_setup, -#ifdef CONFIG_PCI_MSI - .setup_msi_irqs = pnv_cxl_cx4_setup_msi_irqs, - .teardown_msi_irqs = pnv_cxl_cx4_teardown_msi_irqs, -#endif - .enable_device_hook = pnv_cxl_enable_device_hook, - .disable_device = pnv_cxl_disable_device, +static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = { + .enable_device_hook = pnv_ocapi_enable_device_hook, .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, - .setup_bridge = pnv_pci_setup_bridge, .reset_secondary_bus = pnv_pci_reset_secondary_bus, - .dma_set_mask = pnv_pci_ioda_dma_set_mask, - .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask, .shutdown = pnv_pci_ioda_shutdown, }; -#endif static void __init 
pnv_pci_init_ioda_phb(struct device_node *np, u64 hub_id, int ioda_type) @@ -3784,7 +2499,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, struct pci_controller *hose; struct pnv_phb *phb; unsigned long size, m64map_off, m32map_off, pemap_off; - unsigned long iomap_off = 0, dma32map_off = 0; + struct pnv_ioda_pe *root_pe; struct resource r; const __be64 *prop64; const __be32 *prop32; @@ -3797,8 +2512,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, if (!of_device_is_available(np)) return; - pr_info("Initializing %s PHB (%s)\n", - pnv_phb_names[ioda_type], of_node_full_name(np)); + pr_info("Initializing %s PHB (%pOF)\n", pnv_phb_names[ioda_type], np); prop64 = of_get_property(np, "ibm,opal-phbid", NULL); if (!prop64) { @@ -3808,14 +2522,17 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb_id = be64_to_cpup(prop64); pr_debug(" PHB-ID : 0x%016llx\n", phb_id); - phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0); + phb = kzalloc(sizeof(*phb), GFP_KERNEL); + if (!phb) + panic("%s: Failed to allocate %zu bytes\n", __func__, + sizeof(*phb)); /* Allocate PCI controller */ phb->hose = hose = pcibios_alloc_controller(np); if (!phb->hose) { - pr_err(" Can't allocate PCI controller for %s\n", - np->full_name); - memblock_free(__pa(phb), sizeof(struct pnv_phb)); + pr_err(" Can't allocate PCI controller for %pOF\n", + np); + memblock_free(phb, sizeof(struct pnv_phb)); return; } @@ -3825,7 +2542,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, hose->first_busno = be32_to_cpu(prop32[0]); hose->last_busno = be32_to_cpu(prop32[1]); } else { - pr_warn(" Broken <bus-range> on %s\n", np->full_name); + pr_warn(" Broken <bus-range> on %pOF\n", np); hose->first_busno = 0; hose->last_busno = 0xff; } @@ -3840,10 +2557,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->model = PNV_PHB_MODEL_P7IOC; else if (of_device_is_compatible(np, "ibm,power8-pciex")) phb->model = PNV_PHB_MODEL_PHB3; - else if (of_device_is_compatible(np, "ibm,power8-npu-pciex")) - phb->model = PNV_PHB_MODEL_NPU; - else if (of_device_is_compatible(np, "ibm,power9-npu-pciex")) - phb->model = PNV_PHB_MODEL_NPU2; else phb->model = PNV_PHB_MODEL_UNKNOWN; @@ -3854,7 +2567,10 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, else phb->diag_data_size = PNV_PCI_DIAG_BUF_SIZE; - phb->diag_data = memblock_virt_alloc(phb->diag_data_size, 0); + phb->diag_data = kzalloc(phb->diag_data_size, GFP_KERNEL); + if (!phb->diag_data) + panic("%s: Failed to allocate %u bytes\n", __func__, + phb->diag_data_size); /* Parse 32-bit and IO ranges (if any) */ pci_process_bridge_OF_ranges(hose, np, !hose->global_number); @@ -3893,27 +2609,19 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num; phb->ioda.io_pci_base = 0; /* XXX calculate this ? */ - /* Calculate how many 32-bit TCE segments we have */ - phb->ioda.dma32_count = phb->ioda.m32_pci_base / - PNV_IODA1_DMA32_SEGSIZE; - /* Allocate aux data & arrays. 
We don't have IO ports on PHB3 */ - size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8, + size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8, sizeof(unsigned long)); m64map_off = size; size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]); m32map_off = size; size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]); - if (phb->type == PNV_PHB_IODA1) { - iomap_off = size; - size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]); - dma32map_off = size; - size += phb->ioda.dma32_count * - sizeof(phb->ioda.dma32_segmap[0]); - } pemap_off = size; size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe); - aux = memblock_virt_alloc(size, 0); + aux = kzalloc(size, GFP_KERNEL); + if (!aux) + panic("%s: Failed to allocate %lu bytes\n", __func__, size); + phb->ioda.pe_alloc = aux; phb->ioda.m64_segmap = aux + m64map_off; phb->ioda.m32_segmap = aux + m32map_off; @@ -3921,15 +2629,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.m64_segmap[segno] = IODA_INVALID_PE; phb->ioda.m32_segmap[segno] = IODA_INVALID_PE; } - if (phb->type == PNV_PHB_IODA1) { - phb->ioda.io_segmap = aux + iomap_off; - for (segno = 0; segno < phb->ioda.total_pe_num; segno++) - phb->ioda.io_segmap[segno] = IODA_INVALID_PE; - - phb->ioda.dma32_segmap = aux + dma32map_off; - for (segno = 0; segno < phb->ioda.dma32_count; segno++) - phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE; - } phb->ioda.pe_array = aux + pemap_off; /* @@ -3945,16 +2644,14 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1; pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx); } else { - phb->ioda.root_pe_idx = IODA_INVALID_PE; + /* otherwise just allocate one */ + root_pe = pnv_ioda_alloc_pe(phb, 1); + phb->ioda.root_pe_idx = root_pe->pe_number; } INIT_LIST_HEAD(&phb->ioda.pe_list); mutex_init(&phb->ioda.pe_list_mutex); - /* Calculate how many 32-bit TCE segments we have */ - phb->ioda.dma32_count = phb->ioda.m32_pci_base / - PNV_IODA1_DMA32_SEGSIZE; - #if 0 /* We should really do that ... */ rc = opal_pci_set_phb_mem_window(opal->phb_id, window_type, @@ -3992,18 +2689,21 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, */ ppc_md.pcibios_fixup = pnv_pci_ioda_fixup; - if (phb->type == PNV_PHB_NPU) { - hose->controller_ops = pnv_npu_ioda_controller_ops; - } else { - phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup; + switch (phb->type) { + case PNV_PHB_NPU_OCAPI: + hose->controller_ops = pnv_npu_ocapi_ioda_controller_ops; + break; + default: hose->controller_ops = pnv_pci_ioda_controller_ops; } ppc_md.pcibios_default_alignment = pnv_pci_default_alignment; #ifdef CONFIG_PCI_IOV - ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources; + ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov; ppc_md.pcibios_iov_resource_alignment = pnv_pci_iov_resource_alignment; + ppc_md.pcibios_sriov_enable = pnv_pcibios_sriov_enable; + ppc_md.pcibios_sriov_disable = pnv_pcibios_sriov_disable; #endif pci_add_flags(PCI_REASSIGN_ALL_RSRC); @@ -4011,15 +2711,19 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* Reset IODA tables to a clean state */ rc = opal_pci_reset(phb_id, OPAL_RESET_PCI_IODA_TABLE, OPAL_ASSERT_RESET); if (rc) - pr_warning(" OPAL Error %ld performing IODA table reset !\n", rc); + pr_warn(" OPAL Error %ld performing IODA table reset !\n", rc); /* * If we're running in kdump kernel, the previous kernel never * shutdown PCI devices correctly. 
We already got IODA table * cleaned out. So we have to issue PHB reset to stop all PCI - * transactions from previous kernel. + * transactions from previous kernel. The ppc_pci_reset_phbs + * kernel parameter will force this reset too. Additionally, + * if the IODA reset above failed then use a bigger hammer. + * This can happen if we get a PHB fatal error in very early + * boot. */ - if (is_kdump_kernel()) { + if (is_kdump_kernel() || pci_reset_phbs || rc) { pr_info(" Issue PHB reset ...\n"); pnv_eeh_phb_reset(hose, EEH_RESET_FUNDAMENTAL); pnv_eeh_phb_reset(hose, EEH_RESET_DEACTIVATE); @@ -4028,6 +2732,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, /* Remove M64 resource if we can't configure it successfully */ if (!phb->init_m64 || phb->init_m64(phb)) hose->mem_resources[1].flags = 0; + + /* create pci_dn's for DT nodes under this PHB */ + pci_devs_phb_init_dynamic(hose); } void __init pnv_pci_init_ioda2_phb(struct device_node *np) @@ -4035,31 +2742,19 @@ void __init pnv_pci_init_ioda2_phb(struct device_node *np) pnv_pci_init_ioda_phb(np, 0, PNV_PHB_IODA2); } -void __init pnv_pci_init_npu_phb(struct device_node *np) +void __init pnv_pci_init_npu2_opencapi_phb(struct device_node *np) { - pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU); + pnv_pci_init_ioda_phb(np, 0, PNV_PHB_NPU_OCAPI); } -void __init pnv_pci_init_ioda_hub(struct device_node *np) +static void pnv_npu2_opencapi_cfg_size_fixup(struct pci_dev *dev) { - struct device_node *phbn; - const __be64 *prop64; - u64 hub_id; + struct pnv_phb *phb = pci_bus_to_pnvhb(dev->bus); - pr_info("Probing IODA IO-Hub %s\n", np->full_name); - - prop64 = of_get_property(np, "ibm,opal-hubid", NULL); - if (!prop64) { - pr_err(" Missing \"ibm,opal-hubid\" property !\n"); + if (!machine_is(powernv)) return; - } - hub_id = be64_to_cpup(prop64); - pr_devel(" HUB-ID : 0x%016llx\n", hub_id); - /* Count child PHBs */ - for_each_child_of_node(np, phbn) { - /* Look for IODA1 PHBs */ - if (of_device_is_compatible(phbn, "ibm,ioda-phb")) - pnv_pci_init_ioda_phb(phbn, hub_id, PNV_PHB_IODA1); - } + if (phb->type == PNV_PHB_NPU_OCAPI) + dev->cfg_size = PCI_CFG_SPACE_EXP_SIZE; } +DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pnv_npu2_opencapi_cfg_size_fixup); |
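Worked example (not part of the patch): the new default-window sizing in pnv_pci_ioda2_setup_default_config() above caps the window so that the iommu_table::it_map bitmap (1 bit per IOMMU page) still fits in the largest physically contiguous allocation, i.e. window_size = min((maxblock * 8) << PAGE_SHIFT, max_memory) with maxblock = 1 << (PAGE_SHIFT + MAX_PAGE_ORDER), and it goes multilevel once a single TCE level would exceed maxblock. The userspace sketch below only reproduces that arithmetic; PAGE_SHIFT, MAX_PAGE_ORDER, POWERNV_IOMMU_DEFAULT_LEVELS and max_memory are assumed example values (64K pages, order-10 allocations, a default of 2 levels, 1 TiB of RAM), not values taken from any particular kernel config.

/*
 * Illustrative sketch of the default DMA window sizing done by
 * pnv_pci_ioda2_setup_default_config() in the diff above.  All the
 * numeric parameters below are assumptions for the example only.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT			16	/* assumed: 64K pages */
#define MAX_PAGE_ORDER			10	/* assumed buddy allocator limit */
#define POWERNV_IOMMU_DEFAULT_LEVELS	2	/* assumed default level count */

static unsigned int ilog2_u64(uint64_t v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	const uint64_t max_memory = 1ULL << 40;		/* assumed: 1 TiB of RAM */
	/* Largest physically contiguous allocation one TCE level may use */
	const uint64_t maxblock = 1ULL << (PAGE_SHIFT + MAX_PAGE_ORDER);
	/* it_map uses 1 bit per IOMMU page, hence the factor of 8 */
	uint64_t window_size = (maxblock * 8) << PAGE_SHIFT;
	unsigned int tces_order, tcelevel_order, levels;

	if (window_size > max_memory)
		window_size = max_memory;

	/* Each TCE level cannot exceed maxblock, so go multilevel if needed */
	tces_order = ilog2_u64(window_size >> PAGE_SHIFT);
	tcelevel_order = ilog2_u64(maxblock >> 3);	/* 8 bytes per TCE */
	levels = tces_order / tcelevel_order;
	if (tces_order % tcelevel_order)
		levels += 1;
	if (levels < POWERNV_IOMMU_DEFAULT_LEVELS)
		levels = POWERNV_IOMMU_DEFAULT_LEVELS;

	printf("default window: %llu GiB, TCE levels: %u\n",
	       (unsigned long long)(window_size >> 30), levels);
	return 0;
}

With these assumed numbers the first window spans 1 TiB and uses two TCE levels, which is consistent with the patch's comment that the default level count is now greater than one and that deeper levels are allocated on demand.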
