From 191c22879fbcfd98a7fe9a51786ef41253b1549b Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:45 +1000 Subject: powerpc/powernv: Move TCE manupulation code to its own file Right now we have allocation code in pci-ioda.c and traversing code in pci.c, let's keep them together. However both files are big enough already so let's move this business to a new file. While we are at it, move the code which links IOMMU table groups to IOMMU tables as it is not specific to any PNV PHB model. This puts exported symbols from the new file together. This fixes several warnings from checkpatch.pl like this: "WARNING: Prefer 'unsigned int' to bare use of 'unsigned'". As this is almost cut-n-paste, there should be no behavioral change. Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci.h | 41 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/platforms/powernv/pci.h') diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index eada4b6068cb..fa90f60e89ce 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -201,13 +201,6 @@ struct pnv_phb { }; extern struct pci_ops pnv_pci_ops; -extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, enum dma_data_direction direction, - unsigned long attrs); -extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); -extern int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction); -extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); void pnv_pci_dump_phb_diag_data(struct pci_controller *hose, unsigned char *log_buff); @@ -217,14 +210,6 @@ int pnv_pci_cfg_write(struct pci_dn *pdn, int where, int size, u32 val); extern struct iommu_table *pnv_pci_table_alloc(int nid); 
-extern long pnv_pci_link_table_and_group(int node, int num, - struct iommu_table *tbl, - struct iommu_table_group *table_group); -extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, - struct iommu_table_group *table_group); -extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, - void *tce_mem, u64 tce_size, - u64 dma_offset, unsigned page_shift); extern void pnv_pci_init_ioda_hub(struct device_node *np); extern void pnv_pci_init_ioda2_phb(struct device_node *np); extern void pnv_pci_init_npu_phb(struct device_node *np); @@ -272,4 +257,30 @@ extern void pnv_cxl_cx4_teardown_msi_irqs(struct pci_dev *pdev); /* phb ops (cxl switches these when enabling the kernel api on the phb) */ extern const struct pci_controller_ops pnv_cxl_cx4_ioda_controller_ops; +/* pci-ioda-tce.c */ +#define POWERNV_IOMMU_DEFAULT_LEVELS 1 +#define POWERNV_IOMMU_MAX_LEVELS 5 + +extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, + unsigned long uaddr, enum dma_data_direction direction, + unsigned long attrs); +extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); +extern int pnv_tce_xchg(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction); +extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); + +extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, + __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table *tbl); +extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); + +extern long pnv_pci_link_table_and_group(int node, int num, + struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl, + struct iommu_table_group *table_group); +extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl, + void *tce_mem, u64 tce_size, + u64 dma_offset, unsigned int page_shift); + #endif /* __POWERNV_PCI_H */ -- cgit From 
090bad39b237aad92d8e01baa033699cf0c81cbe Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:47 +1000 Subject: powerpc/powernv: Add indirect levels to it_userspace We want to support sparse memory and therefore huge chunks of DMA windows do not need to be mapped. If a DMA window is big enough to require 2 or more indirect levels, and a DMA window is used to map all RAM (which is a default case for 64bit window), we can actually save some memory by not allocating TCEs for regions which we are not going to map anyway. The hardware tables already support indirect levels but we also keep a host-physical-to-userspace translation array which is allocated by vmalloc() and is a flat array which might use quite some memory. This converts it_userspace from a vmalloc'ed array to a multi level table. As the format becomes platform dependent, this replaces the direct access to it_userspace with an iommu_table_ops::useraddrptr hook which returns a pointer to the userspace copy of a TCE; a future extension will return NULL if the level was not allocated. This should not change non-KVM handling of TCE tables and it_userspace will not be allocated for non-KVM tables. 
Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/platforms/powernv/pci.h') diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index fa90f60e89ce..2962f6ddb2a8 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -267,11 +267,12 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction); +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl); + bool alloc_userspace_copy, struct iommu_table *tbl); extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); extern long pnv_pci_link_table_and_group(int node, int num, -- cgit From a68bd1267b7286b1687905651b404e765046de25 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:49 +1000 Subject: powerpc/powernv/ioda: Allocate indirect TCE levels on demand At the moment we allocate the entire TCE table, twice (hardware part and userspace translation cache). This normally works as we normally have contiguous memory and the guest will map entire RAM for 64bit DMA. However if we have sparse RAM (one example is a memory device), then we will allocate TCEs which will never be used as the guest only maps actual memory for DMA. If it is a single level TCE table, there is nothing we can really do but if it is a multilevel table, we can skip allocating TCEs we know we won't need. 
This adds the ability to allocate only the first level, saving memory. This changes iommu_table::free() to avoid allocating an extra level; iommu_table::set() will do this when needed. This adds an @alloc parameter to iommu_table::exchange() to tell the callback if it can allocate an extra level; the flag is set to "false" for the realmode KVM handlers of H_PUT_TCE hcalls and the callback returns H_TOO_HARD. This still requires the entire table to be counted in mm::locked_vm. To be conservative, this only does on-demand allocation when the userspace cache table is requested which is the case of VFIO. The example math for a system replicating a powernv setup with NVLink2 in a guest: 16GB RAM mapped at 0x0 128GB GPU RAM window (16GB of actual RAM) mapped at 0x244000000000 the table to cover that all with 64K pages takes: (((0x244000000000 + 0x2000000000) >> 16)*8)>>20 = 4656MB If we allocate only necessary TCE levels, we will only need: (((0x400000000 + 0x400000000) >> 16)*8)>>20 = 4MB (plus some for indirect levels). 
Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/platforms/powernv/pci.h') diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 2962f6ddb2a8..0020937fc694 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -266,8 +266,10 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long attrs); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction); -extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index); + unsigned long *hpa, enum dma_data_direction *direction, + bool alloc); +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, + bool alloc); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, -- cgit