diff options
Diffstat (limited to 'drivers/vfio/pci')
-rw-r--r-- | drivers/vfio/pci/mlx5/cmd.c | 14 | ||||
-rw-r--r-- | drivers/vfio/pci/nvgrace-gpu/main.c | 169 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_config.c | 13 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_core.c | 40 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_rdwr.c | 38 | ||||
-rw-r--r-- | drivers/vfio/pci/virtio/migrate.c | 6 |
6 files changed, 199 insertions, 81 deletions
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index eb7387ee6ebd..11eda6b207f1 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -408,7 +408,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) buf->dma_dir, 0); } - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); @@ -431,8 +431,8 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -1342,7 +1342,7 @@ static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) { int i; - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for (i = 0; i < recv_buf->npages; i++) __free_page(recv_buf->page_list[i]); @@ -1361,9 +1361,9 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, return -ENOMEM; for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, - npages - done, - recv_buf->page_list + done); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, + npages - done, + recv_buf->page_list + done); if (!filled) goto err; diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index a467085038f0..e5ac39c4cc6b 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -5,6 +5,8 @@ #include <linux/sizes.h> #include <linux/vfio_pci_core.h> +#include <linux/delay.h> +#include <linux/jiffies.h> /* * The device memory usable to the workloads running in the VM is cached @@ -17,12 +19,21 @@ #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX -/* Memory size expected as non cached and reserved by the VM driver */ -#define RESMEM_SIZE SZ_1G - /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ #define MEMBLK_SIZE SZ_512M +#define DVSEC_BITMAP_OFFSET 0xA +#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0) + +#define GPU_CAP_DVSEC_REGISTER 3 + +#define C2C_LINK_BAR0_OFFSET 0x1498 +#define HBM_TRAINING_BAR0_OFFSET 0x200BC +#define STATUS_READY 0xFF + +#define POLL_QUANTUM_MS 1000 +#define POLL_TIMEOUT_MS (30 * 1000) + /* * The state of the two device memory region - resmem and usemem - is * saved as struct mem_region. @@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device { struct mem_region resmem; /* Lock to control device memory kernel mapping */ struct mutex remap_lock; + bool has_mig_hw_bug; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index, if (index == USEMEM_REGION_INDEX) return &nvdev->usemem; - if (index == RESMEM_REGION_INDEX) + if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) return &nvdev->resmem; return NULL; @@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, u64 memphys, u64 memlength) { int ret = 0; + u64 resmem_size = 0; /* - * The VM GPU device driver needs a non-cacheable region to support - * the MIG feature. Since the device memory is mapped as NORMAL cached, - * carve out a region from the end with a different NORMAL_NC - * property (called as reserved memory and represented as resmem). This - * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while - * exposing the rest (termed as usable memory and represented using usemem) - * as cacheable 64b BAR (region 4 and 5). + * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable + * region to support the MIG feature owing to a hardware bug. Since the + * device memory is mapped as NORMAL cached, carve out a region from the end + * with a different NORMAL_NC property (called as reserved memory and + * represented as resmem). This region then is exposed as a 64b BAR + * (region 2 and 3) to the VM, while exposing the rest (termed as usable + * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5). * * devmem (memlength) * |-------------------------------------------------| * | | * usemem.memphys resmem.memphys + * + * This hardware bug is fixed on the Grace Blackwell platforms and the + * presence of the bug can be determined through nvdev->has_mig_hw_bug. + * Thus on systems with the hardware fix, there is no need to partition + * the GPU device memory and the entire memory is usable and mapped as + * NORMAL cached (i.e. resmem size is 0). */ + if (nvdev->has_mig_hw_bug) + resmem_size = SZ_1G; + nvdev->usemem.memphys = memphys; /* * The device memory exposed to the VM is added to the kernel by the - * VM driver module in chunks of memory block size. Only the usable - * memory (usemem) is added to the kernel for usage by the VM - * workloads. Make the usable memory size memblock aligned. + * VM driver module in chunks of memory block size. Note that only the + * usable memory (usemem) is added to the kernel for usage by the VM + * workloads. */ - if (check_sub_overflow(memlength, RESMEM_SIZE, + if (check_sub_overflow(memlength, resmem_size, &nvdev->usemem.memlength)) { ret = -EOVERFLOW; goto done; } /* - * The USEMEM part of the device memory has to be MEMBLK_SIZE - * aligned. This is a hardwired ABI value between the GPU FW and - * VFIO driver. The VM device driver is also aware of it and make - * use of the value for its calculation to determine USEMEM size. + * The usemem region is exposed as a 64B Bar composed of region 4 and 5. + * Calculate and save the BAR size for the region. + */ + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); + + /* + * If the hardware has the fix for MIG, there is no requirement + * for splitting the device memory to create RESMEM. The entire + * device memory is usable and will be USEMEM. Return here for + * such case. + */ + if (!nvdev->has_mig_hw_bug) + goto done; + + /* + * When the device memory is split to workaround the MIG bug on + * Grace Hopper, the USEMEM part of the device memory has to be + * MEMBLK_SIZE aligned. This is a hardwired ABI value between the + * GPU FW and VFIO driver. The VM device driver is also aware of it + * and make use of the value for its calculation to determine USEMEM + * size. Note that the device memory may not be 512M aligned. */ nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, MEMBLK_SIZE); @@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, } /* - * The memory regions are exposed as BARs. Calculate and save - * the BAR size for them. + * The resmem region is exposed as a 64b BAR composed of region 2 and 3 + * for Grace Hopper. Calculate and save the BAR size for the region. */ - nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); done: return ret; } +static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) +{ + int pcie_dvsec; + u16 dvsec_ctrl16; + + pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA, + GPU_CAP_DVSEC_REGISTER); + + if (pcie_dvsec) { + pci_read_config_word(pdev, + pcie_dvsec + DVSEC_BITMAP_OFFSET, + &dvsec_ctrl16); + + if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM) + return false; + } + + return true; +} + +/* + * To reduce the system bootup time, the HBM training has + * been moved out of the UEFI on the Grace-Blackwell systems. + * + * The onus of checking whether the HBM training has completed + * thus falls on the module. The HBM training status can be + * determined from a BAR0 register. + * + * Similarly, another BAR0 register exposes the status of the + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect. + * + * Poll these register and check for 30s. If the HBM training is + * not complete or if the C2C link is not ready, fail the probe. + * + * While the wait is not required on Grace Hopper systems, it + * is beneficial to make the check to ensure the device is in an + * expected state. + * + * Ensure that the BAR0 region is enabled before accessing the + * registers. + */ +static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + void __iomem *io; + int ret = -ETIME; + + ret = pci_enable_device(pdev); + if (ret) + return ret; + + ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME); + if (ret) + goto request_region_exit; + + io = pci_iomap(pdev, 0, 0); + if (!io) { + ret = -ENOMEM; + goto iomap_exit; + } + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { + ret = 0; + goto reg_check_exit; + } + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + +reg_check_exit: + pci_iounmap(pdev, io); +iomap_exit: + pci_release_selected_regions(pdev, 1 << 0); +request_region_exit: + pci_disable_device(pdev); + return ret; +} + static int nvgrace_gpu_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; + ret = nvgrace_gpu_wait_device_ready(pdev); + if (ret) + return ret; + ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); if (!ret) ops = &nvgrace_gpu_pci_ops; @@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, dev_set_drvdata(&pdev->dev, &nvdev->core_device); if (ops == &nvgrace_gpu_pci_ops) { + nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev); + /* * Device memory properties are identified in the host ACPI * table. Set the nvgrace_gpu_pci_core_device structure. @@ -868,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, /* GH200 SKU */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, + /* GB200 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, {} }; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index ea2745c1ac5e..94142581c98c 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -511,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev) mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; *vbar &= cpu_to_le32((u32)mask); - } else if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) { - mask = ~(0x20000 - 1); + } else if (pdev->rom && pdev->romlen) { + mask = ~(roundup_pow_of_two(pdev->romlen) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; *vbar &= cpu_to_le32((u32)mask); - } else + } else { *vbar = 0; + } vdev->bardirty = false; } @@ -1389,11 +1389,12 @@ static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epo switch (ecap) { case PCI_EXT_CAP_ID_VNDR: - ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword); + ret = pci_read_config_dword(pdev, epos + PCI_VNDR_HEADER, + &dword); if (ret) return pcibios_err_to_errno(ret); - return dword >> PCI_VSEC_HDR_LEN_SHIFT; + return PCI_VNDR_HEADER_LEN(dword); case PCI_EXT_CAP_ID_VC: case PCI_EXT_CAP_ID_VC9: case PCI_EXT_CAP_ID_MFVC: diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1a4ed5a357d3..586e49efb81b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1054,31 +1054,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); info.flags = 0; + info.size = 0; - /* Report the BAR size, not the ROM size */ - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - /* Shadow ROMs appear as PCI option ROMs */ - if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) - info.size = 0x20000; - else - break; - } - - /* - * Is it really there? Enable memory decode for implicit access - * in pci_map_rom(). - */ - cmd = vfio_pci_memory_lock_and_enable(vdev); - io = pci_map_rom(pdev, &size); - if (io) { + if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { + /* + * Check ROM content is valid. Need to enable memory + * decode for ROM access in pci_map_rom(). + */ + cmd = vfio_pci_memory_lock_and_enable(vdev); + io = pci_map_rom(pdev, &size); + if (io) { + info.flags = VFIO_REGION_INFO_FLAG_READ; + /* Report the BAR size, not the ROM size. */ + info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + pci_unmap_rom(pdev, io); + } + vfio_pci_memory_unlock_and_restore(vdev, cmd); + } else if (pdev->rom && pdev->romlen) { info.flags = VFIO_REGION_INFO_FLAG_READ; - pci_unmap_rom(pdev, io); - } else { - info.size = 0; + /* Report BAR size as power of two. */ + info.size = roundup_pow_of_two(pdev->romlen); } - vfio_pci_memory_unlock_and_restore(vdev, cmd); break; } diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 66b72c289284..6192788c8ba3 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -16,6 +16,7 @@ #include <linux/io.h> #include <linux/vfio.h> #include <linux/vgaarb.h> +#include <linux/io-64-nonatomic-lo-hi.h> #include "vfio_pci_priv.h" @@ -61,9 +62,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size); VFIO_IOWRITE(8) VFIO_IOWRITE(16) VFIO_IOWRITE(32) -#ifdef iowrite64 VFIO_IOWRITE(64) -#endif #define VFIO_IOREAD(size) \ int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ @@ -89,9 +88,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size); VFIO_IOREAD(8) VFIO_IOREAD(16) VFIO_IOREAD(32) -#ifdef ioread64 VFIO_IOREAD(64) -#endif #define VFIO_IORDWR(size) \ static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\ @@ -127,9 +124,7 @@ static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\ VFIO_IORDWR(8) VFIO_IORDWR(16) VFIO_IORDWR(32) -#if defined(ioread64) && defined(iowrite64) VFIO_IORDWR(64) -#endif /* * Read or write from an __iomem region (MMIO or I/O port) with an excluded @@ -155,7 +150,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, else fillable = 0; -#if defined(ioread64) && defined(iowrite64) if (fillable >= 8 && !(off % 8)) { ret = vfio_pci_iordwr64(vdev, iswrite, test_mem, io, buf, off, &filled); @@ -163,7 +157,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, return ret; } else -#endif if (fillable >= 4 && !(off % 4)) { ret = vfio_pci_iordwr32(vdev, iswrite, test_mem, io, buf, off, &filled); @@ -244,9 +237,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (pci_resource_start(pdev, bar)) end = pci_resource_len(pdev, bar); - else if (bar == PCI_ROM_RESOURCE && - pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW) - end = 0x20000; + else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen) + end = roundup_pow_of_two(pdev->romlen); else return -EINVAL; @@ -261,11 +253,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, * excluded range at the end of the actual ROM. This makes * filling large ROM BARs much faster. */ - io = pci_map_rom(pdev, &x_start); - if (!io) { - done = -ENOMEM; - goto out; + if (pci_resource_start(pdev, bar)) { + io = pci_map_rom(pdev, &x_start); + } else { + io = ioremap(pdev->rom, pdev->romlen); + x_start = pdev->romlen; } + if (!io) + return -ENOMEM; x_end = end; } else { int ret = vfio_pci_core_setup_barmap(vdev, bar); @@ -288,8 +283,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (done >= 0) *ppos += done; - if (bar == PCI_ROM_RESOURCE) - pci_unmap_rom(pdev, io); + if (bar == PCI_ROM_RESOURCE) { + if (pci_resource_start(pdev, bar)) + pci_unmap_rom(pdev, io); + else + iounmap(io); + } + out: return done; } @@ -381,12 +381,10 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; -#ifdef iowrite64 case 8: vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; -#endif } } @@ -440,10 +438,8 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, pos >= vdev->msix_offset + vdev->msix_size)) return -EINVAL; -#ifndef iowrite64 if (count == 8) return -EINVAL; -#endif ret = vfio_pci_core_setup_barmap(vdev, bar); if (ret) diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index ee54f4c17857..ba92bb4e9af9 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -77,8 +77,8 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -112,7 +112,7 @@ static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf) { struct sg_page_iter sg_iter; - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); |