diff options
Diffstat (limited to 'drivers/vfio/pci/nvgrace-gpu')
-rw-r--r-- | drivers/vfio/pci/nvgrace-gpu/main.c | 169 |
1 files changed, 147 insertions, 22 deletions
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index a467085038f0..e5ac39c4cc6b 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -5,6 +5,8 @@ #include <linux/sizes.h> #include <linux/vfio_pci_core.h> +#include <linux/delay.h> +#include <linux/jiffies.h> /* * The device memory usable to the workloads running in the VM is cached @@ -17,12 +19,21 @@ #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX -/* Memory size expected as non cached and reserved by the VM driver */ -#define RESMEM_SIZE SZ_1G - /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ #define MEMBLK_SIZE SZ_512M +#define DVSEC_BITMAP_OFFSET 0xA +#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0) + +#define GPU_CAP_DVSEC_REGISTER 3 + +#define C2C_LINK_BAR0_OFFSET 0x1498 +#define HBM_TRAINING_BAR0_OFFSET 0x200BC +#define STATUS_READY 0xFF + +#define POLL_QUANTUM_MS 1000 +#define POLL_TIMEOUT_MS (30 * 1000) + /* * The state of the two device memory region - resmem and usemem - is * saved as struct mem_region. @@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device { struct mem_region resmem; /* Lock to control device memory kernel mapping */ struct mutex remap_lock; + bool has_mig_hw_bug; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index, if (index == USEMEM_REGION_INDEX) return &nvdev->usemem; - if (index == RESMEM_REGION_INDEX) + if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) return &nvdev->resmem; return NULL; @@ -751,40 +763,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, u64 memphys, u64 memlength) { int ret = 0; + u64 resmem_size = 0; /* - * The VM GPU device driver needs a non-cacheable region to support - * the MIG feature. Since the device memory is mapped as NORMAL cached, - * carve out a region from the end with a different NORMAL_NC - * property (called as reserved memory and represented as resmem). This - * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while - * exposing the rest (termed as usable memory and represented using usemem) - * as cacheable 64b BAR (region 4 and 5). + * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable + * region to support the MIG feature owing to a hardware bug. Since the + * device memory is mapped as NORMAL cached, carve out a region from the end + * with a different NORMAL_NC property (called as reserved memory and + * represented as resmem). This region then is exposed as a 64b BAR + * (region 2 and 3) to the VM, while exposing the rest (termed as usable + * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5). * * devmem (memlength) * |-------------------------------------------------| * | | * usemem.memphys resmem.memphys + * + * This hardware bug is fixed on the Grace Blackwell platforms and the + * presence of the bug can be determined through nvdev->has_mig_hw_bug. + * Thus on systems with the hardware fix, there is no need to partition + * the GPU device memory and the entire memory is usable and mapped as + * NORMAL cached (i.e. resmem size is 0). */ + if (nvdev->has_mig_hw_bug) + resmem_size = SZ_1G; + nvdev->usemem.memphys = memphys; /* * The device memory exposed to the VM is added to the kernel by the - * VM driver module in chunks of memory block size. Only the usable - * memory (usemem) is added to the kernel for usage by the VM - * workloads. Make the usable memory size memblock aligned. + * VM driver module in chunks of memory block size. Note that only the + * usable memory (usemem) is added to the kernel for usage by the VM + * workloads. */ - if (check_sub_overflow(memlength, RESMEM_SIZE, + if (check_sub_overflow(memlength, resmem_size, &nvdev->usemem.memlength)) { ret = -EOVERFLOW; goto done; } /* - * The USEMEM part of the device memory has to be MEMBLK_SIZE - * aligned. This is a hardwired ABI value between the GPU FW and - * VFIO driver. The VM device driver is also aware of it and make - * use of the value for its calculation to determine USEMEM size. + * The usemem region is exposed as a 64B Bar composed of region 4 and 5. + * Calculate and save the BAR size for the region. + */ + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); + + /* + * If the hardware has the fix for MIG, there is no requirement + * for splitting the device memory to create RESMEM. The entire + * device memory is usable and will be USEMEM. Return here for + * such case. + */ + if (!nvdev->has_mig_hw_bug) + goto done; + + /* + * When the device memory is split to workaround the MIG bug on + * Grace Hopper, the USEMEM part of the device memory has to be + * MEMBLK_SIZE aligned. This is a hardwired ABI value between the + * GPU FW and VFIO driver. The VM device driver is also aware of it + * and make use of the value for its calculation to determine USEMEM + * size. Note that the device memory may not be 512M aligned. */ nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, MEMBLK_SIZE); @@ -803,15 +842,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, } /* - * The memory regions are exposed as BARs. Calculate and save - * the BAR size for them. + * The resmem region is exposed as a 64b BAR composed of region 2 and 3 + * for Grace Hopper. Calculate and save the BAR size for the region. */ - nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); done: return ret; } +static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) +{ + int pcie_dvsec; + u16 dvsec_ctrl16; + + pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA, + GPU_CAP_DVSEC_REGISTER); + + if (pcie_dvsec) { + pci_read_config_word(pdev, + pcie_dvsec + DVSEC_BITMAP_OFFSET, + &dvsec_ctrl16); + + if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM) + return false; + } + + return true; +} + +/* + * To reduce the system bootup time, the HBM training has + * been moved out of the UEFI on the Grace-Blackwell systems. + * + * The onus of checking whether the HBM training has completed + * thus falls on the module. The HBM training status can be + * determined from a BAR0 register. + * + * Similarly, another BAR0 register exposes the status of the + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect. + * + * Poll these register and check for 30s. If the HBM training is + * not complete or if the C2C link is not ready, fail the probe. + * + * While the wait is not required on Grace Hopper systems, it + * is beneficial to make the check to ensure the device is in an + * expected state. + * + * Ensure that the BAR0 region is enabled before accessing the + * registers. + */ +static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + void __iomem *io; + int ret = -ETIME; + + ret = pci_enable_device(pdev); + if (ret) + return ret; + + ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME); + if (ret) + goto request_region_exit; + + io = pci_iomap(pdev, 0, 0); + if (!io) { + ret = -ENOMEM; + goto iomap_exit; + } + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { + ret = 0; + goto reg_check_exit; + } + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + +reg_check_exit: + pci_iounmap(pdev, io); +iomap_exit: + pci_release_selected_regions(pdev, 1 << 0); +request_region_exit: + pci_disable_device(pdev); + return ret; +} + static int nvgrace_gpu_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -820,6 +937,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; + ret = nvgrace_gpu_wait_device_ready(pdev); + if (ret) + return ret; + ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); if (!ret) ops = &nvgrace_gpu_pci_ops; @@ -832,6 +953,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, dev_set_drvdata(&pdev->dev, &nvdev->core_device); if (ops == &nvgrace_gpu_pci_ops) { + nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev); + /* * Device memory properties are identified in the host ACPI * table. Set the nvgrace_gpu_pci_core_device structure. @@ -868,6 +991,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, /* GH200 SKU */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, + /* GB200 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, {} }; |