author	Ankit Agrawal <ankita@nvidia.com>	2025-11-27 17:06:28 +0000
committer	Alex Williamson <alex@shazbot.org>	2025-11-28 10:07:25 -0700
commit	9db65489b87298f20baef683e70143c108959793 (patch)
tree	95700211f2344db6a7cdb84816c4761ea3e18967
parent	9b92bc7554b543dc00a0a0b62904a9ef2ad5c4b0 (diff)
vfio/nvgrace-gpu: Add support for huge pfnmap
NVIDIA's Grace-based systems have large device memory. That memory is mapped as VM_PFNMAP in the VMM's VMA, so the nvgrace-gpu module can take advantage of the huge PFNMAP support added to mm [1].

To use huge pfnmap, a fault/huge_fault ops based mapping mechanism needs to be implemented. The nvgrace-gpu module currently relies on remap_pfn_range() to set up the mapping at VM boot. Replace that with a fault handler that uses vfio_pci_vmf_insert_pfn() to establish the mapping.

To enable huge pfnmap, also add a huge_fault ops implementation that establishes the mapping at the requested order. Note that if the PFN or the VMA address is not aligned to that order, the mapping falls back to the PTE level.

Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1]

Cc: Shameer Kolothum <skolothumtho@nvidia.com>
Cc: Alex Williamson <alex@shazbot.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Vikram Sethi <vsethi@nvidia.com>
Reviewed-by: Zhi Wang <zhiw@nvidia.com>
Reviewed-by: Shameer Kolothum <skolothumtho@nvidia.com>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Link: https://lore.kernel.org/r/20251127170632.3477-3-ankita@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
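The PTE fallback mentioned above hinges on an alignment check. is_aligned_for_order() is used in the hunks below but is not defined in this diff (it comes from elsewhere in the series), so the following is only a hedged sketch of the condition being described: a huge mapping is attempted only when the aligned-down fault address keeps the mapping inside the VMA and the backing PFN is itself aligned to the requested order.

/* Illustrative sketch only -- not the helper added by the series. */
#include <linux/align.h>
#include <linux/mm.h>

static bool example_aligned_for_order(struct vm_area_struct *vma,
				      unsigned long addr, unsigned long pfn,
				      unsigned int order)
{
	if (!order)
		return true;	/* PTE-sized faults always fit */

	/* The aligned-down address must keep the huge mapping inside the VMA. */
	if (addr < vma->vm_start || addr + (PAGE_SIZE << order) > vma->vm_end)
		return false;

	/* The device PFN must itself be aligned to the mapping order. */
	return IS_ALIGNED(pfn, 1UL << order);
}

When this condition fails, the handler below simply returns VM_FAULT_FALLBACK and the core mm retries the fault at a smaller order.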
-rw-r--r--	drivers/vfio/pci/nvgrace-gpu/main.c	81
1 file changed, 59 insertions, 22 deletions
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index e33f24fbb0a4..3034d6adf576 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -131,6 +131,59 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev)
vfio_pci_core_close_device(core_vdev);
}
+static unsigned long addr_to_pgoff(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ u64 pgoff = vma->vm_pgoff &
+ ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+ return ((addr - vma->vm_start) >> PAGE_SHIFT) + pgoff;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct nvgrace_gpu_pci_core_device *nvdev = vma->vm_private_data;
+ struct vfio_pci_core_device *vdev = &nvdev->core_device;
+ unsigned int index =
+ vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+ vm_fault_t ret = VM_FAULT_FALLBACK;
+ struct mem_region *memregion;
+ unsigned long pfn, addr;
+
+ memregion = nvgrace_gpu_memregion(index, nvdev);
+ if (!memregion)
+ return VM_FAULT_SIGBUS;
+
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr);
+
+ if (is_aligned_for_order(vma, addr, pfn, order)) {
+ scoped_guard(rwsem_read, &vdev->memory_lock)
+ ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order);
+ }
+
+ dev_dbg_ratelimited(&vdev->pdev->dev,
+ "%s order = %d pfn 0x%lx: 0x%x\n",
+ __func__, order, pfn,
+ (unsigned int)ret);
+
+ return ret;
+}
+
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+ return nvgrace_gpu_vfio_pci_huge_fault(vmf, 0);
+}
+
+static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = {
+ .fault = nvgrace_gpu_vfio_pci_fault,
+#ifdef CONFIG_ARCH_SUPPORTS_HUGE_PFNMAP
+ .huge_fault = nvgrace_gpu_vfio_pci_huge_fault,
+#endif
+};
+
static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
struct vm_area_struct *vma)
{
@@ -138,10 +191,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
container_of(core_vdev, struct nvgrace_gpu_pci_core_device,
core_device.vdev);
struct mem_region *memregion;
- unsigned long start_pfn;
u64 req_len, pgoff, end;
unsigned int index;
- int ret = 0;
index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
@@ -158,17 +209,18 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) ||
- check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) ||
check_add_overflow(PFN_PHYS(pgoff), req_len, &end))
return -EOVERFLOW;
/*
- * Check that the mapping request does not go beyond available device
- * memory size
+ * Check that the mapping request does not go beyond the exposed
+ * device memory size.
*/
if (end > memregion->memlength)
return -EINVAL;
+ vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+
/*
* The carved out region of the device memory needs the NORMAL_NC
* property. Communicate as such to the hypervisor.
@@ -185,23 +237,8 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev,
vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
}
- /*
- * Perform a PFN map to the memory and back the device BAR by the
- * GPU memory.
- *
- * The available GPU memory size may not be power-of-2 aligned. The
- * remainder is only backed by vfio_device_ops read/write handlers.
- *
- * During device reset, the GPU is safely disconnected to the CPU
- * and access to the BAR will be immediately returned preventing
- * machine check.
- */
- ret = remap_pfn_range(vma, vma->vm_start, start_pfn,
- req_len, vma->vm_page_prot);
- if (ret)
- return ret;
-
- vma->vm_pgoff = start_pfn;
+ vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops;
+ vma->vm_private_data = nvdev;
return 0;
}
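For completeness, a hedged userspace sketch of the consumer side: a VMM maps the exposed region through the regular VFIO region offset, and the fault/huge_fault handlers above populate it on demand. The region index, error handling, and virtual-address alignment here are assumptions for illustration; huge mappings are only installed when the chosen VA and the device PFN are mutually aligned to the huge-page order, otherwise the handler falls back to PTE-level mappings.

/* Userspace illustration (not part of this patch). */
#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stddef.h>

static void *map_gpu_region(int device_fd, unsigned int region_index)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = region_index,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
		return MAP_FAILED;

	/*
	 * VMMs typically reserve a suitably aligned VA range first (e.g.
	 * 2 MiB with 4K pages) so that huge mappings can actually be
	 * installed; mmap(NULL, ...) alone does not guarantee alignment.
	 */
	return mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    device_fd, info.offset);
}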