drm/i915: enable PS64 support for DG2

It turns out that on production DG2/ATS HW we should have support for PS64. This feature allows to provide a 64K TLB hint at the PTE level, which is a lot more flexible than the current method of enabling 64K GTT pages for the entire page-table, since that leads to all kinds of annoying restrictions, as documented in: commit caa574ffc4aaf4f29b890223878c63e2e7772f62 Author: Matthew Auld <matthew.auld@intel.com> Date: Sat Feb 19 00:17:49 2022 +0530 drm/i915/uapi: document behaviour for DG2 64K support On discrete platforms like DG2, we need to support a minimum page size of 64K when dealing with device local-memory. This is quite tricky for various reasons, so try to document the new implicit uapi for this. With PS64, we can now drop the 2M GTT alignment restriction, and instead only require 64K or larger when dealing with lmem. We still use the compact-pt layout when possible, but only when we are certain that this doesn't interfere with userspace. Note that this is a change in uAPI behaviour, but hopefully shouldn't be a concern (IGT is at least able to autodetect the alignment), since we are only making the GTT alignment constraint less restrictive. Based on a patch from CQ Tang. v2: update the comment wrt scratch page v3: (Nirmoy) - Fix the selftest to actually use the random size, plus some comment improvements, also drop the rem stuff. Reported-by: Michal Mrozek <michal.mrozek@intel.com> Signed-off-by: Matthew Auld <matthew.auld@intel.com> Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com> Cc: Stuart Summers <stuart.summers@intel.com> Cc: Jordan Justen <jordan.l.justen@intel.com> Cc: Yang A Shi <yang.a.shi@intel.com> Cc: Nirmoy Das <nirmoy.das@intel.com> Cc: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> Reviewed-by: Nirmoy Das <nirmoy.das@intel.com> Acked-by: Michal Mrozek <michal.mrozek@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221004114915.221708-1-matthew.auld@intel.com
author: Matthew Auld <matthew.auld@intel.com> 2022-10-04 12:49:14 +0100
committer: Matthew Auld <matthew.auld@intel.com> 2022-10-14 18:08:04 +0100
commit: 8133a6daad4e72748e239a02775a853ca7ed798b (patch)
tree: 6a905495b329a2e381da7245bfe71a828acedc9d /drivers/gpu/drm/i915/gt/gen8_ppgtt.c
parent: e55427b46852f11ca37f33abb7d7ec76bb4c9ed3 (diff)
1 files changed, 45 insertions, 36 deletions
diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index 8dd60d5ef905..4daaa6f55668 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -476,6 +476,7 @@ xehpsdv_ppgtt_insert_huge(struct i915_address_space *vm,
 	const gen8_pte_t pte_encode = vm->pte_encode(0, cache_level, flags);
 	unsigned int rem = sg_dma_len(iter->sg);
 	u64 start = vma_res->start;
+	u64 end = start + vma_res->vma_size;
 
 	GEM_BUG_ON(!i915_vm_is_4lvl(vm));
 
@@ -489,9 +490,10 @@ xehpsdv_ppgtt_insert_huge(struct i915_address_space *vm,
 		gen8_pte_t encode = pte_encode;
 		unsigned int page_size;
 		gen8_pte_t *vaddr;
-		u16 index, max;
+		u16 index, max, nent, i;
 
 		max = I915_PDES;
+		nent = 1;
 
 		if (vma_res->bi.page_sizes.sg & I915_GTT_PAGE_SIZE_2M &&
 		    IS_ALIGNED(iter->dma, I915_GTT_PAGE_SIZE_2M) &&
@@ -503,25 +505,37 @@ xehpsdv_ppgtt_insert_huge(struct i915_address_space *vm,
 
 			vaddr = px_vaddr(pd);
 		} else {
-			if (encode & GEN12_PPGTT_PTE_LM) {
-				GEM_BUG_ON(__gen8_pte_index(start, 0) % 16);
-				GEM_BUG_ON(rem < I915_GTT_PAGE_SIZE_64K);
-				GEM_BUG_ON(!IS_ALIGNED(iter->dma,
-						       I915_GTT_PAGE_SIZE_64K));
-
-				index = __gen8_pte_index(start, 0) / 16;
-				page_size = I915_GTT_PAGE_SIZE_64K;
-
-				max /= 16;
-
-				vaddr = px_vaddr(pd);
-				vaddr[__gen8_pte_index(start, 1)] |= GEN12_PDE_64K;
+			index =  __gen8_pte_index(start, 0);
+			page_size = I915_GTT_PAGE_SIZE;
 
-				pt->is_compact = true;
-			} else {
-				GEM_BUG_ON(pt->is_compact);
-				index =  __gen8_pte_index(start, 0);
-				page_size = I915_GTT_PAGE_SIZE;
+			if (vma_res->bi.page_sizes.sg & I915_GTT_PAGE_SIZE_64K) {
+				/*
+				 * Device local-memory on these platforms should
+				 * always use 64K pages or larger (including GTT
+				 * alignment), therefore if we know the whole
+				 * page-table needs to be filled we can always
+				 * safely use the compact-layout. Otherwise fall
+				 * back to the TLB hint with PS64. If this is
+				 * system memory we only bother with PS64.
+				 */
+				if ((encode & GEN12_PPGTT_PTE_LM) &&
+				    end - start >= SZ_2M && !index) {
+					index = __gen8_pte_index(start, 0) / 16;
+					page_size = I915_GTT_PAGE_SIZE_64K;
+
+					max /= 16;
+
+					vaddr = px_vaddr(pd);
+					vaddr[__gen8_pte_index(start, 1)] |= GEN12_PDE_64K;
+
+					pt->is_compact = true;
+				} else if (IS_ALIGNED(iter->dma, I915_GTT_PAGE_SIZE_64K) &&
+					   rem >= I915_GTT_PAGE_SIZE_64K &&
+					   !(index % 16)) {
+					encode |= GEN12_PTE_PS64;
+					page_size = I915_GTT_PAGE_SIZE_64K;
+					nent = 16;
+				}
 			}
 
 			vaddr = px_vaddr(pt);
@@ -529,7 +543,12 @@ xehpsdv_ppgtt_insert_huge(struct i915_address_space *vm,
 
 		do {
 			GEM_BUG_ON(rem < page_size);
-			vaddr[index++] = encode | iter->dma;
+
+			for (i = 0; i < nent; i++) {
+				vaddr[index++] =
+					encode | (iter->dma + i *
+						  I915_GTT_PAGE_SIZE);
+			}
 
 			start += page_size;
 			iter->dma += page_size;
@@ -745,6 +764,8 @@ static void __xehpsdv_ppgtt_insert_entry_lm(struct i915_address_space *vm,
 	GEM_BUG_ON(!IS_ALIGNED(addr, SZ_64K));
 	GEM_BUG_ON(!IS_ALIGNED(offset, SZ_64K));
 
+	/* XXX: we don't strictly need to use this layout */
+
 	if (!pt->is_compact) {
 		vaddr = px_vaddr(pd);
 		vaddr[gen8_pd_index(idx, 1)] |= GEN12_PDE_64K;
@@ -935,22 +956,10 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt,
 		ppgtt->vm.alloc_pt_dma = alloc_pt_dma;
 
 	/*
-	 * On some platforms the hw has dropped support for 4K GTT pages
-	 * when dealing with LMEM, and due to the design of 64K GTT
-	 * pages in the hw, we can only mark the *entire* page-table as
-	 * operating in 64K GTT mode, since the enable bit is still on
-	 * the pde, and not the pte. And since we still need to allow
-	 * 4K GTT pages for SMEM objects, we can't have a "normal" 4K
-	 * page-table with scratch pointing to LMEM, since that's
-	 * undefined from the hw pov. The simplest solution is to just
-	 * move the 64K scratch page to SMEM on all platforms and call
-	 * it a day, since that should work for all configurations.
-	 *
-	 * Using SMEM instead of LMEM has the additional advantage of
-	 * not reserving high performance memory for a "never" used
-	 * filler page. It also removes the device access that would
-	 * be required to initialise the scratch page, reducing pressure
-	 * on an even scarcer resource.
+	 * Using SMEM here instead of LMEM has the advantage of not reserving
+	 * high performance memory for a "never" used filler page. It also
+	 * removes the device access that would be required to initialise the
+	 * scratch page, reducing pressure on an even scarcer resource.
 	 */
 	ppgtt->vm.alloc_scratch_dma = alloc_pt_dma;
author	Matthew Auld <matthew.auld@intel.com>	2022-10-04 12:49:14 +0100
committer	Matthew Auld <matthew.auld@intel.com>	2022-10-14 18:08:04 +0100
commit	8133a6daad4e72748e239a02775a853ca7ed798b (patch)
tree	6a905495b329a2e381da7245bfe71a828acedc9d /drivers/gpu/drm/i915/gt/gen8_ppgtt.c
parent	e55427b46852f11ca37f33abb7d7ec76bb4c9ed3 (diff)