summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/xe/xe_migrate.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/xe/xe_migrate.c')
-rw-r--r--drivers/gpu/drm/xe/xe_migrate.c369
1 files changed, 323 insertions, 46 deletions
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c
index a36ce7dce8cc..2184af413b91 100644
--- a/drivers/gpu/drm/xe/xe_migrate.c
+++ b/drivers/gpu/drm/xe/xe_migrate.c
@@ -29,6 +29,7 @@
#include "xe_lrc.h"
#include "xe_map.h"
#include "xe_mocs.h"
+#include "xe_printk.h"
#include "xe_pt.h"
#include "xe_res_cursor.h"
#include "xe_sa.h"
@@ -57,6 +58,13 @@ struct xe_migrate {
u64 usm_batch_base_ofs;
/** @cleared_mem_ofs: VM offset of @cleared_bo. */
u64 cleared_mem_ofs;
+ /** @large_page_copy_ofs: VM offset of 2M pages used for large copies */
+ u64 large_page_copy_ofs;
+ /**
+ * @large_page_copy_pdes: BO offset to writeout 2M pages (PDEs) used for
+ * large copies
+ */
+ u64 large_page_copy_pdes;
/**
* @fence: dma-fence representing the last migration job batch.
* Protected by @job_mutex.
@@ -288,6 +296,12 @@ static int xe_migrate_prepare_vm(struct xe_tile *tile, struct xe_migrate *m,
(i + 1) * 8, u64, entry);
}
+ /* Reserve 2M PDEs */
+ level = 1;
+ m->large_page_copy_ofs = NUM_PT_SLOTS << xe_pt_shift(level);
+ m->large_page_copy_pdes = map_ofs + XE_PAGE_SIZE * level +
+ NUM_PT_SLOTS * 8;
+
/* Set up a 1GiB NULL mapping at 255GiB offset. */
level = 2;
xe_map_wr(xe, &bo->vmap, map_ofs + XE_PAGE_SIZE * level + 255 * 8, u64,
@@ -686,9 +700,9 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
}
#define EMIT_COPY_DW 10
-static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
- u64 src_ofs, u64 dst_ofs, unsigned int size,
- unsigned int pitch)
+static void emit_xy_fast_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
+ u64 dst_ofs, unsigned int size,
+ unsigned int pitch)
{
struct xe_device *xe = gt_to_xe(gt);
u32 mocs = 0;
@@ -717,6 +731,61 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
bb->cs[bb->len++] = upper_32_bits(src_ofs);
}
+#define PAGE_COPY_MODE_PS SZ_256 /* hw uses 256 bytes as the page-size */
+static void emit_mem_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
+ u64 dst_ofs, unsigned int size, unsigned int pitch)
+{
+ u32 mode, copy_type, width;
+
+ xe_gt_assert(gt, IS_ALIGNED(size, pitch));
+ xe_gt_assert(gt, pitch <= U16_MAX);
+ xe_gt_assert(gt, pitch);
+ xe_gt_assert(gt, size);
+
+ if (IS_ALIGNED(size, PAGE_COPY_MODE_PS) &&
+ IS_ALIGNED(lower_32_bits(src_ofs), PAGE_COPY_MODE_PS) &&
+ IS_ALIGNED(lower_32_bits(dst_ofs), PAGE_COPY_MODE_PS)) {
+ mode = MEM_COPY_PAGE_COPY_MODE;
+ copy_type = 0; /* linear copy */
+ width = size / PAGE_COPY_MODE_PS;
+ } else if (pitch > 1) {
+ xe_gt_assert(gt, size / pitch <= U16_MAX);
+ mode = 0; /* BYTE_COPY */
+ copy_type = MEM_COPY_MATRIX_COPY;
+ width = pitch;
+ } else {
+ mode = 0; /* BYTE_COPY */
+ copy_type = 0; /* linear copy */
+ width = size;
+ }
+
+ xe_gt_assert(gt, width <= U16_MAX);
+
+ bb->cs[bb->len++] = MEM_COPY_CMD | mode | copy_type;
+ bb->cs[bb->len++] = width - 1;
+ bb->cs[bb->len++] = size / pitch - 1; /* ignored by hw for page-copy/linear above */
+ bb->cs[bb->len++] = pitch - 1;
+ bb->cs[bb->len++] = pitch - 1;
+ bb->cs[bb->len++] = lower_32_bits(src_ofs);
+ bb->cs[bb->len++] = upper_32_bits(src_ofs);
+ bb->cs[bb->len++] = lower_32_bits(dst_ofs);
+ bb->cs[bb->len++] = upper_32_bits(dst_ofs);
+ bb->cs[bb->len++] = FIELD_PREP(MEM_COPY_SRC_MOCS_INDEX_MASK, gt->mocs.uc_index) |
+ FIELD_PREP(MEM_COPY_DST_MOCS_INDEX_MASK, gt->mocs.uc_index);
+}
+
+static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
+ u64 src_ofs, u64 dst_ofs, unsigned int size,
+ unsigned int pitch)
+{
+ struct xe_device *xe = gt_to_xe(gt);
+
+ if (xe->info.has_mem_copy_instr)
+ emit_mem_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
+ else
+ emit_xy_fast_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
+}
+
static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
{
return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
@@ -834,7 +903,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
&ccs_it);
while (size) {
- u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
+ u32 batch_size = 1; /* MI_BATCH_BUFFER_END */
struct xe_sched_job *job;
struct xe_bb *bb;
u32 flush_flags = 0;
@@ -980,15 +1049,27 @@ struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
return migrate->q->lrc[0];
}
-static int emit_flush_invalidate(struct xe_exec_queue *q, u32 *dw, int i,
- u32 flags)
+static u64 migrate_vm_ppgtt_addr_tlb_inval(void)
+{
+ /*
+ * The migrate VM is self-referential so it can modify its own PTEs (see
+ * pte_update_size() or emit_pte() functions). We reserve NUM_KERNEL_PDE
+ * entries for kernel operations (copies, clears, CCS migrate), and
+ * suballocate the rest to user operations (binds/unbinds). With
+ * NUM_KERNEL_PDE = 15, NUM_KERNEL_PDE - 1 is already used for PTE updates,
+ * so assign NUM_KERNEL_PDE - 2 for TLB invalidation.
+ */
+ return (NUM_KERNEL_PDE - 2) * XE_PAGE_SIZE;
+}
+
+static int emit_flush_invalidate(u32 *dw, int i, u32 flags)
{
- struct xe_lrc *lrc = xe_exec_queue_lrc(q);
+ u64 addr = migrate_vm_ppgtt_addr_tlb_inval();
+
dw[i++] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
MI_FLUSH_IMM_DW | flags;
- dw[i++] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
- MI_FLUSH_DW_USE_GTT;
- dw[i++] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
+ dw[i++] = lower_32_bits(addr);
+ dw[i++] = upper_32_bits(addr);
dw[i++] = MI_NOOP;
dw[i++] = MI_NOOP;
@@ -1101,11 +1182,11 @@ int xe_migrate_ccs_rw_copy(struct xe_tile *tile, struct xe_exec_queue *q,
emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);
- bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
+ bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
src_L0_ofs, dst_is_pltt,
src_L0, ccs_ofs, true);
- bb->len = emit_flush_invalidate(q, bb->cs, bb->len, flush_flags);
+ bb->len = emit_flush_invalidate(bb->cs, bb->len, flush_flags);
size -= src_L0;
}
@@ -1130,6 +1211,128 @@ struct xe_exec_queue *xe_migrate_exec_queue(struct xe_migrate *migrate)
return migrate->q;
}
+/**
+ * xe_migrate_vram_copy_chunk() - Copy a chunk of a VRAM buffer object.
+ * @vram_bo: The VRAM buffer object.
+ * @vram_offset: The VRAM offset.
+ * @sysmem_bo: The sysmem buffer object.
+ * @sysmem_offset: The sysmem offset.
+ * @size: The size of VRAM chunk to copy.
+ * @dir: The direction of the copy operation.
+ *
+ * Copies a portion of a buffer object between VRAM and system memory.
+ * On Xe2 platforms that support flat CCS, VRAM data is decompressed when
+ * copying to system memory.
+ *
+ * Return: Pointer to a dma_fence representing the last copy batch, or
+ * an error pointer on failure. If there is a failure, any copy operation
+ * started by the function call has been synced.
+ */
+struct dma_fence *xe_migrate_vram_copy_chunk(struct xe_bo *vram_bo, u64 vram_offset,
+ struct xe_bo *sysmem_bo, u64 sysmem_offset,
+ u64 size, enum xe_migrate_copy_dir dir)
+{
+ struct xe_device *xe = xe_bo_device(vram_bo);
+ struct xe_tile *tile = vram_bo->tile;
+ struct xe_gt *gt = tile->primary_gt;
+ struct xe_migrate *m = tile->migrate;
+ struct dma_fence *fence = NULL;
+ struct ttm_resource *vram = vram_bo->ttm.resource;
+ struct ttm_resource *sysmem = sysmem_bo->ttm.resource;
+ struct xe_res_cursor vram_it, sysmem_it;
+ u64 vram_L0_ofs, sysmem_L0_ofs;
+ u32 vram_L0_pt, sysmem_L0_pt;
+ u64 vram_L0, sysmem_L0;
+ bool to_sysmem = (dir == XE_MIGRATE_COPY_TO_SRAM);
+ bool use_comp_pat = to_sysmem &&
+ GRAPHICS_VER(xe) >= 20 && xe_device_has_flat_ccs(xe);
+ int pass = 0;
+ int err;
+
+ xe_assert(xe, IS_ALIGNED(vram_offset | sysmem_offset | size, PAGE_SIZE));
+ xe_assert(xe, xe_bo_is_vram(vram_bo));
+ xe_assert(xe, !xe_bo_is_vram(sysmem_bo));
+ xe_assert(xe, !range_overflows(vram_offset, size, (u64)vram_bo->ttm.base.size));
+ xe_assert(xe, !range_overflows(sysmem_offset, size, (u64)sysmem_bo->ttm.base.size));
+
+ xe_res_first(vram, vram_offset, size, &vram_it);
+ xe_res_first_sg(xe_bo_sg(sysmem_bo), sysmem_offset, size, &sysmem_it);
+
+ while (size) {
+ u32 pte_flags = PTE_UPDATE_FLAG_IS_VRAM;
+ u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */
+ struct xe_sched_job *job;
+ struct xe_bb *bb;
+ u32 update_idx;
+ bool usm = xe->info.has_usm;
+ u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
+
+ sysmem_L0 = xe_migrate_res_sizes(m, &sysmem_it);
+ vram_L0 = min(xe_migrate_res_sizes(m, &vram_it), sysmem_L0);
+
+ xe_dbg(xe, "Pass %u, size: %llu\n", pass++, vram_L0);
+
+ pte_flags |= use_comp_pat ? PTE_UPDATE_FLAG_IS_COMP_PTE : 0;
+ batch_size += pte_update_size(m, pte_flags, vram, &vram_it, &vram_L0,
+ &vram_L0_ofs, &vram_L0_pt, 0, 0, avail_pts);
+
+ batch_size += pte_update_size(m, 0, sysmem, &sysmem_it, &vram_L0, &sysmem_L0_ofs,
+ &sysmem_L0_pt, 0, avail_pts, avail_pts);
+ batch_size += EMIT_COPY_DW;
+
+ bb = xe_bb_new(gt, batch_size, usm);
+ if (IS_ERR(bb)) {
+ err = PTR_ERR(bb);
+ return ERR_PTR(err);
+ }
+
+ if (xe_migrate_allow_identity(vram_L0, &vram_it))
+ xe_res_next(&vram_it, vram_L0);
+ else
+ emit_pte(m, bb, vram_L0_pt, true, use_comp_pat, &vram_it, vram_L0, vram);
+
+ emit_pte(m, bb, sysmem_L0_pt, false, false, &sysmem_it, vram_L0, sysmem);
+
+ bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
+ update_idx = bb->len;
+
+ if (to_sysmem)
+ emit_copy(gt, bb, vram_L0_ofs, sysmem_L0_ofs, vram_L0, XE_PAGE_SIZE);
+ else
+ emit_copy(gt, bb, sysmem_L0_ofs, vram_L0_ofs, vram_L0, XE_PAGE_SIZE);
+
+ job = xe_bb_create_migration_job(m->q, bb, xe_migrate_batch_base(m, usm),
+ update_idx);
+ if (IS_ERR(job)) {
+ xe_bb_free(bb, NULL);
+ err = PTR_ERR(job);
+ return ERR_PTR(err);
+ }
+
+ xe_sched_job_add_migrate_flush(job, MI_INVALIDATE_TLB);
+
+ xe_assert(xe, dma_resv_test_signaled(vram_bo->ttm.base.resv,
+ DMA_RESV_USAGE_BOOKKEEP));
+ xe_assert(xe, dma_resv_test_signaled(sysmem_bo->ttm.base.resv,
+ DMA_RESV_USAGE_BOOKKEEP));
+
+ scoped_guard(mutex, &m->job_mutex) {
+ xe_sched_job_arm(job);
+ dma_fence_put(fence);
+ fence = dma_fence_get(&job->drm.s_fence->finished);
+ xe_sched_job_push(job);
+
+ dma_fence_put(m->fence);
+ m->fence = dma_fence_get(fence);
+ }
+
+ xe_bb_free(bb, fence);
+ size -= vram_L0;
+ }
+
+ return fence;
+}
+
static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
u32 size, u32 pitch)
{
@@ -1287,7 +1490,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
/* Calculate final sizes and batch size.. */
pte_flags = clear_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0;
- batch_size = 2 +
+ batch_size = 1 +
pte_update_size(m, pte_flags, src, &src_it,
&clear_L0, &clear_L0_ofs, &clear_L0_pt,
clear_bo_data ? emit_clear_cmd_len(gt) : 0, 0,
@@ -1766,16 +1969,22 @@ static u32 pte_update_cmd_size(u64 size)
static void build_pt_update_batch_sram(struct xe_migrate *m,
struct xe_bb *bb, u32 pt_offset,
struct drm_pagemap_addr *sram_addr,
- u32 size)
+ u32 size, int level)
{
u16 pat_index = tile_to_xe(m->tile)->pat.idx[XE_CACHE_WB];
+ u64 gpu_page_size = 0x1ull << xe_pt_shift(level);
u32 ptes;
int i = 0;
- ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);
+ xe_tile_assert(m->tile, PAGE_ALIGNED(size));
+
+ ptes = DIV_ROUND_UP(size, gpu_page_size);
while (ptes) {
u32 chunk = min(MAX_PTE_PER_SDI, ptes);
+ if (!level)
+ chunk = ALIGN_DOWN(chunk, PAGE_SIZE / XE_PAGE_SIZE);
+
bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk);
bb->cs[bb->len++] = pt_offset;
bb->cs[bb->len++] = 0;
@@ -1784,30 +1993,70 @@ static void build_pt_update_batch_sram(struct xe_migrate *m,
ptes -= chunk;
while (chunk--) {
- u64 addr = sram_addr[i].addr & PAGE_MASK;
+ u64 addr = sram_addr[i].addr;
+ u64 pte;
xe_tile_assert(m->tile, sram_addr[i].proto ==
DRM_INTERCONNECT_SYSTEM);
xe_tile_assert(m->tile, addr);
- addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
- addr, pat_index,
- 0, false, 0);
- bb->cs[bb->len++] = lower_32_bits(addr);
- bb->cs[bb->len++] = upper_32_bits(addr);
-
- i++;
+ xe_tile_assert(m->tile, PAGE_ALIGNED(addr));
+
+again:
+ pte = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
+ addr, pat_index,
+ level, false, 0);
+ bb->cs[bb->len++] = lower_32_bits(pte);
+ bb->cs[bb->len++] = upper_32_bits(pte);
+
+ if (gpu_page_size < PAGE_SIZE) {
+ addr += XE_PAGE_SIZE;
+ if (!PAGE_ALIGNED(addr)) {
+ chunk--;
+ goto again;
+ }
+ i++;
+ } else {
+ i += gpu_page_size / PAGE_SIZE;
+ }
}
}
}
-enum xe_migrate_copy_dir {
- XE_MIGRATE_COPY_TO_VRAM,
- XE_MIGRATE_COPY_TO_SRAM,
-};
+static bool xe_migrate_vram_use_pde(struct drm_pagemap_addr *sram_addr,
+ unsigned long size)
+{
+ u32 large_size = (0x1 << xe_pt_shift(1));
+ unsigned long i, incr = large_size / PAGE_SIZE;
+
+ for (i = 0; i < DIV_ROUND_UP(size, PAGE_SIZE); i += incr)
+ if (PAGE_SIZE << sram_addr[i].order != large_size)
+ return false;
+
+ return true;
+}
#define XE_CACHELINE_BYTES 64ull
#define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1)
+static u32 xe_migrate_copy_pitch(struct xe_device *xe, u32 len)
+{
+ u32 pitch;
+
+ if (IS_ALIGNED(len, PAGE_SIZE))
+ pitch = PAGE_SIZE;
+ else if (IS_ALIGNED(len, SZ_4K))
+ pitch = SZ_4K;
+ else if (IS_ALIGNED(len, SZ_256))
+ pitch = SZ_256;
+ else if (IS_ALIGNED(len, 4))
+ pitch = 4;
+ else
+ pitch = 1;
+
+ xe_assert(xe, pitch > 1 || xe->info.has_mem_copy_instr);
+ return pitch;
+}
+
static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
unsigned long len,
unsigned long sram_offset,
@@ -1819,24 +2068,25 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
struct xe_device *xe = gt_to_xe(gt);
bool use_usm_batch = xe->info.has_usm;
struct dma_fence *fence = NULL;
- u32 batch_size = 2;
+ u32 batch_size = 1;
u64 src_L0_ofs, dst_L0_ofs;
struct xe_sched_job *job;
struct xe_bb *bb;
u32 update_idx, pt_slot = 0;
unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE);
- unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ?
- PAGE_SIZE : 4;
+ unsigned int pitch = xe_migrate_copy_pitch(xe, len);
int err;
unsigned long i, j;
+ bool use_pde = xe_migrate_vram_use_pde(sram_addr, len + sram_offset);
- if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) ||
- (sram_offset | vram_addr) & XE_CACHELINE_MASK))
+ if (!xe->info.has_mem_copy_instr &&
+ drm_WARN_ON(&xe->drm,
+ (!IS_ALIGNED(len, pitch)) || (sram_offset | vram_addr) & XE_CACHELINE_MASK))
return ERR_PTR(-EOPNOTSUPP);
xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER);
- batch_size += pte_update_cmd_size(len);
+ batch_size += pte_update_cmd_size(npages << PAGE_SHIFT);
batch_size += EMIT_COPY_DW;
bb = xe_bb_new(gt, batch_size, use_usm_batch);
@@ -1853,7 +2103,7 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
* struct drm_pagemap_addr. Ensure this is the case even with higher
* orders.
*/
- for (i = 0; i < npages;) {
+ for (i = 0; !use_pde && i < npages;) {
unsigned int order = sram_addr[i].order;
for (j = 1; j < NR_PAGES(order) && i + j < npages; j++)
@@ -1863,16 +2113,26 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m,
i += NR_PAGES(order);
}
- build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
- sram_addr, len + sram_offset);
+ if (use_pde)
+ build_pt_update_batch_sram(m, bb, m->large_page_copy_pdes,
+ sram_addr, npages << PAGE_SHIFT, 1);
+ else
+ build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE,
+ sram_addr, npages << PAGE_SHIFT, 0);
if (dir == XE_MIGRATE_COPY_TO_VRAM) {
- src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
+ if (use_pde)
+ src_L0_ofs = m->large_page_copy_ofs + sram_offset;
+ else
+ src_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
dst_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
} else {
src_L0_ofs = xe_migrate_vram_ofs(xe, vram_addr, false);
- dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
+ if (use_pde)
+ dst_L0_ofs = m->large_page_copy_ofs + sram_offset;
+ else
+ dst_L0_ofs = xe_migrate_vm_addr(pt_slot, 0) + sram_offset;
}
bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
@@ -1918,7 +2178,7 @@ err:
*
* Copy from an array dma addresses to a VRAM device physical address
*
- * Return: dma fence for migrate to signal completion on succees, ERR_PTR on
+ * Return: dma fence for migrate to signal completion on success, ERR_PTR on
* failure
*/
struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
@@ -1939,7 +2199,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m,
*
* Copy from a VRAM device physical address to an array dma addresses
*
- * Return: dma fence for migrate to signal completion on succees, ERR_PTR on
+ * Return: dma fence for migrate to signal completion on success, ERR_PTR on
* failure
*/
struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m,
@@ -2040,8 +2300,10 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
xe_bo_assert_held(bo);
/* Use bounce buffer for small access and unaligned access */
- if (!IS_ALIGNED(len, XE_CACHELINE_BYTES) ||
- !IS_ALIGNED((unsigned long)buf + offset, XE_CACHELINE_BYTES)) {
+ if (!xe->info.has_mem_copy_instr &&
+ (!IS_ALIGNED(len, 4) ||
+ !IS_ALIGNED(page_offset, XE_CACHELINE_BYTES) ||
+ !IS_ALIGNED(offset, XE_CACHELINE_BYTES))) {
int buf_offset = 0;
void *bounce;
int err;
@@ -2103,6 +2365,7 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) +
cursor.start;
int current_bytes;
+ u32 pitch;
if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER)
current_bytes = min_t(int, bytes_left,
@@ -2110,13 +2373,13 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo,
else
current_bytes = min_t(int, bytes_left, cursor.size);
- if (current_bytes & ~PAGE_MASK) {
- int pitch = 4;
-
+ pitch = xe_migrate_copy_pitch(xe, current_bytes);
+ if (xe->info.has_mem_copy_instr)
+ current_bytes = min_t(int, current_bytes, U16_MAX * pitch);
+ else
current_bytes = min_t(int, current_bytes,
round_down(S16_MAX * pitch,
XE_CACHELINE_BYTES));
- }
__fence = xe_migrate_vram(m, current_bytes,
(unsigned long)buf & ~PAGE_MASK,
@@ -2188,6 +2451,20 @@ void xe_migrate_job_unlock(struct xe_migrate *m, struct xe_exec_queue *q)
xe_vm_assert_held(q->vm); /* User queues VM's should be locked */
}
+#if IS_ENABLED(CONFIG_PROVE_LOCKING)
+/**
+ * xe_migrate_job_lock_assert() - Assert migrate job lock held of queue
+ * @q: Migrate queue
+ */
+void xe_migrate_job_lock_assert(struct xe_exec_queue *q)
+{
+ struct xe_migrate *m = gt_to_tile(q->gt)->migrate;
+
+ xe_gt_assert(q->gt, q == m->q);
+ lockdep_assert_held(&m->job_mutex);
+}
+#endif
+
#if IS_ENABLED(CONFIG_DRM_XE_KUNIT_TEST)
#include "tests/xe_migrate.c"
#endif