diff options
Diffstat (limited to 'drivers/gpu/drm/xe/xe_migrate.c')
| -rw-r--r-- | drivers/gpu/drm/xe/xe_migrate.c | 132 |
1 files changed, 107 insertions, 25 deletions
diff --git a/drivers/gpu/drm/xe/xe_migrate.c b/drivers/gpu/drm/xe/xe_migrate.c index 3112c966c67d..56a5804726e9 100644 --- a/drivers/gpu/drm/xe/xe_migrate.c +++ b/drivers/gpu/drm/xe/xe_migrate.c @@ -699,9 +699,9 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb, } #define EMIT_COPY_DW 10 -static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, - u64 src_ofs, u64 dst_ofs, unsigned int size, - unsigned int pitch) +static void emit_xy_fast_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, + u64 dst_ofs, unsigned int size, + unsigned int pitch) { struct xe_device *xe = gt_to_xe(gt); u32 mocs = 0; @@ -730,6 +730,61 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, bb->cs[bb->len++] = upper_32_bits(src_ofs); } +#define PAGE_COPY_MODE_PS SZ_256 /* hw uses 256 bytes as the page-size */ +static void emit_mem_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs, + u64 dst_ofs, unsigned int size, unsigned int pitch) +{ + u32 mode, copy_type, width; + + xe_gt_assert(gt, IS_ALIGNED(size, pitch)); + xe_gt_assert(gt, pitch <= U16_MAX); + xe_gt_assert(gt, pitch); + xe_gt_assert(gt, size); + + if (IS_ALIGNED(size, PAGE_COPY_MODE_PS) && + IS_ALIGNED(lower_32_bits(src_ofs), PAGE_COPY_MODE_PS) && + IS_ALIGNED(lower_32_bits(dst_ofs), PAGE_COPY_MODE_PS)) { + mode = MEM_COPY_PAGE_COPY_MODE; + copy_type = 0; /* linear copy */ + width = size / PAGE_COPY_MODE_PS; + } else if (pitch > 1) { + xe_gt_assert(gt, size / pitch <= U16_MAX); + mode = 0; /* BYTE_COPY */ + copy_type = MEM_COPY_MATRIX_COPY; + width = pitch; + } else { + mode = 0; /* BYTE_COPY */ + copy_type = 0; /* linear copy */ + width = size; + } + + xe_gt_assert(gt, width <= U16_MAX); + + bb->cs[bb->len++] = MEM_COPY_CMD | mode | copy_type; + bb->cs[bb->len++] = width - 1; + bb->cs[bb->len++] = size / pitch - 1; /* ignored by hw for page-copy/linear above */ + bb->cs[bb->len++] = pitch - 1; + bb->cs[bb->len++] = pitch - 1; + bb->cs[bb->len++] = lower_32_bits(src_ofs); + bb->cs[bb->len++] = upper_32_bits(src_ofs); + bb->cs[bb->len++] = lower_32_bits(dst_ofs); + bb->cs[bb->len++] = upper_32_bits(dst_ofs); + bb->cs[bb->len++] = FIELD_PREP(MEM_COPY_SRC_MOCS_INDEX_MASK, gt->mocs.uc_index) | + FIELD_PREP(MEM_COPY_DST_MOCS_INDEX_MASK, gt->mocs.uc_index); +} + +static void emit_copy(struct xe_gt *gt, struct xe_bb *bb, + u64 src_ofs, u64 dst_ofs, unsigned int size, + unsigned int pitch) +{ + struct xe_device *xe = gt_to_xe(gt); + + if (xe->info.has_mem_copy_instr) + emit_mem_copy(gt, bb, src_ofs, dst_ofs, size, pitch); + else + emit_xy_fast_copy(gt, bb, src_ofs, dst_ofs, size, pitch); +} + static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm) { return usm ? m->usm_batch_base_ofs : m->batch_base_ofs; @@ -847,7 +902,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m, &ccs_it); while (size) { - u32 batch_size = 2; /* arb_clear() + MI_BATCH_BUFFER_END */ + u32 batch_size = 1; /* MI_BATCH_BUFFER_END */ struct xe_sched_job *job; struct xe_bb *bb; u32 flush_flags = 0; @@ -1312,7 +1367,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m, /* Calculate final sizes and batch size.. */ pte_flags = clear_vram ? PTE_UPDATE_FLAG_IS_VRAM : 0; - batch_size = 2 + + batch_size = 1 + pte_update_size(m, pte_flags, src, &src_it, &clear_L0, &clear_L0_ofs, &clear_L0_pt, clear_bo_data ? emit_clear_cmd_len(gt) : 0, 0, @@ -1798,11 +1853,15 @@ static void build_pt_update_batch_sram(struct xe_migrate *m, u32 ptes; int i = 0; + xe_tile_assert(m->tile, PAGE_ALIGNED(size)); + ptes = DIV_ROUND_UP(size, gpu_page_size); while (ptes) { u32 chunk = min(MAX_PTE_PER_SDI, ptes); - chunk = ALIGN_DOWN(chunk, PAGE_SIZE / XE_PAGE_SIZE); + if (!level) + chunk = ALIGN_DOWN(chunk, PAGE_SIZE / XE_PAGE_SIZE); + bb->cs[bb->len++] = MI_STORE_DATA_IMM | MI_SDI_NUM_QW(chunk); bb->cs[bb->len++] = pt_offset; bb->cs[bb->len++] = 0; @@ -1811,12 +1870,13 @@ static void build_pt_update_batch_sram(struct xe_migrate *m, ptes -= chunk; while (chunk--) { - u64 addr = sram_addr[i].addr & ~(gpu_page_size - 1); - u64 pte, orig_addr = addr; + u64 addr = sram_addr[i].addr; + u64 pte; xe_tile_assert(m->tile, sram_addr[i].proto == DRM_INTERCONNECT_SYSTEM); xe_tile_assert(m->tile, addr); + xe_tile_assert(m->tile, PAGE_ALIGNED(addr)); again: pte = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe, @@ -1827,7 +1887,7 @@ again: if (gpu_page_size < PAGE_SIZE) { addr += XE_PAGE_SIZE; - if (orig_addr + PAGE_SIZE != addr) { + if (!PAGE_ALIGNED(addr)) { chunk--; goto again; } @@ -1860,6 +1920,25 @@ enum xe_migrate_copy_dir { #define XE_CACHELINE_BYTES 64ull #define XE_CACHELINE_MASK (XE_CACHELINE_BYTES - 1) +static u32 xe_migrate_copy_pitch(struct xe_device *xe, u32 len) +{ + u32 pitch; + + if (IS_ALIGNED(len, PAGE_SIZE)) + pitch = PAGE_SIZE; + else if (IS_ALIGNED(len, SZ_4K)) + pitch = SZ_4K; + else if (IS_ALIGNED(len, SZ_256)) + pitch = SZ_256; + else if (IS_ALIGNED(len, 4)) + pitch = 4; + else + pitch = 1; + + xe_assert(xe, pitch > 1 || xe->info.has_mem_copy_instr); + return pitch; +} + static struct dma_fence *xe_migrate_vram(struct xe_migrate *m, unsigned long len, unsigned long sram_offset, @@ -1871,25 +1950,25 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m, struct xe_device *xe = gt_to_xe(gt); bool use_usm_batch = xe->info.has_usm; struct dma_fence *fence = NULL; - u32 batch_size = 2; + u32 batch_size = 1; u64 src_L0_ofs, dst_L0_ofs; struct xe_sched_job *job; struct xe_bb *bb; u32 update_idx, pt_slot = 0; unsigned long npages = DIV_ROUND_UP(len + sram_offset, PAGE_SIZE); - unsigned int pitch = len >= PAGE_SIZE && !(len & ~PAGE_MASK) ? - PAGE_SIZE : 4; + unsigned int pitch = xe_migrate_copy_pitch(xe, len); int err; unsigned long i, j; bool use_pde = xe_migrate_vram_use_pde(sram_addr, len + sram_offset); - if (drm_WARN_ON(&xe->drm, (len & XE_CACHELINE_MASK) || - (sram_offset | vram_addr) & XE_CACHELINE_MASK)) + if (!xe->info.has_mem_copy_instr && + drm_WARN_ON(&xe->drm, + (!IS_ALIGNED(len, pitch)) || (sram_offset | vram_addr) & XE_CACHELINE_MASK)) return ERR_PTR(-EOPNOTSUPP); xe_assert(xe, npages * PAGE_SIZE <= MAX_PREEMPTDISABLE_TRANSFER); - batch_size += pte_update_cmd_size(len); + batch_size += pte_update_cmd_size(npages << PAGE_SHIFT); batch_size += EMIT_COPY_DW; bb = xe_bb_new(gt, batch_size, use_usm_batch); @@ -1918,10 +1997,10 @@ static struct dma_fence *xe_migrate_vram(struct xe_migrate *m, if (use_pde) build_pt_update_batch_sram(m, bb, m->large_page_copy_pdes, - sram_addr, len + sram_offset, 1); + sram_addr, npages << PAGE_SHIFT, 1); else build_pt_update_batch_sram(m, bb, pt_slot * XE_PAGE_SIZE, - sram_addr, len + sram_offset, 0); + sram_addr, npages << PAGE_SHIFT, 0); if (dir == XE_MIGRATE_COPY_TO_VRAM) { if (use_pde) @@ -1981,7 +2060,7 @@ err: * * Copy from an array dma addresses to a VRAM device physical address * - * Return: dma fence for migrate to signal completion on succees, ERR_PTR on + * Return: dma fence for migrate to signal completion on success, ERR_PTR on * failure */ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m, @@ -2002,7 +2081,7 @@ struct dma_fence *xe_migrate_to_vram(struct xe_migrate *m, * * Copy from a VRAM device physical address to an array dma addresses * - * Return: dma fence for migrate to signal completion on succees, ERR_PTR on + * Return: dma fence for migrate to signal completion on success, ERR_PTR on * failure */ struct dma_fence *xe_migrate_from_vram(struct xe_migrate *m, @@ -2103,8 +2182,10 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, xe_bo_assert_held(bo); /* Use bounce buffer for small access and unaligned access */ - if (!IS_ALIGNED(len, XE_CACHELINE_BYTES) || - !IS_ALIGNED((unsigned long)buf + offset, XE_CACHELINE_BYTES)) { + if (!xe->info.has_mem_copy_instr && + (!IS_ALIGNED(len, 4) || + !IS_ALIGNED(page_offset, XE_CACHELINE_BYTES) || + !IS_ALIGNED(offset, XE_CACHELINE_BYTES))) { int buf_offset = 0; void *bounce; int err; @@ -2166,6 +2247,7 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, u64 vram_addr = vram_region_gpu_offset(bo->ttm.resource) + cursor.start; int current_bytes; + u32 pitch; if (cursor.size > MAX_PREEMPTDISABLE_TRANSFER) current_bytes = min_t(int, bytes_left, @@ -2173,13 +2255,13 @@ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, else current_bytes = min_t(int, bytes_left, cursor.size); - if (current_bytes & ~PAGE_MASK) { - int pitch = 4; - + pitch = xe_migrate_copy_pitch(xe, current_bytes); + if (xe->info.has_mem_copy_instr) + current_bytes = min_t(int, current_bytes, U16_MAX * pitch); + else current_bytes = min_t(int, current_bytes, round_down(S16_MAX * pitch, XE_CACHELINE_BYTES)); - } __fence = xe_migrate_vram(m, current_bytes, (unsigned long)buf & ~PAGE_MASK, |
