path: root/drivers/gpu/drm/xe/xe_lrc.c
Diffstat (limited to 'drivers/gpu/drm/xe/xe_lrc.c')
-rw-r--r--  drivers/gpu/drm/xe/xe_lrc.c  1182
1 file changed, 1075 insertions(+), 107 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_lrc.c b/drivers/gpu/drm/xe/xe_lrc.c
index 615bbc372ac6..b5083c99dd50 100644
--- a/drivers/gpu/drm/xe/xe_lrc.c
+++ b/drivers/gpu/drm/xe/xe_lrc.c
@@ -5,16 +5,19 @@
#include "xe_lrc.h"
+#include <generated/xe_wa_oob.h>
+
#include <linux/ascii85.h>
+#include <linux/panic.h>
#include "instructions/xe_mi_commands.h"
#include "instructions/xe_gfxpipe_commands.h"
#include "instructions/xe_gfx_state_commands.h"
#include "regs/xe_engine_regs.h"
-#include "regs/xe_gpu_commands.h"
#include "regs/xe_lrc_layout.h"
#include "xe_bb.h"
#include "xe_bo.h"
+#include "xe_configfs.h"
#include "xe_device.h"
#include "xe_drm_client.h"
#include "xe_exec_queue_types.h"
@@ -23,8 +26,11 @@
#include "xe_hw_fence.h"
#include "xe_map.h"
#include "xe_memirq.h"
+#include "xe_mmio.h"
#include "xe_sriov.h"
+#include "xe_trace_lrc.h"
#include "xe_vm.h"
+#include "xe_wa.h"
#define LRC_VALID BIT_ULL(0)
#define LRC_PRIVILEGE BIT_ULL(8)
@@ -34,20 +40,33 @@
#define LRC_ENGINE_CLASS GENMASK_ULL(63, 61)
#define LRC_ENGINE_INSTANCE GENMASK_ULL(53, 48)
-struct xe_lrc_snapshot {
- struct xe_bo *lrc_bo;
- void *lrc_snapshot;
- unsigned long lrc_size, lrc_offset;
-
- u32 context_desc;
- u32 head;
- struct {
- u32 internal;
- u32 memory;
- } tail;
- u32 start_seqno;
- u32 seqno;
-};
+#define LRC_PPHWSP_SIZE SZ_4K
+#define LRC_INDIRECT_CTX_BO_SIZE SZ_4K
+#define LRC_INDIRECT_RING_STATE_SIZE SZ_4K
+
+/*
+ * Layout of the LRC and associated data allocated as
+ * lrc->bo:
+ *
+ * Region                       Size
+ * +============================+=================================+ <- __xe_lrc_ring_offset()
+ * | Ring                       | ring_size, see                  |
+ * |                            | xe_lrc_init()                   |
+ * +============================+=================================+ <- __xe_lrc_pphwsp_offset()
+ * | PPHWSP (includes SW state) | 4K                              |
+ * +----------------------------+---------------------------------+ <- __xe_lrc_regs_offset()
+ * | Engine Context Image       | n * 4K, see                     |
+ * |                            | xe_gt_lrc_size()                |
+ * +----------------------------+---------------------------------+ <- __xe_lrc_indirect_ring_offset()
+ * | Indirect Ring State Page   | 0 or 4K, see                    |
+ * |                            | XE_LRC_FLAG_INDIRECT_RING_STATE |
+ * +============================+=================================+ <- __xe_lrc_indirect_ctx_offset()
+ * | Indirect Context Page      | 0 or 4K, see                    |
+ * |                            | XE_LRC_FLAG_INDIRECT_CTX        |
+ * +============================+=================================+ <- __xe_lrc_wa_bb_offset()
+ * | WA BB Per Ctx              | 4K                              |
+ * +============================+=================================+ <- xe_bo_size(lrc->bo)
+ */
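A note on the arithmetic behind this diagram: xe_gt_lrc_size() (added below)
already covers the PPHWSP, the engine context image and, when present, the
indirect ring state page, so the BO allocation only adds the ring, the
optional indirect context page and the trailing WA BB. A minimal sketch of
that composition, assuming the sizes shown above (the helper name is
illustrative, not part of the patch; LRC_WA_BB_SIZE presumably comes from
xe_lrc.h):

	/* Sketch only: mirrors the bo_size computation in xe_lrc_init() */
	static u32 sketch_lrc_bo_size(u32 ring_size, u32 lrc_size,
				      bool has_indirect_ctx)
	{
		/* lrc_size = PPHWSP + context image (+ indirect ring state) */
		u32 size = ring_size + lrc_size;

		if (has_indirect_ctx)
			size += LRC_INDIRECT_CTX_BO_SIZE;	/* 4K page */

		return size + LRC_WA_BB_SIZE;	/* WA BB is always last */
	}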
static struct xe_device *
lrc_to_xe(struct xe_lrc *lrc)
@@ -55,20 +74,45 @@ lrc_to_xe(struct xe_lrc *lrc)
return gt_to_xe(lrc->fence_ctx.gt);
}
-size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
+static bool
+gt_engine_needs_indirect_ctx(struct xe_gt *gt, enum xe_engine_class class)
+{
+ struct xe_device *xe = gt_to_xe(gt);
+
+ if (XE_GT_WA(gt, 16010904313) &&
+ (class == XE_ENGINE_CLASS_RENDER ||
+ class == XE_ENGINE_CLASS_COMPUTE))
+ return true;
+
+ if (xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
+ class, NULL))
+ return true;
+
+ return false;
+}
+
+size_t xe_gt_lrc_size(struct xe_gt *gt, enum xe_engine_class class)
{
+ struct xe_device *xe = gt_to_xe(gt);
+ size_t size;
+
+ /* Per-process HW status page (PPHWSP) */
+ size = LRC_PPHWSP_SIZE;
+
+ /* Engine context image */
switch (class) {
case XE_ENGINE_CLASS_RENDER:
if (GRAPHICS_VER(xe) >= 20)
- return 4 * SZ_4K;
+ size += 3 * SZ_4K;
else
- return 14 * SZ_4K;
+ size += 13 * SZ_4K;
+ break;
case XE_ENGINE_CLASS_COMPUTE:
- /* 14 pages since graphics_ver == 11 */
if (GRAPHICS_VER(xe) >= 20)
- return 3 * SZ_4K;
+ size += 2 * SZ_4K;
else
- return 14 * SZ_4K;
+ size += 13 * SZ_4K;
+ break;
default:
WARN(1, "Unknown engine class: %d", class);
fallthrough;
@@ -76,8 +120,14 @@ size_t xe_lrc_size(struct xe_device *xe, enum xe_engine_class class)
case XE_ENGINE_CLASS_VIDEO_DECODE:
case XE_ENGINE_CLASS_VIDEO_ENHANCE:
case XE_ENGINE_CLASS_OTHER:
- return 2 * SZ_4K;
+ size += 1 * SZ_4K;
}
+
+ /* Add indirect ring state page */
+ if (xe_gt_has_indirect_ring_state(gt))
+ size += LRC_INDIRECT_RING_STATE_SIZE;
+
+ return size;
}
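As a worked example of the sizing above, assuming a Xe2 GT with indirect ring
state (the numbers follow directly from the switch):

	size_t rcs = xe_gt_lrc_size(gt, XE_ENGINE_CLASS_RENDER);
	/* 4K PPHWSP + 3 * 4K image + 4K indirect ring state = 20K */
	size_t vcs = xe_gt_lrc_size(gt, XE_ENGINE_CLASS_VIDEO_DECODE);
	/* 4K PPHWSP + 1 * 4K image + 4K indirect ring state = 12K */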
/*
@@ -508,6 +558,32 @@ static const u8 xe2_xcs_offsets[] = {
0
};
+static const u8 xe2_indirect_ring_state_offsets[] = {
+ NOP(1), /* [0x00] */
+ LRI(5, POSTED), /* [0x01] */
+ REG(0x034), /* [0x02] RING_BUFFER_HEAD */
+ REG(0x030), /* [0x04] RING_BUFFER_TAIL */
+ REG(0x038), /* [0x06] RING_BUFFER_START */
+ REG(0x048), /* [0x08] RING_BUFFER_START_UDW */
+ REG(0x03c), /* [0x0a] RING_BUFFER_CONTROL */
+
+ NOP(5), /* [0x0c] */
+ LRI(9, POSTED), /* [0x11] */
+ REG(0x168), /* [0x12] BB_ADDR_UDW */
+ REG(0x140), /* [0x14] BB_ADDR */
+ REG(0x110), /* [0x16] BB_STATE */
+ REG16(0x588), /* [0x18] BB_STACK_WRITE_PORT */
+ REG16(0x588), /* [0x1a] BB_STACK_WRITE_PORT */
+ REG16(0x588), /* [0x1c] BB_STACK_WRITE_PORT */
+ REG16(0x588), /* [0x1e] BB_STACK_WRITE_PORT */
+ REG16(0x588), /* [0x20] BB_STACK_WRITE_PORT */
+ REG16(0x588), /* [0x22] BB_STACK_WRITE_PORT */
+
+ NOP(12), /* [0x24] */
+
+ 0
+};
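For readers new to these offset tables: each LRI(n, ...) entry stands for an
MI_LOAD_REGISTER_IMM header that the state image must contain, and each
REG()/REG16() for a register offset it loads; set_offsets() (unchanged by this
patch) walks the table and writes the headers and register offsets into the
image, leaving the value dwords to be filled in later, e.g. via
xe_lrc_write_indirect_ctx_reg(). Illustratively, and as a sketch rather than a
dump, the first group expands to:

	/*
	 * Dword indices into the indirect ring state page; <...> slots
	 * hold the values written at init time:
	 *
	 *   [0x00] MI_NOOP
	 *   [0x01] MI_LOAD_REGISTER_IMM, 5 regs, force-posted
	 *   [0x02] reg 0x034 (RING_BUFFER_HEAD)       [0x03] <head>
	 *   [0x04] reg 0x030 (RING_BUFFER_TAIL)       [0x05] <tail>
	 *   [0x06] reg 0x038 (RING_BUFFER_START)      [0x07] <start>
	 *   [0x08] reg 0x048 (RING_BUFFER_START_UDW)  [0x09] <start udw>
	 *   [0x0a] reg 0x03c (RING_BUFFER_CONTROL)    [0x0b] <control>
	 */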
+
#undef REG16
#undef REG
#undef LRI
@@ -546,15 +622,18 @@ static void set_context_control(u32 *regs, struct xe_hw_engine *hwe)
regs[CTX_CONTEXT_CONTROL] = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH |
CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
- /* TODO: Timestamp */
+ if (xe_gt_has_indirect_ring_state(hwe->gt))
+ regs[CTX_CONTEXT_CONTROL] |=
+ _MASKED_BIT_ENABLE(CTX_CTRL_INDIRECT_RING_STATE_ENABLE);
}
static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
{
- struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->sriov.vf.memirq;
+ struct xe_memirq *memirq = &gt_to_tile(hwe->gt)->memirq;
struct xe_device *xe = gt_to_xe(hwe->gt);
+ u8 num_regs;
- if (!IS_SRIOV_VF(xe) || !xe_device_has_memirq(xe))
+ if (!xe_device_uses_memirq(xe))
return;
regs[CTX_LRM_INT_MASK_ENABLE] = MI_LOAD_REGISTER_MEM |
@@ -562,12 +641,18 @@ static void set_memory_based_intr(u32 *regs, struct xe_hw_engine *hwe)
regs[CTX_INT_MASK_ENABLE_REG] = RING_IMR(0).addr;
regs[CTX_INT_MASK_ENABLE_PTR] = xe_memirq_enable_ptr(memirq);
- regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(2) |
+ num_regs = xe_device_has_msix(xe) ? 3 : 2;
+ regs[CTX_LRI_INT_REPORT_PTR] = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(num_regs) |
MI_LRI_LRM_CS_MMIO | MI_LRI_FORCE_POSTED;
regs[CTX_INT_STATUS_REPORT_REG] = RING_INT_STATUS_RPT_PTR(0).addr;
- regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq);
+ regs[CTX_INT_STATUS_REPORT_PTR] = xe_memirq_status_ptr(memirq, hwe);
regs[CTX_INT_SRC_REPORT_REG] = RING_INT_SRC_RPT_PTR(0).addr;
- regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq);
+ regs[CTX_INT_SRC_REPORT_PTR] = xe_memirq_source_ptr(memirq, hwe);
+
+ if (xe_device_has_msix(xe)) {
+ regs[CTX_CS_INT_VEC_REG] = CS_INT_VEC(0).addr;
+ /* CTX_CS_INT_VEC_DATA will be set in xe_lrc_init */
+ }
}
static int lrc_ring_mi_mode(struct xe_hw_engine *hwe)
@@ -589,6 +674,11 @@ static void reset_stop_ring(u32 *regs, struct xe_hw_engine *hwe)
regs[x + 1] |= STOP_RING << 16;
}
+static inline bool xe_lrc_has_indirect_ring_state(struct xe_lrc *lrc)
+{
+ return lrc->flags & XE_LRC_FLAG_INDIRECT_RING_STATE;
+}
+
static inline u32 __xe_lrc_ring_offset(struct xe_lrc *lrc)
{
return 0;
@@ -601,13 +691,26 @@ u32 xe_lrc_pphwsp_offset(struct xe_lrc *lrc)
/* Make the magic macros work */
#define __xe_lrc_pphwsp_offset xe_lrc_pphwsp_offset
+#define __xe_lrc_regs_offset xe_lrc_regs_offset
#define LRC_SEQNO_PPHWSP_OFFSET 512
#define LRC_START_SEQNO_PPHWSP_OFFSET (LRC_SEQNO_PPHWSP_OFFSET + 8)
+#define LRC_CTX_JOB_TIMESTAMP_OFFSET (LRC_START_SEQNO_PPHWSP_OFFSET + 8)
+#define LRC_ENGINE_ID_PPHWSP_OFFSET 1024
#define LRC_PARALLEL_PPHWSP_OFFSET 2048
-#define LRC_PPHWSP_SIZE SZ_4K
-static size_t lrc_reg_size(struct xe_device *xe)
+u32 xe_lrc_regs_offset(struct xe_lrc *lrc)
+{
+ return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
+}
+
+/**
+ * xe_lrc_reg_size() - Get size of the LRC register state area
+ * @xe: the &xe_device struct instance
+ *
+ * Returns: Size of the LRC register state area for the current platform
+ */
+size_t xe_lrc_reg_size(struct xe_device *xe)
{
if (GRAPHICS_VERx100(xe) >= 1250)
return 96 * sizeof(u32);
@@ -617,7 +720,7 @@ static size_t lrc_reg_size(struct xe_device *xe)
size_t xe_lrc_skip_size(struct xe_device *xe)
{
- return LRC_PPHWSP_SIZE + lrc_reg_size(xe);
+ return LRC_PPHWSP_SIZE + xe_lrc_reg_size(xe);
}
static inline u32 __xe_lrc_seqno_offset(struct xe_lrc *lrc)
@@ -632,15 +735,52 @@ static inline u32 __xe_lrc_start_seqno_offset(struct xe_lrc *lrc)
return xe_lrc_pphwsp_offset(lrc) + LRC_START_SEQNO_PPHWSP_OFFSET;
}
+static u32 __xe_lrc_ctx_job_timestamp_offset(struct xe_lrc *lrc)
+{
+ /* This is stored in the driver-defined portion of PPHWSP */
+ return xe_lrc_pphwsp_offset(lrc) + LRC_CTX_JOB_TIMESTAMP_OFFSET;
+}
+
static inline u32 __xe_lrc_parallel_offset(struct xe_lrc *lrc)
{
/* The parallel is stored in the driver-defined portion of PPHWSP */
return xe_lrc_pphwsp_offset(lrc) + LRC_PARALLEL_PPHWSP_OFFSET;
}
-static inline u32 __xe_lrc_regs_offset(struct xe_lrc *lrc)
+static inline u32 __xe_lrc_engine_id_offset(struct xe_lrc *lrc)
{
- return xe_lrc_pphwsp_offset(lrc) + LRC_PPHWSP_SIZE;
+ return xe_lrc_pphwsp_offset(lrc) + LRC_ENGINE_ID_PPHWSP_OFFSET;
+}
+
+static u32 __xe_lrc_ctx_timestamp_offset(struct xe_lrc *lrc)
+{
+ return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP * sizeof(u32);
+}
+
+static u32 __xe_lrc_ctx_timestamp_udw_offset(struct xe_lrc *lrc)
+{
+ return __xe_lrc_regs_offset(lrc) + CTX_TIMESTAMP_UDW * sizeof(u32);
+}
+
+static inline u32 __xe_lrc_indirect_ring_offset(struct xe_lrc *lrc)
+{
+ u32 offset = xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE -
+ LRC_INDIRECT_RING_STATE_SIZE;
+
+ if (lrc->flags & XE_LRC_FLAG_INDIRECT_CTX)
+ offset -= LRC_INDIRECT_CTX_BO_SIZE;
+
+ return offset;
+}
+
+static inline u32 __xe_lrc_indirect_ctx_offset(struct xe_lrc *lrc)
+{
+ return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE - LRC_INDIRECT_CTX_BO_SIZE;
+}
+
+static inline u32 __xe_lrc_wa_bb_offset(struct xe_lrc *lrc)
+{
+ return xe_bo_size(lrc->bo) - LRC_WA_BB_SIZE;
}
#define DECL_MAP_ADDR_HELPERS(elem) \
@@ -662,15 +802,120 @@ DECL_MAP_ADDR_HELPERS(pphwsp)
DECL_MAP_ADDR_HELPERS(seqno)
DECL_MAP_ADDR_HELPERS(regs)
DECL_MAP_ADDR_HELPERS(start_seqno)
+DECL_MAP_ADDR_HELPERS(ctx_job_timestamp)
+DECL_MAP_ADDR_HELPERS(ctx_timestamp)
+DECL_MAP_ADDR_HELPERS(ctx_timestamp_udw)
DECL_MAP_ADDR_HELPERS(parallel)
+DECL_MAP_ADDR_HELPERS(indirect_ring)
+DECL_MAP_ADDR_HELPERS(engine_id)
#undef DECL_MAP_ADDR_HELPERS
+/**
+ * xe_lrc_ctx_timestamp_ggtt_addr() - Get ctx timestamp GGTT address
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx timestamp GGTT address
+ */
+u32 xe_lrc_ctx_timestamp_ggtt_addr(struct xe_lrc *lrc)
+{
+ return __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
+}
+
+/**
+ * xe_lrc_ctx_timestamp_udw_ggtt_addr() - Get ctx timestamp udw GGTT address
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx timestamp udw GGTT address
+ */
+u32 xe_lrc_ctx_timestamp_udw_ggtt_addr(struct xe_lrc *lrc)
+{
+ return __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
+}
+
+/**
+ * xe_lrc_ctx_timestamp() - Read ctx timestamp value
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx timestamp value
+ */
+u64 xe_lrc_ctx_timestamp(struct xe_lrc *lrc)
+{
+ struct xe_device *xe = lrc_to_xe(lrc);
+ struct iosys_map map;
+ u32 ldw, udw = 0;
+
+ map = __xe_lrc_ctx_timestamp_map(lrc);
+ ldw = xe_map_read32(xe, &map);
+
+ if (xe->info.has_64bit_timestamp) {
+ map = __xe_lrc_ctx_timestamp_udw_map(lrc);
+ udw = xe_map_read32(xe, &map);
+ }
+
+ return (u64)udw << 32 | ldw;
+}
+
+/**
+ * xe_lrc_ctx_job_timestamp_ggtt_addr() - Get ctx job timestamp GGTT address
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx job timestamp GGTT address
+ */
+u32 xe_lrc_ctx_job_timestamp_ggtt_addr(struct xe_lrc *lrc)
+{
+ return __xe_lrc_ctx_job_timestamp_ggtt_addr(lrc);
+}
+
+/**
+ * xe_lrc_ctx_job_timestamp() - Read ctx job timestamp value
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: ctx job timestamp value
+ */
+u32 xe_lrc_ctx_job_timestamp(struct xe_lrc *lrc)
+{
+ struct xe_device *xe = lrc_to_xe(lrc);
+ struct iosys_map map;
+
+ map = __xe_lrc_ctx_job_timestamp_map(lrc);
+ return xe_map_read32(xe, &map);
+}
+
u32 xe_lrc_ggtt_addr(struct xe_lrc *lrc)
{
return __xe_lrc_pphwsp_ggtt_addr(lrc);
}
+u32 xe_lrc_indirect_ring_ggtt_addr(struct xe_lrc *lrc)
+{
+ if (!xe_lrc_has_indirect_ring_state(lrc))
+ return 0;
+
+ return __xe_lrc_indirect_ring_ggtt_addr(lrc);
+}
+
+static u32 xe_lrc_read_indirect_ctx_reg(struct xe_lrc *lrc, int reg_nr)
+{
+ struct xe_device *xe = lrc_to_xe(lrc);
+ struct iosys_map map;
+
+ map = __xe_lrc_indirect_ring_map(lrc);
+ iosys_map_incr(&map, reg_nr * sizeof(u32));
+ return xe_map_read32(xe, &map);
+}
+
+static void xe_lrc_write_indirect_ctx_reg(struct xe_lrc *lrc,
+ int reg_nr, u32 val)
+{
+ struct xe_device *xe = lrc_to_xe(lrc);
+ struct iosys_map map;
+
+ map = __xe_lrc_indirect_ring_map(lrc);
+ iosys_map_incr(&map, reg_nr * sizeof(u32));
+ xe_map_write32(xe, &map, val);
+}
+
u32 xe_lrc_read_ctx_reg(struct xe_lrc *lrc, int reg_nr)
{
struct xe_device *xe = lrc_to_xe(lrc);
@@ -693,89 +938,520 @@ void xe_lrc_write_ctx_reg(struct xe_lrc *lrc, int reg_nr, u32 val)
static void *empty_lrc_data(struct xe_hw_engine *hwe)
{
- struct xe_device *xe = gt_to_xe(hwe->gt);
+ struct xe_gt *gt = hwe->gt;
void *data;
u32 *regs;
- data = kzalloc(xe_lrc_size(xe, hwe->class), GFP_KERNEL);
+ data = kzalloc(xe_gt_lrc_size(gt, hwe->class), GFP_KERNEL);
if (!data)
return NULL;
/* 1st page: Per-Process of HW status Page */
regs = data + LRC_PPHWSP_SIZE;
- set_offsets(regs, reg_offsets(xe, hwe->class), hwe);
+ set_offsets(regs, reg_offsets(gt_to_xe(gt), hwe->class), hwe);
set_context_control(regs, hwe);
set_memory_based_intr(regs, hwe);
reset_stop_ring(regs, hwe);
+ if (xe_gt_has_indirect_ring_state(gt)) {
+ regs = data + xe_gt_lrc_size(gt, hwe->class) -
+ LRC_INDIRECT_RING_STATE_SIZE;
+ set_offsets(regs, xe2_indirect_ring_state_offsets, hwe);
+ }
return data;
}
+/**
+ * xe_default_lrc_update_memirq_regs_with_address - Re-compute GGTT references
+ * in the default LRC of the given engine.
+ * @hwe: the &xe_hw_engine struct instance
+ */
+void xe_default_lrc_update_memirq_regs_with_address(struct xe_hw_engine *hwe)
+{
+ struct xe_gt *gt = hwe->gt;
+ u32 *regs;
+
+ if (!gt->default_lrc[hwe->class])
+ return;
+
+ regs = gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE;
+ set_memory_based_intr(regs, hwe);
+}
+
+/**
+ * xe_lrc_update_memirq_regs_with_address - Re-compute GGTT references in mem
+ * interrupt data for the given LRC.
+ * @lrc: the &xe_lrc struct instance
+ * @hwe: the &xe_hw_engine struct instance
+ * @regs: scratch buffer to be used as temporary storage
+ */
+void xe_lrc_update_memirq_regs_with_address(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
+ u32 *regs)
+{
+ struct xe_gt *gt = hwe->gt;
+ struct iosys_map map;
+ size_t regs_len;
+
+ if (!xe_device_uses_memirq(gt_to_xe(gt)))
+ return;
+
+ map = __xe_lrc_regs_map(lrc);
+ regs_len = xe_lrc_reg_size(gt_to_xe(gt));
+ xe_map_memcpy_from(gt_to_xe(gt), regs, &map, 0, regs_len);
+ set_memory_based_intr(regs, hwe);
+ xe_map_memcpy_to(gt_to_xe(gt), &map, 0, regs, regs_len);
+}
+
static void xe_lrc_set_ppgtt(struct xe_lrc *lrc, struct xe_vm *vm)
{
- u64 desc = xe_vm_pdp4_descriptor(vm, lrc->tile);
+ u64 desc = xe_vm_pdp4_descriptor(vm, gt_to_tile(lrc->gt));
xe_lrc_write_ctx_reg(lrc, CTX_PDP0_UDW, upper_32_bits(desc));
xe_lrc_write_ctx_reg(lrc, CTX_PDP0_LDW, lower_32_bits(desc));
}
-#define PVC_CTX_ASID (0x2e + 1)
-#define PVC_CTX_ACC_CTR_THOLD (0x2a + 1)
+static void xe_lrc_finish(struct xe_lrc *lrc)
+{
+ xe_hw_fence_ctx_finish(&lrc->fence_ctx);
+ xe_bo_unpin_map_no_vm(lrc->bo);
+}
+
+/*
+ * setup_utilization_wa() - Write commands to wa bb to assist
+ * in calculating active context run ticks.
+ *
+ * Context Timestamp (CTX_TIMESTAMP) in the LRC accumulates the run ticks of the
+ * context, but only gets updated when the context switches out. In order to
+ * check how long a context has been active before it switches out, two things
+ * are required:
+ *
+ * (1) Determine if the context is running:
+ * To do so, we program the WA BB to set an initial value for CTX_TIMESTAMP in
+ * the LRC. The value chosen is 1 since 0 is the initial value when the LRC is
+ * initialized. During a query, we just check for this value to determine if the
+ * context is active. If the context switched out, it would overwrite this
+ * location with the actual CTX_TIMESTAMP MMIO value. Note that WA BB runs as
+ * the last part of context restore, so reusing this LRC location will not
+ * clobber anything.
+ *
+ * (2) Calculate the time that the context has been active for:
+ * The CTX_TIMESTAMP ticks only when the context is active. If a context is
+ * active, we just use the CTX_TIMESTAMP MMIO as the new value of utilization.
+ * While doing so, we need to read the CTX_TIMESTAMP MMIO for the specific
+ * engine instance. Since we do not know which instance the context is running
+ * on until it is scheduled, we also read the ENGINE_ID MMIO in the WA BB and
+ * store it in the PPHWSP.
+ */
+#define CONTEXT_ACTIVE 1ULL
+static ssize_t setup_utilization_wa(struct xe_lrc *lrc,
+ struct xe_hw_engine *hwe,
+ u32 *batch,
+ size_t max_len)
+{
+ u32 *cmd = batch;
+
+ if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
+ return -ENOSPC;
+
+ *cmd++ = MI_STORE_REGISTER_MEM | MI_SRM_USE_GGTT | MI_SRM_ADD_CS_OFFSET;
+ *cmd++ = ENGINE_ID(0).addr;
+ *cmd++ = __xe_lrc_engine_id_ggtt_addr(lrc);
+ *cmd++ = 0;
+
+ *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
+ *cmd++ = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
+ *cmd++ = 0;
+ *cmd++ = lower_32_bits(CONTEXT_ACTIVE);
+
+ if (lrc_to_xe(lrc)->info.has_64bit_timestamp) {
+ *cmd++ = MI_STORE_DATA_IMM | MI_SDI_GGTT | MI_SDI_NUM_DW(1);
+ *cmd++ = __xe_lrc_ctx_timestamp_udw_ggtt_addr(lrc);
+ *cmd++ = 0;
+ *cmd++ = upper_32_bits(CONTEXT_ACTIVE);
+ }
+
+ return cmd - batch;
+}
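The consumer of these two stores is xe_lrc_update_timestamp() at the end of
this patch; the handshake, sketched in pseudocode:

	/*
	 * Query side that pairs with the WA BB above (see
	 * xe_lrc_update_timestamp() below for the real implementation):
	 *
	 *   if (xe_lrc_ctx_timestamp(lrc) == CONTEXT_ACTIVE)
	 *           read RING_CTX_TIMESTAMP over MMIO on the engine
	 *           instance saved by the MI_SRM of ENGINE_ID above;
	 *   else
	 *           the context switched out and overwrote the marker,
	 *           so the saved CTX_TIMESTAMP is authoritative.
	 */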
+
+static ssize_t setup_timestamp_wa(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
+ u32 *batch, size_t max_len)
+{
+ const u32 ts_addr = __xe_lrc_ctx_timestamp_ggtt_addr(lrc);
+ u32 *cmd = batch;
+
+ if (!XE_GT_WA(lrc->gt, 16010904313) ||
+ !(hwe->class == XE_ENGINE_CLASS_RENDER ||
+ hwe->class == XE_ENGINE_CLASS_COMPUTE ||
+ hwe->class == XE_ENGINE_CLASS_COPY ||
+ hwe->class == XE_ENGINE_CLASS_VIDEO_DECODE ||
+ hwe->class == XE_ENGINE_CLASS_VIDEO_ENHANCE))
+ return 0;
+
+ if (xe_gt_WARN_ON(lrc->gt, max_len < 12))
+ return -ENOSPC;
+
+ *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
+ MI_LRM_ASYNC;
+ *cmd++ = RING_CTX_TIMESTAMP(0).addr;
+ *cmd++ = ts_addr;
+ *cmd++ = 0;
+
+ *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO |
+ MI_LRM_ASYNC;
+ *cmd++ = RING_CTX_TIMESTAMP(0).addr;
+ *cmd++ = ts_addr;
+ *cmd++ = 0;
+
+ *cmd++ = MI_LOAD_REGISTER_MEM | MI_LRM_USE_GGTT | MI_LRI_LRM_CS_MMIO;
+ *cmd++ = RING_CTX_TIMESTAMP(0).addr;
+ *cmd++ = ts_addr;
+ *cmd++ = 0;
+
+ return cmd - batch;
+}
+
+static ssize_t setup_configfs_post_ctx_restore_bb(struct xe_lrc *lrc,
+ struct xe_hw_engine *hwe,
+ u32 *batch, size_t max_len)
+{
+ struct xe_device *xe = gt_to_xe(lrc->gt);
+ const u32 *user_batch;
+ u32 *cmd = batch;
+ u32 count;
+
+ count = xe_configfs_get_ctx_restore_post_bb(to_pci_dev(xe->drm.dev),
+ hwe->class, &user_batch);
+ if (!count)
+ return 0;
+
+ if (count > max_len)
+ return -ENOSPC;
+
+ /*
+ * This should be used only for tests and validation. Taint the kernel
+ * as anything could be submitted directly in context switches
+ */
+ add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
+
+ memcpy(cmd, user_batch, count * sizeof(u32));
+ cmd += count;
+
+ return cmd - batch;
+}
+
+static ssize_t setup_configfs_mid_ctx_restore_bb(struct xe_lrc *lrc,
+ struct xe_hw_engine *hwe,
+ u32 *batch, size_t max_len)
+{
+ struct xe_device *xe = gt_to_xe(lrc->gt);
+ const u32 *user_batch;
+ u32 *cmd = batch;
+ u32 count;
+
+ count = xe_configfs_get_ctx_restore_mid_bb(to_pci_dev(xe->drm.dev),
+ hwe->class, &user_batch);
+ if (!count)
+ return 0;
+
+ if (count > max_len)
+ return -ENOSPC;
+
+ /*
+ * This should be used only for tests and validation. Taint the kernel
+ * as anything could be submitted directly in context switches
+ */
+ add_taint(TAINT_TEST, LOCKDEP_STILL_OK);
+
+ memcpy(cmd, user_batch, count * sizeof(u32));
+ cmd += count;
+
+ return cmd - batch;
+}
+
+static ssize_t setup_invalidate_state_cache_wa(struct xe_lrc *lrc,
+ struct xe_hw_engine *hwe,
+ u32 *batch, size_t max_len)
+{
+ u32 *cmd = batch;
+
+ if (!XE_GT_WA(lrc->gt, 18022495364) ||
+ hwe->class != XE_ENGINE_CLASS_RENDER)
+ return 0;
+
+ if (xe_gt_WARN_ON(lrc->gt, max_len < 3))
+ return -ENOSPC;
+
+ *cmd++ = MI_LOAD_REGISTER_IMM | MI_LRI_NUM_REGS(1);
+ *cmd++ = CS_DEBUG_MODE1(0).addr;
+ *cmd++ = _MASKED_BIT_ENABLE(INSTRUCTION_STATE_CACHE_INVALIDATE);
+
+ return cmd - batch;
+}
+
+struct bo_setup {
+ ssize_t (*setup)(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
+ u32 *batch, size_t max_size);
+};
+
+struct bo_setup_state {
+ /* Input: */
+ struct xe_lrc *lrc;
+ struct xe_hw_engine *hwe;
+ size_t max_size;
+ size_t reserve_dw;
+ unsigned int offset;
+ const struct bo_setup *funcs;
+ unsigned int num_funcs;
+
+ /* State: */
+ u32 *buffer;
+ u32 *ptr;
+ unsigned int written;
+};
+
+static int setup_bo(struct bo_setup_state *state)
+{
+ ssize_t remain;
+
+ if (state->lrc->bo->vmap.is_iomem) {
+ xe_gt_assert(state->hwe->gt, state->buffer);
+ state->ptr = state->buffer;
+ } else {
+ state->ptr = state->lrc->bo->vmap.vaddr + state->offset;
+ }
+
+ remain = state->max_size / sizeof(u32);
+
+ for (size_t i = 0; i < state->num_funcs; i++) {
+ ssize_t len = state->funcs[i].setup(state->lrc, state->hwe,
+ state->ptr, remain);
+
+ remain -= len;
+
+ /*
+ * Caller has asked for at least reserve_dw to remain unused.
+ */
+ if (len < 0 ||
+ xe_gt_WARN_ON(state->lrc->gt, remain < state->reserve_dw))
+ goto fail;
+
+ state->ptr += len;
+ state->written += len;
+ }
+
+ return 0;
+
+fail:
+ return -ENOSPC;
+}
+
+static void finish_bo(struct bo_setup_state *state)
+{
+ if (!state->lrc->bo->vmap.is_iomem)
+ return;
+
+ xe_map_memcpy_to(gt_to_xe(state->lrc->gt), &state->lrc->bo->vmap,
+ state->offset, state->buffer,
+ state->written * sizeof(u32));
+}
-int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
- struct xe_exec_queue *q, struct xe_vm *vm, u32 ring_size)
+/**
+ * xe_lrc_setup_wa_bb_with_scratch - Execute all wa bb setup callbacks.
+ * @lrc: the &xe_lrc struct instance
+ * @hwe: the &xe_hw_engine struct instance
+ * @scratch: preallocated scratch buffer for temporary storage
+ * Return: 0 on success, negative error code on failure
+ */
+int xe_lrc_setup_wa_bb_with_scratch(struct xe_lrc *lrc, struct xe_hw_engine *hwe, u32 *scratch)
+{
+ static const struct bo_setup funcs[] = {
+ { .setup = setup_timestamp_wa },
+ { .setup = setup_invalidate_state_cache_wa },
+ { .setup = setup_utilization_wa },
+ { .setup = setup_configfs_post_ctx_restore_bb },
+ };
+ struct bo_setup_state state = {
+ .lrc = lrc,
+ .hwe = hwe,
+ .max_size = LRC_WA_BB_SIZE,
+ .buffer = scratch,
+ .reserve_dw = 1,
+ .offset = __xe_lrc_wa_bb_offset(lrc),
+ .funcs = funcs,
+ .num_funcs = ARRAY_SIZE(funcs),
+ };
+ int ret;
+
+ ret = setup_bo(&state);
+ if (ret)
+ return ret;
+
+ *state.ptr++ = MI_BATCH_BUFFER_END;
+ state.written++;
+
+ finish_bo(&state);
+
+ xe_lrc_write_ctx_reg(lrc, CTX_BB_PER_CTX_PTR,
+ xe_bo_ggtt_addr(lrc->bo) + state.offset + 1);
+
+ return 0;
+}
+
+static int setup_wa_bb(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
+{
+ u32 *buf = NULL;
+ int ret;
+
+ if (lrc->bo->vmap.is_iomem) {
+ buf = kmalloc(LRC_WA_BB_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ }
+
+ ret = xe_lrc_setup_wa_bb_with_scratch(lrc, hwe, buf);
+
+ kfree(buf);
+
+ return ret;
+}
+
+static int
+setup_indirect_ctx(struct xe_lrc *lrc, struct xe_hw_engine *hwe)
+{
+ static const struct bo_setup rcs_funcs[] = {
+ { .setup = setup_timestamp_wa },
+ { .setup = setup_configfs_mid_ctx_restore_bb },
+ };
+ static const struct bo_setup xcs_funcs[] = {
+ { .setup = setup_configfs_mid_ctx_restore_bb },
+ };
+ struct bo_setup_state state = {
+ .lrc = lrc,
+ .hwe = hwe,
+ .max_size = (63 * 64) /* max 63 cachelines */,
+ .buffer = NULL,
+ .offset = __xe_lrc_indirect_ctx_offset(lrc),
+ };
+ int ret;
+
+ if (!(lrc->flags & XE_LRC_FLAG_INDIRECT_CTX))
+ return 0;
+
+ if (hwe->class == XE_ENGINE_CLASS_RENDER ||
+ hwe->class == XE_ENGINE_CLASS_COMPUTE) {
+ state.funcs = rcs_funcs;
+ state.num_funcs = ARRAY_SIZE(rcs_funcs);
+ } else {
+ state.funcs = xcs_funcs;
+ state.num_funcs = ARRAY_SIZE(xcs_funcs);
+ }
+
+ if (xe_gt_WARN_ON(lrc->gt, !state.funcs))
+ return 0;
+
+ if (lrc->bo->vmap.is_iomem) {
+ state.buffer = kmalloc(state.max_size, GFP_KERNEL);
+ if (!state.buffer)
+ return -ENOMEM;
+ }
+
+ ret = setup_bo(&state);
+ if (ret) {
+ kfree(state.buffer);
+ return ret;
+ }
+
+ /*
+ * Align to 64B cacheline so there's no garbage at the end for CS to
+ * execute: size for indirect ctx must be a multiple of 64.
+ */
+ while (state.written & 0xf) {
+ *state.ptr++ = MI_NOOP;
+ state.written++;
+ }
+
+ finish_bo(&state);
+ kfree(state.buffer);
+
+ /*
+ * Enable INDIRECT_CTX leaving INDIRECT_CTX_OFFSET at its default: it
+ * varies per engine class, but the default is good enough
+ */
+ xe_lrc_write_ctx_reg(lrc,
+ CTX_CS_INDIRECT_CTX,
+ (xe_bo_ggtt_addr(lrc->bo) + state.offset) |
+ /* Size in CLs. */
+ (state.written * sizeof(u32) / 64));
+
+ return 0;
+}
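One detail worth spelling out: CTX_CS_INDIRECT_CTX packs the address and the
size into a single dword. The GGTT address of this BO region is at least 64B
aligned, so its low six bits are free to carry the size in cachelines (capped
at 63, hence max_size above). For example, with 128 dwords written, the
MI_NOOP loop having padded to a 16-dword multiple, the size is
128 * 4 / 64 = 8 CLs and the register holds addr | 8, the two fields never
overlapping.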
+
+static int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
+ struct xe_vm *vm, u32 ring_size, u16 msix_vec,
+ u32 init_flags)
{
struct xe_gt *gt = hwe->gt;
+ const u32 lrc_size = xe_gt_lrc_size(gt, hwe->class);
+ u32 bo_size = ring_size + lrc_size + LRC_WA_BB_SIZE;
struct xe_tile *tile = gt_to_tile(gt);
struct xe_device *xe = gt_to_xe(gt);
struct iosys_map map;
- void *init_data = NULL;
u32 arb_enable;
+ u32 bo_flags;
int err;
+ kref_init(&lrc->refcount);
+ lrc->gt = gt;
+ lrc->size = lrc_size;
lrc->flags = 0;
+ lrc->ring.size = ring_size;
+ lrc->ring.tail = 0;
- /*
- * FIXME: Perma-pinning LRC as we don't yet support moving GGTT address
- * via VM bind calls.
- */
- lrc->bo = xe_bo_create_pin_map(xe, tile, vm,
- ring_size + xe_lrc_size(xe, hwe->class),
- ttm_bo_type_kernel,
- XE_BO_FLAG_VRAM_IF_DGFX(tile) |
- XE_BO_FLAG_GGTT |
- XE_BO_FLAG_GGTT_INVALIDATE);
+ if (gt_engine_needs_indirect_ctx(gt, hwe->class)) {
+ lrc->flags |= XE_LRC_FLAG_INDIRECT_CTX;
+ bo_size += LRC_INDIRECT_CTX_BO_SIZE;
+ }
+
+ if (xe_gt_has_indirect_ring_state(gt))
+ lrc->flags |= XE_LRC_FLAG_INDIRECT_RING_STATE;
+
+ bo_flags = XE_BO_FLAG_VRAM_IF_DGFX(tile) | XE_BO_FLAG_GGTT |
+ XE_BO_FLAG_GGTT_INVALIDATE;
+
+ if ((vm && vm->xef) || init_flags & XE_LRC_CREATE_USER_CTX) /* userspace */
+ bo_flags |= XE_BO_FLAG_PINNED_LATE_RESTORE | XE_BO_FLAG_FORCE_USER_VRAM;
+
+ lrc->bo = xe_bo_create_pin_map_novm(xe, tile,
+ bo_size,
+ ttm_bo_type_kernel,
+ bo_flags, false);
if (IS_ERR(lrc->bo))
return PTR_ERR(lrc->bo);
- lrc->tile = gt_to_tile(hwe->gt);
- lrc->ring.size = ring_size;
- lrc->ring.tail = 0;
-
xe_hw_fence_ctx_init(&lrc->fence_ctx, hwe->gt,
hwe->fence_irq, hwe->name);
- if (!gt->default_lrc[hwe->class]) {
- init_data = empty_lrc_data(hwe);
- if (!init_data) {
- err = -ENOMEM;
- goto err_lrc_finish;
- }
- }
-
/*
* Init Per-Process of HW status Page, LRC / context state to known
- * values
+ * values. If a default_lrc for this engine class has already been
+ * recorded, just copy it; otherwise this is the early submission used to
+ * record it, so build a new empty image from scratch.
*/
map = __xe_lrc_pphwsp_map(lrc);
- if (!init_data) {
+ if (gt->default_lrc[hwe->class]) {
xe_map_memset(xe, &map, 0, 0, LRC_PPHWSP_SIZE); /* PPHWSP */
xe_map_memcpy_to(xe, &map, LRC_PPHWSP_SIZE,
gt->default_lrc[hwe->class] + LRC_PPHWSP_SIZE,
- xe_lrc_size(xe, hwe->class) - LRC_PPHWSP_SIZE);
+ lrc_size - LRC_PPHWSP_SIZE);
} else {
- xe_map_memcpy_to(xe, &map, 0, init_data,
- xe_lrc_size(xe, hwe->class));
+ void *init_data = empty_lrc_data(hwe);
+
+ if (!init_data) {
+ err = -ENOMEM;
+ goto err_lrc_finish;
+ }
+
+ xe_map_memcpy_to(xe, &map, 0, init_data, lrc_size);
kfree(init_data);
}
@@ -786,13 +1462,50 @@ int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
xe_drm_client_add_bo(vm->xef->client, lrc->bo);
}
- xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
- xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
- xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
- xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
- RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
+ if (xe_device_has_msix(xe)) {
+ xe_lrc_write_ctx_reg(lrc, CTX_INT_STATUS_REPORT_PTR,
+ xe_memirq_status_ptr(&tile->memirq, hwe));
+ xe_lrc_write_ctx_reg(lrc, CTX_INT_SRC_REPORT_PTR,
+ xe_memirq_source_ptr(&tile->memirq, hwe));
+ xe_lrc_write_ctx_reg(lrc, CTX_CS_INT_VEC_DATA, msix_vec << 16 | msix_vec);
+ }
+
+ if (xe_gt_has_indirect_ring_state(gt)) {
+ xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
+ __xe_lrc_indirect_ring_ggtt_addr(lrc));
+
+ xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
+ __xe_lrc_ring_ggtt_addr(lrc));
+ xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START_UDW, 0);
+ xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, 0);
+ xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, lrc->ring.tail);
+ xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_CTL,
+ RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
+ } else {
+ xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
+ xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, 0);
+ xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, lrc->ring.tail);
+ xe_lrc_write_ctx_reg(lrc, CTX_RING_CTL,
+ RING_CTL_SIZE(lrc->ring.size) | RING_VALID);
+ }
+
+ if (init_flags & XE_LRC_CREATE_RUNALONE)
+ xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
+ xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
+ _MASKED_BIT_ENABLE(CTX_CTRL_RUN_ALONE));
+
+ if (init_flags & XE_LRC_CREATE_PXP)
+ xe_lrc_write_ctx_reg(lrc, CTX_CONTEXT_CONTROL,
+ xe_lrc_read_ctx_reg(lrc, CTX_CONTEXT_CONTROL) |
+ _MASKED_BIT_ENABLE(CTX_CTRL_PXP_ENABLE));
+
+ lrc->ctx_timestamp = 0;
+ xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP, 0);
+ if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
+ xe_lrc_write_ctx_reg(lrc, CTX_TIMESTAMP_UDW, 0);
+
if (xe->info.has_asid && vm)
- xe_lrc_write_ctx_reg(lrc, PVC_CTX_ASID, vm->usm.asid);
+ xe_lrc_write_ctx_reg(lrc, CTX_ASID, vm->usm.asid);
lrc->desc = LRC_VALID;
lrc->desc |= FIELD_PREP(LRC_ADDRESSING_MODE, LRC_LEGACY_64B_CONTEXT);
@@ -818,6 +1531,14 @@ int xe_lrc_init(struct xe_lrc *lrc, struct xe_hw_engine *hwe,
map = __xe_lrc_start_seqno_map(lrc);
xe_map_write32(lrc_to_xe(lrc), &map, lrc->fence_ctx.next_seqno - 1);
+ err = setup_wa_bb(lrc, hwe);
+ if (err)
+ goto err_lrc_finish;
+
+ err = setup_indirect_ctx(lrc, hwe);
+ if (err)
+ goto err_lrc_finish;
+
return 0;
err_lrc_finish:
@@ -825,23 +1546,108 @@ err_lrc_finish:
return err;
}
-void xe_lrc_finish(struct xe_lrc *lrc)
+/**
+ * xe_lrc_create - Create a LRC
+ * @hwe: Hardware Engine
+ * @vm: The VM (address space)
+ * @ring_size: LRC ring size
+ * @msix_vec: MSI-X interrupt vector (for platforms that support it)
+ * @flags: LRC initialization flags
+ *
+ * Allocate and initialize the Logical Ring Context (LRC).
+ *
+ * Return pointer to created LRC upon success and an error pointer
+ * upon failure.
+ */
+struct xe_lrc *xe_lrc_create(struct xe_hw_engine *hwe, struct xe_vm *vm,
+ u32 ring_size, u16 msix_vec, u32 flags)
{
- xe_hw_fence_ctx_finish(&lrc->fence_ctx);
- xe_bo_lock(lrc->bo, false);
- xe_bo_unpin(lrc->bo);
- xe_bo_unlock(lrc->bo);
- xe_bo_put(lrc->bo);
+ struct xe_lrc *lrc;
+ int err;
+
+ lrc = kzalloc(sizeof(*lrc), GFP_KERNEL);
+ if (!lrc)
+ return ERR_PTR(-ENOMEM);
+
+ err = xe_lrc_init(lrc, hwe, vm, ring_size, msix_vec, flags);
+ if (err) {
+ kfree(lrc);
+ return ERR_PTR(err);
+ }
+
+ return lrc;
+}
+
+/**
+ * xe_lrc_destroy - Destroy the LRC
+ * @ref: reference to LRC
+ *
+ * Called when ref == 0, release resources held by the Logical Ring Context
+ * (LRC) and free the LRC memory.
+ */
+void xe_lrc_destroy(struct kref *ref)
+{
+ struct xe_lrc *lrc = container_of(ref, struct xe_lrc, refcount);
+
+ xe_lrc_finish(lrc);
+ kfree(lrc);
+}
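With xe_lrc_create()/xe_lrc_destroy() in place, lifetime management is plain
kref; the matching get/put helpers presumably live in xe_lrc.h, along these
lines (hedged sketch, not part of this file):

	static inline struct xe_lrc *xe_lrc_get(struct xe_lrc *lrc)
	{
		kref_get(&lrc->refcount);
		return lrc;
	}

	static inline void xe_lrc_put(struct xe_lrc *lrc)
	{
		kref_put(&lrc->refcount, xe_lrc_destroy);
	}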
+
+/**
+ * xe_lrc_update_hwctx_regs_with_address - Re-compute GGTT references within given LRC.
+ * @lrc: the &xe_lrc struct instance
+ */
+void xe_lrc_update_hwctx_regs_with_address(struct xe_lrc *lrc)
+{
+ if (xe_lrc_has_indirect_ring_state(lrc)) {
+ xe_lrc_write_ctx_reg(lrc, CTX_INDIRECT_RING_STATE,
+ __xe_lrc_indirect_ring_ggtt_addr(lrc));
+
+ xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START,
+ __xe_lrc_ring_ggtt_addr(lrc));
+ } else {
+ xe_lrc_write_ctx_reg(lrc, CTX_RING_START, __xe_lrc_ring_ggtt_addr(lrc));
+ }
+}
+
+void xe_lrc_set_ring_tail(struct xe_lrc *lrc, u32 tail)
+{
+ if (xe_lrc_has_indirect_ring_state(lrc))
+ xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL, tail);
+ else
+ xe_lrc_write_ctx_reg(lrc, CTX_RING_TAIL, tail);
+}
+
+u32 xe_lrc_ring_tail(struct xe_lrc *lrc)
+{
+ if (xe_lrc_has_indirect_ring_state(lrc))
+ return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_TAIL) & TAIL_ADDR;
+ else
+ return xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL) & TAIL_ADDR;
+}
+
+static u32 xe_lrc_ring_start(struct xe_lrc *lrc)
+{
+ if (xe_lrc_has_indirect_ring_state(lrc))
+ return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_START);
+ else
+ return xe_lrc_read_ctx_reg(lrc, CTX_RING_START);
}
void xe_lrc_set_ring_head(struct xe_lrc *lrc, u32 head)
{
- xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
+ if (xe_lrc_has_indirect_ring_state(lrc))
+ xe_lrc_write_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD, head);
+ else
+ xe_lrc_write_ctx_reg(lrc, CTX_RING_HEAD, head);
}
u32 xe_lrc_ring_head(struct xe_lrc *lrc)
{
- return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
+ if (xe_lrc_has_indirect_ring_state(lrc))
+ return xe_lrc_read_indirect_ctx_reg(lrc, INDIRECT_CTX_RING_HEAD) & HEAD_ADDR;
+ else
+ return xe_lrc_read_ctx_reg(lrc, CTX_RING_HEAD) & HEAD_ADDR;
}
u32 xe_lrc_ring_space(struct xe_lrc *lrc)
@@ -901,10 +1707,43 @@ u32 xe_lrc_seqno_ggtt_addr(struct xe_lrc *lrc)
return __xe_lrc_seqno_ggtt_addr(lrc);
}
-struct dma_fence *xe_lrc_create_seqno_fence(struct xe_lrc *lrc)
+/**
+ * xe_lrc_alloc_seqno_fence() - Allocate an lrc seqno fence.
+ *
+ * Allocate but don't initialize an lrc seqno fence.
+ *
+ * Return: Pointer to the allocated fence or
+ * negative error pointer on error.
+ */
+struct dma_fence *xe_lrc_alloc_seqno_fence(void)
{
- return &xe_hw_fence_create(&lrc->fence_ctx,
- __xe_lrc_seqno_map(lrc))->dma;
+ return xe_hw_fence_alloc();
+}
+
+/**
+ * xe_lrc_free_seqno_fence() - Free an lrc seqno fence.
+ * @fence: Pointer to the fence to free.
+ *
+ * Frees an lrc seqno fence that hasn't yet been
+ * initialized.
+ */
+void xe_lrc_free_seqno_fence(struct dma_fence *fence)
+{
+ xe_hw_fence_free(fence);
+}
+
+/**
+ * xe_lrc_init_seqno_fence() - Initialize an lrc seqno fence.
+ * @lrc: Pointer to the lrc.
+ * @fence: Pointer to the fence to initialize.
+ *
+ * Initializes a pre-allocated lrc seqno fence.
+ * After initialization, the fence is subject to normal
+ * dma-fence refcounting.
+ */
+void xe_lrc_init_seqno_fence(struct xe_lrc *lrc, struct dma_fence *fence)
+{
+ xe_hw_fence_init(fence, &lrc->fence_ctx, __xe_lrc_seqno_map(lrc));
}
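The alloc/init split lets a caller reserve the fence while it is still allowed
to sleep or fail, and bind it to an LRC later on a path that must not fail. A
hedged usage sketch, with error handling abbreviated:

	struct dma_fence *fence;

	fence = xe_lrc_alloc_seqno_fence();	/* allocation only, may fail */
	if (IS_ERR(fence))
		return PTR_ERR(fence);

	/* ... later, on the submission path that must not fail ... */
	xe_lrc_init_seqno_fence(lrc, fence);

	/* or, if the job is abandoned before init: */
	xe_lrc_free_seqno_fence(fence);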
s32 xe_lrc_seqno(struct xe_lrc *lrc)
@@ -936,6 +1775,21 @@ struct iosys_map xe_lrc_parallel_map(struct xe_lrc *lrc)
return __xe_lrc_parallel_map(lrc);
}
+/**
+ * xe_lrc_engine_id() - Read engine id value
+ * @lrc: Pointer to the lrc.
+ *
+ * Returns: engine id value
+ */
+static u32 xe_lrc_engine_id(struct xe_lrc *lrc)
+{
+ struct xe_device *xe = lrc_to_xe(lrc);
+ struct iosys_map map;
+
+ map = __xe_lrc_engine_id_map(lrc);
+ return xe_map_read32(xe, &map);
+}
+
static int instr_dw(u32 cmd_header)
{
/* GFXPIPE "SINGLE_DW" opcodes are a single dword */
@@ -1143,6 +1997,7 @@ static int dump_gfxpipe_command(struct drm_printer *p,
MATCH3D(3DSTATE_CLIP_MESH);
MATCH3D(3DSTATE_SBE_MESH);
MATCH3D(3DSTATE_CPSIZE_CONTROL_BUFFER);
+ MATCH3D(3DSTATE_COARSE_PIXEL);
MATCH3D(3DSTATE_DRAWING_RECTANGLE);
MATCH3D(3DSTATE_CHROMA_KEY);
@@ -1214,7 +2069,7 @@ void xe_lrc_dump_default(struct drm_printer *p,
* hardware status page.
*/
dw = gt->default_lrc[hwe_class] + LRC_PPHWSP_SIZE;
- remaining_dw = (xe_lrc_size(gt_to_xe(gt), hwe_class) - LRC_PPHWSP_SIZE) / 4;
+ remaining_dw = (xe_gt_lrc_size(gt, hwe_class) - LRC_PPHWSP_SIZE) / 4;
while (remaining_dw > 0) {
if ((*dw & XE_INSTR_CMD_TYPE) == XE_INSTR_MI) {
@@ -1293,7 +2148,7 @@ static const struct instr_state xe_hpg_svg_state[] = {
{ .instr = CMD_3DSTATE_DRAWING_RECTANGLE, .num_dw = 4 },
};
-void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *bb)
+u32 *xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, u32 *cs)
{
struct xe_gt *gt = q->hwe->gt;
struct xe_device *xe = gt_to_xe(gt);
@@ -1301,22 +2156,34 @@ void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *b
int state_table_size = 0;
/*
- * At the moment we only need to emit non-register state for the RCS
- * engine.
+ * Wa_14019789679
+ *
+ * If the driver doesn't explicitly emit the SVG instructions while
+ * setting up the default LRC, the context switch will write 0's
+ * (noops) into the LRC memory rather than the expected instruction
+ * headers. Application contexts start out as a copy of the default
+ * LRC, and if they also do not emit specific settings for some SVG
+ * state, then on context restore they'll unintentionally inherit
+ * whatever state setting the previous context had programmed into the
+ * hardware (i.e., the lack of a 3DSTATE_* instruction in the LRC will
+ * prevent the hardware from resetting that state back to any specific
+ * value).
+ *
+ * The official workaround only requires emitting 3DSTATE_MESH_CONTROL
+ * since that's a specific state setting that can easily cause GPU
+ * hangs if unintentionally inherited. However to be safe we'll
+ * continue to emit all of the SVG state since it's best not to leak
+ * any of the state between contexts, even if that leakage is harmless.
*/
- if (q->hwe->class != XE_ENGINE_CLASS_RENDER)
- return;
-
- switch (GRAPHICS_VERx100(xe)) {
- case 1255:
- case 1270 ... 2004:
+ if (XE_GT_WA(gt, 14019789679) && q->hwe->class == XE_ENGINE_CLASS_RENDER) {
state_table = xe_hpg_svg_state;
state_table_size = ARRAY_SIZE(xe_hpg_svg_state);
- break;
- default:
+ }
+
+ if (!state_table) {
xe_gt_dbg(gt, "No non-register state to emit on graphics ver %d.%02d\n",
GRAPHICS_VER(xe), GRAPHICS_VERx100(xe) % 100);
- return;
+ return cs;
}
for (int i = 0; i < state_table_size; i++) {
@@ -1339,12 +2206,14 @@ void xe_lrc_emit_hwe_state_instructions(struct xe_exec_queue *q, struct xe_bb *b
instr == CMD_3DSTATE_DRAWING_RECTANGLE)
instr = CMD_3DSTATE_DRAWING_RECTANGLE_FAST;
- bb->cs[bb->len] = instr;
+ *cs = instr;
if (!is_single_dw)
- bb->cs[bb->len] |= (num_dw - 2);
+ *cs |= (num_dw - 2);
- bb->len += num_dw;
+ cs += num_dw;
}
+
+ return cs;
}
struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
@@ -1354,16 +2223,21 @@ struct xe_lrc_snapshot *xe_lrc_snapshot_capture(struct xe_lrc *lrc)
if (!snapshot)
return NULL;
- snapshot->context_desc = lower_32_bits(xe_lrc_ggtt_addr(lrc));
+ snapshot->context_desc = xe_lrc_ggtt_addr(lrc);
+ snapshot->ring_addr = __xe_lrc_ring_ggtt_addr(lrc);
+ snapshot->indirect_context_desc = xe_lrc_indirect_ring_ggtt_addr(lrc);
snapshot->head = xe_lrc_ring_head(lrc);
snapshot->tail.internal = lrc->ring.tail;
- snapshot->tail.memory = xe_lrc_read_ctx_reg(lrc, CTX_RING_TAIL);
+ snapshot->tail.memory = xe_lrc_ring_tail(lrc);
+ snapshot->start = xe_lrc_ring_start(lrc);
snapshot->start_seqno = xe_lrc_start_seqno(lrc);
snapshot->seqno = xe_lrc_seqno(lrc);
snapshot->lrc_bo = xe_bo_get(lrc->bo);
snapshot->lrc_offset = xe_lrc_pphwsp_offset(lrc);
- snapshot->lrc_size = lrc->bo->size - snapshot->lrc_offset;
+ snapshot->lrc_size = lrc->size;
snapshot->lrc_snapshot = NULL;
+ snapshot->ctx_timestamp = lower_32_bits(xe_lrc_ctx_timestamp(lrc));
+ snapshot->ctx_job_timestamp = xe_lrc_ctx_job_timestamp(lrc);
return snapshot;
}
@@ -1382,7 +2256,7 @@ void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
if (!snapshot->lrc_snapshot)
goto put_bo;
- dma_resv_lock(bo->ttm.base.resv, NULL);
+ xe_bo_lock(bo, false);
if (!ttm_bo_vmap(&bo->ttm, &src)) {
xe_map_memcpy_from(xe_bo_device(bo),
snapshot->lrc_snapshot, &src, snapshot->lrc_offset,
@@ -1392,7 +2266,7 @@ void xe_lrc_snapshot_capture_delayed(struct xe_lrc_snapshot *snapshot)
kvfree(snapshot->lrc_snapshot);
snapshot->lrc_snapshot = NULL;
}
- dma_resv_unlock(bo->ttm.base.resv);
+ xe_bo_unlock(bo);
put_bo:
xe_bo_put(bo);
}
@@ -1405,11 +2279,18 @@ void xe_lrc_snapshot_print(struct xe_lrc_snapshot *snapshot, struct drm_printer
return;
drm_printf(p, "\tHW Context Desc: 0x%08x\n", snapshot->context_desc);
+ drm_printf(p, "\tHW Ring address: 0x%08x\n",
+ snapshot->ring_addr);
+ drm_printf(p, "\tHW Indirect Ring State: 0x%08x\n",
+ snapshot->indirect_context_desc);
drm_printf(p, "\tLRC Head: (memory) %u\n", snapshot->head);
drm_printf(p, "\tLRC Tail: (internal) %u, (memory) %u\n",
snapshot->tail.internal, snapshot->tail.memory);
+ drm_printf(p, "\tRing start: (memory) 0x%08x\n", snapshot->start);
drm_printf(p, "\tStart seqno: (memory) %d\n", snapshot->start_seqno);
drm_printf(p, "\tSeqno: (memory) %d\n", snapshot->seqno);
+ drm_printf(p, "\tTimestamp: 0x%08x\n", snapshot->ctx_timestamp);
+ drm_printf(p, "\tJob Timestamp: 0x%08x\n", snapshot->ctx_job_timestamp);
if (!snapshot->lrc_snapshot)
return;
@@ -1442,5 +2323,92 @@ void xe_lrc_snapshot_free(struct xe_lrc_snapshot *snapshot)
kvfree(snapshot->lrc_snapshot);
if (snapshot->lrc_bo)
xe_bo_put(snapshot->lrc_bo);
+
kfree(snapshot);
}
+
+static int get_ctx_timestamp(struct xe_lrc *lrc, u32 engine_id, u64 *reg_ctx_ts)
+{
+ u16 class = REG_FIELD_GET(ENGINE_CLASS_ID, engine_id);
+ u16 instance = REG_FIELD_GET(ENGINE_INSTANCE_ID, engine_id);
+ struct xe_hw_engine *hwe;
+ u64 val;
+
+ hwe = xe_gt_hw_engine(lrc->gt, class, instance, false);
+ if (xe_gt_WARN_ONCE(lrc->gt, !hwe || xe_hw_engine_is_reserved(hwe),
+ "Unexpected engine class:instance %d:%d for context utilization\n",
+ class, instance))
+ return -1;
+
+ if (lrc_to_xe(lrc)->info.has_64bit_timestamp)
+ val = xe_mmio_read64_2x32(&hwe->gt->mmio,
+ RING_CTX_TIMESTAMP(hwe->mmio_base));
+ else
+ val = xe_mmio_read32(&hwe->gt->mmio,
+ RING_CTX_TIMESTAMP(hwe->mmio_base));
+
+ *reg_ctx_ts = val;
+
+ return 0;
+}
+
+/**
+ * xe_lrc_update_timestamp() - Update ctx timestamp
+ * @lrc: Pointer to the lrc.
+ * @old_ts: Old timestamp value
+ *
+ * Populate @old_ts with current saved ctx timestamp, read new ctx timestamp and
+ * update saved value. With support for active contexts, the calculation may be
+ * slightly racy, so follow a read-again logic to ensure that the context is
+ * still active before returning the right timestamp.
+ *
+ * Returns: New ctx timestamp value
+ */
+u64 xe_lrc_update_timestamp(struct xe_lrc *lrc, u64 *old_ts)
+{
+ u64 lrc_ts, reg_ts;
+ u32 engine_id;
+
+ *old_ts = lrc->ctx_timestamp;
+
+ lrc_ts = xe_lrc_ctx_timestamp(lrc);
+ /* CTX_TIMESTAMP mmio read is invalid on VF, so return the LRC value */
+ if (IS_SRIOV_VF(lrc_to_xe(lrc))) {
+ lrc->ctx_timestamp = lrc_ts;
+ goto done;
+ }
+
+ if (lrc_ts == CONTEXT_ACTIVE) {
+ engine_id = xe_lrc_engine_id(lrc);
+ if (!get_ctx_timestamp(lrc, engine_id, &reg_ts))
+ lrc->ctx_timestamp = reg_ts;
+
+ /* read lrc again to ensure context is still active */
+ lrc_ts = xe_lrc_ctx_timestamp(lrc);
+ }
+
+ /*
+ * If context switched out, just use the lrc_ts. Note that this needs to
+ * be a separate if condition.
+ */
+ if (lrc_ts != CONTEXT_ACTIVE)
+ lrc->ctx_timestamp = lrc_ts;
+
+done:
+ trace_xe_lrc_update_timestamp(lrc, *old_ts);
+
+ return lrc->ctx_timestamp;
+}
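A caller tracking utilization accumulates the delta between successive calls;
a minimal sketch (the accumulator name is illustrative):

	u64 old_ts, new_ts;

	new_ts = xe_lrc_update_timestamp(lrc, &old_ts);
	total_active_ticks += new_ts - old_ts;	/* ticks since last sample */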
+
+/**
+ * xe_lrc_ring_is_idle() - LRC is idle
+ * @lrc: Pointer to the lrc.
+ *
+ * Compare LRC ring head and tail to determine if idle.
+ *
+ * Return: True if ring is idle, False otherwise
+ */
+bool xe_lrc_ring_is_idle(struct xe_lrc *lrc)
+{
+ return xe_lrc_ring_head(lrc) == xe_lrc_ring_tail(lrc);
+}