Diffstat (limited to 'drivers/gpu/drm/i915/i915_perf.c')
-rw-r--r--  drivers/gpu/drm/i915/i915_perf.c | 1419
1 file changed, 1024 insertions(+), 395 deletions(-)
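This patch teaches i915-perf about multiple OA units (per-engine OA groups, including the media OAM unit) and about report sizes that are not powers of two. The pointer arithmetic it reworks in oa_buffer_check_unlocked() is easier to follow outside the driver; below is a minimal userspace C sketch (not kernel code, and the sample values are illustrative) of why the old alignment, hw_tail &= ~(report_size - 1), only worked while report sizes were powers of two, and how the patched code instead trims OA_TAKEN(hw_tail, tail) % report_size off the hardware tail so the new 192-byte OAM formats also work:

#include <stdio.h>

/* 16M ring, a power of two, as assumed throughout the driver */
#define OA_BUFFER_SIZE (16 * 1024 * 1024)
/* bytes available between head and tail, modulo the ring size */
#define OA_TAKEN(tail, head) (((tail) - (head)) & (OA_BUFFER_SIZE - 1))

int main(void)
{
	unsigned int head = 0, sw_tail = 0;
	unsigned int hw_tail = 448;     /* two full 192-byte reports + 64 landed bytes */
	unsigned int report_size = 192; /* e.g. I915_OAM_FORMAT_MPEC8u64_B8_C8 */

	/* size of a potentially partially-landed report between sw_tail and hw_tail */
	unsigned int partial = OA_TAKEN(hw_tail, sw_tail) % report_size;
	/* subtract the partial amount off the tail, modulo the ring size */
	unsigned int tail = OA_TAKEN(hw_tail, partial);

	printf("hw_tail=%u partial=%u tail=%u readable=%u reports\n",
	       hw_tail, partial, tail, OA_TAKEN(tail, head) / report_size);
	return 0;
}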
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 0defbb43ceea..0b9d9f3f7813 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -192,6 +192,7 @@ */ #include <linux/anon_inodes.h> +#include <linux/nospec.h> #include <linux/sizes.h> #include <linux/uuid.h> @@ -204,15 +205,21 @@ #include "gt/intel_gpu_commands.h" #include "gt/intel_gt.h" #include "gt/intel_gt_clock_utils.h" +#include "gt/intel_gt_mcr.h" +#include "gt/intel_gt_print.h" #include "gt/intel_gt_regs.h" #include "gt/intel_lrc.h" #include "gt/intel_lrc_reg.h" +#include "gt/intel_rc6.h" #include "gt/intel_ring.h" +#include "gt/uc/intel_guc_slpc.h" #include "i915_drv.h" #include "i915_file_private.h" #include "i915_perf.h" #include "i915_perf_oa_regs.h" +#include "i915_reg.h" +#include "i915_mmio_range.h" /* HW requires this to be a power of two, between 128k and 16M, though driver * is currently generally designed assuming the largest 16M size is used such @@ -286,6 +293,7 @@ static u32 i915_perf_stream_paranoid = true; #define OAREPORT_REASON_CTX_SWITCH (1<<3) #define OAREPORT_REASON_CLK_RATIO (1<<5) +#define HAS_MI_SET_PREDICATE(i915) (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 55)) /* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate * @@ -320,6 +328,14 @@ static const struct i915_oa_format oa_formats[I915_OA_FORMAT_MAX] = { [I915_OA_FORMAT_A12] = { 0, 64 }, [I915_OA_FORMAT_A12_B8_C8] = { 2, 128 }, [I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 }, + [I915_OAR_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 }, + [I915_OA_FORMAT_A24u40_A14u32_B8_C8] = { 5, 256 }, + [I915_OAM_FORMAT_MPEC8u64_B8_C8] = { 1, 192, TYPE_OAM, HDR_64_BIT }, + [I915_OAM_FORMAT_MPEC8u32_B8_C8] = { 2, 128, TYPE_OAM, HDR_64_BIT }, +}; + +static const u32 mtl_oa_base[] = { + [PERF_GROUP_OAM_SAMEDIA_0] = 0x393000, }; #define SAMPLE_OA_REPORT (1<<0) @@ -412,11 +428,17 @@ static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo) kfree(oa_bo); } +static inline const +struct i915_perf_regs *__oa_regs(struct i915_perf_stream *stream) +{ + return &stream->engine->oa_group->regs; +} + static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream) { struct intel_uncore *uncore = stream->uncore; - return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) & + return intel_uncore_read(uncore, __oa_regs(stream)->oa_tail_ptr) & GEN12_OAG_OATAILPTR_MASK; } @@ -435,6 +457,66 @@ static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream) return oastatus1 & GEN7_OASTATUS1_TAIL_MASK; } +#define oa_report_header_64bit(__s) \ + ((__s)->oa_buffer.format->header == HDR_64_BIT) + +static u64 oa_report_id(struct i915_perf_stream *stream, void *report) +{ + return oa_report_header_64bit(stream) ? *(u64 *)report : *(u32 *)report; +} + +static u64 oa_report_reason(struct i915_perf_stream *stream, void *report) +{ + return (oa_report_id(stream, report) >> OAREPORT_REASON_SHIFT) & + (GRAPHICS_VER(stream->perf->i915) == 12 ? + OAREPORT_REASON_MASK_EXTENDED : + OAREPORT_REASON_MASK); +} + +static void oa_report_id_clear(struct i915_perf_stream *stream, u32 *report) +{ + if (oa_report_header_64bit(stream)) + *(u64 *)report = 0; + else + *report = 0; +} + +static bool oa_report_ctx_invalid(struct i915_perf_stream *stream, void *report) +{ + return !(oa_report_id(stream, report) & + stream->perf->gen8_valid_ctx_bit); +} + +static u64 oa_timestamp(struct i915_perf_stream *stream, void *report) +{ + return oa_report_header_64bit(stream) ? 
+ *((u64 *)report + 1) : + *((u32 *)report + 1); +} + +static void oa_timestamp_clear(struct i915_perf_stream *stream, u32 *report) +{ + if (oa_report_header_64bit(stream)) + *(u64 *)&report[2] = 0; + else + report[1] = 0; +} + +static u32 oa_context_id(struct i915_perf_stream *stream, u32 *report) +{ + u32 ctx_id = oa_report_header_64bit(stream) ? report[4] : report[2]; + + return ctx_id & stream->specific_ctx_id_mask; +} + +static void oa_context_id_squash(struct i915_perf_stream *stream, u32 *report) +{ + if (oa_report_header_64bit(stream)) + report[4] = INVALID_CTX_ID; + else + report[2] = INVALID_CTX_ID; +} + /** * oa_buffer_check_unlocked - check for data and update tail ptr state * @stream: i915 stream instance @@ -450,8 +532,7 @@ static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream) * (See description of OA_TAIL_MARGIN_NSEC above for further details.) * * Besides returning true when there is data available to read() this function - * also updates the tail, aging_tail and aging_timestamp in the oa_buffer - * object. + * also updates the tail in the oa_buffer object. * * Note: It's safe to read OA config state here unlocked, assuming that this is * only called while the stream is enabled, while the global OA configuration @@ -462,80 +543,67 @@ static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream) static bool oa_buffer_check_unlocked(struct i915_perf_stream *stream) { u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); - int report_size = stream->oa_buffer.format_size; + int report_size = stream->oa_buffer.format->size; + u32 tail, hw_tail; unsigned long flags; bool pollin; - u32 hw_tail; - u64 now; + u32 partial_report_size; - /* We have to consider the (unlikely) possibility that read() errors + /* + * We have to consider the (unlikely) possibility that read() errors * could result in an OA buffer reset which might reset the head and * tail state. */ spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); hw_tail = stream->perf->ops.oa_hw_tail_read(stream); + hw_tail -= gtt_offset; - /* The tail pointer increases in 64 byte increments, - * not in report_size steps... + /* + * The tail pointer increases in 64 byte increments, not in report_size + * steps. Also the report size may not be a power of 2. Compute + * potentially partially landed report in the OA buffer */ - hw_tail &= ~(report_size - 1); + partial_report_size = OA_TAKEN(hw_tail, stream->oa_buffer.tail); + partial_report_size %= report_size; - now = ktime_get_mono_fast_ns(); + /* Subtract partial amount off the tail */ + hw_tail = OA_TAKEN(hw_tail, partial_report_size); - if (hw_tail == stream->oa_buffer.aging_tail && - (now - stream->oa_buffer.aging_timestamp) > OA_TAIL_MARGIN_NSEC) { - /* If the HW tail hasn't move since the last check and the HW - * tail has been aging for long enough, declare it the new - * tail. - */ - stream->oa_buffer.tail = stream->oa_buffer.aging_tail; - } else { - u32 head, tail, aged_tail; + tail = hw_tail; - /* NB: The head we observe here might effectively be a little - * out of date. If a read() is in progress, the head could be - * anywhere between this head and stream->oa_buffer.tail. - */ - head = stream->oa_buffer.head - gtt_offset; - aged_tail = stream->oa_buffer.tail - gtt_offset; - - hw_tail -= gtt_offset; - tail = hw_tail; - - /* Walk the stream backward until we find a report with dword 0 - * & 1 not at 0. 
Since the circular buffer pointers progress by - * increments of 64 bytes and that reports can be up to 256 - * bytes long, we can't tell whether a report has fully landed - * in memory before the first 2 dwords of the following report - * have effectively landed. - * - * This is assuming that the writes of the OA unit land in - * memory in the order they were written to. - * If not : (╯°□°)╯︵ ┻━┻ - */ - while (OA_TAKEN(tail, aged_tail) >= report_size) { - u32 *report32 = (void *)(stream->oa_buffer.vaddr + tail); + /* + * Walk the stream backward until we find a report with report + * id and timestamp not at 0. Since the circular buffer pointers + * progress by increments of 64 bytes and that reports can be up + * to 256 bytes long, we can't tell whether a report has fully + * landed in memory before the report id and timestamp of the + * following report have effectively landed. + * + * This is assuming that the writes of the OA unit land in + * memory in the order they were written to. + * If not : (╯°□°)╯︵ ┻━┻ + */ + while (OA_TAKEN(tail, stream->oa_buffer.tail) >= report_size) { + void *report = stream->oa_buffer.vaddr + tail; - if (report32[0] != 0 || report32[1] != 0) - break; + if (oa_report_id(stream, report) || + oa_timestamp(stream, report)) + break; - tail = (tail - report_size) & (OA_BUFFER_SIZE - 1); - } + tail = (tail - report_size) & (OA_BUFFER_SIZE - 1); + } - if (OA_TAKEN(hw_tail, tail) > report_size && - __ratelimit(&stream->perf->tail_pointer_race)) - DRM_NOTE("unlanded report(s) head=0x%x " - "tail=0x%x hw_tail=0x%x\n", - head, tail, hw_tail); + if (OA_TAKEN(hw_tail, tail) > report_size && + __ratelimit(&stream->perf->tail_pointer_race)) + drm_notice(&stream->uncore->i915->drm, + "unlanded report(s) head=0x%x tail=0x%x hw_tail=0x%x\n", + stream->oa_buffer.head, tail, hw_tail); - stream->oa_buffer.tail = gtt_offset + tail; - stream->oa_buffer.aging_tail = gtt_offset + hw_tail; - stream->oa_buffer.aging_timestamp = now; - } + stream->oa_buffer.tail = tail; - pollin = OA_TAKEN(stream->oa_buffer.tail - gtt_offset, - stream->oa_buffer.head - gtt_offset) >= report_size; + pollin = OA_TAKEN(stream->oa_buffer.tail, + stream->oa_buffer.head) >= report_size; spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); @@ -599,8 +667,10 @@ static int append_oa_sample(struct i915_perf_stream *stream, size_t *offset, const u8 *report) { - int report_size = stream->oa_buffer.format_size; + int report_size = stream->oa_buffer.format->size; struct drm_i915_perf_record_header header; + int report_size_partial; + u8 *oa_buf_end; header.type = DRM_I915_PERF_RECORD_SAMPLE; header.pad = 0; @@ -614,8 +684,20 @@ static int append_oa_sample(struct i915_perf_stream *stream, return -EFAULT; buf += sizeof(header); - if (copy_to_user(buf, report, report_size)) + oa_buf_end = stream->oa_buffer.vaddr + OA_BUFFER_SIZE; + report_size_partial = oa_buf_end - report; + + if (report_size_partial < report_size) { + if (copy_to_user(buf, report, report_size_partial)) + return -EFAULT; + buf += report_size_partial; + + if (copy_to_user(buf, stream->oa_buffer.vaddr, + report_size - report_size_partial)) + return -EFAULT; + } else if (copy_to_user(buf, report, report_size)) { return -EFAULT; + } (*offset) += header.size; @@ -649,14 +731,13 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, size_t *offset) { struct intel_uncore *uncore = stream->uncore; - int report_size = stream->oa_buffer.format_size; + int report_size = stream->oa_buffer.format->size; u8 *oa_buf_base = 
stream->oa_buffer.vaddr; u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); u32 mask = (OA_BUFFER_SIZE - 1); size_t start_offset = *offset; unsigned long flags; u32 head, tail; - u32 taken; int ret = 0; if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled)) @@ -670,66 +751,34 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); /* - * NB: oa_buffer.head/tail include the gtt_offset which we don't want - * while indexing relative to oa_buf_base. - */ - head -= gtt_offset; - tail -= gtt_offset; - - /* * An out of bounds or misaligned head or tail pointer implies a driver * bug since we validate + align the tail pointers we read from the * hardware and we are in full control of the head pointer which should - * only be incremented by multiples of the report size (notably also - * all a power of two). + * only be incremented by multiples of the report size. */ if (drm_WARN_ONCE(&uncore->i915->drm, - head > OA_BUFFER_SIZE || head % report_size || - tail > OA_BUFFER_SIZE || tail % report_size, + head > OA_BUFFER_SIZE || + tail > OA_BUFFER_SIZE, "Inconsistent OA buffer pointers: head = %u, tail = %u\n", head, tail)) return -EIO; for (/* none */; - (taken = OA_TAKEN(tail, head)); + OA_TAKEN(tail, head); head = (head + report_size) & mask) { u8 *report = oa_buf_base + head; u32 *report32 = (void *)report; u32 ctx_id; - u32 reason; - - /* - * All the report sizes factor neatly into the buffer - * size so we never expect to see a report split - * between the beginning and end of the buffer. - * - * Given the initial alignment check a misalignment - * here would imply a driver bug that would result - * in an overrun. - */ - if (drm_WARN_ON(&uncore->i915->drm, - (OA_BUFFER_SIZE - head) < report_size)) { - drm_err(&uncore->i915->drm, - "Spurious OA head ptr: non-integral report offset\n"); - break; - } + u64 reason; /* * The reason field includes flags identifying what * triggered this specific report (mostly timer * triggered or e.g. due to a context switch). - * - * This field is never expected to be zero so we can - * check that the report isn't invalid before copying - * it to userspace... */ - reason = ((report32[0] >> OAREPORT_REASON_SHIFT) & - (GRAPHICS_VER(stream->perf->i915) == 12 ? - OAREPORT_REASON_MASK_EXTENDED : - OAREPORT_REASON_MASK)); - - ctx_id = report32[2] & stream->specific_ctx_id_mask; + reason = oa_report_reason(stream, report); + ctx_id = oa_context_id(stream, report32); /* * Squash whatever is in the CTX_ID field if it's marked as @@ -738,10 +787,44 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * * Note: that we don't clear the valid_ctx_bit so userspace can * understand that the ID has been squashed by the kernel. + * + * Update: + * + * On XEHP platforms the behavior of context id valid bit has + * changed compared to prior platforms. To describe this, we + * define a few terms: + * + * context-switch-report: This is a report with the reason type + * being context-switch. It is generated when a context switches + * out. + * + * context-valid-bit: A bit that is set in the report ID field + * to indicate that a valid context has been loaded. + * + * gpu-idle: A condition characterized by a + * context-switch-report with context-valid-bit set to 0. + * + * On prior platforms, context-id-valid bit is set to 0 only + * when GPU goes idle. In all other reports, it is set to 1. 
+ * + * On XEHP platforms, context-valid-bit is set to 1 in a context + * switch report if a new context switched in. For all other + * reports it is set to 0. + * + * This change in behavior causes an issue with MMIO triggered + * reports. MMIO triggered reports have the markers in the + * context ID field and the context-valid-bit is 0. The logic + * below to squash the context ID would render the report + * useless since the user will not be able to find it in the OA + * buffer. Since MMIO triggered reports exist only on XEHP, + * we should avoid squashing these for XEHP platforms. */ - if (!(report32[0] & stream->perf->gen8_valid_ctx_bit) && - GRAPHICS_VER(stream->perf->i915) <= 11) - ctx_id = report32[2] = INVALID_CTX_ID; + + if (oa_report_ctx_invalid(stream, report) && + GRAPHICS_VER_FULL(stream->engine->i915) < IP_VER(12, 55)) { + ctx_id = INVALID_CTX_ID; + oa_context_id_squash(stream, report32); + } /* * NB: For Gen 8 the OA unit no longer supports clock gating @@ -774,7 +857,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * switches since it's not-uncommon for periodic samples to * identify a switch before any 'context switch' report. */ - if (!stream->perf->exclusive_stream->ctx || + if (!stream->ctx || stream->specific_ctx_id == ctx_id || stream->oa_buffer.last_ctx_id == stream->specific_ctx_id || reason & OAREPORT_REASON_CTX_SWITCH) { @@ -783,9 +866,9 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * While filtering for a single context we avoid * leaking the IDs of other contexts. */ - if (stream->perf->exclusive_stream->ctx && + if (stream->ctx && stream->specific_ctx_id != ctx_id) { - report32[2] = INVALID_CTX_ID; + oa_context_id_squash(stream, report32); } ret = append_oa_sample(stream, buf, count, offset, @@ -796,19 +879,34 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, stream->oa_buffer.last_ctx_id = ctx_id; } - /* - * Clear out the first 2 dword as a mean to detect unlanded - * reports. - */ - report32[0] = 0; - report32[1] = 0; + if (is_power_of_2(report_size)) { + /* + * Clear out the report id and timestamp as a means + * to detect unlanded reports. + */ + oa_report_id_clear(stream, report32); + oa_timestamp_clear(stream, report32); + } else { + u8 *oa_buf_end = stream->oa_buffer.vaddr + + OA_BUFFER_SIZE; + u32 part = oa_buf_end - (u8 *)report32; + + /* Zero out the entire report */ + if (report_size <= part) { + memset(report32, 0, report_size); + } else { + memset(report32, 0, part); + memset(oa_buf_base, 0, report_size - part); + } + } } if (start_offset != *offset) { i915_reg_t oaheadptr; oaheadptr = GRAPHICS_VER(stream->perf->i915) == 12 ? - GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR; + __oa_regs(stream)->oa_head_ptr : + GEN8_OAHEADPTR; spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); @@ -816,9 +914,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, * We removed the gtt_offset for the copy loop above, indexing * relative to oa_buf_base so put back here... */ - head += gtt_offset; intel_uncore_write(uncore, oaheadptr, - head & GEN12_OAG_OAHEADPTR_MASK); + (head + gtt_offset) & GEN12_OAG_OAHEADPTR_MASK); stream->oa_buffer.head = head; spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); @@ -861,7 +958,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream, return -EIO; oastatus_reg = GRAPHICS_VER(stream->perf->i915) == 12 ? 
- GEN12_OAG_OASTATUS : GEN8_OASTATUS; + __oa_regs(stream)->oa_status : + GEN8_OASTATUS; oastatus = intel_uncore_read(uncore, oastatus_reg); @@ -943,14 +1041,13 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, size_t *offset) { struct intel_uncore *uncore = stream->uncore; - int report_size = stream->oa_buffer.format_size; + int report_size = stream->oa_buffer.format->size; u8 *oa_buf_base = stream->oa_buffer.vaddr; u32 gtt_offset = i915_ggtt_offset(stream->oa_buffer.vma); u32 mask = (OA_BUFFER_SIZE - 1); size_t start_offset = *offset; unsigned long flags; u32 head, tail; - u32 taken; int ret = 0; if (drm_WARN_ON(&uncore->i915->drm, !stream->enabled)) @@ -963,12 +1060,6 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); - /* NB: oa_buffer.head/tail include the gtt_offset which we don't want - * while indexing relative to oa_buf_base. - */ - head -= gtt_offset; - tail -= gtt_offset; - /* An out of bounds or misaligned head or tail pointer implies a driver * bug since we validate + align the tail pointers we read from the * hardware and we are in full control of the head pointer which should @@ -984,7 +1075,7 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, for (/* none */; - (taken = OA_TAKEN(tail, head)); + OA_TAKEN(tail, head); head = (head + report_size) & mask) { u8 *report = oa_buf_base + head; u32 *report32 = (void *)report; @@ -1012,7 +1103,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, */ if (report32[0] == 0) { if (__ratelimit(&stream->perf->spurious_report_rs)) - DRM_NOTE("Skipping spurious, invalid OA report\n"); + drm_notice(&uncore->i915->drm, + "Skipping spurious, invalid OA report\n"); continue; } @@ -1030,13 +1122,8 @@ static int gen7_append_oa_reports(struct i915_perf_stream *stream, if (start_offset != *offset) { spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); - /* We removed the gtt_offset for the copy loop above, indexing - * relative to oa_buf_base so put back here... 
- */ - head += gtt_offset; - intel_uncore_write(uncore, GEN7_OASTATUS2, - (head & GEN7_OASTATUS2_HEAD_MASK) | + ((head + gtt_offset) & GEN7_OASTATUS2_HEAD_MASK) | GEN7_OASTATUS2_MEM_SELECT_GGTT); stream->oa_buffer.head = head; @@ -1233,6 +1320,199 @@ retry: return stream->pinned_ctx; } +static int +__store_reg_to_mem(struct i915_request *rq, i915_reg_t reg, u32 ggtt_offset) +{ + u32 *cs, cmd; + + cmd = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + if (GRAPHICS_VER(rq->i915) >= 8) + cmd++; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = cmd; + *cs++ = i915_mmio_reg_offset(reg); + *cs++ = ggtt_offset; + *cs++ = 0; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int +__read_reg(struct intel_context *ce, i915_reg_t reg, u32 ggtt_offset) +{ + struct i915_request *rq; + int err; + + rq = i915_request_create(ce); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + i915_request_get(rq); + + err = __store_reg_to_mem(rq, reg, ggtt_offset); + + i915_request_add(rq); + if (!err && i915_request_wait(rq, 0, HZ / 2) < 0) + err = -ETIME; + + i915_request_put(rq); + + return err; +} + +static int +gen12_guc_sw_ctx_id(struct intel_context *ce, u32 *ctx_id) +{ + struct i915_vma *scratch; + u32 *val; + int err; + + scratch = __vm_create_scratch_for_read_pinned(&ce->engine->gt->ggtt->vm, 4); + if (IS_ERR(scratch)) + return PTR_ERR(scratch); + + err = i915_vma_sync(scratch); + if (err) + goto err_scratch; + + err = __read_reg(ce, RING_EXECLIST_STATUS_HI(ce->engine->mmio_base), + i915_ggtt_offset(scratch)); + if (err) + goto err_scratch; + + val = i915_gem_object_pin_map_unlocked(scratch->obj, I915_MAP_WB); + if (IS_ERR(val)) { + err = PTR_ERR(val); + goto err_scratch; + } + + *ctx_id = *val; + i915_gem_object_unpin_map(scratch->obj); + +err_scratch: + i915_vma_unpin_and_release(&scratch, 0); + return err; +} + +/* + * For execlist mode of submission, pick an unused context id + * 0 - (NUM_CONTEXT_TAG -1) are used by other contexts + * XXX_MAX_CONTEXT_HW_ID is used by idle context + * + * For GuC mode of submission read context id from the upper dword of the + * EXECLIST_STATUS register. Note that we read this value only once and expect + * that the value stays fixed for the entire OA use case. There are cases where + * GuC KMD implementation may deregister a context to reuse it's context id, but + * we prevent that from happening to the OA context by pinning it. 
+ */ +static int gen12_get_render_context_id(struct i915_perf_stream *stream) +{ + u32 ctx_id, mask; + int ret; + + if (intel_engine_uses_guc(stream->engine)) { + ret = gen12_guc_sw_ctx_id(stream->pinned_ctx, &ctx_id); + if (ret) + return ret; + + mask = ((1U << GEN12_GUC_SW_CTX_ID_WIDTH) - 1) << + (GEN12_GUC_SW_CTX_ID_SHIFT - 32); + } else if (GRAPHICS_VER_FULL(stream->engine->i915) >= IP_VER(12, 55)) { + ctx_id = (XEHP_MAX_CONTEXT_HW_ID - 1) << + (XEHP_SW_CTX_ID_SHIFT - 32); + + mask = ((1U << XEHP_SW_CTX_ID_WIDTH) - 1) << + (XEHP_SW_CTX_ID_SHIFT - 32); + } else { + ctx_id = (GEN12_MAX_CONTEXT_HW_ID - 1) << + (GEN11_SW_CTX_ID_SHIFT - 32); + + mask = ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << + (GEN11_SW_CTX_ID_SHIFT - 32); + } + stream->specific_ctx_id = ctx_id & mask; + stream->specific_ctx_id_mask = mask; + + return 0; +} + +static bool oa_find_reg_in_lri(u32 *state, u32 reg, u32 *offset, u32 end) +{ + u32 idx = *offset; + u32 len = min(MI_LRI_LEN(state[idx]) + idx, end); + bool found = false; + + idx++; + for (; idx < len; idx += 2) { + if (state[idx] == reg) { + found = true; + break; + } + } + + *offset = idx; + return found; +} + +static u32 oa_context_image_offset(struct intel_context *ce, u32 reg) +{ + u32 offset, len = (ce->engine->context_size - PAGE_SIZE) / 4; + u32 *state = ce->lrc_reg_state; + + if (drm_WARN_ON(&ce->engine->i915->drm, !state)) + return U32_MAX; + + for (offset = 0; offset < len; ) { + if (IS_MI_LRI_CMD(state[offset])) { + /* + * We expect reg-value pairs in MI_LRI command, so + * MI_LRI_LEN() should be even, if not, issue a warning. + */ + drm_WARN_ON(&ce->engine->i915->drm, + MI_LRI_LEN(state[offset]) & 0x1); + + if (oa_find_reg_in_lri(state, reg, &offset, len)) + break; + } else { + offset++; + } + } + + return offset < len ? offset : U32_MAX; +} + +static int set_oa_ctx_ctrl_offset(struct intel_context *ce) +{ + i915_reg_t reg = GEN12_OACTXCONTROL(ce->engine->mmio_base); + struct i915_perf *perf = &ce->engine->i915->perf; + u32 offset = perf->ctx_oactxctrl_offset; + + /* Do this only once. Failure is stored as offset of U32_MAX */ + if (offset) + goto exit; + + offset = oa_context_image_offset(ce, i915_mmio_reg_offset(reg)); + perf->ctx_oactxctrl_offset = offset; + + drm_dbg(&ce->engine->i915->drm, + "%s oa ctx control at 0x%08x dword offset\n", + ce->engine->name, offset); + +exit: + return offset && offset != U32_MAX ? 0 : -ENODEV; +} + +static bool engine_supports_mi_query(struct intel_engine_cs *engine) +{ + return engine->class == RENDER_CLASS; +} + /** * oa_get_render_ctx_id - determine and hold ctx hw id * @stream: An i915-perf stream opened for OA metrics @@ -1246,11 +1526,28 @@ retry: static int oa_get_render_ctx_id(struct i915_perf_stream *stream) { struct intel_context *ce; + int ret = 0; ce = oa_pin_context(stream); if (IS_ERR(ce)) return PTR_ERR(ce); + if (engine_supports_mi_query(stream->engine) && + HAS_LOGICAL_RING_CONTEXTS(stream->perf->i915)) { + /* + * We are enabling perf query here. If we don't find the context + * offset here, just return an error. 
+ */ + ret = set_oa_ctx_ctrl_offset(ce); + if (ret) { + intel_context_unpin(ce); + drm_err(&stream->perf->i915->drm, + "Enabling perf query failed for %s\n", + stream->engine->name); + return ret; + } + } + switch (GRAPHICS_VER(ce->engine->i915)) { case 7: { /* @@ -1292,24 +1589,7 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream) case 11: case 12: - if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 50)) { - stream->specific_ctx_id_mask = - ((1U << XEHP_SW_CTX_ID_WIDTH) - 1) << - (XEHP_SW_CTX_ID_SHIFT - 32); - stream->specific_ctx_id = - (XEHP_MAX_CONTEXT_HW_ID - 1) << - (XEHP_SW_CTX_ID_SHIFT - 32); - } else { - stream->specific_ctx_id_mask = - ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32); - /* - * Pick an unused context id - * 0 - BITS_PER_LONG are used by other contexts - * GEN12_MAX_CONTEXT_HW_ID (0x7ff) is used by idle context - */ - stream->specific_ctx_id = - (GEN12_MAX_CONTEXT_HW_ID - 1) << (GEN11_SW_CTX_ID_SHIFT - 32); - } + ret = gen12_get_render_context_id(stream); break; default: @@ -1323,7 +1603,7 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream) stream->specific_ctx_id, stream->specific_ctx_id_mask); - return 0; + return ret; } /** @@ -1372,11 +1652,24 @@ free_noa_wait(struct i915_perf_stream *stream) i915_vma_unpin_and_release(&stream->noa_wait, 0); } +static bool engine_supports_oa(const struct intel_engine_cs *engine) +{ + return engine->oa_group; +} + +static bool engine_supports_oa_format(struct intel_engine_cs *engine, int type) +{ + return engine->oa_group && engine->oa_group->type == type; +} + static void i915_oa_stream_destroy(struct i915_perf_stream *stream) { struct i915_perf *perf = stream->perf; + struct intel_gt *gt = stream->engine->gt; + struct i915_perf_group *g = stream->engine->oa_group; + int m; - if (WARN_ON(stream != perf->exclusive_stream)) + if (WARN_ON(stream != g->exclusive_stream)) return; /* @@ -1385,7 +1678,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) * * See i915_oa_init_reg_state() and lrc_configure_all_contexts() */ - WRITE_ONCE(perf->exclusive_stream, NULL); + WRITE_ONCE(g->exclusive_stream, NULL); perf->ops.disable_metric_set(stream); free_oa_buffer(stream); @@ -1399,10 +1692,9 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream) free_oa_configs(stream); free_noa_wait(stream); - if (perf->spurious_report_rs.missed) { - DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n", - perf->spurious_report_rs.missed); - } + m = ratelimit_state_get_miss(&perf->spurious_report_rs); + if (m) + gt_notice(gt, "%d spurious OA report notices suppressed due to ratelimiting\n", m); } static void gen7_init_oa_buffer(struct i915_perf_stream *stream) @@ -1418,7 +1710,7 @@ static void gen7_init_oa_buffer(struct i915_perf_stream *stream) */ intel_uncore_write(uncore, GEN7_OASTATUS2, /* head */ gtt_offset | GEN7_OASTATUS2_MEM_SELECT_GGTT); - stream->oa_buffer.head = gtt_offset; + stream->oa_buffer.head = 0; intel_uncore_write(uncore, GEN7_OABUFFER, gtt_offset); @@ -1426,8 +1718,7 @@ static void gen7_init_oa_buffer(struct i915_perf_stream *stream) gtt_offset | OABUFFER_SIZE_16M); /* Mark that we need updated tail pointers to read from... 
*/ - stream->oa_buffer.aging_tail = INVALID_TAIL_PTR; - stream->oa_buffer.tail = gtt_offset; + stream->oa_buffer.tail = 0; spin_unlock_irqrestore(&stream->oa_buffer.ptr_lock, flags); @@ -1461,7 +1752,7 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream) intel_uncore_write(uncore, GEN8_OASTATUS, 0); intel_uncore_write(uncore, GEN8_OAHEADPTR, gtt_offset); - stream->oa_buffer.head = gtt_offset; + stream->oa_buffer.head = 0; intel_uncore_write(uncore, GEN8_OABUFFER_UDW, 0); @@ -1478,8 +1769,7 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream) intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK); /* Mark that we need updated tail pointers to read from... */ - stream->oa_buffer.aging_tail = INVALID_TAIL_PTR; - stream->oa_buffer.tail = gtt_offset; + stream->oa_buffer.tail = 0; /* * Reset state used to recognise context switches, affecting which @@ -1513,10 +1803,10 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream) spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); - intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0); - intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR, + intel_uncore_write(uncore, __oa_regs(stream)->oa_status, 0); + intel_uncore_write(uncore, __oa_regs(stream)->oa_head_ptr, gtt_offset & GEN12_OAG_OAHEADPTR_MASK); - stream->oa_buffer.head = gtt_offset; + stream->oa_buffer.head = 0; /* * PRM says: @@ -1526,14 +1816,13 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream) * to enable proper functionality of the overflow * bit." */ - intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset | + intel_uncore_write(uncore, __oa_regs(stream)->oa_buffer, gtt_offset | OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT); - intel_uncore_write(uncore, GEN12_OAG_OATAILPTR, + intel_uncore_write(uncore, __oa_regs(stream)->oa_tail_ptr, gtt_offset & GEN12_OAG_OATAILPTR_MASK); /* Mark that we need updated tail pointers to read from... */ - stream->oa_buffer.aging_tail = INVALID_TAIL_PTR; - stream->oa_buffer.tail = gtt_offset; + stream->oa_buffer.tail = 0; /* * Reset state used to recognise context switches, affecting which @@ -1563,6 +1852,7 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream) static int alloc_oa_buffer(struct i915_perf_stream *stream) { struct drm_i915_private *i915 = stream->perf->i915; + struct intel_gt *gt = stream->engine->gt; struct drm_i915_gem_object *bo; struct i915_vma *vma; int ret; @@ -1582,11 +1872,22 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream) i915_gem_object_set_cache_coherency(bo, I915_CACHE_LLC); /* PreHSW required 512K alignment, HSW requires 16M */ - vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, 0); + vma = i915_vma_instance(bo, >->ggtt->vm, NULL); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto err_unref; } + + /* + * PreHSW required 512K alignment. + * HSW and onwards, align to requested size of OA buffer. 
+ */ + ret = i915_vma_pin(vma, 0, SZ_16M, PIN_GLOBAL | PIN_HIGH); + if (ret) { + gt_err(gt, "Failed to pin OA buffer %d\n", ret); + goto err_unref; + } + stream->oa_buffer.vma = vma; stream->oa_buffer.vaddr = @@ -1625,8 +1926,7 @@ static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs, for (d = 0; d < dword_count; d++) { *cs++ = cmd; *cs++ = i915_mmio_reg_offset(reg) + 4 * d; - *cs++ = intel_gt_scratch_offset(stream->engine->gt, - offset) + 4 * d; + *cs++ = i915_ggtt_offset(stream->noa_wait) + offset + 4 * d; *cs++ = 0; } @@ -1636,6 +1936,7 @@ static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs, static int alloc_noa_wait(struct i915_perf_stream *stream) { struct drm_i915_private *i915 = stream->perf->i915; + struct intel_gt *gt = stream->engine->gt; struct drm_i915_gem_object *bo; struct i915_vma *vma; const u64 delay_ticks = 0xffffffffffffffff - @@ -1654,8 +1955,17 @@ static int alloc_noa_wait(struct i915_perf_stream *stream) DELTA_TARGET, N_CS_GPR }; + i915_reg_t mi_predicate_result = HAS_MI_SET_PREDICATE(i915) ? + MI_PREDICATE_RESULT_2_ENGINE(base) : + MI_PREDICATE_RESULT_1(RENDER_RING_BASE); - bo = i915_gem_object_create_internal(i915, 4096); + /* + * gt->scratch was being used to save/restore the GPR registers, but on + * MTL the scratch uses stolen lmem. An MI_SRM to this memory region + * causes an engine hang. Instead allocate an additional page here to + * save/restore GPR registers + */ + bo = i915_gem_object_create_internal(i915, 8192); if (IS_ERR(bo)) { drm_err(&i915->drm, "Failed to allocate NOA wait batchbuffer\n"); @@ -1673,26 +1983,35 @@ retry: * multiple OA config BOs will have a jump to this address and it * needs to be fixed during the lifetime of the i915/perf stream. */ - vma = i915_gem_object_ggtt_pin_ww(bo, &ww, NULL, 0, 0, PIN_HIGH); + vma = i915_vma_instance(bo, >->ggtt->vm, NULL); if (IS_ERR(vma)) { ret = PTR_ERR(vma); goto out_ww; } + ret = i915_vma_pin_ww(vma, &ww, 0, 0, PIN_GLOBAL | PIN_HIGH); + if (ret) + goto out_ww; + batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB); if (IS_ERR(batch)) { ret = PTR_ERR(batch); goto err_unpin; } + stream->noa_wait = vma; + +#define GPR_SAVE_OFFSET 4096 +#define PREDICATE_SAVE_OFFSET 4160 + /* Save registers. */ for (i = 0; i < N_CS_GPR; i++) cs = save_restore_register( stream, cs, true /* save */, CS_GPR(i), - INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); + GPR_SAVE_OFFSET + 8 * i, 2); cs = save_restore_register( - stream, cs, true /* save */, MI_PREDICATE_RESULT_1(RENDER_RING_BASE), - INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); + stream, cs, true /* save */, mi_predicate_result, + PREDICATE_SAVE_OFFSET, 1); /* First timestamp snapshot location. */ ts0 = cs; @@ -1745,7 +2064,10 @@ retry: */ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE)); - *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(mi_predicate_result); + + if (HAS_MI_SET_PREDICATE(i915)) + *cs++ = MI_SET_PREDICATE | 1; /* Restart from the beginning if we had timestamps roll over. */ *cs++ = (GRAPHICS_VER(i915) < 8 ? 
@@ -1755,6 +2077,9 @@ retry: *cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4; *cs++ = 0; + if (HAS_MI_SET_PREDICATE(i915)) + *cs++ = MI_SET_PREDICATE; + /* * Now add the diff between to previous timestamps and add it to : * (((1 * << 64) - 1) - delay_ns) @@ -1782,7 +2107,10 @@ retry: */ *cs++ = MI_LOAD_REGISTER_REG | (3 - 2); *cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE)); - *cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1(RENDER_RING_BASE)); + *cs++ = i915_mmio_reg_offset(mi_predicate_result); + + if (HAS_MI_SET_PREDICATE(i915)) + *cs++ = MI_SET_PREDICATE | 1; /* Predicate the jump. */ *cs++ = (GRAPHICS_VER(i915) < 8 ? @@ -1792,14 +2120,17 @@ retry: *cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4; *cs++ = 0; + if (HAS_MI_SET_PREDICATE(i915)) + *cs++ = MI_SET_PREDICATE; + /* Restore registers. */ for (i = 0; i < N_CS_GPR; i++) cs = save_restore_register( stream, cs, false /* restore */, CS_GPR(i), - INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2); + GPR_SAVE_OFFSET + 8 * i, 2); cs = save_restore_register( - stream, cs, false /* restore */, MI_PREDICATE_RESULT_1(RENDER_RING_BASE), - INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1); + stream, cs, false /* restore */, mi_predicate_result, + PREDICATE_SAVE_OFFSET, 1); /* And return to the ring. */ *cs++ = MI_BATCH_BUFFER_END; @@ -1809,7 +2140,6 @@ retry: i915_gem_object_flush_map(bo); __i915_gem_object_release_map(bo); - stream->noa_wait = vma; goto out_ww; err_unpin: @@ -2017,14 +2347,12 @@ retry: goto err_add_request; } - err = i915_request_await_object(rq, vma->obj, 0); - if (!err) - err = i915_vma_move_to_active(vma, rq, 0); + err = i915_vma_move_to_active(vma, rq, 0); if (err) goto err_add_request; err = rq->engine->emit_bb_start(rq, - vma->node.start, 0, + i915_vma_offset(vma), 0, I915_DISPATCH_SECURE); if (err) goto err_add_request; @@ -2249,7 +2577,8 @@ err_add_request: return err; } -static int gen8_configure_context(struct i915_gem_context *ctx, +static int gen8_configure_context(struct i915_perf_stream *stream, + struct i915_gem_context *ctx, struct flex *flex, unsigned int count) { struct i915_gem_engines_iter it; @@ -2283,11 +2612,12 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream, { int err; struct intel_context *ce = stream->pinned_ctx; - u32 format = stream->oa_buffer.format; + u32 format = stream->oa_buffer.format->format; + u32 offset = stream->perf->ctx_oactxctrl_offset; struct flex regs_context[] = { { GEN8_OACTXCONTROL, - stream->perf->ctx_oactxctrl_offset + 1, + offset + 1, active ? GEN8_OA_COUNTER_RESUME : 0, }, }; @@ -2312,12 +2642,13 @@ static int gen12_configure_oar_context(struct i915_perf_stream *stream, }, }; - /* Modify the context image of pinned context with regs_context*/ + /* Modify the context image of pinned context with regs_context */ err = intel_context_lock_pinned(ce); if (err) return err; - err = gen8_modify_context(ce, regs_context, ARRAY_SIZE(regs_context)); + err = gen8_modify_context(ce, regs_context, + ARRAY_SIZE(regs_context)); intel_context_unlock_pinned(ce); if (err) return err; @@ -2359,10 +2690,11 @@ oa_configure_all_contexts(struct i915_perf_stream *stream, { struct drm_i915_private *i915 = stream->perf->i915; struct intel_engine_cs *engine; + struct intel_gt *gt = stream->engine->gt; struct i915_gem_context *ctx, *cn; int err; - lockdep_assert_held(&stream->perf->lock); + lockdep_assert_held(>->perf.lock); /* * The OA register config is setup through the context image. 
This image @@ -2387,7 +2719,7 @@ oa_configure_all_contexts(struct i915_perf_stream *stream, spin_unlock(&i915->gem.contexts.lock); - err = gen8_configure_context(ctx, regs, num_regs); + err = gen8_configure_context(stream, ctx, regs, num_regs); if (err) { i915_gem_context_put(ctx); return err; @@ -2421,27 +2753,11 @@ oa_configure_all_contexts(struct i915_perf_stream *stream, } static int -gen12_configure_all_contexts(struct i915_perf_stream *stream, - const struct i915_oa_config *oa_config, - struct i915_active *active) -{ - struct flex regs[] = { - { - GEN8_R_PWR_CLK_STATE(RENDER_RING_BASE), - CTX_R_PWR_CLK_STATE, - }, - }; - - return oa_configure_all_contexts(stream, - regs, ARRAY_SIZE(regs), - active); -} - -static int lrc_configure_all_contexts(struct i915_perf_stream *stream, const struct i915_oa_config *oa_config, struct i915_active *active) { + u32 ctx_oactxctrl = stream->perf->ctx_oactxctrl_offset; /* The MMIO offsets for Flex EU registers aren't contiguous */ const u32 ctx_flexeu0 = stream->perf->ctx_flexeu0_offset; #define ctx_flexeuN(N) (ctx_flexeu0 + 2 * (N) + 1) @@ -2452,7 +2768,7 @@ lrc_configure_all_contexts(struct i915_perf_stream *stream, }, { GEN8_OACTXCONTROL, - stream->perf->ctx_oactxctrl_offset + 1, + ctx_oactxctrl + 1, }, { EU_PERF_CNTL0, ctx_flexeuN(0) }, { EU_PERF_CNTL1, ctx_flexeuN(1) }, @@ -2540,13 +2856,26 @@ static int gen12_enable_metric_set(struct i915_perf_stream *stream, struct i915_active *active) { + struct drm_i915_private *i915 = stream->perf->i915; struct intel_uncore *uncore = stream->uncore; - struct i915_oa_config *oa_config = stream->oa_config; bool periodic = stream->periodic; u32 period_exponent = stream->period_exponent; + u32 sqcnt1; int ret; - intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG, + /* + * Wa_1508761755 + * EU NOA signals behave incorrectly if EU clock gating is enabled. + * Disable thread stall DOP gating and EU DOP gating. + */ + if (IS_DG2(i915)) { + intel_gt_mcr_multicast_write(uncore->gt, GEN8_ROW_CHICKEN, + _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE)); + intel_uncore_write(uncore, GEN7_ROW_CHICKEN2, + _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING)); + } + + intel_uncore_write(uncore, __oa_regs(stream)->oa_debug, /* Disable clk ratio reports, like previous Gens. */ _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS | GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) | @@ -2556,20 +2885,21 @@ gen12_enable_metric_set(struct i915_perf_stream *stream, */ oag_report_ctx_switches(stream)); - intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ? + intel_uncore_write(uncore, __oa_regs(stream)->oa_ctx_ctrl, periodic ? (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME | GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE | (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT)) : 0); /* - * Update all contexts prior writing the mux configurations as we need - * to make sure all slices/subslices are ON before writing to NOA - * registers. + * Initialize Super Queue Internal Cnt Register + * Set PMON Enable in order to collect valid metrics. + * Enable bytes per clock reporting in OA. */ - ret = gen12_configure_all_contexts(stream, oa_config, active); - if (ret) - return ret; + sqcnt1 = GEN12_SQCNT1_PMON_ENABLE | + (HAS_OA_BPC_REPORTING(i915) ? 
GEN12_SQCNT1_OABPC : 0); + + intel_uncore_rmw(uncore, GEN12_SQCNT1, 0, sqcnt1); /* * For Gen12, performance counters are context @@ -2611,9 +2941,18 @@ static void gen11_disable_metric_set(struct i915_perf_stream *stream) static void gen12_disable_metric_set(struct i915_perf_stream *stream) { struct intel_uncore *uncore = stream->uncore; + struct drm_i915_private *i915 = stream->perf->i915; + u32 sqcnt1; - /* Reset all contexts' slices/subslices configurations. */ - gen12_configure_all_contexts(stream, NULL, NULL); + /* + * Wa_1508761755: Enable thread stall DOP gating and EU DOP gating. + */ + if (IS_DG2(i915)) { + intel_gt_mcr_multicast_write(uncore->gt, GEN8_ROW_CHICKEN, + _MASKED_BIT_DISABLE(STALL_DOP_GATING_DISABLE)); + intel_uncore_write(uncore, GEN7_ROW_CHICKEN2, + _MASKED_BIT_DISABLE(GEN12_DISABLE_DOP_GATING)); + } /* disable the context save/restore or OAR counters */ if (stream->ctx) @@ -2621,6 +2960,12 @@ static void gen12_disable_metric_set(struct i915_perf_stream *stream) /* Make sure we disable noa to save power. */ intel_uncore_rmw(uncore, RPM_CONFIG1, GEN10_GT_NOA_ENABLE, 0); + + sqcnt1 = GEN12_SQCNT1_PMON_ENABLE | + (HAS_OA_BPC_REPORTING(i915) ? GEN12_SQCNT1_OABPC : 0); + + /* Reset PMON Enable to save power. */ + intel_uncore_rmw(uncore, GEN12_SQCNT1, sqcnt1, 0); } static void gen7_oa_enable(struct i915_perf_stream *stream) @@ -2630,7 +2975,7 @@ static void gen7_oa_enable(struct i915_perf_stream *stream) u32 ctx_id = stream->specific_ctx_id; bool periodic = stream->periodic; u32 period_exponent = stream->period_exponent; - u32 report_format = stream->oa_buffer.format; + u32 report_format = stream->oa_buffer.format->format; /* * Reset buf pointers so we don't forward reports from before now. @@ -2656,7 +3001,7 @@ static void gen7_oa_enable(struct i915_perf_stream *stream) static void gen8_oa_enable(struct i915_perf_stream *stream) { struct intel_uncore *uncore = stream->uncore; - u32 report_format = stream->oa_buffer.format; + u32 report_format = stream->oa_buffer.format->format; /* * Reset buf pointers so we don't forward reports from before now. 
@@ -2681,8 +3026,8 @@ static void gen8_oa_enable(struct i915_perf_stream *stream) static void gen12_oa_enable(struct i915_perf_stream *stream) { - struct intel_uncore *uncore = stream->uncore; - u32 report_format = stream->oa_buffer.format; + const struct i915_perf_regs *regs; + u32 val; /* * If we don't want OA reports from the OA buffer, then we don't even @@ -2693,9 +3038,11 @@ static void gen12_oa_enable(struct i915_perf_stream *stream) gen12_init_oa_buffer(stream); - intel_uncore_write(uncore, GEN12_OAG_OACONTROL, - (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) | - GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE); + regs = __oa_regs(stream); + val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) | + GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE; + + intel_uncore_write(stream->uncore, regs->oa_ctrl, val); } /** @@ -2747,9 +3094,9 @@ static void gen12_oa_disable(struct i915_perf_stream *stream) { struct intel_uncore *uncore = stream->uncore; - intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0); + intel_uncore_write(uncore, __oa_regs(stream)->oa_ctrl, 0); if (intel_wait_for_register(uncore, - GEN12_OAG_OACONTROL, + __oa_regs(stream)->oa_ctrl, GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0, 50)) drm_err(&stream->perf->i915->drm, @@ -2838,6 +3185,32 @@ get_sseu_config(struct intel_sseu *out_sseu, return i915_gem_user_to_context_sseu(engine->gt, drm_sseu, out_sseu); } +/* + * OA timestamp frequency = CS timestamp frequency in most platforms. On some + * platforms OA unit ignores the CTC_SHIFT and the 2 timestamps differ. In such + * cases, return the adjusted CS timestamp frequency to the user. + */ +u32 i915_perf_oa_timestamp_frequency(struct drm_i915_private *i915) +{ + struct intel_gt *gt = to_gt(i915); + + /* Wa_18013179988 */ + if (IS_DG2(i915) || IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) { + intel_wakeref_t wakeref; + u32 reg, shift; + + with_intel_runtime_pm(to_gt(i915)->uncore->rpm, wakeref) + reg = intel_uncore_read(to_gt(i915)->uncore, RPM_CONFIG0); + + shift = REG_FIELD_GET(GEN10_RPM_CONFIG0_CTC_SHIFT_PARAMETER_MASK, + reg); + + return to_gt(i915)->clock_frequency << (3 - shift); + } + + return to_gt(i915)->clock_frequency; +} + /** * i915_oa_stream_init - validate combined props for OA stream and init * @stream: An i915 perf stream @@ -2862,7 +3235,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, { struct drm_i915_private *i915 = stream->perf->i915; struct i915_perf *perf = stream->perf; - int format_size; + struct i915_perf_group *g; int ret; if (!props->engine) { @@ -2870,6 +3243,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, "OA engine not specified\n"); return -EINVAL; } + g = props->engine->oa_group; /* * If the sysfs metrics/ directory wasn't registered for some @@ -2900,7 +3274,7 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, * counter reports and marshal to the appropriate client * we currently only allow exclusive access */ - if (perf->exclusive_stream) { + if (g->exclusive_stream) { drm_dbg(&stream->perf->i915->drm, "OA unit already in use\n"); return -EBUSY; @@ -2917,20 +3291,15 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, stream->sample_size = sizeof(struct drm_i915_perf_record_header); - format_size = perf->oa_formats[props->oa_format].size; + stream->oa_buffer.format = &perf->oa_formats[props->oa_format]; + if (drm_WARN_ON(&i915->drm, stream->oa_buffer.format->size == 0)) + return -EINVAL; stream->sample_flags = props->sample_flags; - 
stream->sample_size += format_size; - - stream->oa_buffer.format_size = format_size; - if (drm_WARN_ON(&i915->drm, stream->oa_buffer.format_size == 0)) - return -EINVAL; + stream->sample_size += stream->oa_buffer.format->size; stream->hold_preemption = props->hold_preemption; - stream->oa_buffer.format = - perf->oa_formats[props->oa_format].format; - stream->periodic = props->oa_periodic; if (stream->periodic) stream->period_exponent = props->oa_period_exponent; @@ -2980,8 +3349,8 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, stream->ops = &i915_oa_stream_ops; - perf->sseu = props->sseu; - WRITE_ONCE(perf->exclusive_stream, stream); + stream->engine->gt->perf.sseu = props->sseu; + WRITE_ONCE(g->exclusive_stream, stream); ret = i915_perf_stream_enable_sync(stream); if (ret) { @@ -2994,26 +3363,26 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, "opening stream oa config uuid=%s\n", stream->oa_config->uuid); - hrtimer_init(&stream->poll_check_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - stream->poll_check_timer.function = oa_poll_check_timer_cb; + hrtimer_setup(&stream->poll_check_timer, oa_poll_check_timer_cb, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); init_waitqueue_head(&stream->poll_wq); spin_lock_init(&stream->oa_buffer.ptr_lock); + mutex_init(&stream->lock); return 0; err_enable: - WRITE_ONCE(perf->exclusive_stream, NULL); + WRITE_ONCE(g->exclusive_stream, NULL); perf->ops.disable_metric_set(stream); free_oa_buffer(stream); err_oa_buf_alloc: - free_oa_configs(stream); - intel_uncore_forcewake_put(stream->uncore, FORCEWAKE_ALL); intel_engine_pm_put(stream->engine); + free_oa_configs(stream); + err_config: free_noa_wait(stream); @@ -3033,7 +3402,7 @@ void i915_oa_init_reg_state(const struct intel_context *ce, return; /* perf.exclusive_stream serialised by lrc_configure_all_contexts() */ - stream = READ_ONCE(engine->i915->perf.exclusive_stream); + stream = READ_ONCE(engine->oa_group->exclusive_stream); if (stream && GRAPHICS_VER(stream->perf->i915) < 12) gen8_update_reg_state_unlocked(ce, stream); } @@ -3062,7 +3431,6 @@ static ssize_t i915_perf_read(struct file *file, loff_t *ppos) { struct i915_perf_stream *stream = file->private_data; - struct i915_perf *perf = stream->perf; size_t offset = 0; int ret; @@ -3086,14 +3454,14 @@ static ssize_t i915_perf_read(struct file *file, if (ret) return ret; - mutex_lock(&perf->lock); + mutex_lock(&stream->lock); ret = stream->ops->read(stream, buf, count, &offset); - mutex_unlock(&perf->lock); + mutex_unlock(&stream->lock); } while (!offset && !ret); } else { - mutex_lock(&perf->lock); + mutex_lock(&stream->lock); ret = stream->ops->read(stream, buf, count, &offset); - mutex_unlock(&perf->lock); + mutex_unlock(&stream->lock); } /* We allow the poll checking to sometimes report false positive EPOLLIN @@ -3140,9 +3508,6 @@ static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer) * &i915_perf_stream_ops->poll_wait to call poll_wait() with a wait queue that * will be woken for new stream data. * - * Note: The &perf->lock mutex has been taken to serialize - * with any non-file-operation driver hooks. 
- * * Returns: any poll events that are ready without sleeping */ static __poll_t i915_perf_poll_locked(struct i915_perf_stream *stream, @@ -3181,12 +3546,11 @@ static __poll_t i915_perf_poll_locked(struct i915_perf_stream *stream, static __poll_t i915_perf_poll(struct file *file, poll_table *wait) { struct i915_perf_stream *stream = file->private_data; - struct i915_perf *perf = stream->perf; __poll_t ret; - mutex_lock(&perf->lock); + mutex_lock(&stream->lock); ret = i915_perf_poll_locked(stream, file, wait); - mutex_unlock(&perf->lock); + mutex_unlock(&stream->lock); return ret; } @@ -3285,9 +3649,6 @@ static long i915_perf_config_locked(struct i915_perf_stream *stream, * @cmd: the ioctl request * @arg: the ioctl data * - * Note: The &perf->lock mutex has been taken to serialize - * with any non-file-operation driver hooks. - * * Returns: zero on success or a negative error code. Returns -EINVAL for * an unknown ioctl request. */ @@ -3325,12 +3686,11 @@ static long i915_perf_ioctl(struct file *file, unsigned long arg) { struct i915_perf_stream *stream = file->private_data; - struct i915_perf *perf = stream->perf; long ret; - mutex_lock(&perf->lock); + mutex_lock(&stream->lock); ret = i915_perf_ioctl_locked(stream, cmd, arg); - mutex_unlock(&perf->lock); + mutex_unlock(&stream->lock); return ret; } @@ -3342,7 +3702,7 @@ static long i915_perf_ioctl(struct file *file, * Frees all resources associated with the given i915 perf @stream, disabling * any associated data capture in the process. * - * Note: The &perf->lock mutex has been taken to serialize + * Note: The >->perf.lock mutex has been taken to serialize * with any non-file-operation driver hooks. */ static void i915_perf_destroy_locked(struct i915_perf_stream *stream) @@ -3374,10 +3734,16 @@ static int i915_perf_release(struct inode *inode, struct file *file) { struct i915_perf_stream *stream = file->private_data; struct i915_perf *perf = stream->perf; + struct intel_gt *gt = stream->engine->gt; - mutex_lock(&perf->lock); + /* + * Within this call, we know that the fd is being closed and we have no + * other user of stream->lock. Use the perf lock to destroy the stream + * here. + */ + mutex_lock(>->perf.lock); i915_perf_destroy_locked(stream); - mutex_unlock(&perf->lock); + mutex_unlock(>->perf.lock); /* Release the reference the perf stream kept on the driver. */ drm_dev_put(&perf->i915->drm); @@ -3388,7 +3754,6 @@ static int i915_perf_release(struct inode *inode, struct file *file) static const struct file_operations fops = { .owner = THIS_MODULE, - .llseek = no_llseek, .release = i915_perf_release, .poll = i915_perf_poll, .read = i915_perf_read, @@ -3410,7 +3775,7 @@ static const struct file_operations fops = { * See i915_perf_ioctl_open() for interface details. * * Implements further stream config validation and stream initialization on - * behalf of i915_perf_open_ioctl() with the &perf->lock mutex + * behalf of i915_perf_open_ioctl() with the >->perf.lock mutex * taken to serialize with any non-file-operation driver hooks. * * Note: at this point the @props have only been validated in isolation and @@ -3487,7 +3852,7 @@ i915_perf_open_ioctl_locked(struct i915_perf *perf, } /* - * Asking for SSEU configuration is a priviliged operation. + * Asking for SSEU configuration is a privileged operation. 
*/ if (props->has_sseu) privileged_op = true; @@ -3565,8 +3930,10 @@ err: static u64 oa_exponent_to_ns(struct i915_perf *perf, int exponent) { - return intel_gt_clock_interval_to_ns(to_gt(perf->i915), - 2ULL << exponent); + u64 nom = (2ULL << exponent) * NSEC_PER_SEC; + u32 den = i915_perf_oa_timestamp_frequency(perf->i915); + + return div_u64(nom + den - 1, den); } static __always_inline bool @@ -3601,41 +3968,35 @@ static int read_properties_unlocked(struct i915_perf *perf, u32 n_props, struct perf_open_properties *props) { + struct drm_i915_gem_context_param_sseu user_sseu; + const struct i915_oa_format *f; u64 __user *uprop = uprops; + bool config_instance = false; + bool config_class = false; + bool config_sseu = false; + u8 class, instance; u32 i; int ret; memset(props, 0, sizeof(struct perf_open_properties)); props->poll_oa_period = DEFAULT_POLL_PERIOD_NS; - if (!n_props) { - drm_dbg(&perf->i915->drm, - "No i915 perf properties given\n"); - return -EINVAL; - } - - /* At the moment we only support using i915-perf on the RCS. */ - props->engine = intel_engine_lookup_user(perf->i915, - I915_ENGINE_CLASS_RENDER, - 0); - if (!props->engine) { - drm_dbg(&perf->i915->drm, - "No RENDER-capable engines\n"); - return -EINVAL; - } - /* Considering that ID = 0 is reserved and assuming that we don't * (currently) expect any configurations to ever specify duplicate * values for a particular property ID then the last _PROP_MAX value is * one greater than the maximum number of properties we expect to get * from userspace. */ - if (n_props >= DRM_I915_PERF_PROP_MAX) { + if (!n_props || n_props >= DRM_I915_PERF_PROP_MAX) { drm_dbg(&perf->i915->drm, - "More i915 perf properties specified than exist\n"); + "Invalid number of i915 perf properties given\n"); return -EINVAL; } + /* Defaults when class:instance is not passed */ + class = I915_ENGINE_CLASS_RENDER; + instance = 0; + for (i = 0; i < n_props; i++) { u64 oa_period, oa_freq_hz; u64 id, value; @@ -3730,9 +4091,7 @@ static int read_properties_unlocked(struct i915_perf *perf, props->hold_preemption = !!value; break; case DRM_I915_PERF_PROP_GLOBAL_SSEU: { - struct drm_i915_gem_context_param_sseu user_sseu; - - if (GRAPHICS_VER_FULL(perf->i915) >= IP_VER(12, 50)) { + if (GRAPHICS_VER_FULL(perf->i915) >= IP_VER(12, 55)) { drm_dbg(&perf->i915->drm, "SSEU config not supported on gfx %x\n", GRAPHICS_VER_FULL(perf->i915)); @@ -3746,14 +4105,7 @@ static int read_properties_unlocked(struct i915_perf *perf, "Unable to copy global sseu parameter\n"); return -EFAULT; } - - ret = get_sseu_config(&props->sseu, props->engine, &user_sseu); - if (ret) { - drm_dbg(&perf->i915->drm, - "Invalid SSEU configuration\n"); - return ret; - } - props->has_sseu = true; + config_sseu = true; break; } case DRM_I915_PERF_PROP_POLL_OA_PERIOD: @@ -3765,7 +4117,15 @@ static int read_properties_unlocked(struct i915_perf *perf, } props->poll_oa_period = value; break; - case DRM_I915_PERF_PROP_MAX: + case DRM_I915_PERF_PROP_OA_ENGINE_CLASS: + class = (u8)value; + config_class = true; + break; + case DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE: + instance = (u8)value; + config_instance = true; + break; + default: MISSING_CASE(id); return -EINVAL; } @@ -3773,6 +4133,60 @@ static int read_properties_unlocked(struct i915_perf *perf, uprop += 2; } + if ((config_class && !config_instance) || + (config_instance && !config_class)) { + drm_dbg(&perf->i915->drm, + "OA engine-class and engine-instance parameters must be passed together\n"); + return -EINVAL; + } + + props->engine = 
intel_engine_lookup_user(perf->i915, class, instance); + if (!props->engine) { + drm_dbg(&perf->i915->drm, + "OA engine class and instance invalid %d:%d\n", + class, instance); + return -EINVAL; + } + + if (!engine_supports_oa(props->engine)) { + drm_dbg(&perf->i915->drm, + "Engine not supported by OA %d:%d\n", + class, instance); + return -EINVAL; + } + + /* + * Wa_14017512683: mtl[a0..c0): Use of OAM must be preceded with Media + * C6 disable in BIOS. Fail if Media C6 is enabled on steppings where OAM + * does not work as expected. + */ + if (IS_MEDIA_GT_IP_STEP(props->engine->gt, IP_VER(13, 0), STEP_A0, STEP_C0) && + props->engine->oa_group->type == TYPE_OAM && + intel_check_bios_c6_setup(&props->engine->gt->rc6)) { + drm_dbg(&perf->i915->drm, + "OAM requires media C6 to be disabled in BIOS\n"); + return -EINVAL; + } + + i = array_index_nospec(props->oa_format, I915_OA_FORMAT_MAX); + f = &perf->oa_formats[i]; + if (!engine_supports_oa_format(props->engine, f->type)) { + drm_dbg(&perf->i915->drm, + "Invalid OA format %d for class %d\n", + f->type, props->engine->class); + return -EINVAL; + } + + if (config_sseu) { + ret = get_sseu_config(&props->sseu, props->engine, &user_sseu); + if (ret) { + drm_dbg(&perf->i915->drm, + "Invalid SSEU configuration\n"); + return ret; + } + props->has_sseu = true; + } + return 0; } @@ -3794,7 +4208,7 @@ static int read_properties_unlocked(struct i915_perf *perf, * mutex to avoid an awkward lockdep with mmap_lock. * * Most of the implementation details are handled by - * i915_perf_open_ioctl_locked() after taking the &perf->lock + * i915_perf_open_ioctl_locked() after taking the >->perf.lock * mutex for serializing with any non-file-operation driver hooks. * * Return: A newly opened i915 Perf stream file descriptor or negative @@ -3805,15 +4219,13 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data, { struct i915_perf *perf = &to_i915(dev)->perf; struct drm_i915_perf_open_param *param = data; + struct intel_gt *gt; struct perf_open_properties props; u32 known_open_flags; int ret; - if (!perf->i915) { - drm_dbg(&perf->i915->drm, - "i915 perf interface not available for this system\n"); + if (!perf->i915) return -ENOTSUPP; - } known_open_flags = I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK | @@ -3831,9 +4243,11 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data, if (ret) return ret; - mutex_lock(&perf->lock); + gt = props.engine->gt; + + mutex_lock(>->perf.lock); ret = i915_perf_open_ioctl_locked(perf, param, &props, file); - mutex_unlock(&perf->lock); + mutex_unlock(>->perf.lock); return ret; } @@ -3849,6 +4263,7 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data, void i915_perf_register(struct drm_i915_private *i915) { struct i915_perf *perf = &i915->perf; + struct intel_gt *gt = to_gt(i915); if (!perf->i915) return; @@ -3857,13 +4272,13 @@ void i915_perf_register(struct drm_i915_private *i915) * i915_perf_open_ioctl(); considering that we register after * being exposed to userspace. 
-	mutex_lock(&perf->lock);
+	mutex_lock(&gt->perf.lock);
 	perf->metrics_kobj =
 		kobject_create_and_add("metrics",
 				       &i915->drm.primary->kdev->kobj);
-	mutex_unlock(&perf->lock);
+	mutex_unlock(&gt->perf.lock);
 }
 
 /**
@@ -3906,29 +4321,17 @@ static bool gen8_is_valid_flex_addr(struct i915_perf *perf, u32 addr)
 	return false;
 }
 
-static bool reg_in_range_table(u32 addr, const struct i915_range *table)
-{
-	while (table->start || table->end) {
-		if (addr >= table->start && addr <= table->end)
-			return true;
-
-		table++;
-	}
-
-	return false;
-}
-
 #define REG_EQUAL(addr, mmio) \
 	((addr) == i915_mmio_reg_offset(mmio))
 
-static const struct i915_range gen7_oa_b_counters[] = {
+static const struct i915_mmio_range gen7_oa_b_counters[] = {
 	{ .start = 0x2710, .end = 0x272c },	/* OASTARTTRIG[1-8] */
 	{ .start = 0x2740, .end = 0x275c },	/* OAREPORTTRIG[1-8] */
 	{ .start = 0x2770, .end = 0x27ac },	/* OACEC[0-7][0-1] */
 	{}
 };
 
-static const struct i915_range gen12_oa_b_counters[] = {
+static const struct i915_mmio_range gen12_oa_b_counters[] = {
 	{ .start = 0x2b2c, .end = 0x2b2c },	/* GEN12_OAG_OA_PESS */
 	{ .start = 0xd900, .end = 0xd91c },	/* GEN12_OAG_OASTARTTRIG[1-8] */
 	{ .start = 0xd920, .end = 0xd93c },	/* GEN12_OAG_OAREPORTTRIG1[1-8] */
@@ -3939,37 +4342,51 @@ static const struct i915_range gen12_oa_b_counters[] = {
 	{}
 };
 
-static const struct i915_range gen7_oa_mux_regs[] = {
+static const struct i915_mmio_range mtl_oam_b_counters[] = {
+	{ .start = 0x393000, .end = 0x39301c },	/* GEN12_OAM_STARTTRIG1[1-8] */
+	{ .start = 0x393020, .end = 0x39303c },	/* GEN12_OAM_REPORTTRIG1[1-8] */
+	{ .start = 0x393040, .end = 0x39307c },	/* GEN12_OAM_CEC[0-7][0-1] */
+	{ .start = 0x393200, .end = 0x39323C },	/* MPES[0-7] */
+	{}
+};
+
+static const struct i915_mmio_range xehp_oa_b_counters[] = {
+	{ .start = 0xdc48, .end = 0xdc48 },	/* OAA_ENABLE_REG */
+	{ .start = 0xdd00, .end = 0xdd48 },	/* OAG_LCE0_0 - OAA_LENABLE_REG */
+	{}
+};
+
+static const struct i915_mmio_range gen7_oa_mux_regs[] = {
 	{ .start = 0x91b8, .end = 0x91cc },	/* OA_PERFCNT[1-2], OA_PERFMATRIX */
 	{ .start = 0x9800, .end = 0x9888 },	/* MICRO_BP0_0 - NOA_WRITE */
 	{ .start = 0xe180, .end = 0xe180 },	/* HALF_SLICE_CHICKEN2 */
 	{}
 };
 
-static const struct i915_range hsw_oa_mux_regs[] = {
+static const struct i915_mmio_range hsw_oa_mux_regs[] = {
 	{ .start = 0x09e80, .end = 0x09ea4 }, /* HSW_MBVID2_NOA[0-9] */
 	{ .start = 0x09ec0, .end = 0x09ec0 }, /* HSW_MBVID2_MISR0 */
 	{ .start = 0x25100, .end = 0x2ff90 },
 	{}
 };
 
-static const struct i915_range chv_oa_mux_regs[] = {
+static const struct i915_mmio_range chv_oa_mux_regs[] = {
 	{ .start = 0x182300, .end = 0x1823a4 },
 	{}
 };
 
-static const struct i915_range gen8_oa_mux_regs[] = {
+static const struct i915_mmio_range gen8_oa_mux_regs[] = {
 	{ .start = 0x0d00, .end = 0x0d2c },	/* RPM_CONFIG[0-1], NOA_CONFIG[0-8] */
 	{ .start = 0x20cc, .end = 0x20cc },	/* WAIT_FOR_RC6_EXIT */
 	{}
 };
 
-static const struct i915_range gen11_oa_mux_regs[] = {
+static const struct i915_mmio_range gen11_oa_mux_regs[] = {
 	{ .start = 0x91c8, .end = 0x91dc },	/* OA_PERFCNT[3-4] */
 	{}
 };
 
-static const struct i915_range gen12_oa_mux_regs[] = {
+static const struct i915_mmio_range gen12_oa_mux_regs[] = {
 	{ .start = 0x0d00, .end = 0x0d04 },	/* RPM_CONFIG[0-1] */
 	{ .start = 0x0d0c, .end = 0x0d2c },	/* NOA_CONFIG[0-8] */
 	{ .start = 0x9840, .end = 0x9840 },	/* GDT_CHICKEN_BITS */
@@ -3978,56 +4395,90 @@
 	{}
 };
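All of these tables feed a single containment test. The removed reg_in_range_table() above shows the intended semantics, so i915_mmio_range_table_contains() presumably walks a zero-terminated array of inclusive [start, end] ranges the same way; a standalone sketch under that assumption:

	#include <stdbool.h>
	#include <stdint.h>

	struct mmio_range_sketch {
		uint32_t start;
		uint32_t end;
	};

	static bool mmio_range_table_contains(uint32_t addr,
					      const struct mmio_range_sketch *table)
	{
		/* The trailing {} entry (start == end == 0) terminates the walk. */
		while (table->start || table->end) {
			if (addr >= table->start && addr <= table->end)
				return true;

			table++;
		}

		return false;
	}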
+/*
+ * Ref: 14010536224:
+ * 0x20cc is repurposed on MTL, so use a separate array for MTL.
+ */
+static const struct i915_mmio_range mtl_oa_mux_regs[] = {
+	{ .start = 0x0d00, .end = 0x0d04 },	/* RPM_CONFIG[0-1] */
+	{ .start = 0x0d0c, .end = 0x0d2c },	/* NOA_CONFIG[0-8] */
+	{ .start = 0x9840, .end = 0x9840 },	/* GDT_CHICKEN_BITS */
+	{ .start = 0x9884, .end = 0x9888 },	/* NOA_WRITE */
+	{ .start = 0x38d100, .end = 0x38d114},	/* VISACTL */
+	{}
+};
+
 static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
 {
-	return reg_in_range_table(addr, gen7_oa_b_counters);
+	return i915_mmio_range_table_contains(addr, gen7_oa_b_counters);
 }
 
 static bool gen8_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-	return reg_in_range_table(addr, gen7_oa_mux_regs) ||
-		reg_in_range_table(addr, gen8_oa_mux_regs);
+	return i915_mmio_range_table_contains(addr, gen7_oa_mux_regs) ||
+		i915_mmio_range_table_contains(addr, gen8_oa_mux_regs);
 }
 
 static bool gen11_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-	return reg_in_range_table(addr, gen7_oa_mux_regs) ||
-		reg_in_range_table(addr, gen8_oa_mux_regs) ||
-		reg_in_range_table(addr, gen11_oa_mux_regs);
+	return i915_mmio_range_table_contains(addr, gen7_oa_mux_regs) ||
+		i915_mmio_range_table_contains(addr, gen8_oa_mux_regs) ||
+		i915_mmio_range_table_contains(addr, gen11_oa_mux_regs);
 }
 
 static bool hsw_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-	return reg_in_range_table(addr, gen7_oa_mux_regs) ||
-		reg_in_range_table(addr, hsw_oa_mux_regs);
+	return i915_mmio_range_table_contains(addr, gen7_oa_mux_regs) ||
+		i915_mmio_range_table_contains(addr, hsw_oa_mux_regs);
 }
 
 static bool chv_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-	return reg_in_range_table(addr, gen7_oa_mux_regs) ||
-		reg_in_range_table(addr, chv_oa_mux_regs);
+	return i915_mmio_range_table_contains(addr, gen7_oa_mux_regs) ||
		i915_mmio_range_table_contains(addr, chv_oa_mux_regs);
 }
 
 static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
 {
-	return reg_in_range_table(addr, gen12_oa_b_counters);
+	return i915_mmio_range_table_contains(addr, gen12_oa_b_counters);
+}
+
+static bool mtl_is_valid_oam_b_counter_addr(struct i915_perf *perf, u32 addr)
+{
+	if (HAS_OAM(perf->i915) &&
+	    GRAPHICS_VER_FULL(perf->i915) >= IP_VER(12, 70))
+		return i915_mmio_range_table_contains(addr, mtl_oam_b_counters);
+
+	return false;
+}
+
+static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr)
+{
+	return i915_mmio_range_table_contains(addr, xehp_oa_b_counters) ||
+		i915_mmio_range_table_contains(addr, gen12_oa_b_counters) ||
+		mtl_is_valid_oam_b_counter_addr(perf, addr);
 }
 
 static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr)
 {
-	return reg_in_range_table(addr, gen12_oa_mux_regs);
+	if (GRAPHICS_VER_FULL(perf->i915) >= IP_VER(12, 70))
+		return i915_mmio_range_table_contains(addr, mtl_oa_mux_regs);
+	else
+		return i915_mmio_range_table_contains(addr, gen12_oa_mux_regs);
 }
 
 static u32 mask_reg_value(u32 reg, u32 val)
 {
-	/* HALF_SLICE_CHICKEN2 is programmed with a the
+	/*
+	 * HALF_SLICE_CHICKEN2 is programmed with the
 	 * WaDisableSTUnitPowerOptimization workaround. Make sure the value
 	 * programmed by userspace doesn't change this.
 	 */
 	if (REG_EQUAL(reg, HALF_SLICE_CHICKEN2))
 		val = val & ~_MASKED_BIT_ENABLE(GEN8_ST_PO_DISABLE);
 
-	/* WAIT_FOR_RC6_EXIT has only one bit fullfilling the function
+	/*
	 * WAIT_FOR_RC6_EXIT has only one bit fulfilling the function
 	 * indicated by its name and a bunch of selection fields used by OA
 	 * configs.
 	 */
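The sanitization in mask_reg_value() relies on the masked-register convention, where _MASKED_BIT_ENABLE(a) expands to ((a) << 16 | (a)): the upper 16 bits of a write select which of the lower 16 bits take effect. A sketch with an assumed bit position for GEN8_ST_PO_DISABLE (the real one is defined in the register headers):

	#include <stdint.h>

	#define MASKED_BIT_ENABLE(a) ((uint32_t)(((a) << 16) | (a)))

	static uint32_t sanitize_chicken2_sketch(uint32_t user_val)
	{
		const uint32_t st_po_disable = 1u << 6; /* assumed bit position */

		/* Clear both the bit and its write-enable so a userspace
		 * config can never undo the workaround setting. */
		return user_val & ~MASKED_BIT_ENABLE(st_po_disable);
	}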
@@ -4140,11 +4591,8 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
 	struct i915_oa_reg *regs;
 	int err, id;
 
-	if (!perf->i915) {
-		drm_dbg(&perf->i915->drm,
-			"i915 perf interface not available for this system\n");
+	if (!perf->i915)
 		return -ENOTSUPP;
-	}
 
 	if (!perf->metrics_kobj) {
 		drm_dbg(&perf->i915->drm,
@@ -4270,13 +4718,13 @@ int i915_perf_add_config_ioctl(struct drm_device *dev, void *data,
 		err = oa_config->id;
 		goto sysfs_err;
 	}
-
-	mutex_unlock(&perf->metrics_lock);
+	id = oa_config->id;
 
 	drm_dbg(&perf->i915->drm,
 		"Added config %s id=%i\n", oa_config->uuid, oa_config->id);
+	mutex_unlock(&perf->metrics_lock);
 
-	return oa_config->id;
+	return id;
 
 sysfs_err:
 	mutex_unlock(&perf->metrics_lock);
@@ -4306,11 +4754,8 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data,
 	struct i915_oa_config *oa_config;
 	int ret;
 
-	if (!perf->i915) {
-		drm_dbg(&perf->i915->drm,
-			"i915 perf interface not available for this system\n");
+	if (!perf->i915)
 		return -ENOTSUPP;
-	}
 
 	if (i915_perf_stream_paranoid && !perfmon_capable()) {
 		drm_dbg(&perf->i915->drm,
@@ -4350,7 +4795,7 @@ err_unlock:
 	return ret;
 }
 
-static struct ctl_table oa_table[] = {
+static const struct ctl_table oa_table[] = {
 	{
 	 .procname = "perf_stream_paranoid",
 	 .data = &i915_perf_stream_paranoid,
@@ -4369,9 +4814,138 @@ static struct ctl_table oa_table[] = {
 	 .extra1 = SYSCTL_ZERO,
 	 .extra2 = &oa_sample_rate_hard_limit,
 	},
-	{}
 };
 
+static u32 num_perf_groups_per_gt(struct intel_gt *gt)
+{
+	return 1;
+}
+
+static u32 __oam_engine_group(struct intel_engine_cs *engine)
+{
+	if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70)) {
+		/*
+		 * There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices
+		 * within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA.
+		 */
+		drm_WARN_ON(&engine->i915->drm,
+			    engine->gt->type != GT_MEDIA);
+
+		return PERF_GROUP_OAM_SAMEDIA_0;
+	}
+
+	return PERF_GROUP_INVALID;
+}
+
+static u32 __oa_engine_group(struct intel_engine_cs *engine)
+{
+	switch (engine->class) {
+	case RENDER_CLASS:
+		return PERF_GROUP_OAG;
+
+	case VIDEO_DECODE_CLASS:
+	case VIDEO_ENHANCEMENT_CLASS:
+		return __oam_engine_group(engine);
+
+	default:
+		return PERF_GROUP_INVALID;
+	}
+}
+
+static struct i915_perf_regs __oam_regs(u32 base)
+{
+	return (struct i915_perf_regs) {
+		base,
+		GEN12_OAM_HEAD_POINTER(base),
+		GEN12_OAM_TAIL_POINTER(base),
+		GEN12_OAM_BUFFER(base),
+		GEN12_OAM_CONTEXT_CONTROL(base),
+		GEN12_OAM_CONTROL(base),
+		GEN12_OAM_DEBUG(base),
+		GEN12_OAM_STATUS(base),
+		GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT,
+	};
+}
+
+static struct i915_perf_regs __oag_regs(void)
+{
+	return (struct i915_perf_regs) {
+		0,
+		GEN12_OAG_OAHEADPTR,
+		GEN12_OAG_OATAILPTR,
+		GEN12_OAG_OABUFFER,
+		GEN12_OAG_OAGLBCTXCTRL,
+		GEN12_OAG_OACONTROL,
+		GEN12_OAG_OA_DEBUG,
+		GEN12_OAG_OASTATUS,
+		GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT,
+	};
+}
+
+static void oa_init_groups(struct intel_gt *gt)
+{
+	int i, num_groups = gt->perf.num_perf_groups;
+
+	for (i = 0; i < num_groups; i++) {
+		struct i915_perf_group *g = &gt->perf.group[i];
+
+		/* Fused off engines can result in a group with num_engines == 0 */
+		if (g->num_engines == 0)
+			continue;
+
+		if (i == PERF_GROUP_OAG && gt->type != GT_MEDIA) {
+			g->regs = __oag_regs();
+			g->type = TYPE_OAG;
+		} else if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70)) {
+			g->regs = __oam_regs(mtl_oa_base[i]);
+			g->type = TYPE_OAM;
+		}
+	}
+}
+
+static int oa_init_gt(struct intel_gt *gt)
+{
+	u32 num_groups = num_perf_groups_per_gt(gt);
+	struct intel_engine_cs *engine;
+	struct 
i915_perf_group *g; + intel_engine_mask_t tmp; + + g = kcalloc(num_groups, sizeof(*g), GFP_KERNEL); + if (!g) + return -ENOMEM; + + for_each_engine_masked(engine, gt, ALL_ENGINES, tmp) { + u32 index = __oa_engine_group(engine); + + engine->oa_group = NULL; + if (index < num_groups) { + g[index].num_engines++; + engine->oa_group = &g[index]; + } + } + + gt->perf.num_perf_groups = num_groups; + gt->perf.group = g; + + oa_init_groups(gt); + + return 0; +} + +static int oa_init_engine_groups(struct i915_perf *perf) +{ + struct intel_gt *gt; + int i, ret; + + for_each_gt(gt, perf->i915, i) { + ret = oa_init_gt(gt); + if (ret) + return ret; + } + + return 0; +} + static void oa_init_supported_formats(struct i915_perf *perf) { struct drm_i915_private *i915 = perf->i915; @@ -4411,11 +4985,55 @@ static void oa_init_supported_formats(struct i915_perf *perf) oa_format_add(perf, I915_OA_FORMAT_C4_B8); break; + case INTEL_DG2: + oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8); + oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8); + break; + + case INTEL_METEORLAKE: + oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8); + oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8); + oa_format_add(perf, I915_OAM_FORMAT_MPEC8u64_B8_C8); + oa_format_add(perf, I915_OAM_FORMAT_MPEC8u32_B8_C8); + break; + default: MISSING_CASE(platform); } } +static void i915_perf_init_info(struct drm_i915_private *i915) +{ + struct i915_perf *perf = &i915->perf; + + switch (GRAPHICS_VER(i915)) { + case 8: + perf->ctx_oactxctrl_offset = 0x120; + perf->ctx_flexeu0_offset = 0x2ce; + perf->gen8_valid_ctx_bit = BIT(25); + break; + case 9: + perf->ctx_oactxctrl_offset = 0x128; + perf->ctx_flexeu0_offset = 0x3de; + perf->gen8_valid_ctx_bit = BIT(16); + break; + case 11: + perf->ctx_oactxctrl_offset = 0x124; + perf->ctx_flexeu0_offset = 0x78e; + perf->gen8_valid_ctx_bit = BIT(16); + break; + case 12: + perf->gen8_valid_ctx_bit = BIT(16); + /* + * Calculate offset at runtime in oa_pin_context for gen12 and + * cache the value in perf->ctx_oactxctrl_offset. + */ + break; + default: + MISSING_CASE(GRAPHICS_VER(i915)); + } +} + /** * i915_perf_init - initialize i915-perf state on module bind * @i915: i915 device instance @@ -4425,16 +5043,10 @@ static void oa_init_supported_formats(struct i915_perf *perf) * Note: i915-perf initialization is split into an 'init' and 'register' * phase with the i915_perf_register() exposing state to userspace. */ -void i915_perf_init(struct drm_i915_private *i915) +int i915_perf_init(struct drm_i915_private *i915) { struct i915_perf *perf = &i915->perf; - /* XXX const struct i915_perf_ops! */ - - /* i915_perf is not enabled for DG2 yet */ - if (IS_DG2(i915)) - return; - perf->oa_formats = oa_formats; if (IS_HASWELL(i915)) { perf->ops.is_valid_b_counter_reg = gen7_is_valid_b_counter_addr; @@ -4454,6 +5066,7 @@ void i915_perf_init(struct drm_i915_private *i915) * execlist mode by default. 
*/
 		perf->ops.read = gen8_oa_read;
+		i915_perf_init_info(i915);
 
 		if (IS_GRAPHICS_VER(i915, 8, 9)) {
 			perf->ops.is_valid_b_counter_reg =
@@ -4473,18 +5086,6 @@
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen8_disable_metric_set;
 			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
-
-			if (GRAPHICS_VER(i915) == 8) {
-				perf->ctx_oactxctrl_offset = 0x120;
-				perf->ctx_flexeu0_offset = 0x2ce;
-
-				perf->gen8_valid_ctx_bit = BIT(25);
-			} else {
-				perf->ctx_oactxctrl_offset = 0x128;
-				perf->ctx_flexeu0_offset = 0x3de;
-
-				perf->gen8_valid_ctx_bit = BIT(16);
-			}
 		} else if (GRAPHICS_VER(i915) == 11) {
 			perf->ops.is_valid_b_counter_reg =
 				gen7_is_valid_b_counter_addr;
@@ -4498,13 +5099,10 @@
 			perf->ops.enable_metric_set = gen8_enable_metric_set;
 			perf->ops.disable_metric_set = gen11_disable_metric_set;
 			perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read;
-
-			perf->ctx_oactxctrl_offset = 0x124;
-			perf->ctx_flexeu0_offset = 0x78e;
-
-			perf->gen8_valid_ctx_bit = BIT(16);
 		} else if (GRAPHICS_VER(i915) == 12) {
 			perf->ops.is_valid_b_counter_reg =
+				HAS_OA_SLICE_CONTRIB_LIMITS(i915) ?
+				xehp_is_valid_b_counter_addr :
 				gen12_is_valid_b_counter_addr;
 			perf->ops.is_valid_mux_reg =
 				gen12_is_valid_mux_addr;
@@ -4516,14 +5114,15 @@
 			perf->ops.enable_metric_set = gen12_enable_metric_set;
 			perf->ops.disable_metric_set = gen12_disable_metric_set;
 			perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read;
-
-			perf->ctx_flexeu0_offset = 0;
-			perf->ctx_oactxctrl_offset = 0x144;
 		}
 	}
 
 	if (perf->ops.enable_metric_set) {
-		mutex_init(&perf->lock);
+		struct intel_gt *gt;
+		int i, ret;
+
+		for_each_gt(gt, i915, i)
+			mutex_init(&gt->perf.lock);
 
 		/* Choose a representative limit */
 		oa_sample_rate_hard_limit = to_gt(i915)->clock_frequency / 2;
@@ -4559,8 +5158,17 @@
 
 		perf->i915 = i915;
 
+		ret = oa_init_engine_groups(perf);
+		if (ret) {
+			drm_err(&i915->drm,
+				"OA initialization failed %d\n", ret);
+			return ret;
+		}
+
 		oa_init_supported_formats(perf);
 	}
+
+	return 0;
 }
 
 static int destroy_config(int id, void *p, void *data)
@@ -4587,10 +5195,15 @@ void i915_perf_sysctl_unregister(void)
 void i915_perf_fini(struct drm_i915_private *i915)
 {
 	struct i915_perf *perf = &i915->perf;
+	struct intel_gt *gt;
+	int i;
 
 	if (!perf->i915)
 		return;
 
+	for_each_gt(gt, perf->i915, i)
+		kfree(gt->perf.group);
+
 	idr_for_each(&perf->metrics_idr, destroy_config, perf);
 	idr_destroy(&perf->metrics_idr);
 
@@ -4600,10 +5213,11 @@
 /**
  * i915_perf_ioctl_version - Version of the i915-perf subsystem
+ * @i915: The i915 device
  *
  * This version number is used by userspace to detect available features.
  */
-int i915_perf_ioctl_version(void)
+int i915_perf_ioctl_version(struct drm_i915_private *i915)
 {
 	/*
 	 * 1: Initial version
@@ -4624,8 +5238,23 @@
 	 *
 	 * 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the
 	 *    interval for the hrtimer used to check for OA data.
+	 *
+	 * 6: Add DRM_I915_PERF_PROP_OA_ENGINE_CLASS and
+	 *    DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE
+	 *
+	 * 7: Add support for video decode and enhancement classes.
 	 */
-	return 5;
+
+	/*
+	 * Wa_14017512683: mtl[a0..c0): Use of OAM must be preceded with Media
+	 * C6 disable in BIOS. If Media C6 is enabled in BIOS, return version 6
+	 * to indicate that OA media is not supported.
+	 */
+	if (IS_MEDIA_GT_IP_STEP(i915->media_gt, IP_VER(13, 0), STEP_A0, STEP_C0) &&
+	    intel_check_bios_c6_setup(&i915->media_gt->rc6))
+		return 6;
+
+	return 7;
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
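On the userspace side, this version bump is the only signal for the new engine-class support. A hypothetical consumer (not part of this patch) would gate OAM usage on the reported version, which the affected MTL steppings cap at 6:

	#include <stdbool.h>

	/* Version 7 added video decode/enhancement (OAM) support; anything
	 * lower means only the render/OAG path is available. */
	static bool oa_media_engines_supported(int i915_perf_version)
	{
		return i915_perf_version >= 7;
	}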
