diff options
Diffstat (limited to 'drivers/gpu/drm/xe/xe_guc.c')
-rw-r--r-- | drivers/gpu/drm/xe/xe_guc.c | 354 |
1 files changed, 272 insertions, 82 deletions
diff --git a/drivers/gpu/drm/xe/xe_guc.c b/drivers/gpu/drm/xe/xe_guc.c index 240e7a4bbff1..0bf6e01b8910 100644 --- a/drivers/gpu/drm/xe/xe_guc.c +++ b/drivers/gpu/drm/xe/xe_guc.c @@ -19,8 +19,11 @@ #include "xe_force_wake.h" #include "xe_gt.h" #include "xe_gt_printk.h" +#include "xe_gt_sriov_vf.h" +#include "xe_gt_throttle.h" #include "xe_guc_ads.h" #include "xe_guc_ct.h" +#include "xe_guc_db_mgr.h" #include "xe_guc_hwconfig.h" #include "xe_guc_log.h" #include "xe_guc_pc.h" @@ -239,7 +242,7 @@ static void guc_write_params(struct xe_guc *guc) xe_mmio_write32(gt, SOFT_SCRATCH(1 + i), guc->params[i]); } -static void guc_fini(struct drm_device *drm, void *arg) +static void guc_fini_hw(void *arg) { struct xe_guc *guc = arg; struct xe_gt *gt = guc_to_gt(guc); @@ -293,6 +296,23 @@ static int xe_guc_realloc_post_hwconfig(struct xe_guc *guc) return 0; } +static int vf_guc_init(struct xe_guc *guc) +{ + int err; + + xe_guc_comm_init_early(guc); + + err = xe_guc_ct_init(&guc->ct); + if (err) + return err; + + err = xe_guc_relay_init(&guc->relay); + if (err) + return err; + + return 0; +} + int xe_guc_init(struct xe_guc *guc) { struct xe_device *xe = guc_to_xe(guc); @@ -307,6 +327,13 @@ int xe_guc_init(struct xe_guc *guc) if (!xe_uc_fw_is_enabled(&guc->fw)) return 0; + if (IS_SRIOV_VF(xe)) { + ret = vf_guc_init(guc); + if (ret) + goto out; + return 0; + } + ret = xe_guc_log_init(&guc->log); if (ret) goto out; @@ -323,7 +350,7 @@ int xe_guc_init(struct xe_guc *guc) if (ret) goto out; - ret = drmm_add_action_or_reset(&xe->drm, guc_fini, guc); + ret = devm_add_action_or_reset(xe->drm.dev, guc_fini_hw, guc); if (ret) goto out; @@ -340,6 +367,19 @@ out: return ret; } +static int vf_guc_init_post_hwconfig(struct xe_guc *guc) +{ + int err; + + err = xe_guc_submit_init(guc, xe_gt_sriov_vf_guc_ids(guc_to_gt(guc))); + if (err) + return err; + + /* XXX xe_guc_db_mgr_init not needed for now */ + + return 0; +} + /** * xe_guc_init_post_hwconfig - initialize GuC post hwconfig load * @guc: The GuC object @@ -350,12 +390,23 @@ int xe_guc_init_post_hwconfig(struct xe_guc *guc) { int ret; + if (IS_SRIOV_VF(guc_to_xe(guc))) + return vf_guc_init_post_hwconfig(guc); + ret = xe_guc_realloc_post_hwconfig(guc); if (ret) return ret; guc_init_params_post_hwconfig(guc); + ret = xe_guc_submit_init(guc, ~0); + if (ret) + return ret; + + ret = xe_guc_db_mgr_init(&guc->dbm, ~0); + if (ret) + return ret; + ret = xe_guc_pc_init(&guc->pc); if (ret) return ret; @@ -379,6 +430,9 @@ int xe_guc_reset(struct xe_guc *guc) xe_force_wake_assert_held(gt_to_fw(gt), XE_FW_GT); + if (IS_SRIOV_VF(gt_to_xe(gt))) + return xe_gt_sriov_vf_bootstrap(gt); + xe_mmio_write32(gt, GDRST, GRDOM_GUC); ret = xe_mmio_wait32(gt, GDRST, GRDOM_GUC, 0, 5000, &gdrst, false); @@ -451,58 +505,194 @@ static int guc_xfer_rsa(struct xe_guc *guc) return 0; } -static int guc_wait_ucode(struct xe_guc *guc) +/* + * Check a previously read GuC status register (GUC_STATUS) looking for + * known terminal states (either completion or failure) of either the + * microkernel status field or the boot ROM status field. Returns +1 for + * successful completion, -1 for failure and 0 for any intermediate state. + */ +static int guc_load_done(u32 status) { - struct xe_gt *gt = guc_to_gt(guc); - u32 status; - int ret; + u32 uk_val = REG_FIELD_GET(GS_UKERNEL_MASK, status); + u32 br_val = REG_FIELD_GET(GS_BOOTROM_MASK, status); + + switch (uk_val) { + case XE_GUC_LOAD_STATUS_READY: + return 1; + + case XE_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH: + case XE_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH: + case XE_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE: + case XE_GUC_LOAD_STATUS_HWCONFIG_ERROR: + case XE_GUC_LOAD_STATUS_DPC_ERROR: + case XE_GUC_LOAD_STATUS_EXCEPTION: + case XE_GUC_LOAD_STATUS_INIT_DATA_INVALID: + case XE_GUC_LOAD_STATUS_MPU_DATA_INVALID: + case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID: + return -1; + } + + switch (br_val) { + case XE_BOOTROM_STATUS_NO_KEY_FOUND: + case XE_BOOTROM_STATUS_RSA_FAILED: + case XE_BOOTROM_STATUS_PAVPC_FAILED: + case XE_BOOTROM_STATUS_WOPCM_FAILED: + case XE_BOOTROM_STATUS_LOADLOC_FAILED: + case XE_BOOTROM_STATUS_JUMP_FAILED: + case XE_BOOTROM_STATUS_RC6CTXCONFIG_FAILED: + case XE_BOOTROM_STATUS_MPUMAP_INCORRECT: + case XE_BOOTROM_STATUS_EXCEPTION: + case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE: + return -1; + } + + return 0; +} + +static s32 guc_pc_get_cur_freq(struct xe_guc_pc *guc_pc) +{ + u32 freq; + int ret = xe_guc_pc_get_cur_freq(guc_pc, &freq); + + return ret ? ret : freq; +} +/* + * Wait for the GuC to start up. + * + * Measurements indicate this should take no more than 20ms (assuming the GT + * clock is at maximum frequency). However, thermal throttling and other issues + * can prevent the clock hitting max and thus making the load take significantly + * longer. Allow up to 200ms as a safety margin for real world worst case situations. + * + * However, bugs anywhere from KMD to GuC to PCODE to fan failure in a CI farm can + * lead to even longer times. E.g. if the GT is clamped to minimum frequency then + * the load times can be in the seconds range. So the timeout is increased for debug + * builds to ensure that problems can be correctly analysed. For release builds, the + * timeout is kept short so that users don't wait forever to find out that there is a + * problem. In either case, if the load took longer than is reasonable even with some + * 'sensible' throttling, then flag a warning because something is not right. + * + * Note that there is a limit on how long an individual usleep_range() can wait for, + * hence longer waits require wrapping a shorter wait in a loop. + * + * Note that the only reason an end user should hit the shorter timeout is in case of + * extreme thermal throttling. And a system that is that hot during boot is probably + * dead anyway! + */ +#if defined(CONFIG_DRM_XE_DEBUG) +#define GUC_LOAD_RETRY_LIMIT 20 +#else +#define GUC_LOAD_RETRY_LIMIT 3 +#endif +#define GUC_LOAD_TIME_WARN_MS 200 + +static void guc_wait_ucode(struct xe_guc *guc) +{ + struct xe_gt *gt = guc_to_gt(guc); + struct xe_guc_pc *guc_pc = >->uc.guc.pc; + ktime_t before, after, delta; + int load_done; + u32 status = 0; + int count = 0; + u64 delta_ms; + u32 before_freq; + + before_freq = xe_guc_pc_get_act_freq(guc_pc); + before = ktime_get(); /* - * Wait for the GuC to start up. - * NB: Docs recommend not using the interrupt for completion. - * Measurements indicate this should take no more than 20ms - * (assuming the GT clock is at maximum frequency). So, a - * timeout here indicates that the GuC has failed and is unusable. - * (Higher levels of the driver may decide to reset the GuC and - * attempt the ucode load again if this happens.) - * - * FIXME: There is a known (but exceedingly unlikely) race condition - * where the asynchronous frequency management code could reduce - * the GT clock while a GuC reload is in progress (during a full - * GT reset). A fix is in progress but there are complex locking - * issues to be resolved. In the meantime bump the timeout to - * 200ms. Even at slowest clock, this should be sufficient. And - * in the working case, a larger timeout makes no difference. + * Note, can't use any kind of timing information from the call to xe_mmio_wait. + * It could return a thousand intermediate stages at random times. Instead, must + * manually track the total time taken and locally implement the timeout. */ - ret = xe_mmio_wait32(gt, GUC_STATUS, GS_UKERNEL_MASK, - FIELD_PREP(GS_UKERNEL_MASK, XE_GUC_LOAD_STATUS_READY), - 200000, &status, false); + do { + u32 last_status = status & (GS_UKERNEL_MASK | GS_BOOTROM_MASK); + int ret; - if (ret) { - xe_gt_info(gt, "GuC load failed: status = 0x%08X\n", status); - xe_gt_info(gt, "GuC status: Reset = %u, BootROM = %#X, UKernel = %#X, MIA = %#X, Auth = %#X\n", - REG_FIELD_GET(GS_MIA_IN_RESET, status), - REG_FIELD_GET(GS_BOOTROM_MASK, status), - REG_FIELD_GET(GS_UKERNEL_MASK, status), - REG_FIELD_GET(GS_MIA_MASK, status), - REG_FIELD_GET(GS_AUTH_STATUS_MASK, status)); - - if ((status & GS_BOOTROM_MASK) == GS_BOOTROM_RSA_FAILED) { - xe_gt_info(gt, "GuC firmware signature verification failed\n"); - ret = -ENOEXEC; + /* + * Wait for any change (intermediate or terminal) in the status register. + * Note, the return value is a don't care. The only failure code is timeout + * but the timeouts need to be accumulated over all the intermediate partial + * timeouts rather than allowing a huge timeout each time. So basically, need + * to treat a timeout no different to a value change. + */ + ret = xe_mmio_wait32_not(gt, GUC_STATUS, GS_UKERNEL_MASK | GS_BOOTROM_MASK, + last_status, 1000 * 1000, &status, false); + if (ret < 0) + count++; + after = ktime_get(); + delta = ktime_sub(after, before); + delta_ms = ktime_to_ms(delta); + + load_done = guc_load_done(status); + if (load_done != 0) + break; + + if (delta_ms >= (GUC_LOAD_RETRY_LIMIT * 1000)) + break; + + xe_gt_dbg(gt, "load still in progress, timeouts = %d, freq = %dMHz (req %dMHz), status = 0x%08X [0x%02X/%02X]\n", + count, xe_guc_pc_get_act_freq(guc_pc), + guc_pc_get_cur_freq(guc_pc), status, + REG_FIELD_GET(GS_BOOTROM_MASK, status), + REG_FIELD_GET(GS_UKERNEL_MASK, status)); + } while (1); + + if (load_done != 1) { + u32 ukernel = REG_FIELD_GET(GS_UKERNEL_MASK, status); + u32 bootrom = REG_FIELD_GET(GS_BOOTROM_MASK, status); + + xe_gt_err(gt, "load failed: status = 0x%08X, time = %lldms, freq = %dMHz (req %dMHz), done = %d\n", + status, delta_ms, xe_guc_pc_get_act_freq(guc_pc), + guc_pc_get_cur_freq(guc_pc), load_done); + xe_gt_err(gt, "load failed: status: Reset = %d, BootROM = 0x%02X, UKernel = 0x%02X, MIA = 0x%02X, Auth = 0x%02X\n", + REG_FIELD_GET(GS_MIA_IN_RESET, status), + bootrom, ukernel, + REG_FIELD_GET(GS_MIA_MASK, status), + REG_FIELD_GET(GS_AUTH_STATUS_MASK, status)); + + switch (bootrom) { + case XE_BOOTROM_STATUS_NO_KEY_FOUND: + xe_gt_err(gt, "invalid key requested, header = 0x%08X\n", + xe_mmio_read32(gt, GUC_HEADER_INFO)); + break; + + case XE_BOOTROM_STATUS_RSA_FAILED: + xe_gt_err(gt, "firmware signature verification failed\n"); + break; + + case XE_BOOTROM_STATUS_PROD_KEY_CHECK_FAILURE: + xe_gt_err(gt, "firmware production part check failure\n"); + break; } - if (REG_FIELD_GET(GS_UKERNEL_MASK, status) == - XE_GUC_LOAD_STATUS_EXCEPTION) { - xe_gt_info(gt, "GuC firmware exception. EIP: %#x\n", - xe_mmio_read32(gt, SOFT_SCRATCH(13))); - ret = -ENXIO; + switch (ukernel) { + case XE_GUC_LOAD_STATUS_EXCEPTION: + xe_gt_err(gt, "firmware exception. EIP: %#x\n", + xe_mmio_read32(gt, SOFT_SCRATCH(13))); + break; + + case XE_GUC_LOAD_STATUS_INIT_MMIO_SAVE_RESTORE_INVALID: + xe_gt_err(gt, "illegal register in save/restore workaround list\n"); + break; + + case XE_GUC_LOAD_STATUS_HWCONFIG_START: + xe_gt_err(gt, "still extracting hwconfig table.\n"); + break; } + + xe_device_declare_wedged(gt_to_xe(gt)); + } else if (delta_ms > GUC_LOAD_TIME_WARN_MS) { + xe_gt_warn(gt, "excessive init time: %lldms! [status = 0x%08X, timeouts = %d]\n", + delta_ms, status, count); + xe_gt_warn(gt, "excessive init time: [freq = %dMHz (req = %dMHz), before = %dMHz, perf_limit_reasons = 0x%08X]\n", + xe_guc_pc_get_act_freq(guc_pc), guc_pc_get_cur_freq(guc_pc), + before_freq, xe_gt_throttle_get_limit_reasons(gt)); } else { - xe_gt_dbg(gt, "GuC successfully loaded\n"); + xe_gt_dbg(gt, "init took %lldms, freq = %dMHz (req = %dMHz), before = %dMHz, status = 0x%08X, timeouts = %d\n", + delta_ms, xe_guc_pc_get_act_freq(guc_pc), guc_pc_get_cur_freq(guc_pc), + before_freq, status, count); } - - return ret; } static int __xe_guc_upload(struct xe_guc *guc) @@ -532,9 +722,7 @@ static int __xe_guc_upload(struct xe_guc *guc) goto out; /* Wait for authentication */ - ret = guc_wait_ucode(guc); - if (ret) - goto out; + guc_wait_ucode(guc); xe_uc_fw_change_status(&guc->fw, XE_UC_FIRMWARE_RUNNING); return 0; @@ -544,6 +732,38 @@ out: return 0 /* FIXME: ret, don't want to stop load currently */; } +static int vf_guc_min_load_for_hwconfig(struct xe_guc *guc) +{ + struct xe_gt *gt = guc_to_gt(guc); + int ret; + + ret = xe_gt_sriov_vf_bootstrap(gt); + if (ret) + return ret; + + ret = xe_gt_sriov_vf_query_config(gt); + if (ret) + return ret; + + ret = xe_guc_hwconfig_init(guc); + if (ret) + return ret; + + ret = xe_guc_enable_communication(guc); + if (ret) + return ret; + + ret = xe_gt_sriov_vf_connect(gt); + if (ret) + return ret; + + ret = xe_gt_sriov_vf_query_runtime(gt); + if (ret) + return ret; + + return 0; +} + /** * xe_guc_min_load_for_hwconfig - load minimal GuC and read hwconfig table * @guc: The GuC object @@ -559,6 +779,9 @@ int xe_guc_min_load_for_hwconfig(struct xe_guc *guc) { int ret; + if (IS_SRIOV_VF(guc_to_xe(guc))) + return vf_guc_min_load_for_hwconfig(guc); + xe_guc_ads_populate_minimal(&guc->ads); /* Raise GT freq to speed up HuC/GuC load */ @@ -891,17 +1114,11 @@ void xe_guc_stop_prepare(struct xe_guc *guc) XE_WARN_ON(xe_guc_pc_stop(&guc->pc)); } -int xe_guc_stop(struct xe_guc *guc) +void xe_guc_stop(struct xe_guc *guc) { - int ret; - xe_guc_ct_stop(&guc->ct); - ret = xe_guc_submit_stop(guc); - if (ret) - return ret; - - return 0; + xe_guc_submit_stop(guc); } int xe_guc_start(struct xe_guc *guc) @@ -950,30 +1167,3 @@ void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p) xe_guc_ct_print(&guc->ct, p, false); xe_guc_submit_print(guc, p); } - -/** - * xe_guc_in_reset() - Detect if GuC MIA is in reset. - * @guc: The GuC object - * - * This function detects runtime resume from d3cold by leveraging - * GUC_STATUS, GUC doesn't get reset during d3hot, - * it strictly to be called from RPM resume handler. - * - * Return: true if failed to get forcewake or GuC MIA is in Reset, - * otherwise false. - */ -bool xe_guc_in_reset(struct xe_guc *guc) -{ - struct xe_gt *gt = guc_to_gt(guc); - u32 status; - int err; - - err = xe_force_wake_get(gt_to_fw(gt), XE_FW_GT); - if (err) - return true; - - status = xe_mmio_read32(gt, GUC_STATUS); - xe_force_wake_put(gt_to_fw(gt), XE_FW_GT); - - return status & GS_MIA_IN_RESET; -} |