summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/xe/xe_gsc.c
diff options
context:
space:
mode:
authorDaniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>2023-11-17 14:51:47 -0800
committerRodrigo Vivi <rodrigo.vivi@intel.com>2023-12-21 11:45:06 -0500
commitdd0e89e5edc20d3875ed7ded48e7e97118cdfbc8 (patch)
treeeabea4b033dbcaa883308b551b395ec4142e3b1d /drivers/gpu/drm/xe/xe_gsc.c
parent985d5a49e8454d64a01ab362e9091788eeed1839 (diff)
drm/xe/gsc: GSC FW load
The GSC FW must be copied in a 4MB stolen memory allocation, whose GGTT address is then passed as a parameter to a dedicated load instruction submitted via the GSC engine. Since the GSC load is relatively slow (up to 250ms), we perform it asynchronously via a worker. This requires us to make sure that the worker has stopped before suspending/unloading. Note that we can't yet use xe_migrate_copy for the copy because it doesn't work with stolen memory right now, so we do a memcpy from the CPU side instead. v2: add comment about timeout value, fix GSC status checking before load (John) Bspec: 65306, 65346 Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com> Cc: Alan Previn <alan.previn.teres.alexis@intel.com> Cc: John Harrison <John.C.Harrison@Intel.com> Reviewed-by: John Harrison <John.C.Harrison@Intel.com> Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Diffstat (limited to 'drivers/gpu/drm/xe/xe_gsc.c')
-rw-r--r--drivers/gpu/drm/xe/xe_gsc.c250
1 files changed, 250 insertions, 0 deletions
diff --git a/drivers/gpu/drm/xe/xe_gsc.c b/drivers/gpu/drm/xe/xe_gsc.c
index 216f36cee0f7..e014b829bc8a 100644
--- a/drivers/gpu/drm/xe/xe_gsc.c
+++ b/drivers/gpu/drm/xe/xe_gsc.c
@@ -5,10 +5,20 @@
#include "xe_gsc.h"
+#include <drm/drm_managed.h>
+
+#include "xe_bb.h"
+#include "xe_bo.h"
#include "xe_device.h"
+#include "xe_exec_queue.h"
#include "xe_gt.h"
#include "xe_gt_printk.h"
+#include "xe_map.h"
+#include "xe_mmio.h"
+#include "xe_sched_job.h"
#include "xe_uc_fw.h"
+#include "instructions/xe_gsc_commands.h"
+#include "regs/xe_gsc_regs.h"
static struct xe_gt *
gsc_to_gt(struct xe_gsc *gsc)
@@ -16,6 +26,145 @@ gsc_to_gt(struct xe_gsc *gsc)
return container_of(gsc, struct xe_gt, uc.gsc);
}
+static int memcpy_fw(struct xe_gsc *gsc)
+{
+ struct xe_gt *gt = gsc_to_gt(gsc);
+ struct xe_device *xe = gt_to_xe(gt);
+ u32 fw_size = gsc->fw.size;
+ void *storage;
+
+ /*
+ * FIXME: xe_migrate_copy does not work with stolen mem yet, so we use
+ * a memcpy for now.
+ */
+ storage = kmalloc(fw_size, GFP_KERNEL);
+ if (!storage)
+ return -ENOMEM;
+
+ xe_map_memcpy_from(xe, storage, &gsc->fw.bo->vmap, 0, fw_size);
+ xe_map_memcpy_to(xe, &gsc->private->vmap, 0, storage, fw_size);
+ xe_map_memset(xe, &gsc->private->vmap, fw_size, 0, gsc->private->size - fw_size);
+
+ kfree(storage);
+
+ return 0;
+}
+
+static int emit_gsc_upload(struct xe_gsc *gsc)
+{
+ struct xe_gt *gt = gsc_to_gt(gsc);
+ u64 offset = xe_bo_ggtt_addr(gsc->private);
+ struct xe_bb *bb;
+ struct xe_sched_job *job;
+ struct dma_fence *fence;
+ long timeout;
+
+ bb = xe_bb_new(gt, 4, false);
+ if (IS_ERR(bb))
+ return PTR_ERR(bb);
+
+ bb->cs[bb->len++] = GSC_FW_LOAD;
+ bb->cs[bb->len++] = lower_32_bits(offset);
+ bb->cs[bb->len++] = upper_32_bits(offset);
+ bb->cs[bb->len++] = (gsc->private->size / SZ_4K) | GSC_FW_LOAD_LIMIT_VALID;
+
+ job = xe_bb_create_job(gsc->q, bb);
+ if (IS_ERR(job)) {
+ xe_bb_free(bb, NULL);
+ return PTR_ERR(job);
+ }
+
+ xe_sched_job_arm(job);
+ fence = dma_fence_get(&job->drm.s_fence->finished);
+ xe_sched_job_push(job);
+
+ timeout = dma_fence_wait_timeout(fence, false, HZ);
+ dma_fence_put(fence);
+ xe_bb_free(bb, NULL);
+ if (timeout < 0)
+ return timeout;
+ else if (!timeout)
+ return -ETIME;
+
+ return 0;
+}
+
+static int gsc_fw_is_loaded(struct xe_gt *gt)
+{
+ return xe_mmio_read32(gt, HECI_FWSTS1(MTL_GSC_HECI1_BASE)) &
+ HECI1_FWSTS1_INIT_COMPLETE;
+}
+
+static int gsc_fw_wait(struct xe_gt *gt)
+{
+ /*
+ * GSC load can take up to 250ms from the moment the instruction is
+ * executed by the GSCCS. To account for possible submission delays or
+ * other issues, we use a 500ms timeout in the wait here.
+ */
+ return xe_mmio_wait32(gt, HECI_FWSTS1(MTL_GSC_HECI1_BASE),
+ HECI1_FWSTS1_INIT_COMPLETE,
+ HECI1_FWSTS1_INIT_COMPLETE,
+ 500 * USEC_PER_MSEC, NULL, false);
+}
+
+static int gsc_upload(struct xe_gsc *gsc)
+{
+ struct xe_gt *gt = gsc_to_gt(gsc);
+ struct xe_device *xe = gt_to_xe(gt);
+ int err;
+
+ /* we should only be here if the init step were successful */
+ xe_assert(xe, xe_uc_fw_is_loadable(&gsc->fw) && gsc->q);
+
+ if (gsc_fw_is_loaded(gt)) {
+ xe_gt_err(gt, "GSC already loaded at upload time\n");
+ return -EEXIST;
+ }
+
+ err = memcpy_fw(gsc);
+ if (err) {
+ xe_gt_err(gt, "Failed to memcpy GSC FW\n");
+ return err;
+ }
+
+ err = emit_gsc_upload(gsc);
+ if (err) {
+ xe_gt_err(gt, "Failed to emit GSC FW upload (%pe)\n", ERR_PTR(err));
+ return err;
+ }
+
+ err = gsc_fw_wait(gt);
+ if (err) {
+ xe_gt_err(gt, "Failed to wait for GSC load (%pe)\n", ERR_PTR(err));
+ return err;
+ }
+
+ xe_gt_dbg(gt, "GSC FW async load completed\n");
+
+ return 0;
+}
+
+static void gsc_work(struct work_struct *work)
+{
+ struct xe_gsc *gsc = container_of(work, typeof(*gsc), work);
+ struct xe_gt *gt = gsc_to_gt(gsc);
+ struct xe_device *xe = gt_to_xe(gt);
+ int ret;
+
+ xe_device_mem_access_get(xe);
+ xe_force_wake_get(gt_to_fw(gt), XE_FW_GSC);
+
+ ret = gsc_upload(gsc);
+ if (ret && ret != -EEXIST)
+ xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_LOAD_FAIL);
+ else
+ xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_TRANSFERRED);
+
+ xe_force_wake_put(gt_to_fw(gt), XE_FW_GSC);
+ xe_device_mem_access_put(xe);
+}
+
int xe_gsc_init(struct xe_gsc *gsc)
{
struct xe_gt *gt = gsc_to_gt(gsc);
@@ -23,6 +172,7 @@ int xe_gsc_init(struct xe_gsc *gsc)
int ret;
gsc->fw.type = XE_UC_FW_TYPE_GSC;
+ INIT_WORK(&gsc->work, gsc_work);
/* The GSC uC is only available on the media GT */
if (tile->media_gt && (gt != tile->media_gt)) {
@@ -50,3 +200,103 @@ out:
return ret;
}
+static void free_resources(struct drm_device *drm, void *arg)
+{
+ struct xe_gsc *gsc = arg;
+
+ if (gsc->wq) {
+ destroy_workqueue(gsc->wq);
+ gsc->wq = NULL;
+ }
+
+ if (gsc->q) {
+ xe_exec_queue_put(gsc->q);
+ gsc->q = NULL;
+ }
+
+ if (gsc->private) {
+ xe_bo_unpin_map_no_vm(gsc->private);
+ gsc->private = NULL;
+ }
+}
+
+int xe_gsc_init_post_hwconfig(struct xe_gsc *gsc)
+{
+ struct xe_gt *gt = gsc_to_gt(gsc);
+ struct xe_tile *tile = gt_to_tile(gt);
+ struct xe_device *xe = gt_to_xe(gt);
+ struct xe_hw_engine *hwe = xe_gt_hw_engine(gt, XE_ENGINE_CLASS_OTHER, 0, true);
+ struct xe_exec_queue *q;
+ struct workqueue_struct *wq;
+ struct xe_bo *bo;
+ int err;
+
+ if (!xe_uc_fw_is_available(&gsc->fw))
+ return 0;
+
+ if (!hwe)
+ return -ENODEV;
+
+ bo = xe_bo_create_pin_map(xe, tile, NULL, SZ_4M,
+ ttm_bo_type_kernel,
+ XE_BO_CREATE_STOLEN_BIT |
+ XE_BO_CREATE_GGTT_BIT);
+ if (IS_ERR(bo))
+ return PTR_ERR(bo);
+
+ q = xe_exec_queue_create(xe, NULL,
+ BIT(hwe->logical_instance), 1, hwe,
+ EXEC_QUEUE_FLAG_KERNEL |
+ EXEC_QUEUE_FLAG_PERMANENT);
+ if (IS_ERR(q)) {
+ xe_gt_err(gt, "Failed to create queue for GSC submission\n");
+ err = PTR_ERR(q);
+ goto out_bo;
+ }
+
+ wq = alloc_ordered_workqueue("gsc-ordered-wq", 0);
+ if (!wq) {
+ err = -ENOMEM;
+ goto out_q;
+ }
+
+ gsc->private = bo;
+ gsc->q = q;
+ gsc->wq = wq;
+
+ err = drmm_add_action_or_reset(&xe->drm, free_resources, gsc);
+ if (err)
+ return err;
+
+ xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_LOADABLE);
+
+ return 0;
+
+out_q:
+ xe_exec_queue_put(q);
+out_bo:
+ xe_bo_unpin_map_no_vm(bo);
+ return err;
+}
+
+void xe_gsc_load_start(struct xe_gsc *gsc)
+{
+ struct xe_gt *gt = gsc_to_gt(gsc);
+
+ if (!xe_uc_fw_is_loadable(&gsc->fw) || !gsc->q)
+ return;
+
+ /* GSC FW survives GT reset and D3Hot */
+ if (gsc_fw_is_loaded(gt)) {
+ xe_uc_fw_change_status(&gsc->fw, XE_UC_FIRMWARE_TRANSFERRED);
+ return;
+ }
+
+ queue_work(gsc->wq, &gsc->work);
+}
+
+void xe_gsc_wait_for_worker_completion(struct xe_gsc *gsc)
+{
+ if (xe_uc_fw_is_loadable(&gsc->fw) && gsc->wq)
+ flush_work(&gsc->work);
+}