Diffstat (limited to 'drivers/accel')
122 files changed, 16514 insertions, 4490 deletions
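Before the patch body, a quick orientation: the Kconfig hunk below adds CONFIG_DRM_ACCEL_AMDXDNA under the accel subsystem, so building the driver as a module reduces to a config fragment along these lines. This is a minimal sketch based only on the dependencies listed in that hunk (plus DRM_ACCEL's own dependency on DRM); the selected symbols (DRM_SCHED, DRM_GEM_SHMEM_HELPER, FW_LOADER, HMM_MIRROR) are pulled in automatically, and the exact set needed will depend on the rest of the kernel configuration:

    # Sketch of a .config fragment for the driver added by this patch
    CONFIG_X86_64=y
    CONFIG_PCI=y
    CONFIG_AMD_IOMMU=y
    CONFIG_DRM=y                  # DRM_ACCEL lives under the DRM subsystem
    CONFIG_DRM_ACCEL=y
    CONFIG_DRM_ACCEL_AMDXDNA=m    # per the Kconfig help text, the module is amdxdna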
diff --git a/drivers/accel/Kconfig b/drivers/accel/Kconfig index 64065fb8922b..5b9490367a39 100644 --- a/drivers/accel/Kconfig +++ b/drivers/accel/Kconfig @@ -24,6 +24,7 @@ menuconfig DRM_ACCEL different device files, called accel/accel* (in /dev, sysfs and debugfs). +source "drivers/accel/amdxdna/Kconfig" source "drivers/accel/habanalabs/Kconfig" source "drivers/accel/ivpu/Kconfig" source "drivers/accel/qaic/Kconfig" diff --git a/drivers/accel/Makefile b/drivers/accel/Makefile index ab3df932937f..a301fb6089d4 100644 --- a/drivers/accel/Makefile +++ b/drivers/accel/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_DRM_ACCEL_AMDXDNA) += amdxdna/ obj-$(CONFIG_DRM_ACCEL_HABANALABS) += habanalabs/ obj-$(CONFIG_DRM_ACCEL_IVPU) += ivpu/ obj-$(CONFIG_DRM_ACCEL_QAIC) += qaic/ diff --git a/drivers/accel/amdxdna/Kconfig b/drivers/accel/amdxdna/Kconfig new file mode 100644 index 000000000000..f39d7a87296c --- /dev/null +++ b/drivers/accel/amdxdna/Kconfig @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config DRM_ACCEL_AMDXDNA + tristate "AMD AI Engine" + depends on AMD_IOMMU + depends on DRM_ACCEL + depends on PCI && HAS_IOMEM + depends on X86_64 + select DRM_SCHED + select DRM_GEM_SHMEM_HELPER + select FW_LOADER + select HMM_MIRROR + help + Choose this option to enable support for NPU integrated into AMD + client CPUs like AMD Ryzen AI 300 Series. AMD NPU can be used to + accelerate machine learning applications. + + If "M" is selected, the driver module will be amdxdna. diff --git a/drivers/accel/amdxdna/Makefile b/drivers/accel/amdxdna/Makefile new file mode 100644 index 000000000000..0e9adf6890a0 --- /dev/null +++ b/drivers/accel/amdxdna/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0-only + +amdxdna-y := \ + aie2_ctx.o \ + aie2_error.o \ + aie2_message.o \ + aie2_pci.o \ + aie2_pm.o \ + aie2_psp.o \ + aie2_smu.o \ + aie2_solver.o \ + amdxdna_ctx.o \ + amdxdna_gem.o \ + amdxdna_mailbox.o \ + amdxdna_mailbox_helper.o \ + amdxdna_pci_drv.o \ + amdxdna_sysfs.o \ + npu1_regs.o \ + npu2_regs.o \ + npu4_regs.o \ + npu5_regs.o \ + npu6_regs.o +obj-$(CONFIG_DRM_ACCEL_AMDXDNA) = amdxdna.o diff --git a/drivers/accel/amdxdna/TODO b/drivers/accel/amdxdna/TODO new file mode 100644 index 000000000000..5119bccd1917 --- /dev/null +++ b/drivers/accel/amdxdna/TODO @@ -0,0 +1,3 @@ +- Add import and export BO support +- Add debugfs support +- Add debug BO support diff --git a/drivers/accel/amdxdna/aie2_ctx.c b/drivers/accel/amdxdna/aie2_ctx.c new file mode 100644 index 000000000000..5f43db02b240 --- /dev/null +++ b/drivers/accel/amdxdna/aie2_ctx.c @@ -0,0 +1,910 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024, Advanced Micro Devices, Inc. 
+ */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/drm_gem.h> +#include <drm/drm_gem_shmem_helper.h> +#include <drm/drm_print.h> +#include <drm/drm_syncobj.h> +#include <linux/hmm.h> +#include <linux/types.h> +#include <linux/xarray.h> +#include <trace/events/amdxdna.h> + +#include "aie2_msg_priv.h" +#include "aie2_pci.h" +#include "aie2_solver.h" +#include "amdxdna_ctx.h" +#include "amdxdna_gem.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_pci_drv.h" + +static bool force_cmdlist; +module_param(force_cmdlist, bool, 0600); +MODULE_PARM_DESC(force_cmdlist, "Force use command list (Default false)"); + +#define HWCTX_MAX_TIMEOUT 60000 /* milliseconds */ + +static void aie2_job_release(struct kref *ref) +{ + struct amdxdna_sched_job *job; + + job = container_of(ref, struct amdxdna_sched_job, refcnt); + amdxdna_sched_job_cleanup(job); + if (job->out_fence) + dma_fence_put(job->out_fence); + kfree(job); +} + +static void aie2_job_put(struct amdxdna_sched_job *job) +{ + kref_put(&job->refcnt, aie2_job_release); +} + +/* The bad_job is used in aie2_sched_job_timedout, otherwise, set it to NULL */ +static void aie2_hwctx_stop(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx, + struct drm_sched_job *bad_job) +{ + drm_sched_stop(&hwctx->priv->sched, bad_job); + aie2_destroy_context(xdna->dev_handle, hwctx); +} + +static int aie2_hwctx_restart(struct amdxdna_dev *xdna, struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_gem_obj *heap = hwctx->priv->heap; + int ret; + + ret = aie2_create_context(xdna->dev_handle, hwctx); + if (ret) { + XDNA_ERR(xdna, "Create hwctx failed, ret %d", ret); + goto out; + } + + ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id, + heap->mem.userptr, heap->mem.size); + if (ret) { + XDNA_ERR(xdna, "Map host buf failed, ret %d", ret); + goto out; + } + + if (hwctx->status != HWCTX_STAT_READY) { + XDNA_DBG(xdna, "hwctx is not ready, status %d", hwctx->status); + goto out; + } + + ret = aie2_config_cu(hwctx); + if (ret) { + XDNA_ERR(xdna, "Config cu failed, ret %d", ret); + goto out; + } + +out: + drm_sched_start(&hwctx->priv->sched, 0); + XDNA_DBG(xdna, "%s restarted, ret %d", hwctx->name, ret); + return ret; +} + +void aie2_restart_ctx(struct amdxdna_client *client) +{ + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_hwctx *hwctx; + unsigned long hwctx_id; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + mutex_lock(&client->hwctx_lock); + amdxdna_for_each_hwctx(client, hwctx_id, hwctx) { + if (hwctx->status != HWCTX_STAT_STOP) + continue; + + hwctx->status = hwctx->old_status; + XDNA_DBG(xdna, "Resetting %s", hwctx->name); + aie2_hwctx_restart(xdna, hwctx); + } + mutex_unlock(&client->hwctx_lock); +} + +static struct dma_fence *aie2_cmd_get_out_fence(struct amdxdna_hwctx *hwctx, u64 seq) +{ + struct dma_fence *fence, *out_fence = NULL; + int ret; + + fence = drm_syncobj_fence_get(hwctx->priv->syncobj); + if (!fence) + return NULL; + + ret = dma_fence_chain_find_seqno(&fence, seq); + if (ret) + goto out; + + out_fence = dma_fence_get(dma_fence_chain_contained(fence)); + +out: + dma_fence_put(fence); + return out_fence; +} + +static void aie2_hwctx_wait_for_idle(struct amdxdna_hwctx *hwctx) +{ + struct dma_fence *fence; + + fence = aie2_cmd_get_out_fence(hwctx, hwctx->priv->seq - 1); + if (!fence) + return; + + dma_fence_wait(fence, false); + dma_fence_put(fence); +} + +void aie2_hwctx_suspend(struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + + /* + * Command timeout is 
unlikely. But if it happens, it doesn't + * break the system. aie2_hwctx_stop() will destroy mailbox + * and abort all commands. + */ + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + aie2_hwctx_wait_for_idle(hwctx); + aie2_hwctx_stop(xdna, hwctx, NULL); + hwctx->old_status = hwctx->status; + hwctx->status = HWCTX_STAT_STOP; +} + +void aie2_hwctx_resume(struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + + /* + * The resume path cannot guarantee that mailbox channel can be + * regenerated. If this happen, when submit message to this + * mailbox channel, error will return. + */ + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + hwctx->status = hwctx->old_status; + aie2_hwctx_restart(xdna, hwctx); +} + +static void +aie2_sched_notify(struct amdxdna_sched_job *job) +{ + struct dma_fence *fence = job->fence; + + trace_xdna_job(&job->base, job->hwctx->name, "signaled fence", job->seq); + job->hwctx->priv->completed++; + dma_fence_signal(fence); + + up(&job->hwctx->priv->job_sem); + job->job_done = true; + dma_fence_put(fence); + mmput_async(job->mm); + aie2_job_put(job); +} + +static int +aie2_sched_resp_handler(void *handle, const u32 *data, size_t size) +{ + struct amdxdna_sched_job *job = handle; + struct amdxdna_gem_obj *cmd_abo; + u32 ret = 0; + u32 status; + + cmd_abo = job->cmd_bo; + + if (unlikely(!data)) + goto out; + + if (unlikely(size != sizeof(u32))) { + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT); + ret = -EINVAL; + goto out; + } + + status = *data; + XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status); + if (status == AIE2_STATUS_SUCCESS) + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED); + else + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ERROR); + +out: + aie2_sched_notify(job); + return ret; +} + +static int +aie2_sched_nocmd_resp_handler(void *handle, const u32 *data, size_t size) +{ + struct amdxdna_sched_job *job = handle; + u32 ret = 0; + u32 status; + + if (unlikely(!data)) + goto out; + + if (unlikely(size != sizeof(u32))) { + ret = -EINVAL; + goto out; + } + + status = *data; + XDNA_DBG(job->hwctx->client->xdna, "Resp status 0x%x", status); + +out: + aie2_sched_notify(job); + return ret; +} + +static int +aie2_sched_cmdlist_resp_handler(void *handle, const u32 *data, size_t size) +{ + struct amdxdna_sched_job *job = handle; + struct amdxdna_gem_obj *cmd_abo; + struct cmd_chain_resp *resp; + struct amdxdna_dev *xdna; + u32 fail_cmd_status; + u32 fail_cmd_idx; + u32 ret = 0; + + cmd_abo = job->cmd_bo; + if (unlikely(!data) || unlikely(size != sizeof(u32) * 3)) { + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT); + ret = -EINVAL; + goto out; + } + + resp = (struct cmd_chain_resp *)data; + xdna = job->hwctx->client->xdna; + XDNA_DBG(xdna, "Status 0x%x", resp->status); + if (resp->status == AIE2_STATUS_SUCCESS) { + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_COMPLETED); + goto out; + } + + /* Slow path to handle error, read from ringbuf on BAR */ + fail_cmd_idx = resp->fail_cmd_idx; + fail_cmd_status = resp->fail_cmd_status; + XDNA_DBG(xdna, "Failed cmd idx %d, status 0x%x", + fail_cmd_idx, fail_cmd_status); + + if (fail_cmd_status == AIE2_STATUS_SUCCESS) { + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_ABORT); + ret = -EINVAL; + goto out; + } + amdxdna_cmd_set_state(cmd_abo, fail_cmd_status); + + if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN) { + struct amdxdna_cmd_chain *cc = amdxdna_cmd_get_payload(cmd_abo, NULL); + + cc->error_index = fail_cmd_idx; + if (cc->error_index >= 
cc->command_count) + cc->error_index = 0; + } +out: + aie2_sched_notify(job); + return ret; +} + +static struct dma_fence * +aie2_sched_job_run(struct drm_sched_job *sched_job) +{ + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job); + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; + struct amdxdna_hwctx *hwctx = job->hwctx; + struct dma_fence *fence; + int ret; + + if (!mmget_not_zero(job->mm)) + return ERR_PTR(-ESRCH); + + kref_get(&job->refcnt); + fence = dma_fence_get(job->fence); + + if (unlikely(!cmd_abo)) { + ret = aie2_sync_bo(hwctx, job, aie2_sched_nocmd_resp_handler); + goto out; + } + + amdxdna_cmd_set_state(cmd_abo, ERT_CMD_STATE_NEW); + + if (amdxdna_cmd_get_op(cmd_abo) == ERT_CMD_CHAIN) + ret = aie2_cmdlist_multi_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler); + else if (force_cmdlist) + ret = aie2_cmdlist_single_execbuf(hwctx, job, aie2_sched_cmdlist_resp_handler); + else + ret = aie2_execbuf(hwctx, job, aie2_sched_resp_handler); + +out: + if (ret) { + dma_fence_put(job->fence); + aie2_job_put(job); + mmput(job->mm); + fence = ERR_PTR(ret); + } + trace_xdna_job(sched_job, hwctx->name, "sent to device", job->seq); + + return fence; +} + +static void aie2_sched_job_free(struct drm_sched_job *sched_job) +{ + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job); + struct amdxdna_hwctx *hwctx = job->hwctx; + + trace_xdna_job(sched_job, hwctx->name, "job free", job->seq); + if (!job->job_done) + up(&hwctx->priv->job_sem); + + drm_sched_job_cleanup(sched_job); + aie2_job_put(job); +} + +static enum drm_gpu_sched_stat +aie2_sched_job_timedout(struct drm_sched_job *sched_job) +{ + struct amdxdna_sched_job *job = drm_job_to_xdna_job(sched_job); + struct amdxdna_hwctx *hwctx = job->hwctx; + struct amdxdna_dev *xdna; + + xdna = hwctx->client->xdna; + trace_xdna_job(sched_job, hwctx->name, "job timedout", job->seq); + mutex_lock(&xdna->dev_lock); + aie2_hwctx_stop(xdna, hwctx, sched_job); + + aie2_hwctx_restart(xdna, hwctx); + mutex_unlock(&xdna->dev_lock); + + return DRM_GPU_SCHED_STAT_NOMINAL; +} + +const struct drm_sched_backend_ops sched_ops = { + .run_job = aie2_sched_job_run, + .free_job = aie2_sched_job_free, + .timedout_job = aie2_sched_job_timedout, +}; + +static int aie2_hwctx_col_list(struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct amdxdna_dev_hdl *ndev; + int start, end, first, last; + u32 width = 1, entries = 0; + int i; + + if (!hwctx->num_tiles) { + XDNA_ERR(xdna, "Number of tiles is zero"); + return -EINVAL; + } + + ndev = xdna->dev_handle; + if (unlikely(!ndev->metadata.core.row_count)) { + XDNA_WARN(xdna, "Core tile row count is zero"); + return -EINVAL; + } + + hwctx->num_col = hwctx->num_tiles / ndev->metadata.core.row_count; + if (!hwctx->num_col || hwctx->num_col > ndev->total_col) { + XDNA_ERR(xdna, "Invalid num_col %d", hwctx->num_col); + return -EINVAL; + } + + if (ndev->priv->col_align == COL_ALIGN_NATURE) + width = hwctx->num_col; + + /* + * In range [start, end], find out columns that is multiple of width. + * 'first' is the first column, + * 'last' is the last column, + * 'entries' is the total number of columns. 
+ */ + start = xdna->dev_info->first_col; + end = ndev->total_col - hwctx->num_col; + if (start > 0 && end == 0) { + XDNA_DBG(xdna, "Force start from col 0"); + start = 0; + } + first = start + (width - start % width) % width; + last = end - end % width; + if (last >= first) + entries = (last - first) / width + 1; + XDNA_DBG(xdna, "start %d end %d first %d last %d", + start, end, first, last); + + if (unlikely(!entries)) { + XDNA_ERR(xdna, "Start %d end %d width %d", + start, end, width); + return -EINVAL; + } + + hwctx->col_list = kmalloc_array(entries, sizeof(*hwctx->col_list), GFP_KERNEL); + if (!hwctx->col_list) + return -ENOMEM; + + hwctx->col_list_len = entries; + hwctx->col_list[0] = first; + for (i = 1; i < entries; i++) + hwctx->col_list[i] = hwctx->col_list[i - 1] + width; + + print_hex_dump_debug("col_list: ", DUMP_PREFIX_OFFSET, 16, 4, hwctx->col_list, + entries * sizeof(*hwctx->col_list), false); + return 0; +} + +static int aie2_alloc_resource(struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct alloc_requests *xrs_req; + int ret; + + xrs_req = kzalloc(sizeof(*xrs_req), GFP_KERNEL); + if (!xrs_req) + return -ENOMEM; + + xrs_req->cdo.start_cols = hwctx->col_list; + xrs_req->cdo.cols_len = hwctx->col_list_len; + xrs_req->cdo.ncols = hwctx->num_col; + xrs_req->cdo.qos_cap.opc = hwctx->max_opc; + + xrs_req->rqos.gops = hwctx->qos.gops; + xrs_req->rqos.fps = hwctx->qos.fps; + xrs_req->rqos.dma_bw = hwctx->qos.dma_bandwidth; + xrs_req->rqos.latency = hwctx->qos.latency; + xrs_req->rqos.exec_time = hwctx->qos.frame_exec_time; + xrs_req->rqos.priority = hwctx->qos.priority; + + xrs_req->rid = (uintptr_t)hwctx; + + ret = xrs_allocate_resource(xdna->xrs_hdl, xrs_req, hwctx); + if (ret) + XDNA_ERR(xdna, "Allocate AIE resource failed, ret %d", ret); + + kfree(xrs_req); + return ret; +} + +static void aie2_release_resource(struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + int ret; + + ret = xrs_release_resource(xdna->xrs_hdl, (uintptr_t)hwctx); + if (ret) + XDNA_ERR(xdna, "Release AIE resource failed, ret %d", ret); +} + +static int aie2_ctx_syncobj_create(struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct drm_file *filp = hwctx->client->filp; + struct drm_syncobj *syncobj; + u32 hdl; + int ret; + + hwctx->syncobj_hdl = AMDXDNA_INVALID_FENCE_HANDLE; + + ret = drm_syncobj_create(&syncobj, 0, NULL); + if (ret) { + XDNA_ERR(xdna, "Create ctx syncobj failed, ret %d", ret); + return ret; + } + ret = drm_syncobj_get_handle(filp, syncobj, &hdl); + if (ret) { + drm_syncobj_put(syncobj); + XDNA_ERR(xdna, "Create ctx syncobj handle failed, ret %d", ret); + return ret; + } + hwctx->priv->syncobj = syncobj; + hwctx->syncobj_hdl = hdl; + + return 0; +} + +static void aie2_ctx_syncobj_destroy(struct amdxdna_hwctx *hwctx) +{ + /* + * The syncobj_hdl is owned by user space and will be cleaned up + * separately. 
+ */ + drm_syncobj_put(hwctx->priv->syncobj); +} + +int aie2_hwctx_init(struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_client *client = hwctx->client; + struct amdxdna_dev *xdna = client->xdna; + struct drm_gpu_scheduler *sched; + struct amdxdna_hwctx_priv *priv; + struct amdxdna_gem_obj *heap; + struct amdxdna_dev_hdl *ndev; + int i, ret; + + priv = kzalloc(sizeof(*hwctx->priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + hwctx->priv = priv; + + mutex_lock(&client->mm_lock); + heap = client->dev_heap; + if (!heap) { + XDNA_ERR(xdna, "The client dev heap object not exist"); + mutex_unlock(&client->mm_lock); + ret = -ENOENT; + goto free_priv; + } + drm_gem_object_get(to_gobj(heap)); + mutex_unlock(&client->mm_lock); + priv->heap = heap; + sema_init(&priv->job_sem, HWCTX_MAX_CMDS); + + ret = amdxdna_gem_pin(heap); + if (ret) { + XDNA_ERR(xdna, "Dev heap pin failed, ret %d", ret); + goto put_heap; + } + + for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) { + struct amdxdna_gem_obj *abo; + struct amdxdna_drm_create_bo args = { + .flags = 0, + .type = AMDXDNA_BO_DEV, + .vaddr = 0, + .size = MAX_CHAIN_CMDBUF_SIZE, + }; + + abo = amdxdna_drm_alloc_dev_bo(&xdna->ddev, &args, client->filp, true); + if (IS_ERR(abo)) { + ret = PTR_ERR(abo); + goto free_cmd_bufs; + } + + XDNA_DBG(xdna, "Command buf %d addr 0x%llx size 0x%lx", + i, abo->mem.dev_addr, abo->mem.size); + priv->cmd_buf[i] = abo; + } + + sched = &priv->sched; + mutex_init(&priv->io_lock); + + fs_reclaim_acquire(GFP_KERNEL); + might_lock(&priv->io_lock); + fs_reclaim_release(GFP_KERNEL); + + ret = drm_sched_init(sched, &sched_ops, NULL, DRM_SCHED_PRIORITY_COUNT, + HWCTX_MAX_CMDS, 0, msecs_to_jiffies(HWCTX_MAX_TIMEOUT), + NULL, NULL, hwctx->name, xdna->ddev.dev); + if (ret) { + XDNA_ERR(xdna, "Failed to init DRM scheduler. ret %d", ret); + goto free_cmd_bufs; + } + + ret = drm_sched_entity_init(&priv->entity, DRM_SCHED_PRIORITY_NORMAL, + &sched, 1, NULL); + if (ret) { + XDNA_ERR(xdna, "Failed to initial sched entiry. 
ret %d", ret); + goto free_sched; + } + + ret = aie2_hwctx_col_list(hwctx); + if (ret) { + XDNA_ERR(xdna, "Create col list failed, ret %d", ret); + goto free_entity; + } + + ret = aie2_alloc_resource(hwctx); + if (ret) { + XDNA_ERR(xdna, "Alloc hw resource failed, ret %d", ret); + goto free_col_list; + } + + ret = aie2_map_host_buf(xdna->dev_handle, hwctx->fw_ctx_id, + heap->mem.userptr, heap->mem.size); + if (ret) { + XDNA_ERR(xdna, "Map host buffer failed, ret %d", ret); + goto release_resource; + } + + ret = aie2_ctx_syncobj_create(hwctx); + if (ret) { + XDNA_ERR(xdna, "Create syncobj failed, ret %d", ret); + goto release_resource; + } + + hwctx->status = HWCTX_STAT_INIT; + ndev = xdna->dev_handle; + ndev->hwctx_num++; + + XDNA_DBG(xdna, "hwctx %s init completed", hwctx->name); + + return 0; + +release_resource: + aie2_release_resource(hwctx); +free_col_list: + kfree(hwctx->col_list); +free_entity: + drm_sched_entity_destroy(&priv->entity); +free_sched: + drm_sched_fini(&priv->sched); +free_cmd_bufs: + for (i = 0; i < ARRAY_SIZE(priv->cmd_buf); i++) { + if (!priv->cmd_buf[i]) + continue; + drm_gem_object_put(to_gobj(priv->cmd_buf[i])); + } + amdxdna_gem_unpin(heap); +put_heap: + drm_gem_object_put(to_gobj(heap)); +free_priv: + kfree(priv); + return ret; +} + +void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_dev_hdl *ndev; + struct amdxdna_dev *xdna; + int idx; + + xdna = hwctx->client->xdna; + ndev = xdna->dev_handle; + ndev->hwctx_num--; + drm_sched_wqueue_stop(&hwctx->priv->sched); + + /* Now, scheduler will not send command to device. */ + aie2_release_resource(hwctx); + + /* + * All submitted commands are aborted. + * Restart scheduler queues to cleanup jobs. The amdxdna_sched_job_run() + * will return NODEV if it is called. 
+ */ + drm_sched_wqueue_start(&hwctx->priv->sched); + + aie2_hwctx_wait_for_idle(hwctx); + drm_sched_entity_destroy(&hwctx->priv->entity); + drm_sched_fini(&hwctx->priv->sched); + aie2_ctx_syncobj_destroy(hwctx); + + XDNA_DBG(xdna, "%s sequence number %lld", hwctx->name, hwctx->priv->seq); + + for (idx = 0; idx < ARRAY_SIZE(hwctx->priv->cmd_buf); idx++) + drm_gem_object_put(to_gobj(hwctx->priv->cmd_buf[idx])); + amdxdna_gem_unpin(hwctx->priv->heap); + drm_gem_object_put(to_gobj(hwctx->priv->heap)); + + mutex_destroy(&hwctx->priv->io_lock); + kfree(hwctx->col_list); + kfree(hwctx->priv); + kfree(hwctx->cus); +} + +static int aie2_hwctx_cu_config(struct amdxdna_hwctx *hwctx, void *buf, u32 size) +{ + struct amdxdna_hwctx_param_config_cu *config = buf; + struct amdxdna_dev *xdna = hwctx->client->xdna; + u32 total_size; + int ret; + + XDNA_DBG(xdna, "Config %d CU to %s", config->num_cus, hwctx->name); + if (XDNA_MBZ_DBG(xdna, config->pad, sizeof(config->pad))) + return -EINVAL; + + if (hwctx->status != HWCTX_STAT_INIT) { + XDNA_ERR(xdna, "Not support re-config CU"); + return -EINVAL; + } + + if (!config->num_cus) { + XDNA_ERR(xdna, "Number of CU is zero"); + return -EINVAL; + } + + total_size = struct_size(config, cu_configs, config->num_cus); + if (total_size > size) { + XDNA_ERR(xdna, "CU config larger than size"); + return -EINVAL; + } + + hwctx->cus = kmemdup(config, total_size, GFP_KERNEL); + if (!hwctx->cus) + return -ENOMEM; + + ret = aie2_config_cu(hwctx); + if (ret) { + XDNA_ERR(xdna, "Config CU to firmware failed, ret %d", ret); + goto free_cus; + } + + wmb(); /* To avoid locking in command submit when check status */ + hwctx->status = HWCTX_STAT_READY; + + return 0; + +free_cus: + kfree(hwctx->cus); + hwctx->cus = NULL; + return ret; +} + +int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + switch (type) { + case DRM_AMDXDNA_HWCTX_CONFIG_CU: + return aie2_hwctx_cu_config(hwctx, buf, size); + case DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF: + case DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF: + return -EOPNOTSUPP; + default: + XDNA_DBG(xdna, "Not supported type %d", type); + return -EOPNOTSUPP; + } +} + +static int aie2_populate_range(struct amdxdna_gem_obj *abo) +{ + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); + struct mm_struct *mm = abo->mem.notifier.mm; + struct hmm_range range = { 0 }; + unsigned long timeout; + int ret; + + XDNA_INFO_ONCE(xdna, "populate memory range %llx size %lx", + abo->mem.userptr, abo->mem.size); + range.notifier = &abo->mem.notifier; + range.start = abo->mem.userptr; + range.end = abo->mem.userptr + abo->mem.size; + range.hmm_pfns = abo->mem.pfns; + range.default_flags = HMM_PFN_REQ_FAULT; + + if (!mmget_not_zero(mm)) + return -EFAULT; + + timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); +again: + range.notifier_seq = mmu_interval_read_begin(&abo->mem.notifier); + mmap_read_lock(mm); + ret = hmm_range_fault(&range); + mmap_read_unlock(mm); + if (ret) { + if (time_after(jiffies, timeout)) { + ret = -ETIME; + goto put_mm; + } + + if (ret == -EBUSY) + goto again; + + goto put_mm; + } + + down_read(&xdna->notifier_lock); + if (mmu_interval_read_retry(&abo->mem.notifier, range.notifier_seq)) { + up_read(&xdna->notifier_lock); + goto again; + } + abo->mem.map_invalid = false; + up_read(&xdna->notifier_lock); + +put_mm: + mmput(mm); + return ret; +} + +int aie2_cmd_submit(struct amdxdna_hwctx 
*hwctx, struct amdxdna_sched_job *job, u64 *seq) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct ww_acquire_ctx acquire_ctx; + struct dma_fence_chain *chain; + struct amdxdna_gem_obj *abo; + unsigned long timeout = 0; + int ret, i; + + ret = down_interruptible(&hwctx->priv->job_sem); + if (ret) { + XDNA_ERR(xdna, "Grab job sem failed, ret %d", ret); + return ret; + } + + chain = dma_fence_chain_alloc(); + if (!chain) { + XDNA_ERR(xdna, "Alloc fence chain failed"); + ret = -ENOMEM; + goto up_sem; + } + + ret = drm_sched_job_init(&job->base, &hwctx->priv->entity, 1, hwctx); + if (ret) { + XDNA_ERR(xdna, "DRM job init failed, ret %d", ret); + goto free_chain; + } + +retry: + ret = drm_gem_lock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + if (ret) { + XDNA_WARN(xdna, "Failed to lock BOs, ret %d", ret); + goto cleanup_job; + } + + for (i = 0; i < job->bo_cnt; i++) { + ret = dma_resv_reserve_fences(job->bos[i]->resv, 1); + if (ret) { + XDNA_WARN(xdna, "Failed to reserve fences %d", ret); + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + goto cleanup_job; + } + } + + down_read(&xdna->notifier_lock); + for (i = 0; i < job->bo_cnt; i++) { + abo = to_xdna_obj(job->bos[i]); + if (abo->mem.map_invalid) { + up_read(&xdna->notifier_lock); + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + if (!timeout) { + timeout = jiffies + + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); + } else if (time_after(jiffies, timeout)) { + ret = -ETIME; + goto cleanup_job; + } + + ret = aie2_populate_range(abo); + if (ret) + goto cleanup_job; + goto retry; + } + } + + mutex_lock(&hwctx->priv->io_lock); + drm_sched_job_arm(&job->base); + job->out_fence = dma_fence_get(&job->base.s_fence->finished); + for (i = 0; i < job->bo_cnt; i++) + dma_resv_add_fence(job->bos[i]->resv, job->out_fence, DMA_RESV_USAGE_WRITE); + job->seq = hwctx->priv->seq++; + kref_get(&job->refcnt); + drm_sched_entity_push_job(&job->base); + + *seq = job->seq; + drm_syncobj_add_point(hwctx->priv->syncobj, chain, job->out_fence, *seq); + mutex_unlock(&hwctx->priv->io_lock); + + up_read(&xdna->notifier_lock); + drm_gem_unlock_reservations(job->bos, job->bo_cnt, &acquire_ctx); + + aie2_job_put(job); + + return 0; + +cleanup_job: + drm_sched_job_cleanup(&job->base); +free_chain: + dma_fence_chain_free(chain); +up_sem: + up(&hwctx->priv->job_sem); + job->job_done = true; + return ret; +} + +void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, + unsigned long cur_seq) +{ + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); + struct drm_gem_object *gobj = to_gobj(abo); + long ret; + + down_write(&xdna->notifier_lock); + abo->mem.map_invalid = true; + mmu_interval_set_seq(&abo->mem.notifier, cur_seq); + up_write(&xdna->notifier_lock); + ret = dma_resv_wait_timeout(gobj->resv, DMA_RESV_USAGE_BOOKKEEP, + true, MAX_SCHEDULE_TIMEOUT); + if (!ret || ret == -ERESTARTSYS) + XDNA_ERR(xdna, "Failed to wait for bo, ret %ld", ret); +} diff --git a/drivers/accel/amdxdna/aie2_error.c b/drivers/accel/amdxdna/aie2_error.c new file mode 100644 index 000000000000..b1defaa8513b --- /dev/null +++ b/drivers/accel/amdxdna/aie2_error.c @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. 
+ */ + +#include <drm/drm_cache.h> +#include <drm/drm_device.h> +#include <drm/drm_print.h> +#include <drm/gpu_scheduler.h> +#include <linux/dma-mapping.h> +#include <linux/kthread.h> +#include <linux/kernel.h> + +#include "aie2_msg_priv.h" +#include "aie2_pci.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_pci_drv.h" + +struct async_event { + struct amdxdna_dev_hdl *ndev; + struct async_event_msg_resp resp; + struct workqueue_struct *wq; + struct work_struct work; + u8 *buf; + dma_addr_t addr; + u32 size; +}; + +struct async_events { + struct workqueue_struct *wq; + u8 *buf; + dma_addr_t addr; + u32 size; + u32 event_cnt; + struct async_event event[] __counted_by(event_cnt); +}; + +/* + * Below enum, struct and lookup tables are porting from XAIE util header file. + * + * Below data is defined by AIE device and it is used for decode error message + * from the device. + */ + +enum aie_module_type { + AIE_MEM_MOD = 0, + AIE_CORE_MOD, + AIE_PL_MOD, +}; + +enum aie_error_category { + AIE_ERROR_SATURATION = 0, + AIE_ERROR_FP, + AIE_ERROR_STREAM, + AIE_ERROR_ACCESS, + AIE_ERROR_BUS, + AIE_ERROR_INSTRUCTION, + AIE_ERROR_ECC, + AIE_ERROR_LOCK, + AIE_ERROR_DMA, + AIE_ERROR_MEM_PARITY, + /* Unknown is not from XAIE, added for better category */ + AIE_ERROR_UNKNOWN, +}; + +/* Don't pack, unless XAIE side changed */ +struct aie_error { + __u8 row; + __u8 col; + __u32 mod_type; + __u8 event_id; +}; + +struct aie_err_info { + u32 err_cnt; + u32 ret_code; + u32 rsvd; + struct aie_error payload[] __counted_by(err_cnt); +}; + +struct aie_event_category { + u8 event_id; + enum aie_error_category category; +}; + +#define EVENT_CATEGORY(id, cat) { id, cat } +static const struct aie_event_category aie_ml_mem_event_cat[] = { + EVENT_CATEGORY(88U, AIE_ERROR_ECC), + EVENT_CATEGORY(90U, AIE_ERROR_ECC), + EVENT_CATEGORY(91U, AIE_ERROR_MEM_PARITY), + EVENT_CATEGORY(92U, AIE_ERROR_MEM_PARITY), + EVENT_CATEGORY(93U, AIE_ERROR_MEM_PARITY), + EVENT_CATEGORY(94U, AIE_ERROR_MEM_PARITY), + EVENT_CATEGORY(95U, AIE_ERROR_MEM_PARITY), + EVENT_CATEGORY(96U, AIE_ERROR_MEM_PARITY), + EVENT_CATEGORY(97U, AIE_ERROR_DMA), + EVENT_CATEGORY(98U, AIE_ERROR_DMA), + EVENT_CATEGORY(99U, AIE_ERROR_DMA), + EVENT_CATEGORY(100U, AIE_ERROR_DMA), + EVENT_CATEGORY(101U, AIE_ERROR_LOCK), +}; + +static const struct aie_event_category aie_ml_core_event_cat[] = { + EVENT_CATEGORY(55U, AIE_ERROR_ACCESS), + EVENT_CATEGORY(56U, AIE_ERROR_STREAM), + EVENT_CATEGORY(57U, AIE_ERROR_STREAM), + EVENT_CATEGORY(58U, AIE_ERROR_BUS), + EVENT_CATEGORY(59U, AIE_ERROR_INSTRUCTION), + EVENT_CATEGORY(60U, AIE_ERROR_ACCESS), + EVENT_CATEGORY(62U, AIE_ERROR_ECC), + EVENT_CATEGORY(64U, AIE_ERROR_ECC), + EVENT_CATEGORY(65U, AIE_ERROR_ACCESS), + EVENT_CATEGORY(66U, AIE_ERROR_ACCESS), + EVENT_CATEGORY(67U, AIE_ERROR_LOCK), + EVENT_CATEGORY(70U, AIE_ERROR_INSTRUCTION), + EVENT_CATEGORY(71U, AIE_ERROR_STREAM), + EVENT_CATEGORY(72U, AIE_ERROR_BUS), +}; + +static const struct aie_event_category aie_ml_mem_tile_event_cat[] = { + EVENT_CATEGORY(130U, AIE_ERROR_ECC), + EVENT_CATEGORY(132U, AIE_ERROR_ECC), + EVENT_CATEGORY(133U, AIE_ERROR_DMA), + EVENT_CATEGORY(134U, AIE_ERROR_DMA), + EVENT_CATEGORY(135U, AIE_ERROR_STREAM), + EVENT_CATEGORY(136U, AIE_ERROR_STREAM), + EVENT_CATEGORY(137U, AIE_ERROR_STREAM), + EVENT_CATEGORY(138U, AIE_ERROR_BUS), + EVENT_CATEGORY(139U, AIE_ERROR_LOCK), +}; + +static const struct aie_event_category aie_ml_shim_tile_event_cat[] = { + EVENT_CATEGORY(64U, AIE_ERROR_BUS), + EVENT_CATEGORY(65U, AIE_ERROR_STREAM), + EVENT_CATEGORY(66U, 
AIE_ERROR_STREAM), + EVENT_CATEGORY(67U, AIE_ERROR_BUS), + EVENT_CATEGORY(68U, AIE_ERROR_BUS), + EVENT_CATEGORY(69U, AIE_ERROR_BUS), + EVENT_CATEGORY(70U, AIE_ERROR_BUS), + EVENT_CATEGORY(71U, AIE_ERROR_BUS), + EVENT_CATEGORY(72U, AIE_ERROR_DMA), + EVENT_CATEGORY(73U, AIE_ERROR_DMA), + EVENT_CATEGORY(74U, AIE_ERROR_LOCK), +}; + +static enum aie_error_category +aie_get_error_category(u8 row, u8 event_id, enum aie_module_type mod_type) +{ + const struct aie_event_category *lut; + int num_entry; + int i; + + switch (mod_type) { + case AIE_PL_MOD: + lut = aie_ml_shim_tile_event_cat; + num_entry = ARRAY_SIZE(aie_ml_shim_tile_event_cat); + break; + case AIE_CORE_MOD: + lut = aie_ml_core_event_cat; + num_entry = ARRAY_SIZE(aie_ml_core_event_cat); + break; + case AIE_MEM_MOD: + if (row == 1) { + lut = aie_ml_mem_tile_event_cat; + num_entry = ARRAY_SIZE(aie_ml_mem_tile_event_cat); + } else { + lut = aie_ml_mem_event_cat; + num_entry = ARRAY_SIZE(aie_ml_mem_event_cat); + } + break; + default: + return AIE_ERROR_UNKNOWN; + } + + for (i = 0; i < num_entry; i++) { + if (event_id != lut[i].event_id) + continue; + + return lut[i].category; + } + + return AIE_ERROR_UNKNOWN; +} + +static u32 aie2_error_backtrack(struct amdxdna_dev_hdl *ndev, void *err_info, u32 num_err) +{ + struct aie_error *errs = err_info; + u32 err_col = 0; /* assume that AIE has less than 32 columns */ + int i; + + /* Get err column bitmap */ + for (i = 0; i < num_err; i++) { + struct aie_error *err = &errs[i]; + enum aie_error_category cat; + + cat = aie_get_error_category(err->row, err->event_id, err->mod_type); + XDNA_ERR(ndev->xdna, "Row: %d, Col: %d, module %d, event ID %d, category %d", + err->row, err->col, err->mod_type, + err->event_id, cat); + + if (err->col >= 32) { + XDNA_WARN(ndev->xdna, "Invalid column number"); + break; + } + + err_col |= (1 << err->col); + } + + return err_col; +} + +static int aie2_error_async_cb(void *handle, const u32 *data, size_t size) +{ + struct async_event_msg_resp *resp; + struct async_event *e = handle; + + if (data) { + resp = (struct async_event_msg_resp *)data; + e->resp.type = resp->type; + wmb(); /* Update status in the end, so that no lock for here */ + e->resp.status = resp->status; + } + queue_work(e->wq, &e->work); + return 0; +} + +static int aie2_error_event_send(struct async_event *e) +{ + drm_clflush_virt_range(e->buf, e->size); /* device can access */ + return aie2_register_asyn_event_msg(e->ndev, e->addr, e->size, e, + aie2_error_async_cb); +} + +static void aie2_error_worker(struct work_struct *err_work) +{ + struct aie_err_info *info; + struct amdxdna_dev *xdna; + struct async_event *e; + u32 max_err; + u32 err_col; + + e = container_of(err_work, struct async_event, work); + + xdna = e->ndev->xdna; + + if (e->resp.status == MAX_AIE2_STATUS_CODE) + return; + + e->resp.status = MAX_AIE2_STATUS_CODE; + + print_hex_dump_debug("AIE error: ", DUMP_PREFIX_OFFSET, 16, 4, + e->buf, 0x100, false); + + info = (struct aie_err_info *)e->buf; + XDNA_DBG(xdna, "Error count %d return code %d", info->err_cnt, info->ret_code); + + max_err = (e->size - sizeof(*info)) / sizeof(struct aie_error); + if (unlikely(info->err_cnt > max_err)) { + WARN_ONCE(1, "Error count too large %d\n", info->err_cnt); + return; + } + err_col = aie2_error_backtrack(e->ndev, info->payload, info->err_cnt); + if (!err_col) { + XDNA_WARN(xdna, "Did not get error column"); + return; + } + + mutex_lock(&xdna->dev_lock); + /* Re-sent this event to firmware */ + if (aie2_error_event_send(e)) + XDNA_WARN(xdna, "Unable to 
register async event"); + mutex_unlock(&xdna->dev_lock); +} + +int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev) +{ + struct amdxdna_dev *xdna = ndev->xdna; + struct async_event *e; + int i, ret; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + for (i = 0; i < ndev->async_events->event_cnt; i++) { + e = &ndev->async_events->event[i]; + ret = aie2_error_event_send(e); + if (ret) + return ret; + } + + return 0; +} + +void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev) +{ + struct amdxdna_dev *xdna = ndev->xdna; + struct async_events *events; + + events = ndev->async_events; + + mutex_unlock(&xdna->dev_lock); + destroy_workqueue(events->wq); + mutex_lock(&xdna->dev_lock); + + dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf, + events->addr, DMA_FROM_DEVICE); + kfree(events); +} + +int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev) +{ + struct amdxdna_dev *xdna = ndev->xdna; + u32 total_col = ndev->total_col; + u32 total_size = ASYNC_BUF_SIZE * total_col; + struct async_events *events; + int i, ret; + + events = kzalloc(struct_size(events, event, total_col), GFP_KERNEL); + if (!events) + return -ENOMEM; + + events->buf = dma_alloc_noncoherent(xdna->ddev.dev, total_size, &events->addr, + DMA_FROM_DEVICE, GFP_KERNEL); + if (!events->buf) { + ret = -ENOMEM; + goto free_events; + } + events->size = total_size; + events->event_cnt = total_col; + + events->wq = alloc_ordered_workqueue("async_wq", 0); + if (!events->wq) { + ret = -ENOMEM; + goto free_buf; + } + + for (i = 0; i < events->event_cnt; i++) { + struct async_event *e = &events->event[i]; + u32 offset = i * ASYNC_BUF_SIZE; + + e->ndev = ndev; + e->wq = events->wq; + e->buf = &events->buf[offset]; + e->addr = events->addr + offset; + e->size = ASYNC_BUF_SIZE; + e->resp.status = MAX_AIE2_STATUS_CODE; + INIT_WORK(&e->work, aie2_error_worker); + } + + ndev->async_events = events; + + XDNA_DBG(xdna, "Async event count %d, buf total size 0x%x", + events->event_cnt, events->size); + return 0; + +free_buf: + dma_free_noncoherent(xdna->ddev.dev, events->size, events->buf, + events->addr, DMA_FROM_DEVICE); +free_events: + kfree(events); + return ret; +} diff --git a/drivers/accel/amdxdna/aie2_message.c b/drivers/accel/amdxdna/aie2_message.c new file mode 100644 index 000000000000..9e2c9a44f76a --- /dev/null +++ b/drivers/accel/amdxdna/aie2_message.c @@ -0,0 +1,776 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. 
+ */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_cache.h> +#include <drm/drm_device.h> +#include <drm/drm_gem.h> +#include <drm/drm_gem_shmem_helper.h> +#include <drm/drm_print.h> +#include <drm/gpu_scheduler.h> +#include <linux/bitfield.h> +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/types.h> +#include <linux/xarray.h> + +#include "aie2_msg_priv.h" +#include "aie2_pci.h" +#include "amdxdna_ctx.h" +#include "amdxdna_gem.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_mailbox_helper.h" +#include "amdxdna_pci_drv.h" + +#define DECLARE_AIE2_MSG(name, op) \ + DECLARE_XDNA_MSG_COMMON(name, op, MAX_AIE2_STATUS_CODE) + +static int aie2_send_mgmt_msg_wait(struct amdxdna_dev_hdl *ndev, + struct xdna_mailbox_msg *msg) +{ + struct amdxdna_dev *xdna = ndev->xdna; + struct xdna_notify *hdl = msg->handle; + int ret; + + if (!ndev->mgmt_chann) + return -ENODEV; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + ret = xdna_send_msg_wait(xdna, ndev->mgmt_chann, msg); + if (ret == -ETIME) { + xdna_mailbox_stop_channel(ndev->mgmt_chann); + xdna_mailbox_destroy_channel(ndev->mgmt_chann); + ndev->mgmt_chann = NULL; + } + + if (!ret && *hdl->data != AIE2_STATUS_SUCCESS) { + XDNA_ERR(xdna, "command opcode 0x%x failed, status 0x%x", + msg->opcode, *hdl->data); + ret = -EINVAL; + } + + return ret; +} + +int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev) +{ + DECLARE_AIE2_MSG(suspend, MSG_OP_SUSPEND); + + return aie2_send_mgmt_msg_wait(ndev, &msg); +} + +int aie2_resume_fw(struct amdxdna_dev_hdl *ndev) +{ + DECLARE_AIE2_MSG(suspend, MSG_OP_RESUME); + + return aie2_send_mgmt_msg_wait(ndev, &msg); +} + +int aie2_set_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, u64 value) +{ + DECLARE_AIE2_MSG(set_runtime_cfg, MSG_OP_SET_RUNTIME_CONFIG); + int ret; + + req.type = type; + req.value = value; + + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) { + XDNA_ERR(ndev->xdna, "Failed to set runtime config, ret %d", ret); + return ret; + } + + return 0; +} + +int aie2_get_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, u64 *value) +{ + DECLARE_AIE2_MSG(get_runtime_cfg, MSG_OP_GET_RUNTIME_CONFIG); + int ret; + + req.type = type; + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) { + XDNA_ERR(ndev->xdna, "Failed to get runtime config, ret %d", ret); + return ret; + } + + *value = resp.value; + return 0; +} + +int aie2_assign_mgmt_pasid(struct amdxdna_dev_hdl *ndev, u16 pasid) +{ + DECLARE_AIE2_MSG(assign_mgmt_pasid, MSG_OP_ASSIGN_MGMT_PASID); + + req.pasid = pasid; + + return aie2_send_mgmt_msg_wait(ndev, &msg); +} + +int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *version) +{ + DECLARE_AIE2_MSG(aie_version_info, MSG_OP_QUERY_AIE_VERSION); + struct amdxdna_dev *xdna = ndev->xdna; + int ret; + + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) + return ret; + + XDNA_DBG(xdna, "Query AIE version - major: %u minor: %u completed", + resp.major, resp.minor); + + version->major = resp.major; + version->minor = resp.minor; + + return 0; +} + +int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata) +{ + DECLARE_AIE2_MSG(aie_tile_info, MSG_OP_QUERY_AIE_TILE_INFO); + int ret; + + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) + return ret; + + metadata->size = resp.info.size; + metadata->cols = resp.info.cols; + metadata->rows = resp.info.rows; + + metadata->version.major = resp.info.major; + metadata->version.minor = resp.info.minor; + + metadata->core.row_count = resp.info.core_rows; + 
metadata->core.row_start = resp.info.core_row_start; + metadata->core.dma_channel_count = resp.info.core_dma_channels; + metadata->core.lock_count = resp.info.core_locks; + metadata->core.event_reg_count = resp.info.core_events; + + metadata->mem.row_count = resp.info.mem_rows; + metadata->mem.row_start = resp.info.mem_row_start; + metadata->mem.dma_channel_count = resp.info.mem_dma_channels; + metadata->mem.lock_count = resp.info.mem_locks; + metadata->mem.event_reg_count = resp.info.mem_events; + + metadata->shim.row_count = resp.info.shim_rows; + metadata->shim.row_start = resp.info.shim_row_start; + metadata->shim.dma_channel_count = resp.info.shim_dma_channels; + metadata->shim.lock_count = resp.info.shim_locks; + metadata->shim.event_reg_count = resp.info.shim_events; + + return 0; +} + +int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev, + struct amdxdna_fw_ver *fw_ver) +{ + DECLARE_AIE2_MSG(firmware_version, MSG_OP_GET_FIRMWARE_VERSION); + int ret; + + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) + return ret; + + fw_ver->major = resp.major; + fw_ver->minor = resp.minor; + fw_ver->sub = resp.sub; + fw_ver->build = resp.build; + + return 0; +} + +int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx) +{ + DECLARE_AIE2_MSG(create_ctx, MSG_OP_CREATE_CONTEXT); + struct amdxdna_dev *xdna = ndev->xdna; + struct xdna_mailbox_chann_res x2i; + struct xdna_mailbox_chann_res i2x; + struct cq_pair *cq_pair; + u32 intr_reg; + int ret; + + req.aie_type = 1; + req.start_col = hwctx->start_col; + req.num_col = hwctx->num_col; + req.num_cq_pairs_requested = 1; + req.pasid = hwctx->client->pasid; + req.context_priority = 2; + + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) + return ret; + + hwctx->fw_ctx_id = resp.context_id; + WARN_ONCE(hwctx->fw_ctx_id == -1, "Unexpected context id"); + + cq_pair = &resp.cq_pair[0]; + x2i.mb_head_ptr_reg = AIE2_MBOX_OFF(ndev, cq_pair->x2i_q.head_addr); + x2i.mb_tail_ptr_reg = AIE2_MBOX_OFF(ndev, cq_pair->x2i_q.tail_addr); + x2i.rb_start_addr = AIE2_SRAM_OFF(ndev, cq_pair->x2i_q.buf_addr); + x2i.rb_size = cq_pair->x2i_q.buf_size; + + i2x.mb_head_ptr_reg = AIE2_MBOX_OFF(ndev, cq_pair->i2x_q.head_addr); + i2x.mb_tail_ptr_reg = AIE2_MBOX_OFF(ndev, cq_pair->i2x_q.tail_addr); + i2x.rb_start_addr = AIE2_SRAM_OFF(ndev, cq_pair->i2x_q.buf_addr); + i2x.rb_size = cq_pair->i2x_q.buf_size; + + ret = pci_irq_vector(to_pci_dev(xdna->ddev.dev), resp.msix_id); + if (ret == -EINVAL) { + XDNA_ERR(xdna, "not able to create channel"); + goto out_destroy_context; + } + + intr_reg = i2x.mb_head_ptr_reg + 4; + hwctx->priv->mbox_chann = xdna_mailbox_create_channel(ndev->mbox, &x2i, &i2x, + intr_reg, ret); + if (!hwctx->priv->mbox_chann) { + XDNA_ERR(xdna, "not able to create channel"); + ret = -EINVAL; + goto out_destroy_context; + } + + XDNA_DBG(xdna, "%s mailbox channel irq: %d, msix_id: %d", + hwctx->name, ret, resp.msix_id); + XDNA_DBG(xdna, "%s created fw ctx %d pasid %d", hwctx->name, + hwctx->fw_ctx_id, hwctx->client->pasid); + + return 0; + +out_destroy_context: + aie2_destroy_context(ndev, hwctx); + return ret; +} + +int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx) +{ + DECLARE_AIE2_MSG(destroy_ctx, MSG_OP_DESTROY_CONTEXT); + struct amdxdna_dev *xdna = ndev->xdna; + int ret; + + if (hwctx->fw_ctx_id == -1) + return 0; + + xdna_mailbox_stop_channel(hwctx->priv->mbox_chann); + + req.context_id = hwctx->fw_ctx_id; + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) + XDNA_WARN(xdna, "%s 
destroy context failed, ret %d", hwctx->name, ret); + + xdna_mailbox_destroy_channel(hwctx->priv->mbox_chann); + XDNA_DBG(xdna, "%s destroyed fw ctx %d", hwctx->name, + hwctx->fw_ctx_id); + hwctx->priv->mbox_chann = NULL; + hwctx->fw_ctx_id = -1; + + return ret; +} + +int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size) +{ + DECLARE_AIE2_MSG(map_host_buffer, MSG_OP_MAP_HOST_BUFFER); + struct amdxdna_dev *xdna = ndev->xdna; + int ret; + + req.context_id = context_id; + req.buf_addr = addr; + req.buf_size = size; + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) + return ret; + + XDNA_DBG(xdna, "fw ctx %d map host buf addr 0x%llx size 0x%llx", + context_id, addr, size); + + return 0; +} + +int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, + u32 size, u32 *cols_filled) +{ + DECLARE_AIE2_MSG(aie_column_info, MSG_OP_QUERY_COL_STATUS); + struct amdxdna_dev *xdna = ndev->xdna; + struct amdxdna_client *client; + struct amdxdna_hwctx *hwctx; + unsigned long hwctx_id; + dma_addr_t dma_addr; + u32 aie_bitmap = 0; + u8 *buff_addr; + int ret, idx; + + buff_addr = dma_alloc_noncoherent(xdna->ddev.dev, size, &dma_addr, + DMA_FROM_DEVICE, GFP_KERNEL); + if (!buff_addr) + return -ENOMEM; + + /* Go through each hardware context and mark the AIE columns that are active */ + list_for_each_entry(client, &xdna->client_list, node) { + idx = srcu_read_lock(&client->hwctx_srcu); + amdxdna_for_each_hwctx(client, hwctx_id, hwctx) + aie_bitmap |= amdxdna_hwctx_col_map(hwctx); + srcu_read_unlock(&client->hwctx_srcu, idx); + } + + *cols_filled = 0; + req.dump_buff_addr = dma_addr; + req.dump_buff_size = size; + req.num_cols = hweight32(aie_bitmap); + req.aie_bitmap = aie_bitmap; + + drm_clflush_virt_range(buff_addr, size); /* device can access */ + ret = aie2_send_mgmt_msg_wait(ndev, &msg); + if (ret) { + XDNA_ERR(xdna, "Error during NPU query, status %d", ret); + goto fail; + } + + if (resp.status != AIE2_STATUS_SUCCESS) { + XDNA_ERR(xdna, "Query NPU status failed, status 0x%x", resp.status); + ret = -EINVAL; + goto fail; + } + XDNA_DBG(xdna, "Query NPU status completed"); + + if (size < resp.size) { + ret = -EINVAL; + XDNA_ERR(xdna, "Bad buffer size. Available: %u. 
Needs: %u", size, resp.size); + goto fail; + } + + if (copy_to_user(buf, buff_addr, resp.size)) { + ret = -EFAULT; + XDNA_ERR(xdna, "Failed to copy NPU status to user space"); + goto fail; + } + + *cols_filled = aie_bitmap; + +fail: + dma_free_noncoherent(xdna->ddev.dev, size, buff_addr, dma_addr, DMA_FROM_DEVICE); + return ret; +} + +int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size, + void *handle, int (*cb)(void*, const u32 *, size_t)) +{ + struct async_event_msg_req req = { 0 }; + struct xdna_mailbox_msg msg = { + .send_data = (u8 *)&req, + .send_size = sizeof(req), + .handle = handle, + .opcode = MSG_OP_REGISTER_ASYNC_EVENT_MSG, + .notify_cb = cb, + }; + + req.buf_addr = addr; + req.buf_size = size; + + XDNA_DBG(ndev->xdna, "Register addr 0x%llx size 0x%x", addr, size); + return xdna_mailbox_send_msg(ndev->mgmt_chann, &msg, TX_TIMEOUT); +} + +int aie2_config_cu(struct amdxdna_hwctx *hwctx) +{ + struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_dev *xdna = hwctx->client->xdna; + u32 shift = xdna->dev_info->dev_mem_buf_shift; + DECLARE_AIE2_MSG(config_cu, MSG_OP_CONFIG_CU); + struct drm_gem_object *gobj; + struct amdxdna_gem_obj *abo; + int ret, i; + + if (!chann) + return -ENODEV; + + if (hwctx->cus->num_cus > MAX_NUM_CUS) { + XDNA_DBG(xdna, "Exceed maximum CU %d", MAX_NUM_CUS); + return -EINVAL; + } + + for (i = 0; i < hwctx->cus->num_cus; i++) { + struct amdxdna_cu_config *cu = &hwctx->cus->cu_configs[i]; + + if (XDNA_MBZ_DBG(xdna, cu->pad, sizeof(cu->pad))) + return -EINVAL; + + gobj = drm_gem_object_lookup(hwctx->client->filp, cu->cu_bo); + if (!gobj) { + XDNA_ERR(xdna, "Lookup GEM object failed"); + return -EINVAL; + } + abo = to_xdna_obj(gobj); + + if (abo->type != AMDXDNA_BO_DEV) { + drm_gem_object_put(gobj); + XDNA_ERR(xdna, "Invalid BO type"); + return -EINVAL; + } + + req.cfgs[i] = FIELD_PREP(AIE2_MSG_CFG_CU_PDI_ADDR, + abo->mem.dev_addr >> shift); + req.cfgs[i] |= FIELD_PREP(AIE2_MSG_CFG_CU_FUNC, cu->cu_func); + XDNA_DBG(xdna, "CU %d full addr 0x%llx, cfg 0x%x", i, + abo->mem.dev_addr, req.cfgs[i]); + drm_gem_object_put(gobj); + } + req.num_cus = hwctx->cus->num_cus; + + ret = xdna_send_msg_wait(xdna, chann, &msg); + if (ret == -ETIME) + aie2_destroy_context(xdna->dev_handle, hwctx); + + if (resp.status == AIE2_STATUS_SUCCESS) { + XDNA_DBG(xdna, "Configure %d CUs, ret %d", req.num_cus, ret); + return 0; + } + + XDNA_ERR(xdna, "Command opcode 0x%x failed, status 0x%x ret %d", + msg.opcode, resp.status, ret); + return ret; +} + +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, const u32 *, size_t)) +{ + struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; + union { + struct execute_buffer_req ebuf; + struct exec_dpu_req dpu; + } req; + struct xdna_mailbox_msg msg; + u32 payload_len; + void *payload; + int cu_idx; + int ret; + u32 op; + + if (!chann) + return -ENODEV; + + payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len); + if (!payload) { + XDNA_ERR(xdna, "Invalid command, cannot get payload"); + return -EINVAL; + } + + cu_idx = amdxdna_cmd_get_cu_idx(cmd_abo); + if (cu_idx < 0) { + XDNA_DBG(xdna, "Invalid cu idx"); + return -EINVAL; + } + + op = amdxdna_cmd_get_op(cmd_abo); + switch (op) { + case ERT_START_CU: + if (unlikely(payload_len > sizeof(req.ebuf.payload))) + XDNA_DBG(xdna, "Invalid ebuf payload len: %d", payload_len); + req.ebuf.cu_idx = 
cu_idx; + memcpy(req.ebuf.payload, payload, sizeof(req.ebuf.payload)); + msg.send_size = sizeof(req.ebuf); + msg.opcode = MSG_OP_EXECUTE_BUFFER_CF; + break; + case ERT_START_NPU: { + struct amdxdna_cmd_start_npu *sn = payload; + + if (unlikely(payload_len - sizeof(*sn) > sizeof(req.dpu.payload))) + XDNA_DBG(xdna, "Invalid dpu payload len: %d", payload_len); + req.dpu.inst_buf_addr = sn->buffer; + req.dpu.inst_size = sn->buffer_size; + req.dpu.inst_prop_cnt = sn->prop_count; + req.dpu.cu_idx = cu_idx; + memcpy(req.dpu.payload, sn->prop_args, sizeof(req.dpu.payload)); + msg.send_size = sizeof(req.dpu); + msg.opcode = MSG_OP_EXEC_DPU; + break; + } + default: + XDNA_DBG(xdna, "Invalid ERT cmd op code: %d", op); + return -EINVAL; + } + msg.handle = job; + msg.notify_cb = notify_cb; + msg.send_data = (u8 *)&req; + print_hex_dump_debug("cmd: ", DUMP_PREFIX_OFFSET, 16, 4, &req, + 0x40, false); + + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); + if (ret) { + XDNA_ERR(xdna, "Send message failed"); + return ret; + } + + return 0; +} + +static int +aie2_cmdlist_fill_one_slot_cf(void *cmd_buf, u32 offset, + struct amdxdna_gem_obj *abo, u32 *size) +{ + struct cmd_chain_slot_execbuf_cf *buf = cmd_buf + offset; + int cu_idx = amdxdna_cmd_get_cu_idx(abo); + u32 payload_len; + void *payload; + + if (cu_idx < 0) + return -EINVAL; + + payload = amdxdna_cmd_get_payload(abo, &payload_len); + if (!payload) + return -EINVAL; + + if (!slot_cf_has_space(offset, payload_len)) + return -ENOSPC; + + buf->cu_idx = cu_idx; + buf->arg_cnt = payload_len / sizeof(u32); + memcpy(buf->args, payload, payload_len); + /* Accurate buf size to hint firmware to do necessary copy */ + *size = sizeof(*buf) + payload_len; + return 0; +} + +static int +aie2_cmdlist_fill_one_slot_dpu(void *cmd_buf, u32 offset, + struct amdxdna_gem_obj *abo, u32 *size) +{ + struct cmd_chain_slot_dpu *buf = cmd_buf + offset; + int cu_idx = amdxdna_cmd_get_cu_idx(abo); + struct amdxdna_cmd_start_npu *sn; + u32 payload_len; + void *payload; + u32 arg_sz; + + if (cu_idx < 0) + return -EINVAL; + + payload = amdxdna_cmd_get_payload(abo, &payload_len); + if (!payload) + return -EINVAL; + sn = payload; + arg_sz = payload_len - sizeof(*sn); + if (payload_len < sizeof(*sn) || arg_sz > MAX_DPU_ARGS_SIZE) + return -EINVAL; + + if (!slot_dpu_has_space(offset, arg_sz)) + return -ENOSPC; + + buf->inst_buf_addr = sn->buffer; + buf->inst_size = sn->buffer_size; + buf->inst_prop_cnt = sn->prop_count; + buf->cu_idx = cu_idx; + buf->arg_cnt = arg_sz / sizeof(u32); + memcpy(buf->args, sn->prop_args, arg_sz); + + /* Accurate buf size to hint firmware to do necessary copy */ + *size += sizeof(*buf) + arg_sz; + return 0; +} + +static int +aie2_cmdlist_fill_one_slot(u32 op, struct amdxdna_gem_obj *cmdbuf_abo, u32 offset, + struct amdxdna_gem_obj *abo, u32 *size) +{ + u32 this_op = amdxdna_cmd_get_op(abo); + void *cmd_buf = cmdbuf_abo->mem.kva; + int ret; + + if (this_op != op) { + ret = -EINVAL; + goto done; + } + + switch (op) { + case ERT_START_CU: + ret = aie2_cmdlist_fill_one_slot_cf(cmd_buf, offset, abo, size); + break; + case ERT_START_NPU: + ret = aie2_cmdlist_fill_one_slot_dpu(cmd_buf, offset, abo, size); + break; + default: + ret = -EOPNOTSUPP; + } + +done: + if (ret) { + XDNA_ERR(abo->client->xdna, "Can't fill slot for cmd op %d ret %d", + op, ret); + } + return ret; +} + +static inline struct amdxdna_gem_obj * +aie2_cmdlist_get_cmd_buf(struct amdxdna_sched_job *job) +{ + int idx = get_job_idx(job->seq); + + return job->hwctx->priv->cmd_buf[idx]; +} + +static 
void +aie2_cmdlist_prepare_request(struct cmd_chain_req *req, + struct amdxdna_gem_obj *cmdbuf_abo, u32 size, u32 cnt) +{ + req->buf_addr = cmdbuf_abo->mem.dev_addr; + req->buf_size = size; + req->count = cnt; + drm_clflush_virt_range(cmdbuf_abo->mem.kva, size); + XDNA_DBG(cmdbuf_abo->client->xdna, "Command buf addr 0x%llx size 0x%x count %d", + req->buf_addr, size, cnt); +} + +static inline u32 +aie2_cmd_op_to_msg_op(u32 op) +{ + switch (op) { + case ERT_START_CU: + return MSG_OP_CHAIN_EXEC_BUFFER_CF; + case ERT_START_NPU: + return MSG_OP_CHAIN_EXEC_DPU; + default: + return MSG_OP_MAX_OPCODE; + } +} + +int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, + struct amdxdna_sched_job *job, + int (*notify_cb)(void *, const u32 *, size_t)) +{ + struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job); + struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_client *client = hwctx->client; + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; + struct amdxdna_cmd_chain *payload; + struct xdna_mailbox_msg msg; + struct cmd_chain_req req; + u32 payload_len; + u32 offset = 0; + u32 size; + int ret; + u32 op; + u32 i; + + op = amdxdna_cmd_get_op(cmd_abo); + payload = amdxdna_cmd_get_payload(cmd_abo, &payload_len); + if (op != ERT_CMD_CHAIN || !payload || + payload_len < struct_size(payload, data, payload->command_count)) + return -EINVAL; + + for (i = 0; i < payload->command_count; i++) { + u32 boh = (u32)(payload->data[i]); + struct amdxdna_gem_obj *abo; + + abo = amdxdna_gem_get_obj(client, boh, AMDXDNA_BO_CMD); + if (!abo) { + XDNA_ERR(client->xdna, "Failed to find cmd BO %d", boh); + return -ENOENT; + } + + /* All sub-cmd should have same op, use the first one. */ + if (i == 0) + op = amdxdna_cmd_get_op(abo); + + ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, offset, abo, &size); + amdxdna_gem_put_obj(abo); + if (ret) + return -EINVAL; + + offset += size; + } + + /* The offset is the accumulated total size of the cmd buffer */ + aie2_cmdlist_prepare_request(&req, cmdbuf_abo, offset, payload->command_count); + + msg.opcode = aie2_cmd_op_to_msg_op(op); + if (msg.opcode == MSG_OP_MAX_OPCODE) + return -EOPNOTSUPP; + msg.handle = job; + msg.notify_cb = notify_cb; + msg.send_data = (u8 *)&req; + msg.send_size = sizeof(req); + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); + if (ret) { + XDNA_ERR(hwctx->client->xdna, "Send message failed"); + return ret; + } + + return 0; +} + +int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx, + struct amdxdna_sched_job *job, + int (*notify_cb)(void *, const u32 *, size_t)) +{ + struct amdxdna_gem_obj *cmdbuf_abo = aie2_cmdlist_get_cmd_buf(job); + struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_gem_obj *cmd_abo = job->cmd_bo; + struct xdna_mailbox_msg msg; + struct cmd_chain_req req; + u32 size; + int ret; + u32 op; + + op = amdxdna_cmd_get_op(cmd_abo); + ret = aie2_cmdlist_fill_one_slot(op, cmdbuf_abo, 0, cmd_abo, &size); + if (ret) + return ret; + + aie2_cmdlist_prepare_request(&req, cmdbuf_abo, size, 1); + + msg.opcode = aie2_cmd_op_to_msg_op(op); + if (msg.opcode == MSG_OP_MAX_OPCODE) + return -EOPNOTSUPP; + msg.handle = job; + msg.notify_cb = notify_cb; + msg.send_data = (u8 *)&req; + msg.send_size = sizeof(req); + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); + if (ret) { + XDNA_ERR(hwctx->client->xdna, "Send message failed"); + return ret; + } + + return 0; +} + +int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, const u32 *, 
size_t)) +{ + struct mailbox_channel *chann = hwctx->priv->mbox_chann; + struct amdxdna_gem_obj *abo = to_xdna_obj(job->bos[0]); + struct amdxdna_dev *xdna = hwctx->client->xdna; + struct xdna_mailbox_msg msg; + struct sync_bo_req req; + int ret = 0; + + req.src_addr = 0; + req.dst_addr = abo->mem.dev_addr - hwctx->client->dev_heap->mem.dev_addr; + req.size = abo->mem.size; + + /* Device to Host */ + req.type = FIELD_PREP(AIE2_MSG_SYNC_BO_SRC_TYPE, SYNC_BO_DEV_MEM) | + FIELD_PREP(AIE2_MSG_SYNC_BO_DST_TYPE, SYNC_BO_HOST_MEM); + + XDNA_DBG(xdna, "sync %d bytes src(0x%llx) to dst(0x%llx) completed", + req.size, req.src_addr, req.dst_addr); + + msg.handle = job; + msg.notify_cb = notify_cb; + msg.send_data = (u8 *)&req; + msg.send_size = sizeof(req); + msg.opcode = MSG_OP_SYNC_BO; + + ret = xdna_mailbox_send_msg(chann, &msg, TX_TIMEOUT); + if (ret) { + XDNA_ERR(xdna, "Send message failed"); + return ret; + } + + return 0; +} diff --git a/drivers/accel/amdxdna/aie2_msg_priv.h b/drivers/accel/amdxdna/aie2_msg_priv.h new file mode 100644 index 000000000000..4e02e744b470 --- /dev/null +++ b/drivers/accel/amdxdna/aie2_msg_priv.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#ifndef _AIE2_MSG_PRIV_H_ +#define _AIE2_MSG_PRIV_H_ + +enum aie2_msg_opcode { + MSG_OP_CREATE_CONTEXT = 0x2, + MSG_OP_DESTROY_CONTEXT = 0x3, + MSG_OP_SYNC_BO = 0x7, + MSG_OP_EXECUTE_BUFFER_CF = 0xC, + MSG_OP_QUERY_COL_STATUS = 0xD, + MSG_OP_QUERY_AIE_TILE_INFO = 0xE, + MSG_OP_QUERY_AIE_VERSION = 0xF, + MSG_OP_EXEC_DPU = 0x10, + MSG_OP_CONFIG_CU = 0x11, + MSG_OP_CHAIN_EXEC_BUFFER_CF = 0x12, + MSG_OP_CHAIN_EXEC_DPU = 0x13, + MSG_OP_MAX_XRT_OPCODE, + MSG_OP_SUSPEND = 0x101, + MSG_OP_RESUME = 0x102, + MSG_OP_ASSIGN_MGMT_PASID = 0x103, + MSG_OP_INVOKE_SELF_TEST = 0x104, + MSG_OP_MAP_HOST_BUFFER = 0x106, + MSG_OP_GET_FIRMWARE_VERSION = 0x108, + MSG_OP_SET_RUNTIME_CONFIG = 0x10A, + MSG_OP_GET_RUNTIME_CONFIG = 0x10B, + MSG_OP_REGISTER_ASYNC_EVENT_MSG = 0x10C, + MSG_OP_MAX_DRV_OPCODE, + MSG_OP_GET_PROTOCOL_VERSION = 0x301, + MSG_OP_MAX_OPCODE +}; + +enum aie2_msg_status { + AIE2_STATUS_SUCCESS = 0x0, + /* AIE Error codes */ + AIE2_STATUS_AIE_SATURATION_ERROR = 0x1000001, + AIE2_STATUS_AIE_FP_ERROR = 0x1000002, + AIE2_STATUS_AIE_STREAM_ERROR = 0x1000003, + AIE2_STATUS_AIE_ACCESS_ERROR = 0x1000004, + AIE2_STATUS_AIE_BUS_ERROR = 0x1000005, + AIE2_STATUS_AIE_INSTRUCTION_ERROR = 0x1000006, + AIE2_STATUS_AIE_ECC_ERROR = 0x1000007, + AIE2_STATUS_AIE_LOCK_ERROR = 0x1000008, + AIE2_STATUS_AIE_DMA_ERROR = 0x1000009, + AIE2_STATUS_AIE_MEM_PARITY_ERROR = 0x100000a, + AIE2_STATUS_AIE_PWR_CFG_ERROR = 0x100000b, + AIE2_STATUS_AIE_BACKTRACK_ERROR = 0x100000c, + AIE2_STATUS_MAX_AIE_STATUS_CODE, + /* MGMT ERT Error codes */ + AIE2_STATUS_MGMT_ERT_SELF_TEST_FAILURE = 0x2000001, + AIE2_STATUS_MGMT_ERT_HASH_MISMATCH, + AIE2_STATUS_MGMT_ERT_NOAVAIL, + AIE2_STATUS_MGMT_ERT_INVALID_PARAM, + AIE2_STATUS_MGMT_ERT_ENTER_SUSPEND_FAILURE, + AIE2_STATUS_MGMT_ERT_BUSY, + AIE2_STATUS_MGMT_ERT_APPLICATION_ACTIVE, + MAX_MGMT_ERT_STATUS_CODE, + /* APP ERT Error codes */ + AIE2_STATUS_APP_ERT_FIRST_ERROR = 0x3000001, + AIE2_STATUS_APP_INVALID_INSTR, + AIE2_STATUS_APP_LOAD_PDI_FAIL, + MAX_APP_ERT_STATUS_CODE, + /* NPU RTOS Error Codes */ + AIE2_STATUS_INVALID_INPUT_BUFFER = 0x4000001, + AIE2_STATUS_INVALID_COMMAND, + AIE2_STATUS_INVALID_PARAM, + AIE2_STATUS_INVALID_OPERATION = 0x4000006, + AIE2_STATUS_ASYNC_EVENT_MSGS_FULL, + AIE2_STATUS_MAX_RTOS_STATUS_CODE, + MAX_AIE2_STATUS_CODE +}; + 
+struct assign_mgmt_pasid_req { + __u16 pasid; + __u16 reserved; +} __packed; + +struct assign_mgmt_pasid_resp { + enum aie2_msg_status status; +} __packed; + +struct map_host_buffer_req { + __u32 context_id; + __u64 buf_addr; + __u64 buf_size; +} __packed; + +struct map_host_buffer_resp { + enum aie2_msg_status status; +} __packed; + +#define MAX_CQ_PAIRS 2 +struct cq_info { + __u32 head_addr; + __u32 tail_addr; + __u32 buf_addr; + __u32 buf_size; +}; + +struct cq_pair { + struct cq_info x2i_q; + struct cq_info i2x_q; +}; + +struct create_ctx_req { + __u32 aie_type; + __u8 start_col; + __u8 num_col; + __u16 reserved; + __u8 num_cq_pairs_requested; + __u8 reserved1; + __u16 pasid; + __u32 pad[2]; + __u32 sec_comm_target_type; + __u32 context_priority; +} __packed; + +struct create_ctx_resp { + enum aie2_msg_status status; + __u32 context_id; + __u16 msix_id; + __u8 num_cq_pairs_allocated; + __u8 reserved; + struct cq_pair cq_pair[MAX_CQ_PAIRS]; +} __packed; + +struct destroy_ctx_req { + __u32 context_id; +} __packed; + +struct destroy_ctx_resp { + enum aie2_msg_status status; +} __packed; + +struct execute_buffer_req { + __u32 cu_idx; + __u32 payload[19]; +} __packed; + +struct exec_dpu_req { + __u64 inst_buf_addr; + __u32 inst_size; + __u32 inst_prop_cnt; + __u32 cu_idx; + __u32 payload[35]; +} __packed; + +struct execute_buffer_resp { + enum aie2_msg_status status; +} __packed; + +struct aie_tile_info { + __u32 size; + __u16 major; + __u16 minor; + __u16 cols; + __u16 rows; + __u16 core_rows; + __u16 mem_rows; + __u16 shim_rows; + __u16 core_row_start; + __u16 mem_row_start; + __u16 shim_row_start; + __u16 core_dma_channels; + __u16 mem_dma_channels; + __u16 shim_dma_channels; + __u16 core_locks; + __u16 mem_locks; + __u16 shim_locks; + __u16 core_events; + __u16 mem_events; + __u16 shim_events; + __u16 reserved; +}; + +struct aie_tile_info_req { + __u32 reserved; +} __packed; + +struct aie_tile_info_resp { + enum aie2_msg_status status; + struct aie_tile_info info; +} __packed; + +struct aie_version_info_req { + __u32 reserved; +} __packed; + +struct aie_version_info_resp { + enum aie2_msg_status status; + __u16 major; + __u16 minor; +} __packed; + +struct aie_column_info_req { + __u64 dump_buff_addr; + __u32 dump_buff_size; + __u32 num_cols; + __u32 aie_bitmap; +} __packed; + +struct aie_column_info_resp { + enum aie2_msg_status status; + __u32 size; +} __packed; + +struct suspend_req { + __u32 place_holder; +} __packed; + +struct suspend_resp { + enum aie2_msg_status status; +} __packed; + +struct resume_req { + __u32 place_holder; +} __packed; + +struct resume_resp { + enum aie2_msg_status status; +} __packed; + +struct check_header_hash_req { + __u64 hash_high; + __u64 hash_low; +} __packed; + +struct check_header_hash_resp { + enum aie2_msg_status status; +} __packed; + +struct query_error_req { + __u64 buf_addr; + __u32 buf_size; + __u32 next_row; + __u32 next_column; + __u32 next_module; +} __packed; + +struct query_error_resp { + enum aie2_msg_status status; + __u32 num_err; + __u32 has_next_err; + __u32 next_row; + __u32 next_column; + __u32 next_module; +} __packed; + +struct protocol_version_req { + __u32 reserved; +} __packed; + +struct protocol_version_resp { + enum aie2_msg_status status; + __u32 major; + __u32 minor; +} __packed; + +struct firmware_version_req { + __u32 reserved; +} __packed; + +struct firmware_version_resp { + enum aie2_msg_status status; + __u32 major; + __u32 minor; + __u32 sub; + __u32 build; +} __packed; + +#define MAX_NUM_CUS 32 +#define 
AIE2_MSG_CFG_CU_PDI_ADDR GENMASK(16, 0) +#define AIE2_MSG_CFG_CU_FUNC GENMASK(24, 17) +struct config_cu_req { + __u32 num_cus; + __u32 cfgs[MAX_NUM_CUS]; +} __packed; + +struct config_cu_resp { + enum aie2_msg_status status; +} __packed; + +struct set_runtime_cfg_req { + __u32 type; + __u64 value; +} __packed; + +struct set_runtime_cfg_resp { + enum aie2_msg_status status; +} __packed; + +struct get_runtime_cfg_req { + __u32 type; +} __packed; + +struct get_runtime_cfg_resp { + enum aie2_msg_status status; + __u64 value; +} __packed; + +enum async_event_type { + ASYNC_EVENT_TYPE_AIE_ERROR, + ASYNC_EVENT_TYPE_EXCEPTION, + MAX_ASYNC_EVENT_TYPE +}; + +#define ASYNC_BUF_SIZE SZ_8K +struct async_event_msg_req { + __u64 buf_addr; + __u32 buf_size; +} __packed; + +struct async_event_msg_resp { + enum aie2_msg_status status; + enum async_event_type type; +} __packed; + +#define MAX_CHAIN_CMDBUF_SIZE SZ_4K +#define slot_cf_has_space(offset, payload_size) \ + (MAX_CHAIN_CMDBUF_SIZE - ((offset) + (payload_size)) > \ + offsetof(struct cmd_chain_slot_execbuf_cf, args[0])) +struct cmd_chain_slot_execbuf_cf { + __u32 cu_idx; + __u32 arg_cnt; + __u32 args[] __counted_by(arg_cnt); +}; + +#define slot_dpu_has_space(offset, payload_size) \ + (MAX_CHAIN_CMDBUF_SIZE - ((offset) + (payload_size)) > \ + offsetof(struct cmd_chain_slot_dpu, args[0])) +struct cmd_chain_slot_dpu { + __u64 inst_buf_addr; + __u32 inst_size; + __u32 inst_prop_cnt; + __u32 cu_idx; + __u32 arg_cnt; +#define MAX_DPU_ARGS_SIZE (34 * sizeof(__u32)) + __u32 args[] __counted_by(arg_cnt); +}; + +struct cmd_chain_req { + __u64 buf_addr; + __u32 buf_size; + __u32 count; +} __packed; + +struct cmd_chain_resp { + enum aie2_msg_status status; + __u32 fail_cmd_idx; + enum aie2_msg_status fail_cmd_status; +} __packed; + +#define AIE2_MSG_SYNC_BO_SRC_TYPE GENMASK(3, 0) +#define AIE2_MSG_SYNC_BO_DST_TYPE GENMASK(7, 4) +struct sync_bo_req { + __u64 src_addr; + __u64 dst_addr; + __u32 size; +#define SYNC_BO_DEV_MEM 0 +#define SYNC_BO_HOST_MEM 2 + __u32 type; +} __packed; + +struct sync_bo_resp { + enum aie2_msg_status status; +} __packed; +#endif /* _AIE2_MSG_PRIV_H_ */ diff --git a/drivers/accel/amdxdna/aie2_pci.c b/drivers/accel/amdxdna/aie2_pci.c new file mode 100644 index 000000000000..5a058e565b01 --- /dev/null +++ b/drivers/accel/amdxdna/aie2_pci.c @@ -0,0 +1,928 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/drm_drv.h> +#include <drm/drm_gem_shmem_helper.h> +#include <drm/drm_managed.h> +#include <drm/drm_print.h> +#include <drm/gpu_scheduler.h> +#include <linux/errno.h> +#include <linux/firmware.h> +#include <linux/iommu.h> +#include <linux/iopoll.h> +#include <linux/pci.h> +#include <linux/xarray.h> + +#include "aie2_msg_priv.h" +#include "aie2_pci.h" +#include "aie2_solver.h" +#include "amdxdna_ctx.h" +#include "amdxdna_gem.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_pci_drv.h" + +static int aie2_max_col = XRS_MAX_COL; +module_param(aie2_max_col, uint, 0600); +MODULE_PARM_DESC(aie2_max_col, "Maximum column could be used"); + +/* + * The management mailbox channel is allocated by firmware. + * The related register and ring buffer information is on SRAM BAR. + * This struct is the register layout. 
+ */
+#define MGMT_MBOX_MAGIC 0x55504e5f /* _NPU */
+struct mgmt_mbox_chann_info {
+	__u32	x2i_tail;
+	__u32	x2i_head;
+	__u32	x2i_buf;
+	__u32	x2i_buf_sz;
+	__u32	i2x_tail;
+	__u32	i2x_head;
+	__u32	i2x_buf;
+	__u32	i2x_buf_sz;
+	__u32	magic;
+	__u32	msi_id;
+	__u32	prot_major;
+	__u32	prot_minor;
+	__u32	rsvd[4];
+};
+
+static int aie2_check_protocol(struct amdxdna_dev_hdl *ndev, u32 fw_major, u32 fw_minor)
+{
+	struct amdxdna_dev *xdna = ndev->xdna;
+
+	/*
+	 * The mailbox behavior supported by the driver is defined by
+	 * ndev->priv->protocol_major and protocol_minor.
+	 *
+	 * When protocol_major and fw_major are different, it means the driver
+	 * and firmware are incompatible.
+	 */
+	if (ndev->priv->protocol_major != fw_major) {
+		XDNA_ERR(xdna, "Incompatible firmware protocol major %d minor %d",
+			 fw_major, fw_minor);
+		return -EINVAL;
+	}
+
+	/*
+	 * When protocol_minor is greater than fw_minor, it means the driver
+	 * relies on operations the installed firmware does not support.
+	 */
+	if (ndev->priv->protocol_minor > fw_minor) {
+		XDNA_ERR(xdna, "Firmware minor version smaller than supported");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void aie2_dump_chann_info_debug(struct amdxdna_dev_hdl *ndev)
+{
+	struct amdxdna_dev *xdna = ndev->xdna;
+
+	XDNA_DBG(xdna, "i2x tail 0x%x", ndev->mgmt_i2x.mb_tail_ptr_reg);
+	XDNA_DBG(xdna, "i2x head 0x%x", ndev->mgmt_i2x.mb_head_ptr_reg);
+	XDNA_DBG(xdna, "i2x ringbuf 0x%x", ndev->mgmt_i2x.rb_start_addr);
+	XDNA_DBG(xdna, "i2x rsize 0x%x", ndev->mgmt_i2x.rb_size);
+	XDNA_DBG(xdna, "x2i tail 0x%x", ndev->mgmt_x2i.mb_tail_ptr_reg);
+	XDNA_DBG(xdna, "x2i head 0x%x", ndev->mgmt_x2i.mb_head_ptr_reg);
+	XDNA_DBG(xdna, "x2i ringbuf 0x%x", ndev->mgmt_x2i.rb_start_addr);
+	XDNA_DBG(xdna, "x2i rsize 0x%x", ndev->mgmt_x2i.rb_size);
+	XDNA_DBG(xdna, "x2i chann index 0x%x", ndev->mgmt_chan_idx);
+	XDNA_DBG(xdna, "mailbox protocol major 0x%x", ndev->mgmt_prot_major);
+	XDNA_DBG(xdna, "mailbox protocol minor 0x%x", ndev->mgmt_prot_minor);
+}
+
+static int aie2_get_mgmt_chann_info(struct amdxdna_dev_hdl *ndev)
+{
+	struct mgmt_mbox_chann_info info_regs;
+	struct xdna_mailbox_chann_res *i2x;
+	struct xdna_mailbox_chann_res *x2i;
+	u32 addr, off;
+	u32 *reg;
+	int ret;
+	int i;
+
+	/*
+	 * Once firmware is alive, it will write management channel
+	 * information in SRAM BAR and write the address of that information
+	 * at FW_ALIVE_OFF offset in SRAM BAR.
+	 *
+	 * Reading a non-zero value from FW_ALIVE_OFF implies that the firmware
+	 * is alive.
+ */ + ret = readx_poll_timeout(readl, SRAM_GET_ADDR(ndev, FW_ALIVE_OFF), + addr, addr, AIE2_INTERVAL, AIE2_TIMEOUT); + if (ret || !addr) + return -ETIME; + + off = AIE2_SRAM_OFF(ndev, addr); + reg = (u32 *)&info_regs; + for (i = 0; i < sizeof(info_regs) / sizeof(u32); i++) + reg[i] = readl(ndev->sram_base + off + i * sizeof(u32)); + + if (info_regs.magic != MGMT_MBOX_MAGIC) { + XDNA_ERR(ndev->xdna, "Invalid mbox magic 0x%x", info_regs.magic); + ret = -EINVAL; + goto done; + } + + i2x = &ndev->mgmt_i2x; + x2i = &ndev->mgmt_x2i; + + i2x->mb_head_ptr_reg = AIE2_MBOX_OFF(ndev, info_regs.i2x_head); + i2x->mb_tail_ptr_reg = AIE2_MBOX_OFF(ndev, info_regs.i2x_tail); + i2x->rb_start_addr = AIE2_SRAM_OFF(ndev, info_regs.i2x_buf); + i2x->rb_size = info_regs.i2x_buf_sz; + + x2i->mb_head_ptr_reg = AIE2_MBOX_OFF(ndev, info_regs.x2i_head); + x2i->mb_tail_ptr_reg = AIE2_MBOX_OFF(ndev, info_regs.x2i_tail); + x2i->rb_start_addr = AIE2_SRAM_OFF(ndev, info_regs.x2i_buf); + x2i->rb_size = info_regs.x2i_buf_sz; + + ndev->mgmt_chan_idx = info_regs.msi_id; + ndev->mgmt_prot_major = info_regs.prot_major; + ndev->mgmt_prot_minor = info_regs.prot_minor; + + ret = aie2_check_protocol(ndev, ndev->mgmt_prot_major, ndev->mgmt_prot_minor); + +done: + aie2_dump_chann_info_debug(ndev); + + /* Must clear address at FW_ALIVE_OFF */ + writel(0, SRAM_GET_ADDR(ndev, FW_ALIVE_OFF)); + + return ret; +} + +int aie2_runtime_cfg(struct amdxdna_dev_hdl *ndev, + enum rt_config_category category, u32 *val) +{ + const struct rt_config *cfg; + u32 value; + int ret; + + for (cfg = ndev->priv->rt_config; cfg->type; cfg++) { + if (cfg->category != category) + continue; + + value = val ? *val : cfg->value; + ret = aie2_set_runtime_cfg(ndev, cfg->type, value); + if (ret) { + XDNA_ERR(ndev->xdna, "Set type %d value %d failed", + cfg->type, value); + return ret; + } + } + + return 0; +} + +static int aie2_xdna_reset(struct amdxdna_dev_hdl *ndev) +{ + int ret; + + ret = aie2_suspend_fw(ndev); + if (ret) { + XDNA_ERR(ndev->xdna, "Suspend firmware failed"); + return ret; + } + + ret = aie2_resume_fw(ndev); + if (ret) { + XDNA_ERR(ndev->xdna, "Resume firmware failed"); + return ret; + } + + return 0; +} + +static int aie2_mgmt_fw_init(struct amdxdna_dev_hdl *ndev) +{ + int ret; + + ret = aie2_runtime_cfg(ndev, AIE2_RT_CFG_INIT, NULL); + if (ret) { + XDNA_ERR(ndev->xdna, "Runtime config failed"); + return ret; + } + + ret = aie2_assign_mgmt_pasid(ndev, 0); + if (ret) { + XDNA_ERR(ndev->xdna, "Can not assign PASID"); + return ret; + } + + ret = aie2_xdna_reset(ndev); + if (ret) { + XDNA_ERR(ndev->xdna, "Reset firmware failed"); + return ret; + } + + if (!ndev->async_events) + return 0; + + ret = aie2_error_async_events_send(ndev); + if (ret) { + XDNA_ERR(ndev->xdna, "Send async events failed"); + return ret; + } + + return 0; +} + +static int aie2_mgmt_fw_query(struct amdxdna_dev_hdl *ndev) +{ + int ret; + + ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver); + if (ret) { + XDNA_ERR(ndev->xdna, "query firmware version failed"); + return ret; + } + + ret = aie2_query_aie_version(ndev, &ndev->version); + if (ret) { + XDNA_ERR(ndev->xdna, "Query AIE version failed"); + return ret; + } + + ret = aie2_query_aie_metadata(ndev, &ndev->metadata); + if (ret) { + XDNA_ERR(ndev->xdna, "Query AIE metadata failed"); + return ret; + } + + return 0; +} + +static void aie2_mgmt_fw_fini(struct amdxdna_dev_hdl *ndev) +{ + if (aie2_suspend_fw(ndev)) + XDNA_ERR(ndev->xdna, "Suspend_fw failed"); + XDNA_DBG(ndev->xdna, "Firmware suspended"); +} + +static int 
aie2_xrs_load(void *cb_arg, struct xrs_action_load *action) +{ + struct amdxdna_hwctx *hwctx = cb_arg; + struct amdxdna_dev *xdna; + int ret; + + xdna = hwctx->client->xdna; + + hwctx->start_col = action->part.start_col; + hwctx->num_col = action->part.ncols; + ret = aie2_create_context(xdna->dev_handle, hwctx); + if (ret) + XDNA_ERR(xdna, "create context failed, ret %d", ret); + + return ret; +} + +static int aie2_xrs_unload(void *cb_arg) +{ + struct amdxdna_hwctx *hwctx = cb_arg; + struct amdxdna_dev *xdna; + int ret; + + xdna = hwctx->client->xdna; + + ret = aie2_destroy_context(xdna->dev_handle, hwctx); + if (ret) + XDNA_ERR(xdna, "destroy context failed, ret %d", ret); + + return ret; +} + +static int aie2_xrs_set_dft_dpm_level(struct drm_device *ddev, u32 dpm_level) +{ + struct amdxdna_dev *xdna = to_xdna_dev(ddev); + struct amdxdna_dev_hdl *ndev; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + + ndev = xdna->dev_handle; + ndev->dft_dpm_level = dpm_level; + if (ndev->pw_mode != POWER_MODE_DEFAULT || ndev->dpm_level == dpm_level) + return 0; + + return ndev->priv->hw_ops.set_dpm(ndev, dpm_level); +} + +static struct xrs_action_ops aie2_xrs_actions = { + .load = aie2_xrs_load, + .unload = aie2_xrs_unload, + .set_dft_dpm_level = aie2_xrs_set_dft_dpm_level, +}; + +static void aie2_hw_stop(struct amdxdna_dev *xdna) +{ + struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev); + struct amdxdna_dev_hdl *ndev = xdna->dev_handle; + + if (ndev->dev_status <= AIE2_DEV_INIT) { + XDNA_ERR(xdna, "device is already stopped"); + return; + } + + aie2_mgmt_fw_fini(ndev); + xdna_mailbox_stop_channel(ndev->mgmt_chann); + xdna_mailbox_destroy_channel(ndev->mgmt_chann); + ndev->mgmt_chann = NULL; + drmm_kfree(&xdna->ddev, ndev->mbox); + ndev->mbox = NULL; + aie2_psp_stop(ndev->psp_hdl); + aie2_smu_fini(ndev); + pci_disable_device(pdev); + + ndev->dev_status = AIE2_DEV_INIT; +} + +static int aie2_hw_start(struct amdxdna_dev *xdna) +{ + struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev); + struct amdxdna_dev_hdl *ndev = xdna->dev_handle; + struct xdna_mailbox_res mbox_res; + u32 xdna_mailbox_intr_reg; + int mgmt_mb_irq, ret; + + if (ndev->dev_status >= AIE2_DEV_START) { + XDNA_INFO(xdna, "device is already started"); + return 0; + } + + ret = pci_enable_device(pdev); + if (ret) { + XDNA_ERR(xdna, "failed to enable device, ret %d", ret); + return ret; + } + pci_set_master(pdev); + + ret = aie2_smu_init(ndev); + if (ret) { + XDNA_ERR(xdna, "failed to init smu, ret %d", ret); + goto disable_dev; + } + + ret = aie2_psp_start(ndev->psp_hdl); + if (ret) { + XDNA_ERR(xdna, "failed to start psp, ret %d", ret); + goto fini_smu; + } + + ret = aie2_get_mgmt_chann_info(ndev); + if (ret) { + XDNA_ERR(xdna, "firmware is not alive"); + goto stop_psp; + } + + mbox_res.ringbuf_base = ndev->sram_base; + mbox_res.ringbuf_size = pci_resource_len(pdev, xdna->dev_info->sram_bar); + mbox_res.mbox_base = ndev->mbox_base; + mbox_res.mbox_size = MBOX_SIZE(ndev); + mbox_res.name = "xdna_mailbox"; + ndev->mbox = xdnam_mailbox_create(&xdna->ddev, &mbox_res); + if (!ndev->mbox) { + XDNA_ERR(xdna, "failed to create mailbox device"); + ret = -ENODEV; + goto stop_psp; + } + + mgmt_mb_irq = pci_irq_vector(pdev, ndev->mgmt_chan_idx); + if (mgmt_mb_irq < 0) { + ret = mgmt_mb_irq; + XDNA_ERR(xdna, "failed to alloc irq vector, ret %d", ret); + goto stop_psp; + } + + xdna_mailbox_intr_reg = ndev->mgmt_i2x.mb_head_ptr_reg + 4; + ndev->mgmt_chann = xdna_mailbox_create_channel(ndev->mbox, + &ndev->mgmt_x2i, + &ndev->mgmt_i2x, + 
xdna_mailbox_intr_reg, + mgmt_mb_irq); + if (!ndev->mgmt_chann) { + XDNA_ERR(xdna, "failed to create management mailbox channel"); + ret = -EINVAL; + goto stop_psp; + } + + ret = aie2_pm_init(ndev); + if (ret) { + XDNA_ERR(xdna, "failed to init pm, ret %d", ret); + goto destroy_mgmt_chann; + } + + ret = aie2_mgmt_fw_init(ndev); + if (ret) { + XDNA_ERR(xdna, "initial mgmt firmware failed, ret %d", ret); + goto destroy_mgmt_chann; + } + + ndev->dev_status = AIE2_DEV_START; + + return 0; + +destroy_mgmt_chann: + xdna_mailbox_stop_channel(ndev->mgmt_chann); + xdna_mailbox_destroy_channel(ndev->mgmt_chann); +stop_psp: + aie2_psp_stop(ndev->psp_hdl); +fini_smu: + aie2_smu_fini(ndev); +disable_dev: + pci_disable_device(pdev); + + return ret; +} + +static int aie2_init(struct amdxdna_dev *xdna) +{ + struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev); + void __iomem *tbl[PCI_NUM_RESOURCES] = {0}; + struct init_config xrs_cfg = { 0 }; + struct amdxdna_dev_hdl *ndev; + struct psp_config psp_conf; + const struct firmware *fw; + unsigned long bars = 0; + int i, nvec, ret; + + ndev = drmm_kzalloc(&xdna->ddev, sizeof(*ndev), GFP_KERNEL); + if (!ndev) + return -ENOMEM; + + ndev->priv = xdna->dev_info->dev_priv; + ndev->xdna = xdna; + + ret = request_firmware(&fw, ndev->priv->fw_path, &pdev->dev); + if (ret) { + XDNA_ERR(xdna, "failed to request_firmware %s, ret %d", + ndev->priv->fw_path, ret); + return ret; + } + + ret = pcim_enable_device(pdev); + if (ret) { + XDNA_ERR(xdna, "pcim enable device failed, ret %d", ret); + goto release_fw; + } + + for (i = 0; i < PSP_MAX_REGS; i++) + set_bit(PSP_REG_BAR(ndev, i), &bars); + + set_bit(xdna->dev_info->sram_bar, &bars); + set_bit(xdna->dev_info->smu_bar, &bars); + set_bit(xdna->dev_info->mbox_bar, &bars); + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + if (!test_bit(i, &bars)) + continue; + tbl[i] = pcim_iomap(pdev, i, 0); + if (!tbl[i]) { + XDNA_ERR(xdna, "map bar %d failed", i); + ret = -ENOMEM; + goto release_fw; + } + } + + ndev->sram_base = tbl[xdna->dev_info->sram_bar]; + ndev->smu_base = tbl[xdna->dev_info->smu_bar]; + ndev->mbox_base = tbl[xdna->dev_info->mbox_bar]; + + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (ret) { + XDNA_ERR(xdna, "Failed to set DMA mask: %d", ret); + goto release_fw; + } + + nvec = pci_msix_vec_count(pdev); + if (nvec <= 0) { + XDNA_ERR(xdna, "does not get number of interrupt vector"); + ret = -EINVAL; + goto release_fw; + } + + ret = pci_alloc_irq_vectors(pdev, nvec, nvec, PCI_IRQ_MSIX); + if (ret < 0) { + XDNA_ERR(xdna, "failed to alloc irq vectors, ret %d", ret); + goto release_fw; + } + + ret = iommu_dev_enable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); + if (ret) { + XDNA_ERR(xdna, "Enable PASID failed, ret %d", ret); + goto free_irq; + } + + psp_conf.fw_size = fw->size; + psp_conf.fw_buf = fw->data; + for (i = 0; i < PSP_MAX_REGS; i++) + psp_conf.psp_regs[i] = tbl[PSP_REG_BAR(ndev, i)] + PSP_REG_OFF(ndev, i); + ndev->psp_hdl = aie2m_psp_create(&xdna->ddev, &psp_conf); + if (!ndev->psp_hdl) { + XDNA_ERR(xdna, "failed to create psp"); + ret = -ENOMEM; + goto disable_sva; + } + xdna->dev_handle = ndev; + + ret = aie2_hw_start(xdna); + if (ret) { + XDNA_ERR(xdna, "start npu failed, ret %d", ret); + goto disable_sva; + } + + ret = aie2_mgmt_fw_query(ndev); + if (ret) { + XDNA_ERR(xdna, "Query firmware failed, ret %d", ret); + goto stop_hw; + } + ndev->total_col = min(aie2_max_col, ndev->metadata.cols); + + xrs_cfg.clk_list.num_levels = ndev->max_dpm_level + 1; + for (i = 0; i < xrs_cfg.clk_list.num_levels; 
i++) + xrs_cfg.clk_list.cu_clk_list[i] = ndev->priv->dpm_clk_tbl[i].hclk; + xrs_cfg.sys_eff_factor = 1; + xrs_cfg.ddev = &xdna->ddev; + xrs_cfg.actions = &aie2_xrs_actions; + xrs_cfg.total_col = ndev->total_col; + + xdna->xrs_hdl = xrsm_init(&xrs_cfg); + if (!xdna->xrs_hdl) { + XDNA_ERR(xdna, "Initialize resolver failed"); + ret = -EINVAL; + goto stop_hw; + } + + ret = aie2_error_async_events_alloc(ndev); + if (ret) { + XDNA_ERR(xdna, "Allocate async events failed, ret %d", ret); + goto stop_hw; + } + + ret = aie2_error_async_events_send(ndev); + if (ret) { + XDNA_ERR(xdna, "Send async events failed, ret %d", ret); + goto async_event_free; + } + + /* Issue a command to make sure firmware handled async events */ + ret = aie2_query_firmware_version(ndev, &ndev->xdna->fw_ver); + if (ret) { + XDNA_ERR(xdna, "Re-query firmware version failed"); + goto async_event_free; + } + + release_firmware(fw); + return 0; + +async_event_free: + aie2_error_async_events_free(ndev); +stop_hw: + aie2_hw_stop(xdna); +disable_sva: + iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); +free_irq: + pci_free_irq_vectors(pdev); +release_fw: + release_firmware(fw); + + return ret; +} + +static void aie2_fini(struct amdxdna_dev *xdna) +{ + struct pci_dev *pdev = to_pci_dev(xdna->ddev.dev); + struct amdxdna_dev_hdl *ndev = xdna->dev_handle; + + aie2_hw_stop(xdna); + aie2_error_async_events_free(ndev); + iommu_dev_disable_feature(&pdev->dev, IOMMU_DEV_FEAT_SVA); + pci_free_irq_vectors(pdev); +} + +static int aie2_get_aie_status(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_query_aie_status status; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_dev_hdl *ndev; + int ret; + + ndev = xdna->dev_handle; + if (copy_from_user(&status, u64_to_user_ptr(args->buffer), sizeof(status))) { + XDNA_ERR(xdna, "Failed to copy AIE request into kernel"); + return -EFAULT; + } + + if (ndev->metadata.cols * ndev->metadata.size < status.buffer_size) { + XDNA_ERR(xdna, "Invalid buffer size. Given Size: %u. Need Size: %u.", + status.buffer_size, ndev->metadata.cols * ndev->metadata.size); + return -EINVAL; + } + + ret = aie2_query_status(ndev, u64_to_user_ptr(status.buffer), + status.buffer_size, &status.cols_filled); + if (ret) { + XDNA_ERR(xdna, "Failed to get AIE status info. 
Ret: %d", ret); + return ret; + } + + if (copy_to_user(u64_to_user_ptr(args->buffer), &status, sizeof(status))) { + XDNA_ERR(xdna, "Failed to copy AIE request info to user space"); + return -EFAULT; + } + + return 0; +} + +static int aie2_get_aie_metadata(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_query_aie_metadata *meta; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_dev_hdl *ndev; + int ret = 0; + + ndev = xdna->dev_handle; + meta = kzalloc(sizeof(*meta), GFP_KERNEL); + if (!meta) + return -ENOMEM; + + meta->col_size = ndev->metadata.size; + meta->cols = ndev->metadata.cols; + meta->rows = ndev->metadata.rows; + + meta->version.major = ndev->metadata.version.major; + meta->version.minor = ndev->metadata.version.minor; + + meta->core.row_count = ndev->metadata.core.row_count; + meta->core.row_start = ndev->metadata.core.row_start; + meta->core.dma_channel_count = ndev->metadata.core.dma_channel_count; + meta->core.lock_count = ndev->metadata.core.lock_count; + meta->core.event_reg_count = ndev->metadata.core.event_reg_count; + + meta->mem.row_count = ndev->metadata.mem.row_count; + meta->mem.row_start = ndev->metadata.mem.row_start; + meta->mem.dma_channel_count = ndev->metadata.mem.dma_channel_count; + meta->mem.lock_count = ndev->metadata.mem.lock_count; + meta->mem.event_reg_count = ndev->metadata.mem.event_reg_count; + + meta->shim.row_count = ndev->metadata.shim.row_count; + meta->shim.row_start = ndev->metadata.shim.row_start; + meta->shim.dma_channel_count = ndev->metadata.shim.dma_channel_count; + meta->shim.lock_count = ndev->metadata.shim.lock_count; + meta->shim.event_reg_count = ndev->metadata.shim.event_reg_count; + + if (copy_to_user(u64_to_user_ptr(args->buffer), meta, sizeof(*meta))) + ret = -EFAULT; + + kfree(meta); + return ret; +} + +static int aie2_get_aie_version(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_query_aie_version version; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_dev_hdl *ndev; + + ndev = xdna->dev_handle; + version.major = ndev->version.major; + version.minor = ndev->version.minor; + + if (copy_to_user(u64_to_user_ptr(args->buffer), &version, sizeof(version))) + return -EFAULT; + + return 0; +} + +static int aie2_get_firmware_version(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_query_firmware_version version; + struct amdxdna_dev *xdna = client->xdna; + + version.major = xdna->fw_ver.major; + version.minor = xdna->fw_ver.minor; + version.patch = xdna->fw_ver.sub; + version.build = xdna->fw_ver.build; + + if (copy_to_user(u64_to_user_ptr(args->buffer), &version, sizeof(version))) + return -EFAULT; + + return 0; +} + +static int aie2_get_power_mode(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_get_power_mode mode = {}; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_dev_hdl *ndev; + + ndev = xdna->dev_handle; + mode.power_mode = ndev->pw_mode; + + if (copy_to_user(u64_to_user_ptr(args->buffer), &mode, sizeof(mode))) + return -EFAULT; + + return 0; +} + +static int aie2_get_clock_metadata(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_query_clock_metadata *clock; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_dev_hdl *ndev; + int ret = 0; + + ndev = xdna->dev_handle; + clock = kzalloc(sizeof(*clock), GFP_KERNEL); + if (!clock) + return -ENOMEM; + + 
snprintf(clock->mp_npu_clock.name, sizeof(clock->mp_npu_clock.name), + "MP-NPU Clock"); + clock->mp_npu_clock.freq_mhz = ndev->npuclk_freq; + snprintf(clock->h_clock.name, sizeof(clock->h_clock.name), "H Clock"); + clock->h_clock.freq_mhz = ndev->hclk_freq; + + if (copy_to_user(u64_to_user_ptr(args->buffer), clock, sizeof(*clock))) + ret = -EFAULT; + + kfree(clock); + return ret; +} + +static int aie2_get_hwctx_status(struct amdxdna_client *client, + struct amdxdna_drm_get_info *args) +{ + struct amdxdna_drm_query_hwctx __user *buf; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_drm_query_hwctx *tmp; + struct amdxdna_client *tmp_client; + struct amdxdna_hwctx *hwctx; + unsigned long hwctx_id; + bool overflow = false; + u32 req_bytes = 0; + u32 hw_i = 0; + int ret = 0; + int idx; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + + tmp = kzalloc(sizeof(*tmp), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + buf = u64_to_user_ptr(args->buffer); + list_for_each_entry(tmp_client, &xdna->client_list, node) { + idx = srcu_read_lock(&tmp_client->hwctx_srcu); + amdxdna_for_each_hwctx(tmp_client, hwctx_id, hwctx) { + req_bytes += sizeof(*tmp); + if (args->buffer_size < req_bytes) { + /* Continue iterating to get the required size */ + overflow = true; + continue; + } + + memset(tmp, 0, sizeof(*tmp)); + tmp->pid = tmp_client->pid; + tmp->context_id = hwctx->id; + tmp->start_col = hwctx->start_col; + tmp->num_col = hwctx->num_col; + tmp->command_submissions = hwctx->priv->seq; + tmp->command_completions = hwctx->priv->completed; + + if (copy_to_user(&buf[hw_i], tmp, sizeof(*tmp))) { + ret = -EFAULT; + srcu_read_unlock(&tmp_client->hwctx_srcu, idx); + goto out; + } + hw_i++; + } + srcu_read_unlock(&tmp_client->hwctx_srcu, idx); + } + + if (overflow) { + XDNA_ERR(xdna, "Invalid buffer size. 
Given: %u Need: %u.", + args->buffer_size, req_bytes); + ret = -EINVAL; + } + +out: + kfree(tmp); + args->buffer_size = req_bytes; + return ret; +} + +static int aie2_get_info(struct amdxdna_client *client, struct amdxdna_drm_get_info *args) +{ + struct amdxdna_dev *xdna = client->xdna; + int ret, idx; + + if (!drm_dev_enter(&xdna->ddev, &idx)) + return -ENODEV; + + switch (args->param) { + case DRM_AMDXDNA_QUERY_AIE_STATUS: + ret = aie2_get_aie_status(client, args); + break; + case DRM_AMDXDNA_QUERY_AIE_METADATA: + ret = aie2_get_aie_metadata(client, args); + break; + case DRM_AMDXDNA_QUERY_AIE_VERSION: + ret = aie2_get_aie_version(client, args); + break; + case DRM_AMDXDNA_QUERY_CLOCK_METADATA: + ret = aie2_get_clock_metadata(client, args); + break; + case DRM_AMDXDNA_QUERY_HW_CONTEXTS: + ret = aie2_get_hwctx_status(client, args); + break; + case DRM_AMDXDNA_QUERY_FIRMWARE_VERSION: + ret = aie2_get_firmware_version(client, args); + break; + case DRM_AMDXDNA_GET_POWER_MODE: + ret = aie2_get_power_mode(client, args); + break; + default: + XDNA_ERR(xdna, "Not supported request parameter %u", args->param); + ret = -EOPNOTSUPP; + } + XDNA_DBG(xdna, "Got param %d", args->param); + + drm_dev_exit(idx); + return ret; +} + +static int aie2_set_power_mode(struct amdxdna_client *client, + struct amdxdna_drm_set_state *args) +{ + struct amdxdna_drm_set_power_mode power_state; + enum amdxdna_power_mode_type power_mode; + struct amdxdna_dev *xdna = client->xdna; + + if (copy_from_user(&power_state, u64_to_user_ptr(args->buffer), + sizeof(power_state))) { + XDNA_ERR(xdna, "Failed to copy power mode request into kernel"); + return -EFAULT; + } + + if (XDNA_MBZ_DBG(xdna, power_state.pad, sizeof(power_state.pad))) + return -EINVAL; + + power_mode = power_state.power_mode; + if (power_mode > POWER_MODE_TURBO) { + XDNA_ERR(xdna, "Invalid power mode %d", power_mode); + return -EINVAL; + } + + return aie2_pm_set_mode(xdna->dev_handle, power_mode); +} + +static int aie2_set_state(struct amdxdna_client *client, + struct amdxdna_drm_set_state *args) +{ + struct amdxdna_dev *xdna = client->xdna; + int ret, idx; + + if (!drm_dev_enter(&xdna->ddev, &idx)) + return -ENODEV; + + switch (args->param) { + case DRM_AMDXDNA_SET_POWER_MODE: + ret = aie2_set_power_mode(client, args); + break; + default: + XDNA_ERR(xdna, "Not supported request parameter %u", args->param); + ret = -EOPNOTSUPP; + break; + } + + drm_dev_exit(idx); + return ret; +} + +const struct amdxdna_dev_ops aie2_ops = { + .init = aie2_init, + .fini = aie2_fini, + .resume = aie2_hw_start, + .suspend = aie2_hw_stop, + .get_aie_info = aie2_get_info, + .set_aie_state = aie2_set_state, + .hwctx_init = aie2_hwctx_init, + .hwctx_fini = aie2_hwctx_fini, + .hwctx_config = aie2_hwctx_config, + .cmd_submit = aie2_cmd_submit, + .hmm_invalidate = aie2_hmm_invalidate, + .hwctx_suspend = aie2_hwctx_suspend, + .hwctx_resume = aie2_hwctx_resume, +}; diff --git a/drivers/accel/amdxdna/aie2_pci.h b/drivers/accel/amdxdna/aie2_pci.h new file mode 100644 index 000000000000..f2d95531ddc2 --- /dev/null +++ b/drivers/accel/amdxdna/aie2_pci.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. 
+ */ + +#ifndef _AIE2_PCI_H_ +#define _AIE2_PCI_H_ + +#include <drm/amdxdna_accel.h> +#include <linux/semaphore.h> + +#include "amdxdna_mailbox.h" + +#define AIE2_INTERVAL 20000 /* us */ +#define AIE2_TIMEOUT 1000000 /* us */ + +/* Firmware determines device memory base address and size */ +#define AIE2_DEVM_BASE 0x4000000 +#define AIE2_DEVM_SIZE SZ_64M + +#define NDEV2PDEV(ndev) (to_pci_dev((ndev)->xdna->ddev.dev)) + +#define AIE2_SRAM_OFF(ndev, addr) ((addr) - (ndev)->priv->sram_dev_addr) +#define AIE2_MBOX_OFF(ndev, addr) ((addr) - (ndev)->priv->mbox_dev_addr) + +#define PSP_REG_BAR(ndev, idx) ((ndev)->priv->psp_regs_off[(idx)].bar_idx) +#define PSP_REG_OFF(ndev, idx) ((ndev)->priv->psp_regs_off[(idx)].offset) +#define SRAM_REG_OFF(ndev, idx) ((ndev)->priv->sram_offs[(idx)].offset) + +#define SMU_REG(ndev, idx) \ +({ \ + typeof(ndev) _ndev = ndev; \ + ((_ndev)->smu_base + (_ndev)->priv->smu_regs_off[(idx)].offset); \ +}) +#define SRAM_GET_ADDR(ndev, idx) \ +({ \ + typeof(ndev) _ndev = ndev; \ + ((_ndev)->sram_base + SRAM_REG_OFF((_ndev), (idx))); \ +}) + +#define CHAN_SLOT_SZ SZ_8K +#define MBOX_SIZE(ndev) \ +({ \ + typeof(ndev) _ndev = (ndev); \ + ((_ndev)->priv->mbox_size) ? (_ndev)->priv->mbox_size : \ + pci_resource_len(NDEV2PDEV(_ndev), (_ndev)->xdna->dev_info->mbox_bar); \ +}) + +enum aie2_smu_reg_idx { + SMU_CMD_REG = 0, + SMU_ARG_REG, + SMU_INTR_REG, + SMU_RESP_REG, + SMU_OUT_REG, + SMU_MAX_REGS /* Keep this at the end */ +}; + +enum aie2_sram_reg_idx { + MBOX_CHANN_OFF = 0, + FW_ALIVE_OFF, + SRAM_MAX_INDEX /* Keep this at the end */ +}; + +enum psp_reg_idx { + PSP_CMD_REG = 0, + PSP_ARG0_REG, + PSP_ARG1_REG, + PSP_ARG2_REG, + PSP_NUM_IN_REGS, /* number of input registers */ + PSP_INTR_REG = PSP_NUM_IN_REGS, + PSP_STATUS_REG, + PSP_RESP_REG, + PSP_MAX_REGS /* Keep this at the end */ +}; + +struct amdxdna_client; +struct amdxdna_fw_ver; +struct amdxdna_hwctx; +struct amdxdna_sched_job; + +struct psp_config { + const void *fw_buf; + u32 fw_size; + void __iomem *psp_regs[PSP_MAX_REGS]; +}; + +struct aie_version { + u16 major; + u16 minor; +}; + +struct aie_tile_metadata { + u16 row_count; + u16 row_start; + u16 dma_channel_count; + u16 lock_count; + u16 event_reg_count; +}; + +struct aie_metadata { + u32 size; + u16 cols; + u16 rows; + struct aie_version version; + struct aie_tile_metadata core; + struct aie_tile_metadata mem; + struct aie_tile_metadata shim; +}; + +enum rt_config_category { + AIE2_RT_CFG_INIT, + AIE2_RT_CFG_CLK_GATING, +}; + +struct rt_config { + u32 type; + u32 value; + u32 category; +}; + +struct dpm_clk_freq { + u32 npuclk; + u32 hclk; +}; + +/* + * Define the maximum number of pending commands in a hardware context. + * Must be power of 2! 
+ */ +#define HWCTX_MAX_CMDS 4 +#define get_job_idx(seq) ((seq) & (HWCTX_MAX_CMDS - 1)) +struct amdxdna_hwctx_priv { + struct amdxdna_gem_obj *heap; + void *mbox_chann; + + struct drm_gpu_scheduler sched; + struct drm_sched_entity entity; + + struct mutex io_lock; /* protect seq and cmd order */ + struct wait_queue_head job_free_wq; + u32 num_pending; + u64 seq; + struct semaphore job_sem; + bool job_done; + + /* Completed job counter */ + u64 completed; + + struct amdxdna_gem_obj *cmd_buf[HWCTX_MAX_CMDS]; + struct drm_syncobj *syncobj; +}; + +enum aie2_dev_status { + AIE2_DEV_UNINIT, + AIE2_DEV_INIT, + AIE2_DEV_START, +}; + +struct amdxdna_dev_hdl { + struct amdxdna_dev *xdna; + const struct amdxdna_dev_priv *priv; + void __iomem *sram_base; + void __iomem *smu_base; + void __iomem *mbox_base; + struct psp_device *psp_hdl; + + struct xdna_mailbox_chann_res mgmt_x2i; + struct xdna_mailbox_chann_res mgmt_i2x; + u32 mgmt_chan_idx; + u32 mgmt_prot_major; + u32 mgmt_prot_minor; + + u32 total_col; + struct aie_version version; + struct aie_metadata metadata; + + /* power management and clock*/ + enum amdxdna_power_mode_type pw_mode; + u32 dpm_level; + u32 dft_dpm_level; + u32 max_dpm_level; + u32 clk_gating; + u32 npuclk_freq; + u32 hclk_freq; + + /* Mailbox and the management channel */ + struct mailbox *mbox; + struct mailbox_channel *mgmt_chann; + struct async_events *async_events; + + enum aie2_dev_status dev_status; + u32 hwctx_num; +}; + +#define DEFINE_BAR_OFFSET(reg_name, bar, reg_addr) \ + [reg_name] = {bar##_BAR_INDEX, (reg_addr) - bar##_BAR_BASE} + +struct aie2_bar_off_pair { + int bar_idx; + u32 offset; +}; + +struct aie2_hw_ops { + int (*set_dpm)(struct amdxdna_dev_hdl *ndev, u32 dpm_level); +}; + +struct amdxdna_dev_priv { + const char *fw_path; + u64 protocol_major; + u64 protocol_minor; + const struct rt_config *rt_config; + const struct dpm_clk_freq *dpm_clk_tbl; + +#define COL_ALIGN_NONE 0 +#define COL_ALIGN_NATURE 1 + u32 col_align; + u32 mbox_dev_addr; + /* If mbox_size is 0, use BAR size. 
See MBOX_SIZE macro */ + u32 mbox_size; + u32 sram_dev_addr; + struct aie2_bar_off_pair sram_offs[SRAM_MAX_INDEX]; + struct aie2_bar_off_pair psp_regs_off[PSP_MAX_REGS]; + struct aie2_bar_off_pair smu_regs_off[SMU_MAX_REGS]; + struct aie2_hw_ops hw_ops; +}; + +extern const struct amdxdna_dev_ops aie2_ops; + +int aie2_runtime_cfg(struct amdxdna_dev_hdl *ndev, + enum rt_config_category category, u32 *val); + +/* aie2 npu hw config */ +extern const struct dpm_clk_freq npu1_dpm_clk_table[]; +extern const struct dpm_clk_freq npu4_dpm_clk_table[]; +extern const struct rt_config npu1_default_rt_cfg[]; +extern const struct rt_config npu4_default_rt_cfg[]; + +/* aie2_smu.c */ +int aie2_smu_init(struct amdxdna_dev_hdl *ndev); +void aie2_smu_fini(struct amdxdna_dev_hdl *ndev); +int npu1_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level); +int npu4_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level); + +/* aie2_pm.c */ +int aie2_pm_init(struct amdxdna_dev_hdl *ndev); +int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type target); + +/* aie2_psp.c */ +struct psp_device *aie2m_psp_create(struct drm_device *ddev, struct psp_config *conf); +int aie2_psp_start(struct psp_device *psp); +void aie2_psp_stop(struct psp_device *psp); + +/* aie2_error.c */ +int aie2_error_async_events_alloc(struct amdxdna_dev_hdl *ndev); +void aie2_error_async_events_free(struct amdxdna_dev_hdl *ndev); +int aie2_error_async_events_send(struct amdxdna_dev_hdl *ndev); +int aie2_error_async_msg_thread(void *data); + +/* aie2_message.c */ +int aie2_suspend_fw(struct amdxdna_dev_hdl *ndev); +int aie2_resume_fw(struct amdxdna_dev_hdl *ndev); +int aie2_set_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, u64 value); +int aie2_get_runtime_cfg(struct amdxdna_dev_hdl *ndev, u32 type, u64 *value); +int aie2_assign_mgmt_pasid(struct amdxdna_dev_hdl *ndev, u16 pasid); +int aie2_query_aie_version(struct amdxdna_dev_hdl *ndev, struct aie_version *version); +int aie2_query_aie_metadata(struct amdxdna_dev_hdl *ndev, struct aie_metadata *metadata); +int aie2_query_firmware_version(struct amdxdna_dev_hdl *ndev, + struct amdxdna_fw_ver *fw_ver); +int aie2_create_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx); +int aie2_destroy_context(struct amdxdna_dev_hdl *ndev, struct amdxdna_hwctx *hwctx); +int aie2_map_host_buf(struct amdxdna_dev_hdl *ndev, u32 context_id, u64 addr, u64 size); +int aie2_query_status(struct amdxdna_dev_hdl *ndev, char __user *buf, u32 size, u32 *cols_filled); +int aie2_register_asyn_event_msg(struct amdxdna_dev_hdl *ndev, dma_addr_t addr, u32 size, + void *handle, int (*cb)(void*, const u32 *, size_t)); +int aie2_config_cu(struct amdxdna_hwctx *hwctx); +int aie2_execbuf(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, const u32 *, size_t)); +int aie2_cmdlist_single_execbuf(struct amdxdna_hwctx *hwctx, + struct amdxdna_sched_job *job, + int (*notify_cb)(void *, const u32 *, size_t)); +int aie2_cmdlist_multi_execbuf(struct amdxdna_hwctx *hwctx, + struct amdxdna_sched_job *job, + int (*notify_cb)(void *, const u32 *, size_t)); +int aie2_sync_bo(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, + int (*notify_cb)(void *, const u32 *, size_t)); + +/* aie2_hwctx.c */ +int aie2_hwctx_init(struct amdxdna_hwctx *hwctx); +void aie2_hwctx_fini(struct amdxdna_hwctx *hwctx); +int aie2_hwctx_config(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size); +void aie2_hwctx_suspend(struct amdxdna_hwctx *hwctx); +void 
aie2_hwctx_resume(struct amdxdna_hwctx *hwctx); +int aie2_cmd_submit(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq); +void aie2_hmm_invalidate(struct amdxdna_gem_obj *abo, unsigned long cur_seq); +void aie2_restart_ctx(struct amdxdna_client *client); + +#endif /* _AIE2_PCI_H_ */ diff --git a/drivers/accel/amdxdna/aie2_pm.c b/drivers/accel/amdxdna/aie2_pm.c new file mode 100644 index 000000000000..426c38fce848 --- /dev/null +++ b/drivers/accel/amdxdna/aie2_pm.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024, Advanced Micro Devices, Inc. + */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/drm_print.h> +#include <drm/gpu_scheduler.h> + +#include "aie2_pci.h" +#include "amdxdna_pci_drv.h" + +#define AIE2_CLK_GATING_ENABLE 1 +#define AIE2_CLK_GATING_DISABLE 0 + +static int aie2_pm_set_clk_gating(struct amdxdna_dev_hdl *ndev, u32 val) +{ + int ret; + + ret = aie2_runtime_cfg(ndev, AIE2_RT_CFG_CLK_GATING, &val); + if (ret) + return ret; + + ndev->clk_gating = val; + return 0; +} + +int aie2_pm_init(struct amdxdna_dev_hdl *ndev) +{ + int ret; + + if (ndev->dev_status != AIE2_DEV_UNINIT) { + /* Resume device */ + ret = ndev->priv->hw_ops.set_dpm(ndev, ndev->dpm_level); + if (ret) + return ret; + + ret = aie2_pm_set_clk_gating(ndev, ndev->clk_gating); + if (ret) + return ret; + + return 0; + } + + while (ndev->priv->dpm_clk_tbl[ndev->max_dpm_level].hclk) + ndev->max_dpm_level++; + ndev->max_dpm_level--; + + ret = ndev->priv->hw_ops.set_dpm(ndev, ndev->max_dpm_level); + if (ret) + return ret; + + ret = aie2_pm_set_clk_gating(ndev, AIE2_CLK_GATING_ENABLE); + if (ret) + return ret; + + ndev->pw_mode = POWER_MODE_DEFAULT; + ndev->dft_dpm_level = ndev->max_dpm_level; + + return 0; +} + +int aie2_pm_set_mode(struct amdxdna_dev_hdl *ndev, enum amdxdna_power_mode_type target) +{ + struct amdxdna_dev *xdna = ndev->xdna; + u32 clk_gating, dpm_level; + int ret; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + + if (ndev->pw_mode == target) + return 0; + + switch (target) { + case POWER_MODE_TURBO: + if (ndev->hwctx_num) { + XDNA_ERR(xdna, "Can not set turbo when there is active hwctx"); + return -EINVAL; + } + + clk_gating = AIE2_CLK_GATING_DISABLE; + dpm_level = ndev->max_dpm_level; + break; + case POWER_MODE_HIGH: + clk_gating = AIE2_CLK_GATING_ENABLE; + dpm_level = ndev->max_dpm_level; + break; + case POWER_MODE_DEFAULT: + clk_gating = AIE2_CLK_GATING_ENABLE; + dpm_level = ndev->dft_dpm_level; + break; + default: + return -EOPNOTSUPP; + } + + ret = ndev->priv->hw_ops.set_dpm(ndev, dpm_level); + if (ret) + return ret; + + ret = aie2_pm_set_clk_gating(ndev, clk_gating); + if (ret) + return ret; + + ndev->pw_mode = target; + + return 0; +} diff --git a/drivers/accel/amdxdna/aie2_psp.c b/drivers/accel/amdxdna/aie2_psp.c new file mode 100644 index 000000000000..dc3a072ce3b6 --- /dev/null +++ b/drivers/accel/amdxdna/aie2_psp.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. 
+ */ + +#include <drm/drm_device.h> +#include <drm/drm_gem_shmem_helper.h> +#include <drm/drm_managed.h> +#include <drm/drm_print.h> +#include <drm/gpu_scheduler.h> +#include <linux/bitfield.h> +#include <linux/iopoll.h> + +#include "aie2_pci.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_pci_drv.h" + +#define PSP_STATUS_READY BIT(31) + +/* PSP commands */ +#define PSP_VALIDATE 1 +#define PSP_START 2 +#define PSP_RELEASE_TMR 3 + +/* PSP special arguments */ +#define PSP_START_COPY_FW 1 + +/* PSP response error code */ +#define PSP_ERROR_CANCEL 0xFFFF0002 +#define PSP_ERROR_BAD_STATE 0xFFFF0007 + +#define PSP_FW_ALIGN 0x10000 +#define PSP_POLL_INTERVAL 20000 /* us */ +#define PSP_POLL_TIMEOUT 1000000 /* us */ + +#define PSP_REG(p, reg) ((p)->psp_regs[reg]) + +struct psp_device { + struct drm_device *ddev; + struct psp_config conf; + u32 fw_buf_sz; + u64 fw_paddr; + void *fw_buffer; + void __iomem *psp_regs[PSP_MAX_REGS]; +}; + +static int psp_exec(struct psp_device *psp, u32 *reg_vals) +{ + u32 resp_code; + int ret, i; + u32 ready; + + /* Write command and argument registers */ + for (i = 0; i < PSP_NUM_IN_REGS; i++) + writel(reg_vals[i], PSP_REG(psp, i)); + + /* clear and set PSP INTR register to kick off */ + writel(0, PSP_REG(psp, PSP_INTR_REG)); + writel(1, PSP_REG(psp, PSP_INTR_REG)); + + /* PSP should be busy. Wait for ready, so we know task is done. */ + ret = readx_poll_timeout(readl, PSP_REG(psp, PSP_STATUS_REG), ready, + FIELD_GET(PSP_STATUS_READY, ready), + PSP_POLL_INTERVAL, PSP_POLL_TIMEOUT); + if (ret) { + drm_err(psp->ddev, "PSP is not ready, ret 0x%x", ret); + return ret; + } + + resp_code = readl(PSP_REG(psp, PSP_RESP_REG)); + if (resp_code) { + drm_err(psp->ddev, "fw return error 0x%x", resp_code); + return -EIO; + } + + return 0; +} + +void aie2_psp_stop(struct psp_device *psp) +{ + u32 reg_vals[PSP_NUM_IN_REGS] = { PSP_RELEASE_TMR, }; + int ret; + + ret = psp_exec(psp, reg_vals); + if (ret) + drm_err(psp->ddev, "release tmr failed, ret %d", ret); +} + +int aie2_psp_start(struct psp_device *psp) +{ + u32 reg_vals[PSP_NUM_IN_REGS]; + int ret; + + reg_vals[0] = PSP_VALIDATE; + reg_vals[1] = lower_32_bits(psp->fw_paddr); + reg_vals[2] = upper_32_bits(psp->fw_paddr); + reg_vals[3] = psp->fw_buf_sz; + + ret = psp_exec(psp, reg_vals); + if (ret) { + drm_err(psp->ddev, "failed to validate fw, ret %d", ret); + return ret; + } + + memset(reg_vals, 0, sizeof(reg_vals)); + reg_vals[0] = PSP_START; + reg_vals[1] = PSP_START_COPY_FW; + ret = psp_exec(psp, reg_vals); + if (ret) { + drm_err(psp->ddev, "failed to start fw, ret %d", ret); + return ret; + } + + return 0; +} + +struct psp_device *aie2m_psp_create(struct drm_device *ddev, struct psp_config *conf) +{ + struct psp_device *psp; + u64 offset; + + psp = drmm_kzalloc(ddev, sizeof(*psp), GFP_KERNEL); + if (!psp) + return NULL; + + psp->ddev = ddev; + memcpy(psp->psp_regs, conf->psp_regs, sizeof(psp->psp_regs)); + + psp->fw_buf_sz = ALIGN(conf->fw_size, PSP_FW_ALIGN) + PSP_FW_ALIGN; + psp->fw_buffer = drmm_kmalloc(ddev, psp->fw_buf_sz, GFP_KERNEL); + if (!psp->fw_buffer) { + drm_err(ddev, "no memory for fw buffer"); + return NULL; + } + + /* + * AMD Platform Security Processor(PSP) requires host physical + * address to load NPU firmware. 
+ */ + psp->fw_paddr = virt_to_phys(psp->fw_buffer); + offset = ALIGN(psp->fw_paddr, PSP_FW_ALIGN) - psp->fw_paddr; + psp->fw_paddr += offset; + memcpy(psp->fw_buffer + offset, conf->fw_buf, conf->fw_size); + + return psp; +} diff --git a/drivers/accel/amdxdna/aie2_smu.c b/drivers/accel/amdxdna/aie2_smu.c new file mode 100644 index 000000000000..73388443c676 --- /dev/null +++ b/drivers/accel/amdxdna/aie2_smu.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#include <drm/drm_device.h> +#include <drm/drm_gem_shmem_helper.h> +#include <drm/drm_print.h> +#include <drm/gpu_scheduler.h> +#include <linux/iopoll.h> + +#include "aie2_pci.h" +#include "amdxdna_pci_drv.h" + +#define SMU_RESULT_OK 1 + +/* SMU commands */ +#define AIE2_SMU_POWER_ON 0x3 +#define AIE2_SMU_POWER_OFF 0x4 +#define AIE2_SMU_SET_MPNPUCLK_FREQ 0x5 +#define AIE2_SMU_SET_HCLK_FREQ 0x6 +#define AIE2_SMU_SET_SOFT_DPMLEVEL 0x7 +#define AIE2_SMU_SET_HARD_DPMLEVEL 0x8 + +static int aie2_smu_exec(struct amdxdna_dev_hdl *ndev, u32 reg_cmd, + u32 reg_arg, u32 *out) +{ + u32 resp; + int ret; + + writel(0, SMU_REG(ndev, SMU_RESP_REG)); + writel(reg_arg, SMU_REG(ndev, SMU_ARG_REG)); + writel(reg_cmd, SMU_REG(ndev, SMU_CMD_REG)); + + /* Clear and set SMU_INTR_REG to kick off */ + writel(0, SMU_REG(ndev, SMU_INTR_REG)); + writel(1, SMU_REG(ndev, SMU_INTR_REG)); + + ret = readx_poll_timeout(readl, SMU_REG(ndev, SMU_RESP_REG), resp, + resp, AIE2_INTERVAL, AIE2_TIMEOUT); + if (ret) { + XDNA_ERR(ndev->xdna, "smu cmd %d timed out", reg_cmd); + return ret; + } + + if (out) + *out = readl(SMU_REG(ndev, SMU_OUT_REG)); + + if (resp != SMU_RESULT_OK) { + XDNA_ERR(ndev->xdna, "smu cmd %d failed, 0x%x", reg_cmd, resp); + return -EINVAL; + } + + return 0; +} + +int npu1_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level) +{ + u32 freq; + int ret; + + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_MPNPUCLK_FREQ, + ndev->priv->dpm_clk_tbl[dpm_level].npuclk, &freq); + if (ret) { + XDNA_ERR(ndev->xdna, "Set npu clock to %d failed, ret %d\n", + ndev->priv->dpm_clk_tbl[dpm_level].npuclk, ret); + } + ndev->npuclk_freq = freq; + + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HCLK_FREQ, + ndev->priv->dpm_clk_tbl[dpm_level].hclk, &freq); + if (ret) { + XDNA_ERR(ndev->xdna, "Set h clock to %d failed, ret %d\n", + ndev->priv->dpm_clk_tbl[dpm_level].hclk, ret); + } + ndev->hclk_freq = freq; + ndev->dpm_level = dpm_level; + + XDNA_DBG(ndev->xdna, "MP-NPU clock %d, H clock %d\n", + ndev->npuclk_freq, ndev->hclk_freq); + + return 0; +} + +int npu4_set_dpm(struct amdxdna_dev_hdl *ndev, u32 dpm_level) +{ + int ret; + + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_HARD_DPMLEVEL, dpm_level, NULL); + if (ret) { + XDNA_ERR(ndev->xdna, "Set hard dpm level %d failed, ret %d ", + dpm_level, ret); + return ret; + } + + ret = aie2_smu_exec(ndev, AIE2_SMU_SET_SOFT_DPMLEVEL, dpm_level, NULL); + if (ret) { + XDNA_ERR(ndev->xdna, "Set soft dpm level %d failed, ret %d", + dpm_level, ret); + return ret; + } + + ndev->npuclk_freq = ndev->priv->dpm_clk_tbl[dpm_level].npuclk; + ndev->hclk_freq = ndev->priv->dpm_clk_tbl[dpm_level].hclk; + ndev->dpm_level = dpm_level; + + XDNA_DBG(ndev->xdna, "MP-NPU clock %d, H clock %d\n", + ndev->npuclk_freq, ndev->hclk_freq); + + return 0; +} + +int aie2_smu_init(struct amdxdna_dev_hdl *ndev) +{ + int ret; + + ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_ON, 0, NULL); + if (ret) { + XDNA_ERR(ndev->xdna, "Power on failed, ret %d", ret); + return ret; + } + + return 0; +} + +void aie2_smu_fini(struct 
amdxdna_dev_hdl *ndev)
+{
+	int ret;
+
+	ndev->priv->hw_ops.set_dpm(ndev, 0);
+	ret = aie2_smu_exec(ndev, AIE2_SMU_POWER_OFF, 0, NULL);
+	if (ret)
+		XDNA_ERR(ndev->xdna, "Power off failed, ret %d", ret);
+}
diff --git a/drivers/accel/amdxdna/aie2_solver.c b/drivers/accel/amdxdna/aie2_solver.c
new file mode 100644
index 000000000000..2013d1f13aae
--- /dev/null
+++ b/drivers/accel/amdxdna/aie2_solver.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
+ */
+
+#include <drm/drm_device.h>
+#include <drm/drm_managed.h>
+#include <drm/drm_print.h>
+#include <linux/bitops.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+
+#include "aie2_solver.h"
+
+struct partition_node {
+	struct list_head	list;
+	u32			nshared;	/* # shared requests */
+	u32			start_col;	/* start column */
+	u32			ncols;		/* # columns */
+	bool			exclusive;	/* can not be shared if set */
+};
+
+struct solver_node {
+	struct list_head	list;
+	u64			rid;		/* Request ID from consumer */
+
+	struct partition_node	*pt_node;
+	void			*cb_arg;
+	u32			dpm_level;
+	u32			cols_len;
+	u32			start_cols[] __counted_by(cols_len);
+};
+
+struct solver_rgroup {
+	u32			rgid;
+	u32			nnode;
+	u32			npartition_node;
+
+	DECLARE_BITMAP(resbit, XRS_MAX_COL);
+	struct list_head	node_list;
+	struct list_head	pt_node_list;
+};
+
+struct solver_state {
+	struct solver_rgroup	rgp;
+	struct init_config	cfg;
+	struct xrs_action_ops	*actions;
+};
+
+static u32 calculate_gops(struct aie_qos *rqos)
+{
+	u32 service_rate = 0;
+
+	if (rqos->latency)
+		service_rate = (1000 / rqos->latency);
+
+	if (rqos->fps > service_rate)
+		return rqos->fps * rqos->gops;
+
+	return service_rate * rqos->gops;
+}
+
+/*
+ * qos_meet() - Check whether the QoS request can be met.
+ */
+static int qos_meet(struct solver_state *xrs, struct aie_qos *rqos, u32 cgops)
+{
+	u32 request_gops = calculate_gops(rqos) * xrs->cfg.sys_eff_factor;
+
+	if (request_gops <= cgops)
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
+ * sanity_check() - Do a basic sanity check on the allocation request.
+ */
+static int sanity_check(struct solver_state *xrs, struct alloc_requests *req)
+{
+	struct cdo_parts *cdop = &req->cdo;
+	struct aie_qos *rqos = &req->rqos;
+	u32 cu_clk_freq;
+
+	if (cdop->ncols > xrs->cfg.total_col)
+		return -EINVAL;
+
+	/*
+	 * Check that at least one CDO group can meet the
+	 * GOPs requirement.
+	 */
+	cu_clk_freq = xrs->cfg.clk_list.cu_clk_list[xrs->cfg.clk_list.num_levels - 1];
+
+	if (qos_meet(xrs, rqos, cdop->qos_cap.opc * cu_clk_freq / 1000))
+		return -EINVAL;
+
+	return 0;
+}
+
+static bool is_valid_qos_dpm_params(struct aie_qos *rqos)
+{
+	/*
+	 * gops is retrieved from the xmodel, so it is always set.
+	 * fps and latency are the configurable params from the application.
+	 */
+	if (rqos->gops > 0 && (rqos->fps > 0 || rqos->latency > 0))
+		return true;
+
+	return false;
+}
+
+static int set_dpm_level(struct solver_state *xrs, struct alloc_requests *req, u32 *dpm_level)
+{
+	struct solver_rgroup *rgp = &xrs->rgp;
+	struct cdo_parts *cdop = &req->cdo;
+	struct aie_qos *rqos = &req->rqos;
+	u32 freq, max_dpm_level, level;
+	struct solver_node *node;
+
+	max_dpm_level = xrs->cfg.clk_list.num_levels - 1;
+	/* If no QoS parameters are passed, set it to the max DPM level */
+	if (!is_valid_qos_dpm_params(rqos)) {
+		level = max_dpm_level;
+		goto set_dpm;
+	}
+
+	/* Find one CDO group that meets the GOPs requirement.
*/ + for (level = 0; level < max_dpm_level; level++) { + freq = xrs->cfg.clk_list.cu_clk_list[level]; + if (!qos_meet(xrs, rqos, cdop->qos_cap.opc * freq / 1000)) + break; + } + + /* set the dpm level which fits all the sessions */ + list_for_each_entry(node, &rgp->node_list, list) { + if (node->dpm_level > level) + level = node->dpm_level; + } + +set_dpm: + *dpm_level = level; + return xrs->cfg.actions->set_dft_dpm_level(xrs->cfg.ddev, level); +} + +static struct solver_node *rg_search_node(struct solver_rgroup *rgp, u64 rid) +{ + struct solver_node *node; + + list_for_each_entry(node, &rgp->node_list, list) { + if (node->rid == rid) + return node; + } + + return NULL; +} + +static void remove_partition_node(struct solver_rgroup *rgp, + struct partition_node *pt_node) +{ + pt_node->nshared--; + if (pt_node->nshared > 0) + return; + + list_del(&pt_node->list); + rgp->npartition_node--; + + bitmap_clear(rgp->resbit, pt_node->start_col, pt_node->ncols); + kfree(pt_node); +} + +static void remove_solver_node(struct solver_rgroup *rgp, + struct solver_node *node) +{ + list_del(&node->list); + rgp->nnode--; + + if (node->pt_node) + remove_partition_node(rgp, node->pt_node); + + kfree(node); +} + +static int get_free_partition(struct solver_state *xrs, + struct solver_node *snode, + struct alloc_requests *req) +{ + struct partition_node *pt_node; + u32 ncols = req->cdo.ncols; + u32 col, i; + + for (i = 0; i < snode->cols_len; i++) { + col = snode->start_cols[i]; + if (find_next_bit(xrs->rgp.resbit, XRS_MAX_COL, col) >= col + ncols) + break; + } + + if (i == snode->cols_len) + return -ENODEV; + + pt_node = kzalloc(sizeof(*pt_node), GFP_KERNEL); + if (!pt_node) + return -ENOMEM; + + pt_node->nshared = 1; + pt_node->start_col = col; + pt_node->ncols = ncols; + + /* + * Always set exclusive to false for now. 
+ */ + pt_node->exclusive = false; + + list_add_tail(&pt_node->list, &xrs->rgp.pt_node_list); + xrs->rgp.npartition_node++; + bitmap_set(xrs->rgp.resbit, pt_node->start_col, pt_node->ncols); + + snode->pt_node = pt_node; + + return 0; +} + +static int allocate_partition(struct solver_state *xrs, + struct solver_node *snode, + struct alloc_requests *req) +{ + struct partition_node *pt_node, *rpt_node = NULL; + int idx, ret; + + ret = get_free_partition(xrs, snode, req); + if (!ret) + return ret; + + /* try to get a share-able partition */ + list_for_each_entry(pt_node, &xrs->rgp.pt_node_list, list) { + if (pt_node->exclusive) + continue; + + if (rpt_node && pt_node->nshared >= rpt_node->nshared) + continue; + + for (idx = 0; idx < snode->cols_len; idx++) { + if (snode->start_cols[idx] != pt_node->start_col) + continue; + + if (req->cdo.ncols != pt_node->ncols) + continue; + + rpt_node = pt_node; + break; + } + } + + if (!rpt_node) + return -ENODEV; + + rpt_node->nshared++; + snode->pt_node = rpt_node; + + return 0; +} + +static struct solver_node *create_solver_node(struct solver_state *xrs, + struct alloc_requests *req) +{ + struct cdo_parts *cdop = &req->cdo; + struct solver_node *node; + int ret; + + node = kzalloc(struct_size(node, start_cols, cdop->cols_len), GFP_KERNEL); + if (!node) + return ERR_PTR(-ENOMEM); + + node->rid = req->rid; + node->cols_len = cdop->cols_len; + memcpy(node->start_cols, cdop->start_cols, cdop->cols_len * sizeof(u32)); + + ret = allocate_partition(xrs, node, req); + if (ret) + goto free_node; + + list_add_tail(&node->list, &xrs->rgp.node_list); + xrs->rgp.nnode++; + return node; + +free_node: + kfree(node); + return ERR_PTR(ret); +} + +static void fill_load_action(struct solver_state *xrs, + struct solver_node *snode, + struct xrs_action_load *action) +{ + action->rid = snode->rid; + action->part.start_col = snode->pt_node->start_col; + action->part.ncols = snode->pt_node->ncols; +} + +int xrs_allocate_resource(void *hdl, struct alloc_requests *req, void *cb_arg) +{ + struct xrs_action_load load_act; + struct solver_node *snode; + struct solver_state *xrs; + u32 dpm_level; + int ret; + + xrs = (struct solver_state *)hdl; + + ret = sanity_check(xrs, req); + if (ret) { + drm_err(xrs->cfg.ddev, "invalid request"); + return ret; + } + + if (rg_search_node(&xrs->rgp, req->rid)) { + drm_err(xrs->cfg.ddev, "rid %lld is in-use", req->rid); + return -EEXIST; + } + + snode = create_solver_node(xrs, req); + if (IS_ERR(snode)) + return PTR_ERR(snode); + + fill_load_action(xrs, snode, &load_act); + ret = xrs->cfg.actions->load(cb_arg, &load_act); + if (ret) + goto free_node; + + ret = set_dpm_level(xrs, req, &dpm_level); + if (ret) + goto free_node; + + snode->dpm_level = dpm_level; + snode->cb_arg = cb_arg; + + drm_dbg(xrs->cfg.ddev, "start col %d ncols %d\n", + snode->pt_node->start_col, snode->pt_node->ncols); + + return 0; + +free_node: + remove_solver_node(&xrs->rgp, snode); + + return ret; +} + +int xrs_release_resource(void *hdl, u64 rid) +{ + struct solver_state *xrs = hdl; + struct solver_node *node; + + node = rg_search_node(&xrs->rgp, rid); + if (!node) { + drm_err(xrs->cfg.ddev, "node not exist"); + return -ENODEV; + } + + xrs->cfg.actions->unload(node->cb_arg); + remove_solver_node(&xrs->rgp, node); + + return 0; +} + +void *xrsm_init(struct init_config *cfg) +{ + struct solver_rgroup *rgp; + struct solver_state *xrs; + + xrs = drmm_kzalloc(cfg->ddev, sizeof(*xrs), GFP_KERNEL); + if (!xrs) + return NULL; + + memcpy(&xrs->cfg, cfg, sizeof(*cfg)); + + rgp = 
&xrs->rgp; + INIT_LIST_HEAD(&rgp->node_list); + INIT_LIST_HEAD(&rgp->pt_node_list); + + return xrs; +} diff --git a/drivers/accel/amdxdna/aie2_solver.h b/drivers/accel/amdxdna/aie2_solver.h new file mode 100644 index 000000000000..a2e3c52229e9 --- /dev/null +++ b/drivers/accel/amdxdna/aie2_solver.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + */ + +#ifndef _AIE2_SOLVER_H +#define _AIE2_SOLVER_H + +#define XRS_MAX_COL 128 + +/* + * Structure used to describe a partition. A partition is column based + * allocation unit described by its start column and number of columns. + */ +struct aie_part { + u32 start_col; + u32 ncols; +}; + +/* + * The QoS capabilities of a given AIE partition. + */ +struct aie_qos_cap { + u32 opc; /* operations per cycle */ + u32 dma_bw; /* DMA bandwidth */ +}; + +/* + * QoS requirement of a resource allocation. + */ +struct aie_qos { + u32 gops; /* Giga operations */ + u32 fps; /* Frames per second */ + u32 dma_bw; /* DMA bandwidth */ + u32 latency; /* Frame response latency */ + u32 exec_time; /* Frame execution time */ + u32 priority; /* Request priority */ +}; + +/* + * Structure used to describe a relocatable CDO (Configuration Data Object). + */ +struct cdo_parts { + u32 *start_cols; /* Start column array */ + u32 cols_len; /* Length of start column array */ + u32 ncols; /* # of column */ + struct aie_qos_cap qos_cap; /* CDO QoS capabilities */ +}; + +/* + * Structure used to describe a request to allocate. + */ +struct alloc_requests { + u64 rid; + struct cdo_parts cdo; + struct aie_qos rqos; /* Requested QoS */ +}; + +/* + * Load callback argument + */ +struct xrs_action_load { + u32 rid; + struct aie_part part; +}; + +/* + * Define the power level available + * + * POWER_LEVEL_MIN: + * Lowest power level. Usually set when all actions are unloaded. + * + * POWER_LEVEL_n + * Power levels 0 - n, is a step increase in system frequencies + */ +enum power_level { + POWER_LEVEL_MIN = 0x0, + POWER_LEVEL_0 = 0x1, + POWER_LEVEL_1 = 0x2, + POWER_LEVEL_2 = 0x3, + POWER_LEVEL_3 = 0x4, + POWER_LEVEL_4 = 0x5, + POWER_LEVEL_5 = 0x6, + POWER_LEVEL_6 = 0x7, + POWER_LEVEL_7 = 0x8, + POWER_LEVEL_NUM, +}; + +/* + * Structure used to describe the frequency table. + * Resource solver chooses the frequency from the table + * to meet the QOS requirements. + */ +struct clk_list_info { + u32 num_levels; /* available power levels */ + u32 cu_clk_list[POWER_LEVEL_NUM]; /* available aie clock frequencies in Mhz*/ +}; + +struct xrs_action_ops { + int (*load)(void *cb_arg, struct xrs_action_load *action); + int (*unload)(void *cb_arg); + int (*set_dft_dpm_level)(struct drm_device *ddev, u32 level); +}; + +/* + * Structure used to describe information for solver during initialization. + */ +struct init_config { + u32 total_col; + u32 sys_eff_factor; /* system efficiency factor */ + u32 latency_adj; /* latency adjustment in ms */ + struct clk_list_info clk_list; /* List of frequencies available in system */ + struct drm_device *ddev; + struct xrs_action_ops *actions; +}; + +/* + * xrsm_init() - Register resource solver. Resource solver client needs + * to call this function to register itself. + * + * @cfg: The system metrics for resource solver to use + * + * Return: A resource solver handle + * + * Note: We should only create one handle per AIE array to be managed. 
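Based on the structure definitions above and the xrsm_init()/xrs_allocate_resource()/xrs_release_resource() entry points declared below, a solver client would follow roughly the call order sketched here. This is illustrative only: the callback bodies, column layout and request values are hypothetical placeholders, and it omits the QoS and clock-table setup a real caller (the aie2 layer) provides.

static int my_load(void *cb_arg, struct xrs_action_load *action)
{
        /* Program the partition described by action->part; stubbed out here. */
        return 0;
}

static int my_unload(void *cb_arg)
{
        return 0;
}

static int my_set_dpm(struct drm_device *ddev, u32 level)
{
        return 0;
}

static struct xrs_action_ops my_ops = {
        .load = my_load,
        .unload = my_unload,
        .set_dft_dpm_level = my_set_dpm,
};

static int my_solver_client(struct drm_device *ddev)
{
        struct init_config cfg = {
                .total_col = 4,                 /* hypothetical array width */
                .ddev = ddev,
                .actions = &my_ops,
        };
        u32 start_cols[] = { 0, 1, 2 };         /* candidate start columns */
        struct alloc_requests req = {
                .rid = 1,
                .cdo = {
                        .start_cols = start_cols,
                        .cols_len = 3,
                        .ncols = 2,             /* partition width */
                },
        };
        void *xrs;
        int ret;

        xrs = xrsm_init(&cfg);                  /* one handle per AIE array */
        if (!xrs)
                return -ENOMEM;

        ret = xrs_allocate_resource(xrs, &req, NULL /* cb_arg */);
        if (ret)
                return ret;

        /* ... hardware context uses the partition ... */

        return xrs_release_resource(xrs, req.rid);
}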
+ */ +void *xrsm_init(struct init_config *cfg); + +/* + * xrs_allocate_resource() - Request to allocate resources for a given context + * and a partition metadata. (See struct part_meta) + * + * @hdl: Resource solver handle obtained from xrs_init() + * @req: Input to the Resource solver including request id + * and partition metadata. + * @cb_arg: callback argument pointer + * + * Return: 0 when successful. + * Or standard error number when failing + * + * Note: + * There is no lock mechanism inside resource solver. So it is + * the caller's responsibility to lock down XCLBINs and grab + * necessary lock. + */ +int xrs_allocate_resource(void *hdl, struct alloc_requests *req, void *cb_arg); + +/* + * xrs_release_resource() - Request to free resources for a given context. + * + * @hdl: Resource solver handle obtained from xrs_init() + * @rid: The Request ID to identify the requesting context + */ +int xrs_release_resource(void *hdl, u64 rid); +#endif /* _AIE2_SOLVER_H */ diff --git a/drivers/accel/amdxdna/amdxdna_ctx.c b/drivers/accel/amdxdna/amdxdna_ctx.c new file mode 100644 index 000000000000..d11b1c83d9c3 --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_ctx.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/drm_drv.h> +#include <drm/drm_file.h> +#include <drm/drm_gem.h> +#include <drm/drm_gem_shmem_helper.h> +#include <drm/drm_print.h> +#include <drm/gpu_scheduler.h> +#include <linux/xarray.h> +#include <trace/events/amdxdna.h> + +#include "amdxdna_ctx.h" +#include "amdxdna_gem.h" +#include "amdxdna_pci_drv.h" + +#define MAX_HWCTX_ID 255 +#define MAX_ARG_COUNT 4095 + +struct amdxdna_fence { + struct dma_fence base; + spinlock_t lock; /* for base */ + struct amdxdna_hwctx *hwctx; +}; + +static const char *amdxdna_fence_get_driver_name(struct dma_fence *fence) +{ + return KBUILD_MODNAME; +} + +static const char *amdxdna_fence_get_timeline_name(struct dma_fence *fence) +{ + struct amdxdna_fence *xdna_fence; + + xdna_fence = container_of(fence, struct amdxdna_fence, base); + + return xdna_fence->hwctx->name; +} + +static const struct dma_fence_ops fence_ops = { + .get_driver_name = amdxdna_fence_get_driver_name, + .get_timeline_name = amdxdna_fence_get_timeline_name, +}; + +static struct dma_fence *amdxdna_fence_create(struct amdxdna_hwctx *hwctx) +{ + struct amdxdna_fence *fence; + + fence = kzalloc(sizeof(*fence), GFP_KERNEL); + if (!fence) + return NULL; + + fence->hwctx = hwctx; + spin_lock_init(&fence->lock); + dma_fence_init(&fence->base, &fence_ops, &fence->lock, hwctx->id, 0); + return &fence->base; +} + +void amdxdna_hwctx_suspend(struct amdxdna_client *client) +{ + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_hwctx *hwctx; + unsigned long hwctx_id; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + mutex_lock(&client->hwctx_lock); + amdxdna_for_each_hwctx(client, hwctx_id, hwctx) + xdna->dev_info->ops->hwctx_suspend(hwctx); + mutex_unlock(&client->hwctx_lock); +} + +void amdxdna_hwctx_resume(struct amdxdna_client *client) +{ + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_hwctx *hwctx; + unsigned long hwctx_id; + + drm_WARN_ON(&xdna->ddev, !mutex_is_locked(&xdna->dev_lock)); + mutex_lock(&client->hwctx_lock); + amdxdna_for_each_hwctx(client, hwctx_id, hwctx) + xdna->dev_info->ops->hwctx_resume(hwctx); + mutex_unlock(&client->hwctx_lock); +} + +static void amdxdna_hwctx_destroy_rcu(struct 
amdxdna_hwctx *hwctx, + struct srcu_struct *ss) +{ + struct amdxdna_dev *xdna = hwctx->client->xdna; + + synchronize_srcu(ss); + + /* At this point, user is not able to submit new commands */ + mutex_lock(&xdna->dev_lock); + xdna->dev_info->ops->hwctx_fini(hwctx); + mutex_unlock(&xdna->dev_lock); + + kfree(hwctx->name); + kfree(hwctx); +} + +void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size) +{ + struct amdxdna_cmd *cmd = abo->mem.kva; + u32 num_masks, count; + + if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN) + num_masks = 0; + else + num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header); + + if (size) { + count = FIELD_GET(AMDXDNA_CMD_COUNT, cmd->header); + if (unlikely(count <= num_masks)) { + *size = 0; + return NULL; + } + *size = (count - num_masks) * sizeof(u32); + } + return &cmd->data[num_masks]; +} + +int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo) +{ + struct amdxdna_cmd *cmd = abo->mem.kva; + u32 num_masks, i; + u32 *cu_mask; + + if (amdxdna_cmd_get_op(abo) == ERT_CMD_CHAIN) + return -1; + + num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, cmd->header); + cu_mask = cmd->data; + for (i = 0; i < num_masks; i++) { + if (cu_mask[i]) + return ffs(cu_mask[i]) - 1; + } + + return -1; +} + +/* + * This should be called in close() and remove(). DO NOT call in other syscalls. + * This guarantee that when hwctx and resources will be released, if user + * doesn't call amdxdna_drm_destroy_hwctx_ioctl. + */ +void amdxdna_hwctx_remove_all(struct amdxdna_client *client) +{ + struct amdxdna_hwctx *hwctx; + unsigned long hwctx_id; + + mutex_lock(&client->hwctx_lock); + amdxdna_for_each_hwctx(client, hwctx_id, hwctx) { + XDNA_DBG(client->xdna, "PID %d close HW context %d", + client->pid, hwctx->id); + xa_erase(&client->hwctx_xa, hwctx->id); + mutex_unlock(&client->hwctx_lock); + amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu); + mutex_lock(&client->hwctx_lock); + } + mutex_unlock(&client->hwctx_lock); +} + +int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) +{ + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_drm_create_hwctx *args = data; + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct amdxdna_hwctx *hwctx; + int ret, idx; + + if (args->ext || args->ext_flags) + return -EINVAL; + + if (!drm_dev_enter(dev, &idx)) + return -ENODEV; + + hwctx = kzalloc(sizeof(*hwctx), GFP_KERNEL); + if (!hwctx) { + ret = -ENOMEM; + goto exit; + } + + if (copy_from_user(&hwctx->qos, u64_to_user_ptr(args->qos_p), sizeof(hwctx->qos))) { + XDNA_ERR(xdna, "Access QoS info failed"); + ret = -EFAULT; + goto free_hwctx; + } + + hwctx->client = client; + hwctx->fw_ctx_id = -1; + hwctx->num_tiles = args->num_tiles; + hwctx->mem_size = args->mem_size; + hwctx->max_opc = args->max_opc; + ret = xa_alloc_cyclic(&client->hwctx_xa, &hwctx->id, hwctx, + XA_LIMIT(AMDXDNA_INVALID_CTX_HANDLE + 1, MAX_HWCTX_ID), + &client->next_hwctxid, GFP_KERNEL); + if (ret < 0) { + XDNA_ERR(xdna, "Allocate hwctx ID failed, ret %d", ret); + goto free_hwctx; + } + + hwctx->name = kasprintf(GFP_KERNEL, "hwctx.%d.%d", client->pid, hwctx->id); + if (!hwctx->name) { + ret = -ENOMEM; + goto rm_id; + } + + mutex_lock(&xdna->dev_lock); + ret = xdna->dev_info->ops->hwctx_init(hwctx); + if (ret) { + mutex_unlock(&xdna->dev_lock); + XDNA_ERR(xdna, "Init hwctx failed, ret %d", ret); + goto free_name; + } + args->handle = hwctx->id; + args->syncobj_handle = hwctx->syncobj_hdl; + mutex_unlock(&xdna->dev_lock); + + XDNA_DBG(xdna, "PID %d create HW 
context %d, ret %d", client->pid, args->handle, ret); + drm_dev_exit(idx); + return 0; + +free_name: + kfree(hwctx->name); +rm_id: + xa_erase(&client->hwctx_xa, hwctx->id); +free_hwctx: + kfree(hwctx); +exit: + drm_dev_exit(idx); + return ret; +} + +int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) +{ + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_drm_destroy_hwctx *args = data; + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct amdxdna_hwctx *hwctx; + int ret = 0, idx; + + if (XDNA_MBZ_DBG(xdna, &args->pad, sizeof(args->pad))) + return -EINVAL; + + if (!drm_dev_enter(dev, &idx)) + return -ENODEV; + + hwctx = xa_erase(&client->hwctx_xa, args->handle); + if (!hwctx) { + ret = -EINVAL; + XDNA_DBG(xdna, "PID %d HW context %d not exist", + client->pid, args->handle); + goto out; + } + + /* + * The pushed jobs are handled by DRM scheduler during destroy. + * SRCU to synchronize with exec command ioctls. + */ + amdxdna_hwctx_destroy_rcu(hwctx, &client->hwctx_srcu); + + XDNA_DBG(xdna, "PID %d destroyed HW context %d", client->pid, args->handle); +out: + drm_dev_exit(idx); + return ret; +} + +int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) +{ + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_drm_config_hwctx *args = data; + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct amdxdna_hwctx *hwctx; + int ret, idx; + u32 buf_size; + void *buf; + u64 val; + + if (XDNA_MBZ_DBG(xdna, &args->pad, sizeof(args->pad))) + return -EINVAL; + + if (!xdna->dev_info->ops->hwctx_config) + return -EOPNOTSUPP; + + val = args->param_val; + buf_size = args->param_val_size; + + switch (args->param_type) { + case DRM_AMDXDNA_HWCTX_CONFIG_CU: + /* For those types that param_val is pointer */ + if (buf_size > PAGE_SIZE) { + XDNA_ERR(xdna, "Config CU param buffer too large"); + return -E2BIG; + } + + /* Hwctx needs to keep buf */ + buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, u64_to_user_ptr(val), buf_size)) { + kfree(buf); + return -EFAULT; + } + + break; + case DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF: + case DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF: + /* For those types that param_val is a value */ + buf = NULL; + buf_size = 0; + break; + default: + XDNA_DBG(xdna, "Unknown HW context config type %d", args->param_type); + return -EINVAL; + } + + mutex_lock(&xdna->dev_lock); + idx = srcu_read_lock(&client->hwctx_srcu); + hwctx = xa_load(&client->hwctx_xa, args->handle); + if (!hwctx) { + XDNA_DBG(xdna, "PID %d failed to get hwctx %d", client->pid, args->handle); + ret = -EINVAL; + goto unlock_srcu; + } + + ret = xdna->dev_info->ops->hwctx_config(hwctx, args->param_type, val, buf, buf_size); + +unlock_srcu: + srcu_read_unlock(&client->hwctx_srcu, idx); + mutex_unlock(&xdna->dev_lock); + kfree(buf); + return ret; +} + +static void +amdxdna_arg_bos_put(struct amdxdna_sched_job *job) +{ + int i; + + for (i = 0; i < job->bo_cnt; i++) { + if (!job->bos[i]) + break; + drm_gem_object_put(job->bos[i]); + } +} + +static int +amdxdna_arg_bos_lookup(struct amdxdna_client *client, + struct amdxdna_sched_job *job, + u32 *bo_hdls, u32 bo_cnt) +{ + struct drm_gem_object *gobj; + int i, ret; + + job->bo_cnt = bo_cnt; + for (i = 0; i < job->bo_cnt; i++) { + struct amdxdna_gem_obj *abo; + + gobj = drm_gem_object_lookup(client->filp, bo_hdls[i]); + if (!gobj) { + ret = -ENOENT; + goto put_shmem_bo; + } + abo = to_xdna_obj(gobj); + + mutex_lock(&abo->lock); + if 
(abo->pinned) { + mutex_unlock(&abo->lock); + job->bos[i] = gobj; + continue; + } + + ret = amdxdna_gem_pin_nolock(abo); + if (ret) { + mutex_unlock(&abo->lock); + drm_gem_object_put(gobj); + goto put_shmem_bo; + } + abo->pinned = true; + mutex_unlock(&abo->lock); + + job->bos[i] = gobj; + } + + return 0; + +put_shmem_bo: + amdxdna_arg_bos_put(job); + return ret; +} + +void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job) +{ + trace_amdxdna_debug_point(job->hwctx->name, job->seq, "job release"); + amdxdna_arg_bos_put(job); + amdxdna_gem_put_obj(job->cmd_bo); +} + +int amdxdna_cmd_submit(struct amdxdna_client *client, + u32 cmd_bo_hdl, u32 *arg_bo_hdls, u32 arg_bo_cnt, + u32 hwctx_hdl, u64 *seq) +{ + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_sched_job *job; + struct amdxdna_hwctx *hwctx; + int ret, idx; + + XDNA_DBG(xdna, "Command BO hdl %d, Arg BO count %d", cmd_bo_hdl, arg_bo_cnt); + job = kzalloc(struct_size(job, bos, arg_bo_cnt), GFP_KERNEL); + if (!job) + return -ENOMEM; + + if (cmd_bo_hdl != AMDXDNA_INVALID_BO_HANDLE) { + job->cmd_bo = amdxdna_gem_get_obj(client, cmd_bo_hdl, AMDXDNA_BO_CMD); + if (!job->cmd_bo) { + XDNA_ERR(xdna, "Failed to get cmd bo from %d", cmd_bo_hdl); + ret = -EINVAL; + goto free_job; + } + } else { + job->cmd_bo = NULL; + } + + ret = amdxdna_arg_bos_lookup(client, job, arg_bo_hdls, arg_bo_cnt); + if (ret) { + XDNA_ERR(xdna, "Argument BOs lookup failed, ret %d", ret); + goto cmd_put; + } + + idx = srcu_read_lock(&client->hwctx_srcu); + hwctx = xa_load(&client->hwctx_xa, hwctx_hdl); + if (!hwctx) { + XDNA_DBG(xdna, "PID %d failed to get hwctx %d", + client->pid, hwctx_hdl); + ret = -EINVAL; + goto unlock_srcu; + } + + if (hwctx->status != HWCTX_STAT_READY) { + XDNA_ERR(xdna, "HW Context is not ready"); + ret = -EINVAL; + goto unlock_srcu; + } + + job->hwctx = hwctx; + job->mm = current->mm; + + job->fence = amdxdna_fence_create(hwctx); + if (!job->fence) { + XDNA_ERR(xdna, "Failed to create fence"); + ret = -ENOMEM; + goto unlock_srcu; + } + kref_init(&job->refcnt); + + ret = xdna->dev_info->ops->cmd_submit(hwctx, job, seq); + if (ret) + goto put_fence; + + /* + * The amdxdna_hwctx_destroy_rcu() will release hwctx and associated + * resource after synchronize_srcu(). The submitted jobs should be + * handled by the queue, for example DRM scheduler, in device layer. + * For here we can unlock SRCU. + */ + srcu_read_unlock(&client->hwctx_srcu, idx); + trace_amdxdna_debug_point(hwctx->name, *seq, "job pushed"); + + return 0; + +put_fence: + dma_fence_put(job->fence); +unlock_srcu: + srcu_read_unlock(&client->hwctx_srcu, idx); + amdxdna_arg_bos_put(job); +cmd_put: + amdxdna_gem_put_obj(job->cmd_bo); +free_job: + kfree(job); + return ret; +} + +/* + * The submit command ioctl submits a command to firmware. One firmware command + * may contain multiple command BOs for processing as a whole. + * The command sequence number is returned which can be used for wait command ioctl. + */ +static int amdxdna_drm_submit_execbuf(struct amdxdna_client *client, + struct amdxdna_drm_exec_cmd *args) +{ + struct amdxdna_dev *xdna = client->xdna; + u32 *arg_bo_hdls; + u32 cmd_bo_hdl; + int ret; + + if (!args->arg_count || args->arg_count > MAX_ARG_COUNT) { + XDNA_ERR(xdna, "Invalid arg bo count %d", args->arg_count); + return -EINVAL; + } + + /* Only support single command for now. 
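The hwctx lookup in amdxdna_cmd_submit() and the teardown in amdxdna_hwctx_destroy_rcu() form a classic SRCU publish/unpublish pair: readers find the context in the XArray under srcu_read_lock(), while the destroy path removes the XArray entry first and then waits in synchronize_srcu() before freeing. A condensed sketch of the two sides; queue_job_on() and free_hwctx() are placeholders, not driver functions:

/* Reader side: the hwctx cannot be freed until srcu_read_unlock(). */
static int submit_sketch(struct amdxdna_client *client, u32 hwctx_hdl)
{
        struct amdxdna_hwctx *hwctx;
        int idx, ret = -EINVAL;

        idx = srcu_read_lock(&client->hwctx_srcu);
        hwctx = xa_load(&client->hwctx_xa, hwctx_hdl);
        if (hwctx)
                ret = queue_job_on(hwctx);
        srcu_read_unlock(&client->hwctx_srcu, idx);

        return ret;
}

/* Writer side: unpublish first, then wait out every reader above. */
static void destroy_sketch(struct amdxdna_client *client, u32 hwctx_hdl)
{
        struct amdxdna_hwctx *hwctx;

        hwctx = xa_erase(&client->hwctx_xa, hwctx_hdl);
        if (!hwctx)
                return;

        synchronize_srcu(&client->hwctx_srcu); /* no reader can still see it */
        free_hwctx(hwctx);
}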
*/ + if (args->cmd_count != 1) { + XDNA_ERR(xdna, "Invalid cmd bo count %d", args->cmd_count); + return -EINVAL; + } + + cmd_bo_hdl = (u32)args->cmd_handles; + arg_bo_hdls = kcalloc(args->arg_count, sizeof(u32), GFP_KERNEL); + if (!arg_bo_hdls) + return -ENOMEM; + ret = copy_from_user(arg_bo_hdls, u64_to_user_ptr(args->args), + args->arg_count * sizeof(u32)); + if (ret) { + ret = -EFAULT; + goto free_cmd_bo_hdls; + } + + ret = amdxdna_cmd_submit(client, cmd_bo_hdl, arg_bo_hdls, + args->arg_count, args->hwctx, &args->seq); + if (ret) + XDNA_DBG(xdna, "Submit cmds failed, ret %d", ret); + +free_cmd_bo_hdls: + kfree(arg_bo_hdls); + if (!ret) + XDNA_DBG(xdna, "Pushed cmd %lld to scheduler", args->seq); + return ret; +} + +int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) +{ + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_drm_exec_cmd *args = data; + + if (args->ext || args->ext_flags) + return -EINVAL; + + switch (args->type) { + case AMDXDNA_CMD_SUBMIT_EXEC_BUF: + return amdxdna_drm_submit_execbuf(client, args); + } + + XDNA_ERR(client->xdna, "Invalid command type %d", args->type); + return -EINVAL; +} diff --git a/drivers/accel/amdxdna/amdxdna_ctx.h b/drivers/accel/amdxdna/amdxdna_ctx.h new file mode 100644 index 000000000000..80b0304193ec --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_ctx.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#ifndef _AMDXDNA_CTX_H_ +#define _AMDXDNA_CTX_H_ + +#include <linux/bitfield.h> + +#include "amdxdna_gem.h" + +struct amdxdna_hwctx_priv; + +enum ert_cmd_opcode { + ERT_START_CU = 0, + ERT_CMD_CHAIN = 19, + ERT_START_NPU = 20, +}; + +enum ert_cmd_state { + ERT_CMD_STATE_INVALID, + ERT_CMD_STATE_NEW, + ERT_CMD_STATE_QUEUED, + ERT_CMD_STATE_RUNNING, + ERT_CMD_STATE_COMPLETED, + ERT_CMD_STATE_ERROR, + ERT_CMD_STATE_ABORT, + ERT_CMD_STATE_SUBMITTED, + ERT_CMD_STATE_TIMEOUT, + ERT_CMD_STATE_NORESPONSE, +}; + +/* + * Interpretation of the beginning of data payload for ERT_START_NPU in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is regular kernel args. + */ +struct amdxdna_cmd_start_npu { + u64 buffer; /* instruction buffer address */ + u32 buffer_size; /* size of buffer in bytes */ + u32 prop_count; /* properties count */ + u32 prop_args[]; /* properties and regular kernel arguments */ +}; + +/* + * Interpretation of the beginning of data payload for ERT_CMD_CHAIN in + * amdxdna_cmd. The rest of the payload in amdxdna_cmd is cmd BO handles. 
+ */ +struct amdxdna_cmd_chain { + u32 command_count; + u32 submit_index; + u32 error_index; + u32 reserved[3]; + u64 data[] __counted_by(command_count); +}; + +/* Exec buffer command header format */ +#define AMDXDNA_CMD_STATE GENMASK(3, 0) +#define AMDXDNA_CMD_EXTRA_CU_MASK GENMASK(11, 10) +#define AMDXDNA_CMD_COUNT GENMASK(22, 12) +#define AMDXDNA_CMD_OPCODE GENMASK(27, 23) +struct amdxdna_cmd { + u32 header; + u32 data[]; +}; + +struct amdxdna_hwctx { + struct amdxdna_client *client; + struct amdxdna_hwctx_priv *priv; + char *name; + + u32 id; + u32 max_opc; + u32 num_tiles; + u32 mem_size; + u32 fw_ctx_id; + u32 col_list_len; + u32 *col_list; + u32 start_col; + u32 num_col; +#define HWCTX_STAT_INIT 0 +#define HWCTX_STAT_READY 1 +#define HWCTX_STAT_STOP 2 + u32 status; + u32 old_status; + + struct amdxdna_qos_info qos; + struct amdxdna_hwctx_param_config_cu *cus; + u32 syncobj_hdl; +}; + +#define drm_job_to_xdna_job(j) \ + container_of(j, struct amdxdna_sched_job, base) + +struct amdxdna_sched_job { + struct drm_sched_job base; + struct kref refcnt; + struct amdxdna_hwctx *hwctx; + struct mm_struct *mm; + /* The fence to notice DRM scheduler that job is done by hardware */ + struct dma_fence *fence; + /* user can wait on this fence */ + struct dma_fence *out_fence; + bool job_done; + u64 seq; + struct amdxdna_gem_obj *cmd_bo; + size_t bo_cnt; + struct drm_gem_object *bos[] __counted_by(bo_cnt); +}; + +static inline u32 +amdxdna_cmd_get_op(struct amdxdna_gem_obj *abo) +{ + struct amdxdna_cmd *cmd = abo->mem.kva; + + return FIELD_GET(AMDXDNA_CMD_OPCODE, cmd->header); +} + +static inline void +amdxdna_cmd_set_state(struct amdxdna_gem_obj *abo, enum ert_cmd_state s) +{ + struct amdxdna_cmd *cmd = abo->mem.kva; + + cmd->header &= ~AMDXDNA_CMD_STATE; + cmd->header |= FIELD_PREP(AMDXDNA_CMD_STATE, s); +} + +static inline enum ert_cmd_state +amdxdna_cmd_get_state(struct amdxdna_gem_obj *abo) +{ + struct amdxdna_cmd *cmd = abo->mem.kva; + + return FIELD_GET(AMDXDNA_CMD_STATE, cmd->header); +} + +void *amdxdna_cmd_get_payload(struct amdxdna_gem_obj *abo, u32 *size); +int amdxdna_cmd_get_cu_idx(struct amdxdna_gem_obj *abo); + +static inline u32 amdxdna_hwctx_col_map(struct amdxdna_hwctx *hwctx) +{ + return GENMASK(hwctx->start_col + hwctx->num_col - 1, + hwctx->start_col); +} + +void amdxdna_sched_job_cleanup(struct amdxdna_sched_job *job); +void amdxdna_hwctx_remove_all(struct amdxdna_client *client); +void amdxdna_hwctx_suspend(struct amdxdna_client *client); +void amdxdna_hwctx_resume(struct amdxdna_client *client); + +int amdxdna_cmd_submit(struct amdxdna_client *client, + u32 cmd_bo_hdls, u32 *arg_bo_hdls, u32 arg_bo_cnt, + u32 hwctx_hdl, u64 *seq); + +int amdxdna_cmd_wait(struct amdxdna_client *client, u32 hwctx_hdl, + u64 seq, u32 timeout); + +int amdxdna_drm_create_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int amdxdna_drm_config_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int amdxdna_drm_destroy_hwctx_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int amdxdna_drm_submit_cmd_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); + +#endif /* _AMDXDNA_CTX_H_ */ diff --git a/drivers/accel/amdxdna/amdxdna_gem.c b/drivers/accel/amdxdna/amdxdna_gem.c new file mode 100644 index 000000000000..606433d73236 --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_gem.c @@ -0,0 +1,622 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024, Advanced Micro Devices, Inc. 
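As a concrete illustration of the exec-buffer header layout defined above, the snippet below packs a hypothetical ERT_START_CU header with one extra CU-mask word and 16 data words, then recovers the kernel-argument word count the same way amdxdna_cmd_get_payload() does. The values are made up for illustration.

#include <linux/bitfield.h>
#include <linux/types.h>

static u32 example_start_cu_header(void)
{
        u32 header = 0;

        header |= FIELD_PREP(AMDXDNA_CMD_STATE, ERT_CMD_STATE_NEW);
        header |= FIELD_PREP(AMDXDNA_CMD_OPCODE, ERT_START_CU);
        header |= FIELD_PREP(AMDXDNA_CMD_EXTRA_CU_MASK, 1);  /* 1 + 1 CU-mask words */
        header |= FIELD_PREP(AMDXDNA_CMD_COUNT, 16);         /* total words in data[] */

        return header;
}

static u32 example_payload_words(u32 header)
{
        u32 num_masks = 1 + FIELD_GET(AMDXDNA_CMD_EXTRA_CU_MASK, header);
        u32 count = FIELD_GET(AMDXDNA_CMD_COUNT, header);

        return count - num_masks;       /* here: 16 - 2 = 14 argument words */
}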
+ */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_cache.h> +#include <drm/drm_device.h> +#include <drm/drm_gem.h> +#include <drm/drm_gem_shmem_helper.h> +#include <drm/gpu_scheduler.h> +#include <linux/iosys-map.h> +#include <linux/vmalloc.h> + +#include "amdxdna_ctx.h" +#include "amdxdna_gem.h" +#include "amdxdna_pci_drv.h" + +#define XDNA_MAX_CMD_BO_SIZE SZ_32K + +static int +amdxdna_gem_insert_node_locked(struct amdxdna_gem_obj *abo, bool use_vmap) +{ + struct amdxdna_client *client = abo->client; + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_mem *mem = &abo->mem; + u64 offset; + u32 align; + int ret; + + align = 1 << max(PAGE_SHIFT, xdna->dev_info->dev_mem_buf_shift); + ret = drm_mm_insert_node_generic(&abo->dev_heap->mm, &abo->mm_node, + mem->size, align, + 0, DRM_MM_INSERT_BEST); + if (ret) { + XDNA_ERR(xdna, "Failed to alloc dev bo memory, ret %d", ret); + return ret; + } + + mem->dev_addr = abo->mm_node.start; + offset = mem->dev_addr - abo->dev_heap->mem.dev_addr; + mem->userptr = abo->dev_heap->mem.userptr + offset; + mem->pages = &abo->dev_heap->base.pages[offset >> PAGE_SHIFT]; + mem->nr_pages = mem->size >> PAGE_SHIFT; + + if (use_vmap) { + mem->kva = vmap(mem->pages, mem->nr_pages, VM_MAP, PAGE_KERNEL); + if (!mem->kva) { + XDNA_ERR(xdna, "Failed to vmap"); + drm_mm_remove_node(&abo->mm_node); + return -EFAULT; + } + } + + return 0; +} + +static void amdxdna_gem_obj_free(struct drm_gem_object *gobj) +{ + struct amdxdna_dev *xdna = to_xdna_dev(gobj->dev); + struct amdxdna_gem_obj *abo = to_xdna_obj(gobj); + struct iosys_map map = IOSYS_MAP_INIT_VADDR(abo->mem.kva); + + XDNA_DBG(xdna, "BO type %d xdna_addr 0x%llx", abo->type, abo->mem.dev_addr); + if (abo->pinned) + amdxdna_gem_unpin(abo); + + if (abo->type == AMDXDNA_BO_DEV) { + mutex_lock(&abo->client->mm_lock); + drm_mm_remove_node(&abo->mm_node); + mutex_unlock(&abo->client->mm_lock); + + vunmap(abo->mem.kva); + drm_gem_object_put(to_gobj(abo->dev_heap)); + drm_gem_object_release(gobj); + mutex_destroy(&abo->lock); + kfree(abo); + return; + } + + if (abo->type == AMDXDNA_BO_DEV_HEAP) + drm_mm_takedown(&abo->mm); + + drm_gem_vunmap_unlocked(gobj, &map); + mutex_destroy(&abo->lock); + drm_gem_shmem_free(&abo->base); +} + +static const struct drm_gem_object_funcs amdxdna_gem_dev_obj_funcs = { + .free = amdxdna_gem_obj_free, +}; + +static bool amdxdna_hmm_invalidate(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) +{ + struct amdxdna_gem_obj *abo = container_of(mni, struct amdxdna_gem_obj, + mem.notifier); + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); + + XDNA_DBG(xdna, "Invalid range 0x%llx, 0x%lx, type %d", + abo->mem.userptr, abo->mem.size, abo->type); + + if (!mmu_notifier_range_blockable(range)) + return false; + + xdna->dev_info->ops->hmm_invalidate(abo, cur_seq); + + return true; +} + +static const struct mmu_interval_notifier_ops amdxdna_hmm_ops = { + .invalidate = amdxdna_hmm_invalidate, +}; + +static void amdxdna_hmm_unregister(struct amdxdna_gem_obj *abo) +{ + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); + + if (!xdna->dev_info->ops->hmm_invalidate) + return; + + mmu_interval_notifier_remove(&abo->mem.notifier); + kvfree(abo->mem.pfns); + abo->mem.pfns = NULL; +} + +static int amdxdna_hmm_register(struct amdxdna_gem_obj *abo, unsigned long addr, + size_t len) +{ + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); + u32 nr_pages; + int ret; + + if (!xdna->dev_info->ops->hmm_invalidate) + return 0; + + if 
(abo->mem.pfns) + return -EEXIST; + + nr_pages = (PAGE_ALIGN(addr + len) - (addr & PAGE_MASK)) >> PAGE_SHIFT; + abo->mem.pfns = kvcalloc(nr_pages, sizeof(*abo->mem.pfns), + GFP_KERNEL); + if (!abo->mem.pfns) + return -ENOMEM; + + ret = mmu_interval_notifier_insert_locked(&abo->mem.notifier, + current->mm, + addr, + len, + &amdxdna_hmm_ops); + if (ret) { + XDNA_ERR(xdna, "Insert mmu notifier failed, ret %d", ret); + kvfree(abo->mem.pfns); + } + abo->mem.userptr = addr; + + return ret; +} + +static int amdxdna_gem_obj_mmap(struct drm_gem_object *gobj, + struct vm_area_struct *vma) +{ + struct amdxdna_gem_obj *abo = to_xdna_obj(gobj); + unsigned long num_pages; + int ret; + + ret = amdxdna_hmm_register(abo, vma->vm_start, gobj->size); + if (ret) + return ret; + + ret = drm_gem_shmem_mmap(&abo->base, vma); + if (ret) + goto hmm_unreg; + + num_pages = gobj->size >> PAGE_SHIFT; + /* Try to insert the pages */ + vm_flags_mod(vma, VM_MIXEDMAP, VM_PFNMAP); + ret = vm_insert_pages(vma, vma->vm_start, abo->base.pages, &num_pages); + if (ret) + XDNA_ERR(abo->client->xdna, "Failed insert pages, ret %d", ret); + + return 0; + +hmm_unreg: + amdxdna_hmm_unregister(abo); + return ret; +} + +static vm_fault_t amdxdna_gem_vm_fault(struct vm_fault *vmf) +{ + return drm_gem_shmem_vm_ops.fault(vmf); +} + +static void amdxdna_gem_vm_open(struct vm_area_struct *vma) +{ + drm_gem_shmem_vm_ops.open(vma); +} + +static void amdxdna_gem_vm_close(struct vm_area_struct *vma) +{ + struct drm_gem_object *gobj = vma->vm_private_data; + + amdxdna_hmm_unregister(to_xdna_obj(gobj)); + drm_gem_shmem_vm_ops.close(vma); +} + +static const struct vm_operations_struct amdxdna_gem_vm_ops = { + .fault = amdxdna_gem_vm_fault, + .open = amdxdna_gem_vm_open, + .close = amdxdna_gem_vm_close, +}; + +static const struct drm_gem_object_funcs amdxdna_gem_shmem_funcs = { + .free = amdxdna_gem_obj_free, + .print_info = drm_gem_shmem_object_print_info, + .pin = drm_gem_shmem_object_pin, + .unpin = drm_gem_shmem_object_unpin, + .get_sg_table = drm_gem_shmem_object_get_sg_table, + .vmap = drm_gem_shmem_object_vmap, + .vunmap = drm_gem_shmem_object_vunmap, + .mmap = amdxdna_gem_obj_mmap, + .vm_ops = &amdxdna_gem_vm_ops, +}; + +static struct amdxdna_gem_obj * +amdxdna_gem_create_obj(struct drm_device *dev, size_t size) +{ + struct amdxdna_gem_obj *abo; + + abo = kzalloc(sizeof(*abo), GFP_KERNEL); + if (!abo) + return ERR_PTR(-ENOMEM); + + abo->pinned = false; + abo->assigned_hwctx = AMDXDNA_INVALID_CTX_HANDLE; + mutex_init(&abo->lock); + + abo->mem.userptr = AMDXDNA_INVALID_ADDR; + abo->mem.dev_addr = AMDXDNA_INVALID_ADDR; + abo->mem.size = size; + + return abo; +} + +/* For drm_driver->gem_create_object callback */ +struct drm_gem_object * +amdxdna_gem_create_object_cb(struct drm_device *dev, size_t size) +{ + struct amdxdna_gem_obj *abo; + + abo = amdxdna_gem_create_obj(dev, size); + if (IS_ERR(abo)) + return ERR_CAST(abo); + + to_gobj(abo)->funcs = &amdxdna_gem_shmem_funcs; + + return to_gobj(abo); +} + +static struct amdxdna_gem_obj * +amdxdna_drm_alloc_shmem(struct drm_device *dev, + struct amdxdna_drm_create_bo *args, + struct drm_file *filp) +{ + struct amdxdna_client *client = filp->driver_priv; + struct drm_gem_shmem_object *shmem; + struct amdxdna_gem_obj *abo; + + shmem = drm_gem_shmem_create(dev, args->size); + if (IS_ERR(shmem)) + return ERR_CAST(shmem); + + shmem->map_wc = false; + + abo = to_xdna_obj(&shmem->base); + abo->client = client; + abo->type = AMDXDNA_BO_SHMEM; + + return abo; +} + +static struct amdxdna_gem_obj * 
+amdxdna_drm_create_dev_heap(struct drm_device *dev, + struct amdxdna_drm_create_bo *args, + struct drm_file *filp) +{ + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct drm_gem_shmem_object *shmem; + struct amdxdna_gem_obj *abo; + int ret; + + if (args->size > xdna->dev_info->dev_mem_size) { + XDNA_DBG(xdna, "Invalid dev heap size 0x%llx, limit 0x%lx", + args->size, xdna->dev_info->dev_mem_size); + return ERR_PTR(-EINVAL); + } + + mutex_lock(&client->mm_lock); + if (client->dev_heap) { + XDNA_DBG(client->xdna, "dev heap is already created"); + ret = -EBUSY; + goto mm_unlock; + } + + shmem = drm_gem_shmem_create(dev, args->size); + if (IS_ERR(shmem)) { + ret = PTR_ERR(shmem); + goto mm_unlock; + } + + shmem->map_wc = false; + abo = to_xdna_obj(&shmem->base); + + abo->type = AMDXDNA_BO_DEV_HEAP; + abo->client = client; + abo->mem.dev_addr = client->xdna->dev_info->dev_mem_base; + drm_mm_init(&abo->mm, abo->mem.dev_addr, abo->mem.size); + + client->dev_heap = abo; + drm_gem_object_get(to_gobj(abo)); + mutex_unlock(&client->mm_lock); + + return abo; + +mm_unlock: + mutex_unlock(&client->mm_lock); + return ERR_PTR(ret); +} + +struct amdxdna_gem_obj * +amdxdna_drm_alloc_dev_bo(struct drm_device *dev, + struct amdxdna_drm_create_bo *args, + struct drm_file *filp, bool use_vmap) +{ + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_dev *xdna = to_xdna_dev(dev); + size_t aligned_sz = PAGE_ALIGN(args->size); + struct amdxdna_gem_obj *abo, *heap; + int ret; + + mutex_lock(&client->mm_lock); + heap = client->dev_heap; + if (!heap) { + ret = -EINVAL; + goto mm_unlock; + } + + if (heap->mem.userptr == AMDXDNA_INVALID_ADDR) { + XDNA_ERR(xdna, "Invalid dev heap userptr"); + ret = -EINVAL; + goto mm_unlock; + } + + if (args->size > heap->mem.size) { + XDNA_ERR(xdna, "Invalid dev bo size 0x%llx, limit 0x%lx", + args->size, heap->mem.size); + ret = -EINVAL; + goto mm_unlock; + } + + abo = amdxdna_gem_create_obj(&xdna->ddev, aligned_sz); + if (IS_ERR(abo)) { + ret = PTR_ERR(abo); + goto mm_unlock; + } + to_gobj(abo)->funcs = &amdxdna_gem_dev_obj_funcs; + abo->type = AMDXDNA_BO_DEV; + abo->client = client; + abo->dev_heap = heap; + ret = amdxdna_gem_insert_node_locked(abo, use_vmap); + if (ret) { + XDNA_ERR(xdna, "Failed to alloc dev bo memory, ret %d", ret); + goto mm_unlock; + } + + drm_gem_object_get(to_gobj(heap)); + drm_gem_private_object_init(&xdna->ddev, to_gobj(abo), aligned_sz); + + mutex_unlock(&client->mm_lock); + return abo; + +mm_unlock: + mutex_unlock(&client->mm_lock); + return ERR_PTR(ret); +} + +static struct amdxdna_gem_obj * +amdxdna_drm_create_cmd_bo(struct drm_device *dev, + struct amdxdna_drm_create_bo *args, + struct drm_file *filp) +{ + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct drm_gem_shmem_object *shmem; + struct amdxdna_gem_obj *abo; + struct iosys_map map; + int ret; + + if (args->size > XDNA_MAX_CMD_BO_SIZE) { + XDNA_ERR(xdna, "Command bo size 0x%llx too large", args->size); + return ERR_PTR(-EINVAL); + } + + if (args->size < sizeof(struct amdxdna_cmd)) { + XDNA_DBG(xdna, "Command BO size 0x%llx too small", args->size); + return ERR_PTR(-EINVAL); + } + + shmem = drm_gem_shmem_create(dev, args->size); + if (IS_ERR(shmem)) + return ERR_CAST(shmem); + + shmem->map_wc = false; + abo = to_xdna_obj(&shmem->base); + + abo->type = AMDXDNA_BO_CMD; + abo->client = filp->driver_priv; + + ret = drm_gem_vmap_unlocked(to_gobj(abo), &map); + if (ret) { + XDNA_ERR(xdna, "Vmap cmd bo failed, ret %d", ret); + 
goto release_obj; + } + abo->mem.kva = map.vaddr; + + return abo; + +release_obj: + drm_gem_shmem_free(shmem); + return ERR_PTR(ret); +} + +int amdxdna_drm_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) +{ + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct amdxdna_drm_create_bo *args = data; + struct amdxdna_gem_obj *abo; + int ret; + + if (args->flags || args->vaddr || !args->size) + return -EINVAL; + + XDNA_DBG(xdna, "BO arg type %d vaddr 0x%llx size 0x%llx flags 0x%llx", + args->type, args->vaddr, args->size, args->flags); + switch (args->type) { + case AMDXDNA_BO_SHMEM: + abo = amdxdna_drm_alloc_shmem(dev, args, filp); + break; + case AMDXDNA_BO_DEV_HEAP: + abo = amdxdna_drm_create_dev_heap(dev, args, filp); + break; + case AMDXDNA_BO_DEV: + abo = amdxdna_drm_alloc_dev_bo(dev, args, filp, false); + break; + case AMDXDNA_BO_CMD: + abo = amdxdna_drm_create_cmd_bo(dev, args, filp); + break; + default: + return -EINVAL; + } + if (IS_ERR(abo)) + return PTR_ERR(abo); + + /* ready to publish object to userspace */ + ret = drm_gem_handle_create(filp, to_gobj(abo), &args->handle); + if (ret) { + XDNA_ERR(xdna, "Create handle failed"); + goto put_obj; + } + + XDNA_DBG(xdna, "BO hdl %d type %d userptr 0x%llx xdna_addr 0x%llx size 0x%lx", + args->handle, args->type, abo->mem.userptr, + abo->mem.dev_addr, abo->mem.size); +put_obj: + /* Dereference object reference. Handle holds it now. */ + drm_gem_object_put(to_gobj(abo)); + return ret; +} + +int amdxdna_gem_pin_nolock(struct amdxdna_gem_obj *abo) +{ + struct amdxdna_dev *xdna = to_xdna_dev(to_gobj(abo)->dev); + int ret; + + switch (abo->type) { + case AMDXDNA_BO_SHMEM: + case AMDXDNA_BO_DEV_HEAP: + ret = drm_gem_shmem_pin(&abo->base); + break; + case AMDXDNA_BO_DEV: + ret = drm_gem_shmem_pin(&abo->dev_heap->base); + break; + default: + ret = -EOPNOTSUPP; + } + + XDNA_DBG(xdna, "BO type %d ret %d", abo->type, ret); + return ret; +} + +int amdxdna_gem_pin(struct amdxdna_gem_obj *abo) +{ + int ret; + + if (abo->type == AMDXDNA_BO_DEV) + abo = abo->dev_heap; + + mutex_lock(&abo->lock); + ret = amdxdna_gem_pin_nolock(abo); + mutex_unlock(&abo->lock); + + return ret; +} + +void amdxdna_gem_unpin(struct amdxdna_gem_obj *abo) +{ + if (abo->type == AMDXDNA_BO_DEV) + abo = abo->dev_heap; + + mutex_lock(&abo->lock); + drm_gem_shmem_unpin(&abo->base); + mutex_unlock(&abo->lock); +} + +struct amdxdna_gem_obj *amdxdna_gem_get_obj(struct amdxdna_client *client, + u32 bo_hdl, u8 bo_type) +{ + struct amdxdna_dev *xdna = client->xdna; + struct amdxdna_gem_obj *abo; + struct drm_gem_object *gobj; + + gobj = drm_gem_object_lookup(client->filp, bo_hdl); + if (!gobj) { + XDNA_DBG(xdna, "Can not find bo %d", bo_hdl); + return NULL; + } + + abo = to_xdna_obj(gobj); + if (bo_type == AMDXDNA_BO_INVALID || abo->type == bo_type) + return abo; + + drm_gem_object_put(gobj); + return NULL; +} + +int amdxdna_drm_get_bo_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) +{ + struct amdxdna_drm_get_bo_info *args = data; + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct amdxdna_gem_obj *abo; + struct drm_gem_object *gobj; + int ret = 0; + + if (args->ext || args->ext_flags || args->pad) + return -EINVAL; + + gobj = drm_gem_object_lookup(filp, args->handle); + if (!gobj) { + XDNA_DBG(xdna, "Lookup GEM object %d failed", args->handle); + return -ENOENT; + } + + abo = to_xdna_obj(gobj); + args->vaddr = abo->mem.userptr; + args->xdna_addr = abo->mem.dev_addr; + + if (abo->type != AMDXDNA_BO_DEV) + args->map_offset = 
drm_vma_node_offset_addr(&gobj->vma_node); + else + args->map_offset = AMDXDNA_INVALID_ADDR; + + XDNA_DBG(xdna, "BO hdl %d map_offset 0x%llx vaddr 0x%llx xdna_addr 0x%llx", + args->handle, args->map_offset, args->vaddr, args->xdna_addr); + + drm_gem_object_put(gobj); + return ret; +} + +/* + * The sync bo ioctl is to make sure the CPU cache is in sync with memory. + * This is required because NPU is not cache coherent device. CPU cache + * flushing/invalidation is expensive so it is best to handle this outside + * of the command submission path. This ioctl allows explicit cache + * flushing/invalidation outside of the critical path. + */ +int amdxdna_drm_sync_bo_ioctl(struct drm_device *dev, + void *data, struct drm_file *filp) +{ + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct amdxdna_drm_sync_bo *args = data; + struct amdxdna_gem_obj *abo; + struct drm_gem_object *gobj; + int ret; + + gobj = drm_gem_object_lookup(filp, args->handle); + if (!gobj) { + XDNA_ERR(xdna, "Lookup GEM object failed"); + return -ENOENT; + } + abo = to_xdna_obj(gobj); + + ret = amdxdna_gem_pin(abo); + if (ret) { + XDNA_ERR(xdna, "Pin BO %d failed, ret %d", args->handle, ret); + goto put_obj; + } + + if (abo->type == AMDXDNA_BO_DEV) + drm_clflush_pages(abo->mem.pages, abo->mem.nr_pages); + else + drm_clflush_pages(abo->base.pages, gobj->size >> PAGE_SHIFT); + + amdxdna_gem_unpin(abo); + + XDNA_DBG(xdna, "Sync bo %d offset 0x%llx, size 0x%llx\n", + args->handle, args->offset, args->size); + +put_obj: + drm_gem_object_put(gobj); + return ret; +} diff --git a/drivers/accel/amdxdna/amdxdna_gem.h b/drivers/accel/amdxdna/amdxdna_gem.h new file mode 100644 index 000000000000..8ccc0375dd9d --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_gem.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2024, Advanced Micro Devices, Inc. 
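For the AMDXDNA_BO_DEV path above, amdxdna_gem_insert_node_locked() keeps every view of a device BO at the same offset from the corresponding view of its heap. A small worked sketch of that relationship; the struct, function and numbers are hypothetical and not part of the driver:

#include <linux/mm.h>
#include <linux/types.h>

struct dev_bo_view {
        u64 dev_addr;           /* NPU-visible address (the drm_mm node start) */
        u64 userptr;            /* CPU mapping of the heap plus the offset */
        unsigned long page_idx; /* index into the heap's pages[] array */
};

static struct dev_bo_view dev_bo_views(u64 heap_dev_addr, u64 heap_userptr,
                                       u64 node_start)
{
        u64 offset = node_start - heap_dev_addr;

        return (struct dev_bo_view) {
                .dev_addr = node_start,
                .userptr = heap_userptr + offset,
                .page_idx = offset >> PAGE_SHIFT,
        };
}

For example, a heap at device address 0x4000000 mapped at user address 0x7f0000000000 with a node starting at 0x4010000 gives offset 0x10000, user pointer 0x7f0000010000 and page index 16 with 4 KiB pages.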
+ */ + +#ifndef _AMDXDNA_GEM_H_ +#define _AMDXDNA_GEM_H_ + +struct amdxdna_mem { + u64 userptr; + void *kva; + u64 dev_addr; + size_t size; + struct page **pages; + u32 nr_pages; + struct mmu_interval_notifier notifier; + unsigned long *pfns; + bool map_invalid; +}; + +struct amdxdna_gem_obj { + struct drm_gem_shmem_object base; + struct amdxdna_client *client; + u8 type; + bool pinned; + struct mutex lock; /* Protects: pinned */ + struct amdxdna_mem mem; + + /* Below members is uninitialized when needed */ + struct drm_mm mm; /* For AMDXDNA_BO_DEV_HEAP */ + struct amdxdna_gem_obj *dev_heap; /* For AMDXDNA_BO_DEV */ + struct drm_mm_node mm_node; /* For AMDXDNA_BO_DEV */ + u32 assigned_hwctx; +}; + +#define to_gobj(obj) (&(obj)->base.base) + +static inline struct amdxdna_gem_obj *to_xdna_obj(struct drm_gem_object *gobj) +{ + return container_of(gobj, struct amdxdna_gem_obj, base.base); +} + +struct amdxdna_gem_obj *amdxdna_gem_get_obj(struct amdxdna_client *client, + u32 bo_hdl, u8 bo_type); +static inline void amdxdna_gem_put_obj(struct amdxdna_gem_obj *abo) +{ + drm_gem_object_put(to_gobj(abo)); +} + +struct drm_gem_object * +amdxdna_gem_create_object_cb(struct drm_device *dev, size_t size); +struct amdxdna_gem_obj * +amdxdna_drm_alloc_dev_bo(struct drm_device *dev, + struct amdxdna_drm_create_bo *args, + struct drm_file *filp, bool use_vmap); + +int amdxdna_gem_pin_nolock(struct amdxdna_gem_obj *abo); +int amdxdna_gem_pin(struct amdxdna_gem_obj *abo); +void amdxdna_gem_unpin(struct amdxdna_gem_obj *abo); + +int amdxdna_drm_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int amdxdna_drm_get_bo_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int amdxdna_drm_sync_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); + +#endif /* _AMDXDNA_GEM_H_ */ diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.c b/drivers/accel/amdxdna/amdxdna_mailbox.c new file mode 100644 index 000000000000..e5301fac1397 --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_mailbox.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#include <drm/drm_device.h> +#include <drm/drm_managed.h> +#include <linux/bitfield.h> +#include <linux/interrupt.h> +#include <linux/iopoll.h> +#include <linux/slab.h> +#include <linux/xarray.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/amdxdna.h> + +#include "amdxdna_mailbox.h" + +#define MB_ERR(chann, fmt, args...) \ +({ \ + typeof(chann) _chann = chann; \ + dev_err((_chann)->mb->dev, "xdna_mailbox.%d: "fmt, \ + (_chann)->msix_irq, ##args); \ +}) +#define MB_DBG(chann, fmt, args...) \ +({ \ + typeof(chann) _chann = chann; \ + dev_dbg((_chann)->mb->dev, "xdna_mailbox.%d: "fmt, \ + (_chann)->msix_irq, ##args); \ +}) +#define MB_WARN_ONCE(chann, fmt, args...) 
\ +({ \ + typeof(chann) _chann = chann; \ + dev_warn_once((_chann)->mb->dev, "xdna_mailbox.%d: "fmt, \ + (_chann)->msix_irq, ##args); \ +}) + +#define MAGIC_VAL 0x1D000000U +#define MAGIC_VAL_MASK 0xFF000000 +#define MAX_MSG_ID_ENTRIES 256 +#define MSG_RX_TIMER 200 /* milliseconds */ +#define MAILBOX_NAME "xdna_mailbox" + +enum channel_res_type { + CHAN_RES_X2I, + CHAN_RES_I2X, + CHAN_RES_NUM +}; + +struct mailbox { + struct device *dev; + struct xdna_mailbox_res res; +}; + +struct mailbox_channel { + struct mailbox *mb; + struct xdna_mailbox_chann_res res[CHAN_RES_NUM]; + int msix_irq; + u32 iohub_int_addr; + struct xarray chan_xa; + u32 next_msgid; + u32 x2i_tail; + + /* Received msg related fields */ + struct workqueue_struct *work_q; + struct work_struct rx_work; + u32 i2x_head; + bool bad_state; +}; + +#define MSG_BODY_SZ GENMASK(10, 0) +#define MSG_PROTO_VER GENMASK(23, 16) +struct xdna_msg_header { + __u32 total_size; + __u32 sz_ver; + __u32 id; + __u32 opcode; +} __packed; + +static_assert(sizeof(struct xdna_msg_header) == 16); + +struct mailbox_pkg { + struct xdna_msg_header header; + __u32 payload[]; +}; + +/* The protocol version. */ +#define MSG_PROTOCOL_VERSION 0x1 +/* The tombstone value. */ +#define TOMBSTONE 0xDEADFACE + +struct mailbox_msg { + void *handle; + int (*notify_cb)(void *handle, const u32 *data, size_t size); + size_t pkg_size; /* package size in bytes */ + struct mailbox_pkg pkg; +}; + +static void mailbox_reg_write(struct mailbox_channel *mb_chann, u32 mbox_reg, u32 data) +{ + struct xdna_mailbox_res *mb_res = &mb_chann->mb->res; + void __iomem *ringbuf_addr = mb_res->mbox_base + mbox_reg; + + writel(data, ringbuf_addr); +} + +static u32 mailbox_reg_read(struct mailbox_channel *mb_chann, u32 mbox_reg) +{ + struct xdna_mailbox_res *mb_res = &mb_chann->mb->res; + void __iomem *ringbuf_addr = mb_res->mbox_base + mbox_reg; + + return readl(ringbuf_addr); +} + +static int mailbox_reg_read_non_zero(struct mailbox_channel *mb_chann, u32 mbox_reg, u32 *val) +{ + struct xdna_mailbox_res *mb_res = &mb_chann->mb->res; + void __iomem *ringbuf_addr = mb_res->mbox_base + mbox_reg; + int ret, value; + + /* Poll till value is not zero */ + ret = readx_poll_timeout(readl, ringbuf_addr, value, + value, 1 /* us */, 100); + if (ret < 0) + return ret; + + *val = value; + return 0; +} + +static inline void +mailbox_set_headptr(struct mailbox_channel *mb_chann, u32 headptr_val) +{ + mailbox_reg_write(mb_chann, mb_chann->res[CHAN_RES_I2X].mb_head_ptr_reg, headptr_val); + mb_chann->i2x_head = headptr_val; +} + +static inline void +mailbox_set_tailptr(struct mailbox_channel *mb_chann, u32 tailptr_val) +{ + mailbox_reg_write(mb_chann, mb_chann->res[CHAN_RES_X2I].mb_tail_ptr_reg, tailptr_val); + mb_chann->x2i_tail = tailptr_val; +} + +static inline u32 +mailbox_get_headptr(struct mailbox_channel *mb_chann, enum channel_res_type type) +{ + return mailbox_reg_read(mb_chann, mb_chann->res[type].mb_head_ptr_reg); +} + +static inline u32 +mailbox_get_tailptr(struct mailbox_channel *mb_chann, enum channel_res_type type) +{ + return mailbox_reg_read(mb_chann, mb_chann->res[type].mb_tail_ptr_reg); +} + +static inline u32 +mailbox_get_ringbuf_size(struct mailbox_channel *mb_chann, enum channel_res_type type) +{ + return mb_chann->res[type].rb_size; +} + +static inline int mailbox_validate_msgid(int msg_id) +{ + return (msg_id & MAGIC_VAL_MASK) == MAGIC_VAL; +} + +static int mailbox_acquire_msgid(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg) +{ + u32 msg_id; + int ret; + + ret = 
xa_alloc_cyclic_irq(&mb_chann->chan_xa, &msg_id, mb_msg, + XA_LIMIT(0, MAX_MSG_ID_ENTRIES - 1), + &mb_chann->next_msgid, GFP_NOWAIT); + if (ret < 0) + return ret; + + /* + * Add MAGIC_VAL to the higher bits. + */ + msg_id |= MAGIC_VAL; + return msg_id; +} + +static void mailbox_release_msgid(struct mailbox_channel *mb_chann, int msg_id) +{ + msg_id &= ~MAGIC_VAL_MASK; + xa_erase_irq(&mb_chann->chan_xa, msg_id); +} + +static void mailbox_release_msg(struct mailbox_channel *mb_chann, + struct mailbox_msg *mb_msg) +{ + MB_DBG(mb_chann, "msg_id 0x%x msg opcode 0x%x", + mb_msg->pkg.header.id, mb_msg->pkg.header.opcode); + mb_msg->notify_cb(mb_msg->handle, NULL, 0); + kfree(mb_msg); +} + +static int +mailbox_send_msg(struct mailbox_channel *mb_chann, struct mailbox_msg *mb_msg) +{ + void __iomem *write_addr; + u32 ringbuf_size; + u32 head, tail; + u32 start_addr; + u32 tmp_tail; + + head = mailbox_get_headptr(mb_chann, CHAN_RES_X2I); + tail = mb_chann->x2i_tail; + ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I); + start_addr = mb_chann->res[CHAN_RES_X2I].rb_start_addr; + tmp_tail = tail + mb_msg->pkg_size; + + if (tail < head && tmp_tail >= head) + goto no_space; + + if (tail >= head && (tmp_tail > ringbuf_size - sizeof(u32) && + mb_msg->pkg_size >= head)) + goto no_space; + + if (tail >= head && tmp_tail > ringbuf_size - sizeof(u32)) { + write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail; + writel(TOMBSTONE, write_addr); + + /* tombstone is set. Write from the start of the ringbuf */ + tail = 0; + } + + write_addr = mb_chann->mb->res.ringbuf_base + start_addr + tail; + memcpy_toio(write_addr, &mb_msg->pkg, mb_msg->pkg_size); + mailbox_set_tailptr(mb_chann, tail + mb_msg->pkg_size); + + trace_mbox_set_tail(MAILBOX_NAME, mb_chann->msix_irq, + mb_msg->pkg.header.opcode, + mb_msg->pkg.header.id); + + return 0; + +no_space: + return -ENOSPC; +} + +static int +mailbox_get_resp(struct mailbox_channel *mb_chann, struct xdna_msg_header *header, + void *data) +{ + struct mailbox_msg *mb_msg; + int msg_id; + int ret; + + msg_id = header->id; + if (!mailbox_validate_msgid(msg_id)) { + MB_ERR(mb_chann, "Bad message ID 0x%x", msg_id); + return -EINVAL; + } + + msg_id &= ~MAGIC_VAL_MASK; + mb_msg = xa_erase_irq(&mb_chann->chan_xa, msg_id); + if (!mb_msg) { + MB_ERR(mb_chann, "Cannot find msg 0x%x", msg_id); + return -EINVAL; + } + + MB_DBG(mb_chann, "opcode 0x%x size %d id 0x%x", + header->opcode, header->total_size, header->id); + ret = mb_msg->notify_cb(mb_msg->handle, data, header->total_size); + if (unlikely(ret)) + MB_ERR(mb_chann, "Message callback ret %d", ret); + + kfree(mb_msg); + return ret; +} + +static int mailbox_get_msg(struct mailbox_channel *mb_chann) +{ + struct xdna_msg_header header; + void __iomem *read_addr; + u32 msg_size, rest; + u32 ringbuf_size; + u32 head, tail; + u32 start_addr; + int ret; + + if (mailbox_reg_read_non_zero(mb_chann, mb_chann->res[CHAN_RES_I2X].mb_tail_ptr_reg, &tail)) + return -EINVAL; + head = mb_chann->i2x_head; + ringbuf_size = mailbox_get_ringbuf_size(mb_chann, CHAN_RES_I2X); + start_addr = mb_chann->res[CHAN_RES_I2X].rb_start_addr; + + if (unlikely(tail > ringbuf_size || !IS_ALIGNED(tail, 4))) { + MB_WARN_ONCE(mb_chann, "Invalid tail 0x%x", tail); + return -EINVAL; + } + + /* ringbuf empty */ + if (head == tail) + return -ENOENT; + + if (head == ringbuf_size) + head = 0; + + /* Peek size of the message or TOMBSTONE */ + read_addr = mb_chann->mb->res.ringbuf_base + start_addr + head; + header.total_size = readl(read_addr); + /* size 
is TOMBSTONE, set next read from 0 */ + if (header.total_size == TOMBSTONE) { + if (head < tail) { + MB_WARN_ONCE(mb_chann, "Tombstone, head 0x%x tail 0x%x", + head, tail); + return -EINVAL; + } + mailbox_set_headptr(mb_chann, 0); + return 0; + } + + if (unlikely(!header.total_size || !IS_ALIGNED(header.total_size, 4))) { + MB_WARN_ONCE(mb_chann, "Invalid total size 0x%x", header.total_size); + return -EINVAL; + } + msg_size = sizeof(header) + header.total_size; + + if (msg_size > ringbuf_size - head || msg_size > tail - head) { + MB_WARN_ONCE(mb_chann, "Invalid message size %d, tail %d, head %d", + msg_size, tail, head); + return -EINVAL; + } + + rest = sizeof(header) - sizeof(u32); + read_addr += sizeof(u32); + memcpy_fromio((u32 *)&header + 1, read_addr, rest); + read_addr += rest; + + ret = mailbox_get_resp(mb_chann, &header, (u32 *)read_addr); + + mailbox_set_headptr(mb_chann, head + msg_size); + /* After update head, it can equal to ringbuf_size. This is expected. */ + trace_mbox_set_head(MAILBOX_NAME, mb_chann->msix_irq, + header.opcode, header.id); + + return ret; +} + +static irqreturn_t mailbox_irq_handler(int irq, void *p) +{ + struct mailbox_channel *mb_chann = p; + + trace_mbox_irq_handle(MAILBOX_NAME, irq); + /* Schedule a rx_work to call the callback functions */ + queue_work(mb_chann->work_q, &mb_chann->rx_work); + /* Clear IOHUB register */ + mailbox_reg_write(mb_chann, mb_chann->iohub_int_addr, 0); + + return IRQ_HANDLED; +} + +static void mailbox_rx_worker(struct work_struct *rx_work) +{ + struct mailbox_channel *mb_chann; + int ret; + + mb_chann = container_of(rx_work, struct mailbox_channel, rx_work); + + if (READ_ONCE(mb_chann->bad_state)) { + MB_ERR(mb_chann, "Channel in bad state, work aborted"); + return; + } + + while (1) { + /* + * If return is 0, keep consuming next message, until there is + * no messages or an error happened. + */ + ret = mailbox_get_msg(mb_chann); + if (ret == -ENOENT) + break; + + /* Other error means device doesn't look good, disable irq. */ + if (unlikely(ret)) { + MB_ERR(mb_chann, "Unexpected ret %d, disable irq", ret); + WRITE_ONCE(mb_chann->bad_state, true); + disable_irq(mb_chann->msix_irq); + break; + } + } +} + +int xdna_mailbox_send_msg(struct mailbox_channel *mb_chann, + const struct xdna_mailbox_msg *msg, u64 tx_timeout) +{ + struct xdna_msg_header *header; + struct mailbox_msg *mb_msg; + size_t pkg_size; + int ret; + + pkg_size = sizeof(*header) + msg->send_size; + if (pkg_size > mailbox_get_ringbuf_size(mb_chann, CHAN_RES_X2I)) { + MB_ERR(mb_chann, "Message size larger than ringbuf size"); + return -EINVAL; + } + + if (unlikely(!IS_ALIGNED(msg->send_size, 4))) { + MB_ERR(mb_chann, "Message must be 4 bytes align"); + return -EINVAL; + } + + /* The fist word in payload can NOT be TOMBSTONE */ + if (unlikely(((u32 *)msg->send_data)[0] == TOMBSTONE)) { + MB_ERR(mb_chann, "Tomb stone in data"); + return -EINVAL; + } + + if (READ_ONCE(mb_chann->bad_state)) { + MB_ERR(mb_chann, "Channel in bad state"); + return -EPIPE; + } + + mb_msg = kzalloc(sizeof(*mb_msg) + pkg_size, GFP_KERNEL); + if (!mb_msg) + return -ENOMEM; + + mb_msg->handle = msg->handle; + mb_msg->notify_cb = msg->notify_cb; + mb_msg->pkg_size = pkg_size; + + header = &mb_msg->pkg.header; + /* + * Hardware use total_size and size to split huge message. + * We do not support it here. Thus the values are the same. 
+ */ + header->total_size = msg->send_size; + header->sz_ver = FIELD_PREP(MSG_BODY_SZ, msg->send_size) | + FIELD_PREP(MSG_PROTO_VER, MSG_PROTOCOL_VERSION); + header->opcode = msg->opcode; + memcpy(mb_msg->pkg.payload, msg->send_data, msg->send_size); + + ret = mailbox_acquire_msgid(mb_chann, mb_msg); + if (unlikely(ret < 0)) { + MB_ERR(mb_chann, "mailbox_acquire_msgid failed"); + goto msg_id_failed; + } + header->id = ret; + + MB_DBG(mb_chann, "opcode 0x%x size %d id 0x%x", + header->opcode, header->total_size, header->id); + + ret = mailbox_send_msg(mb_chann, mb_msg); + if (ret) { + MB_DBG(mb_chann, "Error in mailbox send msg, ret %d", ret); + goto release_id; + } + + return 0; + +release_id: + mailbox_release_msgid(mb_chann, header->id); +msg_id_failed: + kfree(mb_msg); + return ret; +} + +struct mailbox_channel * +xdna_mailbox_create_channel(struct mailbox *mb, + const struct xdna_mailbox_chann_res *x2i, + const struct xdna_mailbox_chann_res *i2x, + u32 iohub_int_addr, + int mb_irq) +{ + struct mailbox_channel *mb_chann; + int ret; + + if (!is_power_of_2(x2i->rb_size) || !is_power_of_2(i2x->rb_size)) { + pr_err("Ring buf size must be power of 2"); + return NULL; + } + + mb_chann = kzalloc(sizeof(*mb_chann), GFP_KERNEL); + if (!mb_chann) + return NULL; + + mb_chann->mb = mb; + mb_chann->msix_irq = mb_irq; + mb_chann->iohub_int_addr = iohub_int_addr; + memcpy(&mb_chann->res[CHAN_RES_X2I], x2i, sizeof(*x2i)); + memcpy(&mb_chann->res[CHAN_RES_I2X], i2x, sizeof(*i2x)); + + xa_init_flags(&mb_chann->chan_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); + mb_chann->x2i_tail = mailbox_get_tailptr(mb_chann, CHAN_RES_X2I); + mb_chann->i2x_head = mailbox_get_headptr(mb_chann, CHAN_RES_I2X); + + INIT_WORK(&mb_chann->rx_work, mailbox_rx_worker); + mb_chann->work_q = create_singlethread_workqueue(MAILBOX_NAME); + if (!mb_chann->work_q) { + MB_ERR(mb_chann, "Create workqueue failed"); + goto free_and_out; + } + + /* Everything look good. Time to enable irq handler */ + ret = request_irq(mb_irq, mailbox_irq_handler, 0, MAILBOX_NAME, mb_chann); + if (ret) { + MB_ERR(mb_chann, "Failed to request irq %d ret %d", mb_irq, ret); + goto destroy_wq; + } + + mb_chann->bad_state = false; + + MB_DBG(mb_chann, "Mailbox channel created (irq: %d)", mb_chann->msix_irq); + return mb_chann; + +destroy_wq: + destroy_workqueue(mb_chann->work_q); +free_and_out: + kfree(mb_chann); + return NULL; +} + +int xdna_mailbox_destroy_channel(struct mailbox_channel *mb_chann) +{ + struct mailbox_msg *mb_msg; + unsigned long msg_id; + + MB_DBG(mb_chann, "IRQ disabled and RX work cancelled"); + free_irq(mb_chann->msix_irq, mb_chann); + destroy_workqueue(mb_chann->work_q); + /* We can clean up and release resources */ + + xa_for_each(&mb_chann->chan_xa, msg_id, mb_msg) + mailbox_release_msg(mb_chann, mb_msg); + + xa_destroy(&mb_chann->chan_xa); + + MB_DBG(mb_chann, "Mailbox channel destroyed, irq: %d", mb_chann->msix_irq); + kfree(mb_chann); + return 0; +} + +void xdna_mailbox_stop_channel(struct mailbox_channel *mb_chann) +{ + /* Disable an irq and wait. This might sleep. 
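Putting the channel API together, a hypothetical client would create a channel, send a message and tear the channel down roughly as sketched below. The resource values, IRQ number, opcode and my_resp_cb() are invented; real users go through the aie2 layer and the helpers in amdxdna_mailbox_helper.c.

static int my_resp_cb(void *handle, const u32 *data, size_t size)
{
        /* A NULL @data means the message is dropped at channel teardown. */
        return 0;
}

static int my_mailbox_roundtrip(struct mailbox *mb,
                                const struct xdna_mailbox_chann_res *x2i,
                                const struct xdna_mailbox_chann_res *i2x,
                                u32 iohub_int_addr, int irq)
{
        u32 payload[2] = { 0x1, 0x2 };          /* size must be 4-byte aligned */
        struct xdna_mailbox_msg msg = {
                .opcode = 0x42,                 /* hypothetical firmware opcode */
                .notify_cb = my_resp_cb,
                .send_data = (u8 *)payload,
                .send_size = sizeof(payload),
        };
        struct mailbox_channel *chann;
        int ret;

        chann = xdna_mailbox_create_channel(mb, x2i, i2x, iohub_int_addr, irq);
        if (!chann)
                return -ENOMEM;

        ret = xdna_mailbox_send_msg(chann, &msg, 1000 /* ms */);

        xdna_mailbox_stop_channel(chann);       /* disable IRQ, flush rx work */
        xdna_mailbox_destroy_channel(chann);    /* drops unanswered messages */

        return ret;
}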
*/ + disable_irq(mb_chann->msix_irq); + + /* Cancel RX work and wait for it to finish */ + cancel_work_sync(&mb_chann->rx_work); + MB_DBG(mb_chann, "IRQ disabled and RX work cancelled"); +} + +struct mailbox *xdnam_mailbox_create(struct drm_device *ddev, + const struct xdna_mailbox_res *res) +{ + struct mailbox *mb; + + mb = drmm_kzalloc(ddev, sizeof(*mb), GFP_KERNEL); + if (!mb) + return NULL; + mb->dev = ddev->dev; + + /* mailbox and ring buf base and size information */ + memcpy(&mb->res, res, sizeof(*res)); + + return mb; +} diff --git a/drivers/accel/amdxdna/amdxdna_mailbox.h b/drivers/accel/amdxdna/amdxdna_mailbox.h new file mode 100644 index 000000000000..57954c303bdd --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_mailbox.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#ifndef _AIE2_MAILBOX_H_ +#define _AIE2_MAILBOX_H_ + +struct mailbox; +struct mailbox_channel; + +/* + * xdna_mailbox_msg - message struct + * + * @opcode: opcode for firmware + * @handle: handle used for the notify callback + * @notify_cb: callback function to notify the sender when there is response + * @send_data: pointing to sending data + * @send_size: size of the sending data + * + * The mailbox will split the sending data in to multiple firmware message if + * the size of the data is too big. This is transparent to the sender. The + * sender will receive one notification. + */ +struct xdna_mailbox_msg { + u32 opcode; + void *handle; + int (*notify_cb)(void *handle, const u32 *data, size_t size); + u8 *send_data; + size_t send_size; +}; + +/* + * xdna_mailbox_res - mailbox hardware resource + * + * @ringbuf_base: ring buffer base address + * @ringbuf_size: ring buffer size + * @mbox_base: mailbox base address + * @mbox_size: mailbox size + */ +struct xdna_mailbox_res { + void __iomem *ringbuf_base; + size_t ringbuf_size; + void __iomem *mbox_base; + size_t mbox_size; + const char *name; +}; + +/* + * xdna_mailbox_chann_res - resources + * + * @rb_start_addr: ring buffer start address + * @rb_size: ring buffer size + * @mb_head_ptr_reg: mailbox head pointer register + * @mb_tail_ptr_reg: mailbox tail pointer register + */ +struct xdna_mailbox_chann_res { + u32 rb_start_addr; + u32 rb_size; + u32 mb_head_ptr_reg; + u32 mb_tail_ptr_reg; +}; + +/* + * xdna_mailbox_create() -- create mailbox subsystem and initialize + * + * @ddev: device pointer + * @res: SRAM and mailbox resources + * + * Return: If success, return a handle of mailbox subsystem. + * Otherwise, return NULL pointer. + */ +struct mailbox *xdnam_mailbox_create(struct drm_device *ddev, + const struct xdna_mailbox_res *res); + +/* + * xdna_mailbox_create_channel() -- Create a mailbox channel instance + * + * @mailbox: the handle return from xdna_mailbox_create() + * @x2i: host to firmware mailbox resources + * @i2x: firmware to host mailbox resources + * @xdna_mailbox_intr_reg: register addr of MSI-X interrupt + * @mb_irq: Linux IRQ number associated with mailbox MSI-X interrupt vector index + * + * Return: If success, return a handle of mailbox channel. Otherwise, return NULL. + */ +struct mailbox_channel * +xdna_mailbox_create_channel(struct mailbox *mailbox, + const struct xdna_mailbox_chann_res *x2i, + const struct xdna_mailbox_chann_res *i2x, + u32 xdna_mailbox_intr_reg, + int mb_irq); + +/* + * xdna_mailbox_destroy_channel() -- destroy mailbox channel + * + * @mailbox_chann: the handle return from xdna_mailbox_create_channel() + * + * Return: if success, return 0. 
+ */
+int xdna_mailbox_destroy_channel(struct mailbox_channel *mailbox_chann);
+
+/*
+ * xdna_mailbox_stop_channel() -- stop mailbox channel
+ *
+ * @mailbox_chann: the handle returned from xdna_mailbox_create_channel()
+ *
+ * Disable the channel interrupt and cancel any in-flight RX work.
+ */
+void xdna_mailbox_stop_channel(struct mailbox_channel *mailbox_chann);
+
+/*
+ * xdna_mailbox_send_msg() -- Send a message
+ *
+ * @mailbox_chann: Mailbox channel handle
+ * @msg: message struct for message information
+ * @tx_timeout: the timeout value for sending the message in ms.
+ *
+ * Return: 0 on success, an error code otherwise.
+ */
+int xdna_mailbox_send_msg(struct mailbox_channel *mailbox_chann,
+ const struct xdna_mailbox_msg *msg, u64 tx_timeout);
+
+#endif /* _AIE2_MAILBOX_H_ */
diff --git a/drivers/accel/amdxdna/amdxdna_mailbox_helper.c b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
new file mode 100644
index 000000000000..5139a9c96a91
--- /dev/null
+++ b/drivers/accel/amdxdna/amdxdna_mailbox_helper.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ */
+
+#include <drm/amdxdna_accel.h>
+#include <drm/drm_device.h>
+#include <drm/drm_print.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/gpu_scheduler.h>
+#include <linux/completion.h>
+
+#include "amdxdna_gem.h"
+#include "amdxdna_mailbox.h"
+#include "amdxdna_mailbox_helper.h"
+#include "amdxdna_pci_drv.h"
+
+int xdna_msg_cb(void *handle, const u32 *data, size_t size)
+{
+ struct xdna_notify *cb_arg = handle;
+ int ret;
+
+ if (unlikely(!data))
+ goto out;
+
+ if (unlikely(cb_arg->size != size)) {
+ cb_arg->error = -EINVAL;
+ goto out;
+ }
+
+ print_hex_dump_debug("resp data: ", DUMP_PREFIX_OFFSET,
+ 16, 4, data, cb_arg->size, true);
+ memcpy(cb_arg->data, data, cb_arg->size);
+out:
+ ret = cb_arg->error;
+ complete(&cb_arg->comp);
+ return ret;
+}
+
+int xdna_send_msg_wait(struct amdxdna_dev *xdna, struct mailbox_channel *chann,
+ struct xdna_mailbox_msg *msg)
+{
+ struct xdna_notify *hdl = msg->handle;
+ int ret;
+
+ ret = xdna_mailbox_send_msg(chann, msg, TX_TIMEOUT);
+ if (ret) {
+ XDNA_ERR(xdna, "Send message failed, ret %d", ret);
+ return ret;
+ }
+
+ ret = wait_for_completion_timeout(&hdl->comp,
+ msecs_to_jiffies(RX_TIMEOUT));
+ if (!ret) {
+ XDNA_ERR(xdna, "Wait for completion timeout");
+ return -ETIME;
+ }
+
+ return hdl->error;
+}
diff --git a/drivers/accel/amdxdna/amdxdna_mailbox_helper.h b/drivers/accel/amdxdna/amdxdna_mailbox_helper.h
new file mode 100644
index 000000000000..23e1317b79fe
--- /dev/null
+++ b/drivers/accel/amdxdna/amdxdna_mailbox_helper.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
+ */
+
+#ifndef _AMDXDNA_MAILBOX_HELPER_H
+#define _AMDXDNA_MAILBOX_HELPER_H
+
+#define TX_TIMEOUT 2000 /* milliseconds */
+#define RX_TIMEOUT 5000 /* milliseconds */
+
+struct amdxdna_dev;
+
+struct xdna_notify {
+ struct completion comp;
+ u32 *data;
+ size_t size;
+ int error;
+};
+
+#define DECLARE_XDNA_MSG_COMMON(name, op, status) \
+ struct name##_req req = { 0 }; \
+ struct name##_resp resp = { status }; \
+ struct xdna_notify hdl = { \
+ .error = 0, \
+ .data = (u32 *)&resp, \
+ .size = sizeof(resp), \
+ .comp = COMPLETION_INITIALIZER_ONSTACK(hdl.comp), \
+ }; \
+ struct xdna_mailbox_msg msg = { \
+ .send_data = (u8 *)&req, \
+ .send_size = sizeof(req), \
+ .handle = &hdl, \
+ .opcode = op, \
+ .notify_cb = xdna_msg_cb, \
+ }
+
+int xdna_msg_cb(void *handle, const u32 *data, size_t size);
+int xdna_send_msg_wait(struct amdxdna_dev *xdna, struct mailbox_channel *chann,
+ struct xdna_mailbox_msg *msg);
+
+#endif /* _AMDXDNA_MAILBOX_HELPER_H */
diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.c b/drivers/accel/amdxdna/amdxdna_pci_drv.c
new file mode 100644
index 000000000000..f5b8497cf5ad
--- /dev/null
+++ b/drivers/accel/amdxdna/amdxdna_pci_drv.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
+ */
+
+#include <drm/amdxdna_accel.h>
+#include <drm/drm_accel.h>
+#include <drm/drm_drv.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/drm_ioctl.h>
+#include <drm/drm_managed.h>
+#include <drm/gpu_scheduler.h>
+#include <linux/iommu.h>
+#include <linux/pci.h>
+#include <linux/pm_runtime.h>
+
+#include "amdxdna_ctx.h"
+#include "amdxdna_gem.h"
+#include "amdxdna_pci_drv.h"
+
+#define AMDXDNA_AUTOSUSPEND_DELAY 5000 /* milliseconds */
+
+MODULE_FIRMWARE("amdnpu/1502_00/npu.sbin");
+MODULE_FIRMWARE("amdnpu/17f0_10/npu.sbin");
+MODULE_FIRMWARE("amdnpu/17f0_11/npu.sbin");
+MODULE_FIRMWARE("amdnpu/17f0_20/npu.sbin");
+
+/*
+ * Bind the driver based on the (vendor_id, device_id) pair and later use the
+ * (device_id, rev_id) pair as a key to select the device. Devices with the
+ * same device_id present a very similar interface to the host driver.
+ */ +static const struct pci_device_id pci_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1502) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x17f0) }, + {0} +}; + +MODULE_DEVICE_TABLE(pci, pci_ids); + +static const struct amdxdna_device_id amdxdna_ids[] = { + { 0x1502, 0x0, &dev_npu1_info }, + { 0x17f0, 0x0, &dev_npu2_info }, + { 0x17f0, 0x10, &dev_npu4_info }, + { 0x17f0, 0x11, &dev_npu5_info }, + { 0x17f0, 0x20, &dev_npu6_info }, + {0} +}; + +static int amdxdna_drm_open(struct drm_device *ddev, struct drm_file *filp) +{ + struct amdxdna_dev *xdna = to_xdna_dev(ddev); + struct amdxdna_client *client; + int ret; + + ret = pm_runtime_resume_and_get(ddev->dev); + if (ret) { + XDNA_ERR(xdna, "Failed to get rpm, ret %d", ret); + return ret; + } + + client = kzalloc(sizeof(*client), GFP_KERNEL); + if (!client) { + ret = -ENOMEM; + goto put_rpm; + } + + client->pid = pid_nr(rcu_access_pointer(filp->pid)); + client->xdna = xdna; + + client->sva = iommu_sva_bind_device(xdna->ddev.dev, current->mm); + if (IS_ERR(client->sva)) { + ret = PTR_ERR(client->sva); + XDNA_ERR(xdna, "SVA bind device failed, ret %d", ret); + goto failed; + } + client->pasid = iommu_sva_get_pasid(client->sva); + if (client->pasid == IOMMU_PASID_INVALID) { + XDNA_ERR(xdna, "SVA get pasid failed"); + ret = -ENODEV; + goto unbind_sva; + } + mutex_init(&client->hwctx_lock); + init_srcu_struct(&client->hwctx_srcu); + xa_init_flags(&client->hwctx_xa, XA_FLAGS_ALLOC); + mutex_init(&client->mm_lock); + + mutex_lock(&xdna->dev_lock); + list_add_tail(&client->node, &xdna->client_list); + mutex_unlock(&xdna->dev_lock); + + filp->driver_priv = client; + client->filp = filp; + + XDNA_DBG(xdna, "pid %d opened", client->pid); + return 0; + +unbind_sva: + iommu_sva_unbind_device(client->sva); +failed: + kfree(client); +put_rpm: + pm_runtime_mark_last_busy(ddev->dev); + pm_runtime_put_autosuspend(ddev->dev); + + return ret; +} + +static void amdxdna_drm_close(struct drm_device *ddev, struct drm_file *filp) +{ + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_dev *xdna = to_xdna_dev(ddev); + + XDNA_DBG(xdna, "closing pid %d", client->pid); + + xa_destroy(&client->hwctx_xa); + cleanup_srcu_struct(&client->hwctx_srcu); + mutex_destroy(&client->hwctx_lock); + mutex_destroy(&client->mm_lock); + if (client->dev_heap) + drm_gem_object_put(to_gobj(client->dev_heap)); + + iommu_sva_unbind_device(client->sva); + + XDNA_DBG(xdna, "pid %d closed", client->pid); + kfree(client); + pm_runtime_mark_last_busy(ddev->dev); + pm_runtime_put_autosuspend(ddev->dev); +} + +static int amdxdna_flush(struct file *f, fl_owner_t id) +{ + struct drm_file *filp = f->private_data; + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_dev *xdna = client->xdna; + int idx; + + XDNA_DBG(xdna, "PID %d flushing...", client->pid); + if (!drm_dev_enter(&xdna->ddev, &idx)) + return 0; + + mutex_lock(&xdna->dev_lock); + list_del_init(&client->node); + mutex_unlock(&xdna->dev_lock); + amdxdna_hwctx_remove_all(client); + + drm_dev_exit(idx); + return 0; +} + +static int amdxdna_drm_get_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) +{ + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct amdxdna_drm_get_info *args = data; + int ret; + + if (!xdna->dev_info->ops->get_aie_info) + return -EOPNOTSUPP; + + XDNA_DBG(xdna, "Request parameter %u", args->param); + mutex_lock(&xdna->dev_lock); + ret = xdna->dev_info->ops->get_aie_info(client, args); + mutex_unlock(&xdna->dev_lock); + 
return ret; +} + +static int amdxdna_drm_set_state_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) +{ + struct amdxdna_client *client = filp->driver_priv; + struct amdxdna_dev *xdna = to_xdna_dev(dev); + struct amdxdna_drm_set_state *args = data; + int ret; + + if (!xdna->dev_info->ops->set_aie_state) + return -EOPNOTSUPP; + + XDNA_DBG(xdna, "Request parameter %u", args->param); + mutex_lock(&xdna->dev_lock); + ret = xdna->dev_info->ops->set_aie_state(client, args); + mutex_unlock(&xdna->dev_lock); + + return ret; +} + +static const struct drm_ioctl_desc amdxdna_drm_ioctls[] = { + /* Context */ + DRM_IOCTL_DEF_DRV(AMDXDNA_CREATE_HWCTX, amdxdna_drm_create_hwctx_ioctl, 0), + DRM_IOCTL_DEF_DRV(AMDXDNA_DESTROY_HWCTX, amdxdna_drm_destroy_hwctx_ioctl, 0), + DRM_IOCTL_DEF_DRV(AMDXDNA_CONFIG_HWCTX, amdxdna_drm_config_hwctx_ioctl, 0), + /* BO */ + DRM_IOCTL_DEF_DRV(AMDXDNA_CREATE_BO, amdxdna_drm_create_bo_ioctl, 0), + DRM_IOCTL_DEF_DRV(AMDXDNA_GET_BO_INFO, amdxdna_drm_get_bo_info_ioctl, 0), + DRM_IOCTL_DEF_DRV(AMDXDNA_SYNC_BO, amdxdna_drm_sync_bo_ioctl, 0), + /* Execution */ + DRM_IOCTL_DEF_DRV(AMDXDNA_EXEC_CMD, amdxdna_drm_submit_cmd_ioctl, 0), + /* AIE hardware */ + DRM_IOCTL_DEF_DRV(AMDXDNA_GET_INFO, amdxdna_drm_get_info_ioctl, 0), + DRM_IOCTL_DEF_DRV(AMDXDNA_SET_STATE, amdxdna_drm_set_state_ioctl, DRM_ROOT_ONLY), +}; + +static const struct file_operations amdxdna_fops = { + .owner = THIS_MODULE, + .open = accel_open, + .release = drm_release, + .flush = amdxdna_flush, + .unlocked_ioctl = drm_ioctl, + .compat_ioctl = drm_compat_ioctl, + .poll = drm_poll, + .read = drm_read, + .llseek = noop_llseek, + .mmap = drm_gem_mmap, + .fop_flags = FOP_UNSIGNED_OFFSET, +}; + +const struct drm_driver amdxdna_drm_drv = { + .driver_features = DRIVER_GEM | DRIVER_COMPUTE_ACCEL | + DRIVER_SYNCOBJ | DRIVER_SYNCOBJ_TIMELINE, + .fops = &amdxdna_fops, + .name = "amdxdna_accel_driver", + .desc = "AMD XDNA DRM implementation", + .open = amdxdna_drm_open, + .postclose = amdxdna_drm_close, + .ioctls = amdxdna_drm_ioctls, + .num_ioctls = ARRAY_SIZE(amdxdna_drm_ioctls), + + .gem_create_object = amdxdna_gem_create_object_cb, +}; + +static const struct amdxdna_dev_info * +amdxdna_get_dev_info(struct pci_dev *pdev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(amdxdna_ids); i++) { + if (pdev->device == amdxdna_ids[i].device && + pdev->revision == amdxdna_ids[i].revision) + return amdxdna_ids[i].dev_info; + } + return NULL; +} + +static int amdxdna_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct device *dev = &pdev->dev; + struct amdxdna_dev *xdna; + int ret; + + xdna = devm_drm_dev_alloc(dev, &amdxdna_drm_drv, typeof(*xdna), ddev); + if (IS_ERR(xdna)) + return PTR_ERR(xdna); + + xdna->dev_info = amdxdna_get_dev_info(pdev); + if (!xdna->dev_info) + return -ENODEV; + + drmm_mutex_init(&xdna->ddev, &xdna->dev_lock); + init_rwsem(&xdna->notifier_lock); + INIT_LIST_HEAD(&xdna->client_list); + pci_set_drvdata(pdev, xdna); + + if (IS_ENABLED(CONFIG_LOCKDEP)) { + fs_reclaim_acquire(GFP_KERNEL); + might_lock(&xdna->notifier_lock); + fs_reclaim_release(GFP_KERNEL); + } + + mutex_lock(&xdna->dev_lock); + ret = xdna->dev_info->ops->init(xdna); + mutex_unlock(&xdna->dev_lock); + if (ret) { + XDNA_ERR(xdna, "Hardware init failed, ret %d", ret); + return ret; + } + + ret = amdxdna_sysfs_init(xdna); + if (ret) { + XDNA_ERR(xdna, "Create amdxdna attrs failed: %d", ret); + goto failed_dev_fini; + } + + pm_runtime_set_autosuspend_delay(dev, AMDXDNA_AUTOSUSPEND_DELAY); + pm_runtime_use_autosuspend(dev); + 
pm_runtime_allow(dev); + + ret = drm_dev_register(&xdna->ddev, 0); + if (ret) { + XDNA_ERR(xdna, "DRM register failed, ret %d", ret); + pm_runtime_forbid(dev); + goto failed_sysfs_fini; + } + + pm_runtime_mark_last_busy(dev); + pm_runtime_put_autosuspend(dev); + return 0; + +failed_sysfs_fini: + amdxdna_sysfs_fini(xdna); +failed_dev_fini: + mutex_lock(&xdna->dev_lock); + xdna->dev_info->ops->fini(xdna); + mutex_unlock(&xdna->dev_lock); + return ret; +} + +static void amdxdna_remove(struct pci_dev *pdev) +{ + struct amdxdna_dev *xdna = pci_get_drvdata(pdev); + struct device *dev = &pdev->dev; + struct amdxdna_client *client; + + pm_runtime_get_noresume(dev); + pm_runtime_forbid(dev); + + drm_dev_unplug(&xdna->ddev); + amdxdna_sysfs_fini(xdna); + + mutex_lock(&xdna->dev_lock); + client = list_first_entry_or_null(&xdna->client_list, + struct amdxdna_client, node); + while (client) { + list_del_init(&client->node); + mutex_unlock(&xdna->dev_lock); + + amdxdna_hwctx_remove_all(client); + + mutex_lock(&xdna->dev_lock); + client = list_first_entry_or_null(&xdna->client_list, + struct amdxdna_client, node); + } + + xdna->dev_info->ops->fini(xdna); + mutex_unlock(&xdna->dev_lock); +} + +static int amdxdna_dev_suspend_nolock(struct amdxdna_dev *xdna) +{ + if (xdna->dev_info->ops->suspend) + xdna->dev_info->ops->suspend(xdna); + + return 0; +} + +static int amdxdna_dev_resume_nolock(struct amdxdna_dev *xdna) +{ + if (xdna->dev_info->ops->resume) + return xdna->dev_info->ops->resume(xdna); + + return 0; +} + +static int amdxdna_pmops_suspend(struct device *dev) +{ + struct amdxdna_dev *xdna = pci_get_drvdata(to_pci_dev(dev)); + struct amdxdna_client *client; + + mutex_lock(&xdna->dev_lock); + list_for_each_entry(client, &xdna->client_list, node) + amdxdna_hwctx_suspend(client); + + amdxdna_dev_suspend_nolock(xdna); + mutex_unlock(&xdna->dev_lock); + + return 0; +} + +static int amdxdna_pmops_resume(struct device *dev) +{ + struct amdxdna_dev *xdna = pci_get_drvdata(to_pci_dev(dev)); + struct amdxdna_client *client; + int ret; + + XDNA_INFO(xdna, "firmware resuming..."); + mutex_lock(&xdna->dev_lock); + ret = amdxdna_dev_resume_nolock(xdna); + if (ret) { + XDNA_ERR(xdna, "resume NPU firmware failed"); + mutex_unlock(&xdna->dev_lock); + return ret; + } + + XDNA_INFO(xdna, "hardware context resuming..."); + list_for_each_entry(client, &xdna->client_list, node) + amdxdna_hwctx_resume(client); + mutex_unlock(&xdna->dev_lock); + + return 0; +} + +static int amdxdna_rpmops_suspend(struct device *dev) +{ + struct amdxdna_dev *xdna = pci_get_drvdata(to_pci_dev(dev)); + int ret; + + mutex_lock(&xdna->dev_lock); + ret = amdxdna_dev_suspend_nolock(xdna); + mutex_unlock(&xdna->dev_lock); + + XDNA_DBG(xdna, "Runtime suspend done ret: %d", ret); + return ret; +} + +static int amdxdna_rpmops_resume(struct device *dev) +{ + struct amdxdna_dev *xdna = pci_get_drvdata(to_pci_dev(dev)); + int ret; + + mutex_lock(&xdna->dev_lock); + ret = amdxdna_dev_resume_nolock(xdna); + mutex_unlock(&xdna->dev_lock); + + XDNA_DBG(xdna, "Runtime resume done ret: %d", ret); + return ret; +} + +static const struct dev_pm_ops amdxdna_pm_ops = { + SYSTEM_SLEEP_PM_OPS(amdxdna_pmops_suspend, amdxdna_pmops_resume) + RUNTIME_PM_OPS(amdxdna_rpmops_suspend, amdxdna_rpmops_resume, NULL) +}; + +static struct pci_driver amdxdna_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = pci_ids, + .probe = amdxdna_probe, + .remove = amdxdna_remove, + .driver.pm = &amdxdna_pm_ops, +}; + +module_pci_driver(amdxdna_pci_driver); + +MODULE_LICENSE("GPL"); 
+MODULE_AUTHOR("XRT Team <runtimeca39d@amd.com>"); +MODULE_DESCRIPTION("amdxdna driver"); diff --git a/drivers/accel/amdxdna/amdxdna_pci_drv.h b/drivers/accel/amdxdna/amdxdna_pci_drv.h new file mode 100644 index 000000000000..37848a8d8031 --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_pci_drv.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#ifndef _AMDXDNA_PCI_DRV_H_ +#define _AMDXDNA_PCI_DRV_H_ + +#include <linux/xarray.h> + +#define XDNA_INFO(xdna, fmt, args...) drm_info(&(xdna)->ddev, fmt, ##args) +#define XDNA_WARN(xdna, fmt, args...) drm_warn(&(xdna)->ddev, "%s: "fmt, __func__, ##args) +#define XDNA_ERR(xdna, fmt, args...) drm_err(&(xdna)->ddev, "%s: "fmt, __func__, ##args) +#define XDNA_DBG(xdna, fmt, args...) drm_dbg(&(xdna)->ddev, fmt, ##args) +#define XDNA_INFO_ONCE(xdna, fmt, args...) drm_info_once(&(xdna)->ddev, fmt, ##args) + +#define XDNA_MBZ_DBG(xdna, ptr, sz) \ + ({ \ + int __i; \ + int __ret = 0; \ + u8 *__ptr = (u8 *)(ptr); \ + for (__i = 0; __i < (sz); __i++) { \ + if (__ptr[__i]) { \ + XDNA_DBG(xdna, "MBZ check failed"); \ + __ret = -EINVAL; \ + break; \ + } \ + } \ + __ret; \ + }) + +#define to_xdna_dev(drm_dev) \ + ((struct amdxdna_dev *)container_of(drm_dev, struct amdxdna_dev, ddev)) + +extern const struct drm_driver amdxdna_drm_drv; + +struct amdxdna_client; +struct amdxdna_dev; +struct amdxdna_drm_get_info; +struct amdxdna_drm_set_state; +struct amdxdna_gem_obj; +struct amdxdna_hwctx; +struct amdxdna_sched_job; + +/* + * struct amdxdna_dev_ops - Device hardware operation callbacks + */ +struct amdxdna_dev_ops { + int (*init)(struct amdxdna_dev *xdna); + void (*fini)(struct amdxdna_dev *xdna); + int (*resume)(struct amdxdna_dev *xdna); + void (*suspend)(struct amdxdna_dev *xdna); + int (*hwctx_init)(struct amdxdna_hwctx *hwctx); + void (*hwctx_fini)(struct amdxdna_hwctx *hwctx); + int (*hwctx_config)(struct amdxdna_hwctx *hwctx, u32 type, u64 value, void *buf, u32 size); + void (*hmm_invalidate)(struct amdxdna_gem_obj *abo, unsigned long cur_seq); + void (*hwctx_suspend)(struct amdxdna_hwctx *hwctx); + void (*hwctx_resume)(struct amdxdna_hwctx *hwctx); + int (*cmd_submit)(struct amdxdna_hwctx *hwctx, struct amdxdna_sched_job *job, u64 *seq); + int (*get_aie_info)(struct amdxdna_client *client, struct amdxdna_drm_get_info *args); + int (*set_aie_state)(struct amdxdna_client *client, struct amdxdna_drm_set_state *args); +}; + +/* + * struct amdxdna_dev_info - Device hardware information + * Record device static information, like reg, mbox, PSP, SMU bar index + */ +struct amdxdna_dev_info { + int reg_bar; + int mbox_bar; + int sram_bar; + int psp_bar; + int smu_bar; + int device_type; + int first_col; + u32 dev_mem_buf_shift; + u64 dev_mem_base; + size_t dev_mem_size; + char *vbnv; + const struct amdxdna_dev_priv *dev_priv; + const struct amdxdna_dev_ops *ops; +}; + +struct amdxdna_fw_ver { + u32 major; + u32 minor; + u32 sub; + u32 build; +}; + +struct amdxdna_dev { + struct drm_device ddev; + struct amdxdna_dev_hdl *dev_handle; + const struct amdxdna_dev_info *dev_info; + void *xrs_hdl; + + struct mutex dev_lock; /* per device lock */ + struct list_head client_list; + struct amdxdna_fw_ver fw_ver; + struct rw_semaphore notifier_lock; /* for mmu notifier*/ +}; + +/* + * struct amdxdna_device_id - PCI device info + */ +struct amdxdna_device_id { + unsigned short device; + u8 revision; + const struct amdxdna_dev_info *dev_info; +}; + +/* + * struct amdxdna_client - amdxdna client + * A 
per fd data structure for managing context and other user process stuffs. + */ +struct amdxdna_client { + struct list_head node; + pid_t pid; + struct mutex hwctx_lock; /* protect hwctx */ + /* do NOT wait this srcu when hwctx_lock is held */ + struct srcu_struct hwctx_srcu; + struct xarray hwctx_xa; + u32 next_hwctxid; + struct amdxdna_dev *xdna; + struct drm_file *filp; + + struct mutex mm_lock; /* protect memory related */ + struct amdxdna_gem_obj *dev_heap; + + struct iommu_sva *sva; + int pasid; +}; + +#define amdxdna_for_each_hwctx(client, hwctx_id, entry) \ + xa_for_each(&(client)->hwctx_xa, hwctx_id, entry) + +/* Add device info below */ +extern const struct amdxdna_dev_info dev_npu1_info; +extern const struct amdxdna_dev_info dev_npu2_info; +extern const struct amdxdna_dev_info dev_npu4_info; +extern const struct amdxdna_dev_info dev_npu5_info; +extern const struct amdxdna_dev_info dev_npu6_info; + +int amdxdna_sysfs_init(struct amdxdna_dev *xdna); +void amdxdna_sysfs_fini(struct amdxdna_dev *xdna); + +#endif /* _AMDXDNA_PCI_DRV_H_ */ diff --git a/drivers/accel/amdxdna/amdxdna_sysfs.c b/drivers/accel/amdxdna/amdxdna_sysfs.c new file mode 100644 index 000000000000..f27e4ee960a0 --- /dev/null +++ b/drivers/accel/amdxdna/amdxdna_sysfs.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/drm_gem_shmem_helper.h> +#include <drm/drm_print.h> +#include <drm/gpu_scheduler.h> +#include <linux/types.h> + +#include "amdxdna_gem.h" +#include "amdxdna_pci_drv.h" + +static ssize_t vbnv_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct amdxdna_dev *xdna = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", xdna->dev_info->vbnv); +} +static DEVICE_ATTR_RO(vbnv); + +static ssize_t device_type_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct amdxdna_dev *xdna = dev_get_drvdata(dev); + + return sprintf(buf, "%d\n", xdna->dev_info->device_type); +} +static DEVICE_ATTR_RO(device_type); + +static ssize_t fw_version_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct amdxdna_dev *xdna = dev_get_drvdata(dev); + + return sprintf(buf, "%d.%d.%d.%d\n", xdna->fw_ver.major, + xdna->fw_ver.minor, xdna->fw_ver.sub, + xdna->fw_ver.build); +} +static DEVICE_ATTR_RO(fw_version); + +static struct attribute *amdxdna_attrs[] = { + &dev_attr_device_type.attr, + &dev_attr_vbnv.attr, + &dev_attr_fw_version.attr, + NULL, +}; + +static struct attribute_group amdxdna_attr_group = { + .attrs = amdxdna_attrs, +}; + +int amdxdna_sysfs_init(struct amdxdna_dev *xdna) +{ + int ret; + + ret = sysfs_create_group(&xdna->ddev.dev->kobj, &amdxdna_attr_group); + if (ret) + XDNA_ERR(xdna, "Create attr group failed"); + + return ret; +} + +void amdxdna_sysfs_fini(struct amdxdna_dev *xdna) +{ + sysfs_remove_group(&xdna->ddev.dev->kobj, &amdxdna_attr_group); +} diff --git a/drivers/accel/amdxdna/npu1_regs.c b/drivers/accel/amdxdna/npu1_regs.c new file mode 100644 index 000000000000..e4f6dac7d00f --- /dev/null +++ b/drivers/accel/amdxdna/npu1_regs.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. 
+ */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/gpu_scheduler.h> +#include <linux/sizes.h> + +#include "aie2_pci.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_pci_drv.h" + +/* Address definition from NPU1 docs */ +#define MPNPU_PUB_SEC_INTR 0x3010090 +#define MPNPU_PUB_PWRMGMT_INTR 0x3010094 +#define MPNPU_PUB_SCRATCH2 0x30100A0 +#define MPNPU_PUB_SCRATCH3 0x30100A4 +#define MPNPU_PUB_SCRATCH4 0x30100A8 +#define MPNPU_PUB_SCRATCH5 0x30100AC +#define MPNPU_PUB_SCRATCH6 0x30100B0 +#define MPNPU_PUB_SCRATCH7 0x30100B4 +#define MPNPU_PUB_SCRATCH9 0x30100BC + +#define MPNPU_SRAM_X2I_MAILBOX_0 0x30A0000 +#define MPNPU_SRAM_X2I_MAILBOX_1 0x30A2000 +#define MPNPU_SRAM_I2X_MAILBOX_15 0x30BF000 + +#define MPNPU_APERTURE0_BASE 0x3000000 +#define MPNPU_APERTURE1_BASE 0x3080000 +#define MPNPU_APERTURE2_BASE 0x30C0000 + +/* PCIe BAR Index for NPU1 */ +#define NPU1_REG_BAR_INDEX 0 +#define NPU1_MBOX_BAR_INDEX 4 +#define NPU1_PSP_BAR_INDEX 0 +#define NPU1_SMU_BAR_INDEX 0 +#define NPU1_SRAM_BAR_INDEX 2 +/* Associated BARs and Apertures */ +#define NPU1_REG_BAR_BASE MPNPU_APERTURE0_BASE +#define NPU1_MBOX_BAR_BASE MPNPU_APERTURE2_BASE +#define NPU1_PSP_BAR_BASE MPNPU_APERTURE0_BASE +#define NPU1_SMU_BAR_BASE MPNPU_APERTURE0_BASE +#define NPU1_SRAM_BAR_BASE MPNPU_APERTURE1_BASE + +const struct rt_config npu1_default_rt_cfg[] = { + { 2, 1, AIE2_RT_CFG_INIT }, /* PDI APP LOAD MODE */ + { 1, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ + { 0 }, +}; + +const struct dpm_clk_freq npu1_dpm_clk_table[] = { + {400, 800}, + {600, 1024}, + {600, 1024}, + {600, 1024}, + {600, 1024}, + {720, 1309}, + {720, 1309}, + {847, 1600}, + { 0 } +}; + +static const struct amdxdna_dev_priv npu1_dev_priv = { + .fw_path = "amdnpu/1502_00/npu.sbin", + .protocol_major = 0x5, + .protocol_minor = 0x7, + .rt_config = npu1_default_rt_cfg, + .dpm_clk_tbl = npu1_dpm_clk_table, + .col_align = COL_ALIGN_NONE, + .mbox_dev_addr = NPU1_MBOX_BAR_BASE, + .mbox_size = 0, /* Use BAR size */ + .sram_dev_addr = NPU1_SRAM_BAR_BASE, + .sram_offs = { + DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU1_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), + DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU1_SRAM, MPNPU_SRAM_I2X_MAILBOX_15), + }, + .psp_regs_off = { + DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU1_PSP, MPNPU_PUB_SCRATCH2), + DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU1_PSP, MPNPU_PUB_SCRATCH3), + DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU1_PSP, MPNPU_PUB_SCRATCH4), + DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU1_PSP, MPNPU_PUB_SCRATCH9), + DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU1_PSP, MPNPU_PUB_SEC_INTR), + DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU1_PSP, MPNPU_PUB_SCRATCH2), + DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU1_PSP, MPNPU_PUB_SCRATCH3), + }, + .smu_regs_off = { + DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU1_SMU, MPNPU_PUB_SCRATCH5), + DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU1_SMU, MPNPU_PUB_SCRATCH7), + DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU1_SMU, MPNPU_PUB_PWRMGMT_INTR), + DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU1_SMU, MPNPU_PUB_SCRATCH6), + DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU1_SMU, MPNPU_PUB_SCRATCH7), + }, + .hw_ops = { + .set_dpm = npu1_set_dpm, + }, +}; + +const struct amdxdna_dev_info dev_npu1_info = { + .reg_bar = NPU1_REG_BAR_INDEX, + .mbox_bar = NPU1_MBOX_BAR_INDEX, + .sram_bar = NPU1_SRAM_BAR_INDEX, + .psp_bar = NPU1_PSP_BAR_INDEX, + .smu_bar = NPU1_SMU_BAR_INDEX, + .first_col = 1, + .dev_mem_buf_shift = 15, /* 32 KiB aligned */ + .dev_mem_base = AIE2_DEVM_BASE, + .dev_mem_size = AIE2_DEVM_SIZE, + .vbnv = "RyzenAI-npu1", + .device_type = AMDXDNA_DEV_TYPE_KMQ, + .dev_priv = &npu1_dev_priv, + 
.ops = &aie2_ops, +}; diff --git a/drivers/accel/amdxdna/npu2_regs.c b/drivers/accel/amdxdna/npu2_regs.c new file mode 100644 index 000000000000..a081cac75ee0 --- /dev/null +++ b/drivers/accel/amdxdna/npu2_regs.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/gpu_scheduler.h> +#include <linux/sizes.h> + +#include "aie2_pci.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_pci_drv.h" + +/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ +#define MPNPU_PUB_SEC_INTR 0x3010060 +#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 +#define MPNPU_PUB_SCRATCH0 0x301006C +#define MPNPU_PUB_SCRATCH1 0x3010070 +#define MPNPU_PUB_SCRATCH2 0x3010074 +#define MPNPU_PUB_SCRATCH3 0x3010078 +#define MPNPU_PUB_SCRATCH4 0x301007C +#define MPNPU_PUB_SCRATCH5 0x3010080 +#define MPNPU_PUB_SCRATCH6 0x3010084 +#define MPNPU_PUB_SCRATCH7 0x3010088 +#define MPNPU_PUB_SCRATCH8 0x301008C +#define MPNPU_PUB_SCRATCH9 0x3010090 +#define MPNPU_PUB_SCRATCH10 0x3010094 +#define MPNPU_PUB_SCRATCH11 0x3010098 +#define MPNPU_PUB_SCRATCH12 0x301009C +#define MPNPU_PUB_SCRATCH13 0x30100A0 +#define MPNPU_PUB_SCRATCH14 0x30100A4 +#define MPNPU_PUB_SCRATCH15 0x30100A8 +#define MP0_C2PMSG_73 0x3810A24 +#define MP0_C2PMSG_123 0x3810AEC + +#define MP1_C2PMSG_0 0x3B10900 +#define MP1_C2PMSG_60 0x3B109F0 +#define MP1_C2PMSG_61 0x3B109F4 + +#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 +#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 +#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 +#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 + +#define MMNPU_APERTURE0_BASE 0x3000000 +#define MMNPU_APERTURE1_BASE 0x3600000 +#define MMNPU_APERTURE3_BASE 0x3810000 +#define MMNPU_APERTURE4_BASE 0x3B10000 + +/* PCIe BAR Index for NPU2 */ +#define NPU2_REG_BAR_INDEX 0 +#define NPU2_MBOX_BAR_INDEX 0 +#define NPU2_PSP_BAR_INDEX 4 +#define NPU2_SMU_BAR_INDEX 5 +#define NPU2_SRAM_BAR_INDEX 2 +/* Associated BARs and Apertures */ +#define NPU2_REG_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU2_MBOX_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU2_PSP_BAR_BASE MMNPU_APERTURE3_BASE +#define NPU2_SMU_BAR_BASE MMNPU_APERTURE4_BASE +#define NPU2_SRAM_BAR_BASE MMNPU_APERTURE1_BASE + +static const struct amdxdna_dev_priv npu2_dev_priv = { + .fw_path = "amdnpu/17f0_00/npu.sbin", + .protocol_major = 0x6, + .protocol_minor = 0x6, + .rt_config = npu4_default_rt_cfg, + .dpm_clk_tbl = npu4_dpm_clk_table, + .col_align = COL_ALIGN_NATURE, + .mbox_dev_addr = NPU2_MBOX_BAR_BASE, + .mbox_size = 0, /* Use BAR size */ + .sram_dev_addr = NPU2_SRAM_BAR_BASE, + .sram_offs = { + DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), + DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU2_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), + }, + .psp_regs_off = { + DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU2_PSP, MP0_C2PMSG_123), + DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU2_REG, MPNPU_PUB_SCRATCH3), + DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU2_REG, MPNPU_PUB_SCRATCH4), + DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU2_REG, MPNPU_PUB_SCRATCH9), + DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU2_PSP, MP0_C2PMSG_73), + DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU2_PSP, MP0_C2PMSG_123), + DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU2_REG, MPNPU_PUB_SCRATCH3), + }, + .smu_regs_off = { + DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU2_SMU, MP1_C2PMSG_0), + DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU2_SMU, MP1_C2PMSG_60), + DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU2_SMU, MMNPU_APERTURE4_BASE), + DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU2_SMU, 
MP1_C2PMSG_61), + DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU2_SMU, MP1_C2PMSG_60), + }, + .hw_ops = { + .set_dpm = npu4_set_dpm, + }, +}; + +const struct amdxdna_dev_info dev_npu2_info = { + .reg_bar = NPU2_REG_BAR_INDEX, + .mbox_bar = NPU2_MBOX_BAR_INDEX, + .sram_bar = NPU2_SRAM_BAR_INDEX, + .psp_bar = NPU2_PSP_BAR_INDEX, + .smu_bar = NPU2_SMU_BAR_INDEX, + .first_col = 0, + .dev_mem_buf_shift = 15, /* 32 KiB aligned */ + .dev_mem_base = AIE2_DEVM_BASE, + .dev_mem_size = AIE2_DEVM_SIZE, + .vbnv = "RyzenAI-npu2", + .device_type = AMDXDNA_DEV_TYPE_KMQ, + .dev_priv = &npu2_dev_priv, + .ops = &aie2_ops, /* NPU2 can share NPU1's callback */ +}; diff --git a/drivers/accel/amdxdna/npu4_regs.c b/drivers/accel/amdxdna/npu4_regs.c new file mode 100644 index 000000000000..9f2e33182ec6 --- /dev/null +++ b/drivers/accel/amdxdna/npu4_regs.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023-2024, Advanced Micro Devices, Inc. + */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/gpu_scheduler.h> +#include <linux/sizes.h> + +#include "aie2_pci.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_pci_drv.h" + +/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ +#define MPNPU_PUB_SEC_INTR 0x3010060 +#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 +#define MPNPU_PUB_SCRATCH0 0x301006C +#define MPNPU_PUB_SCRATCH1 0x3010070 +#define MPNPU_PUB_SCRATCH2 0x3010074 +#define MPNPU_PUB_SCRATCH3 0x3010078 +#define MPNPU_PUB_SCRATCH4 0x301007C +#define MPNPU_PUB_SCRATCH5 0x3010080 +#define MPNPU_PUB_SCRATCH6 0x3010084 +#define MPNPU_PUB_SCRATCH7 0x3010088 +#define MPNPU_PUB_SCRATCH8 0x301008C +#define MPNPU_PUB_SCRATCH9 0x3010090 +#define MPNPU_PUB_SCRATCH10 0x3010094 +#define MPNPU_PUB_SCRATCH11 0x3010098 +#define MPNPU_PUB_SCRATCH12 0x301009C +#define MPNPU_PUB_SCRATCH13 0x30100A0 +#define MPNPU_PUB_SCRATCH14 0x30100A4 +#define MPNPU_PUB_SCRATCH15 0x30100A8 +#define MP0_C2PMSG_73 0x3810A24 +#define MP0_C2PMSG_123 0x3810AEC + +#define MP1_C2PMSG_0 0x3B10900 +#define MP1_C2PMSG_60 0x3B109F0 +#define MP1_C2PMSG_61 0x3B109F4 + +#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 +#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 +#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 +#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 + +#define MMNPU_APERTURE0_BASE 0x3000000 +#define MMNPU_APERTURE1_BASE 0x3600000 +#define MMNPU_APERTURE3_BASE 0x3810000 +#define MMNPU_APERTURE4_BASE 0x3B10000 + +/* PCIe BAR Index for NPU4 */ +#define NPU4_REG_BAR_INDEX 0 +#define NPU4_MBOX_BAR_INDEX 0 +#define NPU4_PSP_BAR_INDEX 4 +#define NPU4_SMU_BAR_INDEX 5 +#define NPU4_SRAM_BAR_INDEX 2 +/* Associated BARs and Apertures */ +#define NPU4_REG_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU4_MBOX_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU4_PSP_BAR_BASE MMNPU_APERTURE3_BASE +#define NPU4_SMU_BAR_BASE MMNPU_APERTURE4_BASE +#define NPU4_SRAM_BAR_BASE MMNPU_APERTURE1_BASE + +const struct rt_config npu4_default_rt_cfg[] = { + { 5, 1, AIE2_RT_CFG_INIT }, /* PDI APP LOAD MODE */ + { 1, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ + { 2, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ + { 3, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ + { 4, 1, AIE2_RT_CFG_CLK_GATING }, /* Clock gating on */ + { 0 }, +}; + +const struct dpm_clk_freq npu4_dpm_clk_table[] = { + {396, 792}, + {600, 1056}, + {792, 1152}, + {975, 1267}, + {975, 1267}, + {1056, 1408}, + {1152, 1584}, + {1267, 1800}, + { 0 } +}; + +static const struct amdxdna_dev_priv npu4_dev_priv = { + .fw_path = "amdnpu/17f0_10/npu.sbin", + 
.protocol_major = 0x6, + .protocol_minor = 12, + .rt_config = npu4_default_rt_cfg, + .dpm_clk_tbl = npu4_dpm_clk_table, + .col_align = COL_ALIGN_NATURE, + .mbox_dev_addr = NPU4_MBOX_BAR_BASE, + .mbox_size = 0, /* Use BAR size */ + .sram_dev_addr = NPU4_SRAM_BAR_BASE, + .sram_offs = { + DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), + DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU4_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), + }, + .psp_regs_off = { + DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU4_PSP, MP0_C2PMSG_123), + DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), + DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU4_REG, MPNPU_PUB_SCRATCH4), + DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU4_REG, MPNPU_PUB_SCRATCH9), + DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU4_PSP, MP0_C2PMSG_73), + DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU4_PSP, MP0_C2PMSG_123), + DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU4_REG, MPNPU_PUB_SCRATCH3), + }, + .smu_regs_off = { + DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU4_SMU, MP1_C2PMSG_0), + DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU4_SMU, MP1_C2PMSG_60), + DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU4_SMU, MMNPU_APERTURE4_BASE), + DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU4_SMU, MP1_C2PMSG_61), + DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU4_SMU, MP1_C2PMSG_60), + }, + .hw_ops = { + .set_dpm = npu4_set_dpm, + }, +}; + +const struct amdxdna_dev_info dev_npu4_info = { + .reg_bar = NPU4_REG_BAR_INDEX, + .mbox_bar = NPU4_MBOX_BAR_INDEX, + .sram_bar = NPU4_SRAM_BAR_INDEX, + .psp_bar = NPU4_PSP_BAR_INDEX, + .smu_bar = NPU4_SMU_BAR_INDEX, + .first_col = 0, + .dev_mem_buf_shift = 15, /* 32 KiB aligned */ + .dev_mem_base = AIE2_DEVM_BASE, + .dev_mem_size = AIE2_DEVM_SIZE, + .vbnv = "RyzenAI-npu4", + .device_type = AMDXDNA_DEV_TYPE_KMQ, + .dev_priv = &npu4_dev_priv, + .ops = &aie2_ops, /* NPU4 can share NPU1's callback */ +}; diff --git a/drivers/accel/amdxdna/npu5_regs.c b/drivers/accel/amdxdna/npu5_regs.c new file mode 100644 index 000000000000..5f1cf83461c4 --- /dev/null +++ b/drivers/accel/amdxdna/npu5_regs.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024, Advanced Micro Devices, Inc. 
+ */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/gpu_scheduler.h> +#include <linux/sizes.h> + +#include "aie2_pci.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_pci_drv.h" + +/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ +#define MPNPU_PUB_SEC_INTR 0x3010060 +#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 +#define MPNPU_PUB_SCRATCH0 0x301006C +#define MPNPU_PUB_SCRATCH1 0x3010070 +#define MPNPU_PUB_SCRATCH2 0x3010074 +#define MPNPU_PUB_SCRATCH3 0x3010078 +#define MPNPU_PUB_SCRATCH4 0x301007C +#define MPNPU_PUB_SCRATCH5 0x3010080 +#define MPNPU_PUB_SCRATCH6 0x3010084 +#define MPNPU_PUB_SCRATCH7 0x3010088 +#define MPNPU_PUB_SCRATCH8 0x301008C +#define MPNPU_PUB_SCRATCH9 0x3010090 +#define MPNPU_PUB_SCRATCH10 0x3010094 +#define MPNPU_PUB_SCRATCH11 0x3010098 +#define MPNPU_PUB_SCRATCH12 0x301009C +#define MPNPU_PUB_SCRATCH13 0x30100A0 +#define MPNPU_PUB_SCRATCH14 0x30100A4 +#define MPNPU_PUB_SCRATCH15 0x30100A8 +#define MP0_C2PMSG_73 0x3810A24 +#define MP0_C2PMSG_123 0x3810AEC + +#define MP1_C2PMSG_0 0x3B10900 +#define MP1_C2PMSG_60 0x3B109F0 +#define MP1_C2PMSG_61 0x3B109F4 + +#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 +#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 +#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 +#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 + +#define MMNPU_APERTURE0_BASE 0x3000000 +#define MMNPU_APERTURE1_BASE 0x3600000 +#define MMNPU_APERTURE3_BASE 0x3810000 +#define MMNPU_APERTURE4_BASE 0x3B10000 + +/* PCIe BAR Index for NPU5 */ +#define NPU5_REG_BAR_INDEX 0 +#define NPU5_MBOX_BAR_INDEX 0 +#define NPU5_PSP_BAR_INDEX 4 +#define NPU5_SMU_BAR_INDEX 5 +#define NPU5_SRAM_BAR_INDEX 2 +/* Associated BARs and Apertures */ +#define NPU5_REG_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU5_MBOX_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU5_PSP_BAR_BASE MMNPU_APERTURE3_BASE +#define NPU5_SMU_BAR_BASE MMNPU_APERTURE4_BASE +#define NPU5_SRAM_BAR_BASE MMNPU_APERTURE1_BASE + +static const struct amdxdna_dev_priv npu5_dev_priv = { + .fw_path = "amdnpu/17f0_11/npu.sbin", + .protocol_major = 0x6, + .protocol_minor = 12, + .rt_config = npu4_default_rt_cfg, + .dpm_clk_tbl = npu4_dpm_clk_table, + .col_align = COL_ALIGN_NATURE, + .mbox_dev_addr = NPU5_MBOX_BAR_BASE, + .mbox_size = 0, /* Use BAR size */ + .sram_dev_addr = NPU5_SRAM_BAR_BASE, + .sram_offs = { + DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), + DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU5_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), + }, + .psp_regs_off = { + DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU5_PSP, MP0_C2PMSG_123), + DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU5_REG, MPNPU_PUB_SCRATCH3), + DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU5_REG, MPNPU_PUB_SCRATCH4), + DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU5_REG, MPNPU_PUB_SCRATCH9), + DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU5_PSP, MP0_C2PMSG_73), + DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU5_PSP, MP0_C2PMSG_123), + DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU5_REG, MPNPU_PUB_SCRATCH3), + }, + .smu_regs_off = { + DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU5_SMU, MP1_C2PMSG_0), + DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU5_SMU, MP1_C2PMSG_60), + DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU5_SMU, MMNPU_APERTURE4_BASE), + DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU5_SMU, MP1_C2PMSG_61), + DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU5_SMU, MP1_C2PMSG_60), + }, + .hw_ops = { + .set_dpm = npu4_set_dpm, + }, +}; + +const struct amdxdna_dev_info dev_npu5_info = { + .reg_bar = NPU5_REG_BAR_INDEX, + .mbox_bar = NPU5_MBOX_BAR_INDEX, + .sram_bar = NPU5_SRAM_BAR_INDEX, + .psp_bar = NPU5_PSP_BAR_INDEX, + .smu_bar = 
NPU5_SMU_BAR_INDEX, + .first_col = 0, + .dev_mem_buf_shift = 15, /* 32 KiB aligned */ + .dev_mem_base = AIE2_DEVM_BASE, + .dev_mem_size = AIE2_DEVM_SIZE, + .vbnv = "RyzenAI-npu5", + .device_type = AMDXDNA_DEV_TYPE_KMQ, + .dev_priv = &npu5_dev_priv, + .ops = &aie2_ops, +}; diff --git a/drivers/accel/amdxdna/npu6_regs.c b/drivers/accel/amdxdna/npu6_regs.c new file mode 100644 index 000000000000..94a7005685a7 --- /dev/null +++ b/drivers/accel/amdxdna/npu6_regs.c @@ -0,0 +1,114 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024, Advanced Micro Devices, Inc. + */ + +#include <drm/amdxdna_accel.h> +#include <drm/drm_device.h> +#include <drm/gpu_scheduler.h> +#include <linux/sizes.h> + +#include "aie2_pci.h" +#include "amdxdna_mailbox.h" +#include "amdxdna_pci_drv.h" + +/* NPU Public Registers on MpNPUAxiXbar (refer to Diag npu_registers.h) */ +#define MPNPU_PUB_SEC_INTR 0x3010060 +#define MPNPU_PUB_PWRMGMT_INTR 0x3010064 +#define MPNPU_PUB_SCRATCH0 0x301006C +#define MPNPU_PUB_SCRATCH1 0x3010070 +#define MPNPU_PUB_SCRATCH2 0x3010074 +#define MPNPU_PUB_SCRATCH3 0x3010078 +#define MPNPU_PUB_SCRATCH4 0x301007C +#define MPNPU_PUB_SCRATCH5 0x3010080 +#define MPNPU_PUB_SCRATCH6 0x3010084 +#define MPNPU_PUB_SCRATCH7 0x3010088 +#define MPNPU_PUB_SCRATCH8 0x301008C +#define MPNPU_PUB_SCRATCH9 0x3010090 +#define MPNPU_PUB_SCRATCH10 0x3010094 +#define MPNPU_PUB_SCRATCH11 0x3010098 +#define MPNPU_PUB_SCRATCH12 0x301009C +#define MPNPU_PUB_SCRATCH13 0x30100A0 +#define MPNPU_PUB_SCRATCH14 0x30100A4 +#define MPNPU_PUB_SCRATCH15 0x30100A8 +#define MP0_C2PMSG_73 0x3810A24 +#define MP0_C2PMSG_123 0x3810AEC + +#define MP1_C2PMSG_0 0x3B10900 +#define MP1_C2PMSG_60 0x3B109F0 +#define MP1_C2PMSG_61 0x3B109F4 + +#define MPNPU_SRAM_X2I_MAILBOX_0 0x3600000 +#define MPNPU_SRAM_X2I_MAILBOX_15 0x361E000 +#define MPNPU_SRAM_X2I_MAILBOX_31 0x363E000 +#define MPNPU_SRAM_I2X_MAILBOX_31 0x363F000 + +#define MMNPU_APERTURE0_BASE 0x3000000 +#define MMNPU_APERTURE1_BASE 0x3600000 +#define MMNPU_APERTURE3_BASE 0x3810000 +#define MMNPU_APERTURE4_BASE 0x3B10000 + +/* PCIe BAR Index for NPU6 */ +#define NPU6_REG_BAR_INDEX 0 +#define NPU6_MBOX_BAR_INDEX 0 +#define NPU6_PSP_BAR_INDEX 4 +#define NPU6_SMU_BAR_INDEX 5 +#define NPU6_SRAM_BAR_INDEX 2 +/* Associated BARs and Apertures */ +#define NPU6_REG_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU6_MBOX_BAR_BASE MMNPU_APERTURE0_BASE +#define NPU6_PSP_BAR_BASE MMNPU_APERTURE3_BASE +#define NPU6_SMU_BAR_BASE MMNPU_APERTURE4_BASE +#define NPU6_SRAM_BAR_BASE MMNPU_APERTURE1_BASE + +static const struct amdxdna_dev_priv npu6_dev_priv = { + .fw_path = "amdnpu/17f0_10/npu.sbin", + .protocol_major = 0x6, + .protocol_minor = 12, + .rt_config = npu4_default_rt_cfg, + .dpm_clk_tbl = npu4_dpm_clk_table, + .col_align = COL_ALIGN_NATURE, + .mbox_dev_addr = NPU6_MBOX_BAR_BASE, + .mbox_size = 0, /* Use BAR size */ + .sram_dev_addr = NPU6_SRAM_BAR_BASE, + .sram_offs = { + DEFINE_BAR_OFFSET(MBOX_CHANN_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_0), + DEFINE_BAR_OFFSET(FW_ALIVE_OFF, NPU6_SRAM, MPNPU_SRAM_X2I_MAILBOX_15), + }, + .psp_regs_off = { + DEFINE_BAR_OFFSET(PSP_CMD_REG, NPU6_PSP, MP0_C2PMSG_123), + DEFINE_BAR_OFFSET(PSP_ARG0_REG, NPU6_REG, MPNPU_PUB_SCRATCH3), + DEFINE_BAR_OFFSET(PSP_ARG1_REG, NPU6_REG, MPNPU_PUB_SCRATCH4), + DEFINE_BAR_OFFSET(PSP_ARG2_REG, NPU6_REG, MPNPU_PUB_SCRATCH9), + DEFINE_BAR_OFFSET(PSP_INTR_REG, NPU6_PSP, MP0_C2PMSG_73), + DEFINE_BAR_OFFSET(PSP_STATUS_REG, NPU6_PSP, MP0_C2PMSG_123), + DEFINE_BAR_OFFSET(PSP_RESP_REG, NPU6_REG, MPNPU_PUB_SCRATCH3), + }, + .smu_regs_off 
= { + DEFINE_BAR_OFFSET(SMU_CMD_REG, NPU6_SMU, MP1_C2PMSG_0), + DEFINE_BAR_OFFSET(SMU_ARG_REG, NPU6_SMU, MP1_C2PMSG_60), + DEFINE_BAR_OFFSET(SMU_INTR_REG, NPU6_SMU, MMNPU_APERTURE4_BASE), + DEFINE_BAR_OFFSET(SMU_RESP_REG, NPU6_SMU, MP1_C2PMSG_61), + DEFINE_BAR_OFFSET(SMU_OUT_REG, NPU6_SMU, MP1_C2PMSG_60), + }, + .hw_ops = { + .set_dpm = npu4_set_dpm, + }, + +}; + +const struct amdxdna_dev_info dev_npu6_info = { + .reg_bar = NPU6_REG_BAR_INDEX, + .mbox_bar = NPU6_MBOX_BAR_INDEX, + .sram_bar = NPU6_SRAM_BAR_INDEX, + .psp_bar = NPU6_PSP_BAR_INDEX, + .smu_bar = NPU6_SMU_BAR_INDEX, + .first_col = 0, + .dev_mem_buf_shift = 15, /* 32 KiB aligned */ + .dev_mem_base = AIE2_DEVM_BASE, + .dev_mem_size = AIE2_DEVM_SIZE, + .vbnv = "RyzenAI-npu6", + .device_type = AMDXDNA_DEV_TYPE_KMQ, + .dev_priv = &npu6_dev_priv, + .ops = &aie2_ops, +}; diff --git a/drivers/accel/drm_accel.c b/drivers/accel/drm_accel.c index 24cac4c0274b..aa826033b0ce 100644 --- a/drivers/accel/drm_accel.c +++ b/drivers/accel/drm_accel.c @@ -8,7 +8,7 @@ #include <linux/debugfs.h> #include <linux/device.h> -#include <linux/idr.h> +#include <linux/xarray.h> #include <drm/drm_accel.h> #include <drm/drm_auth.h> @@ -18,12 +18,11 @@ #include <drm/drm_ioctl.h> #include <drm/drm_print.h> -static DEFINE_SPINLOCK(accel_minor_lock); -static struct idr accel_minors_idr; +DEFINE_XARRAY_ALLOC(accel_minors_xa); static struct dentry *accel_debugfs_root; -static struct device_type accel_sysfs_device_minor = { +static const struct device_type accel_sysfs_device_minor = { .name = "accel_minor" }; @@ -118,99 +117,6 @@ void accel_set_device_instance_params(struct device *kdev, int index) } /** - * accel_minor_alloc() - Allocates a new accel minor - * - * This function access the accel minors idr and allocates from it - * a new id to represent a new accel minor - * - * Return: A new id on success or error code in case idr_alloc failed - */ -int accel_minor_alloc(void) -{ - unsigned long flags; - int r; - - spin_lock_irqsave(&accel_minor_lock, flags); - r = idr_alloc(&accel_minors_idr, NULL, 0, ACCEL_MAX_MINORS, GFP_NOWAIT); - spin_unlock_irqrestore(&accel_minor_lock, flags); - - return r; -} - -/** - * accel_minor_remove() - Remove an accel minor - * @index: The minor id to remove. - * - * This function access the accel minors idr and removes from - * it the member with the id that is passed to this function. - */ -void accel_minor_remove(int index) -{ - unsigned long flags; - - spin_lock_irqsave(&accel_minor_lock, flags); - idr_remove(&accel_minors_idr, index); - spin_unlock_irqrestore(&accel_minor_lock, flags); -} - -/** - * accel_minor_replace() - Replace minor pointer in accel minors idr. - * @minor: Pointer to the new minor. - * @index: The minor id to replace. - * - * This function access the accel minors idr structure and replaces the pointer - * that is associated with an existing id. Because the minor pointer can be - * NULL, we need to explicitly pass the index. - * - * Return: 0 for success, negative value for error - */ -void accel_minor_replace(struct drm_minor *minor, int index) -{ - unsigned long flags; - - spin_lock_irqsave(&accel_minor_lock, flags); - idr_replace(&accel_minors_idr, minor, index); - spin_unlock_irqrestore(&accel_minor_lock, flags); -} - -/* - * Looks up the given minor-ID and returns the respective DRM-minor object. The - * refence-count of the underlying device is increased so you must release this - * object with accel_minor_release(). - * - * The object can be only a drm_minor that represents an accel device. 
- * - * As long as you hold this minor, it is guaranteed that the object and the - * minor->dev pointer will stay valid! However, the device may get unplugged and - * unregistered while you hold the minor. - */ -static struct drm_minor *accel_minor_acquire(unsigned int minor_id) -{ - struct drm_minor *minor; - unsigned long flags; - - spin_lock_irqsave(&accel_minor_lock, flags); - minor = idr_find(&accel_minors_idr, minor_id); - if (minor) - drm_dev_get(minor->dev); - spin_unlock_irqrestore(&accel_minor_lock, flags); - - if (!minor) { - return ERR_PTR(-ENODEV); - } else if (drm_dev_is_unplugged(minor->dev)) { - drm_dev_put(minor->dev); - return ERR_PTR(-ENODEV); - } - - return minor; -} - -static void accel_minor_release(struct drm_minor *minor) -{ - drm_dev_put(minor->dev); -} - -/** * accel_open - open method for ACCEL file * @inode: device inode * @filp: file pointer. @@ -227,7 +133,7 @@ int accel_open(struct inode *inode, struct file *filp) struct drm_minor *minor; int retcode; - minor = accel_minor_acquire(iminor(inode)); + minor = drm_minor_acquire(&accel_minors_xa, iminor(inode)); if (IS_ERR(minor)) return PTR_ERR(minor); @@ -246,7 +152,7 @@ int accel_open(struct inode *inode, struct file *filp) err_undo: atomic_dec(&dev->open_count); - accel_minor_release(minor); + drm_minor_release(minor); return retcode; } EXPORT_SYMBOL_GPL(accel_open); @@ -257,7 +163,7 @@ static int accel_stub_open(struct inode *inode, struct file *filp) struct drm_minor *minor; int err; - minor = accel_minor_acquire(iminor(inode)); + minor = drm_minor_acquire(&accel_minors_xa, iminor(inode)); if (IS_ERR(minor)) return PTR_ERR(minor); @@ -274,7 +180,7 @@ static int accel_stub_open(struct inode *inode, struct file *filp) err = 0; out: - accel_minor_release(minor); + drm_minor_release(minor); return err; } @@ -290,15 +196,13 @@ void accel_core_exit(void) unregister_chrdev(ACCEL_MAJOR, "accel"); debugfs_remove(accel_debugfs_root); accel_sysfs_destroy(); - idr_destroy(&accel_minors_idr); + WARN_ON(!xa_empty(&accel_minors_xa)); } int __init accel_core_init(void) { int ret; - idr_init(&accel_minors_idr); - ret = accel_sysfs_init(); if (ret < 0) { DRM_ERROR("Cannot create ACCEL class: %d\n", ret); diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index 3aa6eeef443b..59823e3c3bf7 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -1360,9 +1360,8 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args) return -EINVAL; } - if (!hl_device_operational(hdev, &status)) { + if (!hl_device_operational(hdev, &status)) return -EBUSY; - } if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) && !hdev->supports_staged_submission) { @@ -3285,12 +3284,6 @@ static int ts_get_and_handle_kernel_record(struct hl_device *hdev, struct hl_ctx /* In case the node already registered, need to unregister first then re-use */ if (req_offset_record->ts_reg_info.in_use) { - dev_dbg(data->buf->mmg->dev, - "Requested record %p is in use on irq: %u ts addr: %p, unregister first then put on irq: %u\n", - req_offset_record, - req_offset_record->ts_reg_info.interrupt->interrupt_id, - req_offset_record->ts_reg_info.timestamp_kernel_addr, - data->interrupt->interrupt_id); /* * Since interrupt here can be different than the one the node currently registered * on, and we don't want to lock two lists while we're doing unregister, so @@ -3346,10 +3339,6 @@ static int 
_hl_interrupt_ts_reg_ioctl(struct hl_device *hdev, struct hl_ctx *ctx goto put_cq_cb; } - dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, handle: 0x%llx, ts offset: %llu, cq_offset: %llu\n", - data->interrupt->interrupt_id, data->ts_handle, - data->ts_offset, data->cq_offset); - data->buf = hl_mmap_mem_buf_get(data->mmg, data->ts_handle); if (!data->buf) { rc = -EINVAL; @@ -3371,9 +3360,6 @@ static int _hl_interrupt_ts_reg_ioctl(struct hl_device *hdev, struct hl_ctx *ctx if (*pend->cq_kernel_addr >= data->target_value) { spin_unlock_irqrestore(&data->interrupt->ts_list_lock, flags); - dev_dbg(hdev->dev, "Target value already reached release ts record: pend: %p, offset: %llu, interrupt: %u\n", - pend, data->ts_offset, data->interrupt->interrupt_id); - pend->ts_reg_info.in_use = 0; *status = HL_WAIT_CS_STATUS_COMPLETED; *pend->ts_reg_info.timestamp_kernel_addr = ktime_get_ns(); diff --git a/drivers/accel/habanalabs/common/context.c b/drivers/accel/habanalabs/common/context.c index b83141f58319..9f212b17611a 100644 --- a/drivers/accel/habanalabs/common/context.c +++ b/drivers/accel/habanalabs/common/context.c @@ -199,7 +199,6 @@ out_err: int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) { - char task_comm[TASK_COMM_LEN]; int rc = 0, i; ctx->hdev = hdev; @@ -272,7 +271,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) mutex_init(&ctx->ts_reg_lock); dev_dbg(hdev->dev, "create user context, comm=\"%s\", asid=%u\n", - get_task_comm(task_comm, current), ctx->asid); + current->comm, ctx->asid); } return 0; diff --git a/drivers/accel/habanalabs/common/debugfs.c b/drivers/accel/habanalabs/common/debugfs.c index 01f071d52570..ca7677293a55 100644 --- a/drivers/accel/habanalabs/common/debugfs.c +++ b/drivers/accel/habanalabs/common/debugfs.c @@ -42,9 +42,8 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, pkt.i2c_reg = i2c_reg; pkt.i2c_len = i2c_len; - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), - 0, val); - if (rc) + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, val); + if (rc && rc != -EAGAIN) dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc); return rc; @@ -75,10 +74,8 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr, pkt.i2c_len = i2c_len; pkt.value = cpu_to_le64(val); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), - 0, NULL); - - if (rc) + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + if (rc && rc != -EAGAIN) dev_err(hdev->dev, "Failed to write to I2C, error %d\n", rc); return rc; @@ -99,10 +96,8 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state) pkt.led_index = cpu_to_le32(led); pkt.value = cpu_to_le64(state); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), - 0, NULL); - - if (rc) + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + if (rc && rc != -EAGAIN) dev_err(hdev->dev, "Failed to set LED %d, error %d\n", led, rc); } @@ -484,7 +479,7 @@ static ssize_t mmu_asid_va_write(struct file *file, const char __user *buf, struct hl_debugfs_entry *entry = s->private; struct hl_dbg_device_entry *dev_entry = entry->dev_entry; struct hl_device *hdev = dev_entry->hdev; - char kbuf[MMU_KBUF_SIZE]; + char kbuf[MMU_KBUF_SIZE] = {0}; char *c; ssize_t rc; @@ -546,7 +541,7 @@ static ssize_t mmu_ack_error_value_write(struct file *file, struct 
hl_debugfs_entry *entry = s->private; struct hl_dbg_device_entry *dev_entry = entry->dev_entry; struct hl_device *hdev = dev_entry->hdev; - char kbuf[MMU_KBUF_SIZE]; + char kbuf[MMU_KBUF_SIZE] = {0}; ssize_t rc; if (count > sizeof(kbuf) - 1) @@ -1643,19 +1638,19 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent &hl_data64b_fops); debugfs_create_file("set_power_state", - 0200, + 0644, root, dev_entry, &hl_power_fops); debugfs_create_file("device", - 0200, + 0644, root, dev_entry, &hl_device_fops); debugfs_create_file("clk_gate", - 0200, + 0644, root, dev_entry, &hl_clk_gate_fops); @@ -1667,13 +1662,13 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent &hl_stop_on_err_fops); debugfs_create_file("dump_security_violations", - 0644, + 0400, root, dev_entry, &hl_security_violations_fops); debugfs_create_file("dump_razwi_events", - 0644, + 0400, root, dev_entry, &hl_razwi_check_fops); @@ -1706,7 +1701,7 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent &hdev->reset_info.skip_reset_on_timeout); debugfs_create_file("state_dump", - 0600, + 0644, root, dev_entry, &hl_state_dump_fops); @@ -1722,9 +1717,14 @@ static void add_files_to_device(struct hl_device *hdev, struct hl_dbg_device_ent root, &hdev->device_release_watchdog_timeout_sec); + debugfs_create_u16("server_type", + 0444, + root, + &hdev->asic_prop.server_type); + for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) { debugfs_create_file(hl_debugfs_list[i].name, - 0444, + 0644, root, entry, &hl_debugfs_fops); diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index a73bd4be94b1..30277ae410d4 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -30,6 +30,8 @@ enum dma_alloc_type { #define MEM_SCRUB_DEFAULT_VAL 0x1122334455667788 +static void hl_device_heartbeat(struct work_struct *work); + /* * hl_set_dram_bar- sets the bar to allow later access to address * @@ -55,7 +57,8 @@ static u64 hl_set_dram_bar(struct hl_device *hdev, u64 addr, struct pci_mem_regi if (is_power_of_2(prop->dram_pci_bar_size)) bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull); else - bar_base_addr = DIV_ROUND_DOWN_ULL(addr, prop->dram_pci_bar_size) * + bar_base_addr = region->region_base + + div64_u64((addr - region->region_base), prop->dram_pci_bar_size) * prop->dram_pci_bar_size; old_base = hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr); @@ -129,8 +132,8 @@ static void *hl_dma_alloc_common(struct hl_device *hdev, size_t size, dma_addr_t } if (trace_habanalabs_dma_alloc_enabled() && !ZERO_OR_NULL_PTR(ptr)) - trace_habanalabs_dma_alloc(hdev->dev, (u64) (uintptr_t) ptr, *dma_handle, size, - caller); + trace_habanalabs_dma_alloc(&(hdev)->pdev->dev, (u64) (uintptr_t) ptr, *dma_handle, + size, caller); return ptr; } @@ -151,7 +154,7 @@ static void hl_asic_dma_free_common(struct hl_device *hdev, size_t size, void *c break; } - trace_habanalabs_dma_free(hdev->dev, store_cpu_addr, dma_handle, size, caller); + trace_habanalabs_dma_free(&(hdev)->pdev->dev, store_cpu_addr, dma_handle, size, caller); } void *hl_asic_dma_alloc_coherent_caller(struct hl_device *hdev, size_t size, dma_addr_t *dma_handle, @@ -203,15 +206,15 @@ int hl_dma_map_sgtable_caller(struct hl_device *hdev, struct sg_table *sgt, return 0; for_each_sgtable_dma_sg(sgt, sg, i) - trace_habanalabs_dma_map_page(hdev->dev, - page_to_phys(sg_page(sg)), - sg->dma_address - 
prop->device_dma_offset_for_host_access, + trace_habanalabs_dma_map_page(&(hdev)->pdev->dev, + page_to_phys(sg_page(sg)), + sg->dma_address - prop->device_dma_offset_for_host_access, #ifdef CONFIG_NEED_SG_DMA_LENGTH - sg->dma_length, + sg->dma_length, #else - sg->length, + sg->length, #endif - dir, caller); + dir, caller); return 0; } @@ -246,7 +249,8 @@ void hl_dma_unmap_sgtable_caller(struct hl_device *hdev, struct sg_table *sgt, if (trace_habanalabs_dma_unmap_page_enabled()) { for_each_sgtable_dma_sg(sgt, sg, i) - trace_habanalabs_dma_unmap_page(hdev->dev, page_to_phys(sg_page(sg)), + trace_habanalabs_dma_unmap_page(&(hdev)->pdev->dev, + page_to_phys(sg_page(sg)), sg->dma_address - prop->device_dma_offset_for_host_access, #ifdef CONFIG_NEED_SG_DMA_LENGTH sg->dma_length, @@ -438,16 +442,19 @@ static void print_idle_status_mask(struct hl_device *hdev, const char *message, u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE]) { if (idle_mask[3]) - dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx_%016llx)\n", - message, idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]); + dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx_%016llx)\n", + dev_name(&hdev->pdev->dev), message, + idle_mask[3], idle_mask[2], idle_mask[1], idle_mask[0]); else if (idle_mask[2]) - dev_err(hdev->dev, "%s (mask %#llx_%016llx_%016llx)\n", - message, idle_mask[2], idle_mask[1], idle_mask[0]); + dev_err(hdev->dev, "%s %s (mask %#llx_%016llx_%016llx)\n", + dev_name(&hdev->pdev->dev), message, + idle_mask[2], idle_mask[1], idle_mask[0]); else if (idle_mask[1]) - dev_err(hdev->dev, "%s (mask %#llx_%016llx)\n", - message, idle_mask[1], idle_mask[0]); + dev_err(hdev->dev, "%s %s (mask %#llx_%016llx)\n", + dev_name(&hdev->pdev->dev), message, idle_mask[1], idle_mask[0]); else - dev_err(hdev->dev, "%s (mask %#llx)\n", message, idle_mask[0]); + dev_err(hdev->dev, "%s %s (mask %#llx)\n", dev_name(&hdev->pdev->dev), message, + idle_mask[0]); } static void hpriv_release(struct kref *ref) @@ -544,7 +551,8 @@ int hl_hpriv_put(struct hl_fpriv *hpriv) return kref_put(&hpriv->refcount, hpriv_release); } -static void print_device_in_use_info(struct hl_device *hdev, const char *message) +static void print_device_in_use_info(struct hl_device *hdev, + struct hl_mem_mgr_fini_stats *mm_fini_stats, const char *message) { u32 active_cs_num, dmabuf_export_cnt; bool unknown_reason = true; @@ -568,6 +576,12 @@ static void print_device_in_use_info(struct hl_device *hdev, const char *message dmabuf_export_cnt); } + if (mm_fini_stats->n_busy_cb) { + unknown_reason = false; + offset += scnprintf(buf + offset, size - offset, " [%u live CB handles]", + mm_fini_stats->n_busy_cb); + } + if (unknown_reason) scnprintf(buf + offset, size - offset, " [unknown reason]"); @@ -585,6 +599,7 @@ void hl_device_release(struct drm_device *ddev, struct drm_file *file_priv) { struct hl_fpriv *hpriv = file_priv->driver_priv; struct hl_device *hdev = to_hl_device(ddev); + struct hl_mem_mgr_fini_stats mm_fini_stats; if (!hdev) { pr_crit("Closing FD after device was removed. Memory leak will occur and it is advised to reboot.\n"); @@ -596,12 +611,13 @@ void hl_device_release(struct drm_device *ddev, struct drm_file *file_priv) /* Memory buffers might be still in use at this point and thus the handles IDR destruction * is postponed to hpriv_release(). 
*/ - hl_mem_mgr_fini(&hpriv->mem_mgr); + hl_mem_mgr_fini(&hpriv->mem_mgr, &mm_fini_stats); hdev->compute_ctx_in_release = 1; if (!hl_hpriv_put(hpriv)) { - print_device_in_use_info(hdev, "User process closed FD but device still in use"); + print_device_in_use_info(hdev, &mm_fini_stats, + "User process closed FD but device still in use"); hl_device_reset(hdev, HL_DRV_RESET_HARD); } @@ -801,7 +817,7 @@ static void device_hard_reset_pending(struct work_struct *work) } queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work, - msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000)); + secs_to_jiffies(HL_PENDING_RESET_PER_SEC)); } } @@ -857,6 +873,10 @@ static int device_early_init(struct hl_device *hdev) gaudi2_set_asic_funcs(hdev); strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name)); break; + case ASIC_GAUDI2D: + gaudi2_set_asic_funcs(hdev); + strscpy(hdev->asic_name, "GAUDI2D", sizeof(hdev->asic_name)); + break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", hdev->asic_type); @@ -945,6 +965,8 @@ static int device_early_init(struct hl_device *hdev) goto free_cb_mgr; } + INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat); + INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work, device_hard_reset_pending); hdev->device_reset_work.hdev = hdev; hdev->device_fini_pending = 0; @@ -967,7 +989,7 @@ static int device_early_init(struct hl_device *hdev) return 0; free_cb_mgr: - hl_mem_mgr_fini(&hdev->kernel_mem_mgr); + hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL); hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr); free_chip_info: kfree(hdev->hl_chip_info); @@ -1011,7 +1033,7 @@ static void device_early_fini(struct hl_device *hdev) mutex_destroy(&hdev->clk_throttling.lock); - hl_mem_mgr_fini(&hdev->kernel_mem_mgr); + hl_mem_mgr_fini(&hdev->kernel_mem_mgr, NULL); hl_mem_mgr_idr_destroy(&hdev->kernel_mem_mgr); kfree(hdev->hl_chip_info); @@ -1034,31 +1056,65 @@ static void device_early_fini(struct hl_device *hdev) static bool is_pci_link_healthy(struct hl_device *hdev) { - u16 vendor_id; + u16 device_id; if (!hdev->pdev) return false; - pci_read_config_word(hdev->pdev, PCI_VENDOR_ID, &vendor_id); + pci_read_config_word(hdev->pdev, PCI_DEVICE_ID, &device_id); - return (vendor_id == PCI_VENDOR_ID_HABANALABS); + return (device_id == hdev->pdev->device); } -static int hl_device_eq_heartbeat_check(struct hl_device *hdev) +static void stringify_time_of_last_heartbeat(struct hl_device *hdev, char *time_str, size_t size, + bool is_pq_hb) { + time64_t seconds = is_pq_hb ? 
hdev->heartbeat_debug_info.last_pq_heartbeat_ts + : hdev->heartbeat_debug_info.last_eq_heartbeat_ts; + struct tm tm; + + if (!seconds) + return; + + time64_to_tm(seconds, 0, &tm); + + snprintf(time_str, size, "%ld-%02d-%02d %02d:%02d:%02d (UTC)", + tm.tm_year + 1900, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec); +} + +static bool hl_device_eq_heartbeat_received(struct hl_device *hdev) +{ + struct eq_heartbeat_debug_info *heartbeat_debug_info = &hdev->heartbeat_debug_info; + u32 cpu_q_id = heartbeat_debug_info->cpu_queue_id, pq_pi_mask = (HL_QUEUE_LENGTH << 1) - 1; struct asic_fixed_properties *prop = &hdev->asic_prop; + char pq_time_str[64] = "N/A", eq_time_str[64] = "N/A"; if (!prop->cpucp_info.eq_health_check_supported) - return 0; + return true; - if (hdev->eq_heartbeat_received) { - hdev->eq_heartbeat_received = false; - } else { + if (!hdev->eq_heartbeat_received) { dev_err(hdev->dev, "EQ heartbeat event was not received!\n"); - return -EIO; + + stringify_time_of_last_heartbeat(hdev, pq_time_str, sizeof(pq_time_str), true); + stringify_time_of_last_heartbeat(hdev, eq_time_str, sizeof(eq_time_str), false); + dev_err(hdev->dev, + "EQ: {CI %u, HB counter %u, last HB time: %s}, PQ: {PI: %u, CI: %u (%u), last HB time: %s}\n", + hdev->event_queue.ci, + heartbeat_debug_info->heartbeat_event_counter, + eq_time_str, + hdev->kernel_queues[cpu_q_id].pi, + atomic_read(&hdev->kernel_queues[cpu_q_id].ci), + atomic_read(&hdev->kernel_queues[cpu_q_id].ci) & pq_pi_mask, + pq_time_str); + + hl_eq_dump(hdev, &hdev->event_queue); + + return false; } - return 0; + hdev->eq_heartbeat_received = false; + + return true; } static void hl_device_heartbeat(struct work_struct *work) @@ -1077,7 +1133,7 @@ static void hl_device_heartbeat(struct work_struct *work) * in order to validate the eq is working. * Only if both the EQ is healthy and we managed to send the next heartbeat reschedule. */ - if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev))) + if (hl_device_eq_heartbeat_received(hdev) && (!hdev->asic_funcs->send_heartbeat(hdev))) goto reschedule; if (hl_device_operational(hdev, NULL)) @@ -1131,21 +1187,6 @@ static int device_late_init(struct hl_device *hdev) } hdev->high_pll = hdev->asic_prop.high_pll; - - if (hdev->heartbeat) { - /* - * Before scheduling the heartbeat driver will check if eq event has received. - * for the first schedule we need to set the indication as true then for the next - * one this indication will be true only if eq event was sent by FW. 
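The handshake described in the comment above, and checked by the new hl_device_eq_heartbeat_received(), amounts to: the driver arms the flag when the heartbeat work is scheduled, the firmware's EQ heartbeat event re-arms it, and each periodic check consumes it; a check that finds the flag already clear means the firmware missed a beat. A minimal single-threaded model of that state machine (real driver state, work scheduling and locking are intentionally omitted):

#include <stdbool.h>
#include <stdio.h>

static bool eq_heartbeat_received;

/* Called when the heartbeat work is (re)scheduled */
static void heartbeat_schedule(void)
{
	/* Arm the flag so the very first check cannot fail spuriously */
	eq_heartbeat_received = true;
}

/* Called from the EQ path when FW sends the heartbeat event */
static void eq_heartbeat_event(void)
{
	eq_heartbeat_received = true;
}

/* Called by the periodic heartbeat worker */
static bool heartbeat_check(void)
{
	if (!eq_heartbeat_received)
		return false;		/* FW missed a beat -> reset path */

	eq_heartbeat_received = false;	/* consume; FW must set it again */
	return true;
}

int main(void)
{
	heartbeat_schedule();

	eq_heartbeat_event();
	printf("tick 1: %s\n", heartbeat_check() ? "alive" : "missed");

	/* no FW event before the next tick */
	printf("tick 2: %s\n", heartbeat_check() ? "alive" : "missed");
	return 0;
}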
- */ - hdev->eq_heartbeat_received = true; - - INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat); - - schedule_delayed_work(&hdev->work_heartbeat, - usecs_to_jiffies(HL_HEARTBEAT_PER_USEC)); - } - hdev->late_init_done = true; return 0; @@ -1162,9 +1203,6 @@ static void device_late_fini(struct hl_device *hdev) if (!hdev->late_init_done) return; - if (hdev->heartbeat) - cancel_delayed_work_sync(&hdev->work_heartbeat); - if (hdev->asic_funcs->late_fini) hdev->asic_funcs->late_fini(hdev); @@ -1265,8 +1303,12 @@ static void hl_abort_waiting_for_completions(struct hl_device *hdev) static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset, bool skip_wq_flush) { - if (hard_reset) + if (hard_reset) { + if (hdev->heartbeat) + cancel_delayed_work_sync(&hdev->work_heartbeat); + device_late_fini(hdev); + } /* * Halt the engines and disable interrupts so we won't get any more @@ -1494,15 +1536,14 @@ static void send_disable_pci_access(struct hl_device *hdev, u32 flags) * of heartbeat, the device CPU is marked as disable * so this message won't be sent */ - if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) { - dev_warn(hdev->dev, "Failed to disable FW's PCI access\n"); + if (hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0)) return; - } - /* verify that last EQs are handled before disabled is set */ + /* disable_irq also generates sync irq, this verifies that last EQs are handled + * before disabled is set. The IRQ will be enabled again in request_irq call. + */ if (hdev->cpu_queues_enable) - synchronize_irq(pci_irq_vector(hdev->pdev, - hdev->asic_prop.eq_interrupt_id)); + disable_irq(pci_irq_vector(hdev->pdev, hdev->asic_prop.eq_interrupt_id)); } } @@ -1546,6 +1587,31 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags) } } +static void reset_heartbeat_debug_info(struct hl_device *hdev) +{ + hdev->heartbeat_debug_info.last_pq_heartbeat_ts = 0; + hdev->heartbeat_debug_info.last_eq_heartbeat_ts = 0; + hdev->heartbeat_debug_info.heartbeat_event_counter = 0; +} + +static inline void device_heartbeat_schedule(struct hl_device *hdev) +{ + if (!hdev->heartbeat) + return; + + reset_heartbeat_debug_info(hdev); + + /* + * Before scheduling the heartbeat driver will check if eq event has received. + * for the first schedule we need to set the indication as true then for the next + * one this indication will be true only if eq event was sent by FW. + */ + hdev->eq_heartbeat_received = true; + + schedule_delayed_work(&hdev->work_heartbeat, + usecs_to_jiffies(HL_HEARTBEAT_PER_USEC)); +} + /* * hl_device_reset - reset the device * @@ -1768,14 +1834,16 @@ kill_processes: hdev->device_cpu_disabled = false; hdev->reset_info.hard_reset_pending = false; + /* + * Put the device in an unusable state if there are 2 back to back resets due to + * fatal errors. 
+ */ if (hdev->reset_info.reset_trigger_repeated && - (hdev->reset_info.prev_reset_trigger == - HL_DRV_RESET_FW_FATAL_ERR)) { - /* if there 2 back to back resets from FW, - * ensure driver puts the driver in a unusable state - */ + (hdev->reset_info.prev_reset_trigger == HL_DRV_RESET_FW_FATAL_ERR || + hdev->reset_info.prev_reset_trigger == + HL_DRV_RESET_HEARTBEAT)) { dev_crit(hdev->dev, - "%s Consecutive FW fatal errors received, stopping hard reset\n", + "%s Consecutive fatal errors, stopping hard reset\n", dev_name(&(hdev)->pdev->dev)); rc = -EIO; goto out_err; @@ -1913,6 +1981,8 @@ kill_processes: if (hard_reset) { hdev->reset_info.hard_reset_cnt++; + device_heartbeat_schedule(hdev); + /* After reset is done, we are ready to receive events from * the F/W. We can't do it before because we will ignore events * and if those events are fatal, we won't know about it and @@ -2347,6 +2417,12 @@ int hl_device_init(struct hl_device *hdev) goto out_disabled; } + /* Scheduling the EQ heartbeat thread must come after driver is done with all + * initializations, as we want to make sure the FW gets enough time to be prepared + * to respond to heartbeat packets. + */ + device_heartbeat_schedule(hdev); + dev_notice(hdev->dev, "Successfully added device %s to habanalabs driver\n", dev_name(&(hdev)->pdev->dev)); @@ -2589,7 +2665,7 @@ inline u32 hl_rreg(struct hl_device *hdev, u32 reg) u32 val = readl(hdev->rmmio + reg); if (unlikely(trace_habanalabs_rreg32_enabled())) - trace_habanalabs_rreg32(hdev->dev, reg, val); + trace_habanalabs_rreg32(&(hdev)->pdev->dev, reg, val); return val; } @@ -2607,7 +2683,7 @@ inline u32 hl_rreg(struct hl_device *hdev, u32 reg) inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val) { if (unlikely(trace_habanalabs_wreg32_enabled())) - trace_habanalabs_wreg32(hdev->dev, reg, val); + trace_habanalabs_wreg32(&(hdev)->pdev->dev, reg, val); writel(val, hdev->rmmio + reg); } @@ -2801,3 +2877,88 @@ void hl_enable_err_info_capture(struct hl_error_info *captured_err_info) atomic_set(&captured_err_info->cs_timeout.write_enable, 1); captured_err_info->undef_opcode.write_enable = true; } + +void hl_init_cpu_for_irq(struct hl_device *hdev) +{ +#ifdef CONFIG_NUMA + struct cpumask *available_mask = &hdev->irq_affinity_mask; + int numa_node = hdev->pdev->dev.numa_node, i; + static struct cpumask cpu_mask; + + if (numa_node < 0) + return; + + if (!cpumask_and(&cpu_mask, cpumask_of_node(numa_node), cpu_online_mask)) { + dev_err(hdev->dev, "No available affinities in current numa node\n"); + return; + } + + /* Remove HT siblings */ + for_each_cpu(i, &cpu_mask) + cpumask_set_cpu(cpumask_first(topology_sibling_cpumask(i)), available_mask); +#endif +} + +void hl_set_irq_affinity(struct hl_device *hdev, int irq) +{ + if (cpumask_empty(&hdev->irq_affinity_mask)) { + dev_dbg(hdev->dev, "affinity mask is empty\n"); + return; + } + + if (irq_set_affinity_and_hint(irq, &hdev->irq_affinity_mask)) + dev_err(hdev->dev, "Failed setting irq %d affinity\n", irq); +} + +void hl_eq_heartbeat_event_handle(struct hl_device *hdev) +{ + hdev->heartbeat_debug_info.heartbeat_event_counter++; + hdev->heartbeat_debug_info.last_eq_heartbeat_ts = ktime_get_real_seconds(); + hdev->eq_heartbeat_received = true; +} + +void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *event_mask) +{ + struct hl_clk_throttle *clk_throttle = &hdev->clk_throttling; + ktime_t zero_time = ktime_set(0, 0); + + mutex_lock(&clk_throttle->lock); + + switch (event_type) { + case EQ_EVENT_POWER_EVT_START: + 
clk_throttle->current_reason |= HL_CLK_THROTTLE_POWER; + clk_throttle->aggregated_reason |= HL_CLK_THROTTLE_POWER; + clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get(); + clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time; + dev_dbg_ratelimited(hdev->dev, "Clock throttling due to power consumption\n"); + break; + + case EQ_EVENT_POWER_EVT_END: + clk_throttle->current_reason &= ~HL_CLK_THROTTLE_POWER; + clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get(); + dev_dbg_ratelimited(hdev->dev, "Power envelop is safe, back to optimal clock\n"); + break; + + case EQ_EVENT_THERMAL_EVT_START: + clk_throttle->current_reason |= HL_CLK_THROTTLE_THERMAL; + clk_throttle->aggregated_reason |= HL_CLK_THROTTLE_THERMAL; + clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get(); + clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time; + *event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + dev_info_ratelimited(hdev->dev, "Clock throttling due to overheating\n"); + break; + + case EQ_EVENT_THERMAL_EVT_END: + clk_throttle->current_reason &= ~HL_CLK_THROTTLE_THERMAL; + clk_throttle->timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get(); + *event_mask |= HL_NOTIFIER_EVENT_USER_ENGINE_ERR; + dev_info_ratelimited(hdev->dev, "Thermal envelop is safe, back to optimal clock\n"); + break; + + default: + dev_err(hdev->dev, "Received invalid clock change event %d\n", event_type); + break; + } + + mutex_unlock(&clk_throttle->lock); +} diff --git a/drivers/accel/habanalabs/common/firmware_if.c b/drivers/accel/habanalabs/common/firmware_if.c index 3558a6a8e192..eeb6b2a80fc7 100644 --- a/drivers/accel/habanalabs/common/firmware_if.c +++ b/drivers/accel/habanalabs/common/firmware_if.c @@ -8,6 +8,7 @@ #include "habanalabs.h" #include <linux/habanalabs/hl_boot_if.h> +#include <linux/pci.h> #include <linux/firmware.h> #include <linux/crc32.h> #include <linux/slab.h> @@ -40,6 +41,31 @@ static char *comms_sts_str_arr[COMMS_STS_INVLD_LAST] = { [COMMS_STS_TIMEOUT_ERR] = __stringify(COMMS_STS_TIMEOUT_ERR), }; +/** + * hl_fw_version_cmp() - compares the FW version to a specific version + * + * @hdev: pointer to hl_device structure + * @major: major number of a reference version + * @minor: minor number of a reference version + * @subminor: sub-minor number of a reference version + * + * Return 1 if FW version greater than the reference version, -1 if it's + * smaller and 0 if versions are identical. + */ +int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor) +{ + if (hdev->fw_sw_major_ver != major) + return (hdev->fw_sw_major_ver > major) ? 1 : -1; + + if (hdev->fw_sw_minor_ver != minor) + return (hdev->fw_sw_minor_ver > minor) ? 1 : -1; + + if (hdev->fw_sw_sub_minor_ver != subminor) + return (hdev->fw_sw_sub_minor_ver > subminor) ? 
1 : -1; + + return 0; +} + static char *extract_fw_ver_from_str(const char *fw_str) { char *str, *fw_ver, *whitespace; @@ -345,43 +371,63 @@ int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value) { struct cpucp_packet pkt = {}; + int rc; pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT); pkt.value = cpu_to_le64(value); - return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + if (rc) + dev_err(hdev->dev, "Failed to disable FW's PCI access\n"); + + return rc; } +/** + * hl_fw_send_cpu_message() - send CPU message to the device. + * + * @hdev: pointer to hl_device structure. + * @hw_queue_id: HW queue ID + * @msg: raw data of the message/packet + * @size: size of @msg in bytes + * @timeout_us: timeout in usec to wait for CPU reply on the message + * @result: return code reported by FW + * + * send message to the device CPU. + * + * Return: 0 on success, non-zero for failure. + * -ENOMEM: memory allocation failure + * -EAGAIN: CPU is disabled (try again when enabled) + * -ETIMEDOUT: timeout waiting for FW response + * -EIO: protocol error + */ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, - u16 len, u32 timeout, u64 *result) + u16 size, u32 timeout_us, u64 *result) { struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id]; struct asic_fixed_properties *prop = &hdev->asic_prop; + u32 tmp, expected_ack_val, pi, opcode; struct cpucp_packet *pkt; dma_addr_t pkt_dma_addr; struct hl_bd *sent_bd; - u32 tmp, expected_ack_val, pi, opcode; - int rc; + int rc = 0, fw_rc; - pkt = hl_cpu_accessible_dma_pool_alloc(hdev, len, &pkt_dma_addr); + pkt = hl_cpu_accessible_dma_pool_alloc(hdev, size, &pkt_dma_addr); if (!pkt) { - dev_err(hdev->dev, - "Failed to allocate DMA memory for packet to CPU\n"); + dev_err(hdev->dev, "Failed to allocate DMA memory for packet to CPU\n"); return -ENOMEM; } - memcpy(pkt, msg, len); + memcpy(pkt, msg, size); mutex_lock(&hdev->send_cpu_message_lock); /* CPU-CP messages can be sent during soft-reset */ - if (hdev->disabled && !hdev->reset_info.in_compute_reset) { - rc = 0; + if (hdev->disabled && !hdev->reset_info.in_compute_reset) goto out; - } if (hdev->device_cpu_disabled) { - rc = -EIO; + rc = -EAGAIN; goto out; } @@ -397,7 +443,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, * Which means that we don't need to lock the access to the entire H/W * queues module when submitting a JOB to the CPU queue. */ - hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), len, pkt_dma_addr); + hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), size, pkt_dma_addr); if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN) expected_ack_val = queue->pi; @@ -406,7 +452,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp, (tmp == expected_ack_val), 1000, - timeout, true); + timeout_us, true); hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); @@ -414,19 +460,27 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, /* If FW performed reset just before sending it a packet, we will get a timeout. * This is expected behavior, hence no need for error message. 
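The hl_fw_version_cmp() helper added above orders versions field by field (major, then minor, then sub-minor), so callers can gate features with ordinary comparisons, e.g. hl_fw_version_cmp(hdev, 1, 13, 0) >= 0 for "firmware at least 1.13.0" (that threshold is arbitrary here). A standalone sketch of the same ordering with made-up version numbers:

#include <stdio.h>
#include <stdint.h>

/* Same field-by-field ordering as the driver helper, modelled in userspace */
static int fw_version_cmp(uint32_t cur_major, uint32_t cur_minor, uint32_t cur_sub,
			  uint32_t major, uint32_t minor, uint32_t subminor)
{
	if (cur_major != major)
		return cur_major > major ? 1 : -1;
	if (cur_minor != minor)
		return cur_minor > minor ? 1 : -1;
	if (cur_sub != subminor)
		return cur_sub > subminor ? 1 : -1;
	return 0;
}

int main(void)
{
	/* Hypothetical "currently loaded" FW version */
	uint32_t maj = 1, min = 12, sub = 5;

	printf("1.12.5 vs 1.13.0 -> %d\n", fw_version_cmp(maj, min, sub, 1, 13, 0)); /* -1 */
	printf("1.12.5 vs 1.12.5 -> %d\n", fw_version_cmp(maj, min, sub, 1, 12, 5)); /*  0 */
	printf("1.12.5 vs 1.11.9 -> %d\n", fw_version_cmp(maj, min, sub, 1, 11, 9)); /*  1 */
	return 0;
}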
*/ - if (!hl_device_operational(hdev, NULL) && !hdev->reset_info.in_compute_reset) + if (!hl_device_operational(hdev, NULL) && !hdev->reset_info.in_compute_reset) { dev_dbg(hdev->dev, "Device CPU packet timeout (0x%x) due to FW reset\n", tmp); - else - dev_err(hdev->dev, "Device CPU packet timeout (status = 0x%x)\n", tmp); + } else { + struct hl_bd *bd = queue->kernel_address; + + bd += hl_pi_2_offset(pi); + + dev_err(hdev->dev, "Device CPU packet timeout (status = 0x%x)\n" + "Pkt info[%u]: dma_addr: 0x%llx, kernel_addr: %p, len:0x%x, ctl: 0x%x, ptr:0x%llx, dram_bd:%u\n", + tmp, pi, pkt_dma_addr, (void *)pkt, bd->len, bd->ctl, bd->ptr, + queue->dram_bd); + } hdev->device_cpu_disabled = true; goto out; } tmp = le32_to_cpu(pkt->ctl); - rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT; - if (rc) { + fw_rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT; + if (fw_rc) { opcode = (tmp & CPUCP_PKT_CTL_OPCODE_MASK) >> CPUCP_PKT_CTL_OPCODE_SHIFT; if (!prop->supports_advanced_cpucp_rc) { @@ -435,7 +489,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, goto scrub_descriptor; } - switch (rc) { + switch (fw_rc) { case cpucp_packet_invalid: dev_err(hdev->dev, "CPU packet %d is not supported by F/W\n", opcode); @@ -460,7 +514,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, /* propagate the return code from the f/w to the callers who want to check it */ if (result) - *result = rc; + *result = fw_rc; rc = -EIO; @@ -480,7 +534,7 @@ scrub_descriptor: out: mutex_unlock(&hdev->send_cpu_message_lock); - hl_cpu_accessible_dma_pool_free(hdev, len, pkt); + hl_cpu_accessible_dma_pool_free(hdev, size, pkt); return rc; } @@ -501,7 +555,7 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type) 0, &result); if (rc) - dev_err(hdev->dev, "failed to unmask RAZWI IRQ %d", event_type); + dev_err(hdev->dev, "failed to unmask event %d", event_type); return rc; } @@ -540,7 +594,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr, total_pkt_size, 0, &result); if (rc) - dev_err(hdev->dev, "failed to unmask IRQ array\n"); + dev_err(hdev->dev, "failed to unmask event array\n"); kfree(pkt); @@ -550,7 +604,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr, int hl_fw_test_cpu_queue(struct hl_device *hdev) { struct cpucp_packet test_pkt = {}; - u64 result; + u64 result = 0; int rc; test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << @@ -623,16 +677,14 @@ int hl_fw_send_device_activity(struct hl_device *hdev, bool open) int hl_fw_send_heartbeat(struct hl_device *hdev) { struct cpucp_packet hb_pkt; - u64 result; + u64 result = 0; int rc; memset(&hb_pkt, 0, sizeof(hb_pkt)); - hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << - CPUCP_PKT_CTL_OPCODE_SHIFT); + hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST << CPUCP_PKT_CTL_OPCODE_SHIFT); hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, - sizeof(hb_pkt), 0, &result); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt, sizeof(hb_pkt), 0, &result); if ((rc) || (result != CPUCP_PACKET_FENCE_VAL)) return -EIO; @@ -643,6 +695,8 @@ int hl_fw_send_heartbeat(struct hl_device *hdev) rc = -EIO; } + hdev->heartbeat_debug_info.last_pq_heartbeat_ts = ktime_get_real_seconds(); + return rc; } @@ -885,7 +939,7 @@ static int hl_fw_send_msi_info_msg(struct hl_device *hdev) { struct cpucp_array_data_packet *pkt; size_t total_pkt_size, data_size; - u64 result; + u64 result = 0; int rc; /* skip sending 
this info for unsupported ASICs */ @@ -976,11 +1030,10 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_EEPROM_TIMEOUT_USEC, &result); - if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP EEPROM packet, error %d\n", - rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP EEPROM packet, error %d\n", rc); goto out; } @@ -1021,7 +1074,9 @@ int hl_fw_get_monitor_dump(struct hl_device *hdev, void *data) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_MON_DUMP_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc); goto out; } @@ -1055,8 +1110,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); return rc; } counters->rx_throughput = result; @@ -1070,8 +1126,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); return rc; } counters->tx_throughput = result; @@ -1084,8 +1141,9 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP PCI info pkt, error %d\n", rc); return rc; } counters->replay_cnt = (u32) result; @@ -1105,9 +1163,9 @@ int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to handle CpuCP total energy pkt, error %d\n", - rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CpuCP total energy pkt, error %d\n", rc); return rc; } @@ -1183,7 +1241,8 @@ int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u32 pll_index, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc); return rc; } @@ -1210,7 +1269,8 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, "Failed to read power, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to read power, error %d\n", rc); return rc; } @@ -1247,8 +1307,9 @@ int hl_fw_dram_replaced_row_get(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); if (rc) { - dev_err(hdev->dev, - "Failed to 
handle CPU-CP replaced rows info pkt, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc); goto out; } @@ -1273,7 +1334,8 @@ int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num) rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); if (rc) { - dev_err(hdev->dev, + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to handle CPU-CP pending rows info pkt, error %d\n", rc); goto out; } @@ -1428,7 +1490,7 @@ int hl_fw_wait_preboot_ready(struct hl_device *hdev) { struct pre_fw_load_props *pre_fw_load = &hdev->fw_loader.pre_fw_load; u32 status = 0, timeout; - int rc, tries = 1; + int rc, tries = 1, fw_err = 0; bool preboot_still_runs; /* Need to check two possible scenarios: @@ -1468,18 +1530,18 @@ retry: } } - if (rc) { + /* If we read all FF, then something is totally wrong, no point + * of reading specific errors + */ + if (status != -1) + fw_err = fw_read_errors(hdev, pre_fw_load->boot_err0_reg, + pre_fw_load->boot_err1_reg, + pre_fw_load->sts_boot_dev_sts0_reg, + pre_fw_load->sts_boot_dev_sts1_reg); + if (rc || fw_err) { detect_cpu_boot_status(hdev, status); - dev_err(hdev->dev, "CPU boot ready timeout (status = %d)\n", status); - - /* If we read all FF, then something is totally wrong, no point - * of reading specific errors - */ - if (status != -1) - fw_read_errors(hdev, pre_fw_load->boot_err0_reg, - pre_fw_load->boot_err1_reg, - pre_fw_load->sts_boot_dev_sts0_reg, - pre_fw_load->sts_boot_dev_sts1_reg); + dev_err(hdev->dev, "CPU boot %s (status = %d)\n", + fw_err ? "failed due to an error" : "ready timeout", status); return -EIO; } @@ -1750,7 +1812,7 @@ static void hl_fw_dynamic_send_cmd(struct hl_device *hdev, val = FIELD_PREP(COMMS_COMMAND_CMD_MASK, cmd); val |= FIELD_PREP(COMMS_COMMAND_SIZE_MASK, size); - trace_habanalabs_comms_send_cmd(hdev->dev, comms_cmd_str_arr[cmd]); + trace_habanalabs_comms_send_cmd(&hdev->pdev->dev, comms_cmd_str_arr[cmd]); WREG32(le32_to_cpu(dyn_regs->kmd_msg_to_cpu), val); } @@ -1808,7 +1870,7 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev, dyn_regs = &fw_loader->dynamic_loader.comm_desc.cpu_dyn_regs; - trace_habanalabs_comms_wait_status(hdev->dev, comms_sts_str_arr[expected_status]); + trace_habanalabs_comms_wait_status(&hdev->pdev->dev, comms_sts_str_arr[expected_status]); /* Wait for expected status */ rc = hl_poll_timeout( @@ -1825,7 +1887,8 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev, return -EIO; } - trace_habanalabs_comms_wait_status_done(hdev->dev, comms_sts_str_arr[expected_status]); + trace_habanalabs_comms_wait_status_done(&hdev->pdev->dev, + comms_sts_str_arr[expected_status]); /* * skip storing FW response for NOOP to preserve the actual desired @@ -1899,7 +1962,7 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev, { int rc; - trace_habanalabs_comms_protocol_cmd(hdev->dev, comms_cmd_str_arr[cmd]); + trace_habanalabs_comms_protocol_cmd(&hdev->pdev->dev, comms_cmd_str_arr[cmd]); /* first send clear command to clean former commands */ rc = hl_fw_dynamic_send_clear_cmd(hdev, fw_loader); @@ -2038,7 +2101,7 @@ static int hl_fw_dynamic_validate_descriptor(struct hl_device *hdev, * note that no alignment/stride address issues here as all structures * are 64 bit padded. 
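The validation code that follows this comment treats the firmware descriptor as a fixed header followed by a variable-size payload: the payload pointer is the descriptor base plus sizeof(header), the payload length comes from a little-endian size field in the header, and a CRC32 over that payload is compared against what the firmware reported. A minimal layout model is below; the struct fields and the checksum routine are illustrative stand-ins, not the driver's real comms_msg_header or CRC32 implementation:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Illustrative header - not the real comms_msg_header layout */
struct msg_header {
	uint32_t magic;
	uint16_t size;		/* payload size in bytes (le16 on the wire) */
	uint16_t pad;
	uint32_t crc32;		/* CRC over the payload that follows */
};

struct fw_desc {
	struct msg_header header;
	uint8_t payload[32];
};

/* Trivial stand-in for the driver's CRC32 helper */
static uint32_t toy_crc(const uint8_t *p, size_t len)
{
	uint32_t sum = 0;

	while (len--)
		sum = sum * 31 + *p++;
	return sum;
}

int main(void)
{
	struct fw_desc desc = { 0 };
	const uint8_t *data_ptr;
	uint16_t data_size;

	/* Pretend the firmware filled in the descriptor */
	strcpy((char *)desc.payload, "comms payload");
	desc.header.size = 14;	/* strlen + NUL */
	desc.header.crc32 = toy_crc(desc.payload, desc.header.size);

	/* Same arithmetic as the driver: payload starts right after the header */
	data_ptr = (const uint8_t *)&desc + sizeof(struct msg_header);
	data_size = desc.header.size;	/* le16_to_cpu() in the driver */

	printf("payload CRC %s\n",
	       toy_crc(data_ptr, data_size) == desc.header.crc32 ? "matches" : "mismatch");
	return 0;
}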
*/ - data_ptr = (u8 *)fw_desc + sizeof(struct comms_desc_header); + data_ptr = (u8 *)fw_desc + sizeof(struct comms_msg_header); data_size = le16_to_cpu(fw_desc->header.size); data_crc32 = hl_fw_compat_crc32(data_ptr, data_size); @@ -2192,11 +2255,11 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev, memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc)); fw_data_size = le16_to_cpu(fw_desc->header.size); - temp_fw_desc = vzalloc(sizeof(struct comms_desc_header) + fw_data_size); + temp_fw_desc = vzalloc(sizeof(struct comms_msg_header) + fw_data_size); if (!temp_fw_desc) return -ENOMEM; - memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_desc_header) + fw_data_size); + memcpy_fromio(temp_fw_desc, src, sizeof(struct comms_msg_header) + fw_data_size); rc = hl_fw_dynamic_validate_descriptor(hdev, fw_loader, (struct lkd_fw_comms_desc *) temp_fw_desc); @@ -2718,18 +2781,20 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; } + rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, sizeof(struct lkd_msg_comms)); + if (rc) + goto protocol_err; + + if (hdev->asic_prop.support_dynamic_resereved_fw_size) + hdev->asic_prop.reserved_fw_mem_size = + le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb) * SZ_1M; + if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) { struct lkd_fw_binning_info *binning_info; - rc = hl_fw_dynamic_request_descriptor(hdev, fw_loader, - sizeof(struct lkd_msg_comms)); - if (rc) - goto protocol_err; - /* read preboot version */ rc = hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT, fw_loader->dynamic_loader.comm_desc.cur_fw_ver); - if (rc) return rc; @@ -2756,11 +2821,6 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, hdev->decoder_binning, hdev->rotator_binning); } - if (hdev->asic_prop.support_dynamic_resereved_fw_size) { - hdev->asic_prop.reserved_fw_mem_size = - le32_to_cpu(fw_loader->dynamic_loader.comm_desc.rsvd_mem_size_mb); - } - return 0; } @@ -2795,7 +2855,7 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev, hdev->asic_funcs->init_cpu_scrambler_dram(hdev); if (!(hdev->fw_components & FW_TYPE_LINUX)) { - dev_info(hdev->dev, "Skip loading Linux F/W\n"); + dev_dbg(hdev->dev, "Skip loading Linux F/W\n"); return 0; } @@ -3125,10 +3185,10 @@ long hl_fw_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr) pkt.pll_index = cpu_to_le32((u32)used_pll_idx); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); - if (rc) { - dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n", - used_pll_idx, rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to get frequency of PLL %d, error %d\n", + used_pll_idx, rc); return rc; } @@ -3152,8 +3212,7 @@ void hl_fw_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq) pkt.value = cpu_to_le64(freq); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); - - if (rc) + if (rc && rc != -EAGAIN) dev_err(hdev->dev, "Failed to set frequency to PLL %d, error %d\n", used_pll_idx, rc); } @@ -3169,9 +3228,9 @@ long hl_fw_get_max_power(struct hl_device *hdev) pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET << CPUCP_PKT_CTL_OPCODE_SHIFT); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result); - if (rc) { - dev_err(hdev->dev, "Failed to get max power, error %d\n", rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, "Failed to get max power, error %d\n", rc); return rc; } @@ -3193,8 +3252,7 @@ 
void hl_fw_set_max_power(struct hl_device *hdev) pkt.value = cpu_to_le64(hdev->max_power); rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); - - if (rc) + if (rc && rc != -EAGAIN) dev_err(hdev->dev, "Failed to set max power, error %d\n", rc); } @@ -3220,11 +3278,11 @@ static int hl_fw_get_sec_attest_data(struct hl_device *hdev, u32 packet_id, void pkt.data_max_size = cpu_to_le32(size); pkt.nonce = cpu_to_le32(nonce); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), - timeout, NULL); + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), timeout, NULL); if (rc) { - dev_err(hdev->dev, - "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc); + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to handle CPU-CP pkt %u, error %d\n", packet_id, rc); goto out; } @@ -3266,10 +3324,12 @@ int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *)&pkt, sizeof(pkt), HL_CPUCP_INFO_TIMEOUT_USEC, &result); - if (rc) - dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n"); - else + if (rc) { + if (rc != -EAGAIN) + dev_err(hdev->dev, "failed to send CPUCP data of generic fw pkt\n"); + } else { dev_dbg(hdev->dev, "generic pkt was successful, result: 0x%llx\n", result); + } *size = (u32)result; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 2a900c9941fe..6f27ce4fa01b 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -71,7 +71,7 @@ struct hl_fpriv; #define HL_DEVICE_TIMEOUT_USEC 1000000 /* 1 s */ -#define HL_HEARTBEAT_PER_USEC 5000000 /* 5 s */ +#define HL_HEARTBEAT_PER_USEC 10000000 /* 10 s */ #define HL_PLL_LOW_JOB_FREQ_USEC 5000000 /* 5 s */ @@ -443,18 +443,22 @@ enum hl_collective_mode { * a CB handle can be provided for jobs on this queue. * Otherwise, a CB address must be provided. * @collective_mode: collective mode of current queue + * @q_dram_bd_address: PQ dram address, used when PQ need to reside in DRAM. * @driver_only: true if only the driver is allowed to send a job to this queue, * false otherwise. * @binned: True if the queue is binned out and should not be used * @supports_sync_stream: True if queue supports sync stream + * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram */ struct hw_queue_properties { enum hl_queue_type type; enum queue_cb_alloc_flags cb_alloc_flags; enum hl_collective_mode collective_mode; + u64 q_dram_bd_address; u8 driver_only; u8 binned; u8 supports_sync_stream; + u8 dram_bd; }; /** @@ -590,8 +594,6 @@ struct hl_hints_range { * we display to the user * @mmu_pgt_size: MMU page tables total size. * @mmu_pte_size: PTE size in MMU page tables. - * @mmu_hop_table_size: MMU hop table size. - * @mmu_hop0_tables_total_size: total size of MMU hop0 tables. * @dram_page_size: The DRAM physical page size. * @cfg_size: configuration space size on SRAM. * @sram_size: total size of SRAM. @@ -645,10 +647,12 @@ struct hl_hints_range { * @num_engine_cores: number of engine cpu cores. * @max_num_of_engines: maximum number of all engines in the ASIC. * @num_of_special_blocks: special_blocks array size. - * @glbl_err_cause_num: global err cause number. + * @glbl_err_max_cause_num: global err max cause number. * @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is * not supported. 
- * @reserved_fw_mem_size: size in MB of dram memory reserved for FW. + * @reserved_fw_mem_size: size of dram memory reserved for FW. + * @fw_event_queue_size: queue size for events from CPU-CP. + * A value of 0 means using the default HL_EQ_SIZE_IN_BYTES value. * @collective_first_sob: first sync object available for collective use * @collective_first_mon: first monitor available for collective use * @sync_stream_first_sob: first sync object available for sync stream use @@ -743,8 +747,6 @@ struct asic_fixed_properties { u32 clk_pll_index; u32 mmu_pgt_size; u32 mmu_pte_size; - u32 mmu_hop_table_size; - u32 mmu_hop0_tables_total_size; u32 dram_page_size; u32 cfg_size; u32 sram_size; @@ -779,9 +781,10 @@ struct asic_fixed_properties { u32 num_engine_cores; u32 max_num_of_engines; u32 num_of_special_blocks; - u32 glbl_err_cause_num; + u32 glbl_err_max_cause_num; u32 hbw_flush_reg; u32 reserved_fw_mem_size; + u32 fw_event_queue_size; u16 collective_first_sob; u16 collective_first_mon; u16 sync_stream_first_sob; @@ -902,6 +905,18 @@ struct hl_mem_mgr { }; /** + * struct hl_mem_mgr_fini_stats - describes statistics returned during memory manager teardown. + * @n_busy_cb: the amount of CB handles that could not be removed + * @n_busy_ts: the amount of TS handles that could not be removed + * @n_busy_other: the amount of any other type of handles that could not be removed + */ +struct hl_mem_mgr_fini_stats { + u32 n_busy_cb; + u32 n_busy_ts; + u32 n_busy_other; +}; + +/** * struct hl_mmap_mem_buf_behavior - describes unified memory manager buffer behavior * @topic: string identifier used for logging * @mem_id: memory type identifier, embedded in the handle and used to identify @@ -1052,6 +1067,8 @@ struct hl_encaps_signals_mgr { * @collective_mode: collective mode of current queue * @kernel_address: holds the queue's kernel virtual address. * @bus_address: holds the queue's DMA address. + * @pq_dram_address: hold the dram address when the PQ is allocated, used when dram_bd is true in + * queue properites. * @pi: holds the queue's pi value. * @ci: holds the queue's ci value, AS CALCULATED BY THE DRIVER (not real ci). * @hw_queue_id: the id of the H/W queue. @@ -1061,6 +1078,7 @@ struct hl_encaps_signals_mgr { * @valid: is the queue valid (we have array of 32 queues, not all of them * exist). * @supports_sync_stream: True if queue supports sync stream + * @dram_bd: True if the bd should be copied to dram, needed for PQ which has been allocated on dram */ struct hl_hw_queue { struct hl_cs_job **shadow_queue; @@ -1069,6 +1087,7 @@ struct hl_hw_queue { enum hl_collective_mode collective_mode; void *kernel_address; dma_addr_t bus_address; + u64 pq_dram_address; u32 pi; atomic_t ci; u32 hw_queue_id; @@ -1077,6 +1096,7 @@ struct hl_hw_queue { u16 int_queue_len; u8 valid; u8 supports_sync_stream; + u8 dram_bd; }; /** @@ -1224,6 +1244,7 @@ struct hl_user_pending_interrupt { * @hdev: pointer to the device structure * @kernel_address: holds the queue's kernel virtual address * @bus_address: holds the queue's DMA address + * @size: the event queue size * @ci: ci inside the queue * @prev_eqe_index: the index of the previous event queue entry. The index of * the current entry's index must be +1 of the previous one. @@ -1235,6 +1256,7 @@ struct hl_eq { struct hl_device *hdev; void *kernel_address; dma_addr_t bus_address; + u32 size; u32 ci; u32 prev_eqe_index; bool check_eqe_index; @@ -1263,15 +1285,18 @@ struct hl_dec { * @ASIC_GAUDI2: Gaudi2 device. * @ASIC_GAUDI2B: Gaudi2B device. 
* @ASIC_GAUDI2C: Gaudi2C device. + * @ASIC_GAUDI2D: Gaudi2D device. */ enum hl_asic_type { ASIC_INVALID, + ASIC_GOYA, ASIC_GAUDI, ASIC_GAUDI_SEC, ASIC_GAUDI2, ASIC_GAUDI2B, ASIC_GAUDI2C, + ASIC_GAUDI2D, }; struct hl_cs_parser; @@ -2547,7 +2572,7 @@ struct hl_state_dump_specs { * DEVICES */ -#define HL_STR_MAX 32 +#define HL_STR_MAX 64 #define HL_DEV_STS_MAX (HL_DEVICE_STATUS_LAST + 1) @@ -2704,11 +2729,16 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); * updated directly by the device. If false, the host memory being polled will * be updated by host CPU. Required so host knows whether or not the memory * might need to be byte-swapped before returning value to caller. + * + * On the first 4 polling iterations the macro goes to sleep for short period of + * time that gradually increases and reaches sleep_us on the fifth iteration. */ #define hl_poll_timeout_memory(hdev, addr, val, cond, sleep_us, timeout_us, \ mem_written_by_device) \ ({ \ + u64 __sleep_step_us; \ ktime_t __timeout; \ + u8 __step = 8; \ \ __timeout = ktime_add_us(ktime_get(), timeout_us); \ might_sleep_if(sleep_us); \ @@ -2726,8 +2756,10 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val); (val) = le32_to_cpu(*(__le32 *) &(val)); \ break; \ } \ - if (sleep_us) \ - usleep_range((sleep_us >> 2) + 1, sleep_us); \ + __sleep_step_us = sleep_us >> __step; \ + if (__sleep_step_us) \ + usleep_range((__sleep_step_us >> 2) + 1, __sleep_step_us); \ + __step >>= 1; \ } \ (cond) ? 0 : -ETIMEDOUT; \ }) @@ -3170,6 +3202,21 @@ struct hl_reset_info { }; /** + * struct eq_heartbeat_debug_info - stores debug info to be used upon heartbeat failure. + * @last_pq_heartbeat_ts: timestamp of the last test packet that was sent to FW. + * This packet is the trigger in FW to send the EQ heartbeat event. + * @last_eq_heartbeat_ts: timestamp of the last EQ heartbeat event that was received from FW. + * @heartbeat_event_counter: number of heartbeat events received. + * @cpu_queue_id: used to read the queue pi/ci + */ +struct eq_heartbeat_debug_info { + time64_t last_pq_heartbeat_ts; + time64_t last_eq_heartbeat_ts; + u32 heartbeat_event_counter; + u32 cpu_queue_id; +}; + +/** * struct hl_device - habanalabs device structure. * @pdev: pointer to PCI device, can be NULL in case of simulator device. * @pcie_bar_phys: array of available PCIe bars physical addresses. @@ -3257,6 +3304,8 @@ struct hl_reset_info { * @clk_throttling: holds information about current/previous clock throttling events * @captured_err_info: holds information about errors. * @reset_info: holds current device reset information. + * @heartbeat_debug_info: counters used to debug heartbeat failures. + * @irq_affinity_mask: mask of available CPU cores for user and decoder interrupt handling. * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @fw_inner_major_ver: the major of current loaded preboot inner version. * @fw_inner_minor_ver: the minor of current loaded preboot inner version. 
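The reworked hl_poll_timeout_memory() macro above no longer sleeps for the full sleep_us from the first iteration: it starts with sleep_us >> 8 and halves the shift each pass, so the per-iteration sleep grows from sleep_us/256 through sleep_us/16, /4 and /2 before settling at the full sleep_us from the fifth iteration on, which keeps latency low for conditions that become true quickly. A small model of the step sequence, using 1000 us purely as an example value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sleep_us = 1000;	/* example value only */
	unsigned int step = 8;
	int i;

	for (i = 1; i <= 6; i++) {
		uint64_t sleep_step_us = sleep_us >> step;

		/* in the macro a zero step means no sleep at all that iteration */
		printf("iteration %d: sleep for ~%llu us\n",
		       i, (unsigned long long)sleep_step_us);

		step >>= 1;
	}
	return 0;
}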
@@ -3446,6 +3495,10 @@ struct hl_device { struct hl_reset_info reset_info; + struct eq_heartbeat_debug_info heartbeat_debug_info; + + cpumask_t irq_affinity_mask; + u32 *stream_master_qid_arr; u32 fw_inner_major_ver; u32 fw_inner_minor_ver; @@ -3588,25 +3641,6 @@ struct hl_ioctl_desc { hl_ioctl_t *func; }; -static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major, u32 fw_sw_minor) -{ - if (hdev->fw_sw_major_ver < fw_sw_major) - return true; - if (hdev->fw_sw_major_ver > fw_sw_major) - return false; - if (hdev->fw_sw_minor_ver < fw_sw_minor) - return true; - return false; -} - -static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major, - u32 fw_sw_minor) -{ - return (hdev->fw_sw_major_ver > fw_sw_major || - (hdev->fw_sw_major_ver == fw_sw_major && - hdev->fw_sw_minor_ver >= fw_sw_minor)); -} - /* * Kernel module functions that can be accessed by entire module */ @@ -3732,6 +3766,7 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q); void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q); void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q); void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q); +void hl_eq_dump(struct hl_device *hdev, struct hl_eq *q); irqreturn_t hl_irq_handler_cq(int irq, void *arg); irqreturn_t hl_irq_handler_eq(int irq, void *arg); irqreturn_t hl_irq_handler_dec_abnrm(int irq, void *arg); @@ -3886,6 +3921,7 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_ struct hl_hr_mmu_funcs *hr_func); int hl_mmu_if_set_funcs(struct hl_device *hdev); void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); +void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); void hl_mmu_v2_hr_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr); int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, @@ -3893,7 +3929,24 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr); u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr); bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr); - +struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr); +void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr); +void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info); +u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx); +u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx); +void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val); +void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val); +void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr); +u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr); +void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr); +int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr); +u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop); +u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx); +void hl_mmu_dr_flush(struct hl_ctx *ctx); +int hl_mmu_dr_init(struct hl_device *hdev); +void hl_mmu_dr_fini(struct hl_device *hdev); + +int hl_fw_version_cmp(struct hl_device *hdev, u32 major, u32 minor, u32 subminor); int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, void __iomem *dst, u32 src_offset, u32 size); int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode, u64 value); @@ -4008,7 +4061,7 @@ char *hl_format_as_binary(char 
*buf, size_t buf_len, u32 n); const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type); void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg); -void hl_mem_mgr_fini(struct hl_mem_mgr *mmg); +void hl_mem_mgr_fini(struct hl_mem_mgr *mmg, struct hl_mem_mgr_fini_stats *stats); void hl_mem_mgr_idr_destroy(struct hl_mem_mgr *mmg); int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma, void *args); @@ -4032,6 +4085,10 @@ void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_ void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info); void hl_capture_engine_err(struct hl_device *hdev, u16 engine_id, u16 error_count); void hl_enable_err_info_capture(struct hl_error_info *captured_err_info); +void hl_init_cpu_for_irq(struct hl_device *hdev); +void hl_set_irq_affinity(struct hl_device *hdev, int irq); +void hl_eq_heartbeat_event_handle(struct hl_device *hdev); +void hl_handle_clk_change_event(struct hl_device *hdev, u16 event_type, u64 *event_mask); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index e542fd40e16c..596c52e8aa26 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -101,7 +101,6 @@ static const struct drm_driver hl_driver = { .major = LINUX_VERSION_MAJOR, .minor = LINUX_VERSION_PATCHLEVEL, .patchlevel = LINUX_VERSION_SUBLEVEL, - .date = "20190505", .fops = &hl_fops, .open = hl_device_open, @@ -144,6 +143,9 @@ static enum hl_asic_type get_asic_type(struct hl_device *hdev) case REV_ID_C: asic_type = ASIC_GAUDI2C; break; + case REV_ID_D: + asic_type = ASIC_GAUDI2D; + break; default: break; } @@ -260,7 +262,7 @@ int hl_device_open(struct drm_device *ddev, struct drm_file *file_priv) out_err: mutex_unlock(&hdev->fpriv_list_lock); - hl_mem_mgr_fini(&hpriv->mem_mgr); + hl_mem_mgr_fini(&hpriv->mem_mgr, NULL); hl_mem_mgr_idr_destroy(&hpriv->mem_mgr); hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); mutex_destroy(&hpriv->ctx_lock); @@ -359,8 +361,7 @@ static void fixup_device_params_per_asic(struct hl_device *hdev, int timeout) * a different default timeout for Gaudi */ if (timeout == HL_DEFAULT_TIMEOUT_LOCKED) - hdev->timeout_jiffies = msecs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED * - MSEC_PER_SEC); + hdev->timeout_jiffies = secs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED); hdev->reset_upon_device_release = 0; break; diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 1dd6e23172ca..8729a0c57d78 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -1279,13 +1279,10 @@ static long _hl_ioctl(struct hl_fpriv *hpriv, unsigned int cmd, unsigned long ar retcode = -EFAULT; out_err: - if (retcode) { - char task_comm[TASK_COMM_LEN]; - + if (retcode) dev_dbg_ratelimited(dev, "error in ioctl: pid=%d, comm=\"%s\", cmd=%#010x, nr=%#04x\n", - task_pid_nr(current), get_task_comm(task_comm, current), cmd, nr); - } + task_pid_nr(current), current->comm, cmd, nr); if (kdata != stack_kdata) kfree(kdata); @@ -1308,11 +1305,9 @@ long hl_ioctl_control(struct file *filep, unsigned int cmd, unsigned long arg) if (nr == _IOC_NR(DRM_IOCTL_HL_INFO)) { ioctl = &hl_ioctls_control[nr - HL_COMMAND_START]; } else { - char task_comm[TASK_COMM_LEN]; - dev_dbg_ratelimited(hdev->dev_ctrl, "invalid ioctl: pid=%d, comm=\"%s\", cmd=%#010x, nr=%#04x\n", - 
task_pid_nr(current), get_task_comm(task_comm, current), cmd, nr); + task_pid_nr(current), current->comm, cmd, nr); return -ENOTTY; } diff --git a/drivers/accel/habanalabs/common/hw_queue.c b/drivers/accel/habanalabs/common/hw_queue.c index d0087c0ec48c..3d04a7507cce 100644 --- a/drivers/accel/habanalabs/common/hw_queue.c +++ b/drivers/accel/habanalabs/common/hw_queue.c @@ -84,6 +84,8 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr) { struct hl_bd *bd; + u64 addr; + int i; bd = q->kernel_address; bd += hl_pi_2_offset(q->pi); @@ -91,7 +93,16 @@ void hl_hw_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q, bd->len = cpu_to_le32(len); bd->ptr = cpu_to_le64(ptr); + if (q->dram_bd) + for (i = 0 ; i < 2 ; i++) { + addr = q->pq_dram_address + + ((hl_pi_2_offset(q->pi) * sizeof(struct hl_bd)) + (i * sizeof(u64))); + hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, addr, + (u64 *)(bd) + i, DEBUGFS_WRITE64); + } + q->pi = hl_queue_inc_ptr(q->pi); + hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi); } @@ -1087,12 +1098,18 @@ int hl_hw_queues_create(struct hl_device *hdev) q->supports_sync_stream = asic->hw_queues_props[i].supports_sync_stream; q->collective_mode = asic->hw_queues_props[i].collective_mode; + q->dram_bd = asic->hw_queues_props[i].dram_bd; + rc = queue_init(hdev, q, i); if (rc) { dev_err(hdev->dev, "failed to initialize queue %d\n", i); goto release_queues; } + + /* Set DRAM PQ address for the queue if it should be at DRAM */ + if (q->dram_bd) + q->pq_dram_address = asic->hw_queues_props[i].q_dram_bd_address; } return 0; diff --git a/drivers/accel/habanalabs/common/hwmon.c b/drivers/accel/habanalabs/common/hwmon.c index 1ee2ee07e9ed..52d1e6bf10dc 100644 --- a/drivers/accel/habanalabs/common/hwmon.c +++ b/drivers/accel/habanalabs/common/hwmon.c @@ -46,7 +46,7 @@ static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types break; default: - dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type); + dev_err_ratelimited(hdev->dev, "unsupported h/w sensor type %d\n", type); flags = cpucp_flags; break; } @@ -134,7 +134,7 @@ static u32 adjust_hwmon_flags(struct hl_device *hdev, enum hwmon_sensor_types ty break; default: - dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type); + dev_err_ratelimited(hdev->dev, "unsupported h/w sensor type %d\n", type); flags = cpucp_flags; break; } @@ -162,7 +162,8 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sen break; if (type >= HWMON_NR_SENSOR_TYPES) { - dev_err(hdev->dev, "Got wrong sensor type %d from device\n", type); + dev_err_ratelimited(hdev->dev, + "Got wrong sensor type %d from device\n", type); return -EINVAL; } @@ -584,9 +585,10 @@ int hl_get_temperature(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, - "Failed to get temperature from sensor %d, error %d\n", - sensor_index, rc); + if (rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, + "Failed to get temperature from sensor %d, error %d\n", + sensor_index, rc); *value = 0; } @@ -609,9 +611,8 @@ int hl_set_temperature(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); - - if (rc) - dev_err(hdev->dev, + if (rc && rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, "Failed to set temperature of sensor %d, error %d\n", sensor_index, rc); @@ -638,9 +639,10 @@ int hl_get_voltage(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, - "Failed to get 
voltage from sensor %d, error %d\n", - sensor_index, rc); + if (rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, + "Failed to get voltage from sensor %d, error %d\n", + sensor_index, rc); *value = 0; } @@ -667,9 +669,10 @@ int hl_get_current(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, - "Failed to get current from sensor %d, error %d\n", - sensor_index, rc); + if (rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, + "Failed to get current from sensor %d, error %d\n", + sensor_index, rc); *value = 0; } @@ -696,9 +699,10 @@ int hl_get_fan_speed(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, - "Failed to get fan speed from sensor %d, error %d\n", - sensor_index, rc); + if (rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, + "Failed to get fan speed from sensor %d, error %d\n", + sensor_index, rc); *value = 0; } @@ -725,9 +729,10 @@ int hl_get_pwm_info(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, - "Failed to get pwm info from sensor %d, error %d\n", - sensor_index, rc); + if (rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, + "Failed to get pwm info from sensor %d, error %d\n", + sensor_index, rc); *value = 0; } @@ -750,9 +755,8 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); - - if (rc) - dev_err(hdev->dev, + if (rc && rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, "Failed to set pwm info to sensor %d, error %d\n", sensor_index, rc); } @@ -773,9 +777,8 @@ int hl_set_voltage(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); - - if (rc) - dev_err(hdev->dev, + if (rc && rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, "Failed to set voltage of sensor %d, error %d\n", sensor_index, rc); @@ -796,11 +799,9 @@ int hl_set_current(struct hl_device *hdev, pkt.type = __cpu_to_le16(attr); pkt.value = __cpu_to_le64(value); - rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), - 0, NULL); - - if (rc) - dev_err(hdev->dev, + rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); + if (rc && rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, "Failed to set current of sensor %d, error %d\n", sensor_index, rc); @@ -829,9 +830,8 @@ int hl_set_power(struct hl_device *hdev, rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, NULL); - - if (rc) - dev_err(hdev->dev, + if (rc && rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, "Failed to set power of sensor %d, error %d\n", sensor_index, rc); @@ -858,9 +858,10 @@ int hl_get_power(struct hl_device *hdev, *value = (long) result; if (rc) { - dev_err(hdev->dev, - "Failed to get power of sensor %d, error %d\n", - sensor_index, rc); + if (rc != -EAGAIN) + dev_err_ratelimited(hdev->dev, + "Failed to get power of sensor %d, error %d\n", + sensor_index, rc); *value = 0; } diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c index 978b7f4d5eeb..7c9f2f6a2870 100644 --- a/drivers/accel/habanalabs/common/irq.c +++ b/drivers/accel/habanalabs/common/irq.c @@ -652,14 +652,16 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q) */ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q) { + u32 size = hdev->asic_prop.fw_event_queue_size ? 
: HL_EQ_SIZE_IN_BYTES; void *p; - p = hl_cpu_accessible_dma_pool_alloc(hdev, HL_EQ_SIZE_IN_BYTES, &q->bus_address); + p = hl_cpu_accessible_dma_pool_alloc(hdev, size, &q->bus_address); if (!p) return -ENOMEM; q->hdev = hdev; q->kernel_address = p; + q->size = size; q->ci = 0; q->prev_eqe_index = 0; @@ -678,7 +680,7 @@ void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q) { flush_workqueue(hdev->eq_wq); - hl_cpu_accessible_dma_pool_free(hdev, HL_EQ_SIZE_IN_BYTES, q->kernel_address); + hl_cpu_accessible_dma_pool_free(hdev, q->size, q->kernel_address); } void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q) @@ -693,5 +695,30 @@ void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q) * when the device is operational again */ - memset(q->kernel_address, 0, HL_EQ_SIZE_IN_BYTES); + memset(q->kernel_address, 0, q->size); +} + +void hl_eq_dump(struct hl_device *hdev, struct hl_eq *q) +{ + u32 eq_length, eqe_size, ctl, ready, mode, type, index; + struct hl_eq_header *hdr; + u8 *ptr; + int i; + + eq_length = HL_EQ_LENGTH; + eqe_size = q->size / HL_EQ_LENGTH; + + dev_info(hdev->dev, "Contents of EQ entries headers:\n"); + + for (i = 0, ptr = q->kernel_address ; i < eq_length ; ++i, ptr += eqe_size) { + hdr = (struct hl_eq_header *) ptr; + ctl = le32_to_cpu(hdr->ctl); + ready = FIELD_GET(EQ_CTL_READY_MASK, ctl); + mode = FIELD_GET(EQ_CTL_EVENT_MODE_MASK, ctl); + type = FIELD_GET(EQ_CTL_EVENT_TYPE_MASK, ctl); + index = FIELD_GET(EQ_CTL_INDEX_MASK, ctl); + + dev_info(hdev->dev, "%02u: %#010x [ready: %u, mode %u, type %04u, index %05u]\n", + i, ctl, ready, mode, type, index); + } } diff --git a/drivers/accel/habanalabs/common/memory.c b/drivers/accel/habanalabs/common/memory.c index 3348ad12c237..601fdbe70179 100644 --- a/drivers/accel/habanalabs/common/memory.c +++ b/drivers/accel/habanalabs/common/memory.c @@ -14,7 +14,7 @@ #include <linux/vmalloc.h> #include <linux/pci-p2pdma.h> -MODULE_IMPORT_NS(DMA_BUF); +MODULE_IMPORT_NS("DMA_BUF"); #define HL_MMU_DEBUG 0 diff --git a/drivers/accel/habanalabs/common/memory_mgr.c b/drivers/accel/habanalabs/common/memory_mgr.c index c4d84df355b0..99cd83139d46 100644 --- a/drivers/accel/habanalabs/common/memory_mgr.c +++ b/drivers/accel/habanalabs/common/memory_mgr.c @@ -318,28 +318,61 @@ void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg) idr_init(&mmg->handles); } +static void hl_mem_mgr_fini_stats_reset(struct hl_mem_mgr_fini_stats *stats) +{ + if (!stats) + return; + + memset(stats, 0, sizeof(*stats)); +} + +static void hl_mem_mgr_fini_stats_inc(u64 mem_id, struct hl_mem_mgr_fini_stats *stats) +{ + if (!stats) + return; + + switch (mem_id) { + case HL_MMAP_TYPE_CB: + ++stats->n_busy_cb; + break; + case HL_MMAP_TYPE_TS_BUFF: + ++stats->n_busy_ts; + break; + default: + /* we currently store only CB/TS so this shouldn't happen */ + ++stats->n_busy_other; + } +} + /** * hl_mem_mgr_fini - release unified memory manager * * @mmg: parent unified memory manager + * @stats: if non-NULL, will return some counters for handles that could not be removed. * * Release the unified memory manager. Shall be called from an interrupt context. 
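hl_eq_dump() added above walks the event queue and decodes each entry's 32-bit ctl word with FIELD_GET() against the ready/mode/type/index masks. The same decode can be modelled in plain C with a shift-and-mask helper; the bit positions below are invented for the illustration and are not the driver's real EQ_CTL_* layout:

#include <stdio.h>
#include <stdint.h>

/* Invented field layout, only for illustration */
#define EQ_READY_SHIFT	31
#define EQ_READY_MASK	(0x1u << EQ_READY_SHIFT)
#define EQ_MODE_SHIFT	30
#define EQ_MODE_MASK	(0x1u << EQ_MODE_SHIFT)
#define EQ_TYPE_SHIFT	16
#define EQ_TYPE_MASK	(0x3ffu << EQ_TYPE_SHIFT)
#define EQ_INDEX_SHIFT	0
#define EQ_INDEX_MASK	(0xffffu << EQ_INDEX_SHIFT)

/* Userspace stand-in for the kernel's FIELD_GET() */
#define FIELD_GET_U32(mask, shift, reg)	(((reg) & (mask)) >> (shift))

int main(void)
{
	uint32_t ctl = 0x80120005;	/* example entry header */

	printf("ready: %u, mode: %u, type: %u, index: %u\n",
	       FIELD_GET_U32(EQ_READY_MASK, EQ_READY_SHIFT, ctl),
	       FIELD_GET_U32(EQ_MODE_MASK, EQ_MODE_SHIFT, ctl),
	       FIELD_GET_U32(EQ_TYPE_MASK, EQ_TYPE_SHIFT, ctl),
	       FIELD_GET_U32(EQ_INDEX_MASK, EQ_INDEX_SHIFT, ctl));
	return 0;
}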
*/ -void hl_mem_mgr_fini(struct hl_mem_mgr *mmg) +void hl_mem_mgr_fini(struct hl_mem_mgr *mmg, struct hl_mem_mgr_fini_stats *stats) { struct hl_mmap_mem_buf *buf; struct idr *idp; const char *topic; + u64 mem_id; u32 id; + hl_mem_mgr_fini_stats_reset(stats); + idp = &mmg->handles; idr_for_each_entry(idp, buf, id) { topic = buf->behavior->topic; - if (hl_mmap_mem_buf_put(buf) != 1) + mem_id = buf->behavior->mem_id; + if (hl_mmap_mem_buf_put(buf) != 1) { dev_err(mmg->dev, "%s: Buff handle %u for CTX is still alive\n", topic, id); + hl_mem_mgr_fini_stats_inc(mem_id, stats); + } } } diff --git a/drivers/accel/habanalabs/common/mmu/Makefile b/drivers/accel/habanalabs/common/mmu/Makefile index 1806c524e04a..f4b815bf4f7d 100644 --- a/drivers/accel/habanalabs/common/mmu/Makefile +++ b/drivers/accel/habanalabs/common/mmu/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0-only HL_COMMON_MMU_FILES := common/mmu/mmu.o common/mmu/mmu_v1.o \ - common/mmu/mmu_v2_hr.o + common/mmu/mmu_v2.o common/mmu/mmu_v2_hr.o diff --git a/drivers/accel/habanalabs/common/mmu/mmu.c b/drivers/accel/habanalabs/common/mmu/mmu.c index b654302a68fc..79823facce7f 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu.c +++ b/drivers/accel/habanalabs/common/mmu/mmu.c @@ -6,6 +6,7 @@ */ #include <linux/slab.h> +#include <linux/pci.h> #include "../habanalabs.h" @@ -262,7 +263,7 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, bool flu mmu_funcs->flush(ctx); if (trace_habanalabs_mmu_unmap_enabled() && !rc) - trace_habanalabs_mmu_unmap(hdev->dev, virt_addr, 0, page_size, flush_pte); + trace_habanalabs_mmu_unmap(&hdev->pdev->dev, virt_addr, 0, page_size, flush_pte); return rc; } @@ -349,7 +350,7 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_s if (flush_pte) mmu_funcs->flush(ctx); - trace_habanalabs_mmu_map(hdev->dev, virt_addr, phys_addr, page_size, flush_pte); + trace_habanalabs_mmu_map(&hdev->pdev->dev, virt_addr, phys_addr, page_size, flush_pte); return 0; @@ -585,6 +586,8 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, int hl_mmu_if_set_funcs(struct hl_device *hdev) { + struct asic_fixed_properties *prop = &hdev->asic_prop; + if (hdev->mmu_disable) return 0; @@ -597,8 +600,10 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev) case ASIC_GAUDI2: case ASIC_GAUDI2B: case ASIC_GAUDI2C: - /* MMUs in Gaudi2 are always host resident */ - hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); + case ASIC_GAUDI2D: + hl_mmu_v2_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]); + if (prop->pmmu.host_resident) + hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]); break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", @@ -641,7 +646,8 @@ int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags) rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags); if (rc) dev_err_ratelimited(hdev->dev, - "%s cache invalidation failed, rc=%d\n", + "%s: %s cache invalidation failed, rc=%d\n", + dev_name(&hdev->pdev->dev), flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", rc); return rc; @@ -656,8 +662,9 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard, asid, va, size); if (rc) dev_err_ratelimited(hdev->dev, - "%s cache range invalidation failed: va=%#llx, size=%llu, rc=%d", - flags == VM_TYPE_USERPTR ? "PMMU" : "HMMU", va, size, rc); + "%s: %s cache range invalidation failed: va=%#llx, size=%llu, rc=%d", + dev_name(&hdev->pdev->dev), flags == VM_TYPE_USERPTR ? 
"PMMU" : "HMMU", + va, size, rc); return rc; } @@ -1209,3 +1216,219 @@ int hl_mmu_hr_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_ return 0; } +struct pgt_info *hl_mmu_dr_get_pgt_info(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = NULL; + + hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node, + (unsigned long) hop_addr) + if (hop_addr == pgt_info->shadow_addr) + break; + + return pgt_info; +} + +void hl_mmu_dr_free_hop(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr); + + hl_mmu_dr_free_pgt_node(ctx, pgt_info); +} + +void hl_mmu_dr_free_pgt_node(struct hl_ctx *ctx, struct pgt_info *pgt_info) +{ + struct hl_device *hdev = ctx->hdev; + + gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr, + hdev->asic_prop.dmmu.hop_table_size); + hash_del(&pgt_info->node); + kfree((u64 *) (uintptr_t) pgt_info->shadow_addr); + kfree(pgt_info); +} + +u64 hl_mmu_dr_get_phys_hop0_addr(struct hl_ctx *ctx) +{ + return ctx->hdev->asic_prop.mmu_pgt_addr + + (ctx->asid * ctx->hdev->asic_prop.dmmu.hop_table_size); +} + +u64 hl_mmu_dr_get_hop0_addr(struct hl_ctx *ctx) +{ + return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 + + (ctx->asid * ctx->hdev->asic_prop.dmmu.hop_table_size); +} + +u64 hl_mmu_dr_get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr) +{ + u64 page_mask = ctx->hdev->asic_prop.dmmu.hop_table_size - 1; + u64 shadow_hop_addr = shadow_addr & (~page_mask); + u64 pte_offset = shadow_addr & page_mask; + u64 phys_hop_addr; + + if (shadow_hop_addr != hl_mmu_dr_get_hop0_addr(ctx)) + phys_hop_addr = hl_mmu_dr_get_pgt_info(ctx, shadow_hop_addr)->phys_addr; + else + phys_hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx); + + return phys_hop_addr + pte_offset; +} + +void hl_mmu_dr_write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) +{ + u64 phys_val = hl_mmu_dr_get_phys_addr(ctx, val); + + ctx->hdev->asic_funcs->write_pte(ctx->hdev, hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr), + phys_val); + + *(u64 *) (uintptr_t) shadow_pte_addr = val; +} + +void hl_mmu_dr_write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) +{ + ctx->hdev->asic_funcs->write_pte(ctx->hdev, + hl_mmu_dr_get_phys_addr(ctx, shadow_pte_addr), val); + *(u64 *) (uintptr_t) shadow_pte_addr = val; +} + +void hl_mmu_dr_clear_pte(struct hl_ctx *ctx, u64 pte_addr) +{ + hl_mmu_dr_write_final_pte(ctx, pte_addr, 0); +} + +void hl_mmu_dr_get_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + hl_mmu_dr_get_pgt_info(ctx, hop_addr)->num_of_ptes++; +} + +int hl_mmu_dr_put_pte(struct hl_ctx *ctx, u64 hop_addr) +{ + struct pgt_info *pgt_info = hl_mmu_dr_get_pgt_info(ctx, hop_addr); + int num_of_ptes_left; + + pgt_info->num_of_ptes--; + + /* + * Need to save the number of ptes left because hl_mmu_free_hop might free + * the pgt_info + */ + num_of_ptes_left = pgt_info->num_of_ptes; + if (!num_of_ptes_left) + hl_mmu_dr_free_pgt_node(ctx, pgt_info); + + return num_of_ptes_left; +} + +u64 hl_mmu_dr_alloc_hop(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct pgt_info *pgt_info; + u64 phys_addr, shadow_addr; + + pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL); + if (!pgt_info) + return ULLONG_MAX; + + phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool, + prop->dmmu.hop_table_size); + if (!phys_addr) { + dev_err(hdev->dev, "failed to allocate page\n"); + goto pool_add_err; + } + + shadow_addr = (u64) (uintptr_t) kzalloc(prop->dmmu.hop_table_size, 
+ GFP_KERNEL); + if (!shadow_addr) + goto shadow_err; + + pgt_info->phys_addr = phys_addr; + pgt_info->shadow_addr = shadow_addr; + pgt_info->ctx = ctx; + pgt_info->num_of_ptes = 0; + hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr); + + return shadow_addr; + +shadow_err: + gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, + phys_addr, prop->dmmu.hop_table_size); +pool_add_err: + kfree(pgt_info); + + return ULLONG_MAX; +} + +u64 hl_mmu_dr_get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, bool *is_new_hop) +{ + u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte); + + if (hop_addr == ULLONG_MAX) { + hop_addr = hl_mmu_dr_alloc_hop(ctx); + *is_new_hop = (hop_addr != ULLONG_MAX); + } + + return hop_addr; +} + +void hl_mmu_dr_flush(struct hl_ctx *ctx) +{ + /* flush all writes from all cores to reach PCI */ + mb(); + ctx->hdev->asic_funcs->read_pte(ctx->hdev, hl_mmu_dr_get_phys_hop0_addr(ctx)); +} + +int hl_mmu_dr_init(struct hl_device *hdev) +{ + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + hdev->mmu_priv.dr.mmu_pgt_pool = + gen_pool_create(__ffs(prop->dmmu.hop_table_size), -1); + + if (!hdev->mmu_priv.dr.mmu_pgt_pool) { + dev_err(hdev->dev, "Failed to create page gen pool\n"); + return -ENOMEM; + } + + rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr + + prop->dmmu.hop0_tables_total_size, + prop->dmmu.pgt_size - prop->dmmu.hop0_tables_total_size, + -1); + if (rc) { + dev_err(hdev->dev, "Failed to add memory to page gen pool\n"); + goto err_pool_add; + } + + hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, + prop->dmmu.hop_table_size, GFP_KERNEL); + if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { + rc = -ENOMEM; + goto err_pool_add; + } + + /* MMU H/W init will be done in device hw_init() */ + + return 0; + +err_pool_add: + gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + + return rc; +} + +void hl_mmu_dr_fini(struct hl_device *hdev) +{ + /* MMU H/W fini was already done in device hw_fini() */ + + if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) + return; + + kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); + gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); + + /* Make sure that if we arrive here again without init was + * called we won't cause kernel panic. 
This can happen for + * example if we fail during hard reset code at certain points + */ + hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; +} diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v1.c b/drivers/accel/habanalabs/common/mmu/mmu_v1.c index d925dc4dd097..845d16aaa637 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu_v1.c +++ b/drivers/accel/habanalabs/common/mmu/mmu_v1.c @@ -12,166 +12,6 @@ #define MMU_V1_MAX_HOPS (MMU_HOP4 + 1) -static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr); - -static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr) -{ - struct pgt_info *pgt_info = NULL; - - hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node, - (unsigned long) hop_addr) - if (hop_addr == pgt_info->shadow_addr) - break; - - return pgt_info; -} - -static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info) -{ - struct hl_device *hdev = ctx->hdev; - - gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr, - hdev->asic_prop.mmu_hop_table_size); - hash_del(&pgt_info->node); - kfree((u64 *) (uintptr_t) pgt_info->shadow_addr); - kfree(pgt_info); -} - -static void free_hop(struct hl_ctx *ctx, u64 hop_addr) -{ - struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); - - _free_hop(ctx, pgt_info); -} - -static u64 alloc_hop(struct hl_ctx *ctx) -{ - struct hl_device *hdev = ctx->hdev; - struct asic_fixed_properties *prop = &hdev->asic_prop; - struct pgt_info *pgt_info; - u64 phys_addr, shadow_addr; - - pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL); - if (!pgt_info) - return ULLONG_MAX; - - phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool, - prop->mmu_hop_table_size); - if (!phys_addr) { - dev_err(hdev->dev, "failed to allocate page\n"); - goto pool_add_err; - } - - shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size, - GFP_KERNEL); - if (!shadow_addr) - goto shadow_err; - - pgt_info->phys_addr = phys_addr; - pgt_info->shadow_addr = shadow_addr; - pgt_info->ctx = ctx; - pgt_info->num_of_ptes = 0; - hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr); - - return shadow_addr; - -shadow_err: - gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, phys_addr, - prop->mmu_hop_table_size); -pool_add_err: - kfree(pgt_info); - - return ULLONG_MAX; -} - -static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx) -{ - return ctx->hdev->asic_prop.mmu_pgt_addr + - (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); -} - -static inline u64 get_hop0_addr(struct hl_ctx *ctx) -{ - return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 + - (ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size); -} - -static void flush(struct hl_ctx *ctx) -{ - /* flush all writes from all cores to reach PCI */ - mb(); - ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx)); -} - -/* transform the value to physical address when writing to H/W */ -static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val) -{ - /* - * The value to write is actually the address of the next shadow hop + - * flags at the 12 LSBs. - * Hence in order to get the value to write to the physical PTE, we - * clear the 12 LSBs and translate the shadow hop to its associated - * physical hop, and add back the original 12 LSBs. 
- */ - u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) | - (val & FLAGS_MASK); - - ctx->hdev->asic_funcs->write_pte(ctx->hdev, - get_phys_addr(ctx, shadow_pte_addr), - phys_val); - - *(u64 *) (uintptr_t) shadow_pte_addr = val; -} - -/* do not transform the value to physical address when writing to H/W */ -static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, - u64 val) -{ - ctx->hdev->asic_funcs->write_pte(ctx->hdev, - get_phys_addr(ctx, shadow_pte_addr), - val); - *(u64 *) (uintptr_t) shadow_pte_addr = val; -} - -/* clear the last and present bits */ -static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr) -{ - /* no need to transform the value to physical address */ - write_final_pte(ctx, pte_addr, 0); -} - -static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr) -{ - get_pgt_info(ctx, hop_addr)->num_of_ptes++; -} - -/* - * put_pte - decrement the num of ptes and free the hop if possible - * - * @ctx: pointer to the context structure - * @hop_addr: addr of the hop - * - * This function returns the number of ptes left on this hop. If the number is - * 0, it means the pte was freed. - */ -static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr) -{ - struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr); - int num_of_ptes_left; - - pgt_info->num_of_ptes--; - - /* - * Need to save the number of ptes left because free_hop might free - * the pgt_info - */ - num_of_ptes_left = pgt_info->num_of_ptes; - if (!num_of_ptes_left) - _free_hop(ctx, pgt_info); - - return num_of_ptes_left; -} - static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop, u64 *hop_addr_arr, u64 virt_addr, enum mmu_hop_num hop_idx) { @@ -183,35 +23,6 @@ static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties ctx->hdev->asic_prop.mmu_pte_size * ((virt_addr & mask) >> shift); } -static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte, - bool *is_new_hop) -{ - u64 hop_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte); - - if (hop_addr == ULLONG_MAX) { - hop_addr = alloc_hop(ctx); - *is_new_hop = (hop_addr != ULLONG_MAX); - } - - return hop_addr; -} - -/* translates shadow address inside hop to a physical address */ -static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr) -{ - u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1); - u64 shadow_hop_addr = shadow_addr & ~page_mask; - u64 pte_offset = shadow_addr & page_mask; - u64 phys_hop_addr; - - if (shadow_hop_addr != get_hop0_addr(ctx)) - phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr; - else - phys_hop_addr = get_phys_hop0_addr(ctx); - - return phys_hop_addr + pte_offset; -} - static int dram_default_mapping_init(struct hl_ctx *ctx) { struct hl_device *hdev = ctx->hdev; @@ -232,13 +43,13 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) /* add hop1 and hop2 */ total_hops = num_of_hop3 + 2; - ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops, GFP_KERNEL); + ctx->dram_default_hops = kcalloc(total_hops, HL_PTE_SIZE, GFP_KERNEL); if (!ctx->dram_default_hops) return -ENOMEM; - hop0_addr = get_hop0_addr(ctx); + hop0_addr = hl_mmu_dr_get_hop0_addr(ctx); - hop1_addr = alloc_hop(ctx); + hop1_addr = hl_mmu_dr_alloc_hop(ctx); if (hop1_addr == ULLONG_MAX) { dev_err(hdev->dev, "failed to alloc hop 1\n"); rc = -ENOMEM; @@ -247,7 +58,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) ctx->dram_default_hops[total_hops - 1] = hop1_addr; - hop2_addr = alloc_hop(ctx); + hop2_addr = hl_mmu_dr_alloc_hop(ctx); 
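Note: the new hl_mmu_dr_* helpers keep a host-side (shadow) copy of every device-resident hop table and translate a shadow PTE address back to its device-physical counterpart before writing through asic_funcs->write_pte(). A minimal, self-contained model of that translation arithmetic, with made-up addresses and a 4 KB hop table size standing in for the driver's dmmu.hop_table_size:

#include <stdint.h>
#include <stdio.h>

/*
 * Model of hl_mmu_dr_get_phys_addr(): hop tables are hop_table_size bytes
 * and power-of-two sized, so the low bits of a shadow PTE address are the
 * PTE's offset inside its hop; the driver finds the owning hop (and its
 * device-physical base) via a hash keyed on shadow_addr & ~page_mask.
 */
static uint64_t shadow_to_phys(uint64_t shadow_pte_addr, uint64_t phys_hop_base,
                               uint64_t hop_table_size)
{
        uint64_t page_mask = hop_table_size - 1;

        return phys_hop_base + (shadow_pte_addr & page_mask);
}

int main(void)
{
        /* hypothetical addresses and a 4 KB hop table, for illustration only */
        uint64_t shadow_pte = 0x7f00aa001238ULL;
        uint64_t phys = shadow_to_phys(shadow_pte, 0x80000000ULL, 0x1000);

        printf("shadow PTE %#llx -> device PTE %#llx\n",
               (unsigned long long)shadow_pte, (unsigned long long)phys);
        return 0;
}

The map/unmap walks then read PTEs from the shadow copy (the curr_pte dereferences in the surrounding code) instead of reading them back from device memory.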
if (hop2_addr == ULLONG_MAX) { dev_err(hdev->dev, "failed to alloc hop 2\n"); rc = -ENOMEM; @@ -257,7 +68,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) ctx->dram_default_hops[total_hops - 2] = hop2_addr; for (i = 0 ; i < num_of_hop3 ; i++) { - ctx->dram_default_hops[i] = alloc_hop(ctx); + ctx->dram_default_hops[i] = hl_mmu_dr_alloc_hop(ctx); if (ctx->dram_default_hops[i] == ULLONG_MAX) { dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i); rc = -ENOMEM; @@ -268,18 +79,18 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) /* need only pte 0 in hops 0 and 1 */ pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop0_addr, pte_val); + hl_mmu_dr_write_pte(ctx, hop0_addr, pte_val); pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop1_addr, pte_val); - get_pte(ctx, hop1_addr); + hl_mmu_dr_write_pte(ctx, hop1_addr, pte_val); + hl_mmu_dr_get_pte(ctx, hop1_addr); hop2_pte_addr = hop2_addr; for (i = 0 ; i < num_of_hop3 ; i++) { pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop2_pte_addr, pte_val); - get_pte(ctx, hop2_addr); + hl_mmu_dr_write_pte(ctx, hop2_pte_addr, pte_val); + hl_mmu_dr_get_pte(ctx, hop2_addr); hop2_pte_addr += HL_PTE_SIZE; } @@ -289,23 +100,23 @@ static int dram_default_mapping_init(struct hl_ctx *ctx) for (i = 0 ; i < num_of_hop3 ; i++) { hop3_pte_addr = ctx->dram_default_hops[i]; for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { - write_final_pte(ctx, hop3_pte_addr, pte_val); - get_pte(ctx, ctx->dram_default_hops[i]); + hl_mmu_dr_write_final_pte(ctx, hop3_pte_addr, pte_val); + hl_mmu_dr_get_pte(ctx, ctx->dram_default_hops[i]); hop3_pte_addr += HL_PTE_SIZE; } } - flush(ctx); + hl_mmu_dr_flush(ctx); return 0; hop3_err: for (i = 0 ; i < hop3_allocated ; i++) - free_hop(ctx, ctx->dram_default_hops[i]); + hl_mmu_dr_free_hop(ctx, ctx->dram_default_hops[i]); - free_hop(ctx, hop2_addr); + hl_mmu_dr_free_hop(ctx, hop2_addr); hop2_err: - free_hop(ctx, hop1_addr); + hl_mmu_dr_free_hop(ctx, hop1_addr); hop1_err: kfree(ctx->dram_default_hops); @@ -329,7 +140,7 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx) do_div(num_of_hop3, prop->dram_page_size); do_div(num_of_hop3, HOP_PTE_ENTRIES_512); - hop0_addr = get_hop0_addr(ctx); + hop0_addr = hl_mmu_dr_get_hop0_addr(ctx); /* add hop1 and hop2 */ total_hops = num_of_hop3 + 2; hop1_addr = ctx->dram_default_hops[total_hops - 1]; @@ -338,101 +149,26 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx) for (i = 0 ; i < num_of_hop3 ; i++) { hop3_pte_addr = ctx->dram_default_hops[i]; for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) { - clear_pte(ctx, hop3_pte_addr); - put_pte(ctx, ctx->dram_default_hops[i]); + hl_mmu_dr_clear_pte(ctx, hop3_pte_addr); + hl_mmu_dr_put_pte(ctx, ctx->dram_default_hops[i]); hop3_pte_addr += HL_PTE_SIZE; } } hop2_pte_addr = hop2_addr; for (i = 0 ; i < num_of_hop3 ; i++) { - clear_pte(ctx, hop2_pte_addr); - put_pte(ctx, hop2_addr); + hl_mmu_dr_clear_pte(ctx, hop2_pte_addr); + hl_mmu_dr_put_pte(ctx, hop2_addr); hop2_pte_addr += HL_PTE_SIZE; } - clear_pte(ctx, hop1_addr); - put_pte(ctx, hop1_addr); - clear_pte(ctx, hop0_addr); + hl_mmu_dr_clear_pte(ctx, hop1_addr); + hl_mmu_dr_put_pte(ctx, hop1_addr); + hl_mmu_dr_clear_pte(ctx, hop0_addr); kfree(ctx->dram_default_hops); - flush(ctx); -} - -/** - * hl_mmu_v1_init() - initialize the MMU module. - * @hdev: habanalabs device structure. - * - * This function does the following: - * - Create a pool of pages for pgt_infos. 
- * - Create a shadow table for pgt - * - * Return: 0 for success, non-zero for failure. - */ -static int hl_mmu_v1_init(struct hl_device *hdev) -{ - struct asic_fixed_properties *prop = &hdev->asic_prop; - int rc; - - hdev->mmu_priv.dr.mmu_pgt_pool = - gen_pool_create(__ffs(prop->mmu_hop_table_size), -1); - - if (!hdev->mmu_priv.dr.mmu_pgt_pool) { - dev_err(hdev->dev, "Failed to create page gen pool\n"); - return -ENOMEM; - } - - rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr + - prop->mmu_hop0_tables_total_size, - prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size, - -1); - if (rc) { - dev_err(hdev->dev, "Failed to add memory to page gen pool\n"); - goto err_pool_add; - } - - hdev->mmu_priv.dr.mmu_shadow_hop0 = kvcalloc(prop->max_asid, prop->mmu_hop_table_size, - GFP_KERNEL); - if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { - rc = -ENOMEM; - goto err_pool_add; - } - - /* MMU H/W init will be done in device hw_init() */ - - return 0; - -err_pool_add: - gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); - - return rc; -} - -/** - * hl_mmu_v1_fini() - release the MMU module. - * @hdev: habanalabs device structure. - * - * This function does the following: - * - Disable MMU in H/W. - * - Free the pgt_infos pool. - * - * All contexts should be freed before calling this function. - */ -static void hl_mmu_v1_fini(struct hl_device *hdev) -{ - /* MMU H/W fini was already done in device hw_fini() */ - - if (!ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) { - kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0); - gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool); - - /* Make sure that if we arrive here again without init was - * called we won't cause kernel panic. This can happen for - * example if we fail during hard reset code at certain points - */ - hdev->mmu_priv.dr.mmu_shadow_hop0 = NULL; - } + hl_mmu_dr_flush(ctx); } /** @@ -476,7 +212,7 @@ static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx) dev_err_ratelimited(hdev->dev, "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); - _free_hop(ctx, pgt_info); + hl_mmu_dr_free_pgt_node(ctx, pgt_info); } } @@ -495,7 +231,7 @@ static int hl_mmu_v1_unmap(struct hl_ctx *ctx, for (hop_idx = MMU_HOP0; hop_idx < MMU_HOP4; hop_idx++) { if (hop_idx == MMU_HOP0) { - hop_addr[hop_idx] = get_hop0_addr(ctx); + hop_addr[hop_idx] = hl_mmu_dr_get_hop0_addr(ctx); } else { hop_addr[hop_idx] = hl_mmu_get_next_hop_addr(ctx, curr_pte); if (hop_addr[hop_idx] == ULLONG_MAX) @@ -546,30 +282,30 @@ static int hl_mmu_v1_unmap(struct hl_ctx *ctx, } hop_idx = MMU_HOP3; - write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte); - put_pte(ctx, hop_addr[hop_idx]); + hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte); + hl_mmu_dr_put_pte(ctx, hop_addr[hop_idx]); } else { if (!(curr_pte & PAGE_PRESENT_MASK)) goto not_mapped; if (hop_addr[MMU_HOP4]) - clear_pte(ctx, hop_pte_addr[MMU_HOP4]); + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[MMU_HOP4]); else - clear_pte(ctx, hop_pte_addr[MMU_HOP3]); + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[MMU_HOP3]); - if (hop_addr[MMU_HOP4] && !put_pte(ctx, hop_addr[MMU_HOP4])) + if (hop_addr[MMU_HOP4] && !hl_mmu_dr_put_pte(ctx, hop_addr[MMU_HOP4])) clear_hop3 = true; if (!clear_hop3) goto mapped; for (hop_idx = MMU_HOP3; hop_idx >= 0; hop_idx--) { - clear_pte(ctx, hop_pte_addr[hop_idx]); + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[hop_idx]); if (hop_idx == MMU_HOP0) break; - if (put_pte(ctx, hop_addr[hop_idx])) + if (hl_mmu_dr_put_pte(ctx, 
hop_addr[hop_idx])) goto mapped; } } @@ -616,10 +352,10 @@ static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, for (hop_idx = MMU_HOP0; hop_idx < num_hops; hop_idx++) { if (hop_idx == MMU_HOP0) { - hop_addr[hop_idx] = get_hop0_addr(ctx); + hop_addr[hop_idx] = hl_mmu_dr_get_hop0_addr(ctx); } else { hop_addr[hop_idx] = - get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]); + hl_mmu_dr_get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]); if (hop_addr[hop_idx] == ULLONG_MAX) goto err; } @@ -666,27 +402,27 @@ static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask | PAGE_PRESENT_MASK; - write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte); + hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte); for (hop_idx = MMU_HOP1; hop_idx < num_hops; hop_idx++) { prev_hop = hop_idx - 1; if (hop_new[hop_idx]) { curr_pte = (hop_addr[hop_idx] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; - write_pte(ctx, hop_pte_addr[prev_hop], curr_pte); + hl_mmu_dr_write_pte(ctx, hop_pte_addr[prev_hop], curr_pte); if (hop_idx != MMU_HOP1) - get_pte(ctx, hop_addr[prev_hop]); + hl_mmu_dr_get_pte(ctx, hop_addr[prev_hop]); } } - get_pte(ctx, hop_addr[num_hops - 1]); + hl_mmu_dr_get_pte(ctx, hop_addr[num_hops - 1]); return 0; err: for (hop_idx = num_hops; hop_idx > MMU_HOP0; hop_idx--) { if (hop_new[hop_idx]) - free_hop(ctx, hop_addr[hop_idx]); + hl_mmu_dr_free_hop(ctx, hop_addr[hop_idx]); } return rc; @@ -752,7 +488,7 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, if (is_huge) used_hops--; - hops->hop_info[0].hop_addr = get_phys_hop0_addr(ctx); + hops->hop_info[0].hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx); hops->hop_info[0].hop_pte_addr = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, hops->hop_info[0].hop_addr, virt_addr); @@ -801,13 +537,13 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, */ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu) { - mmu->init = hl_mmu_v1_init; - mmu->fini = hl_mmu_v1_fini; + mmu->init = hl_mmu_dr_init; + mmu->fini = hl_mmu_dr_fini; mmu->ctx_init = hl_mmu_v1_ctx_init; mmu->ctx_fini = hl_mmu_v1_ctx_fini; mmu->map = hl_mmu_v1_map; mmu->unmap = hl_mmu_v1_unmap; - mmu->flush = flush; + mmu->flush = hl_mmu_dr_flush; mmu->swap_out = hl_mmu_v1_swap_out; mmu->swap_in = hl_mmu_v1_swap_in; mmu->get_tlb_info = hl_mmu_v1_get_tlb_info; diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v2.c b/drivers/accel/habanalabs/common/mmu/mmu_v2.c new file mode 100644 index 000000000000..4bc0268fff1c --- /dev/null +++ b/drivers/accel/habanalabs/common/mmu/mmu_v2.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright 2016-2020 HabanaLabs, Ltd. + * All Rights Reserved. + */ + +#include "../habanalabs.h" +#include "../../include/hw_ip/mmu/mmu_general.h" +#include "../../include/hw_ip/mmu/mmu_v2_0.h" + +#include <linux/slab.h> + +/** + * hl_mmu_v2_ctx_init() - initialize a context for using the MMU module. + * @ctx: pointer to the context structure to initialize. + * + * Initialize a mutex to protect the concurrent mapping flow, a hash to hold all + * page tables hops related to this context. + * Return: 0 on success, non-zero otherwise. 
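Note: both the v2 map and unmap walks locate each hop's PTE by masking and shifting the scrambled virtual address with the per-hop hop_shifts/hop_masks properties (hl_mmu_get_hop_pte_phys_addr(), same arithmetic as get_hop_pte_addr() shown earlier). A self-contained sketch of that index computation, using invented shift values and hop bases rather than the Gaudi2 ones:

#include <stdint.h>
#include <stdio.h>

#define PTE_SIZE 8 /* bytes per PTE, mirrors HL_PTE_SIZE */

/*
 * Each hop decodes a different slice of the virtual address; the slice,
 * shifted down, is the PTE index within that hop's table.
 */
static uint64_t hop_pte_addr(uint64_t hop_base, uint64_t virt_addr,
                             unsigned int shift, uint64_t mask)
{
        return hop_base + PTE_SIZE * ((virt_addr & mask) >> shift);
}

int main(void)
{
        /* hypothetical 4-hop layout: 9 address bits per hop above a 12-bit page */
        unsigned int shifts[4] = { 39, 30, 21, 12 };
        uint64_t hop_base[4] = { 0x100000, 0x101000, 0x102000, 0x103000 };
        uint64_t virt = 0x0000123456789000ULL;
        int i;

        for (i = 0; i < 4; i++) {
                uint64_t mask = 0x1ffULL << shifts[i];

                printf("hop%d PTE at %#llx\n", i,
                       (unsigned long long)hop_pte_addr(hop_base[i], virt,
                                                        shifts[i], mask));
        }
        return 0;
}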
+ */ +static int hl_mmu_v2_ctx_init(struct hl_ctx *ctx) +{ + hash_init(ctx->mmu_shadow_hash); + + return 0; +} + +/* + * hl_mmu_v2_ctx_fini - disable a ctx from using the mmu module + * + * @ctx: pointer to the context structure + * + * This function does the following: + * - Free any pgts which were not freed yet + * - Free the mutex + * - Free DRAM default page mapping hops + */ +static void hl_mmu_v2_ctx_fini(struct hl_ctx *ctx) +{ + struct hl_device *hdev = ctx->hdev; + struct pgt_info *pgt_info; + struct hlist_node *tmp; + int i; + + if (!hash_empty(ctx->mmu_shadow_hash)) + dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n", + ctx->asid); + + hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) { + dev_err_ratelimited(hdev->dev, + "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", + pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); + hl_mmu_dr_free_pgt_node(ctx, pgt_info); + } +} + +static int hl_mmu_v2_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr) +{ + u64 hop_addr[MMU_ARCH_6_HOPS] = { 0 }, hop_pte_addr[MMU_ARCH_6_HOPS] = { 0 }, curr_pte, + scrambled_virt_addr; + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + bool is_huge = false; + int i, hop_last; + + /* device resident in V2 are allowed only for HMMU */ + if (!is_dram_addr) + return -EINVAL; + + mmu_prop = &prop->dmmu; + + hop_last = mmu_prop->num_hops - 1; + + scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + + hop_addr[0] = hl_mmu_dr_get_hop0_addr(ctx); + hop_pte_addr[0] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hop_addr[0], scrambled_virt_addr); + if (hop_pte_addr[0] == U64_MAX) + return -EFAULT; + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[0]; + + for (i = 1 ; i < mmu_prop->num_hops ; i++) { + hop_addr[i] = hl_mmu_get_next_hop_addr(ctx, curr_pte); + if (hop_addr[i] == ULLONG_MAX) + goto not_mapped; + + hop_pte_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hop_addr[i], scrambled_virt_addr); + if (hop_pte_addr[i] == U64_MAX) + return -EFAULT; + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[i]; + + if ((i <= hop_last) && (curr_pte & mmu_prop->last_mask)) { + hop_last = i; + is_huge = true; + break; + } + } + + if (is_dram_addr && !is_huge) { + dev_err(hdev->dev, "DRAM unmapping should use huge pages only\n"); + return -EFAULT; + } + + if (!(curr_pte & PAGE_PRESENT_MASK)) + goto not_mapped; + + for (i = hop_last ; i > 0 ; i--) { + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[i]); + if (hl_mmu_dr_put_pte(ctx, hop_addr[i])) + goto mapped; + } + hl_mmu_dr_clear_pte(ctx, hop_pte_addr[0]); + +mapped: + return 0; + +not_mapped: + dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n", + virt_addr); + + return -EINVAL; +} + +static int hl_mmu_v2_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, + u32 page_size, bool is_dram_addr) +{ + u64 hop_addr[MMU_ARCH_6_HOPS] = { 0 }, hop_pte_addr[MMU_ARCH_6_HOPS] = { 0 }, + curr_pte = 0, scrambled_virt_addr, scrambled_phys_addr; + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + bool hop_new[MMU_ARCH_6_HOPS] = { false }; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + int rc, i, hop_last; + + /* device resident in V2 are allowed only for HMMU */ + if (!is_dram_addr) + return -EINVAL; + + mmu_prop = &prop->dmmu; + + hop_last = mmu_prop->num_hops - 1; + + scrambled_virt_addr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + 
scrambled_phys_addr = hdev->asic_funcs->scramble_addr(hdev, phys_addr); + + /* First hop is preallocated therefore it is treated differently */ + hop_addr[0] = hl_mmu_dr_get_hop0_addr(ctx); + hop_pte_addr[0] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hop_addr[0], scrambled_virt_addr); + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[0]; + + /* Handle hop1 to hop_last */ + for (i = 1 ; i <= hop_last ; i++) { + hop_addr[i] = hl_mmu_dr_get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[i]); + if (hop_addr[i] == ULLONG_MAX) { + rc = -ENOMEM; + goto err; + } + + hop_pte_addr[i] = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hop_addr[i], scrambled_virt_addr); + if (hop_pte_addr[i] == U64_MAX) { + rc = -EINVAL; + goto err; + } + + if (!hop_pte_addr[i]) { + rc = -EINVAL; + goto err; + } + + curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[i]; + } + + if (curr_pte & PAGE_PRESENT_MASK) { + dev_err(hdev->dev, + "mapping already exists for virt_addr 0x%llx\n", + virt_addr); + + for (i = 0 ; i <= hop_last ; i++) + dev_dbg(hdev->dev, "hop%d pte: 0x%llx (0x%llx)\n", + i, *(u64 *) (uintptr_t) hop_pte_addr[i], + hop_pte_addr[i]); + + rc = -EINVAL; + goto err; + } + + curr_pte = (scrambled_phys_addr & HOP_PHYS_ADDR_MASK) + | mmu_prop->last_mask | PAGE_PRESENT_MASK; + + /* Write the PTEs */ + hl_mmu_dr_write_final_pte(ctx, hop_pte_addr[hop_last], curr_pte); + + /* for each new hop, add its address to the table of previous-hop */ + for (i = 1 ; i <= hop_last ; i++) { + if (hop_new[i]) { + curr_pte = (hop_addr[i] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; + hl_mmu_dr_write_pte(ctx, hop_pte_addr[i - 1], curr_pte); + + if (i - 1) + hl_mmu_dr_get_pte(ctx, hop_addr[i - 1]); + } + } + hl_mmu_dr_get_pte(ctx, hop_addr[hop_last]); + + return 0; + +err: + for (i = 1 ; i <= hop_last ; i++) + if (hop_new[i] && (hop_addr[i] != U64_MAX)) + hl_mmu_dr_free_hop(ctx, hop_addr[i]); + + return rc; +} + +/* + * hl_mmu_v2_swap_out - marks all mapping of the given ctx as swapped out + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v2_swap_out(struct hl_ctx *ctx) +{ + +} + +/* + * hl_mmu_v2_swap_in - marks all mapping of the given ctx as swapped in + * + * @ctx: pointer to the context structure + * + */ +static void hl_mmu_v2_swap_in(struct hl_ctx *ctx) +{ + +} + +static int hl_mmu_v2_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops) +{ + struct asic_fixed_properties *prop = &ctx->hdev->asic_prop; + struct hl_device *hdev = ctx->hdev; + struct hl_mmu_properties *mmu_prop; + bool is_dram_addr; + int i; + + is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size, + prop->dmmu.start_addr, + prop->dmmu.end_addr); + + /* device resident in V2 are allowed only for HMMU */ + if (!is_dram_addr) + return -EINVAL; + + mmu_prop = &prop->dmmu; + hops->range_type = HL_VA_RANGE_TYPE_DRAM; + + hops->scrambled_vaddr = hdev->asic_funcs->scramble_addr(hdev, virt_addr); + + hops->hop_info[0].hop_addr = hl_mmu_dr_get_phys_hop0_addr(ctx); + hops->hop_info[0].hop_pte_addr = hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, 0, + hops->hop_info[0].hop_addr, + hops->scrambled_vaddr); + if (hops->hop_info[0].hop_pte_addr == U64_MAX) + return -EFAULT; + + hops->hop_info[0].hop_pte_val = hdev->asic_funcs->read_pte(hdev, + hops->hop_info[0].hop_pte_addr); + if (hops->hop_info[0].hop_pte_val == U64_MAX) + return -EFAULT; + + for (i = 1 ; i < mmu_prop->num_hops ; i++) { + hops->hop_info[i].hop_addr = + hl_mmu_get_next_hop_addr(ctx, hops->hop_info[i - 1].hop_pte_val); + if 
(hops->hop_info[i].hop_addr == ULLONG_MAX) + return -EFAULT; + + hops->hop_info[i].hop_pte_addr = + hl_mmu_get_hop_pte_phys_addr(ctx, mmu_prop, i, + hops->hop_info[i].hop_addr, + hops->scrambled_vaddr); + if (hops->hop_info[i].hop_pte_addr == U64_MAX) + return -EFAULT; + + hops->hop_info[i].hop_pte_val = + hdev->asic_funcs->read_pte(hdev, + hops->hop_info[i].hop_pte_addr); + + if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) + return -EFAULT; + + if (hops->hop_info[i].hop_pte_val & mmu_prop->last_mask) + break; + } + + /* if passed over all hops then no last hop was found */ + if (i == mmu_prop->num_hops) + return -EFAULT; + + if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK)) + return -EFAULT; + + if (hops->scrambled_vaddr != virt_addr) + hops->unscrambled_paddr = hdev->asic_funcs->descramble_addr + (hdev, hops->hop_info[i].hop_pte_val); + else + hops->unscrambled_paddr = hops->hop_info[i].hop_pte_val; + + hops->used_hops = i + 1; + + return 0; +} + +/* + * hl_mmu_v2_prepare - prepare mmu_if for working with mmu v2 + * + * @hdev: pointer to the device structure + * @mmu_if: pointer to the mmu interface structure + */ +void hl_mmu_v2_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu) +{ + mmu->init = hl_mmu_dr_init; + mmu->fini = hl_mmu_dr_fini; + mmu->ctx_init = hl_mmu_v2_ctx_init; + mmu->ctx_fini = hl_mmu_v2_ctx_fini; + mmu->map = hl_mmu_v2_map; + mmu->unmap = hl_mmu_v2_unmap; + mmu->flush = hl_mmu_dr_flush; + mmu->swap_out = hl_mmu_v2_swap_out; + mmu->swap_in = hl_mmu_v2_swap_in; + mmu->get_tlb_info = hl_mmu_v2_get_tlb_info; +} diff --git a/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c b/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c index afe7ef964f82..31507b2a431b 100644 --- a/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c +++ b/drivers/accel/habanalabs/common/mmu/mmu_v2_hr.c @@ -47,7 +47,7 @@ static inline int hl_mmu_v2_hr_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; - return hl_mmu_hr_init(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size, + return hl_mmu_hr_init(hdev, &hdev->mmu_priv.hr, prop->pmmu.hop_table_size, prop->mmu_pgt_size); } @@ -65,7 +65,7 @@ static inline void hl_mmu_v2_hr_fini(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; - hl_mmu_hr_fini(hdev, &hdev->mmu_priv.hr, prop->mmu_hop_table_size); + hl_mmu_hr_fini(hdev, &hdev->mmu_priv.hr, prop->pmmu.hop_table_size); } /** @@ -108,7 +108,7 @@ static void hl_mmu_v2_hr_ctx_fini(struct hl_ctx *ctx) "pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n", pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes); hl_mmu_hr_free_hop_remove_pgt(pgt_info, &ctx->hdev->mmu_priv.hr, - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); } } @@ -150,7 +150,7 @@ static int _hl_mmu_v2_hr_unmap(struct hl_ctx *ctx, curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], hop_pte_phys_addr[i], - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); if ((i < hop_last) && (curr_pte & mmu_prop->last_mask)) { hop_last = i; @@ -169,14 +169,14 @@ static int _hl_mmu_v2_hr_unmap(struct hl_ctx *ctx, for (i = hop_last ; i > 0 ; i--) { hl_mmu_hr_clear_pte(ctx, hops_pgt_info[i], hop_pte_phys_addr[i], - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); if (hl_mmu_hr_put_pte(ctx, hops_pgt_info[i], &ctx->hdev->mmu_priv.hr, - ctx->hdev->asic_prop.mmu_hop_table_size)) + ctx->hdev->asic_prop.pmmu.hop_table_size)) 
goto mapped; } hl_mmu_hr_clear_pte(ctx, hops_pgt_info[0], hop_pte_phys_addr[0], - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); mapped: return 0; @@ -255,7 +255,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, scrambled_virt_addr); curr_pte = *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], hop_pte_phys_addr[i], - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); } if (curr_pte & PAGE_PRESENT_MASK) { @@ -268,7 +268,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, *(u64 *) (uintptr_t) hl_mmu_hr_pte_phys_to_virt(ctx, hops_pgt_info[i], hop_pte_phys_addr[i], - ctx->hdev->asic_prop.mmu_hop_table_size), + ctx->hdev->asic_prop.pmmu.hop_table_size), hop_pte_phys_addr[i]); rc = -EINVAL; goto err; @@ -279,7 +279,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, /* Write the PTEs */ hl_mmu_hr_write_pte(ctx, hops_pgt_info[hop_last], hop_pte_phys_addr[hop_last], curr_pte, - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); /* for each new hop, add its address to the table of previous-hop */ for (i = 1 ; i <= hop_last ; i++) { @@ -287,7 +287,7 @@ static int _hl_mmu_v2_hr_map(struct hl_ctx *ctx, curr_pte = (hops_pgt_info[i]->phys_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK; hl_mmu_hr_write_pte(ctx, hops_pgt_info[i - 1], hop_pte_phys_addr[i - 1], - curr_pte, ctx->hdev->asic_prop.mmu_hop_table_size); + curr_pte, ctx->hdev->asic_prop.pmmu.hop_table_size); if (i - 1) hl_mmu_hr_get_pte(ctx, &ctx->hdev->mmu_func[MMU_HR_PGT].hr_funcs, hops_pgt_info[i - 1]->phys_addr); @@ -303,7 +303,7 @@ err: for (i = 1 ; i <= hop_last ; i++) if (hop_new[i] && hops_pgt_info[i]) hl_mmu_hr_free_hop_remove_pgt(hops_pgt_info[i], &ctx->hdev->mmu_priv.hr, - ctx->hdev->asic_prop.mmu_hop_table_size); + ctx->hdev->asic_prop.pmmu.hop_table_size); return rc; } diff --git a/drivers/accel/habanalabs/common/pci/pci.c b/drivers/accel/habanalabs/common/pci/pci.c index 191e0e3cf3a5..81cbd8697d4c 100644 --- a/drivers/accel/habanalabs/common/pci/pci.c +++ b/drivers/accel/habanalabs/common/pci/pci.c @@ -123,7 +123,7 @@ int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data) pci_read_config_dword(pdev, mmPCI_CONFIG_ELBI_DATA, data); if (unlikely(trace_habanalabs_elbi_read_enabled())) - trace_habanalabs_elbi_read(hdev->dev, (u32) addr, val); + trace_habanalabs_elbi_read(&hdev->pdev->dev, (u32) addr, val); return 0; } @@ -186,7 +186,7 @@ static int hl_pci_elbi_write(struct hl_device *hdev, u64 addr, u32 data) if ((val & PCI_CONFIG_ELBI_STS_MASK) == PCI_CONFIG_ELBI_STS_DONE) { if (unlikely(trace_habanalabs_elbi_write_enabled())) - trace_habanalabs_elbi_write(hdev->dev, (u32) addr, val); + trace_habanalabs_elbi_write(&hdev->pdev->dev, (u32) addr, val); return 0; } diff --git a/drivers/accel/habanalabs/common/security.c b/drivers/accel/habanalabs/common/security.c index fe913965dbad..5402a3cd0491 100644 --- a/drivers/accel/habanalabs/common/security.c +++ b/drivers/accel/habanalabs/common/security.c @@ -7,15 +7,31 @@ #include "habanalabs.h" -static const char * const hl_glbl_error_cause[HL_MAX_NUM_OF_GLBL_ERR_CAUSE] = { +static const char * const hl_glbl_error_cause[] = { "Error due to un-priv read", "Error due to un-secure read", "Error due to read from unmapped reg", "Error due to un-priv write", "Error due to un-secure write", "Error due to write to unmapped reg", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", "External I/F write sec violation", 
"External I/F write to un-mapped reg", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", + "N/A", "Read to write only", "Write to read only" }; @@ -671,10 +687,11 @@ static bool hl_check_block_range_exclusion(struct hl_device *hdev, static int hl_read_glbl_errors(struct hl_device *hdev, u32 blk_idx, u32 major, u32 minor, u32 sub_minor, void *data) { - struct hl_special_block_info *special_blocks = hdev->asic_prop.special_blocks; + struct asic_fixed_properties *prop = &hdev->asic_prop; + struct hl_special_block_info *special_blocks = prop->special_blocks; struct hl_special_block_info *current_block = &special_blocks[blk_idx]; u32 glbl_err_addr, glbl_err_cause, addr_val, cause_val, block_base, - base = current_block->base_addr - lower_32_bits(hdev->asic_prop.cfg_base_address); + base = current_block->base_addr - lower_32_bits(prop->cfg_base_address); int i; block_base = base + major * current_block->major_offset + @@ -689,13 +706,13 @@ static int hl_read_glbl_errors(struct hl_device *hdev, glbl_err_addr = block_base + HL_GLBL_ERR_ADDR_OFFSET; addr_val = RREG32(glbl_err_addr); - for (i = 0 ; i < hdev->asic_prop.glbl_err_cause_num ; i++) { + for (i = 0 ; i <= prop->glbl_err_max_cause_num ; i++) { if (cause_val & BIT(i)) dev_err_ratelimited(hdev->dev, - "%s, addr %#llx\n", - hl_glbl_error_cause[i], - hdev->asic_prop.cfg_base_address + block_base + - FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, addr_val)); + "%s, addr %#llx\n", + hl_glbl_error_cause[i], + prop->cfg_base_address + block_base + + FIELD_GET(HL_GLBL_ERR_ADDRESS_MASK, addr_val)); } WREG32(glbl_err_cause, cause_val); diff --git a/drivers/accel/habanalabs/common/security.h b/drivers/accel/habanalabs/common/security.h index d7a3b3e82ea4..476f70687c09 100644 --- a/drivers/accel/habanalabs/common/security.h +++ b/drivers/accel/habanalabs/common/security.h @@ -13,8 +13,7 @@ struct hl_device; /* special blocks */ -#define HL_MAX_NUM_OF_GLBL_ERR_CAUSE 10 -#define HL_GLBL_ERR_ADDRESS_MASK GENMASK(11, 0) +#define HL_GLBL_ERR_ADDRESS_MASK GENMASK(11, 0) /* GLBL_ERR_ADDR register offset from the start of the block */ #define HL_GLBL_ERR_ADDR_OFFSET 0xF44 /* GLBL_ERR_CAUSE register offset from the start of the block */ diff --git a/drivers/accel/habanalabs/common/sysfs.c b/drivers/accel/habanalabs/common/sysfs.c index 8a9f98832157..e9f8ccc0bbf9 100644 --- a/drivers/accel/habanalabs/common/sysfs.c +++ b/drivers/accel/habanalabs/common/sysfs.c @@ -142,8 +142,9 @@ static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr, { struct hl_device *hdev = dev_get_drvdata(dev); - return sprintf(buf, "0x%08x\n", - le32_to_cpu(hdev->asic_prop.cpucp_info.cpld_version)); + return sprintf(buf, "0x%08x%08x\n", + le32_to_cpu(hdev->asic_prop.cpucp_info.cpld_timestamp), + le32_to_cpu(hdev->asic_prop.cpucp_info.cpld_version)); } static ssize_t cpucp_kernel_ver_show(struct device *dev, @@ -270,6 +271,9 @@ static ssize_t device_type_show(struct device *dev, case ASIC_GAUDI2C: str = "GAUDI2C"; break; + case ASIC_GAUDI2D: + str = "GAUDI2D"; + break; default: dev_err(hdev->dev, "Unrecognized ASIC type %d\n", hdev->asic_type); diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 53292d4c15c8..fa893a9b826e 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -614,8 +614,6 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) else prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; - prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; - 
prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; prop->dram_page_size = PAGE_SIZE_2MB; prop->device_mem_alloc_default_page_size = prop->dram_page_size; prop->dram_supports_virtual_memory = false; @@ -637,8 +635,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->pmmu.num_hops = MMU_ARCH_5_HOPS; prop->pmmu.last_mask = LAST_MASK; /* TODO: will be duplicated until implementing per-MMU props */ - prop->pmmu.hop_table_size = prop->mmu_hop_table_size; - prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->pmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; /* PMMU and HPMMU are the same except of page size */ memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu)); @@ -649,6 +647,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->dmmu.start_addr = (VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2); prop->dmmu.end_addr = VA_HOST_SPACE_END; prop->dmmu.page_size = PAGE_SIZE_2MB; + prop->dmmu.pgt_size = prop->mmu_pgt_size; prop->cfg_size = CFG_SIZE; prop->max_asid = MAX_ASID; @@ -1640,10 +1639,8 @@ static int gaudi_late_init(struct hl_device *hdev) } rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, 0x0); - if (rc) { - dev_err(hdev->dev, "Failed to enable PCI access from CPU\n"); + if (rc) return rc; - } /* Scrub both SRAM and DRAM */ rc = hdev->asic_funcs->scrub_device_mem(hdev); @@ -3652,7 +3649,7 @@ static int gaudi_mmu_init(struct hl_device *hdev) for (i = 0 ; i < prop->max_asid ; i++) { hop0_addr = prop->mmu_pgt_addr + - (i * prop->mmu_hop_table_size); + (i * prop->dmmu.hop_table_size); rc = gaudi_mmu_update_asid_hop0_addr(hdev, i, hop0_addr); if (rc) { @@ -4155,13 +4152,7 @@ skip_reset: static int gaudi_suspend(struct hl_device *hdev) { - int rc; - - rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); - if (rc) - dev_err(hdev->dev, "Failed to disable PCI access from CPU\n"); - - return rc; + return hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); } static int gaudi_resume(struct hl_device *hdev) diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index e0e5615ef9b0..a38b88baadf2 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -158,11 +158,13 @@ #define RAZWI_INITIATOR_ID_X_Y(xl, yl, xh) \ (RAZWI_INITIATOR_ID_X_Y_LOW(xl, yl) | RAZWI_INITIATOR_ID_X_HIGH(xh)) -#define PSOC_RAZWI_ENG_STR_SIZE 128 -#define PSOC_RAZWI_MAX_ENG_PER_RTR 5 +#define PSOC_RAZWI_ENG_STR_SIZE 128 +#define PSOC_RAZWI_MAX_ENG_PER_RTR 5 /* HW scrambles only bits 0-25 */ -#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26) +#define HW_UNSCRAMBLED_BITS_MASK GENMASK_ULL(63, 26) + +#define GAUDI2_GLBL_ERR_MAX_CAUSE_NUM 17 struct gaudi2_razwi_info { u32 axuser_xy; @@ -2308,11 +2310,26 @@ static int set_number_of_functional_hbms(struct hl_device *hdev) return 0; } +static bool gaudi2_is_edma_queue_id(u32 queue_id) +{ + + switch (queue_id) { + case GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE0_EDMA_1_3: + case GAUDI2_QUEUE_ID_DCORE1_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE1_EDMA_1_3: + case GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE2_EDMA_1_3: + case GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0...GAUDI2_QUEUE_ID_DCORE3_EDMA_1_3: + return true; + default: + return false; + } +} + static int gaudi2_set_dram_properties(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; - u32 basic_hbm_page_size; 
- int rc; + u64 hbm_drv_base_offset = 0, edma_pq_base_addr; + u32 basic_hbm_page_size, edma_idx = 0; + int rc, i; rc = set_number_of_functional_hbms(hdev); if (rc) @@ -2356,9 +2373,35 @@ static int gaudi2_set_dram_properties(struct hl_device *hdev) prop->dmmu.start_addr = prop->dram_base_address + (prop->dram_page_size * DIV_ROUND_UP_SECTOR_T(prop->dram_size, prop->dram_page_size)); - prop->dmmu.end_addr = prop->dmmu.start_addr + prop->dram_page_size * div_u64((VA_HBM_SPACE_END - prop->dmmu.start_addr), prop->dmmu.page_size); + /* + * Driver can't share an (48MB) HBM page with the F/W in order to prevent FW to block + * the driver part by range register, so it must start at the next (48MB) page + */ + hbm_drv_base_offset = roundup(CPU_FW_IMAGE_SIZE, prop->num_functional_hbms * SZ_8M); + + /* + * The NIC driver section size and the HMMU page tables section in the HBM needs + * to be the remaining size in the first dram page after taking into + * account the F/W image size + */ + + /* Reserve region in HBM for HMMU page tables */ + prop->mmu_pgt_addr = DRAM_PHYS_BASE + hbm_drv_base_offset + + ((prop->dram_page_size - hbm_drv_base_offset) - + (HMMU_PAGE_TABLES_SIZE + EDMA_PQS_SIZE + EDMA_SCRATCHPAD_SIZE)); + + /* Set EDMA PQs HBM addresses */ + edma_pq_base_addr = prop->mmu_pgt_addr + HMMU_PAGE_TABLES_SIZE; + + for (i = 0 ; i < GAUDI2_QUEUE_ID_CPU_PQ ; i++) { + if (gaudi2_is_edma_queue_id(i)) { + prop->hw_queues_props[i].q_dram_bd_address = edma_pq_base_addr + + (edma_idx * HL_QUEUE_SIZE_IN_BYTES); + edma_idx++; + } + } return 0; } @@ -2368,7 +2411,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) struct asic_fixed_properties *prop = &hdev->asic_prop; struct hw_queue_properties *q_props; u32 num_sync_stream_queues = 0; - int i; + int i, rc; prop->max_queues = GAUDI2_QUEUE_ID_SIZE; prop->hw_queues_props = kcalloc(prop->max_queues, sizeof(struct hw_queue_properties), @@ -2391,6 +2434,9 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) } q_props[i].cb_alloc_flags = CB_ALLOC_USER; + + if (gaudi2_is_edma_queue_id(i)) + q_props[i].dram_bd = 1; } q_props[GAUDI2_QUEUE_ID_CPU_PQ].type = QUEUE_TYPE_CPU; @@ -2419,46 +2465,43 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->rotator_enabled_mask = BIT(NUM_OF_ROT) - 1; - if (hdev->pldm) - prop->mmu_pgt_size = 0x800000; /* 8MB */ - else - prop->mmu_pgt_size = MMU_PAGE_TABLES_INITIAL_SIZE; + prop->max_asid = 2; + prop->dmmu.pgt_size = HMMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; - prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; - prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; prop->dmmu.hop_shifts[MMU_HOP0] = DHOP0_SHIFT; prop->dmmu.hop_shifts[MMU_HOP1] = DHOP1_SHIFT; prop->dmmu.hop_shifts[MMU_HOP2] = DHOP2_SHIFT; prop->dmmu.hop_shifts[MMU_HOP3] = DHOP3_SHIFT; - prop->dmmu.hop_shifts[MMU_HOP4] = DHOP4_SHIFT; prop->dmmu.hop_masks[MMU_HOP0] = DHOP0_MASK; prop->dmmu.hop_masks[MMU_HOP1] = DHOP1_MASK; prop->dmmu.hop_masks[MMU_HOP2] = DHOP2_MASK; prop->dmmu.hop_masks[MMU_HOP3] = DHOP3_MASK; - prop->dmmu.hop_masks[MMU_HOP4] = DHOP4_MASK; prop->dmmu.page_size = PAGE_SIZE_1GB; - prop->dmmu.num_hops = MMU_ARCH_6_HOPS; + prop->dmmu.num_hops = MMU_ARCH_4_HOPS; prop->dmmu.last_mask = LAST_MASK; - prop->dmmu.host_resident = 1; - prop->dmmu.hop_table_size = prop->mmu_hop_table_size; - prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->dmmu.host_resident = 0; + prop->dmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->dmmu.hop0_tables_total_size = 
HOP_TABLE_SIZE_512_PTE * prop->max_asid; - /* - * this is done in order to be able to validate FW descriptor (i.e. validating that - * the addresses and allocated space for FW image does not cross memory bounds). - * for this reason we set the DRAM size to the minimum possible and later it will - * be modified according to what reported in the cpucp info packet + /* As we need to set the pgt address in dram for HMMU init so we cannot + * wait to the fw cpucp info to set the dram props as mmu init comes before + * hw init */ - prop->dram_size = (GAUDI2_HBM_NUM - 1) * SZ_16G; + rc = hdev->asic_funcs->set_dram_properties(hdev); + if (rc) + goto free_qprops; + + prop->mmu_pgt_size = PMMU_PAGE_TABLES_SIZE; + prop->pmmu.pgt_size = prop->mmu_pgt_size; hdev->pmmu_huge_range = true; prop->pmmu.host_resident = 1; prop->pmmu.num_hops = MMU_ARCH_6_HOPS; prop->pmmu.last_mask = LAST_MASK; - prop->pmmu.hop_table_size = prop->mmu_hop_table_size; - prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->pmmu.hop0_tables_total_size = HOP_TABLE_SIZE_512_PTE * prop->max_asid; prop->hints_host_reserved_va_range.start_addr = RESERVED_VA_FOR_VIRTUAL_MSIX_DOORBELL_START; prop->hints_host_reserved_va_range.end_addr = RESERVED_VA_RANGE_FOR_ARC_ON_HOST_END; @@ -2516,7 +2559,6 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->max_num_of_engines = GAUDI2_ENGINE_ID_SIZE; prop->num_engine_cores = CPU_ID_MAX; prop->cfg_size = CFG_SIZE; - prop->max_asid = MAX_ASID; prop->num_of_events = GAUDI2_EVENT_SIZE; prop->supports_engine_modes = true; @@ -2559,7 +2601,13 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->hbw_flush_reg = mmPCIE_WRAP_SPECIAL_GLBL_SPARE_0; + prop->supports_advanced_cpucp_rc = true; + return 0; + +free_qprops: + kfree(prop->hw_queues_props); + return rc; } static int gaudi2_pci_bars_map(struct hl_device *hdev) @@ -3033,6 +3081,25 @@ static int gaudi2_fetch_psoc_frequency(struct hl_device *hdev) return 0; } +static int gaudi2_mmu_clear_pgt_range(struct hl_device *hdev) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc; + + if (!(gaudi2->hw_cap_initialized & HW_CAP_MMU_MASK)) + return 0; + + if (prop->dmmu.host_resident) + return 0; + + rc = gaudi2_memset_device_memory(hdev, prop->mmu_pgt_addr, prop->dmmu.pgt_size, 0); + if (rc) + dev_err(hdev->dev, "Failed to clear mmu pgt"); + + return rc; +} + static int gaudi2_early_init(struct hl_device *hdev) { struct asic_fixed_properties *prop = &hdev->asic_prop; @@ -3243,14 +3310,10 @@ static int gaudi2_late_init(struct hl_device *hdev) struct gaudi2_device *gaudi2 = hdev->asic_specific; int rc; - hdev->asic_prop.supports_advanced_cpucp_rc = true; - rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, gaudi2->virt_msix_db_dma_addr); - if (rc) { - dev_err(hdev->dev, "Failed to enable PCI access from CPU\n"); + if (rc) return rc; - } rc = gaudi2_fetch_psoc_frequency(hdev); if (rc) { @@ -3258,6 +3321,12 @@ static int gaudi2_late_init(struct hl_device *hdev) goto disable_pci_access; } + rc = gaudi2_mmu_clear_pgt_range(hdev); + if (rc) { + dev_err(hdev->dev, "Failed to clear MMU page tables range\n"); + goto disable_pci_access; + } + gaudi2_init_arcs(hdev); rc = gaudi2_scrub_arcs_dccm(hdev); @@ -3518,7 +3587,7 @@ static int gaudi2_special_blocks_config(struct hl_device *hdev) int i, rc; /* Configure Special blocks */ - prop->glbl_err_cause_num = 
GAUDI2_NUM_OF_GLBL_ERR_CAUSE; + prop->glbl_err_max_cause_num = GAUDI2_GLBL_ERR_MAX_CAUSE_NUM; prop->num_of_special_blocks = ARRAY_SIZE(gaudi2_special_blocks); prop->special_blocks = kmalloc_array(prop->num_of_special_blocks, sizeof(*prop->special_blocks), GFP_KERNEL); @@ -3697,13 +3766,7 @@ static int gaudi2_sw_init(struct hl_device *hdev) spin_lock_init(&gaudi2->hw_queues_lock); - gaudi2->scratchpad_kernel_address = hl_asic_dma_alloc_coherent(hdev, PAGE_SIZE, - &gaudi2->scratchpad_bus_address, - GFP_KERNEL | __GFP_ZERO); - if (!gaudi2->scratchpad_kernel_address) { - rc = -ENOMEM; - goto free_virt_msix_db_mem; - } + gaudi2->scratchpad_bus_address = prop->mmu_pgt_addr + HMMU_PAGE_TABLES_SIZE + EDMA_PQS_SIZE; gaudi2_user_mapped_blocks_init(hdev); @@ -3718,7 +3781,7 @@ static int gaudi2_sw_init(struct hl_device *hdev) prop->supports_compute_reset = true; /* Event queue sanity check added in FW version 1.11 */ - if (hl_is_fw_sw_ver_below(hdev, 1, 11)) + if (hl_fw_version_cmp(hdev, 1, 11, 0) < 0) hdev->event_queue.check_eqe_index = false; else hdev->event_queue.check_eqe_index = true; @@ -3727,19 +3790,18 @@ static int gaudi2_sw_init(struct hl_device *hdev) rc = gaudi2_special_blocks_iterator_config(hdev); if (rc) - goto free_scratchpad_mem; + goto free_virt_msix_db_mem; rc = gaudi2_test_queues_msgs_alloc(hdev); if (rc) goto special_blocks_free; + hdev->heartbeat_debug_info.cpu_queue_id = GAUDI2_QUEUE_ID_CPU_PQ; + return 0; special_blocks_free: gaudi2_special_blocks_iterator_free(hdev); -free_scratchpad_mem: - hl_asic_dma_free_coherent(hdev, PAGE_SIZE, gaudi2->scratchpad_kernel_address, - gaudi2->scratchpad_bus_address); free_virt_msix_db_mem: hl_cpu_accessible_dma_pool_free(hdev, prop->pmmu.page_size, gaudi2->virt_msix_db_cpu_addr); free_cpu_accessible_dma_pool: @@ -3770,9 +3832,6 @@ static int gaudi2_sw_fini(struct hl_device *hdev) hl_asic_dma_free_coherent(hdev, HL_CPU_ACCESSIBLE_MEM_SIZE, hdev->cpu_accessible_dma_mem, hdev->cpu_accessible_dma_address); - hl_asic_dma_free_coherent(hdev, PAGE_SIZE, gaudi2->scratchpad_kernel_address, - gaudi2->scratchpad_bus_address); - dma_pool_destroy(hdev->dma_pool); kfree(gaudi2); @@ -4254,6 +4313,8 @@ static int gaudi2_enable_msix(struct hl_device *hdev) if (gaudi2->hw_cap_initialized & HW_CAP_MSIX) return 0; + hl_init_cpu_for_irq(hdev); + rc = pci_alloc_irq_vectors(hdev->pdev, GAUDI2_MSIX_ENTRIES, GAUDI2_MSIX_ENTRIES, PCI_IRQ_MSIX); if (rc < 0) { @@ -4307,6 +4368,7 @@ static int gaudi2_enable_msix(struct hl_device *hdev) i++, j++, user_irq_init_cnt++) { irq = pci_irq_vector(hdev->pdev, i); + hl_set_irq_affinity(hdev, irq); rc = request_irq(irq, hl_irq_user_interrupt_handler, 0, gaudi2_irq_name(i), &hdev->user_interrupt[j]); if (rc) { @@ -4333,6 +4395,7 @@ free_user_irq: i < GAUDI2_IRQ_NUM_USER_FIRST + user_irq_init_cnt ; i++, j++) { irq = pci_irq_vector(hdev->pdev, i); + irq_set_affinity_and_hint(irq, NULL); free_irq(irq, &hdev->user_interrupt[j]); } irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_UNEXPECTED_ERROR); @@ -4413,6 +4476,7 @@ static void gaudi2_disable_msix(struct hl_device *hdev) k < hdev->asic_prop.user_interrupt_count ; i++, j++, k++) { irq = pci_irq_vector(hdev->pdev, i); + irq_set_affinity_and_hint(irq, NULL); free_irq(irq, &hdev->user_interrupt[j]); } @@ -4957,10 +5021,17 @@ static void gaudi2_init_qman_pq(struct hl_device *hdev, u32 reg_base, q = &hdev->kernel_queues[queue_id_base + pq_id]; pq_offset = pq_id * 4; - WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset, - lower_32_bits(q->bus_address)); - WREG32(reg_base + 
QM_PQ_BASE_HI_0_OFFSET + pq_offset, - upper_32_bits(q->bus_address)); + if (q->dram_bd) { + WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset, + lower_32_bits(q->pq_dram_address)); + WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset, + upper_32_bits(q->pq_dram_address)); + } else { + WREG32(reg_base + QM_PQ_BASE_LO_0_OFFSET + pq_offset, + lower_32_bits(q->bus_address)); + WREG32(reg_base + QM_PQ_BASE_HI_0_OFFSET + pq_offset, + upper_32_bits(q->bus_address)); + } WREG32(reg_base + QM_PQ_SIZE_0_OFFSET + pq_offset, ilog2(HL_QUEUE_LENGTH)); WREG32(reg_base + QM_PQ_PI_0_OFFSET + pq_offset, 0); WREG32(reg_base + QM_PQ_CI_0_OFFSET + pq_offset, 0); @@ -5847,7 +5918,8 @@ static int gaudi2_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_har return rc; } -static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base) +static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base, + bool host_resident_pgt) { struct asic_fixed_properties *prop = &hdev->asic_prop; u64 hop0_addr; @@ -5859,7 +5931,11 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base) max_asid = min((u32) 8, max_asid); for (asid = 0 ; asid < max_asid ; asid++) { - hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr; + if (host_resident_pgt) + hop0_addr = hdev->mmu_priv.hr.mmu_asid_hop0[asid].phys_addr; + else + hop0_addr = prop->mmu_pgt_addr + (asid * prop->dmmu.hop_table_size); + rc = gaudi2_mmu_update_asid_hop0_addr(hdev, stlb_base, asid, hop0_addr); if (rc) { dev_err(hdev->dev, "failed to set hop0 addr for asid %d\n", asid); @@ -5870,7 +5946,8 @@ static int gaudi2_mmu_update_hop0_addr(struct hl_device *hdev, u32 stlb_base) return 0; } -static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb_base) +static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb_base, + bool host_resident_pgt) { u32 status, timeout_usec; int rc; @@ -5893,7 +5970,7 @@ static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb if (rc) dev_notice_ratelimited(hdev->dev, "Timeout when waiting for MMU SRAM init\n"); - rc = gaudi2_mmu_update_hop0_addr(hdev, stlb_base); + rc = gaudi2_mmu_update_hop0_addr(hdev, stlb_base, host_resident_pgt); if (rc) return rc; @@ -5917,6 +5994,7 @@ static int gaudi2_mmu_init_common(struct hl_device *hdev, u32 mmu_base, u32 stlb static int gaudi2_pci_mmu_init(struct hl_device *hdev) { + struct asic_fixed_properties *prop = &hdev->asic_prop; struct gaudi2_device *gaudi2 = hdev->asic_specific; u32 mmu_base, stlb_base; int rc; @@ -5956,7 +6034,7 @@ static int gaudi2_pci_mmu_init(struct hl_device *hdev) WREG32(mmu_base + MMU_SPI_SEI_MASK_OFFSET, GAUDI2_PMMU_SPI_SEI_ENABLE_MASK); - rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base); + rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base, prop->pmmu.host_resident); if (rc) return rc; @@ -6008,7 +6086,7 @@ static int gaudi2_dcore_hmmu_init(struct hl_device *hdev, int dcore_id, WREG32(mmu_base + MMU_SPI_SEI_MASK_OFFSET, GAUDI2_HMMU_SPI_SEI_ENABLE_MASK); - rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base); + rc = gaudi2_mmu_init_common(hdev, mmu_base, stlb_base, prop->dmmu.host_resident); if (rc) return rc; @@ -6236,26 +6314,6 @@ static void gaudi2_execute_hard_reset(struct hl_device *hdev) WREG32(mmPSOC_RESET_CONF_SW_ALL_RST, 1); } -static int gaudi2_get_soft_rst_done_indication(struct hl_device *hdev, u32 poll_timeout_us) -{ - int i, rc = 0; - u32 reg_val; - - for (i = 0 ; i < GAUDI2_RESET_POLL_CNT ; i++) - rc = hl_poll_timeout( - 
hdev, - mmCPU_RST_STATUS_TO_HOST, - reg_val, - reg_val == CPU_RST_STATUS_SOFT_RST_DONE, - 1000, - poll_timeout_us); - - if (rc) - dev_err(hdev->dev, "Timeout while waiting for FW to complete soft reset (0x%x)\n", - reg_val); - return rc; -} - /** * gaudi2_execute_soft_reset - execute soft reset by driver/FW * @@ -6268,23 +6326,8 @@ static int gaudi2_get_soft_rst_done_indication(struct hl_device *hdev, u32 poll_ static int gaudi2_execute_soft_reset(struct hl_device *hdev, bool driver_performs_reset, u32 poll_timeout_us) { - int rc; - - if (!driver_performs_reset) { - if (hl_is_fw_sw_ver_below(hdev, 1, 10)) { - /* set SP to indicate reset request sent to FW */ - WREG32(mmCPU_RST_STATUS_TO_HOST, CPU_RST_STATUS_NA); - - WREG32(mmGIC_HOST_SOFT_RST_IRQ_POLL_REG, - gaudi2_irq_map_table[GAUDI2_EVENT_CPU_SOFT_RESET].cpu_id); - - /* wait for f/w response */ - rc = gaudi2_get_soft_rst_done_indication(hdev, poll_timeout_us); - } else { - rc = hl_fw_send_soft_reset(hdev); - } - return rc; - } + if (!driver_performs_reset) + return hl_fw_send_soft_reset(hdev); /* Block access to engines, QMANs and SM during reset, these * RRs will be reconfigured after soft reset. @@ -6424,13 +6467,7 @@ skip_reset: static int gaudi2_suspend(struct hl_device *hdev) { - int rc; - - rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); - if (rc) - dev_err(hdev->dev, "Failed to disable PCI access from CPU\n"); - - return rc; + return hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); } static int gaudi2_resume(struct hl_device *hdev) @@ -7046,7 +7083,7 @@ static int gaudi2_test_queues(struct hl_device *hdev) /* send test message on all enabled Qs */ for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { - if (!gaudi2_is_queue_enabled(hdev, i)) + if (!gaudi2_is_queue_enabled(hdev, i) || gaudi2_is_edma_queue_id(i)) continue; msg_info = &gaudi2->queues_test_info[i - GAUDI2_QUEUE_ID_PDMA_0_0]; @@ -7063,7 +7100,7 @@ static int gaudi2_test_queues(struct hl_device *hdev) /* verify that all messages were processed */ for (i = GAUDI2_QUEUE_ID_PDMA_0_0 ; i < GAUDI2_QUEUE_ID_CPU_PQ; i++) { - if (!gaudi2_is_queue_enabled(hdev, i)) + if (!gaudi2_is_queue_enabled(hdev, i) || gaudi2_is_edma_queue_id(i)) continue; rc = gaudi2_test_queue_wait_completion(hdev, i, sob_val); @@ -7836,7 +7873,7 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, bool has_block_id = false; u16 block_id; - if (!hl_is_fw_sw_ver_below(hdev, 1, 12)) + if (hl_fw_version_cmp(hdev, 1, 12, 0) >= 0) has_block_id = true; ecc_address = le64_to_cpu(ecc_data->ecc_address); @@ -8087,13 +8124,7 @@ static void gaudi2_ack_module_razwi_event_handler(struct hl_device *hdev, } hbw_rtr_id = gaudi2_tpc_initiator_hbw_rtr_id[module_idx]; - - if (hl_is_fw_sw_ver_below(hdev, 1, 9) && - !hdev->asic_prop.fw_security_enabled && - ((module_idx == 0) || (module_idx == 1))) - lbw_rtr_id = DCORE0_RTR0; - else - lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx]; + lbw_rtr_id = gaudi2_tpc_initiator_lbw_rtr_id[module_idx]; break; case RAZWI_MME: sprintf(initiator_name, "MME_%u", module_idx); @@ -8907,9 +8938,6 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device *hdev, u16 event_typ u32 error_count = 0; int i; - gaudi2_print_event(hdev, event_type, true, - "intr_cause_data: %#llx", intr_cause_data); - for (i = 0 ; i < GAUDI2_NUM_OF_PCIE_ADDR_DEC_ERR_CAUSE ; i++) { if (!(intr_cause_data & BIT_ULL(i))) continue; @@ -8918,15 +8946,16 @@ static int gaudi2_print_pcie_addr_dec_info(struct hl_device 
*hdev, u16 event_typ "err cause: %s", gaudi2_pcie_addr_dec_error_cause[i]); error_count++; - /* - * Always check for LBW and HBW additional info as the indication itself is - * sometimes missing - */ + switch (intr_cause_data & BIT_ULL(i)) { + case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_AXI_LBW_ERR_INTR_MASK: + hl_check_for_glbl_errors(hdev); + break; + case PCIE_WRAP_PCIE_IC_SEI_INTR_IND_BAD_ACCESS_INTR_MASK: + gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); + break; + } } - hl_check_for_glbl_errors(hdev); - gaudi2_print_pcie_mstr_rr_mstr_if_razwi_info(hdev, event_mask); - return error_count; } @@ -8983,7 +9012,6 @@ static void gaudi2_handle_page_error(struct hl_device *hdev, u64 mmu_base, bool if (is_pmmu) { dev_err_ratelimited(hdev->dev, "PMMU page fault on va 0x%llx\n", addr); } else { - addr = gaudi2_mmu_descramble_addr(hdev, addr); addr &= HW_UNSCRAMBLED_BITS_MASK; dev_err_ratelimited(hdev->dev, "HMMU page fault on va range 0x%llx - 0x%llx\n", @@ -9235,8 +9263,8 @@ static int gaudi2_handle_mmu_spi_sei_err(struct hl_device *hdev, u16 event_type, static bool gaudi2_hbm_sei_handle_read_err(struct hl_device *hdev, struct hl_eq_hbm_sei_read_err_intr_info *rd_err_data, u32 err_cnt) { + bool require_hard_reset = false; u32 addr, beat, beat_shift; - bool rc = false; dev_err_ratelimited(hdev->dev, "READ ERROR count: ECC SERR: %d, ECC DERR: %d, RD_PARITY: %d\n", @@ -9268,7 +9296,7 @@ static bool gaudi2_hbm_sei_handle_read_err(struct hl_device *hdev, beat, le32_to_cpu(rd_err_data->dbg_rd_err_dm), le32_to_cpu(rd_err_data->dbg_rd_err_syndrome)); - rc |= true; + require_hard_reset = true; } beat_shift = beat * HBM_RD_ERR_BEAT_SHIFT; @@ -9281,7 +9309,7 @@ static bool gaudi2_hbm_sei_handle_read_err(struct hl_device *hdev, (le32_to_cpu(rd_err_data->dbg_rd_err_misc) & (HBM_RD_ERR_PAR_DATA_BEAT0_MASK << beat_shift)) >> (HBM_RD_ERR_PAR_DATA_BEAT0_SHIFT + beat_shift)); - rc |= true; + require_hard_reset = true; } dev_err_ratelimited(hdev->dev, "Beat%d DQ data:\n", beat); @@ -9291,7 +9319,7 @@ static bool gaudi2_hbm_sei_handle_read_err(struct hl_device *hdev, le32_to_cpu(rd_err_data->dbg_rd_err_data[beat * 2 + 1])); } - return rc; + return require_hard_reset; } static void gaudi2_hbm_sei_print_wr_par_info(struct hl_device *hdev, @@ -9514,25 +9542,17 @@ static int gaudi2_handle_pcie_p2p_msix(struct hl_device *hdev, u16 event_type) static int gaudi2_handle_pcie_drain(struct hl_device *hdev, struct hl_eq_pcie_drain_ind_data *drain_data) { - u64 lbw_rd, lbw_wr, hbw_rd, hbw_wr, cause, error_count = 0; + u64 cause, error_count = 0; cause = le64_to_cpu(drain_data->intr_cause.intr_cause_data); - lbw_rd = le64_to_cpu(drain_data->drain_rd_addr_lbw); - lbw_wr = le64_to_cpu(drain_data->drain_wr_addr_lbw); - hbw_rd = le64_to_cpu(drain_data->drain_rd_addr_hbw); - hbw_wr = le64_to_cpu(drain_data->drain_wr_addr_hbw); if (cause & BIT_ULL(0)) { - dev_err_ratelimited(hdev->dev, - "PCIE AXI drain LBW completed, read_err %u, write_err %u\n", - !!lbw_rd, !!lbw_wr); + dev_err_ratelimited(hdev->dev, "PCIE AXI drain LBW completed\n"); error_count++; } if (cause & BIT_ULL(1)) { - dev_err_ratelimited(hdev->dev, - "PCIE AXI drain HBW completed, raddr %#llx, waddr %#llx\n", - hbw_rd, hbw_wr); + dev_err_ratelimited(hdev->dev, "PCIE AXI drain HBW completed\n"); error_count++; } @@ -9757,11 +9777,6 @@ static u16 event_id_to_engine_id(struct hl_device *hdev, u16 event_type) return U16_MAX; } -static void hl_eq_heartbeat_event_handle(struct hl_device *hdev) -{ - hdev->eq_heartbeat_received = true; -} - static void 
gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) { struct gaudi2_device *gaudi2 = hdev->asic_specific; @@ -9983,6 +9998,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent if (gaudi2_handle_hbm_mc_sei_err(hdev, event_type, &eq_entry->sei_data)) { reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; reset_required = true; + is_critical = eq_entry->sei_data.hdr.is_critical; } error_count++; break; @@ -10003,7 +10019,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data); reset_flags |= HL_DRV_RESET_FW_FATAL_ERR; event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; - if (hl_is_fw_sw_ver_equal_or_greater(hdev, 1, 13)) + if (hl_fw_version_cmp(hdev, 1, 13, 0) >= 0) is_critical = true; break; @@ -10214,8 +10230,7 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent gaudi2_print_event(hdev, event_type, true, "No error cause for H/W event %u", event_type); - if ((gaudi2_irq_map_table[event_type].reset != EVENT_RESET_TYPE_NONE) || - reset_required) { + if ((gaudi2_irq_map_table[event_type].reset != EVENT_RESET_TYPE_NONE) || reset_required) { if (reset_required || (gaudi2_irq_map_table[event_type].reset == EVENT_RESET_TYPE_HARD)) reset_flags |= HL_DRV_RESET_HARD; @@ -10250,11 +10265,11 @@ reset_device: } static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, - struct packet_lin_dma *lin_dma_pkt, dma_addr_t pkt_dma_addr, - u32 hw_queue_id, u32 size, u64 addr, u32 val) + struct packet_lin_dma *lin_dma_pkt, + u64 phys_addr, u32 hw_queue_id, u32 size, u64 addr, u32 val) { u32 ctl, pkt_size; - int rc = 0; + int rc = 0, i; ctl = FIELD_PREP(GAUDI2_PKT_CTL_OPCODE_MASK, PACKET_LIN_DMA); ctl |= FIELD_PREP(GAUDI2_PKT_LIN_DMA_CTL_MEMSET_MASK, 1); @@ -10268,9 +10283,20 @@ static int gaudi2_memset_memory_chunk_using_edma_qm(struct hl_device *hdev, pkt_size = sizeof(struct packet_lin_dma); - rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, pkt_dma_addr); + for (i = 0; i < 3; i++) { + rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, + phys_addr + (i * sizeof(u64)), + ((u64 *)(lin_dma_pkt)) + i, DEBUGFS_WRITE64); + if (rc) { + dev_err(hdev->dev, "Failed to copy lin_dma packet to HBM (%#llx)\n", + phys_addr); + return rc; + } + } + + rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, pkt_size, phys_addr); if (rc) - dev_err(hdev->dev, "Failed to send lin dma packet to H/W queue %d\n", + dev_err(hdev->dev, "Failed to send lin_dma packet to H/W queue %d\n", hw_queue_id); return rc; @@ -10283,12 +10309,11 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz GAUDI2_QUEUE_ID_DCORE2_EDMA_0_0, GAUDI2_QUEUE_ID_DCORE3_EDMA_0_0}; u32 chunk_size, dcore, edma_idx, sob_offset, sob_addr, comp_val, - old_mmubp, mmubp, num_of_pkts, busy, pkt_size; + old_mmubp, mmubp, num_of_pkts, busy, pkt_size, cb_len; u64 comp_addr, cur_addr = addr, end_addr = addr + size; struct asic_fixed_properties *prop = &hdev->asic_prop; + int rc = 0, dma_num = 0, i; void *lin_dma_pkts_arr; - dma_addr_t pkt_dma_addr; - int rc = 0, dma_num = 0; if (prop->edma_enabled_mask == 0) { dev_info(hdev->dev, "non of the EDMA engines is enabled - skip dram scrubbing\n"); @@ -10306,9 +10331,19 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz /* Calculate how many lin dma pkts we'll need */ num_of_pkts = div64_u64(round_up(size, SZ_2G), SZ_2G); pkt_size = sizeof(struct packet_lin_dma); 
+ cb_len = pkt_size * num_of_pkts; - lin_dma_pkts_arr = hl_asic_dma_alloc_coherent(hdev, pkt_size * num_of_pkts, - &pkt_dma_addr, GFP_KERNEL); + /* + * if we're not scrubing HMMU or NIC reserved sections in hbm, + * then it the scrubing of the user section, as we use the start of the user section + * to store the CB of the EDMA QM, so shift the start address of the scrubbing accordingly + * and scrub the CB section before leaving this function. + */ + if ((addr >= prop->dram_user_base_address) && + (addr < prop->dram_user_base_address + cb_len)) + cur_addr += (prop->dram_user_base_address + cb_len) - addr; + + lin_dma_pkts_arr = kvcalloc(num_of_pkts, pkt_size, GFP_KERNEL); if (!lin_dma_pkts_arr) return -ENOMEM; @@ -10354,7 +10389,7 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz rc = gaudi2_memset_memory_chunk_using_edma_qm(hdev, (struct packet_lin_dma *)lin_dma_pkts_arr + dma_num, - pkt_dma_addr + dma_num * pkt_size, + prop->dram_user_base_address + (dma_num * pkt_size), edma_queues_id[dcore] + edma_idx * 4, chunk_size, cur_addr, val); if (rc) @@ -10363,14 +10398,16 @@ static int gaudi2_memset_device_memory(struct hl_device *hdev, u64 addr, u64 siz dma_num++; cur_addr += chunk_size; if (cur_addr == end_addr) - break; + goto edma_wait; } } } +edma_wait: rc = hl_poll_timeout(hdev, sob_addr, busy, (busy == dma_num), 1000, 1000000); if (rc) { - dev_err(hdev->dev, "DMA Timeout during HBM scrubbing\n"); + dev_err(hdev->dev, "DMA Timeout during HBM scrubbing(sob: 0x%x, dma_num: 0x%x)\n", + busy, dma_num); goto end; } end: @@ -10391,8 +10428,16 @@ end: } } + memset(lin_dma_pkts_arr, 0, sizeof(u64)); + + /* Zero the HBM area where we copied the CB */ + for (i = 0; i < cb_len / sizeof(u64); i += sizeof(u64)) + rc = hdev->asic_funcs->access_dev_mem(hdev, PCI_REGION_DRAM, + prop->dram_user_base_address + i, + (u64 *)(lin_dma_pkts_arr), DEBUGFS_WRITE64); WREG32(sob_addr, 0); - hl_asic_dma_free_coherent(hdev, pkt_size * num_of_pkts, lin_dma_pkts_arr, pkt_dma_addr); + + kfree(lin_dma_pkts_arr); return rc; } @@ -11450,7 +11495,7 @@ static int gaudi2_mmu_get_real_page_size(struct hl_device *hdev, struct hl_mmu_p return 0; page_size_err: - dev_err(hdev->dev, "page size of %u is not %uKB aligned, can't map\n", + dev_err(hdev->dev, "page size of 0x%X is not 0x%X aligned, can't map\n", page_size, mmu_prop->page_size >> 10); return -EFAULT; } @@ -11470,6 +11515,29 @@ int gaudi2_send_device_activity(struct hl_device *hdev, bool open) return hl_fw_send_device_activity(hdev, open); } +static u64 gaudi2_read_pte(struct hl_device *hdev, u64 addr) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u64 val; + + if (hdev->reset_info.hard_reset_pending) + return U64_MAX; + + val = readq(hdev->pcie_bar[DRAM_BAR_ID] + (addr - gaudi2->dram_bar_cur_addr)); + + return val; +} + +static void gaudi2_write_pte(struct hl_device *hdev, u64 addr, u64 val) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + + if (hdev->reset_info.hard_reset_pending) + return; + + writeq(val, hdev->pcie_bar[DRAM_BAR_ID] + (addr - gaudi2->dram_bar_cur_addr)); +} + static const struct hl_asic_funcs gaudi2_funcs = { .early_init = gaudi2_early_init, .early_fini = gaudi2_early_fini, @@ -11506,8 +11574,8 @@ static const struct hl_asic_funcs gaudi2_funcs = { .add_device_attr = gaudi2_add_device_attr, .handle_eqe = gaudi2_handle_eqe, .get_events_stat = gaudi2_get_events_stat, - .read_pte = NULL, - .write_pte = NULL, + .read_pte = gaudi2_read_pte, + .write_pte = gaudi2_write_pte, .mmu_invalidate_cache = 
gaudi2_mmu_invalidate_cache, .mmu_invalidate_cache_range = gaudi2_mmu_invalidate_cache_range, .mmu_prefetch_cache_range = NULL, diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h index 9b9eef0d97d6..05117272cac7 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h @@ -19,8 +19,6 @@ #define GAUDI2_LINUX_FW_FILE "habanalabs/gaudi2/gaudi2-fit.itb" #define GAUDI2_BOOT_FIT_FILE "habanalabs/gaudi2/gaudi2-boot-fit.itb" -#define MMU_PAGE_TABLES_INITIAL_SIZE 0x10000000 /* 256MB */ - #define GAUDI2_CPU_TIMEOUT_USEC 30000000 /* 30s */ #define NUMBER_OF_PDMA_QUEUES 2 @@ -109,13 +107,11 @@ /* DRAM Memory Map */ #define CPU_FW_IMAGE_SIZE 0x10000000 /* 256MB */ - -/* This define should be used only when working in a debug mode without dram. - * When working with dram, the driver size will be calculated dynamically. - */ -#define NIC_DEFAULT_DRV_SIZE 0x20000000 /* 512MB */ - #define CPU_FW_IMAGE_ADDR DRAM_PHYS_BASE +#define PMMU_PAGE_TABLES_SIZE 0x10000000 /* 256MB */ +#define EDMA_PQS_SIZE SZ_2M +#define EDMA_SCRATCHPAD_SIZE SZ_1M +#define HMMU_PAGE_TABLES_SIZE SZ_1M #define NIC_NUMBER_OF_PORTS NIC_NUMBER_OF_ENGINES @@ -241,9 +237,8 @@ #define GAUDI2_SOB_INCREMENT_BY_ONE (FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_VAL_MASK, 1) | \ FIELD_PREP(DCORE0_SYNC_MNGR_OBJS_SOB_OBJ_INC_MASK, 1)) -#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0) +#define GAUDI2_NUM_TESTED_QS (GAUDI2_QUEUE_ID_CPU_PQ - GAUDI2_QUEUE_ID_PDMA_0_0) -#define GAUDI2_NUM_OF_GLBL_ERR_CAUSE 8 enum gaudi2_reserved_sob_id { GAUDI2_RESERVED_SOB_CS_COMPLETION_FIRST, @@ -389,7 +384,7 @@ enum gaudi2_edma_id { /* User interrupt count is aligned with HW CQ count. * We have 64 CQ's per dcore, CQ0 in dcore 0 is reserved for legacy mode */ -#define GAUDI2_NUM_USER_INTERRUPTS 255 +#define GAUDI2_NUM_USER_INTERRUPTS 64 #define GAUDI2_NUM_RESERVED_INTERRUPTS 1 #define GAUDI2_TOTAL_USER_INTERRUPTS (GAUDI2_NUM_USER_INTERRUPTS + GAUDI2_NUM_RESERVED_INTERRUPTS) @@ -421,11 +416,11 @@ enum gaudi2_irq_num { GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1), GAUDI2_IRQ_NUM_TPC_ASSERT, GAUDI2_IRQ_NUM_EQ_ERROR, + GAUDI2_IRQ_NUM_USER_FIRST, + GAUDI2_IRQ_NUM_USER_LAST = (GAUDI2_IRQ_NUM_USER_FIRST + GAUDI2_NUM_USER_INTERRUPTS - 1), GAUDI2_IRQ_NUM_RESERVED_FIRST, - GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_TOTAL_USER_INTERRUPTS - 1), + GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_NUM_RESERVED_INTERRUPTS - 1), GAUDI2_IRQ_NUM_UNEXPECTED_ERROR = RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT, - GAUDI2_IRQ_NUM_USER_FIRST = GAUDI2_IRQ_NUM_UNEXPECTED_ERROR + 1, - GAUDI2_IRQ_NUM_USER_LAST = (GAUDI2_IRQ_NUM_USER_FIRST + GAUDI2_NUM_USER_INTERRUPTS - 1), GAUDI2_IRQ_NUM_LAST = (GAUDI2_MSIX_ENTRIES - 1) }; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c index 34bf80c5a44b..307ccb912ccd 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2_security.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2_security.c @@ -479,6 +479,7 @@ static const u32 gaudi2_pb_dcr0_edma0_unsecured_regs[] = { mmDCORE0_EDMA0_CORE_CTX_TE_NUMROWS, mmDCORE0_EDMA0_CORE_CTX_IDX, mmDCORE0_EDMA0_CORE_CTX_IDX_INC, + mmDCORE0_EDMA0_CORE_WR_COMP_MAX_OUTSTAND, mmDCORE0_EDMA0_CORE_RD_LBW_RATE_LIM_CFG, mmDCORE0_EDMA0_QM_CQ_CFG0_0, mmDCORE0_EDMA0_QM_CQ_CFG0_1, diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c index 
1322cb330c57..84768e306269 100644 --- a/drivers/accel/habanalabs/goya/goya.c +++ b/drivers/accel/habanalabs/goya/goya.c @@ -413,8 +413,6 @@ int goya_set_fixed_properties(struct hl_device *hdev) else prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE; prop->mmu_pte_size = HL_PTE_SIZE; - prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE; - prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; prop->dram_page_size = PAGE_SIZE_2MB; prop->device_mem_alloc_default_page_size = prop->dram_page_size; prop->dram_supports_virtual_memory = true; @@ -435,8 +433,8 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->dmmu.num_hops = MMU_ARCH_5_HOPS; prop->dmmu.last_mask = LAST_MASK; /* TODO: will be duplicated until implementing per-MMU props */ - prop->dmmu.hop_table_size = prop->mmu_hop_table_size; - prop->dmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->dmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->dmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; /* shifts and masks are the same in PMMU and DMMU */ memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu)); @@ -446,8 +444,8 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->pmmu.num_hops = MMU_ARCH_5_HOPS; prop->pmmu.last_mask = LAST_MASK; /* TODO: will be duplicated until implementing per-MMU props */ - prop->pmmu.hop_table_size = prop->mmu_hop_table_size; - prop->pmmu.hop0_tables_total_size = prop->mmu_hop0_tables_total_size; + prop->pmmu.hop_table_size = HOP_TABLE_SIZE_512_PTE; + prop->pmmu.hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE; /* PMMU and HPMMU are the same except of page size */ memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu)); @@ -895,11 +893,8 @@ int goya_late_init(struct hl_device *hdev) WREG32(mmMMU_LOG2_DDR_SIZE, ilog2(prop->dram_size)); rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_ENABLE_PCI_ACCESS, 0x0); - if (rc) { - dev_err(hdev->dev, - "Failed to enable PCI access from CPU %d\n", rc); + if (rc) return rc; - } /* force setting to low frequency */ goya->curr_pll_profile = PLL_LOW; @@ -2678,7 +2673,7 @@ int goya_mmu_init(struct hl_device *hdev) for (i = 0 ; i < prop->max_asid ; i++) { hop0_addr = prop->mmu_pgt_addr + - (i * prop->mmu_hop_table_size); + (i * prop->dmmu.hop_table_size); rc = goya_mmu_update_asid_hop0_addr(hdev, i, hop0_addr); if (rc) { @@ -2866,13 +2861,7 @@ static int goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset) int goya_suspend(struct hl_device *hdev) { - int rc; - - rc = hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); - if (rc) - dev_err(hdev->dev, "Failed to disable PCI access from CPU\n"); - - return rc; + return hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS, 0x0); } int goya_resume(struct hl_device *hdev) diff --git a/drivers/accel/habanalabs/goya/goya_coresight.c b/drivers/accel/habanalabs/goya/goya_coresight.c index 41cae5fd843b..3827ea4c02f7 100644 --- a/drivers/accel/habanalabs/goya/goya_coresight.c +++ b/drivers/accel/habanalabs/goya/goya_coresight.c @@ -576,7 +576,6 @@ static int goya_config_spmu(struct hl_device *hdev, struct hl_debug_params *params) { u64 base_reg; - struct hl_debug_params_spmu *input = params->input; u64 *output; u32 output_arr_len; u32 events_num; @@ -592,7 +591,7 @@ static int goya_config_spmu(struct hl_device *hdev, base_reg = debug_spmu_regs[params->reg_idx] - CFG_BASE; if (params->enable) { - input = params->input; + struct hl_debug_params_spmu *input = params->input; if (!input) return -EINVAL; diff --git 
a/drivers/accel/habanalabs/include/gaudi2/gaudi2.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2.h index 0231d6c55b4a..753d46a2836b 100644 --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2.h +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2.h @@ -63,9 +63,9 @@ #define RESERVED_VA_RANGE_FOR_ARC_ON_HOST_HPAGE_START 0xFFF0F80000000000ull #define RESERVED_VA_RANGE_FOR_ARC_ON_HOST_HPAGE_END 0xFFF0FFFFFFFFFFFFull -#define RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT 256 +#define RESERVED_MSIX_UNEXPECTED_USER_ERROR_INTERRUPT 127 -#define GAUDI2_MSIX_ENTRIES 512 +#define GAUDI2_MSIX_ENTRIES 128 #define QMAN_PQ_ENTRY_SIZE 16 /* Bytes */ diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h index b2dbe1f64430..82d639990cca 100644 --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_async_ids_map_extended.h @@ -330,9 +330,9 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { { .fc_id = 149, .cpu_id = 48, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "EDMA7_ECC_SERR" }, { .fc_id = 150, .cpu_id = 48, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, - .name = "HDMA4_ECC_SERR" }, + .name = "EDMA4_ECC_SERR" }, { .fc_id = 151, .cpu_id = 48, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, - .name = "HDMA5_ECC_SERR" }, + .name = "EDMA5_ECC_SERR" }, { .fc_id = 152, .cpu_id = 49, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "EDMA2_ECC_DERR" }, { .fc_id = 153, .cpu_id = 49, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, @@ -856,55 +856,55 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { { .fc_id = 412, .cpu_id = 84, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "PCIE_ADDR_DEC_ERR" }, { .fc_id = 413, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC0_AXI_ERR_RSP" }, + .name = "DCORE0_TPC0_AXI_ERR_RSP" }, { .fc_id = 414, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC1_AXI_ERR_RSP" }, + .name = "DCORE0_TPC1_AXI_ERR_RSP" }, { .fc_id = 415, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC2_AXI_ERR_RSP" }, + .name = "DCORE0_TPC2_AXI_ERR_RSP" }, { .fc_id = 416, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC3_AXI_ERR_RSP" }, + .name = "DCORE0_TPC3_AXI_ERR_RSP" }, { .fc_id = 417, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC4_AXI_ERR_RSP" }, + .name = "DCORE0_TPC4_AXI_ERR_RSP" }, { .fc_id = 418, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC5_AXI_ERR_RSP" }, + .name = "DCORE0_TPC5_AXI_ERR_RSP" }, { .fc_id = 419, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC6_AXI_ERR_RSP" }, + .name = "DCORE1_TPC0_AXI_ERR_RSP" }, { .fc_id = 420, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC7_AXI_ERR_RSP" }, + .name = "DCORE1_TPC1_AXI_ERR_RSP" }, { .fc_id = 421, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC8_AXI_ERR_RSP" }, + .name = "DCORE1_TPC2_AXI_ERR_RSP" }, { .fc_id = 422, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC9_AXI_ERR_RSP" }, + .name = "DCORE1_TPC3_AXI_ERR_RSP" }, { .fc_id = 423, .cpu_id = 85, .valid = 1, .msg = 0, .reset = 
EVENT_RESET_TYPE_COMPUTE, - .name = "TPC10_AXI_ERR_RSP" }, + .name = "DCORE1_TPC4_AXI_ERR_RSP" }, { .fc_id = 424, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC11_AXI_ERR_RSP" }, + .name = "DCORE1_TPC5_AXI_ERR_RSP" }, { .fc_id = 425, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC12_AXI_ERR_RSP" }, + .name = "DCORE2_TPC0_AXI_ERR_RSP" }, { .fc_id = 426, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC13_AXI_ERR_RSP" }, + .name = "DCORE2_TPC1_AXI_ERR_RSP" }, { .fc_id = 427, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC14_AXI_ERR_RSP" }, + .name = "DCORE2_TPC2_AXI_ERR_RSP" }, { .fc_id = 428, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC15_AXI_ERR_RSP" }, + .name = "DCORE2_TPC3_AXI_ERR_RSP" }, { .fc_id = 429, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC16_AXI_ERR_RSP" }, + .name = "DCORE2_TPC4_AXI_ERR_RSP" }, { .fc_id = 430, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC17_AXI_ERR_RSP" }, + .name = "DCORE2_TPC5_AXI_ERR_RSP" }, { .fc_id = 431, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC18_AXI_ERR_RSP" }, + .name = "DCORE3_TPC0_AXI_ERR_RSP" }, { .fc_id = 432, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC19_AXI_ERR_RSP" }, + .name = "DCORE3_TPC1_AXI_ERR_RSP" }, { .fc_id = 433, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC20_AXI_ERR_RSP" }, + .name = "DCORE3_TPC2_AXI_ERR_RSP" }, { .fc_id = 434, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC21_AXI_ERR_RSP" }, + .name = "DCORE3_TPC3_AXI_ERR_RSP" }, { .fc_id = 435, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC22_AXI_ERR_RSP" }, + .name = "DCORE3_TPC4_AXI_ERR_RSP" }, { .fc_id = 436, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC23_AXI_ERR_RSP" }, + .name = "DCORE3_TPC5_AXI_ERR_RSP" }, { .fc_id = 437, .cpu_id = 85, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC24_AXI_ERR_RSP" }, + .name = "DCORE4_TPC0_AXI_ERR_RSP" }, { .fc_id = 438, .cpu_id = 86, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "AXI_ECC" }, { .fc_id = 439, .cpu_id = 87, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, @@ -965,73 +965,73 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { .name = "MME3_CTRL_AXI_ERROR_RESPONSE" }, { .fc_id = 467, .cpu_id = 91, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, .name = "MME3_QMAN_SW_ERROR" }, - { .fc_id = 468, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 468, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "PSOC_MME_PLL_LOCK_ERR" }, - { .fc_id = 469, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 469, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "PSOC_CPU_PLL_LOCK_ERR" }, - { .fc_id = 470, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 470, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE3_TPC_PLL_LOCK_ERR" }, - { .fc_id = 471, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 471, .cpu_id = 92, .valid = 1, .msg = 0, .reset = 
EVENT_RESET_TYPE_HARD, .name = "DCORE3_NIC_PLL_LOCK_ERR" }, - { .fc_id = 472, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 472, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE3_XBAR_MMU_PLL_LOCK_ERR" }, - { .fc_id = 473, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 473, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE3_XBAR_DMA_PLL_LOCK_ERR" }, - { .fc_id = 474, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 474, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE3_XBAR_IF_PLL_LOCK_ERR" }, - { .fc_id = 475, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 475, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE3_XBAR_BANK_PLL_LOCK_ERR" }, - { .fc_id = 476, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 476, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE1_XBAR_MMU_PLL_LOCK_ERR" }, - { .fc_id = 477, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 477, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE1_XBAR_DMA_PLL_LOCK_ERR" }, - { .fc_id = 478, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 478, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE1_XBAR_IF_PLL_LOCK_ERR" }, - { .fc_id = 479, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 479, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE1_XBAR_MESH_PLL_LOCK_ERR" }, - { .fc_id = 480, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 480, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE1_TPC_PLL_LOCK_ERR" }, - { .fc_id = 481, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 481, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE1_NIC_PLL_LOCK_ERR" }, - { .fc_id = 482, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 482, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "PMMU_MME_PLL_LOCK_ERR" }, - { .fc_id = 483, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 483, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE0_TPC_PLL_LOCK_ERR" }, - { .fc_id = 484, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 484, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE0_PCI_PLL_LOCK_ERR" }, - { .fc_id = 485, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 485, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE0_XBAR_MMU_PLL_LOCK_ERR" }, - { .fc_id = 486, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 486, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE0_XBAR_DMA_PLL_LOCK_ERR" }, - { .fc_id = 487, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 487, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE0_XBAR_IF_PLL_LOCK_ERR" }, - { .fc_id = 488, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 
488, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE0_XBAR_MESH_PLL_LOCK_ERR" }, - { .fc_id = 489, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 489, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE2_XBAR_MMU_PLL_LOCK_ERR" }, - { .fc_id = 490, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 490, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE2_XBAR_DMA_PLL_LOCK_ERR" }, - { .fc_id = 491, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 491, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE2_XBAR_IF_PLL_LOCK_ERR" }, - { .fc_id = 492, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 492, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE2_XBAR_BANK_PLL_LOCK_ERR" }, - { .fc_id = 493, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 493, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE2_TPC_PLL_LOCK_ERR" }, - { .fc_id = 494, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 494, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "PSOC_VID_PLL_LOCK_ERR" }, - { .fc_id = 495, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 495, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "PMMU_VID_PLL_LOCK_ERR" }, - { .fc_id = 496, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 496, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE3_HBM_PLL_LOCK_ERR" }, - { .fc_id = 497, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 497, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE1_XBAR_HBM_PLL_LOCK_ERR" }, - { .fc_id = 498, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 498, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE1_HBM_PLL_LOCK_ERR" }, - { .fc_id = 499, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 499, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE0_HBM_PLL_LOCK_ERR" }, - { .fc_id = 500, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 500, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE2_XBAR_HBM_PLL_LOCK_ERR" }, - { .fc_id = 501, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + { .fc_id = 501, .cpu_id = 92, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "DCORE2_HBM_PLL_LOCK_ERR" }, { .fc_id = 502, .cpu_id = 93, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_HARD, .name = "CPU_AXI_ERR_RSP" }, @@ -1298,103 +1298,103 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { { .fc_id = 633, .cpu_id = 130, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC0_BMON_SPMU" }, { .fc_id = 634, .cpu_id = 131, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC0_KERNEL_ERR" }, + .name = "DCORE0_TPC0_KERNEL_ERR" }, { .fc_id = 635, .cpu_id = 132, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC1_BMON_SPMU" }, { .fc_id = 636, .cpu_id = 133, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = 
"TPC1_KERNEL_ERR" }, + .name = "DCORE0_TPC1_KERNEL_ERR" }, { .fc_id = 637, .cpu_id = 134, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC2_BMON_SPMU" }, { .fc_id = 638, .cpu_id = 135, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC2_KERNEL_ERR" }, + .name = "DCORE0_TPC2_KERNEL_ERR" }, { .fc_id = 639, .cpu_id = 136, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC3_BMON_SPMU" }, { .fc_id = 640, .cpu_id = 137, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC3_KERNEL_ERR" }, + .name = "DCORE0_TPC3_KERNEL_ERR" }, { .fc_id = 641, .cpu_id = 138, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC4_BMON_SPMU" }, { .fc_id = 642, .cpu_id = 139, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC4_KERNEL_ERR" }, + .name = "DCORE0_TPC4_KERNEL_ERR" }, { .fc_id = 643, .cpu_id = 140, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC5_BMON_SPMU" }, { .fc_id = 644, .cpu_id = 141, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC5_KERNEL_ERR" }, + .name = "DCORE0_TPC5_KERNEL_ERR" }, { .fc_id = 645, .cpu_id = 150, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC6_BMON_SPMU" }, { .fc_id = 646, .cpu_id = 151, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC6_KERNEL_ERR" }, + .name = "DCORE1_TPC0_KERNEL_ERR" }, { .fc_id = 647, .cpu_id = 152, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC7_BMON_SPMU" }, { .fc_id = 648, .cpu_id = 153, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC7_KERNEL_ERR" }, + .name = "DCORE1_TPC1_KERNEL_ERR" }, { .fc_id = 649, .cpu_id = 146, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC8_BMON_SPMU" }, { .fc_id = 650, .cpu_id = 147, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC8_KERNEL_ERR" }, + .name = "DCORE1_TPC2_KERNEL_ERR" }, { .fc_id = 651, .cpu_id = 148, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC9_BMON_SPMU" }, { .fc_id = 652, .cpu_id = 149, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC9_KERNEL_ERR" }, + .name = "DCORE1_TPC3_KERNEL_ERR" }, { .fc_id = 653, .cpu_id = 142, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC10_BMON_SPMU" }, { .fc_id = 654, .cpu_id = 143, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC10_KERNEL_ERR" }, + .name = "DCORE1_TPC4_KERNEL_ERR" }, { .fc_id = 655, .cpu_id = 144, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC11_BMON_SPMU" }, { .fc_id = 656, .cpu_id = 145, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC11_KERNEL_ERR" }, + .name = "DCORE1_TPC5_KERNEL_ERR" }, { .fc_id = 657, .cpu_id = 162, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC12_BMON_SPMU" }, { .fc_id = 658, .cpu_id = 163, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC12_KERNEL_ERR" }, + .name = "DCORE2_TPC0_KERNEL_ERR" }, { .fc_id = 659, .cpu_id = 164, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC13_BMON_SPMU" }, { .fc_id = 660, .cpu_id = 165, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC13_KERNEL_ERR" }, + .name = "DCORE2_TPC1_KERNEL_ERR" }, { .fc_id = 661, .cpu_id = 158, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC14_BMON_SPMU" }, { .fc_id = 662, .cpu_id = 159, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC14_KERNEL_ERR" }, + .name = 
"DCORE2_TPC2_KERNEL_ERR" }, { .fc_id = 663, .cpu_id = 160, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC15_BMON_SPMU" }, { .fc_id = 664, .cpu_id = 161, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC15_KERNEL_ERR" }, + .name = "DCORE2_TPC3_KERNEL_ERR" }, { .fc_id = 665, .cpu_id = 154, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC16_BMON_SPMU" }, { .fc_id = 666, .cpu_id = 155, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC16_KERNEL_ERR" }, + .name = "DCORE2_TPC4_KERNEL_ERR" }, { .fc_id = 667, .cpu_id = 156, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC17_BMON_SPMU" }, { .fc_id = 668, .cpu_id = 157, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC17_KERNEL_ERR" }, + .name = "DCORE2_TPC5_KERNEL_ERR" }, { .fc_id = 669, .cpu_id = 166, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC18_BMON_SPMU" }, { .fc_id = 670, .cpu_id = 167, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC18_KERNEL_ERR" }, + .name = "DCORE3_TPC0_KERNEL_ERR" }, { .fc_id = 671, .cpu_id = 168, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC19_BMON_SPMU" }, { .fc_id = 672, .cpu_id = 169, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC19_KERNEL_ERR" }, + .name = "DCORE3_TPC1_KERNEL_ERR" }, { .fc_id = 673, .cpu_id = 170, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC20_BMON_SPMU" }, { .fc_id = 674, .cpu_id = 171, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC20_KERNEL_ERR" }, + .name = "DCORE3_TPC2_KERNEL_ERR" }, { .fc_id = 675, .cpu_id = 172, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC21_BMON_SPMU" }, { .fc_id = 676, .cpu_id = 173, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC21_KERNEL_ERR" }, + .name = "DCORE3_TPC3_KERNEL_ERR" }, { .fc_id = 677, .cpu_id = 174, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC22_BMON_SPMU" }, { .fc_id = 678, .cpu_id = 175, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC22_KERNEL_ERR" }, + .name = "DCORE3_TPC4_KERNEL_ERR" }, { .fc_id = 679, .cpu_id = 176, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC23_BMON_SPMU" }, { .fc_id = 680, .cpu_id = 177, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC23_KERNEL_ERR" }, + .name = "DCORE3_TPC5_KERNEL_ERR" }, { .fc_id = 681, .cpu_id = 178, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "TPC24_BMON_SPMU" }, { .fc_id = 682, .cpu_id = 179, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC24_KERNEL_ERR" }, + .name = "DCORE4_TPC0_KERNEL_ERR" }, { .fc_id = 683, .cpu_id = 180, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "" }, { .fc_id = 684, .cpu_id = 180, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, @@ -1827,8 +1827,8 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { .name = "DEC0_BMON_SPMU" }, { .fc_id = 898, .cpu_id = 330, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, .name = "DEC1_SPI" }, - { .fc_id = 899, .cpu_id = 330, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "DEC1_SPI" }, + { .fc_id = 899, .cpu_id = 330, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, + .name = "DEC1_BMON_SPMU" }, { .fc_id = 900, .cpu_id = 331, .valid = 1, .msg = 0, .reset = EVENT_RESET_TYPE_COMPUTE, .name = "DEC2_SPI" }, { .fc_id = 901, .cpu_id = 331, .valid = 1, .msg = 0, 
.reset = EVENT_RESET_TYPE_NONE, @@ -2377,8 +2377,8 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { .name = "" }, { .fc_id = 1173, .cpu_id = 479, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "" }, - { .fc_id = 1174, .cpu_id = 480, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, - .name = "" }, + { .fc_id = 1174, .cpu_id = 480, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, + .name = "PSOC_DMA_QM" }, { .fc_id = 1175, .cpu_id = 481, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "" }, { .fc_id = 1176, .cpu_id = 482, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, @@ -2442,55 +2442,55 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { { .fc_id = 1205, .cpu_id = 511, .valid = 0, .msg = 0, .reset = EVENT_RESET_TYPE_NONE, .name = "" }, { .fc_id = 1206, .cpu_id = 512, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC0_QM" }, + .name = "DCORE0_TPC0_QM" }, { .fc_id = 1207, .cpu_id = 513, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC1_QM" }, + .name = "DCORE0_TPC1_QM" }, { .fc_id = 1208, .cpu_id = 514, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC2_QM" }, + .name = "DCORE0_TPC2_QM" }, { .fc_id = 1209, .cpu_id = 515, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC3_QM" }, + .name = "DCORE0_TPC3_QM" }, { .fc_id = 1210, .cpu_id = 516, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC4_QM" }, + .name = "DCORE0_TPC4_QM" }, { .fc_id = 1211, .cpu_id = 517, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC5_QM" }, + .name = "DCORE0_TPC5_QM" }, { .fc_id = 1212, .cpu_id = 518, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC6_QM" }, + .name = "DCORE1_TPC0_QM" }, { .fc_id = 1213, .cpu_id = 519, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC7_QM" }, + .name = "DCORE1_TPC1_QM" }, { .fc_id = 1214, .cpu_id = 520, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC8_QM" }, + .name = "DCORE1_TPC2_QM" }, { .fc_id = 1215, .cpu_id = 521, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC9_QM" }, + .name = "DCORE1_TPC3_QM" }, { .fc_id = 1216, .cpu_id = 522, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC10_QM" }, + .name = "DCORE1_TPC4_QM" }, { .fc_id = 1217, .cpu_id = 523, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC11_QM" }, + .name = "DCORE1_TPC5_QM" }, { .fc_id = 1218, .cpu_id = 524, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC12_QM" }, + .name = "DCORE2_TPC0_QM" }, { .fc_id = 1219, .cpu_id = 525, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC13_QM" }, + .name = "DCORE2_TPC1_QM" }, { .fc_id = 1220, .cpu_id = 526, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC14_QM" }, + .name = "DCORE2_TPC2_QM" }, { .fc_id = 1221, .cpu_id = 527, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC15_QM" }, + .name = "DCORE2_TPC3_QM" }, { .fc_id = 1222, .cpu_id = 528, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC16_QM" }, + .name = "DCORE2_TPC4_QM" }, { .fc_id = 1223, .cpu_id = 529, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC17_QM" }, + .name = "DCORE2_TPC5_QM" }, { .fc_id = 1224, .cpu_id = 530, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC18_QM" }, + .name = "DCORE3_TPC0_QM" }, { .fc_id = 
1225, .cpu_id = 531, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC19_QM" }, + .name = "DCORE3_TPC1_QM" }, { .fc_id = 1226, .cpu_id = 532, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC20_QM" }, + .name = "DCORE3_TPC2_QM" }, { .fc_id = 1227, .cpu_id = 533, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC21_QM" }, + .name = "DCORE3_TPC3_QM" }, { .fc_id = 1228, .cpu_id = 534, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC22_QM" }, + .name = "DCORE3_TPC4_QM" }, { .fc_id = 1229, .cpu_id = 535, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC23_QM" }, + .name = "DCORE3_TPC5_QM" }, { .fc_id = 1230, .cpu_id = 536, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, - .name = "TPC24_QM" }, + .name = "DCORE4_TPC0_QM" }, { .fc_id = 1231, .cpu_id = 537, .valid = 0, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, .name = "" }, { .fc_id = 1232, .cpu_id = 538, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_COMPUTE, @@ -2674,19 +2674,19 @@ static struct gaudi2_async_events_ids_map gaudi2_irq_map_table[] = { { .fc_id = 1321, .cpu_id = 627, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_HARD, .name = "DEV_RESET_REQ" }, { .fc_id = 1322, .cpu_id = 628, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, - .name = "ARC_PWR_BRK_ENTRY" }, + .name = "PWR_BRK_ENTRY" }, { .fc_id = 1323, .cpu_id = 629, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, - .name = "ARC_PWR_BRK_EXT" }, + .name = "PWR_BRK_EXT" }, { .fc_id = 1324, .cpu_id = 630, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, - .name = "ARC_PWR_RD_MODE0" }, + .name = "PWR_RD_MODE0" }, { .fc_id = 1325, .cpu_id = 631, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, - .name = "ARC_PWR_RD_MODE1" }, + .name = "PWR_RD_MODE1" }, { .fc_id = 1326, .cpu_id = 632, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, - .name = "ARC_PWR_RD_MODE2" }, + .name = "PWR_RD_MODE2" }, { .fc_id = 1327, .cpu_id = 633, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, - .name = "ARC_PWR_RD_MODE3" }, + .name = "PWR_RD_MODE3" }, { .fc_id = 1328, .cpu_id = 634, .valid = 1, .msg = 1, .reset = EVENT_RESET_TYPE_NONE, - .name = "ARC_EQ_HEARTBEAT" }, + .name = "EQ_HEARTBEAT" }, }; #endif /* __GAUDI2_ASYNC_IDS_MAP_EVENTS_EXT_H_ */ diff --git a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h index 18ca147b1c86..6ea936c9594e 100644 --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_fw_if.h @@ -45,6 +45,13 @@ #define GAUDI2_ARM_RX_MB_OFFSET (GAUDI2_ARM_RX_MB_ADDR - \ GAUDI2_SP_SRAM_BASE_ADDR) +#define POWER_MODE_LEVELS { \ + 150000, /* 00 */ \ + 250000, /* 01 */ \ + 400000, /* 10 */ \ + /* 11: Normal mode */ \ +} + enum gaudi2_fw_status { GAUDI2_PID_STATUS_UP = 0x1, /* PID on ARC0 is up */ GAUDI2_ARM_STATUS_UP = 0x2, /* ARM Linux Boot complete */ @@ -52,26 +59,6 @@ enum gaudi2_fw_status { GAUDI2_STATUS_LAST = 0xFF }; -struct gaudi2_cold_rst_data { - union { - struct { - u32 recovery_flag: 1; - u32 validation_flag: 1; - u32 efuse_read_flag: 1; - u32 spsram_init_done : 1; - u32 fake_security_enable : 1; - u32 fake_sig_validation_en : 1; - u32 bist_skip_enable : 1; - u32 reserved1 : 1; - u32 fake_bis_compliant : 1; - u32 wd_rst_cause_arm : 1; - u32 wd_rst_cause_arcpid : 1; - u32 reserved : 21; - }; - __le32 data; - }; -}; - enum gaudi2_rst_src { HL_COLD_RST = 1, HL_MANUAL_RST = 2, diff --git 
a/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h b/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h index f3eaeb6d9b7e..1e9c056e437d 100644 --- a/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h +++ b/drivers/accel/habanalabs/include/gaudi2/gaudi2_reg_map.h @@ -58,4 +58,12 @@ #define mmWD_GPIO_DATAOUT_REG mmPSOC_GPIO3_DATAOUT #define mmSTM_PROFILER_SPE_REG mmPSOC_STM_STMSPER +/* Registers below are used to pass the boot_if data between ARM and ARC1 */ +#define mmARM_MSG_BOOT_ERR_SET mmCPU_IF_SPECIAL_GLBL_SPARE_0 +#define mmARM_MSG_BOOT_ERR_CLR mmCPU_IF_SPECIAL_GLBL_SPARE_1 +#define mmARM_MSG_BOOT_DEV_STS_SET mmCPU_IF_SPECIAL_GLBL_SPARE_2 +#define mmARM_MSG_BOOT_DEV_STS_CLR mmCPU_IF_SPECIAL_GLBL_SPARE_3 +#define mmMGMT_MSG_BOOT_ERR mmCPU_MSTR_IF_SPECIAL_GLBL_SPARE_0 +#define mmMGMT_MSG_BOOT_DEV_STS mmCPU_MSTR_IF_SPECIAL_GLBL_SPARE_1 + #endif /* GAUDI2_REG_MAP_H_ */ diff --git a/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h index d408feecd483..b4a5e95be354 100644 --- a/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h +++ b/drivers/accel/habanalabs/include/hw_ip/mmu/mmu_general.h @@ -26,6 +26,8 @@ #define LAST_MASK 0x0000000000800ull #define FLAGS_MASK 0x0000000000FFFull +#define MMU_ARCH_3_HOPS 3 +#define MMU_ARCH_4_HOPS 4 #define MMU_ARCH_5_HOPS 5 #define MMU_ARCH_6_HOPS 6 diff --git a/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h b/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h index 4f951cada077..a75faa00197f 100644 --- a/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h +++ b/drivers/accel/habanalabs/include/hw_ip/pci/pci_general.h @@ -25,7 +25,8 @@ enum hl_revision_id { REV_ID_INVALID = 0x00, REV_ID_A = 0x01, REV_ID_B = 0x02, - REV_ID_C = 0x03 + REV_ID_C = 0x03, + REV_ID_D = 0x04 }; #endif /* INCLUDE_PCI_GENERAL_H_ */ diff --git a/drivers/accel/ivpu/Kconfig b/drivers/accel/ivpu/Kconfig index 682c53245286..9e055b5ce03d 100644 --- a/drivers/accel/ivpu/Kconfig +++ b/drivers/accel/ivpu/Kconfig @@ -8,6 +8,7 @@ config DRM_ACCEL_IVPU select FW_LOADER select DRM_GEM_SHMEM_HELPER select GENERIC_ALLOCATOR + select WANT_DEV_COREDUMP help Choose this option if you have a system with an 14th generation Intel CPU (Meteor Lake) or newer. Intel NPU (formerly called Intel VPU) @@ -15,3 +16,12 @@ config DRM_ACCEL_IVPU and Deep Learning applications. If "M" is selected, the module will be called intel_vpu. + +config DRM_ACCEL_IVPU_DEBUG + bool "Intel NPU debug mode" + depends on DRM_ACCEL_IVPU + help + Choose this option to enable additional + debug features for the Intel NPU driver: + - Always print debug messages regardless of dyndbg config, + - Enable unsafe module params. 
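The new DRM_ACCEL_IVPU_DEBUG option pairs with the Makefile change below, which adds -DDEBUG to the compiler flags for the whole driver when the option is enabled. The sketch that follows only illustrates the general pattern such a switch typically enables; the macro name accel_dbg, the dbg_mask parameter and the dev_info fallback are assumptions for illustration, not the driver's actual ivpu_dbg implementation: with DEBUG defined the messages are printed unconditionally, otherwise they still go through dev_dbg and dynamic debug.

/*
 * Illustrative sketch only: accel_dbg, dbg_mask and the dev_info fallback are
 * assumed names/behavior, not copied from ivpu_drv.h.
 */
#include <linux/bits.h>
#include <linux/dev_printk.h>
#include <linux/module.h>

static unsigned int dbg_mask;	/* runtime filter, analogous to the dbg_mask module param */
module_param(dbg_mask, uint, 0644);

#ifdef DEBUG	/* defined via subdir-ccflags when CONFIG_DRM_ACCEL_IVPU_DEBUG=y */
#define accel_dbg(dev, bit, fmt, ...)					\
	do {								\
		if (dbg_mask & BIT(bit))				\
			dev_info(dev, fmt, ##__VA_ARGS__); /* always printed */ \
	} while (0)
#else
#define accel_dbg(dev, bit, fmt, ...)					\
	do {								\
		if (dbg_mask & BIT(bit))				\
			dev_dbg(dev, fmt, ##__VA_ARGS__); /* gated by dynamic debug */ \
	} while (0)
#endif

Either way the per-feature dbg_mask bit still filters at runtime, so selecting the Kconfig option mainly removes the need to configure dynamic debug before the prints appear.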
diff --git a/drivers/accel/ivpu/Makefile b/drivers/accel/ivpu/Makefile index 95ff7ad16338..1029e0bab061 100644 --- a/drivers/accel/ivpu/Makefile +++ b/drivers/accel/ivpu/Makefile @@ -1,20 +1,29 @@ # SPDX-License-Identifier: GPL-2.0-only -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation intel_vpu-y := \ ivpu_drv.o \ ivpu_fw.o \ ivpu_fw_log.o \ ivpu_gem.o \ - ivpu_hw_37xx.o \ - ivpu_hw_40xx.o \ + ivpu_hw.o \ + ivpu_hw_btrs.o \ + ivpu_hw_ip.o \ ivpu_ipc.o \ ivpu_job.o \ ivpu_jsm_msg.o \ ivpu_mmu.o \ ivpu_mmu_context.o \ - ivpu_pm.o + ivpu_ms.o \ + ivpu_pm.o \ + ivpu_sysfs.o \ + ivpu_trace_points.o intel_vpu-$(CONFIG_DEBUG_FS) += ivpu_debugfs.o +intel_vpu-$(CONFIG_DEV_COREDUMP) += ivpu_coredump.o obj-$(CONFIG_DRM_ACCEL_IVPU) += intel_vpu.o + +subdir-ccflags-$(CONFIG_DRM_ACCEL_IVPU_DEBUG) += -DDEBUG + +CFLAGS_ivpu_trace_points.o = -I$(src) diff --git a/drivers/accel/ivpu/ivpu_coredump.c b/drivers/accel/ivpu/ivpu_coredump.c new file mode 100644 index 000000000000..16ad0c30818c --- /dev/null +++ b/drivers/accel/ivpu/ivpu_coredump.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#include <linux/devcoredump.h> +#include <linux/firmware.h> + +#include "ivpu_coredump.h" +#include "ivpu_fw.h" +#include "ivpu_gem.h" +#include "vpu_boot_api.h" + +#define CRASH_DUMP_HEADER "Intel NPU crash dump" +#define CRASH_DUMP_HEADERS_SIZE SZ_4K + +void ivpu_dev_coredump(struct ivpu_device *vdev) +{ + struct drm_print_iterator pi = {}; + struct drm_printer p; + size_t coredump_size; + char *coredump; + + coredump_size = CRASH_DUMP_HEADERS_SIZE + FW_VERSION_HEADER_SIZE + + ivpu_bo_size(vdev->fw->mem_log_crit) + ivpu_bo_size(vdev->fw->mem_log_verb); + coredump = vmalloc(coredump_size); + if (!coredump) + return; + + pi.data = coredump; + pi.remain = coredump_size; + p = drm_coredump_printer(&pi); + + drm_printf(&p, "%s\n", CRASH_DUMP_HEADER); + drm_printf(&p, "FW version: %s\n", vdev->fw->version); + ivpu_fw_log_print(vdev, false, &p); + + dev_coredumpv(vdev->drm.dev, coredump, pi.offset, GFP_KERNEL); +} diff --git a/drivers/accel/ivpu/ivpu_coredump.h b/drivers/accel/ivpu/ivpu_coredump.h new file mode 100644 index 000000000000..8efb09d02441 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_coredump.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#ifndef __IVPU_COREDUMP_H__ +#define __IVPU_COREDUMP_H__ + +#include <drm/drm_print.h> + +#include "ivpu_drv.h" +#include "ivpu_fw_log.h" + +#ifdef CONFIG_DEV_COREDUMP +void ivpu_dev_coredump(struct ivpu_device *vdev); +#else +static inline void ivpu_dev_coredump(struct ivpu_device *vdev) +{ + struct drm_printer p = drm_info_printer(vdev->drm.dev); + + ivpu_fw_log_print(vdev, false, &p); +} +#endif + +#endif /* __IVPU_COREDUMP_H__ */ diff --git a/drivers/accel/ivpu/ivpu_debugfs.c b/drivers/accel/ivpu/ivpu_debugfs.c index 7cb962e21453..8180b95ed69d 100644 --- a/drivers/accel/ivpu/ivpu_debugfs.c +++ b/drivers/accel/ivpu/ivpu_debugfs.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ +#include <linux/debugfs.h> + #include <drm/drm_debugfs.h> #include <drm/drm_file.h> #include <drm/drm_print.h> @@ -43,6 +45,14 @@ static int fw_name_show(struct seq_file *s, void *v) return 0; } +static int fw_version_show(struct seq_file *s, void *v) +{ + struct ivpu_device *vdev = seq_to_ivpu(s); + + seq_printf(s, "%s\n", 
vdev->fw->version); + return 0; +} + static int fw_trace_capability_show(struct seq_file *s, void *v) { struct ivpu_device *vdev = seq_to_ivpu(s); @@ -106,41 +116,66 @@ static int reset_pending_show(struct seq_file *s, void *v) return 0; } +static int firewall_irq_counter_show(struct seq_file *s, void *v) +{ + struct ivpu_device *vdev = seq_to_ivpu(s); + + seq_printf(s, "%d\n", atomic_read(&vdev->hw->firewall_irq_counter)); + return 0; +} + static const struct drm_debugfs_info vdev_debugfs_list[] = { {"bo_list", bo_list_show, 0}, {"fw_name", fw_name_show, 0}, + {"fw_version", fw_version_show, 0}, {"fw_trace_capability", fw_trace_capability_show, 0}, {"fw_trace_config", fw_trace_config_show, 0}, {"last_bootmode", last_bootmode_show, 0}, {"reset_counter", reset_counter_show, 0}, {"reset_pending", reset_pending_show, 0}, + {"firewall_irq_counter", firewall_irq_counter_show, 0}, }; +static int dvfs_mode_get(void *data, u64 *dvfs_mode) +{ + struct ivpu_device *vdev = (struct ivpu_device *)data; + + *dvfs_mode = vdev->fw->dvfs_mode; + return 0; +} + +static int dvfs_mode_set(void *data, u64 dvfs_mode) +{ + struct ivpu_device *vdev = (struct ivpu_device *)data; + + vdev->fw->dvfs_mode = (u32)dvfs_mode; + return pci_try_reset_function(to_pci_dev(vdev->drm.dev)); +} + +DEFINE_DEBUGFS_ATTRIBUTE(dvfs_mode_fops, dvfs_mode_get, dvfs_mode_set, "%llu\n"); + static ssize_t -dvfs_mode_fops_write(struct file *file, const char __user *user_buf, size_t size, loff_t *pos) +fw_dyndbg_fops_write(struct file *file, const char __user *user_buf, size_t size, loff_t *pos) { struct ivpu_device *vdev = file->private_data; - struct ivpu_fw_info *fw = vdev->fw; - u32 dvfs_mode; + char buffer[VPU_DYNDBG_CMD_MAX_LEN] = {}; int ret; - ret = kstrtou32_from_user(user_buf, size, 0, &dvfs_mode); - if (ret < 0) - return ret; - - fw->dvfs_mode = dvfs_mode; + if (size >= VPU_DYNDBG_CMD_MAX_LEN) + return -EINVAL; - ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev)); - if (ret) + ret = strncpy_from_user(buffer, user_buf, size); + if (ret < 0) return ret; + ivpu_jsm_dyndbg_control(vdev, buffer, size); return size; } -static const struct file_operations dvfs_mode_fops = { +static const struct file_operations fw_dyndbg_fops = { .owner = THIS_MODULE, .open = simple_open, - .write = dvfs_mode_fops_write, + .write = fw_dyndbg_fops_write, }; static int fw_log_show(struct seq_file *s, void *v) @@ -166,7 +201,7 @@ fw_log_fops_write(struct file *file, const char __user *user_buf, size_t size, l if (!size) return -EINVAL; - ivpu_fw_log_clear(vdev); + ivpu_fw_log_mark_read(vdev); return size; } @@ -287,22 +322,6 @@ static const struct file_operations fw_trace_level_fops = { }; static ssize_t -ivpu_reset_engine_fn(struct file *file, const char __user *user_buf, size_t size, loff_t *pos) -{ - struct ivpu_device *vdev = file->private_data; - - if (!size) - return -EINVAL; - - if (ivpu_jsm_reset_engine(vdev, DRM_IVPU_ENGINE_COMPUTE)) - return -ENODEV; - if (ivpu_jsm_reset_engine(vdev, DRM_IVPU_ENGINE_COPY)) - return -ENODEV; - - return size; -} - -static ssize_t ivpu_force_recovery_fn(struct file *file, const char __user *user_buf, size_t size, loff_t *pos) { struct ivpu_device *vdev = file->private_data; @@ -327,11 +346,56 @@ static const struct file_operations ivpu_force_recovery_fops = { .write = ivpu_force_recovery_fn, }; -static const struct file_operations ivpu_reset_engine_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .write = ivpu_reset_engine_fn, -}; +static int ivpu_reset_engine_fn(void *data, u64 val) +{ + struct 
ivpu_device *vdev = (struct ivpu_device *)data; + + return ivpu_jsm_reset_engine(vdev, (u32)val); +} + +DEFINE_DEBUGFS_ATTRIBUTE(ivpu_reset_engine_fops, NULL, ivpu_reset_engine_fn, "0x%02llx\n"); + +static int ivpu_resume_engine_fn(void *data, u64 val) +{ + struct ivpu_device *vdev = (struct ivpu_device *)data; + + return ivpu_jsm_hws_resume_engine(vdev, (u32)val); +} + +DEFINE_DEBUGFS_ATTRIBUTE(ivpu_resume_engine_fops, NULL, ivpu_resume_engine_fn, "0x%02llx\n"); + +static int dct_active_get(void *data, u64 *active_percent) +{ + struct ivpu_device *vdev = data; + + *active_percent = vdev->pm->dct_active_percent; + + return 0; +} + +static int dct_active_set(void *data, u64 active_percent) +{ + struct ivpu_device *vdev = data; + int ret; + + if (active_percent > 100) + return -EINVAL; + + ret = ivpu_rpm_get(vdev); + if (ret) + return ret; + + if (active_percent) + ret = ivpu_pm_dct_enable(vdev, active_percent); + else + ret = ivpu_pm_dct_disable(vdev); + + ivpu_rpm_put(vdev); + + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(ivpu_dct_fops, dct_active_get, dct_active_set, "%llu\n"); void ivpu_debugfs_init(struct ivpu_device *vdev) { @@ -342,9 +406,11 @@ void ivpu_debugfs_init(struct ivpu_device *vdev) debugfs_create_file("force_recovery", 0200, debugfs_root, vdev, &ivpu_force_recovery_fops); - debugfs_create_file("dvfs_mode", 0200, debugfs_root, vdev, + debugfs_create_file("dvfs_mode", 0644, debugfs_root, vdev, &dvfs_mode_fops); + debugfs_create_file("fw_dyndbg", 0200, debugfs_root, vdev, + &fw_dyndbg_fops); debugfs_create_file("fw_log", 0644, debugfs_root, vdev, &fw_log_fops); debugfs_create_file("fw_trace_destination_mask", 0200, debugfs_root, vdev, @@ -356,8 +422,12 @@ void ivpu_debugfs_init(struct ivpu_device *vdev) debugfs_create_file("reset_engine", 0200, debugfs_root, vdev, &ivpu_reset_engine_fops); + debugfs_create_file("resume_engine", 0200, debugfs_root, vdev, + &ivpu_resume_engine_fops); - if (ivpu_hw_gen(vdev) >= IVPU_HW_40XX) + if (ivpu_hw_ip_gen(vdev) >= IVPU_HW_IP_40XX) { debugfs_create_file("fw_profiling_freq_drive", 0200, debugfs_root, vdev, &fw_profiling_freq_fops); + debugfs_create_file("dct", 0644, debugfs_root, vdev, &ivpu_dct_fops); + } } diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 4b0640226986..38cf1c342c72 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -1,12 +1,13 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include <linux/firmware.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/pm_runtime.h> +#include <generated/utsrelease.h> #include <drm/drm_accel.h> #include <drm/drm_file.h> @@ -14,7 +15,7 @@ #include <drm/drm_ioctl.h> #include <drm/drm_prime.h> -#include "vpu_boot_api.h" +#include "ivpu_coredump.h" #include "ivpu_debugfs.h" #include "ivpu_drv.h" #include "ivpu_fw.h" @@ -26,11 +27,13 @@ #include "ivpu_jsm_msg.h" #include "ivpu_mmu.h" #include "ivpu_mmu_context.h" +#include "ivpu_ms.h" #include "ivpu_pm.h" +#include "ivpu_sysfs.h" +#include "vpu_boot_api.h" #ifndef DRIVER_VERSION_STR -#define DRIVER_VERSION_STR __stringify(DRM_IVPU_DRIVER_MAJOR) "." \ - __stringify(DRM_IVPU_DRIVER_MINOR) "." +#define DRIVER_VERSION_STR "1.0.0 " UTS_RELEASE #endif static struct lock_class_key submitted_jobs_xa_lock_class_key; @@ -40,21 +43,31 @@ module_param_named(dbg_mask, ivpu_dbg_mask, int, 0644); MODULE_PARM_DESC(dbg_mask, "Driver debug mask. 
See IVPU_DBG_* macros."); int ivpu_test_mode; +#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG) module_param_named_unsafe(test_mode, ivpu_test_mode, int, 0644); MODULE_PARM_DESC(test_mode, "Test mode mask. See IVPU_TEST_MODE_* macros."); +#endif u8 ivpu_pll_min_ratio; module_param_named(pll_min_ratio, ivpu_pll_min_ratio, byte, 0644); -MODULE_PARM_DESC(pll_min_ratio, "Minimum PLL ratio used to set VPU frequency"); +MODULE_PARM_DESC(pll_min_ratio, "Minimum PLL ratio used to set NPU frequency"); u8 ivpu_pll_max_ratio = U8_MAX; module_param_named(pll_max_ratio, ivpu_pll_max_ratio, byte, 0644); -MODULE_PARM_DESC(pll_max_ratio, "Maximum PLL ratio used to set VPU frequency"); +MODULE_PARM_DESC(pll_max_ratio, "Maximum PLL ratio used to set NPU frequency"); + +int ivpu_sched_mode = IVPU_SCHED_MODE_AUTO; +module_param_named(sched_mode, ivpu_sched_mode, int, 0444); +MODULE_PARM_DESC(sched_mode, "Scheduler mode: -1 - Use default scheduler, 0 - Use OS scheduler, 1 - Use HW scheduler"); bool ivpu_disable_mmu_cont_pages; -module_param_named(disable_mmu_cont_pages, ivpu_disable_mmu_cont_pages, bool, 0644); +module_param_named(disable_mmu_cont_pages, ivpu_disable_mmu_cont_pages, bool, 0444); MODULE_PARM_DESC(disable_mmu_cont_pages, "Disable MMU contiguous pages optimization"); +bool ivpu_force_snoop; +module_param_named(force_snoop, ivpu_force_snoop, bool, 0444); +MODULE_PARM_DESC(force_snoop, "Force snooping for NPU host memory access"); + struct ivpu_file_priv *ivpu_file_priv_get(struct ivpu_file_priv *file_priv) { struct ivpu_device *vdev = file_priv->vdev; @@ -74,9 +87,8 @@ static void file_priv_unbind(struct ivpu_device *vdev, struct ivpu_file_priv *fi ivpu_dbg(vdev, FILE, "file_priv unbind: ctx %u\n", file_priv->ctx.id); ivpu_cmdq_release_all_locked(file_priv); - ivpu_jsm_context_release(vdev, file_priv->ctx.id); ivpu_bo_unbind_all_bos_from_context(vdev, &file_priv->ctx); - ivpu_mmu_user_context_fini(vdev, &file_priv->ctx); + ivpu_mmu_context_fini(vdev, &file_priv->ctx); file_priv->bound = false; drm_WARN_ON(&vdev->drm, !xa_erase_irq(&vdev->context_xa, file_priv->ctx.id)); } @@ -94,9 +106,12 @@ static void file_priv_release(struct kref *ref) pm_runtime_get_sync(vdev->drm.dev); mutex_lock(&vdev->context_list_lock); file_priv_unbind(vdev, file_priv); + drm_WARN_ON(&vdev->drm, !xa_empty(&file_priv->cmdq_xa)); + xa_destroy(&file_priv->cmdq_xa); mutex_unlock(&vdev->context_list_lock); pm_runtime_put_autosuspend(vdev->drm.dev); + mutex_destroy(&file_priv->ms_lock); mutex_destroy(&file_priv->lock); kfree(file_priv); } @@ -106,8 +121,6 @@ void ivpu_file_priv_put(struct ivpu_file_priv **link) struct ivpu_file_priv *file_priv = *link; struct ivpu_device *vdev = file_priv->vdev; - drm_WARN_ON(&vdev->drm, !file_priv); - ivpu_dbg(vdev, KREF, "file_priv put: ctx %u refcount %u\n", file_priv->ctx.id, kref_read(&file_priv->ref)); @@ -119,7 +132,7 @@ static int ivpu_get_capabilities(struct ivpu_device *vdev, struct drm_ivpu_param { switch (args->index) { case DRM_IVPU_CAP_METRIC_STREAMER: - args->value = 0; + args->value = 1; break; case DRM_IVPU_CAP_DMA_MEMORY_RANGE: args->value = 1; @@ -131,22 +144,6 @@ static int ivpu_get_capabilities(struct ivpu_device *vdev, struct drm_ivpu_param return 0; } -static int ivpu_get_core_clock_rate(struct ivpu_device *vdev, u64 *clk_rate) -{ - int ret; - - ret = ivpu_rpm_get_if_active(vdev); - if (ret < 0) - return ret; - - *clk_rate = ret ? 
ivpu_hw_reg_pll_freq_get(vdev) : 0; - - if (ret) - ivpu_rpm_put(vdev); - - return 0; -} - static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; @@ -170,7 +167,7 @@ static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_f args->value = vdev->platform; break; case DRM_IVPU_PARAM_CORE_CLOCK_RATE: - ret = ivpu_get_core_clock_rate(vdev, &args->value); + args->value = ivpu_hw_ratio_to_freq(vdev, vdev->hw->pll.max_ratio); break; case DRM_IVPU_PARAM_NUM_CONTEXTS: args->value = ivpu_get_context_count(vdev); @@ -244,10 +241,13 @@ static int ivpu_open(struct drm_device *dev, struct drm_file *file) goto err_dev_exit; } + INIT_LIST_HEAD(&file_priv->ms_instance_list); + file_priv->vdev = vdev; file_priv->bound = true; kref_init(&file_priv->ref); mutex_init(&file_priv->lock); + mutex_init(&file_priv->ms_lock); mutex_lock(&vdev->context_list_lock); @@ -258,9 +258,14 @@ static int ivpu_open(struct drm_device *dev, struct drm_file *file) goto err_unlock; } - ret = ivpu_mmu_user_context_init(vdev, &file_priv->ctx, ctx_id); - if (ret) - goto err_xa_erase; + ivpu_mmu_context_init(vdev, &file_priv->ctx, ctx_id); + + file_priv->job_limit.min = FIELD_PREP(IVPU_JOB_ID_CONTEXT_MASK, (file_priv->ctx.id - 1)); + file_priv->job_limit.max = file_priv->job_limit.min | IVPU_JOB_ID_JOB_MASK; + + xa_init_flags(&file_priv->cmdq_xa, XA_FLAGS_ALLOC1); + file_priv->cmdq_limit.min = IVPU_CMDQ_MIN_ID; + file_priv->cmdq_limit.max = IVPU_CMDQ_MAX_ID; mutex_unlock(&vdev->context_list_lock); drm_dev_exit(idx); @@ -272,10 +277,9 @@ static int ivpu_open(struct drm_device *dev, struct drm_file *file) return 0; -err_xa_erase: - xa_erase_irq(&vdev->context_xa, ctx_id); err_unlock: mutex_unlock(&vdev->context_list_lock); + mutex_destroy(&file_priv->ms_lock); mutex_destroy(&file_priv->lock); kfree(file_priv); err_dev_exit: @@ -291,6 +295,7 @@ static void ivpu_postclose(struct drm_device *dev, struct drm_file *file) ivpu_dbg(vdev, FILE, "file_priv close: ctx %u process %s pid %d\n", file_priv->ctx.id, current->comm, task_pid_nr(current)); + ivpu_ms_cleanup(file_priv); ivpu_file_priv_put(&file_priv); } @@ -301,6 +306,10 @@ static const struct drm_ioctl_desc ivpu_drm_ioctls[] = { DRM_IOCTL_DEF_DRV(IVPU_BO_INFO, ivpu_bo_info_ioctl, 0), DRM_IOCTL_DEF_DRV(IVPU_SUBMIT, ivpu_submit_ioctl, 0), DRM_IOCTL_DEF_DRV(IVPU_BO_WAIT, ivpu_bo_wait_ioctl, 0), + DRM_IOCTL_DEF_DRV(IVPU_METRIC_STREAMER_START, ivpu_ms_start_ioctl, 0), + DRM_IOCTL_DEF_DRV(IVPU_METRIC_STREAMER_GET_DATA, ivpu_ms_get_data_ioctl, 0), + DRM_IOCTL_DEF_DRV(IVPU_METRIC_STREAMER_STOP, ivpu_ms_stop_ioctl, 0), + DRM_IOCTL_DEF_DRV(IVPU_METRIC_STREAMER_GET_INFO, ivpu_ms_get_info_ioctl, 0), }; static int ivpu_wait_for_ready(struct ivpu_device *vdev) @@ -317,7 +326,7 @@ static int ivpu_wait_for_ready(struct ivpu_device *vdev) timeout = jiffies + msecs_to_jiffies(vdev->timeout.boot); while (1) { - ivpu_ipc_irq_handler(vdev, NULL); + ivpu_ipc_irq_handler(vdev); ret = ivpu_ipc_receive(vdev, &cons, &ipc_hdr, NULL, 0); if (ret != -ETIMEDOUT || time_after_eq(jiffies, timeout)) break; @@ -328,13 +337,28 @@ static int ivpu_wait_for_ready(struct ivpu_device *vdev) ivpu_ipc_consumer_del(vdev, &cons); if (!ret && ipc_hdr.data_addr != IVPU_IPC_BOOT_MSG_DATA_ADDR) { - ivpu_err(vdev, "Invalid VPU ready message: 0x%x\n", + ivpu_err(vdev, "Invalid NPU ready message: 0x%x\n", ipc_hdr.data_addr); return -EIO; } if (!ret) - ivpu_dbg(vdev, PM, "VPU ready message received successfully\n"); + 
ivpu_dbg(vdev, PM, "NPU ready message received successfully\n"); + + return ret; +} + +static int ivpu_hw_sched_init(struct ivpu_device *vdev) +{ + int ret = 0; + + if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) { + ret = ivpu_jsm_hws_setup_priority_bands(vdev); + if (ret) { + ivpu_err(vdev, "Failed to enable hw scheduler: %d", ret); + return ret; + } + } return ret; } @@ -362,17 +386,35 @@ int ivpu_boot(struct ivpu_device *vdev) ret = ivpu_wait_for_ready(vdev); if (ret) { ivpu_err(vdev, "Failed to boot the firmware: %d\n", ret); - ivpu_hw_diagnose_failure(vdev); - ivpu_mmu_evtq_dump(vdev); - ivpu_fw_log_dump(vdev); - return ret; + goto err_diagnose_failure; } ivpu_hw_irq_clear(vdev); enable_irq(vdev->irq); ivpu_hw_irq_enable(vdev); ivpu_ipc_enable(vdev); + + if (ivpu_fw_is_cold_boot(vdev)) { + ret = ivpu_pm_dct_init(vdev); + if (ret) + goto err_disable_ipc; + + ret = ivpu_hw_sched_init(vdev); + if (ret) + goto err_disable_ipc; + } + return 0; + +err_disable_ipc: + ivpu_ipc_disable(vdev); + ivpu_hw_irq_disable(vdev); + disable_irq(vdev->irq); +err_diagnose_failure: + ivpu_hw_diagnose_failure(vdev); + ivpu_mmu_evtq_dump(vdev); + ivpu_dev_coredump(vdev); + return ret; } void ivpu_prepare_for_reset(struct ivpu_device *vdev) @@ -387,12 +429,15 @@ int ivpu_shutdown(struct ivpu_device *vdev) { int ret; - ivpu_prepare_for_reset(vdev); + /* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */ + pci_save_state(to_pci_dev(vdev->drm.dev)); ret = ivpu_hw_power_down(vdev); if (ret) ivpu_warn(vdev, "Failed to power down HW: %d\n", ret); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); + return ret; } @@ -416,16 +461,56 @@ static const struct drm_driver driver = { .name = DRIVER_NAME, .desc = DRIVER_DESC, - .date = DRIVER_DATE, - .major = DRM_IVPU_DRIVER_MAJOR, - .minor = DRM_IVPU_DRIVER_MINOR, + + .major = 1, }; +static void ivpu_context_abort_invalid(struct ivpu_device *vdev) +{ + struct ivpu_file_priv *file_priv; + unsigned long ctx_id; + + mutex_lock(&vdev->context_list_lock); + + xa_for_each(&vdev->context_xa, ctx_id, file_priv) { + if (!file_priv->has_mmu_faults || file_priv->aborted) + continue; + + mutex_lock(&file_priv->lock); + ivpu_context_abort_locked(file_priv); + file_priv->aborted = true; + mutex_unlock(&file_priv->lock); + } + + mutex_unlock(&vdev->context_list_lock); +} + static irqreturn_t ivpu_irq_thread_handler(int irq, void *arg) { struct ivpu_device *vdev = arg; + u8 irq_src; - return ivpu_ipc_irq_thread_handler(vdev); + if (kfifo_is_empty(&vdev->hw->irq.fifo)) + return IRQ_NONE; + + while (kfifo_get(&vdev->hw->irq.fifo, &irq_src)) { + switch (irq_src) { + case IVPU_HW_IRQ_SRC_IPC: + ivpu_ipc_irq_thread_handler(vdev); + break; + case IVPU_HW_IRQ_SRC_MMU_EVTQ: + ivpu_context_abort_invalid(vdev); + break; + case IVPU_HW_IRQ_SRC_DCT: + ivpu_pm_dct_irq_thread_handler(vdev); + break; + default: + ivpu_err_ratelimited(vdev, "Unknown IRQ source: %u\n", irq_src); + break; + } + } + + return IRQ_HANDLED; } static int ivpu_irq_init(struct ivpu_device *vdev) @@ -439,9 +524,11 @@ static int ivpu_irq_init(struct ivpu_device *vdev) return ret; } + ivpu_irq_handlers_init(vdev); + vdev->irq = pci_irq_vector(pdev, 0); - ret = devm_request_threaded_irq(vdev->drm.dev, vdev->irq, vdev->hw->ops->irq_handler, + ret = devm_request_threaded_irq(vdev->drm.dev, vdev->irq, ivpu_hw_irq_handler, ivpu_irq_thread_handler, IRQF_NO_AUTOEN, DRIVER_NAME, vdev); if (ret) ivpu_err(vdev, "Failed to request an IRQ %d\n", ret); @@ -518,23 +605,24 @@ static int 
ivpu_dev_init(struct ivpu_device *vdev) if (!vdev->pm) return -ENOMEM; - if (ivpu_hw_gen(vdev) >= IVPU_HW_40XX) { - vdev->hw->ops = &ivpu_hw_40xx_ops; + if (ivpu_hw_ip_gen(vdev) >= IVPU_HW_IP_40XX) vdev->hw->dma_bits = 48; - } else { - vdev->hw->ops = &ivpu_hw_37xx_ops; + else vdev->hw->dma_bits = 38; - } vdev->platform = IVPU_PLATFORM_INVALID; vdev->context_xa_limit.min = IVPU_USER_CONTEXT_MIN_SSID; vdev->context_xa_limit.max = IVPU_USER_CONTEXT_MAX_SSID; atomic64_set(&vdev->unique_id_counter, 0); - xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC); + xa_init_flags(&vdev->context_xa, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); xa_init_flags(&vdev->submitted_jobs_xa, XA_FLAGS_ALLOC1); + xa_init_flags(&vdev->db_xa, XA_FLAGS_ALLOC1); lockdep_set_class(&vdev->submitted_jobs_xa.xa_lock, &submitted_jobs_xa_lock_class_key); INIT_LIST_HEAD(&vdev->bo_list); + vdev->db_limit.min = IVPU_MIN_DB; + vdev->db_limit.max = IVPU_MAX_DB; + ret = drmm_mutex_init(&vdev->drm, &vdev->context_list_lock); if (ret) goto err_xa_destroy; @@ -552,18 +640,16 @@ static int ivpu_dev_init(struct ivpu_device *vdev) goto err_xa_destroy; /* Init basic HW info based on buttress registers which are accessible before power up */ - ret = ivpu_hw_info_init(vdev); + ret = ivpu_hw_init(vdev); if (ret) goto err_xa_destroy; /* Power up early so the rest of init code can access VPU registers */ ret = ivpu_hw_power_up(vdev); if (ret) - goto err_power_down; + goto err_shutdown; - ret = ivpu_mmu_global_context_init(vdev); - if (ret) - goto err_power_down; + ivpu_mmu_global_context_init(vdev); ret = ivpu_mmu_init(vdev); if (ret) @@ -600,11 +686,10 @@ err_mmu_rctx_fini: ivpu_mmu_reserved_context_fini(vdev); err_mmu_gctx_fini: ivpu_mmu_global_context_fini(vdev); -err_power_down: - ivpu_hw_power_down(vdev); - if (IVPU_WA(d3hot_after_power_off)) - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); +err_shutdown: + ivpu_shutdown(vdev); err_xa_destroy: + xa_destroy(&vdev->db_xa); xa_destroy(&vdev->submitted_jobs_xa); xa_destroy(&vdev->context_xa); return ret; @@ -625,14 +710,14 @@ static void ivpu_bo_unbind_all_user_contexts(struct ivpu_device *vdev) static void ivpu_dev_fini(struct ivpu_device *vdev) { + ivpu_jobs_abort_all(vdev); + ivpu_pm_cancel_recovery(vdev); ivpu_pm_disable(vdev); + ivpu_prepare_for_reset(vdev); ivpu_shutdown(vdev); - if (IVPU_WA(d3hot_after_power_off)) - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); - ivpu_jobs_abort_all(vdev); + ivpu_ms_cleanup_all(vdev); ivpu_job_done_consumer_fini(vdev); - ivpu_pm_cancel_recovery(vdev); ivpu_bo_unbind_all_user_contexts(vdev); ivpu_ipc_fini(vdev); @@ -640,6 +725,8 @@ static void ivpu_dev_fini(struct ivpu_device *vdev) ivpu_mmu_reserved_context_fini(vdev); ivpu_mmu_global_context_fini(vdev); + drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->db_xa)); + xa_destroy(&vdev->db_xa); drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa)); xa_destroy(&vdev->submitted_jobs_xa); drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->context_xa)); @@ -650,6 +737,7 @@ static struct pci_device_id ivpu_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_MTL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_ARL) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_LNL) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PTL_P) }, { } }; MODULE_DEVICE_TABLE(pci, ivpu_pci_ids); @@ -670,6 +758,7 @@ static int ivpu_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; ivpu_debugfs_init(vdev); + ivpu_sysfs_init(vdev); ret = drm_dev_register(&vdev->drm, 0); if (ret) { diff --git 
a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h index 069ace4adb2d..3fdff3f6cffd 100644 --- a/drivers/accel/ivpu/ivpu_drv.h +++ b/drivers/accel/ivpu/ivpu_drv.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_DRV_H__ @@ -21,14 +21,21 @@ #define DRIVER_NAME "intel_vpu" #define DRIVER_DESC "Driver for Intel NPU (Neural Processing Unit)" -#define DRIVER_DATE "20230117" -#define PCI_DEVICE_ID_MTL 0x7d1d -#define PCI_DEVICE_ID_ARL 0xad1d -#define PCI_DEVICE_ID_LNL 0x643e +#define PCI_DEVICE_ID_MTL 0x7d1d +#define PCI_DEVICE_ID_ARL 0xad1d +#define PCI_DEVICE_ID_LNL 0x643e +#define PCI_DEVICE_ID_PTL_P 0xb03e -#define IVPU_HW_37XX 37 -#define IVPU_HW_40XX 40 +#define IVPU_HW_IP_37XX 37 +#define IVPU_HW_IP_40XX 40 +#define IVPU_HW_IP_50XX 50 +#define IVPU_HW_IP_60XX 60 + +#define IVPU_HW_IP_REV_LNL_B0 4 + +#define IVPU_HW_BTRS_MTL 1 +#define IVPU_HW_BTRS_LNL 2 #define IVPU_GLOBAL_CONTEXT_MMU_SSID 0 /* SSID 1 is used by the VPU to represent reserved context */ @@ -36,13 +43,25 @@ #define IVPU_USER_CONTEXT_MIN_SSID 2 #define IVPU_USER_CONTEXT_MAX_SSID (IVPU_USER_CONTEXT_MIN_SSID + 63) -#define IVPU_NUM_ENGINES 2 +#define IVPU_MIN_DB 1 +#define IVPU_MAX_DB 255 + +#define IVPU_JOB_ID_JOB_MASK GENMASK(7, 0) +#define IVPU_JOB_ID_CONTEXT_MASK GENMASK(31, 8) + +#define IVPU_NUM_PRIORITIES 4 +#define IVPU_NUM_CMDQS_PER_CTX (IVPU_NUM_PRIORITIES) + +#define IVPU_CMDQ_MIN_ID 1 +#define IVPU_CMDQ_MAX_ID 255 #define IVPU_PLATFORM_SILICON 0 #define IVPU_PLATFORM_SIMICS 2 #define IVPU_PLATFORM_FPGA 3 #define IVPU_PLATFORM_INVALID 8 +#define IVPU_SCHED_MODE_AUTO -1 + #define IVPU_DBG_REG BIT(0) #define IVPU_DBG_IRQ BIT(1) #define IVPU_DBG_MMU BIT(2) @@ -87,10 +106,10 @@ struct ivpu_wa_table { bool punit_disabled; bool clear_runtime_mem; - bool d3hot_after_power_off; bool interrupt_clear_with_0; bool disable_clock_relinquish; bool disable_d0i3_msg; + bool wp0_during_power_up; }; struct ivpu_hw_info; @@ -119,6 +138,10 @@ struct ivpu_device { struct xarray context_xa; struct xa_limit context_xa_limit; + struct xarray db_xa; + struct xa_limit db_limit; + u32 db_next; + struct mutex bo_list_lock; /* Protects bo_list */ struct list_head bo_list; @@ -127,13 +150,16 @@ struct ivpu_device { atomic64_t unique_id_counter; + ktime_t busy_start_ts; + ktime_t busy_time; + struct { int boot; int jsm; int tdr; - int reschedule_suspend; int autosuspend; int d0i3_entry_msg; + int state_dump_msg; } timeout; }; @@ -145,22 +171,35 @@ struct ivpu_file_priv { struct kref ref; struct ivpu_device *vdev; struct mutex lock; /* Protects cmdq */ - struct ivpu_cmdq *cmdq[IVPU_NUM_ENGINES]; + struct xarray cmdq_xa; struct ivpu_mmu_context ctx; + struct mutex ms_lock; /* Protects ms_instance_list, ms_info_bo */ + struct list_head ms_instance_list; + struct ivpu_bo *ms_info_bo; + struct xa_limit job_limit; + u32 job_id_next; + struct xa_limit cmdq_limit; + u32 cmdq_id_next; bool has_mmu_faults; bool bound; + bool aborted; }; extern int ivpu_dbg_mask; extern u8 ivpu_pll_min_ratio; extern u8 ivpu_pll_max_ratio; +extern int ivpu_sched_mode; extern bool ivpu_disable_mmu_cont_pages; +extern bool ivpu_force_snoop; #define IVPU_TEST_MODE_FW_TEST BIT(0) #define IVPU_TEST_MODE_NULL_HW BIT(1) #define IVPU_TEST_MODE_NULL_SUBMISSION BIT(2) #define IVPU_TEST_MODE_D0I3_MSG_DISABLE BIT(4) #define IVPU_TEST_MODE_D0I3_MSG_ENABLE BIT(5) +#define IVPU_TEST_MODE_MIP_DISABLE BIT(6) +#define IVPU_TEST_MODE_DISABLE_TIMEOUTS BIT(8) 
+#define IVPU_TEST_MODE_TURBO BIT(9) extern int ivpu_test_mode; struct ivpu_file_priv *ivpu_file_priv_get(struct ivpu_file_priv *file_priv); @@ -180,16 +219,35 @@ static inline u16 ivpu_device_id(struct ivpu_device *vdev) return to_pci_dev(vdev->drm.dev)->device; } -static inline int ivpu_hw_gen(struct ivpu_device *vdev) +static inline int ivpu_hw_ip_gen(struct ivpu_device *vdev) { switch (ivpu_device_id(vdev)) { case PCI_DEVICE_ID_MTL: case PCI_DEVICE_ID_ARL: - return IVPU_HW_37XX; + return IVPU_HW_IP_37XX; case PCI_DEVICE_ID_LNL: - return IVPU_HW_40XX; + return IVPU_HW_IP_40XX; + case PCI_DEVICE_ID_PTL_P: + return IVPU_HW_IP_50XX; default: - ivpu_err(vdev, "Unknown VPU device\n"); + dump_stack(); + ivpu_err(vdev, "Unknown NPU IP generation\n"); + return 0; + } +} + +static inline int ivpu_hw_btrs_gen(struct ivpu_device *vdev) +{ + switch (ivpu_device_id(vdev)) { + case PCI_DEVICE_ID_MTL: + case PCI_DEVICE_ID_ARL: + return IVPU_HW_BTRS_MTL; + case PCI_DEVICE_ID_LNL: + case PCI_DEVICE_ID_PTL_P: + return IVPU_HW_BTRS_LNL; + default: + dump_stack(); + ivpu_err(vdev, "Unknown buttress generation\n"); return 0; } } @@ -227,4 +285,9 @@ static inline bool ivpu_is_fpga(struct ivpu_device *vdev) return ivpu_get_platform(vdev) == IVPU_PLATFORM_FPGA; } +static inline bool ivpu_is_force_snoop_enabled(struct ivpu_device *vdev) +{ + return ivpu_force_snoop; +} + #endif /* __IVPU_DRV_H__ */ diff --git a/drivers/accel/ivpu/ivpu_fw.c b/drivers/accel/ivpu/ivpu_fw.c index 5fa8bd4603d5..6037ec0b3096 100644 --- a/drivers/accel/ivpu/ivpu_fw.c +++ b/drivers/accel/ivpu/ivpu_fw.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include <linux/firmware.h> @@ -25,7 +25,6 @@ #define FW_SHAVE_NN_MAX_SIZE SZ_2M #define FW_RUNTIME_MIN_ADDR (FW_GLOBAL_MEM_START) #define FW_RUNTIME_MAX_ADDR (FW_GLOBAL_MEM_END - FW_SHARED_MEM_SIZE) -#define FW_VERSION_HEADER_SIZE SZ_4K #define FW_FILE_IMAGE_OFFSET (VPU_FW_HEADER_SIZE + FW_VERSION_HEADER_SIZE) #define WATCHDOG_MSS_REDIRECT 32 @@ -44,22 +43,31 @@ #define IVPU_FW_CHECK_API_VER_LT(vdev, fw_hdr, name, major, minor) \ ivpu_fw_check_api_ver_lt(vdev, fw_hdr, #name, VPU_##name##_API_VER_INDEX, major, minor) +#define IVPU_FOCUS_PRESENT_TIMER_MS 1000 + static char *ivpu_firmware; +#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG) module_param_named_unsafe(firmware, ivpu_firmware, charp, 0644); -MODULE_PARM_DESC(firmware, "VPU firmware binary in /lib/firmware/.."); +MODULE_PARM_DESC(firmware, "NPU firmware binary in /lib/firmware/.."); +#endif -/* TODO: Remove mtl_vpu.bin from names after transition to generation based FW names */ static struct { int gen; const char *name; } fw_names[] = { - { IVPU_HW_37XX, "vpu_37xx.bin" }, - { IVPU_HW_37XX, "mtl_vpu.bin" }, - { IVPU_HW_37XX, "intel/vpu/vpu_37xx_v0.0.bin" }, - { IVPU_HW_40XX, "vpu_40xx.bin" }, - { IVPU_HW_40XX, "intel/vpu/vpu_40xx_v0.0.bin" }, + { IVPU_HW_IP_37XX, "vpu_37xx.bin" }, + { IVPU_HW_IP_37XX, "intel/vpu/vpu_37xx_v0.0.bin" }, + { IVPU_HW_IP_40XX, "vpu_40xx.bin" }, + { IVPU_HW_IP_40XX, "intel/vpu/vpu_40xx_v0.0.bin" }, + { IVPU_HW_IP_50XX, "vpu_50xx.bin" }, + { IVPU_HW_IP_50XX, "intel/vpu/vpu_50xx_v0.0.bin" }, }; +/* Production fw_names from the table above */ +MODULE_FIRMWARE("intel/vpu/vpu_37xx_v0.0.bin"); +MODULE_FIRMWARE("intel/vpu/vpu_40xx_v0.0.bin"); +MODULE_FIRMWARE("intel/vpu/vpu_50xx_v0.0.bin"); + static int ivpu_fw_request(struct ivpu_device *vdev) { int ret = -ENOENT; @@ -73,7 +81,7 @@ static int 
ivpu_fw_request(struct ivpu_device *vdev) } for (i = 0; i < ARRAY_SIZE(fw_names); i++) { - if (fw_names[i].gen != ivpu_hw_gen(vdev)) + if (fw_names[i].gen != ivpu_hw_ip_gen(vdev)) continue; ret = firmware_request_nowarn(&vdev->fw->file, fw_names[i].name, vdev->drm.dev); @@ -123,6 +131,23 @@ ivpu_fw_check_api_ver_lt(struct ivpu_device *vdev, const struct vpu_firmware_hea return false; } +static bool is_within_range(u64 addr, size_t size, u64 range_start, size_t range_size) +{ + if (addr < range_start || addr + size > range_start + range_size) + return false; + + return true; +} + +static u32 +ivpu_fw_sched_mode_select(struct ivpu_device *vdev, const struct vpu_firmware_header *fw_hdr) +{ + if (ivpu_sched_mode != IVPU_SCHED_MODE_AUTO) + return ivpu_sched_mode; + + return VPU_SCHEDULING_MODE_OS; +} + static int ivpu_fw_parse(struct ivpu_device *vdev) { struct ivpu_fw_info *fw = vdev->fw; @@ -179,8 +204,10 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) ivpu_dbg(vdev, FW_BOOT, "Header version: 0x%x, format 0x%x\n", fw_hdr->header_version, fw_hdr->image_format); - ivpu_info(vdev, "Firmware: %s, version: %s", fw->name, - (const char *)fw_hdr + VPU_FW_HEADER_SIZE); + if (!scnprintf(fw->version, sizeof(fw->version), "%s", fw->file->data + VPU_FW_HEADER_SIZE)) + ivpu_warn(vdev, "Missing firmware version\n"); + + ivpu_info(vdev, "Firmware: %s, version: %s\n", fw->name, fw->version); if (IVPU_FW_CHECK_API_COMPAT(vdev, fw_hdr, BOOT, 3)) return -EINVAL; @@ -196,16 +223,35 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) fw->cold_boot_entry_point = fw_hdr->entry_point; fw->entry_point = fw->cold_boot_entry_point; - fw->trace_level = min_t(u32, ivpu_log_level, IVPU_FW_LOG_FATAL); + fw->trace_level = min_t(u32, ivpu_fw_log_level, IVPU_FW_LOG_FATAL); fw->trace_destination_mask = VPU_TRACE_DESTINATION_VERBOSE_TRACING; fw->trace_hw_component_mask = -1; fw->dvfs_mode = 0; + fw->sched_mode = ivpu_fw_sched_mode_select(vdev, fw_hdr); + fw->primary_preempt_buf_size = fw_hdr->preemption_buffer_1_size; + fw->secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_size; + ivpu_info(vdev, "Scheduler mode: %s\n", fw->sched_mode ? 
"HW" : "OS"); + + if (fw_hdr->ro_section_start_address && !is_within_range(fw_hdr->ro_section_start_address, + fw_hdr->ro_section_size, + fw_hdr->image_load_address, + fw_hdr->image_size)) { + ivpu_err(vdev, "Invalid read-only section: start address 0x%llx, size %u\n", + fw_hdr->ro_section_start_address, fw_hdr->ro_section_size); + return -EINVAL; + } + + fw->read_only_addr = fw_hdr->ro_section_start_address; + fw->read_only_size = fw_hdr->ro_section_size; + ivpu_dbg(vdev, FW_BOOT, "Size: file %lu image %u runtime %u shavenn %u\n", fw->file->size, fw->image_size, fw->runtime_size, fw->shave_nn_size); ivpu_dbg(vdev, FW_BOOT, "Address: runtime 0x%llx, load 0x%llx, entry point 0x%llx\n", fw->runtime_addr, image_load_addr, fw->entry_point); + ivpu_dbg(vdev, FW_BOOT, "Read-only section: address 0x%llx, size %u\n", + fw->read_only_addr, fw->read_only_size); return 0; } @@ -243,13 +289,14 @@ static int ivpu_fw_update_global_range(struct ivpu_device *vdev) return -EINVAL; } - ivpu_hw_init_range(&vdev->hw->ranges.global, start, size); + ivpu_hw_range_init(&vdev->hw->ranges.global, start, size); return 0; } static int ivpu_fw_mem_init(struct ivpu_device *vdev) { struct ivpu_fw_info *fw = vdev->fw; + struct ivpu_addr_range fw_range; int log_verb_size; int ret; @@ -257,37 +304,48 @@ static int ivpu_fw_mem_init(struct ivpu_device *vdev) if (ret) return ret; - fw->mem = ivpu_bo_alloc_internal(vdev, fw->runtime_addr, fw->runtime_size, DRM_IVPU_BO_WC); + fw_range.start = fw->runtime_addr; + fw_range.end = fw->runtime_addr + fw->runtime_size; + fw->mem = ivpu_bo_create(vdev, &vdev->gctx, &fw_range, fw->runtime_size, + DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); if (!fw->mem) { - ivpu_err(vdev, "Failed to allocate firmware runtime memory\n"); + ivpu_err(vdev, "Failed to create firmware runtime memory buffer\n"); return -ENOMEM; } - fw->mem_log_crit = ivpu_bo_alloc_internal(vdev, 0, IVPU_FW_CRITICAL_BUFFER_SIZE, - DRM_IVPU_BO_CACHED); + ret = ivpu_mmu_context_set_pages_ro(vdev, &vdev->gctx, fw->read_only_addr, + fw->read_only_size); + if (ret) { + ivpu_err(vdev, "Failed to set firmware image read-only\n"); + goto err_free_fw_mem; + } + + fw->mem_log_crit = ivpu_bo_create_global(vdev, IVPU_FW_CRITICAL_BUFFER_SIZE, + DRM_IVPU_BO_CACHED | DRM_IVPU_BO_MAPPABLE); if (!fw->mem_log_crit) { - ivpu_err(vdev, "Failed to allocate critical log buffer\n"); + ivpu_err(vdev, "Failed to create critical log buffer\n"); ret = -ENOMEM; goto err_free_fw_mem; } - if (ivpu_log_level <= IVPU_FW_LOG_INFO) + if (ivpu_fw_log_level <= IVPU_FW_LOG_INFO) log_verb_size = IVPU_FW_VERBOSE_BUFFER_LARGE_SIZE; else log_verb_size = IVPU_FW_VERBOSE_BUFFER_SMALL_SIZE; - fw->mem_log_verb = ivpu_bo_alloc_internal(vdev, 0, log_verb_size, DRM_IVPU_BO_CACHED); + fw->mem_log_verb = ivpu_bo_create_global(vdev, log_verb_size, + DRM_IVPU_BO_CACHED | DRM_IVPU_BO_MAPPABLE); if (!fw->mem_log_verb) { - ivpu_err(vdev, "Failed to allocate verbose log buffer\n"); + ivpu_err(vdev, "Failed to create verbose log buffer\n"); ret = -ENOMEM; goto err_free_log_crit; } if (fw->shave_nn_size) { - fw->mem_shave_nn = ivpu_bo_alloc_internal(vdev, vdev->hw->ranges.shave.start, - fw->shave_nn_size, DRM_IVPU_BO_WC); + fw->mem_shave_nn = ivpu_bo_create(vdev, &vdev->gctx, &vdev->hw->ranges.shave, + fw->shave_nn_size, DRM_IVPU_BO_WC); if (!fw->mem_shave_nn) { - ivpu_err(vdev, "Failed to allocate shavenn buffer\n"); + ivpu_err(vdev, "Failed to create shavenn buffer\n"); ret = -ENOMEM; goto err_free_log_verb; } @@ -296,11 +354,11 @@ static int ivpu_fw_mem_init(struct ivpu_device 
*vdev) return 0; err_free_log_verb: - ivpu_bo_free_internal(fw->mem_log_verb); + ivpu_bo_free(fw->mem_log_verb); err_free_log_crit: - ivpu_bo_free_internal(fw->mem_log_crit); + ivpu_bo_free(fw->mem_log_crit); err_free_fw_mem: - ivpu_bo_free_internal(fw->mem); + ivpu_bo_free(fw->mem); return ret; } @@ -309,13 +367,13 @@ static void ivpu_fw_mem_fini(struct ivpu_device *vdev) struct ivpu_fw_info *fw = vdev->fw; if (fw->mem_shave_nn) { - ivpu_bo_free_internal(fw->mem_shave_nn); + ivpu_bo_free(fw->mem_shave_nn); fw->mem_shave_nn = NULL; } - ivpu_bo_free_internal(fw->mem_log_verb); - ivpu_bo_free_internal(fw->mem_log_crit); - ivpu_bo_free_internal(fw->mem); + ivpu_bo_free(fw->mem_log_verb); + ivpu_bo_free(fw->mem_log_crit); + ivpu_bo_free(fw->mem); fw->mem_log_verb = NULL; fw->mem_log_crit = NULL; @@ -461,6 +519,8 @@ static void ivpu_fw_boot_params_print(struct ivpu_device *vdev, struct vpu_boot_ boot_params->punit_telemetry_sram_size); ivpu_dbg(vdev, FW_BOOT, "boot_params.vpu_telemetry_enable = 0x%x\n", boot_params->vpu_telemetry_enable); + ivpu_dbg(vdev, FW_BOOT, "boot_params.vpu_scheduling_mode = 0x%x\n", + boot_params->vpu_scheduling_mode); ivpu_dbg(vdev, FW_BOOT, "boot_params.dvfs_mode = %u\n", boot_params->dvfs_mode); ivpu_dbg(vdev, FW_BOOT, "boot_params.d0i3_delayed_entry = %d\n", @@ -469,6 +529,8 @@ static void ivpu_fw_boot_params_print(struct ivpu_device *vdev, struct vpu_boot_ boot_params->d0i3_residency_time_us); ivpu_dbg(vdev, FW_BOOT, "boot_params.d0i3_entry_vpu_ts = %llu\n", boot_params->d0i3_entry_vpu_ts); + ivpu_dbg(vdev, FW_BOOT, "boot_params.system_time_us = %llu\n", + boot_params->system_time_us); } void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params *boot_params) @@ -480,11 +542,14 @@ void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params boot_params->d0i3_residency_time_us = ktime_us_delta(ktime_get_boottime(), vdev->hw->d0i3_entry_host_ts); boot_params->d0i3_entry_vpu_ts = vdev->hw->d0i3_entry_vpu_ts; + boot_params->system_time_us = ktime_to_us(ktime_get_real()); ivpu_dbg(vdev, FW_BOOT, "boot_params.d0i3_residency_time_us = %lld\n", boot_params->d0i3_residency_time_us); ivpu_dbg(vdev, FW_BOOT, "boot_params.d0i3_entry_vpu_ts = %llu\n", boot_params->d0i3_entry_vpu_ts); + ivpu_dbg(vdev, FW_BOOT, "boot_params.system_time_us = %llu\n", + boot_params->system_time_us); boot_params->save_restore_ret_address = 0; vdev->pm->is_warmboot = true; @@ -496,7 +561,7 @@ void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params boot_params->magic = VPU_BOOT_PARAMS_MAGIC; boot_params->vpu_id = to_pci_dev(vdev->drm.dev)->bus->number; - boot_params->frequency = ivpu_hw_reg_pll_freq_get(vdev); + boot_params->frequency = ivpu_hw_pll_freq_get(vdev); /* * This param is a debug firmware feature. 
It switches default clock @@ -519,8 +584,10 @@ void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params boot_params->ipc_payload_area_start = ipc_mem_rx->vpu_addr + ivpu_bo_size(ipc_mem_rx) / 2; boot_params->ipc_payload_area_size = ivpu_bo_size(ipc_mem_rx) / 2; - boot_params->global_aliased_pio_base = vdev->hw->ranges.user.start; - boot_params->global_aliased_pio_size = ivpu_hw_range_size(&vdev->hw->ranges.user); + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) { + boot_params->global_aliased_pio_base = vdev->hw->ranges.user.start; + boot_params->global_aliased_pio_size = ivpu_hw_range_size(&vdev->hw->ranges.user); + } /* Allow configuration for L2C_PAGE_TABLE with boot param value */ boot_params->autoconfig = 1; @@ -553,15 +620,19 @@ void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params boot_params->verbose_tracing_buff_addr = vdev->fw->mem_log_verb->vpu_addr; boot_params->verbose_tracing_buff_size = ivpu_bo_size(vdev->fw->mem_log_verb); - boot_params->punit_telemetry_sram_base = ivpu_hw_reg_telemetry_offset_get(vdev); - boot_params->punit_telemetry_sram_size = ivpu_hw_reg_telemetry_size_get(vdev); - boot_params->vpu_telemetry_enable = ivpu_hw_reg_telemetry_enable_get(vdev); + boot_params->punit_telemetry_sram_base = ivpu_hw_telemetry_offset_get(vdev); + boot_params->punit_telemetry_sram_size = ivpu_hw_telemetry_size_get(vdev); + boot_params->vpu_telemetry_enable = ivpu_hw_telemetry_enable_get(vdev); + boot_params->vpu_scheduling_mode = vdev->fw->sched_mode; + if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) + boot_params->vpu_focus_present_timer_ms = IVPU_FOCUS_PRESENT_TIMER_MS; boot_params->dvfs_mode = vdev->fw->dvfs_mode; if (!IVPU_WA(disable_d0i3_msg)) boot_params->d0i3_delayed_entry = 1; boot_params->d0i3_residency_time_us = 0; boot_params->d0i3_entry_vpu_ts = 0; + boot_params->system_time_us = ktime_to_us(ktime_get_real()); wmb(); /* Flush WC buffers after writing bootparams */ ivpu_fw_boot_params_print(vdev, boot_params); diff --git a/drivers/accel/ivpu/ivpu_fw.h b/drivers/accel/ivpu/ivpu_fw.h index 66b60fa161b5..1d0b2bd9d65c 100644 --- a/drivers/accel/ivpu/ivpu_fw.h +++ b/drivers/accel/ivpu/ivpu_fw.h @@ -1,11 +1,16 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_FW_H__ #define __IVPU_FW_H__ +#include "vpu_jsm_api.h" + +#define FW_VERSION_HEADER_SIZE SZ_4K +#define FW_VERSION_STR_SIZE SZ_256 + struct ivpu_device; struct ivpu_bo; struct vpu_boot_params; @@ -13,6 +18,7 @@ struct vpu_boot_params; struct ivpu_fw_info { const struct firmware *file; const char *name; + char version[FW_VERSION_STR_SIZE]; struct ivpu_bo *mem; struct ivpu_bo *mem_shave_nn; struct ivpu_bo *mem_log_crit; @@ -28,6 +34,11 @@ struct ivpu_fw_info { u32 trace_destination_mask; u64 trace_hw_component_mask; u32 dvfs_mode; + u32 primary_preempt_buf_size; + u32 secondary_preempt_buf_size; + u64 read_only_addr; + u32 read_only_size; + u32 sched_mode; }; int ivpu_fw_init(struct ivpu_device *vdev); diff --git a/drivers/accel/ivpu/ivpu_fw_log.c b/drivers/accel/ivpu/ivpu_fw_log.c index f6770f5e82a2..337c906b0210 100644 --- a/drivers/accel/ivpu/ivpu_fw_log.c +++ b/drivers/accel/ivpu/ivpu_fw_log.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include <linux/ctype.h> @@ -15,19 +15,19 @@ #include "ivpu_fw_log.h" #include "ivpu_gem.h" -#define 
IVPU_FW_LOG_LINE_LENGTH 256 +#define IVPU_FW_LOG_LINE_LENGTH 256 -unsigned int ivpu_log_level = IVPU_FW_LOG_ERROR; -module_param(ivpu_log_level, uint, 0444); -MODULE_PARM_DESC(ivpu_log_level, - "VPU firmware default trace level: debug=" __stringify(IVPU_FW_LOG_DEBUG) +unsigned int ivpu_fw_log_level = IVPU_FW_LOG_ERROR; +module_param_named(fw_log_level, ivpu_fw_log_level, uint, 0444); +MODULE_PARM_DESC(fw_log_level, + "NPU firmware default log level: debug=" __stringify(IVPU_FW_LOG_DEBUG) " info=" __stringify(IVPU_FW_LOG_INFO) " warn=" __stringify(IVPU_FW_LOG_WARN) " error=" __stringify(IVPU_FW_LOG_ERROR) " fatal=" __stringify(IVPU_FW_LOG_FATAL)); -static int fw_log_ptr(struct ivpu_device *vdev, struct ivpu_bo *bo, u32 *offset, - struct vpu_tracing_buffer_header **log_header) +static int fw_log_from_bo(struct ivpu_device *vdev, struct ivpu_bo *bo, u32 *offset, + struct vpu_tracing_buffer_header **out_log) { struct vpu_tracing_buffer_header *log; @@ -48,7 +48,7 @@ static int fw_log_ptr(struct ivpu_device *vdev, struct ivpu_bo *bo, u32 *offset, return -EINVAL; } - *log_header = log; + *out_log = log; *offset += log->size; ivpu_dbg(vdev, FW_BOOT, @@ -59,7 +59,7 @@ static int fw_log_ptr(struct ivpu_device *vdev, struct ivpu_bo *bo, u32 *offset, return 0; } -static void buffer_print(char *buffer, u32 size, struct drm_printer *p) +static void fw_log_print_lines(char *buffer, u32 size, struct drm_printer *p) { char line[IVPU_FW_LOG_LINE_LENGTH]; u32 index = 0; @@ -87,56 +87,89 @@ static void buffer_print(char *buffer, u32 size, struct drm_printer *p) } line[index] = 0; if (index != 0) - drm_printf(p, "%s\n", line); + drm_printf(p, "%s", line); } -static void fw_log_print_buffer(struct ivpu_device *vdev, struct vpu_tracing_buffer_header *log, - const char *prefix, bool only_new_msgs, struct drm_printer *p) +static void fw_log_print_buffer(struct vpu_tracing_buffer_header *log, const char *prefix, + bool only_new_msgs, struct drm_printer *p) { - char *log_buffer = (void *)log + log->header_size; - u32 log_size = log->size - log->header_size; - u32 log_start = log->read_index; - u32 log_end = log->write_index; - - if (!(log->write_index || log->wrap_count) || - (log->write_index == log->read_index && only_new_msgs)) { - drm_printf(p, "==== %s \"%s\" log empty ====\n", prefix, log->name); - return; + char *log_data = (void *)log + log->header_size; + u32 data_size = log->size - log->header_size; + u32 log_start = only_new_msgs ? 
READ_ONCE(log->read_index) : 0; + u32 log_end = READ_ONCE(log->write_index); + + if (log->wrap_count == log->read_wrap_count) { + if (log_end <= log_start) { + drm_printf(p, "==== %s \"%s\" log empty ====\n", prefix, log->name); + return; + } + } else if (log->wrap_count == log->read_wrap_count + 1) { + if (log_end > log_start) + log_start = log_end; + } else { + log_start = log_end; } drm_printf(p, "==== %s \"%s\" log start ====\n", prefix, log->name); - if (log->write_index > log->read_index) { - buffer_print(log_buffer + log_start, log_end - log_start, p); + if (log_end > log_start) { + fw_log_print_lines(log_data + log_start, log_end - log_start, p); } else { - buffer_print(log_buffer + log_end, log_size - log_end, p); - buffer_print(log_buffer, log_end, p); + fw_log_print_lines(log_data + log_start, data_size - log_start, p); + fw_log_print_lines(log_data, log_end, p); } - drm_printf(p, "\x1b[0m"); + drm_printf(p, "\n\x1b[0m"); /* add new line and clear formatting */ drm_printf(p, "==== %s \"%s\" log end ====\n", prefix, log->name); } -void ivpu_fw_log_print(struct ivpu_device *vdev, bool only_new_msgs, struct drm_printer *p) +static void +fw_log_print_all_in_bo(struct ivpu_device *vdev, const char *name, + struct ivpu_bo *bo, bool only_new_msgs, struct drm_printer *p) { - struct vpu_tracing_buffer_header *log_header; + struct vpu_tracing_buffer_header *log; u32 next = 0; - while (fw_log_ptr(vdev, vdev->fw->mem_log_crit, &next, &log_header) == 0) - fw_log_print_buffer(vdev, log_header, "VPU critical", only_new_msgs, p); + while (fw_log_from_bo(vdev, bo, &next, &log) == 0) + fw_log_print_buffer(log, name, only_new_msgs, p); +} + +void ivpu_fw_log_print(struct ivpu_device *vdev, bool only_new_msgs, struct drm_printer *p) +{ + fw_log_print_all_in_bo(vdev, "NPU critical", vdev->fw->mem_log_crit, only_new_msgs, p); + fw_log_print_all_in_bo(vdev, "NPU verbose", vdev->fw->mem_log_verb, only_new_msgs, p); +} + +void ivpu_fw_log_mark_read(struct ivpu_device *vdev) +{ + struct vpu_tracing_buffer_header *log; + u32 next; + + next = 0; + while (fw_log_from_bo(vdev, vdev->fw->mem_log_crit, &next, &log) == 0) { + log->read_index = READ_ONCE(log->write_index); + log->read_wrap_count = READ_ONCE(log->wrap_count); + } next = 0; - while (fw_log_ptr(vdev, vdev->fw->mem_log_verb, &next, &log_header) == 0) - fw_log_print_buffer(vdev, log_header, "VPU verbose", only_new_msgs, p); + while (fw_log_from_bo(vdev, vdev->fw->mem_log_verb, &next, &log) == 0) { + log->read_index = READ_ONCE(log->write_index); + log->read_wrap_count = READ_ONCE(log->wrap_count); + } } -void ivpu_fw_log_clear(struct ivpu_device *vdev) +void ivpu_fw_log_reset(struct ivpu_device *vdev) { - struct vpu_tracing_buffer_header *log_header; - u32 next = 0; + struct vpu_tracing_buffer_header *log; + u32 next; - while (fw_log_ptr(vdev, vdev->fw->mem_log_crit, &next, &log_header) == 0) - log_header->read_index = log_header->write_index; + next = 0; + while (fw_log_from_bo(vdev, vdev->fw->mem_log_crit, &next, &log) == 0) { + log->read_index = 0; + log->read_wrap_count = 0; + } next = 0; - while (fw_log_ptr(vdev, vdev->fw->mem_log_verb, &next, &log_header) == 0) - log_header->read_index = log_header->write_index; + while (fw_log_from_bo(vdev, vdev->fw->mem_log_verb, &next, &log) == 0) { + log->read_index = 0; + log->read_wrap_count = 0; + } } diff --git a/drivers/accel/ivpu/ivpu_fw_log.h b/drivers/accel/ivpu/ivpu_fw_log.h index 0b2573f6f315..8bb528a73cb7 100644 --- a/drivers/accel/ivpu/ivpu_fw_log.h +++ b/drivers/accel/ivpu/ivpu_fw_log.h @@ -1,6 
+1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_FW_LOG_H__ @@ -8,8 +8,6 @@ #include <linux/types.h> -#include <drm/drm_print.h> - #include "ivpu_drv.h" #define IVPU_FW_LOG_DEFAULT 0 @@ -19,20 +17,15 @@ #define IVPU_FW_LOG_ERROR 4 #define IVPU_FW_LOG_FATAL 5 -extern unsigned int ivpu_log_level; - #define IVPU_FW_VERBOSE_BUFFER_SMALL_SIZE SZ_1M #define IVPU_FW_VERBOSE_BUFFER_LARGE_SIZE SZ_8M #define IVPU_FW_CRITICAL_BUFFER_SIZE SZ_512K -void ivpu_fw_log_print(struct ivpu_device *vdev, bool only_new_msgs, struct drm_printer *p); -void ivpu_fw_log_clear(struct ivpu_device *vdev); +extern unsigned int ivpu_fw_log_level; -static inline void ivpu_fw_log_dump(struct ivpu_device *vdev) -{ - struct drm_printer p = drm_info_printer(vdev->drm.dev); +void ivpu_fw_log_print(struct ivpu_device *vdev, bool only_new_msgs, struct drm_printer *p); +void ivpu_fw_log_mark_read(struct ivpu_device *vdev); +void ivpu_fw_log_reset(struct ivpu_device *vdev); - ivpu_fw_log_print(vdev, false, &p); -} #endif /* __IVPU_FW_LOG_H__ */ diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c index e9ddbe9f50eb..16178054e629 100644 --- a/drivers/accel/ivpu/ivpu_gem.c +++ b/drivers/accel/ivpu/ivpu_gem.c @@ -172,8 +172,7 @@ struct drm_gem_object *ivpu_gem_create_object(struct drm_device *dev, size_t siz return &bo->base.base; } -static struct ivpu_bo * -ivpu_bo_create(struct ivpu_device *vdev, u64 size, u32 flags) +static struct ivpu_bo *ivpu_bo_alloc(struct ivpu_device *vdev, u64 size, u32 flags) { struct drm_gem_shmem_object *shmem; struct ivpu_bo *bo; @@ -201,7 +200,7 @@ ivpu_bo_create(struct ivpu_device *vdev, u64 size, u32 flags) return bo; } -static int ivpu_bo_open(struct drm_gem_object *obj, struct drm_file *file) +static int ivpu_gem_bo_open(struct drm_gem_object *obj, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; struct ivpu_device *vdev = file_priv->vdev; @@ -224,7 +223,7 @@ static int ivpu_bo_open(struct drm_gem_object *obj, struct drm_file *file) return ivpu_bo_alloc_vpu_addr(bo, &file_priv->ctx, range); } -static void ivpu_bo_free(struct drm_gem_object *obj) +static void ivpu_gem_bo_free(struct drm_gem_object *obj) { struct ivpu_device *vdev = to_ivpu_device(obj->dev); struct ivpu_bo *bo = to_ivpu_bo(obj); @@ -245,8 +244,8 @@ static void ivpu_bo_free(struct drm_gem_object *obj) } static const struct drm_gem_object_funcs ivpu_gem_funcs = { - .free = ivpu_bo_free, - .open = ivpu_bo_open, + .free = ivpu_gem_bo_free, + .open = ivpu_gem_bo_open, .print_info = drm_gem_shmem_object_print_info, .pin = drm_gem_shmem_object_pin, .unpin = drm_gem_shmem_object_unpin, @@ -272,9 +271,9 @@ int ivpu_bo_create_ioctl(struct drm_device *dev, void *data, struct drm_file *fi if (size == 0) return -EINVAL; - bo = ivpu_bo_create(vdev, size, args->flags); + bo = ivpu_bo_alloc(vdev, size, args->flags); if (IS_ERR(bo)) { - ivpu_err(vdev, "Failed to create BO: %pe (ctx %u size %llu flags 0x%x)", + ivpu_err(vdev, "Failed to allocate BO: %pe (ctx %u size %llu flags 0x%x)", bo, file_priv->ctx.id, args->size, args->flags); return PTR_ERR(bo); } @@ -289,33 +288,28 @@ int ivpu_bo_create_ioctl(struct drm_device *dev, void *data, struct drm_file *fi } struct ivpu_bo * -ivpu_bo_alloc_internal(struct ivpu_device *vdev, u64 vpu_addr, u64 size, u32 flags) +ivpu_bo_create(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, + struct ivpu_addr_range *range, u64 size, u32 flags) { - 
const struct ivpu_addr_range *range; - struct ivpu_addr_range fixed_range; struct iosys_map map; struct ivpu_bo *bo; int ret; - drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(vpu_addr)); - drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(size)); + if (drm_WARN_ON(&vdev->drm, !range)) + return NULL; - if (vpu_addr) { - fixed_range.start = vpu_addr; - fixed_range.end = vpu_addr + size; - range = &fixed_range; - } else { - range = &vdev->hw->ranges.global; - } + drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(range->start)); + drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(range->end)); + drm_WARN_ON(&vdev->drm, !PAGE_ALIGNED(size)); - bo = ivpu_bo_create(vdev, size, flags); + bo = ivpu_bo_alloc(vdev, size, flags); if (IS_ERR(bo)) { - ivpu_err(vdev, "Failed to create BO: %pe (vpu_addr 0x%llx size %llu flags 0x%x)", - bo, vpu_addr, size, flags); + ivpu_err(vdev, "Failed to allocate BO: %pe (vpu_addr 0x%llx size %llu flags 0x%x)", + bo, range->start, size, flags); return NULL; } - ret = ivpu_bo_alloc_vpu_addr(bo, &vdev->gctx, range); + ret = ivpu_bo_alloc_vpu_addr(bo, ctx, range); if (ret) goto err_put; @@ -323,11 +317,14 @@ ivpu_bo_alloc_internal(struct ivpu_device *vdev, u64 vpu_addr, u64 size, u32 fla if (ret) goto err_put; - dma_resv_lock(bo->base.base.resv, NULL); - ret = drm_gem_shmem_vmap(&bo->base, &map); - dma_resv_unlock(bo->base.base.resv); - if (ret) - goto err_put; + if (flags & DRM_IVPU_BO_MAPPABLE) { + dma_resv_lock(bo->base.base.resv, NULL); + ret = drm_gem_shmem_vmap(&bo->base, &map); + dma_resv_unlock(bo->base.base.resv); + + if (ret) + goto err_put; + } return bo; @@ -336,13 +333,20 @@ err_put: return NULL; } -void ivpu_bo_free_internal(struct ivpu_bo *bo) +struct ivpu_bo *ivpu_bo_create_global(struct ivpu_device *vdev, u64 size, u32 flags) +{ + return ivpu_bo_create(vdev, &vdev->gctx, &vdev->hw->ranges.global, size, flags); +} + +void ivpu_bo_free(struct ivpu_bo *bo) { struct iosys_map map = IOSYS_MAP_INIT_VADDR(bo->base.vaddr); - dma_resv_lock(bo->base.base.resv, NULL); - drm_gem_shmem_vunmap(&bo->base, &map); - dma_resv_unlock(bo->base.base.resv); + if (bo->flags & DRM_IVPU_BO_MAPPABLE) { + dma_resv_lock(bo->base.base.resv, NULL); + drm_gem_shmem_vunmap(&bo->base, &map); + dma_resv_unlock(bo->base.base.resv); + } drm_gem_object_put(&bo->base.base); } @@ -380,6 +384,9 @@ int ivpu_bo_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file timeout = drm_timeout_abs_to_jiffies(args->timeout_ns); + /* Add 1 jiffy to ensure the wait function never times out before intended timeout_ns */ + timeout += 1; + obj = drm_gem_object_lookup(file, args->handle); if (!obj) return -EINVAL; @@ -402,7 +409,7 @@ static void ivpu_bo_print_info(struct ivpu_bo *bo, struct drm_printer *p) mutex_lock(&bo->lock); drm_printf(p, "%-9p %-3u 0x%-12llx %-10lu 0x%-8x %-4u", - bo, bo->ctx->id, bo->vpu_addr, bo->base.base.size, + bo, bo->ctx ? 
bo->ctx->id : 0, bo->vpu_addr, bo->base.base.size, bo->flags, kref_read(&bo->base.base.refcount)); if (bo->base.pages) diff --git a/drivers/accel/ivpu/ivpu_gem.h b/drivers/accel/ivpu/ivpu_gem.h index a8559211c70d..d975000abd78 100644 --- a/drivers/accel/ivpu/ivpu_gem.h +++ b/drivers/accel/ivpu/ivpu_gem.h @@ -28,8 +28,10 @@ int ivpu_bo_pin(struct ivpu_bo *bo); void ivpu_bo_unbind_all_bos_from_context(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx); struct drm_gem_object *ivpu_gem_create_object(struct drm_device *dev, size_t size); -struct ivpu_bo *ivpu_bo_alloc_internal(struct ivpu_device *vdev, u64 vpu_addr, u64 size, u32 flags); -void ivpu_bo_free_internal(struct ivpu_bo *bo); +struct ivpu_bo *ivpu_bo_create(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, + struct ivpu_addr_range *range, u64 size, u32 flags); +struct ivpu_bo *ivpu_bo_create_global(struct ivpu_device *vdev, u64 size, u32 flags); +void ivpu_bo_free(struct ivpu_bo *bo); int ivpu_bo_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file); int ivpu_bo_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file); @@ -58,14 +60,17 @@ static inline u32 ivpu_bo_cache_mode(struct ivpu_bo *bo) return bo->flags & DRM_IVPU_BO_CACHE_MASK; } -static inline bool ivpu_bo_is_snooped(struct ivpu_bo *bo) +static inline struct ivpu_device *ivpu_bo_to_vdev(struct ivpu_bo *bo) { - return ivpu_bo_cache_mode(bo) == DRM_IVPU_BO_CACHED; + return to_ivpu_device(bo->base.base.dev); } -static inline struct ivpu_device *ivpu_bo_to_vdev(struct ivpu_bo *bo) +static inline bool ivpu_bo_is_snooped(struct ivpu_bo *bo) { - return to_ivpu_device(bo->base.base.dev); + if (ivpu_is_force_snoop_enabled(ivpu_bo_to_vdev(bo))) + return true; + + return ivpu_bo_cache_mode(bo) == DRM_IVPU_BO_CACHED; } static inline void *ivpu_to_cpu_addr(struct ivpu_bo *bo, u32 vpu_addr) diff --git a/drivers/accel/ivpu/ivpu_hw.c b/drivers/accel/ivpu/ivpu_hw.c new file mode 100644 index 000000000000..4e1054f3466e --- /dev/null +++ b/drivers/accel/ivpu/ivpu_hw.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 - 2024 Intel Corporation + */ + +#include "ivpu_drv.h" +#include "ivpu_hw.h" +#include "ivpu_hw_btrs.h" +#include "ivpu_hw_ip.h" + +#include <linux/dmi.h> + +static char *platform_to_str(u32 platform) +{ + switch (platform) { + case IVPU_PLATFORM_SILICON: + return "SILICON"; + case IVPU_PLATFORM_SIMICS: + return "SIMICS"; + case IVPU_PLATFORM_FPGA: + return "FPGA"; + default: + return "Invalid platform"; + } +} + +static const struct dmi_system_id dmi_platform_simulation[] = { + { + .ident = "Intel Simics", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "lnlrvp"), + DMI_MATCH(DMI_BOARD_VERSION, "1.0"), + DMI_MATCH(DMI_BOARD_SERIAL, "123456789"), + }, + }, + { + .ident = "Intel Simics", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "Simics"), + }, + }, + { } +}; + +static void platform_init(struct ivpu_device *vdev) +{ + if (dmi_check_system(dmi_platform_simulation)) + vdev->platform = IVPU_PLATFORM_SIMICS; + else + vdev->platform = IVPU_PLATFORM_SILICON; + + ivpu_dbg(vdev, MISC, "Platform type: %s (%d)\n", + platform_to_str(vdev->platform), vdev->platform); +} + +static void wa_init(struct ivpu_device *vdev) +{ + vdev->wa.punit_disabled = ivpu_is_fpga(vdev); + vdev->wa.clear_runtime_mem = false; + + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + vdev->wa.interrupt_clear_with_0 = ivpu_hw_btrs_irqs_clear_with_0_mtl(vdev); + + if (ivpu_device_id(vdev) == PCI_DEVICE_ID_LNL && + ivpu_revision(vdev) < 
IVPU_HW_IP_REV_LNL_B0) + vdev->wa.disable_clock_relinquish = true; + + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + vdev->wa.wp0_during_power_up = true; + + IVPU_PRINT_WA(punit_disabled); + IVPU_PRINT_WA(clear_runtime_mem); + IVPU_PRINT_WA(interrupt_clear_with_0); + IVPU_PRINT_WA(disable_clock_relinquish); + IVPU_PRINT_WA(wp0_during_power_up); +} + +static void timeouts_init(struct ivpu_device *vdev) +{ + if (ivpu_test_mode & IVPU_TEST_MODE_DISABLE_TIMEOUTS) { + vdev->timeout.boot = -1; + vdev->timeout.jsm = -1; + vdev->timeout.tdr = -1; + vdev->timeout.autosuspend = -1; + vdev->timeout.d0i3_entry_msg = -1; + } else if (ivpu_is_fpga(vdev)) { + vdev->timeout.boot = 100000; + vdev->timeout.jsm = 50000; + vdev->timeout.tdr = 2000000; + vdev->timeout.autosuspend = -1; + vdev->timeout.d0i3_entry_msg = 500; + vdev->timeout.state_dump_msg = 10; + } else if (ivpu_is_simics(vdev)) { + vdev->timeout.boot = 50; + vdev->timeout.jsm = 500; + vdev->timeout.tdr = 10000; + vdev->timeout.autosuspend = 100; + vdev->timeout.d0i3_entry_msg = 100; + vdev->timeout.state_dump_msg = 10; + } else { + vdev->timeout.boot = 1000; + vdev->timeout.jsm = 500; + vdev->timeout.tdr = 2000; + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + vdev->timeout.autosuspend = 10; + else + vdev->timeout.autosuspend = 100; + vdev->timeout.d0i3_entry_msg = 5; + vdev->timeout.state_dump_msg = 10; + } +} + +static void memory_ranges_init(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) { + ivpu_hw_range_init(&vdev->hw->ranges.global, 0x80000000, SZ_512M); + ivpu_hw_range_init(&vdev->hw->ranges.user, 0x88000000, 511 * SZ_1M); + ivpu_hw_range_init(&vdev->hw->ranges.shave, 0x180000000, SZ_2G); + ivpu_hw_range_init(&vdev->hw->ranges.dma, 0x200000000, SZ_128G); + } else { + ivpu_hw_range_init(&vdev->hw->ranges.global, 0x80000000, SZ_512M); + ivpu_hw_range_init(&vdev->hw->ranges.shave, 0x80000000, SZ_2G); + ivpu_hw_range_init(&vdev->hw->ranges.user, 0x100000000, SZ_256G); + vdev->hw->ranges.dma = vdev->hw->ranges.user; + } +} + +static int wp_enable(struct ivpu_device *vdev) +{ + return ivpu_hw_btrs_wp_drive(vdev, true); +} + +static int wp_disable(struct ivpu_device *vdev) +{ + return ivpu_hw_btrs_wp_drive(vdev, false); +} + +int ivpu_hw_power_up(struct ivpu_device *vdev) +{ + int ret; + + if (IVPU_WA(wp0_during_power_up)) { + /* WP requests may fail when powering down, so issue WP 0 here */ + ret = wp_disable(vdev); + if (ret) + ivpu_warn(vdev, "Failed to disable workpoint: %d\n", ret); + } + + ret = ivpu_hw_btrs_d0i3_disable(vdev); + if (ret) + ivpu_warn(vdev, "Failed to disable D0I3: %d\n", ret); + + ret = wp_enable(vdev); + if (ret) { + ivpu_err(vdev, "Failed to enable workpoint: %d\n", ret); + return ret; + } + + if (ivpu_hw_btrs_gen(vdev) >= IVPU_HW_BTRS_LNL) { + if (IVPU_WA(disable_clock_relinquish)) + ivpu_hw_btrs_clock_relinquish_disable_lnl(vdev); + ivpu_hw_btrs_profiling_freq_reg_set_lnl(vdev); + ivpu_hw_btrs_ats_print_lnl(vdev); + } + + ret = ivpu_hw_ip_host_ss_configure(vdev); + if (ret) { + ivpu_err(vdev, "Failed to configure host SS: %d\n", ret); + return ret; + } + + ivpu_hw_ip_idle_gen_disable(vdev); + + ret = ivpu_hw_btrs_wait_for_clock_res_own_ack(vdev); + if (ret) { + ivpu_err(vdev, "Timed out waiting for clock resource own ACK\n"); + return ret; + } + + ret = ivpu_hw_ip_pwr_domain_enable(vdev); + if (ret) { + ivpu_err(vdev, "Failed to enable power domain: %d\n", ret); + return ret; + } + + ret = ivpu_hw_ip_host_ss_axi_enable(vdev); + if (ret) { + ivpu_err(vdev, "Failed to enable AXI: %d\n", 
ret); + return ret; + } + + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_LNL) + ivpu_hw_btrs_set_port_arbitration_weights_lnl(vdev); + + ret = ivpu_hw_ip_top_noc_enable(vdev); + if (ret) + ivpu_err(vdev, "Failed to enable TOP NOC: %d\n", ret); + + return ret; +} + +static void save_d0i3_entry_timestamp(struct ivpu_device *vdev) +{ + vdev->hw->d0i3_entry_host_ts = ktime_get_boottime(); + vdev->hw->d0i3_entry_vpu_ts = ivpu_hw_ip_read_perf_timer_counter(vdev); +} + +int ivpu_hw_reset(struct ivpu_device *vdev) +{ + int ret = 0; + + if (ivpu_hw_btrs_ip_reset(vdev)) { + ivpu_err(vdev, "Failed to reset NPU IP\n"); + ret = -EIO; + } + + if (wp_disable(vdev)) { + ivpu_err(vdev, "Failed to disable workpoint\n"); + ret = -EIO; + } + + return ret; +} + +int ivpu_hw_power_down(struct ivpu_device *vdev) +{ + int ret = 0; + + save_d0i3_entry_timestamp(vdev); + + if (!ivpu_hw_is_idle(vdev)) + ivpu_warn(vdev, "NPU not idle during power down\n"); + + if (ivpu_hw_reset(vdev)) { + ivpu_err(vdev, "Failed to reset NPU\n"); + ret = -EIO; + } + + if (ivpu_hw_btrs_d0i3_enable(vdev)) { + ivpu_err(vdev, "Failed to enter D0I3\n"); + ret = -EIO; + } + + return ret; +} + +int ivpu_hw_init(struct ivpu_device *vdev) +{ + ivpu_hw_btrs_info_init(vdev); + ivpu_hw_btrs_freq_ratios_init(vdev); + memory_ranges_init(vdev); + platform_init(vdev); + wa_init(vdev); + timeouts_init(vdev); + atomic_set(&vdev->hw->firewall_irq_counter, 0); + + return 0; +} + +int ivpu_hw_boot_fw(struct ivpu_device *vdev) +{ + int ret; + + ivpu_hw_ip_snoop_disable(vdev); + ivpu_hw_ip_tbu_mmu_enable(vdev); + ret = ivpu_hw_ip_soc_cpu_boot(vdev); + if (ret) + ivpu_err(vdev, "Failed to boot SOC CPU: %d\n", ret); + + return ret; +} + +void ivpu_hw_profiling_freq_drive(struct ivpu_device *vdev, bool enable) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) { + vdev->hw->pll.profiling_freq = PLL_PROFILING_FREQ_DEFAULT; + return; + } + + if (enable) + vdev->hw->pll.profiling_freq = PLL_PROFILING_FREQ_HIGH; + else + vdev->hw->pll.profiling_freq = PLL_PROFILING_FREQ_DEFAULT; +} + +void ivpu_irq_handlers_init(struct ivpu_device *vdev) +{ + INIT_KFIFO(vdev->hw->irq.fifo); + + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + vdev->hw->irq.ip_irq_handler = ivpu_hw_ip_irq_handler_37xx; + else + vdev->hw->irq.ip_irq_handler = ivpu_hw_ip_irq_handler_40xx; + + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + vdev->hw->irq.btrs_irq_handler = ivpu_hw_btrs_irq_handler_mtl; + else + vdev->hw->irq.btrs_irq_handler = ivpu_hw_btrs_irq_handler_lnl; +} + +void ivpu_hw_irq_enable(struct ivpu_device *vdev) +{ + kfifo_reset(&vdev->hw->irq.fifo); + ivpu_hw_ip_irq_enable(vdev); + ivpu_hw_btrs_irq_enable(vdev); +} + +void ivpu_hw_irq_disable(struct ivpu_device *vdev) +{ + ivpu_hw_btrs_irq_disable(vdev); + ivpu_hw_ip_irq_disable(vdev); +} + +irqreturn_t ivpu_hw_irq_handler(int irq, void *ptr) +{ + struct ivpu_device *vdev = ptr; + bool ip_handled, btrs_handled; + + ivpu_hw_btrs_global_int_disable(vdev); + + btrs_handled = ivpu_hw_btrs_irq_handler(vdev, irq); + if (!ivpu_hw_is_idle((vdev)) || !btrs_handled) + ip_handled = ivpu_hw_ip_irq_handler(vdev, irq); + else + ip_handled = false; + + /* Re-enable global interrupts to re-trigger MSI for pending interrupts */ + ivpu_hw_btrs_global_int_enable(vdev); + + if (!kfifo_is_empty(&vdev->hw->irq.fifo)) + return IRQ_WAKE_THREAD; + if (ip_handled || btrs_handled) + return IRQ_HANDLED; + return IRQ_NONE; +} diff --git a/drivers/accel/ivpu/ivpu_hw.h b/drivers/accel/ivpu/ivpu_hw.h index b2909168a0a6..fc4dbfc980c8 100644 --- 
a/drivers/accel/ivpu/ivpu_hw.h +++ b/drivers/accel/ivpu/ivpu_hw.h @@ -1,38 +1,22 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_HW_H__ #define __IVPU_HW_H__ +#include <linux/kfifo.h> + #include "ivpu_drv.h" +#include "ivpu_hw_btrs.h" +#include "ivpu_hw_ip.h" -struct ivpu_hw_ops { - int (*info_init)(struct ivpu_device *vdev); - int (*power_up)(struct ivpu_device *vdev); - int (*boot_fw)(struct ivpu_device *vdev); - int (*power_down)(struct ivpu_device *vdev); - int (*reset)(struct ivpu_device *vdev); - bool (*is_idle)(struct ivpu_device *vdev); - int (*wait_for_idle)(struct ivpu_device *vdev); - void (*wdt_disable)(struct ivpu_device *vdev); - void (*diagnose_failure)(struct ivpu_device *vdev); - u32 (*profiling_freq_get)(struct ivpu_device *vdev); - void (*profiling_freq_drive)(struct ivpu_device *vdev, bool enable); - u32 (*reg_pll_freq_get)(struct ivpu_device *vdev); - u32 (*reg_telemetry_offset_get)(struct ivpu_device *vdev); - u32 (*reg_telemetry_size_get)(struct ivpu_device *vdev); - u32 (*reg_telemetry_enable_get)(struct ivpu_device *vdev); - void (*reg_db_set)(struct ivpu_device *vdev, u32 db_id); - u32 (*reg_ipc_rx_addr_get)(struct ivpu_device *vdev); - u32 (*reg_ipc_rx_count_get)(struct ivpu_device *vdev); - void (*reg_ipc_tx_set)(struct ivpu_device *vdev, u32 vpu_addr); - void (*irq_clear)(struct ivpu_device *vdev); - void (*irq_enable)(struct ivpu_device *vdev); - void (*irq_disable)(struct ivpu_device *vdev); - irqreturn_t (*irq_handler)(int irq, void *ptr); -}; +#define IVPU_HW_IRQ_FIFO_LENGTH 1024 + +#define IVPU_HW_IRQ_SRC_IPC 1 +#define IVPU_HW_IRQ_SRC_MMU_EVTQ 2 +#define IVPU_HW_IRQ_SRC_DCT 3 struct ivpu_addr_range { resource_size_t start; @@ -40,7 +24,11 @@ struct ivpu_addr_range { }; struct ivpu_hw_info { - const struct ivpu_hw_ops *ops; + struct { + bool (*btrs_irq_handler)(struct ivpu_device *vdev, int irq); + bool (*ip_irq_handler)(struct ivpu_device *vdev, int irq); + DECLARE_KFIFO(fifo, u8, IVPU_HW_IRQ_FIFO_LENGTH); + } irq; struct { struct ivpu_addr_range global; struct ivpu_addr_range user; @@ -63,137 +51,110 @@ struct ivpu_hw_info { int dma_bits; ktime_t d0i3_entry_host_ts; u64 d0i3_entry_vpu_ts; + atomic_t firewall_irq_counter; }; -extern const struct ivpu_hw_ops ivpu_hw_37xx_ops; -extern const struct ivpu_hw_ops ivpu_hw_40xx_ops; +int ivpu_hw_init(struct ivpu_device *vdev); +int ivpu_hw_power_up(struct ivpu_device *vdev); +int ivpu_hw_power_down(struct ivpu_device *vdev); +int ivpu_hw_reset(struct ivpu_device *vdev); +int ivpu_hw_boot_fw(struct ivpu_device *vdev); +void ivpu_hw_profiling_freq_drive(struct ivpu_device *vdev, bool enable); +void ivpu_irq_handlers_init(struct ivpu_device *vdev); +void ivpu_hw_irq_enable(struct ivpu_device *vdev); +void ivpu_hw_irq_disable(struct ivpu_device *vdev); +irqreturn_t ivpu_hw_irq_handler(int irq, void *ptr); -static inline int ivpu_hw_info_init(struct ivpu_device *vdev) +static inline u32 ivpu_hw_btrs_irq_handler(struct ivpu_device *vdev, int irq) { - return vdev->hw->ops->info_init(vdev); -}; - -static inline int ivpu_hw_power_up(struct ivpu_device *vdev) -{ - ivpu_dbg(vdev, PM, "HW power up\n"); - - return vdev->hw->ops->power_up(vdev); -}; + return vdev->hw->irq.btrs_irq_handler(vdev, irq); +} -static inline int ivpu_hw_boot_fw(struct ivpu_device *vdev) +static inline u32 ivpu_hw_ip_irq_handler(struct ivpu_device *vdev, int irq) { - return vdev->hw->ops->boot_fw(vdev); -}; + return 
vdev->hw->irq.ip_irq_handler(vdev, irq); +} -static inline bool ivpu_hw_is_idle(struct ivpu_device *vdev) +static inline void ivpu_hw_range_init(struct ivpu_addr_range *range, u64 start, u64 size) { - return vdev->hw->ops->is_idle(vdev); -}; + range->start = start; + range->end = start + size; +} -static inline int ivpu_hw_wait_for_idle(struct ivpu_device *vdev) +static inline u64 ivpu_hw_range_size(const struct ivpu_addr_range *range) { - return vdev->hw->ops->wait_for_idle(vdev); -}; + return range->end - range->start; +} -static inline int ivpu_hw_power_down(struct ivpu_device *vdev) +static inline u32 ivpu_hw_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) { - ivpu_dbg(vdev, PM, "HW power down\n"); - - return vdev->hw->ops->power_down(vdev); -}; + return ivpu_hw_btrs_ratio_to_freq(vdev, ratio); +} -static inline int ivpu_hw_reset(struct ivpu_device *vdev) +static inline void ivpu_hw_irq_clear(struct ivpu_device *vdev) { - ivpu_dbg(vdev, PM, "HW reset\n"); - - return vdev->hw->ops->reset(vdev); -}; + ivpu_hw_ip_irq_clear(vdev); +} -static inline void ivpu_hw_wdt_disable(struct ivpu_device *vdev) +static inline u32 ivpu_hw_pll_freq_get(struct ivpu_device *vdev) { - vdev->hw->ops->wdt_disable(vdev); -}; + return ivpu_hw_btrs_pll_freq_get(vdev); +} static inline u32 ivpu_hw_profiling_freq_get(struct ivpu_device *vdev) { - return vdev->hw->ops->profiling_freq_get(vdev); -}; - -static inline void ivpu_hw_profiling_freq_drive(struct ivpu_device *vdev, bool enable) -{ - return vdev->hw->ops->profiling_freq_drive(vdev, enable); -}; - -/* Register indirect accesses */ -static inline u32 ivpu_hw_reg_pll_freq_get(struct ivpu_device *vdev) -{ - return vdev->hw->ops->reg_pll_freq_get(vdev); -}; - -static inline u32 ivpu_hw_reg_telemetry_offset_get(struct ivpu_device *vdev) -{ - return vdev->hw->ops->reg_telemetry_offset_get(vdev); -}; - -static inline u32 ivpu_hw_reg_telemetry_size_get(struct ivpu_device *vdev) -{ - return vdev->hw->ops->reg_telemetry_size_get(vdev); -}; - -static inline u32 ivpu_hw_reg_telemetry_enable_get(struct ivpu_device *vdev) -{ - return vdev->hw->ops->reg_telemetry_enable_get(vdev); -}; + return vdev->hw->pll.profiling_freq; +} -static inline void ivpu_hw_reg_db_set(struct ivpu_device *vdev, u32 db_id) +static inline void ivpu_hw_diagnose_failure(struct ivpu_device *vdev) { - vdev->hw->ops->reg_db_set(vdev, db_id); -}; + ivpu_hw_ip_diagnose_failure(vdev); + ivpu_hw_btrs_diagnose_failure(vdev); +} -static inline u32 ivpu_hw_reg_ipc_rx_addr_get(struct ivpu_device *vdev) +static inline u32 ivpu_hw_telemetry_offset_get(struct ivpu_device *vdev) { - return vdev->hw->ops->reg_ipc_rx_addr_get(vdev); -}; + return ivpu_hw_btrs_telemetry_offset_get(vdev); +} -static inline u32 ivpu_hw_reg_ipc_rx_count_get(struct ivpu_device *vdev) +static inline u32 ivpu_hw_telemetry_size_get(struct ivpu_device *vdev) { - return vdev->hw->ops->reg_ipc_rx_count_get(vdev); -}; + return ivpu_hw_btrs_telemetry_size_get(vdev); +} -static inline void ivpu_hw_reg_ipc_tx_set(struct ivpu_device *vdev, u32 vpu_addr) +static inline u32 ivpu_hw_telemetry_enable_get(struct ivpu_device *vdev) { - vdev->hw->ops->reg_ipc_tx_set(vdev, vpu_addr); -}; + return ivpu_hw_btrs_telemetry_enable_get(vdev); +} -static inline void ivpu_hw_irq_clear(struct ivpu_device *vdev) +static inline bool ivpu_hw_is_idle(struct ivpu_device *vdev) { - vdev->hw->ops->irq_clear(vdev); -}; + return ivpu_hw_btrs_is_idle(vdev); +} -static inline void ivpu_hw_irq_enable(struct ivpu_device *vdev) +static inline int ivpu_hw_wait_for_idle(struct 
ivpu_device *vdev) { - vdev->hw->ops->irq_enable(vdev); -}; + return ivpu_hw_btrs_wait_for_idle(vdev); +} -static inline void ivpu_hw_irq_disable(struct ivpu_device *vdev) +static inline void ivpu_hw_ipc_tx_set(struct ivpu_device *vdev, u32 vpu_addr) { - vdev->hw->ops->irq_disable(vdev); -}; + ivpu_hw_ip_ipc_tx_set(vdev, vpu_addr); +} -static inline void ivpu_hw_init_range(struct ivpu_addr_range *range, u64 start, u64 size) +static inline void ivpu_hw_db_set(struct ivpu_device *vdev, u32 db_id) { - range->start = start; - range->end = start + size; + ivpu_hw_ip_db_set(vdev, db_id); } -static inline u64 ivpu_hw_range_size(const struct ivpu_addr_range *range) +static inline u32 ivpu_hw_ipc_rx_addr_get(struct ivpu_device *vdev) { - return range->end - range->start; + return ivpu_hw_ip_ipc_rx_addr_get(vdev); } -static inline void ivpu_hw_diagnose_failure(struct ivpu_device *vdev) +static inline u32 ivpu_hw_ipc_rx_count_get(struct ivpu_device *vdev) { - vdev->hw->ops->diagnose_failure(vdev); + return ivpu_hw_ip_ipc_rx_count_get(vdev); } #endif /* __IVPU_HW_H__ */ diff --git a/drivers/accel/ivpu/ivpu_hw_37xx.c b/drivers/accel/ivpu/ivpu_hw_37xx.c deleted file mode 100644 index 89af1006df55..000000000000 --- a/drivers/accel/ivpu/ivpu_hw_37xx.c +++ /dev/null @@ -1,1066 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2020-2023 Intel Corporation - */ - -#include "ivpu_drv.h" -#include "ivpu_fw.h" -#include "ivpu_hw_37xx_reg.h" -#include "ivpu_hw_reg_io.h" -#include "ivpu_hw.h" -#include "ivpu_ipc.h" -#include "ivpu_mmu.h" -#include "ivpu_pm.h" - -#define TILE_FUSE_ENABLE_BOTH 0x0 -#define TILE_SKU_BOTH_MTL 0x3630 - -/* Work point configuration values */ -#define CONFIG_1_TILE 0x01 -#define CONFIG_2_TILE 0x02 -#define PLL_RATIO_5_3 0x01 -#define PLL_RATIO_4_3 0x02 -#define WP_CONFIG(tile, ratio) (((tile) << 8) | (ratio)) -#define WP_CONFIG_1_TILE_5_3_RATIO WP_CONFIG(CONFIG_1_TILE, PLL_RATIO_5_3) -#define WP_CONFIG_1_TILE_4_3_RATIO WP_CONFIG(CONFIG_1_TILE, PLL_RATIO_4_3) -#define WP_CONFIG_2_TILE_5_3_RATIO WP_CONFIG(CONFIG_2_TILE, PLL_RATIO_5_3) -#define WP_CONFIG_2_TILE_4_3_RATIO WP_CONFIG(CONFIG_2_TILE, PLL_RATIO_4_3) -#define WP_CONFIG_0_TILE_PLL_OFF WP_CONFIG(0, 0) - -#define PLL_REF_CLK_FREQ (50 * 1000000) -#define PLL_SIMULATION_FREQ (10 * 1000000) -#define PLL_PROF_CLK_FREQ (38400 * 1000) -#define PLL_DEFAULT_EPP_VALUE 0x80 - -#define TIM_SAFE_ENABLE 0xf1d0dead -#define TIM_WATCHDOG_RESET_VALUE 0xffffffff - -#define TIMEOUT_US (150 * USEC_PER_MSEC) -#define PWR_ISLAND_STATUS_TIMEOUT_US (5 * USEC_PER_MSEC) -#define PLL_TIMEOUT_US (1500 * USEC_PER_MSEC) -#define IDLE_TIMEOUT_US (5 * USEC_PER_MSEC) - -#define ICB_0_IRQ_MASK ((REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, HOST_IPC_FIFO_INT)) | \ - (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_0_INT)) | \ - (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_1_INT)) | \ - (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_2_INT)) | \ - (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT)) | \ - (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT)) | \ - (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT))) - -#define ICB_1_IRQ_MASK ((REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_2_INT)) | \ - (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_3_INT)) | \ - (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_4_INT))) - -#define ICB_0_1_IRQ_MASK ((((u64)ICB_1_IRQ_MASK) << 32) | ICB_0_IRQ_MASK) - -#define BUTTRESS_IRQ_MASK ((REG_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, ATS_ERR)) | \ - 
(REG_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, UFI_ERR))) - -#define BUTTRESS_ALL_IRQ_MASK (BUTTRESS_IRQ_MASK | \ - (REG_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, FREQ_CHANGE))) - -#define BUTTRESS_IRQ_ENABLE_MASK ((u32)~BUTTRESS_IRQ_MASK) -#define BUTTRESS_IRQ_DISABLE_MASK ((u32)-1) - -#define ITF_FIREWALL_VIOLATION_MASK ((REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, CSS_ROM_CMX)) | \ - (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, CSS_DBG)) | \ - (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, CSS_CTRL)) | \ - (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, DEC400)) | \ - (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, MSS_NCE)) | \ - (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, MSS_MBI)) | \ - (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, MSS_MBI_CMX))) - -static void ivpu_hw_wa_init(struct ivpu_device *vdev) -{ - vdev->wa.punit_disabled = false; - vdev->wa.clear_runtime_mem = false; - vdev->wa.d3hot_after_power_off = true; - - REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, BUTTRESS_ALL_IRQ_MASK); - if (REGB_RD32(VPU_37XX_BUTTRESS_INTERRUPT_STAT) == BUTTRESS_ALL_IRQ_MASK) { - /* Writing 1s does not clear the interrupt status register */ - vdev->wa.interrupt_clear_with_0 = true; - REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, 0x0); - } - - IVPU_PRINT_WA(punit_disabled); - IVPU_PRINT_WA(clear_runtime_mem); - IVPU_PRINT_WA(d3hot_after_power_off); - IVPU_PRINT_WA(interrupt_clear_with_0); -} - -static void ivpu_hw_timeouts_init(struct ivpu_device *vdev) -{ - vdev->timeout.boot = 1000; - vdev->timeout.jsm = 500; - vdev->timeout.tdr = 2000; - vdev->timeout.reschedule_suspend = 10; - vdev->timeout.autosuspend = 10; - vdev->timeout.d0i3_entry_msg = 5; -} - -static int ivpu_pll_wait_for_cmd_send(struct ivpu_device *vdev) -{ - return REGB_POLL_FLD(VPU_37XX_BUTTRESS_WP_REQ_CMD, SEND, 0, PLL_TIMEOUT_US); -} - -/* Send KMD initiated workpoint change */ -static int ivpu_pll_cmd_send(struct ivpu_device *vdev, u16 min_ratio, u16 max_ratio, - u16 target_ratio, u16 config) -{ - int ret; - u32 val; - - ret = ivpu_pll_wait_for_cmd_send(vdev); - if (ret) { - ivpu_err(vdev, "Failed to sync before WP request: %d\n", ret); - return ret; - } - - val = REGB_RD32(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD0); - val = REG_SET_FLD_NUM(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD0, MIN_RATIO, min_ratio, val); - val = REG_SET_FLD_NUM(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD0, MAX_RATIO, max_ratio, val); - REGB_WR32(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD0, val); - - val = REGB_RD32(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD1); - val = REG_SET_FLD_NUM(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD1, TARGET_RATIO, target_ratio, val); - val = REG_SET_FLD_NUM(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD1, EPP, PLL_DEFAULT_EPP_VALUE, val); - REGB_WR32(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD1, val); - - val = REGB_RD32(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD2); - val = REG_SET_FLD_NUM(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD2, CONFIG, config, val); - REGB_WR32(VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD2, val); - - val = REGB_RD32(VPU_37XX_BUTTRESS_WP_REQ_CMD); - val = REG_SET_FLD(VPU_37XX_BUTTRESS_WP_REQ_CMD, SEND, val); - REGB_WR32(VPU_37XX_BUTTRESS_WP_REQ_CMD, val); - - ret = ivpu_pll_wait_for_cmd_send(vdev); - if (ret) - ivpu_err(vdev, "Failed to sync after WP request: %d\n", ret); - - return ret; -} - -static int ivpu_pll_wait_for_lock(struct ivpu_device *vdev, bool enable) -{ - u32 exp_val = enable ? 
0x1 : 0x0; - - if (IVPU_WA(punit_disabled)) - return 0; - - return REGB_POLL_FLD(VPU_37XX_BUTTRESS_PLL_STATUS, LOCK, exp_val, PLL_TIMEOUT_US); -} - -static int ivpu_pll_wait_for_status_ready(struct ivpu_device *vdev) -{ - if (IVPU_WA(punit_disabled)) - return 0; - - return REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_STATUS, READY, 1, PLL_TIMEOUT_US); -} - -static void ivpu_pll_init_frequency_ratios(struct ivpu_device *vdev) -{ - struct ivpu_hw_info *hw = vdev->hw; - u8 fuse_min_ratio, fuse_max_ratio, fuse_pn_ratio; - u32 fmin_fuse, fmax_fuse; - - fmin_fuse = REGB_RD32(VPU_37XX_BUTTRESS_FMIN_FUSE); - fuse_min_ratio = REG_GET_FLD(VPU_37XX_BUTTRESS_FMIN_FUSE, MIN_RATIO, fmin_fuse); - fuse_pn_ratio = REG_GET_FLD(VPU_37XX_BUTTRESS_FMIN_FUSE, PN_RATIO, fmin_fuse); - - fmax_fuse = REGB_RD32(VPU_37XX_BUTTRESS_FMAX_FUSE); - fuse_max_ratio = REG_GET_FLD(VPU_37XX_BUTTRESS_FMAX_FUSE, MAX_RATIO, fmax_fuse); - - hw->pll.min_ratio = clamp_t(u8, ivpu_pll_min_ratio, fuse_min_ratio, fuse_max_ratio); - hw->pll.max_ratio = clamp_t(u8, ivpu_pll_max_ratio, hw->pll.min_ratio, fuse_max_ratio); - hw->pll.pn_ratio = clamp_t(u8, fuse_pn_ratio, hw->pll.min_ratio, hw->pll.max_ratio); -} - -static int ivpu_hw_37xx_wait_for_vpuip_bar(struct ivpu_device *vdev) -{ - return REGV_POLL_FLD(VPU_37XX_HOST_SS_CPR_RST_CLR, AON, 0, 100); -} - -static int ivpu_pll_drive(struct ivpu_device *vdev, bool enable) -{ - struct ivpu_hw_info *hw = vdev->hw; - u16 target_ratio; - u16 config; - int ret; - - if (IVPU_WA(punit_disabled)) { - ivpu_dbg(vdev, PM, "Skipping PLL request\n"); - return 0; - } - - if (enable) { - target_ratio = hw->pll.pn_ratio; - config = hw->config; - } else { - target_ratio = 0; - config = 0; - } - - ivpu_dbg(vdev, PM, "PLL workpoint request: config 0x%04x pll ratio 0x%x\n", - config, target_ratio); - - ret = ivpu_pll_cmd_send(vdev, hw->pll.min_ratio, hw->pll.max_ratio, target_ratio, config); - if (ret) { - ivpu_err(vdev, "Failed to send PLL workpoint request: %d\n", ret); - return ret; - } - - ret = ivpu_pll_wait_for_lock(vdev, enable); - if (ret) { - ivpu_err(vdev, "Timed out waiting for PLL lock\n"); - return ret; - } - - if (enable) { - ret = ivpu_pll_wait_for_status_ready(vdev); - if (ret) { - ivpu_err(vdev, "Timed out waiting for PLL ready status\n"); - return ret; - } - - ret = ivpu_hw_37xx_wait_for_vpuip_bar(vdev); - if (ret) { - ivpu_err(vdev, "Timed out waiting for VPUIP bar\n"); - return ret; - } - } - - return 0; -} - -static int ivpu_pll_enable(struct ivpu_device *vdev) -{ - return ivpu_pll_drive(vdev, true); -} - -static int ivpu_pll_disable(struct ivpu_device *vdev) -{ - return ivpu_pll_drive(vdev, false); -} - -static void ivpu_boot_host_ss_rst_clr_assert(struct ivpu_device *vdev) -{ - u32 val = 0; - - val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_CLR, TOP_NOC, val); - val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_CLR, DSS_MAS, val); - val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_CLR, MSS_MAS, val); - - REGV_WR32(VPU_37XX_HOST_SS_CPR_RST_CLR, val); -} - -static void ivpu_boot_host_ss_rst_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_SS_CPR_RST_SET); - - if (enable) { - val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, TOP_NOC, val); - val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, DSS_MAS, val); - val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, MSS_MAS, val); - } else { - val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, TOP_NOC, val); - val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, DSS_MAS, val); - val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, MSS_MAS, val); - } - - 
REGV_WR32(VPU_37XX_HOST_SS_CPR_RST_SET, val); -} - -static void ivpu_boot_host_ss_clk_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_SS_CPR_CLK_SET); - - if (enable) { - val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, TOP_NOC, val); - val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, DSS_MAS, val); - val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, MSS_MAS, val); - } else { - val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, TOP_NOC, val); - val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, DSS_MAS, val); - val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, MSS_MAS, val); - } - - REGV_WR32(VPU_37XX_HOST_SS_CPR_CLK_SET, val); -} - -static int ivpu_boot_noc_qreqn_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_SS_NOC_QREQN); - - if (!REG_TEST_FLD_NUM(VPU_37XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_noc_qacceptn_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_SS_NOC_QACCEPTN); - - if (!REG_TEST_FLD_NUM(VPU_37XX_HOST_SS_NOC_QACCEPTN, TOP_SOCMMIO, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_noc_qdeny_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_SS_NOC_QDENY); - - if (!REG_TEST_FLD_NUM(VPU_37XX_HOST_SS_NOC_QDENY, TOP_SOCMMIO, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_top_noc_qrenqn_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_37XX_TOP_NOC_QREQN); - - if (!REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QREQN, CPU_CTRL, exp_val, val) || - !REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_top_noc_qacceptn_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_37XX_TOP_NOC_QACCEPTN); - - if (!REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QACCEPTN, CPU_CTRL, exp_val, val) || - !REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QACCEPTN, HOSTIF_L2CACHE, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_top_noc_qdeny_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_37XX_TOP_NOC_QDENY); - - if (!REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QDENY, CPU_CTRL, exp_val, val) || - !REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QDENY, HOSTIF_L2CACHE, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_host_ss_configure(struct ivpu_device *vdev) -{ - ivpu_boot_host_ss_rst_clr_assert(vdev); - - return ivpu_boot_noc_qreqn_check(vdev, 0x0); -} - -static void ivpu_boot_vpu_idle_gen_disable(struct ivpu_device *vdev) -{ - REGV_WR32(VPU_37XX_HOST_SS_AON_VPU_IDLE_GEN, 0x0); -} - -static int ivpu_boot_host_ss_axi_drive(struct ivpu_device *vdev, bool enable) -{ - int ret; - u32 val; - - val = REGV_RD32(VPU_37XX_HOST_SS_NOC_QREQN); - if (enable) - val = REG_SET_FLD(VPU_37XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, val); - else - val = REG_CLR_FLD(VPU_37XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, val); - REGV_WR32(VPU_37XX_HOST_SS_NOC_QREQN, val); - - ret = ivpu_boot_noc_qacceptn_check(vdev, enable ? 
0x1 : 0x0); - if (ret) { - ivpu_err(vdev, "Failed qacceptn check: %d\n", ret); - return ret; - } - - ret = ivpu_boot_noc_qdeny_check(vdev, 0x0); - if (ret) - ivpu_err(vdev, "Failed qdeny check: %d\n", ret); - - return ret; -} - -static int ivpu_boot_host_ss_axi_enable(struct ivpu_device *vdev) -{ - return ivpu_boot_host_ss_axi_drive(vdev, true); -} - -static int ivpu_boot_host_ss_top_noc_drive(struct ivpu_device *vdev, bool enable) -{ - int ret; - u32 val; - - val = REGV_RD32(VPU_37XX_TOP_NOC_QREQN); - if (enable) { - val = REG_SET_FLD(VPU_37XX_TOP_NOC_QREQN, CPU_CTRL, val); - val = REG_SET_FLD(VPU_37XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, val); - } else { - val = REG_CLR_FLD(VPU_37XX_TOP_NOC_QREQN, CPU_CTRL, val); - val = REG_CLR_FLD(VPU_37XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, val); - } - REGV_WR32(VPU_37XX_TOP_NOC_QREQN, val); - - ret = ivpu_boot_top_noc_qacceptn_check(vdev, enable ? 0x1 : 0x0); - if (ret) { - ivpu_err(vdev, "Failed qacceptn check: %d\n", ret); - return ret; - } - - ret = ivpu_boot_top_noc_qdeny_check(vdev, 0x0); - if (ret) - ivpu_err(vdev, "Failed qdeny check: %d\n", ret); - - return ret; -} - -static int ivpu_boot_host_ss_top_noc_enable(struct ivpu_device *vdev) -{ - return ivpu_boot_host_ss_top_noc_drive(vdev, true); -} - -static void ivpu_boot_pwr_island_trickle_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0); - - if (enable) - val = REG_SET_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, MSS_CPU, val); - else - val = REG_CLR_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, MSS_CPU, val); - - REGV_WR32(VPU_37XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, val); -} - -static void ivpu_boot_pwr_island_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_SS_AON_PWR_ISLAND_EN0); - - if (enable) - val = REG_SET_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_EN0, MSS_CPU, val); - else - val = REG_CLR_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_EN0, MSS_CPU, val); - - REGV_WR32(VPU_37XX_HOST_SS_AON_PWR_ISLAND_EN0, val); -} - -static int ivpu_boot_wait_for_pwr_island_status(struct ivpu_device *vdev, u32 exp_val) -{ - return REGV_POLL_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_STATUS0, MSS_CPU, - exp_val, PWR_ISLAND_STATUS_TIMEOUT_US); -} - -static void ivpu_boot_pwr_island_isolation_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_SS_AON_PWR_ISO_EN0); - - if (enable) - val = REG_SET_FLD(VPU_37XX_HOST_SS_AON_PWR_ISO_EN0, MSS_CPU, val); - else - val = REG_CLR_FLD(VPU_37XX_HOST_SS_AON_PWR_ISO_EN0, MSS_CPU, val); - - REGV_WR32(VPU_37XX_HOST_SS_AON_PWR_ISO_EN0, val); -} - -static void ivpu_boot_dpu_active_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_SS_AON_DPU_ACTIVE); - - if (enable) - val = REG_SET_FLD(VPU_37XX_HOST_SS_AON_DPU_ACTIVE, DPU_ACTIVE, val); - else - val = REG_CLR_FLD(VPU_37XX_HOST_SS_AON_DPU_ACTIVE, DPU_ACTIVE, val); - - REGV_WR32(VPU_37XX_HOST_SS_AON_DPU_ACTIVE, val); -} - -static int ivpu_boot_pwr_domain_enable(struct ivpu_device *vdev) -{ - int ret; - - ivpu_boot_pwr_island_trickle_drive(vdev, true); - ivpu_boot_pwr_island_drive(vdev, true); - - ret = ivpu_boot_wait_for_pwr_island_status(vdev, 0x1); - if (ret) { - ivpu_err(vdev, "Timed out waiting for power island status\n"); - return ret; - } - - ret = ivpu_boot_top_noc_qrenqn_check(vdev, 0x0); - if (ret) { - ivpu_err(vdev, "Failed qrenqn check %d\n", ret); - return ret; - } - - ivpu_boot_host_ss_clk_drive(vdev, true); - ivpu_boot_pwr_island_isolation_drive(vdev, false); - 
ivpu_boot_host_ss_rst_drive(vdev, true); - ivpu_boot_dpu_active_drive(vdev, true); - - return ret; -} - -static void ivpu_boot_no_snoop_enable(struct ivpu_device *vdev) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES); - - val = REG_SET_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, NOSNOOP_OVERRIDE_EN, val); - val = REG_CLR_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, AW_NOSNOOP_OVERRIDE, val); - val = REG_SET_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, AR_NOSNOOP_OVERRIDE, val); - - REGV_WR32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, val); -} - -static void ivpu_boot_tbu_mmu_enable(struct ivpu_device *vdev) -{ - u32 val = REGV_RD32(VPU_37XX_HOST_IF_TBU_MMUSSIDV); - - val = REG_SET_FLD(VPU_37XX_HOST_IF_TBU_MMUSSIDV, TBU0_AWMMUSSIDV, val); - val = REG_SET_FLD(VPU_37XX_HOST_IF_TBU_MMUSSIDV, TBU0_ARMMUSSIDV, val); - val = REG_SET_FLD(VPU_37XX_HOST_IF_TBU_MMUSSIDV, TBU2_AWMMUSSIDV, val); - val = REG_SET_FLD(VPU_37XX_HOST_IF_TBU_MMUSSIDV, TBU2_ARMMUSSIDV, val); - - REGV_WR32(VPU_37XX_HOST_IF_TBU_MMUSSIDV, val); -} - -static void ivpu_boot_soc_cpu_boot(struct ivpu_device *vdev) -{ - u32 val; - - val = REGV_RD32(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC); - val = REG_SET_FLD(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, IRQI_RSTRUN0, val); - - val = REG_CLR_FLD(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, IRQI_RSTVEC, val); - REGV_WR32(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, val); - - val = REG_SET_FLD(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, IRQI_RESUME0, val); - REGV_WR32(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, val); - - val = REG_CLR_FLD(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, IRQI_RESUME0, val); - REGV_WR32(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, val); - - val = vdev->fw->entry_point >> 9; - REGV_WR32(VPU_37XX_HOST_SS_LOADING_ADDRESS_LO, val); - - val = REG_SET_FLD(VPU_37XX_HOST_SS_LOADING_ADDRESS_LO, DONE, val); - REGV_WR32(VPU_37XX_HOST_SS_LOADING_ADDRESS_LO, val); - - ivpu_dbg(vdev, PM, "Booting firmware, mode: %s\n", - vdev->fw->entry_point == vdev->fw->cold_boot_entry_point ? 
"cold boot" : "resume"); -} - -static int ivpu_boot_d0i3_drive(struct ivpu_device *vdev, bool enable) -{ - int ret; - u32 val; - - ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_D0I3_CONTROL, INPROGRESS, 0, TIMEOUT_US); - if (ret) { - ivpu_err(vdev, "Failed to sync before D0i3 transition: %d\n", ret); - return ret; - } - - val = REGB_RD32(VPU_37XX_BUTTRESS_VPU_D0I3_CONTROL); - if (enable) - val = REG_SET_FLD(VPU_37XX_BUTTRESS_VPU_D0I3_CONTROL, I3, val); - else - val = REG_CLR_FLD(VPU_37XX_BUTTRESS_VPU_D0I3_CONTROL, I3, val); - REGB_WR32(VPU_37XX_BUTTRESS_VPU_D0I3_CONTROL, val); - - ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_D0I3_CONTROL, INPROGRESS, 0, TIMEOUT_US); - if (ret) - ivpu_err(vdev, "Failed to sync after D0i3 transition: %d\n", ret); - - return ret; -} - -static int ivpu_hw_37xx_info_init(struct ivpu_device *vdev) -{ - struct ivpu_hw_info *hw = vdev->hw; - - hw->tile_fuse = TILE_FUSE_ENABLE_BOTH; - hw->sku = TILE_SKU_BOTH_MTL; - hw->config = WP_CONFIG_2_TILE_4_3_RATIO; - - ivpu_pll_init_frequency_ratios(vdev); - - ivpu_hw_init_range(&hw->ranges.global, 0x80000000, SZ_512M); - ivpu_hw_init_range(&hw->ranges.user, 0xc0000000, 255 * SZ_1M); - ivpu_hw_init_range(&hw->ranges.shave, 0x180000000, SZ_2G); - ivpu_hw_init_range(&hw->ranges.dma, 0x200000000, SZ_8G); - - vdev->platform = IVPU_PLATFORM_SILICON; - ivpu_hw_wa_init(vdev); - ivpu_hw_timeouts_init(vdev); - - return 0; -} - -static int ivpu_hw_37xx_ip_reset(struct ivpu_device *vdev) -{ - int ret; - u32 val; - - if (IVPU_WA(punit_disabled)) - return 0; - - ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US); - if (ret) { - ivpu_err(vdev, "Timed out waiting for TRIGGER bit\n"); - return ret; - } - - val = REGB_RD32(VPU_37XX_BUTTRESS_VPU_IP_RESET); - val = REG_SET_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, val); - REGB_WR32(VPU_37XX_BUTTRESS_VPU_IP_RESET, val); - - ret = REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US); - if (ret) - ivpu_err(vdev, "Timed out waiting for RESET completion\n"); - - return ret; -} - -static int ivpu_hw_37xx_reset(struct ivpu_device *vdev) -{ - int ret = 0; - - if (ivpu_hw_37xx_ip_reset(vdev)) { - ivpu_err(vdev, "Failed to reset NPU\n"); - ret = -EIO; - } - - if (ivpu_pll_disable(vdev)) { - ivpu_err(vdev, "Failed to disable PLL\n"); - ret = -EIO; - } - - return ret; -} - -static int ivpu_hw_37xx_d0i3_enable(struct ivpu_device *vdev) -{ - int ret; - - ret = ivpu_boot_d0i3_drive(vdev, true); - if (ret) - ivpu_err(vdev, "Failed to enable D0i3: %d\n", ret); - - udelay(5); /* VPU requires 5 us to complete the transition */ - - return ret; -} - -static int ivpu_hw_37xx_d0i3_disable(struct ivpu_device *vdev) -{ - int ret; - - ret = ivpu_boot_d0i3_drive(vdev, false); - if (ret) - ivpu_err(vdev, "Failed to disable D0i3: %d\n", ret); - - return ret; -} - -static int ivpu_hw_37xx_power_up(struct ivpu_device *vdev) -{ - int ret; - - /* PLL requests may fail when powering down, so issue WP 0 here */ - ret = ivpu_pll_disable(vdev); - if (ret) - ivpu_warn(vdev, "Failed to disable PLL: %d\n", ret); - - ret = ivpu_hw_37xx_d0i3_disable(vdev); - if (ret) - ivpu_warn(vdev, "Failed to disable D0I3: %d\n", ret); - - ret = ivpu_pll_enable(vdev); - if (ret) { - ivpu_err(vdev, "Failed to enable PLL: %d\n", ret); - return ret; - } - - ret = ivpu_boot_host_ss_configure(vdev); - if (ret) { - ivpu_err(vdev, "Failed to configure host SS: %d\n", ret); - return ret; - } - - /* - * The control circuitry for vpu_idle indication logic powers up active. 
- * To ensure unnecessary low power mode signal from LRT during bring up, - * KMD disables the circuitry prior to bringing up the Main Power island. - */ - ivpu_boot_vpu_idle_gen_disable(vdev); - - ret = ivpu_boot_pwr_domain_enable(vdev); - if (ret) { - ivpu_err(vdev, "Failed to enable power domain: %d\n", ret); - return ret; - } - - ret = ivpu_boot_host_ss_axi_enable(vdev); - if (ret) { - ivpu_err(vdev, "Failed to enable AXI: %d\n", ret); - return ret; - } - - ret = ivpu_boot_host_ss_top_noc_enable(vdev); - if (ret) - ivpu_err(vdev, "Failed to enable TOP NOC: %d\n", ret); - - return ret; -} - -static int ivpu_hw_37xx_boot_fw(struct ivpu_device *vdev) -{ - ivpu_boot_no_snoop_enable(vdev); - ivpu_boot_tbu_mmu_enable(vdev); - ivpu_boot_soc_cpu_boot(vdev); - - return 0; -} - -static bool ivpu_hw_37xx_is_idle(struct ivpu_device *vdev) -{ - u32 val; - - if (IVPU_WA(punit_disabled)) - return true; - - val = REGB_RD32(VPU_37XX_BUTTRESS_VPU_STATUS); - return REG_TEST_FLD(VPU_37XX_BUTTRESS_VPU_STATUS, READY, val) && - REG_TEST_FLD(VPU_37XX_BUTTRESS_VPU_STATUS, IDLE, val); -} - -static int ivpu_hw_37xx_wait_for_idle(struct ivpu_device *vdev) -{ - return REGB_POLL_FLD(VPU_37XX_BUTTRESS_VPU_STATUS, IDLE, 0x1, IDLE_TIMEOUT_US); -} - -static void ivpu_hw_37xx_save_d0i3_entry_timestamp(struct ivpu_device *vdev) -{ - vdev->hw->d0i3_entry_host_ts = ktime_get_boottime(); - vdev->hw->d0i3_entry_vpu_ts = REGV_RD64(VPU_37XX_CPU_SS_TIM_PERF_FREE_CNT); -} - -static int ivpu_hw_37xx_power_down(struct ivpu_device *vdev) -{ - int ret = 0; - - ivpu_hw_37xx_save_d0i3_entry_timestamp(vdev); - - if (!ivpu_hw_37xx_is_idle(vdev)) - ivpu_warn(vdev, "VPU not idle during power down\n"); - - if (ivpu_hw_37xx_reset(vdev)) { - ivpu_err(vdev, "Failed to reset VPU\n"); - ret = -EIO; - } - - if (ivpu_hw_37xx_d0i3_enable(vdev)) { - ivpu_err(vdev, "Failed to enter D0I3\n"); - ret = -EIO; - } - - return ret; -} - -static void ivpu_hw_37xx_wdt_disable(struct ivpu_device *vdev) -{ - u32 val; - - /* Enable writing and set non-zero WDT value */ - REGV_WR32(VPU_37XX_CPU_SS_TIM_SAFE, TIM_SAFE_ENABLE); - REGV_WR32(VPU_37XX_CPU_SS_TIM_WATCHDOG, TIM_WATCHDOG_RESET_VALUE); - - /* Enable writing and disable watchdog timer */ - REGV_WR32(VPU_37XX_CPU_SS_TIM_SAFE, TIM_SAFE_ENABLE); - REGV_WR32(VPU_37XX_CPU_SS_TIM_WDOG_EN, 0); - - /* Now clear the timeout interrupt */ - val = REGV_RD32(VPU_37XX_CPU_SS_TIM_GEN_CONFIG); - val = REG_CLR_FLD(VPU_37XX_CPU_SS_TIM_GEN_CONFIG, WDOG_TO_INT_CLR, val); - REGV_WR32(VPU_37XX_CPU_SS_TIM_GEN_CONFIG, val); -} - -static u32 ivpu_hw_37xx_profiling_freq_get(struct ivpu_device *vdev) -{ - return PLL_PROF_CLK_FREQ; -} - -static void ivpu_hw_37xx_profiling_freq_drive(struct ivpu_device *vdev, bool enable) -{ - /* Profiling freq - is a debug feature. Unavailable on VPU 37XX. 
*/ -} - -static u32 ivpu_hw_37xx_pll_to_freq(u32 ratio, u32 config) -{ - u32 pll_clock = PLL_REF_CLK_FREQ * ratio; - u32 cpu_clock; - - if ((config & 0xff) == PLL_RATIO_4_3) - cpu_clock = pll_clock * 2 / 4; - else - cpu_clock = pll_clock * 2 / 5; - - return cpu_clock; -} - -/* Register indirect accesses */ -static u32 ivpu_hw_37xx_reg_pll_freq_get(struct ivpu_device *vdev) -{ - u32 pll_curr_ratio; - - pll_curr_ratio = REGB_RD32(VPU_37XX_BUTTRESS_CURRENT_PLL); - pll_curr_ratio &= VPU_37XX_BUTTRESS_CURRENT_PLL_RATIO_MASK; - - if (!ivpu_is_silicon(vdev)) - return PLL_SIMULATION_FREQ; - - return ivpu_hw_37xx_pll_to_freq(pll_curr_ratio, vdev->hw->config); -} - -static u32 ivpu_hw_37xx_reg_telemetry_offset_get(struct ivpu_device *vdev) -{ - return REGB_RD32(VPU_37XX_BUTTRESS_VPU_TELEMETRY_OFFSET); -} - -static u32 ivpu_hw_37xx_reg_telemetry_size_get(struct ivpu_device *vdev) -{ - return REGB_RD32(VPU_37XX_BUTTRESS_VPU_TELEMETRY_SIZE); -} - -static u32 ivpu_hw_37xx_reg_telemetry_enable_get(struct ivpu_device *vdev) -{ - return REGB_RD32(VPU_37XX_BUTTRESS_VPU_TELEMETRY_ENABLE); -} - -static void ivpu_hw_37xx_reg_db_set(struct ivpu_device *vdev, u32 db_id) -{ - u32 reg_stride = VPU_37XX_CPU_SS_DOORBELL_1 - VPU_37XX_CPU_SS_DOORBELL_0; - u32 val = REG_FLD(VPU_37XX_CPU_SS_DOORBELL_0, SET); - - REGV_WR32I(VPU_37XX_CPU_SS_DOORBELL_0, reg_stride, db_id, val); -} - -static u32 ivpu_hw_37xx_reg_ipc_rx_addr_get(struct ivpu_device *vdev) -{ - return REGV_RD32(VPU_37XX_HOST_SS_TIM_IPC_FIFO_ATM); -} - -static u32 ivpu_hw_37xx_reg_ipc_rx_count_get(struct ivpu_device *vdev) -{ - u32 count = REGV_RD32_SILENT(VPU_37XX_HOST_SS_TIM_IPC_FIFO_STAT); - - return REG_GET_FLD(VPU_37XX_HOST_SS_TIM_IPC_FIFO_STAT, FILL_LEVEL, count); -} - -static void ivpu_hw_37xx_reg_ipc_tx_set(struct ivpu_device *vdev, u32 vpu_addr) -{ - REGV_WR32(VPU_37XX_CPU_SS_TIM_IPC_FIFO, vpu_addr); -} - -static void ivpu_hw_37xx_irq_clear(struct ivpu_device *vdev) -{ - REGV_WR64(VPU_37XX_HOST_SS_ICB_CLEAR_0, ICB_0_1_IRQ_MASK); -} - -static void ivpu_hw_37xx_irq_enable(struct ivpu_device *vdev) -{ - REGV_WR32(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, ITF_FIREWALL_VIOLATION_MASK); - REGV_WR64(VPU_37XX_HOST_SS_ICB_ENABLE_0, ICB_0_1_IRQ_MASK); - REGB_WR32(VPU_37XX_BUTTRESS_LOCAL_INT_MASK, BUTTRESS_IRQ_ENABLE_MASK); - REGB_WR32(VPU_37XX_BUTTRESS_GLOBAL_INT_MASK, 0x0); -} - -static void ivpu_hw_37xx_irq_disable(struct ivpu_device *vdev) -{ - REGB_WR32(VPU_37XX_BUTTRESS_GLOBAL_INT_MASK, 0x1); - REGB_WR32(VPU_37XX_BUTTRESS_LOCAL_INT_MASK, BUTTRESS_IRQ_DISABLE_MASK); - REGV_WR64(VPU_37XX_HOST_SS_ICB_ENABLE_0, 0x0ull); - REGV_WR32(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, 0x0); -} - -static void ivpu_hw_37xx_irq_wdt_nce_handler(struct ivpu_device *vdev) -{ - ivpu_pm_trigger_recovery(vdev, "WDT NCE IRQ"); -} - -static void ivpu_hw_37xx_irq_wdt_mss_handler(struct ivpu_device *vdev) -{ - ivpu_hw_wdt_disable(vdev); - ivpu_pm_trigger_recovery(vdev, "WDT MSS IRQ"); -} - -static void ivpu_hw_37xx_irq_noc_firewall_handler(struct ivpu_device *vdev) -{ - ivpu_pm_trigger_recovery(vdev, "NOC Firewall IRQ"); -} - -/* Handler for IRQs from VPU core (irqV) */ -static bool ivpu_hw_37xx_irqv_handler(struct ivpu_device *vdev, int irq, bool *wake_thread) -{ - u32 status = REGV_RD32(VPU_37XX_HOST_SS_ICB_STATUS_0) & ICB_0_IRQ_MASK; - - if (!status) - return false; - - REGV_WR32(VPU_37XX_HOST_SS_ICB_CLEAR_0, status); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_0_INT, status)) - ivpu_mmu_irq_evtq_handler(vdev); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, HOST_IPC_FIFO_INT, 
status)) - ivpu_ipc_irq_handler(vdev, wake_thread); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_1_INT, status)) - ivpu_dbg(vdev, IRQ, "MMU sync complete\n"); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_2_INT, status)) - ivpu_mmu_irq_gerr_handler(vdev); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT, status)) - ivpu_hw_37xx_irq_wdt_mss_handler(vdev); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT, status)) - ivpu_hw_37xx_irq_wdt_nce_handler(vdev); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT, status)) - ivpu_hw_37xx_irq_noc_firewall_handler(vdev); - - return true; -} - -/* Handler for IRQs from Buttress core (irqB) */ -static bool ivpu_hw_37xx_irqb_handler(struct ivpu_device *vdev, int irq) -{ - u32 status = REGB_RD32(VPU_37XX_BUTTRESS_INTERRUPT_STAT) & BUTTRESS_IRQ_MASK; - bool schedule_recovery = false; - - if (!status) - return false; - - if (REG_TEST_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, FREQ_CHANGE, status)) - ivpu_dbg(vdev, IRQ, "FREQ_CHANGE irq: %08x", - REGB_RD32(VPU_37XX_BUTTRESS_CURRENT_PLL)); - - if (REG_TEST_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, ATS_ERR, status)) { - ivpu_err(vdev, "ATS_ERR irq 0x%016llx", REGB_RD64(VPU_37XX_BUTTRESS_ATS_ERR_LOG_0)); - REGB_WR32(VPU_37XX_BUTTRESS_ATS_ERR_CLEAR, 0x1); - schedule_recovery = true; - } - - if (REG_TEST_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, UFI_ERR, status)) { - u32 ufi_log = REGB_RD32(VPU_37XX_BUTTRESS_UFI_ERR_LOG); - - ivpu_err(vdev, "UFI_ERR irq (0x%08x) opcode: 0x%02lx axi_id: 0x%02lx cq_id: 0x%03lx", - ufi_log, REG_GET_FLD(VPU_37XX_BUTTRESS_UFI_ERR_LOG, OPCODE, ufi_log), - REG_GET_FLD(VPU_37XX_BUTTRESS_UFI_ERR_LOG, AXI_ID, ufi_log), - REG_GET_FLD(VPU_37XX_BUTTRESS_UFI_ERR_LOG, CQ_ID, ufi_log)); - REGB_WR32(VPU_37XX_BUTTRESS_UFI_ERR_CLEAR, 0x1); - schedule_recovery = true; - } - - /* This must be done after interrupts are cleared at the source. */ - if (IVPU_WA(interrupt_clear_with_0)) - /* - * Writing 1 triggers an interrupt, so we can't perform read update write. - * Clear local interrupt status by writing 0 to all bits. 
- */ - REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, 0x0); - else - REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, status); - - if (schedule_recovery) - ivpu_pm_trigger_recovery(vdev, "Buttress IRQ"); - - return true; -} - -static irqreturn_t ivpu_hw_37xx_irq_handler(int irq, void *ptr) -{ - struct ivpu_device *vdev = ptr; - bool irqv_handled, irqb_handled, wake_thread = false; - - REGB_WR32(VPU_37XX_BUTTRESS_GLOBAL_INT_MASK, 0x1); - - irqv_handled = ivpu_hw_37xx_irqv_handler(vdev, irq, &wake_thread); - irqb_handled = ivpu_hw_37xx_irqb_handler(vdev, irq); - - /* Re-enable global interrupts to re-trigger MSI for pending interrupts */ - REGB_WR32(VPU_37XX_BUTTRESS_GLOBAL_INT_MASK, 0x0); - - if (wake_thread) - return IRQ_WAKE_THREAD; - if (irqv_handled || irqb_handled) - return IRQ_HANDLED; - return IRQ_NONE; -} - -static void ivpu_hw_37xx_diagnose_failure(struct ivpu_device *vdev) -{ - u32 irqv = REGV_RD32(VPU_37XX_HOST_SS_ICB_STATUS_0) & ICB_0_IRQ_MASK; - u32 irqb = REGB_RD32(VPU_37XX_BUTTRESS_INTERRUPT_STAT) & BUTTRESS_IRQ_MASK; - - if (ivpu_hw_37xx_reg_ipc_rx_count_get(vdev)) - ivpu_err(vdev, "IPC FIFO queue not empty, missed IPC IRQ"); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT, irqv)) - ivpu_err(vdev, "WDT MSS timeout detected\n"); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT, irqv)) - ivpu_err(vdev, "WDT NCE timeout detected\n"); - - if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT, irqv)) - ivpu_err(vdev, "NOC Firewall irq detected\n"); - - if (REG_TEST_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, ATS_ERR, irqb)) - ivpu_err(vdev, "ATS_ERR irq 0x%016llx", REGB_RD64(VPU_37XX_BUTTRESS_ATS_ERR_LOG_0)); - - if (REG_TEST_FLD(VPU_37XX_BUTTRESS_INTERRUPT_STAT, UFI_ERR, irqb)) { - u32 ufi_log = REGB_RD32(VPU_37XX_BUTTRESS_UFI_ERR_LOG); - - ivpu_err(vdev, "UFI_ERR irq (0x%08x) opcode: 0x%02lx axi_id: 0x%02lx cq_id: 0x%03lx", - ufi_log, REG_GET_FLD(VPU_37XX_BUTTRESS_UFI_ERR_LOG, OPCODE, ufi_log), - REG_GET_FLD(VPU_37XX_BUTTRESS_UFI_ERR_LOG, AXI_ID, ufi_log), - REG_GET_FLD(VPU_37XX_BUTTRESS_UFI_ERR_LOG, CQ_ID, ufi_log)); - } -} - -const struct ivpu_hw_ops ivpu_hw_37xx_ops = { - .info_init = ivpu_hw_37xx_info_init, - .power_up = ivpu_hw_37xx_power_up, - .is_idle = ivpu_hw_37xx_is_idle, - .wait_for_idle = ivpu_hw_37xx_wait_for_idle, - .power_down = ivpu_hw_37xx_power_down, - .reset = ivpu_hw_37xx_reset, - .boot_fw = ivpu_hw_37xx_boot_fw, - .wdt_disable = ivpu_hw_37xx_wdt_disable, - .diagnose_failure = ivpu_hw_37xx_diagnose_failure, - .profiling_freq_get = ivpu_hw_37xx_profiling_freq_get, - .profiling_freq_drive = ivpu_hw_37xx_profiling_freq_drive, - .reg_pll_freq_get = ivpu_hw_37xx_reg_pll_freq_get, - .reg_telemetry_offset_get = ivpu_hw_37xx_reg_telemetry_offset_get, - .reg_telemetry_size_get = ivpu_hw_37xx_reg_telemetry_size_get, - .reg_telemetry_enable_get = ivpu_hw_37xx_reg_telemetry_enable_get, - .reg_db_set = ivpu_hw_37xx_reg_db_set, - .reg_ipc_rx_addr_get = ivpu_hw_37xx_reg_ipc_rx_addr_get, - .reg_ipc_rx_count_get = ivpu_hw_37xx_reg_ipc_rx_count_get, - .reg_ipc_tx_set = ivpu_hw_37xx_reg_ipc_tx_set, - .irq_clear = ivpu_hw_37xx_irq_clear, - .irq_enable = ivpu_hw_37xx_irq_enable, - .irq_disable = ivpu_hw_37xx_irq_disable, - .irq_handler = ivpu_hw_37xx_irq_handler, -}; diff --git a/drivers/accel/ivpu/ivpu_hw_37xx_reg.h b/drivers/accel/ivpu/ivpu_hw_37xx_reg.h index f6fec1919202..cf5e2f01049c 100644 --- a/drivers/accel/ivpu/ivpu_hw_37xx_reg.h +++ b/drivers/accel/ivpu/ivpu_hw_37xx_reg.h @@ -8,78 +8,6 @@ #include <linux/bits.h> 
-#define VPU_37XX_BUTTRESS_INTERRUPT_TYPE 0x00000000u - -#define VPU_37XX_BUTTRESS_INTERRUPT_STAT 0x00000004u -#define VPU_37XX_BUTTRESS_INTERRUPT_STAT_FREQ_CHANGE_MASK BIT_MASK(0) -#define VPU_37XX_BUTTRESS_INTERRUPT_STAT_ATS_ERR_MASK BIT_MASK(1) -#define VPU_37XX_BUTTRESS_INTERRUPT_STAT_UFI_ERR_MASK BIT_MASK(2) - -#define VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD0 0x00000008u -#define VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD0_MIN_RATIO_MASK GENMASK(15, 0) -#define VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD0_MAX_RATIO_MASK GENMASK(31, 16) - -#define VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD1 0x0000000cu -#define VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD1_TARGET_RATIO_MASK GENMASK(15, 0) -#define VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD1_EPP_MASK GENMASK(31, 16) - -#define VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD2 0x00000010u -#define VPU_37XX_BUTTRESS_WP_REQ_PAYLOAD2_CONFIG_MASK GENMASK(15, 0) - -#define VPU_37XX_BUTTRESS_WP_REQ_CMD 0x00000014u -#define VPU_37XX_BUTTRESS_WP_REQ_CMD_SEND_MASK BIT_MASK(0) - -#define VPU_37XX_BUTTRESS_WP_DOWNLOAD 0x00000018u -#define VPU_37XX_BUTTRESS_WP_DOWNLOAD_TARGET_RATIO_MASK GENMASK(15, 0) - -#define VPU_37XX_BUTTRESS_CURRENT_PLL 0x0000001cu -#define VPU_37XX_BUTTRESS_CURRENT_PLL_RATIO_MASK GENMASK(15, 0) - -#define VPU_37XX_BUTTRESS_PLL_ENABLE 0x00000020u - -#define VPU_37XX_BUTTRESS_FMIN_FUSE 0x00000024u -#define VPU_37XX_BUTTRESS_FMIN_FUSE_MIN_RATIO_MASK GENMASK(7, 0) -#define VPU_37XX_BUTTRESS_FMIN_FUSE_PN_RATIO_MASK GENMASK(15, 8) - -#define VPU_37XX_BUTTRESS_FMAX_FUSE 0x00000028u -#define VPU_37XX_BUTTRESS_FMAX_FUSE_MAX_RATIO_MASK GENMASK(7, 0) - -#define VPU_37XX_BUTTRESS_TILE_FUSE 0x0000002cu -#define VPU_37XX_BUTTRESS_TILE_FUSE_VALID_MASK BIT_MASK(0) -#define VPU_37XX_BUTTRESS_TILE_FUSE_SKU_MASK GENMASK(3, 2) - -#define VPU_37XX_BUTTRESS_LOCAL_INT_MASK 0x00000030u -#define VPU_37XX_BUTTRESS_GLOBAL_INT_MASK 0x00000034u - -#define VPU_37XX_BUTTRESS_PLL_STATUS 0x00000040u -#define VPU_37XX_BUTTRESS_PLL_STATUS_LOCK_MASK BIT_MASK(1) - -#define VPU_37XX_BUTTRESS_VPU_STATUS 0x00000044u -#define VPU_37XX_BUTTRESS_VPU_STATUS_READY_MASK BIT_MASK(0) -#define VPU_37XX_BUTTRESS_VPU_STATUS_IDLE_MASK BIT_MASK(1) - -#define VPU_37XX_BUTTRESS_VPU_D0I3_CONTROL 0x00000060u -#define VPU_37XX_BUTTRESS_VPU_D0I3_CONTROL_INPROGRESS_MASK BIT_MASK(0) -#define VPU_37XX_BUTTRESS_VPU_D0I3_CONTROL_I3_MASK BIT_MASK(2) - -#define VPU_37XX_BUTTRESS_VPU_IP_RESET 0x00000050u -#define VPU_37XX_BUTTRESS_VPU_IP_RESET_TRIGGER_MASK BIT_MASK(0) - -#define VPU_37XX_BUTTRESS_VPU_TELEMETRY_OFFSET 0x00000080u -#define VPU_37XX_BUTTRESS_VPU_TELEMETRY_SIZE 0x00000084u -#define VPU_37XX_BUTTRESS_VPU_TELEMETRY_ENABLE 0x00000088u - -#define VPU_37XX_BUTTRESS_ATS_ERR_LOG_0 0x000000a0u -#define VPU_37XX_BUTTRESS_ATS_ERR_LOG_1 0x000000a4u -#define VPU_37XX_BUTTRESS_ATS_ERR_CLEAR 0x000000a8u - -#define VPU_37XX_BUTTRESS_UFI_ERR_LOG 0x000000b0u -#define VPU_37XX_BUTTRESS_UFI_ERR_LOG_CQ_ID_MASK GENMASK(11, 0) -#define VPU_37XX_BUTTRESS_UFI_ERR_LOG_AXI_ID_MASK GENMASK(19, 12) -#define VPU_37XX_BUTTRESS_UFI_ERR_LOG_OPCODE_MASK GENMASK(24, 20) - -#define VPU_37XX_BUTTRESS_UFI_ERR_CLEAR 0x000000b4u - #define VPU_37XX_HOST_SS_CPR_CLK_SET 0x00000084u #define VPU_37XX_HOST_SS_CPR_CLK_SET_TOP_NOC_MASK BIT_MASK(1) #define VPU_37XX_HOST_SS_CPR_CLK_SET_DSS_MAS_MASK BIT_MASK(10) diff --git a/drivers/accel/ivpu/ivpu_hw_40xx.c b/drivers/accel/ivpu/ivpu_hw_40xx.c deleted file mode 100644 index a1523d0b1ef3..000000000000 --- a/drivers/accel/ivpu/ivpu_hw_40xx.c +++ /dev/null @@ -1,1244 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2020-2023 Intel 
Corporation - */ - -#include "ivpu_drv.h" -#include "ivpu_fw.h" -#include "ivpu_hw.h" -#include "ivpu_hw_40xx_reg.h" -#include "ivpu_hw_reg_io.h" -#include "ivpu_ipc.h" -#include "ivpu_mmu.h" -#include "ivpu_pm.h" - -#include <linux/dmi.h> - -#define TILE_MAX_NUM 6 -#define TILE_MAX_MASK 0x3f - -#define LNL_HW_ID 0x4040 - -#define SKU_TILE_SHIFT 0u -#define SKU_TILE_MASK 0x0000ffffu -#define SKU_HW_ID_SHIFT 16u -#define SKU_HW_ID_MASK 0xffff0000u - -#define PLL_CONFIG_DEFAULT 0x0 -#define PLL_CDYN_DEFAULT 0x80 -#define PLL_EPP_DEFAULT 0x80 -#define PLL_REF_CLK_FREQ (50 * 1000000) -#define PLL_RATIO_TO_FREQ(x) ((x) * PLL_REF_CLK_FREQ) - -#define PLL_PROFILING_FREQ_DEFAULT 38400000 -#define PLL_PROFILING_FREQ_HIGH 400000000 - -#define TIM_SAFE_ENABLE 0xf1d0dead -#define TIM_WATCHDOG_RESET_VALUE 0xffffffff - -#define TIMEOUT_US (150 * USEC_PER_MSEC) -#define PWR_ISLAND_STATUS_TIMEOUT_US (5 * USEC_PER_MSEC) -#define PLL_TIMEOUT_US (1500 * USEC_PER_MSEC) -#define IDLE_TIMEOUT_US (5 * USEC_PER_MSEC) - -#define WEIGHTS_DEFAULT 0xf711f711u -#define WEIGHTS_ATS_DEFAULT 0x0000f711u - -#define ICB_0_IRQ_MASK ((REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, HOST_IPC_FIFO_INT)) | \ - (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_0_INT)) | \ - (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_1_INT)) | \ - (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_2_INT)) | \ - (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT)) | \ - (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT)) | \ - (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT))) - -#define ICB_1_IRQ_MASK ((REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_2_INT)) | \ - (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_3_INT)) | \ - (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_4_INT))) - -#define ICB_0_1_IRQ_MASK ((((u64)ICB_1_IRQ_MASK) << 32) | ICB_0_IRQ_MASK) - -#define BUTTRESS_IRQ_MASK ((REG_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, ATS_ERR)) | \ - (REG_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, CFI0_ERR)) | \ - (REG_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, CFI1_ERR)) | \ - (REG_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, IMR0_ERR)) | \ - (REG_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, IMR1_ERR)) | \ - (REG_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, SURV_ERR))) - -#define BUTTRESS_IRQ_ENABLE_MASK ((u32)~BUTTRESS_IRQ_MASK) -#define BUTTRESS_IRQ_DISABLE_MASK ((u32)-1) - -#define ITF_FIREWALL_VIOLATION_MASK ((REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, CSS_ROM_CMX)) | \ - (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, CSS_DBG)) | \ - (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, CSS_CTRL)) | \ - (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, DEC400)) | \ - (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, MSS_NCE)) | \ - (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, MSS_MBI)) | \ - (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, MSS_MBI_CMX))) - -static char *ivpu_platform_to_str(u32 platform) -{ - switch (platform) { - case IVPU_PLATFORM_SILICON: - return "IVPU_PLATFORM_SILICON"; - case IVPU_PLATFORM_SIMICS: - return "IVPU_PLATFORM_SIMICS"; - case IVPU_PLATFORM_FPGA: - return "IVPU_PLATFORM_FPGA"; - default: - return "Invalid platform"; - } -} - -static const struct dmi_system_id ivpu_dmi_platform_simulation[] = { - { - .ident = "Intel Simics", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "lnlrvp"), - DMI_MATCH(DMI_BOARD_VERSION, "1.0"), - DMI_MATCH(DMI_BOARD_SERIAL, "123456789"), - }, - }, - { - .ident = "Intel Simics", - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "Simics"), - }, - }, - { } -}; - -static void ivpu_hw_read_platform(struct ivpu_device *vdev) -{ - if 
(dmi_check_system(ivpu_dmi_platform_simulation)) - vdev->platform = IVPU_PLATFORM_SIMICS; - else - vdev->platform = IVPU_PLATFORM_SILICON; - - ivpu_dbg(vdev, MISC, "Platform type: %s (%d)\n", - ivpu_platform_to_str(vdev->platform), vdev->platform); -} - -static void ivpu_hw_wa_init(struct ivpu_device *vdev) -{ - vdev->wa.punit_disabled = ivpu_is_fpga(vdev); - vdev->wa.clear_runtime_mem = false; - - if (ivpu_hw_gen(vdev) == IVPU_HW_40XX) - vdev->wa.disable_clock_relinquish = true; - - IVPU_PRINT_WA(punit_disabled); - IVPU_PRINT_WA(clear_runtime_mem); - IVPU_PRINT_WA(disable_clock_relinquish); -} - -static void ivpu_hw_timeouts_init(struct ivpu_device *vdev) -{ - if (ivpu_is_fpga(vdev)) { - vdev->timeout.boot = 100000; - vdev->timeout.jsm = 50000; - vdev->timeout.tdr = 2000000; - vdev->timeout.reschedule_suspend = 1000; - vdev->timeout.autosuspend = -1; - vdev->timeout.d0i3_entry_msg = 500; - } else if (ivpu_is_simics(vdev)) { - vdev->timeout.boot = 50; - vdev->timeout.jsm = 500; - vdev->timeout.tdr = 10000; - vdev->timeout.reschedule_suspend = 10; - vdev->timeout.autosuspend = -1; - vdev->timeout.d0i3_entry_msg = 100; - } else { - vdev->timeout.boot = 1000; - vdev->timeout.jsm = 500; - vdev->timeout.tdr = 2000; - vdev->timeout.reschedule_suspend = 10; - vdev->timeout.autosuspend = 10; - vdev->timeout.d0i3_entry_msg = 5; - } -} - -static int ivpu_pll_wait_for_cmd_send(struct ivpu_device *vdev) -{ - return REGB_POLL_FLD(VPU_40XX_BUTTRESS_WP_REQ_CMD, SEND, 0, PLL_TIMEOUT_US); -} - -static int ivpu_pll_cmd_send(struct ivpu_device *vdev, u16 min_ratio, u16 max_ratio, - u16 target_ratio, u16 epp, u16 config, u16 cdyn) -{ - int ret; - u32 val; - - ret = ivpu_pll_wait_for_cmd_send(vdev); - if (ret) { - ivpu_err(vdev, "Failed to sync before WP request: %d\n", ret); - return ret; - } - - val = REGB_RD32(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD0); - val = REG_SET_FLD_NUM(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD0, MIN_RATIO, min_ratio, val); - val = REG_SET_FLD_NUM(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD0, MAX_RATIO, max_ratio, val); - REGB_WR32(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD0, val); - - val = REGB_RD32(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD1); - val = REG_SET_FLD_NUM(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD1, TARGET_RATIO, target_ratio, val); - val = REG_SET_FLD_NUM(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD1, EPP, epp, val); - REGB_WR32(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD1, val); - - val = REGB_RD32(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD2); - val = REG_SET_FLD_NUM(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD2, CONFIG, config, val); - val = REG_SET_FLD_NUM(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD2, CDYN, cdyn, val); - REGB_WR32(VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD2, val); - - val = REGB_RD32(VPU_40XX_BUTTRESS_WP_REQ_CMD); - val = REG_SET_FLD(VPU_40XX_BUTTRESS_WP_REQ_CMD, SEND, val); - REGB_WR32(VPU_40XX_BUTTRESS_WP_REQ_CMD, val); - - ret = ivpu_pll_wait_for_cmd_send(vdev); - if (ret) - ivpu_err(vdev, "Failed to sync after WP request: %d\n", ret); - - return ret; -} - -static int ivpu_pll_wait_for_status_ready(struct ivpu_device *vdev) -{ - return REGB_POLL_FLD(VPU_40XX_BUTTRESS_VPU_STATUS, READY, 1, PLL_TIMEOUT_US); -} - -static int ivpu_wait_for_clock_own_resource_ack(struct ivpu_device *vdev) -{ - if (ivpu_is_simics(vdev)) - return 0; - - return REGB_POLL_FLD(VPU_40XX_BUTTRESS_VPU_STATUS, CLOCK_RESOURCE_OWN_ACK, 1, TIMEOUT_US); -} - -static void ivpu_pll_init_frequency_ratios(struct ivpu_device *vdev) -{ - struct ivpu_hw_info *hw = vdev->hw; - u8 fuse_min_ratio, fuse_pn_ratio, fuse_max_ratio; - u32 fmin_fuse, fmax_fuse; - - fmin_fuse = REGB_RD32(VPU_40XX_BUTTRESS_FMIN_FUSE); 
- fuse_min_ratio = REG_GET_FLD(VPU_40XX_BUTTRESS_FMIN_FUSE, MIN_RATIO, fmin_fuse); - fuse_pn_ratio = REG_GET_FLD(VPU_40XX_BUTTRESS_FMIN_FUSE, PN_RATIO, fmin_fuse); - - fmax_fuse = REGB_RD32(VPU_40XX_BUTTRESS_FMAX_FUSE); - fuse_max_ratio = REG_GET_FLD(VPU_40XX_BUTTRESS_FMAX_FUSE, MAX_RATIO, fmax_fuse); - - hw->pll.min_ratio = clamp_t(u8, ivpu_pll_min_ratio, fuse_min_ratio, fuse_max_ratio); - hw->pll.max_ratio = clamp_t(u8, ivpu_pll_max_ratio, hw->pll.min_ratio, fuse_max_ratio); - hw->pll.pn_ratio = clamp_t(u8, fuse_pn_ratio, hw->pll.min_ratio, hw->pll.max_ratio); -} - -static int ivpu_pll_drive(struct ivpu_device *vdev, bool enable) -{ - u16 config = enable ? PLL_CONFIG_DEFAULT : 0; - u16 cdyn = enable ? PLL_CDYN_DEFAULT : 0; - u16 epp = enable ? PLL_EPP_DEFAULT : 0; - struct ivpu_hw_info *hw = vdev->hw; - u16 target_ratio = hw->pll.pn_ratio; - int ret; - - ivpu_dbg(vdev, PM, "PLL workpoint request: %u Hz, epp: 0x%x, config: 0x%x, cdyn: 0x%x\n", - PLL_RATIO_TO_FREQ(target_ratio), epp, config, cdyn); - - ret = ivpu_pll_cmd_send(vdev, hw->pll.min_ratio, hw->pll.max_ratio, - target_ratio, epp, config, cdyn); - if (ret) { - ivpu_err(vdev, "Failed to send PLL workpoint request: %d\n", ret); - return ret; - } - - if (enable) { - ret = ivpu_pll_wait_for_status_ready(vdev); - if (ret) { - ivpu_err(vdev, "Timed out waiting for PLL ready status\n"); - return ret; - } - } - - return 0; -} - -static int ivpu_pll_enable(struct ivpu_device *vdev) -{ - return ivpu_pll_drive(vdev, true); -} - -static int ivpu_pll_disable(struct ivpu_device *vdev) -{ - return ivpu_pll_drive(vdev, false); -} - -static void ivpu_boot_host_ss_rst_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_SS_CPR_RST_EN); - - if (enable) { - val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, TOP_NOC, val); - val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, DSS_MAS, val); - val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, CSS_MAS, val); - } else { - val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, TOP_NOC, val); - val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, DSS_MAS, val); - val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, CSS_MAS, val); - } - - REGV_WR32(VPU_40XX_HOST_SS_CPR_RST_EN, val); -} - -static void ivpu_boot_host_ss_clk_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_SS_CPR_CLK_EN); - - if (enable) { - val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, TOP_NOC, val); - val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, DSS_MAS, val); - val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, CSS_MAS, val); - } else { - val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, TOP_NOC, val); - val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, DSS_MAS, val); - val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, CSS_MAS, val); - } - - REGV_WR32(VPU_40XX_HOST_SS_CPR_CLK_EN, val); -} - -static int ivpu_boot_noc_qreqn_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_SS_NOC_QREQN); - - if (!REG_TEST_FLD_NUM(VPU_40XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_noc_qacceptn_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_SS_NOC_QACCEPTN); - - if (!REG_TEST_FLD_NUM(VPU_40XX_HOST_SS_NOC_QACCEPTN, TOP_SOCMMIO, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_noc_qdeny_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_SS_NOC_QDENY); - - if (!REG_TEST_FLD_NUM(VPU_40XX_HOST_SS_NOC_QDENY, TOP_SOCMMIO, exp_val, val)) - return -EIO; - - 
return 0; -} - -static int ivpu_boot_top_noc_qrenqn_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_40XX_TOP_NOC_QREQN); - - if (!REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QREQN, CPU_CTRL, exp_val, val) || - !REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_top_noc_qacceptn_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_40XX_TOP_NOC_QACCEPTN); - - if (!REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QACCEPTN, CPU_CTRL, exp_val, val) || - !REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QACCEPTN, HOSTIF_L2CACHE, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_top_noc_qdeny_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_40XX_TOP_NOC_QDENY); - - if (!REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QDENY, CPU_CTRL, exp_val, val) || - !REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QDENY, HOSTIF_L2CACHE, exp_val, val)) - return -EIO; - - return 0; -} - -static void ivpu_boot_idle_gen_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_SS_AON_IDLE_GEN); - - if (enable) - val = REG_SET_FLD(VPU_40XX_HOST_SS_AON_IDLE_GEN, EN, val); - else - val = REG_CLR_FLD(VPU_40XX_HOST_SS_AON_IDLE_GEN, EN, val); - - REGV_WR32(VPU_40XX_HOST_SS_AON_IDLE_GEN, val); -} - -static int ivpu_boot_host_ss_check(struct ivpu_device *vdev) -{ - int ret; - - ret = ivpu_boot_noc_qreqn_check(vdev, 0x0); - if (ret) { - ivpu_err(vdev, "Failed qreqn check: %d\n", ret); - return ret; - } - - ret = ivpu_boot_noc_qacceptn_check(vdev, 0x0); - if (ret) { - ivpu_err(vdev, "Failed qacceptn check: %d\n", ret); - return ret; - } - - ret = ivpu_boot_noc_qdeny_check(vdev, 0x0); - if (ret) - ivpu_err(vdev, "Failed qdeny check %d\n", ret); - - return ret; -} - -static int ivpu_boot_host_ss_axi_drive(struct ivpu_device *vdev, bool enable) -{ - int ret; - u32 val; - - val = REGV_RD32(VPU_40XX_HOST_SS_NOC_QREQN); - if (enable) - val = REG_SET_FLD(VPU_40XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, val); - else - val = REG_CLR_FLD(VPU_40XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, val); - REGV_WR32(VPU_40XX_HOST_SS_NOC_QREQN, val); - - ret = ivpu_boot_noc_qacceptn_check(vdev, enable ? 0x1 : 0x0); - if (ret) { - ivpu_err(vdev, "Failed qacceptn check: %d\n", ret); - return ret; - } - - ret = ivpu_boot_noc_qdeny_check(vdev, 0x0); - if (ret) { - ivpu_err(vdev, "Failed qdeny check: %d\n", ret); - return ret; - } - - if (enable) { - REGB_WR32(VPU_40XX_BUTTRESS_PORT_ARBITRATION_WEIGHTS, WEIGHTS_DEFAULT); - REGB_WR32(VPU_40XX_BUTTRESS_PORT_ARBITRATION_WEIGHTS_ATS, WEIGHTS_ATS_DEFAULT); - } - - return ret; -} - -static int ivpu_boot_host_ss_axi_enable(struct ivpu_device *vdev) -{ - return ivpu_boot_host_ss_axi_drive(vdev, true); -} - -static int ivpu_boot_host_ss_top_noc_drive(struct ivpu_device *vdev, bool enable) -{ - int ret; - u32 val; - - val = REGV_RD32(VPU_40XX_TOP_NOC_QREQN); - if (enable) { - val = REG_SET_FLD(VPU_40XX_TOP_NOC_QREQN, CPU_CTRL, val); - val = REG_SET_FLD(VPU_40XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, val); - } else { - val = REG_CLR_FLD(VPU_40XX_TOP_NOC_QREQN, CPU_CTRL, val); - val = REG_CLR_FLD(VPU_40XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, val); - } - REGV_WR32(VPU_40XX_TOP_NOC_QREQN, val); - - ret = ivpu_boot_top_noc_qacceptn_check(vdev, enable ? 
0x1 : 0x0); - if (ret) { - ivpu_err(vdev, "Failed qacceptn check: %d\n", ret); - return ret; - } - - ret = ivpu_boot_top_noc_qdeny_check(vdev, 0x0); - if (ret) - ivpu_err(vdev, "Failed qdeny check: %d\n", ret); - - return ret; -} - -static int ivpu_boot_host_ss_top_noc_enable(struct ivpu_device *vdev) -{ - return ivpu_boot_host_ss_top_noc_drive(vdev, true); -} - -static void ivpu_boot_pwr_island_trickle_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0); - - if (enable) - val = REG_SET_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, CSS_CPU, val); - else - val = REG_CLR_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, CSS_CPU, val); - - REGV_WR32(VPU_40XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, val); - - if (enable) - ndelay(500); -} - -static void ivpu_boot_pwr_island_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_SS_AON_PWR_ISLAND_EN0); - - if (enable) - val = REG_SET_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_EN0, CSS_CPU, val); - else - val = REG_CLR_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_EN0, CSS_CPU, val); - - REGV_WR32(VPU_40XX_HOST_SS_AON_PWR_ISLAND_EN0, val); - - if (!enable) - ndelay(500); -} - -static int ivpu_boot_wait_for_pwr_island_status(struct ivpu_device *vdev, u32 exp_val) -{ - if (ivpu_is_fpga(vdev)) - return 0; - - return REGV_POLL_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_STATUS0, CSS_CPU, - exp_val, PWR_ISLAND_STATUS_TIMEOUT_US); -} - -static void ivpu_boot_pwr_island_isolation_drive(struct ivpu_device *vdev, bool enable) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_SS_AON_PWR_ISO_EN0); - - if (enable) - val = REG_SET_FLD(VPU_40XX_HOST_SS_AON_PWR_ISO_EN0, CSS_CPU, val); - else - val = REG_CLR_FLD(VPU_40XX_HOST_SS_AON_PWR_ISO_EN0, CSS_CPU, val); - - REGV_WR32(VPU_40XX_HOST_SS_AON_PWR_ISO_EN0, val); -} - -static void ivpu_boot_no_snoop_enable(struct ivpu_device *vdev) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES); - - val = REG_SET_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, SNOOP_OVERRIDE_EN, val); - val = REG_SET_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, AW_SNOOP_OVERRIDE, val); - val = REG_CLR_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, AR_SNOOP_OVERRIDE, val); - - REGV_WR32(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, val); -} - -static void ivpu_boot_tbu_mmu_enable(struct ivpu_device *vdev) -{ - u32 val = REGV_RD32(VPU_40XX_HOST_IF_TBU_MMUSSIDV); - - val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU0_AWMMUSSIDV, val); - val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU0_ARMMUSSIDV, val); - val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU1_AWMMUSSIDV, val); - val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU1_ARMMUSSIDV, val); - val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU2_AWMMUSSIDV, val); - val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU2_ARMMUSSIDV, val); - - REGV_WR32(VPU_40XX_HOST_IF_TBU_MMUSSIDV, val); -} - -static int ivpu_boot_cpu_noc_qacceptn_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_40XX_CPU_SS_CPR_NOC_QACCEPTN); - - if (!REG_TEST_FLD_NUM(VPU_40XX_CPU_SS_CPR_NOC_QACCEPTN, TOP_MMIO, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_cpu_noc_qdeny_check(struct ivpu_device *vdev, u32 exp_val) -{ - u32 val = REGV_RD32(VPU_40XX_CPU_SS_CPR_NOC_QDENY); - - if (!REG_TEST_FLD_NUM(VPU_40XX_CPU_SS_CPR_NOC_QDENY, TOP_MMIO, exp_val, val)) - return -EIO; - - return 0; -} - -static int ivpu_boot_pwr_domain_enable(struct ivpu_device *vdev) -{ - int ret; - - ret = ivpu_wait_for_clock_own_resource_ack(vdev); - if (ret) { - 
ivpu_err(vdev, "Timed out waiting for clock own resource ACK\n"); - return ret; - } - - ivpu_boot_pwr_island_trickle_drive(vdev, true); - ivpu_boot_pwr_island_drive(vdev, true); - - ret = ivpu_boot_wait_for_pwr_island_status(vdev, 0x1); - if (ret) { - ivpu_err(vdev, "Timed out waiting for power island status\n"); - return ret; - } - - ret = ivpu_boot_top_noc_qrenqn_check(vdev, 0x0); - if (ret) { - ivpu_err(vdev, "Failed qrenqn check %d\n", ret); - return ret; - } - - ivpu_boot_host_ss_clk_drive(vdev, true); - ivpu_boot_host_ss_rst_drive(vdev, true); - ivpu_boot_pwr_island_isolation_drive(vdev, false); - - return ret; -} - -static int ivpu_boot_soc_cpu_drive(struct ivpu_device *vdev, bool enable) -{ - int ret; - u32 val; - - val = REGV_RD32(VPU_40XX_CPU_SS_CPR_NOC_QREQN); - if (enable) - val = REG_SET_FLD(VPU_40XX_CPU_SS_CPR_NOC_QREQN, TOP_MMIO, val); - else - val = REG_CLR_FLD(VPU_40XX_CPU_SS_CPR_NOC_QREQN, TOP_MMIO, val); - REGV_WR32(VPU_40XX_CPU_SS_CPR_NOC_QREQN, val); - - ret = ivpu_boot_cpu_noc_qacceptn_check(vdev, enable ? 0x1 : 0x0); - if (ret) { - ivpu_err(vdev, "Failed qacceptn check: %d\n", ret); - return ret; - } - - ret = ivpu_boot_cpu_noc_qdeny_check(vdev, 0x0); - if (ret) - ivpu_err(vdev, "Failed qdeny check: %d\n", ret); - - return ret; -} - -static int ivpu_boot_soc_cpu_enable(struct ivpu_device *vdev) -{ - return ivpu_boot_soc_cpu_drive(vdev, true); -} - -static int ivpu_boot_soc_cpu_boot(struct ivpu_device *vdev) -{ - int ret; - u32 val; - u64 val64; - - ret = ivpu_boot_soc_cpu_enable(vdev); - if (ret) { - ivpu_err(vdev, "Failed to enable SOC CPU: %d\n", ret); - return ret; - } - - val64 = vdev->fw->entry_point; - val64 <<= ffs(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO_IMAGE_LOCATION_MASK) - 1; - REGV_WR64(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO, val64); - - val = REGV_RD32(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO); - val = REG_SET_FLD(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO, DONE, val); - REGV_WR32(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO, val); - - ivpu_dbg(vdev, PM, "Booting firmware, mode: %s\n", - ivpu_fw_is_cold_boot(vdev) ? 
"cold boot" : "resume"); - - return 0; -} - -static int ivpu_boot_d0i3_drive(struct ivpu_device *vdev, bool enable) -{ - int ret; - u32 val; - - ret = REGB_POLL_FLD(VPU_40XX_BUTTRESS_D0I3_CONTROL, INPROGRESS, 0, TIMEOUT_US); - if (ret) { - ivpu_err(vdev, "Failed to sync before D0i3 transition: %d\n", ret); - return ret; - } - - val = REGB_RD32(VPU_40XX_BUTTRESS_D0I3_CONTROL); - if (enable) - val = REG_SET_FLD(VPU_40XX_BUTTRESS_D0I3_CONTROL, I3, val); - else - val = REG_CLR_FLD(VPU_40XX_BUTTRESS_D0I3_CONTROL, I3, val); - REGB_WR32(VPU_40XX_BUTTRESS_D0I3_CONTROL, val); - - ret = REGB_POLL_FLD(VPU_40XX_BUTTRESS_D0I3_CONTROL, INPROGRESS, 0, TIMEOUT_US); - if (ret) { - ivpu_err(vdev, "Failed to sync after D0i3 transition: %d\n", ret); - return ret; - } - - return 0; -} - -static bool ivpu_tile_disable_check(u32 config) -{ - /* Allowed values: 0 or one bit from range 0-5 (6 tiles) */ - if (config == 0) - return true; - - if (config > BIT(TILE_MAX_NUM - 1)) - return false; - - if ((config & (config - 1)) == 0) - return true; - - return false; -} - -static int ivpu_hw_40xx_info_init(struct ivpu_device *vdev) -{ - struct ivpu_hw_info *hw = vdev->hw; - u32 tile_disable; - u32 fuse; - - fuse = REGB_RD32(VPU_40XX_BUTTRESS_TILE_FUSE); - if (!REG_TEST_FLD(VPU_40XX_BUTTRESS_TILE_FUSE, VALID, fuse)) { - ivpu_err(vdev, "Fuse: invalid (0x%x)\n", fuse); - return -EIO; - } - - tile_disable = REG_GET_FLD(VPU_40XX_BUTTRESS_TILE_FUSE, CONFIG, fuse); - if (!ivpu_tile_disable_check(tile_disable)) { - ivpu_err(vdev, "Fuse: Invalid tile disable config (0x%x)\n", tile_disable); - return -EIO; - } - - if (tile_disable) - ivpu_dbg(vdev, MISC, "Fuse: %d tiles enabled. Tile number %d disabled\n", - TILE_MAX_NUM - 1, ffs(tile_disable) - 1); - else - ivpu_dbg(vdev, MISC, "Fuse: All %d tiles enabled\n", TILE_MAX_NUM); - - hw->tile_fuse = tile_disable; - hw->pll.profiling_freq = PLL_PROFILING_FREQ_DEFAULT; - - ivpu_pll_init_frequency_ratios(vdev); - - ivpu_hw_init_range(&vdev->hw->ranges.global, 0x80000000, SZ_512M); - ivpu_hw_init_range(&vdev->hw->ranges.user, 0x80000000, SZ_256M); - ivpu_hw_init_range(&vdev->hw->ranges.shave, 0x80000000 + SZ_256M, SZ_2G - SZ_256M); - ivpu_hw_init_range(&vdev->hw->ranges.dma, 0x200000000, SZ_8G); - - ivpu_hw_read_platform(vdev); - ivpu_hw_wa_init(vdev); - ivpu_hw_timeouts_init(vdev); - - return 0; -} - -static int ivpu_hw_40xx_ip_reset(struct ivpu_device *vdev) -{ - int ret; - u32 val; - - ret = REGB_POLL_FLD(VPU_40XX_BUTTRESS_IP_RESET, TRIGGER, 0, TIMEOUT_US); - if (ret) { - ivpu_err(vdev, "Wait for *_TRIGGER timed out\n"); - return ret; - } - - val = REGB_RD32(VPU_40XX_BUTTRESS_IP_RESET); - val = REG_SET_FLD(VPU_40XX_BUTTRESS_IP_RESET, TRIGGER, val); - REGB_WR32(VPU_40XX_BUTTRESS_IP_RESET, val); - - ret = REGB_POLL_FLD(VPU_40XX_BUTTRESS_IP_RESET, TRIGGER, 0, TIMEOUT_US); - if (ret) - ivpu_err(vdev, "Timed out waiting for RESET completion\n"); - - return ret; -} - -static int ivpu_hw_40xx_reset(struct ivpu_device *vdev) -{ - int ret = 0; - - if (ivpu_hw_40xx_ip_reset(vdev)) { - ivpu_err(vdev, "Failed to reset VPU IP\n"); - ret = -EIO; - } - - if (ivpu_pll_disable(vdev)) { - ivpu_err(vdev, "Failed to disable PLL\n"); - ret = -EIO; - } - - return ret; -} - -static int ivpu_hw_40xx_d0i3_enable(struct ivpu_device *vdev) -{ - int ret; - - if (IVPU_WA(punit_disabled)) - return 0; - - ret = ivpu_boot_d0i3_drive(vdev, true); - if (ret) - ivpu_err(vdev, "Failed to enable D0i3: %d\n", ret); - - udelay(5); /* VPU requires 5 us to complete the transition */ - - return ret; -} - -static int 
ivpu_hw_40xx_d0i3_disable(struct ivpu_device *vdev) -{ - int ret; - - if (IVPU_WA(punit_disabled)) - return 0; - - ret = ivpu_boot_d0i3_drive(vdev, false); - if (ret) - ivpu_err(vdev, "Failed to disable D0i3: %d\n", ret); - - return ret; -} - -static void ivpu_hw_40xx_profiling_freq_reg_set(struct ivpu_device *vdev) -{ - u32 val = REGB_RD32(VPU_40XX_BUTTRESS_VPU_STATUS); - - if (vdev->hw->pll.profiling_freq == PLL_PROFILING_FREQ_DEFAULT) - val = REG_CLR_FLD(VPU_40XX_BUTTRESS_VPU_STATUS, PERF_CLK, val); - else - val = REG_SET_FLD(VPU_40XX_BUTTRESS_VPU_STATUS, PERF_CLK, val); - - REGB_WR32(VPU_40XX_BUTTRESS_VPU_STATUS, val); -} - -static void ivpu_hw_40xx_ats_print(struct ivpu_device *vdev) -{ - ivpu_dbg(vdev, MISC, "Buttress ATS: %s\n", - REGB_RD32(VPU_40XX_BUTTRESS_HM_ATS) ? "Enable" : "Disable"); -} - -static void ivpu_hw_40xx_clock_relinquish_disable(struct ivpu_device *vdev) -{ - u32 val = REGB_RD32(VPU_40XX_BUTTRESS_VPU_STATUS); - - val = REG_SET_FLD(VPU_40XX_BUTTRESS_VPU_STATUS, DISABLE_CLK_RELINQUISH, val); - REGB_WR32(VPU_40XX_BUTTRESS_VPU_STATUS, val); -} - -static int ivpu_hw_40xx_power_up(struct ivpu_device *vdev) -{ - int ret; - - ret = ivpu_hw_40xx_d0i3_disable(vdev); - if (ret) - ivpu_warn(vdev, "Failed to disable D0I3: %d\n", ret); - - ret = ivpu_pll_enable(vdev); - if (ret) { - ivpu_err(vdev, "Failed to enable PLL: %d\n", ret); - return ret; - } - - if (IVPU_WA(disable_clock_relinquish)) - ivpu_hw_40xx_clock_relinquish_disable(vdev); - ivpu_hw_40xx_profiling_freq_reg_set(vdev); - ivpu_hw_40xx_ats_print(vdev); - - ret = ivpu_boot_host_ss_check(vdev); - if (ret) { - ivpu_err(vdev, "Failed to configure host SS: %d\n", ret); - return ret; - } - - ivpu_boot_idle_gen_drive(vdev, false); - - ret = ivpu_boot_pwr_domain_enable(vdev); - if (ret) { - ivpu_err(vdev, "Failed to enable power domain: %d\n", ret); - return ret; - } - - ret = ivpu_boot_host_ss_axi_enable(vdev); - if (ret) { - ivpu_err(vdev, "Failed to enable AXI: %d\n", ret); - return ret; - } - - ret = ivpu_boot_host_ss_top_noc_enable(vdev); - if (ret) - ivpu_err(vdev, "Failed to enable TOP NOC: %d\n", ret); - - return ret; -} - -static int ivpu_hw_40xx_boot_fw(struct ivpu_device *vdev) -{ - int ret; - - ivpu_boot_no_snoop_enable(vdev); - ivpu_boot_tbu_mmu_enable(vdev); - - ret = ivpu_boot_soc_cpu_boot(vdev); - if (ret) - ivpu_err(vdev, "Failed to boot SOC CPU: %d\n", ret); - - return ret; -} - -static bool ivpu_hw_40xx_is_idle(struct ivpu_device *vdev) -{ - u32 val; - - if (IVPU_WA(punit_disabled)) - return true; - - val = REGB_RD32(VPU_40XX_BUTTRESS_VPU_STATUS); - return REG_TEST_FLD(VPU_40XX_BUTTRESS_VPU_STATUS, READY, val) && - REG_TEST_FLD(VPU_40XX_BUTTRESS_VPU_STATUS, IDLE, val); -} - -static int ivpu_hw_40xx_wait_for_idle(struct ivpu_device *vdev) -{ - return REGB_POLL_FLD(VPU_40XX_BUTTRESS_VPU_STATUS, IDLE, 0x1, IDLE_TIMEOUT_US); -} - -static void ivpu_hw_40xx_save_d0i3_entry_timestamp(struct ivpu_device *vdev) -{ - vdev->hw->d0i3_entry_host_ts = ktime_get_boottime(); - vdev->hw->d0i3_entry_vpu_ts = REGV_RD64(VPU_40XX_CPU_SS_TIM_PERF_EXT_FREE_CNT); -} - -static int ivpu_hw_40xx_power_down(struct ivpu_device *vdev) -{ - int ret = 0; - - ivpu_hw_40xx_save_d0i3_entry_timestamp(vdev); - - if (!ivpu_hw_40xx_is_idle(vdev) && ivpu_hw_40xx_ip_reset(vdev)) - ivpu_warn(vdev, "Failed to reset the VPU\n"); - - if (ivpu_pll_disable(vdev)) { - ivpu_err(vdev, "Failed to disable PLL\n"); - ret = -EIO; - } - - if (ivpu_hw_40xx_d0i3_enable(vdev)) { - ivpu_err(vdev, "Failed to enter D0I3\n"); - ret = -EIO; - } - - return ret; -} - 
-static void ivpu_hw_40xx_wdt_disable(struct ivpu_device *vdev) -{ - u32 val; - - REGV_WR32(VPU_40XX_CPU_SS_TIM_SAFE, TIM_SAFE_ENABLE); - REGV_WR32(VPU_40XX_CPU_SS_TIM_WATCHDOG, TIM_WATCHDOG_RESET_VALUE); - - REGV_WR32(VPU_40XX_CPU_SS_TIM_SAFE, TIM_SAFE_ENABLE); - REGV_WR32(VPU_40XX_CPU_SS_TIM_WDOG_EN, 0); - - val = REGV_RD32(VPU_40XX_CPU_SS_TIM_GEN_CONFIG); - val = REG_CLR_FLD(VPU_40XX_CPU_SS_TIM_GEN_CONFIG, WDOG_TO_INT_CLR, val); - REGV_WR32(VPU_40XX_CPU_SS_TIM_GEN_CONFIG, val); -} - -static u32 ivpu_hw_40xx_profiling_freq_get(struct ivpu_device *vdev) -{ - return vdev->hw->pll.profiling_freq; -} - -static void ivpu_hw_40xx_profiling_freq_drive(struct ivpu_device *vdev, bool enable) -{ - if (enable) - vdev->hw->pll.profiling_freq = PLL_PROFILING_FREQ_HIGH; - else - vdev->hw->pll.profiling_freq = PLL_PROFILING_FREQ_DEFAULT; -} - -/* Register indirect accesses */ -static u32 ivpu_hw_40xx_reg_pll_freq_get(struct ivpu_device *vdev) -{ - u32 pll_curr_ratio; - - pll_curr_ratio = REGB_RD32(VPU_40XX_BUTTRESS_PLL_FREQ); - pll_curr_ratio &= VPU_40XX_BUTTRESS_PLL_FREQ_RATIO_MASK; - - return PLL_RATIO_TO_FREQ(pll_curr_ratio); -} - -static u32 ivpu_hw_40xx_reg_telemetry_offset_get(struct ivpu_device *vdev) -{ - return REGB_RD32(VPU_40XX_BUTTRESS_VPU_TELEMETRY_OFFSET); -} - -static u32 ivpu_hw_40xx_reg_telemetry_size_get(struct ivpu_device *vdev) -{ - return REGB_RD32(VPU_40XX_BUTTRESS_VPU_TELEMETRY_SIZE); -} - -static u32 ivpu_hw_40xx_reg_telemetry_enable_get(struct ivpu_device *vdev) -{ - return REGB_RD32(VPU_40XX_BUTTRESS_VPU_TELEMETRY_ENABLE); -} - -static void ivpu_hw_40xx_reg_db_set(struct ivpu_device *vdev, u32 db_id) -{ - u32 reg_stride = VPU_40XX_CPU_SS_DOORBELL_1 - VPU_40XX_CPU_SS_DOORBELL_0; - u32 val = REG_FLD(VPU_40XX_CPU_SS_DOORBELL_0, SET); - - REGV_WR32I(VPU_40XX_CPU_SS_DOORBELL_0, reg_stride, db_id, val); -} - -static u32 ivpu_hw_40xx_reg_ipc_rx_addr_get(struct ivpu_device *vdev) -{ - return REGV_RD32(VPU_40XX_HOST_SS_TIM_IPC_FIFO_ATM); -} - -static u32 ivpu_hw_40xx_reg_ipc_rx_count_get(struct ivpu_device *vdev) -{ - u32 count = REGV_RD32_SILENT(VPU_40XX_HOST_SS_TIM_IPC_FIFO_STAT); - - return REG_GET_FLD(VPU_40XX_HOST_SS_TIM_IPC_FIFO_STAT, FILL_LEVEL, count); -} - -static void ivpu_hw_40xx_reg_ipc_tx_set(struct ivpu_device *vdev, u32 vpu_addr) -{ - REGV_WR32(VPU_40XX_CPU_SS_TIM_IPC_FIFO, vpu_addr); -} - -static void ivpu_hw_40xx_irq_clear(struct ivpu_device *vdev) -{ - REGV_WR64(VPU_40XX_HOST_SS_ICB_CLEAR_0, ICB_0_1_IRQ_MASK); -} - -static void ivpu_hw_40xx_irq_enable(struct ivpu_device *vdev) -{ - REGV_WR32(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, ITF_FIREWALL_VIOLATION_MASK); - REGV_WR64(VPU_40XX_HOST_SS_ICB_ENABLE_0, ICB_0_1_IRQ_MASK); - REGB_WR32(VPU_40XX_BUTTRESS_LOCAL_INT_MASK, BUTTRESS_IRQ_ENABLE_MASK); - REGB_WR32(VPU_40XX_BUTTRESS_GLOBAL_INT_MASK, 0x0); -} - -static void ivpu_hw_40xx_irq_disable(struct ivpu_device *vdev) -{ - REGB_WR32(VPU_40XX_BUTTRESS_GLOBAL_INT_MASK, 0x1); - REGB_WR32(VPU_40XX_BUTTRESS_LOCAL_INT_MASK, BUTTRESS_IRQ_DISABLE_MASK); - REGV_WR64(VPU_40XX_HOST_SS_ICB_ENABLE_0, 0x0ull); - REGV_WR32(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, 0x0ul); -} - -static void ivpu_hw_40xx_irq_wdt_nce_handler(struct ivpu_device *vdev) -{ - /* TODO: For LNN hang consider engine reset instead of full recovery */ - ivpu_pm_trigger_recovery(vdev, "WDT NCE IRQ"); -} - -static void ivpu_hw_40xx_irq_wdt_mss_handler(struct ivpu_device *vdev) -{ - ivpu_hw_wdt_disable(vdev); - ivpu_pm_trigger_recovery(vdev, "WDT MSS IRQ"); -} - -static void ivpu_hw_40xx_irq_noc_firewall_handler(struct ivpu_device *vdev) 
-{ - ivpu_pm_trigger_recovery(vdev, "NOC Firewall IRQ"); -} - -/* Handler for IRQs from VPU core (irqV) */ -static bool ivpu_hw_40xx_irqv_handler(struct ivpu_device *vdev, int irq, bool *wake_thread) -{ - u32 status = REGV_RD32(VPU_40XX_HOST_SS_ICB_STATUS_0) & ICB_0_IRQ_MASK; - - if (!status) - return false; - - REGV_WR32(VPU_40XX_HOST_SS_ICB_CLEAR_0, status); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_0_INT, status)) - ivpu_mmu_irq_evtq_handler(vdev); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, HOST_IPC_FIFO_INT, status)) - ivpu_ipc_irq_handler(vdev, wake_thread); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_1_INT, status)) - ivpu_dbg(vdev, IRQ, "MMU sync complete\n"); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_2_INT, status)) - ivpu_mmu_irq_gerr_handler(vdev); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT, status)) - ivpu_hw_40xx_irq_wdt_mss_handler(vdev); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT, status)) - ivpu_hw_40xx_irq_wdt_nce_handler(vdev); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT, status)) - ivpu_hw_40xx_irq_noc_firewall_handler(vdev); - - return true; -} - -/* Handler for IRQs from Buttress core (irqB) */ -static bool ivpu_hw_40xx_irqb_handler(struct ivpu_device *vdev, int irq) -{ - bool schedule_recovery = false; - u32 status = REGB_RD32(VPU_40XX_BUTTRESS_INTERRUPT_STAT) & BUTTRESS_IRQ_MASK; - - if (!status) - return false; - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, FREQ_CHANGE, status)) - ivpu_dbg(vdev, IRQ, "FREQ_CHANGE"); - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, ATS_ERR, status)) { - ivpu_err(vdev, "ATS_ERR LOG1 0x%08x ATS_ERR_LOG2 0x%08x\n", - REGB_RD32(VPU_40XX_BUTTRESS_ATS_ERR_LOG1), - REGB_RD32(VPU_40XX_BUTTRESS_ATS_ERR_LOG2)); - REGB_WR32(VPU_40XX_BUTTRESS_ATS_ERR_CLEAR, 0x1); - schedule_recovery = true; - } - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, CFI0_ERR, status)) { - ivpu_err(vdev, "CFI0_ERR 0x%08x", REGB_RD32(VPU_40XX_BUTTRESS_CFI0_ERR_LOG)); - REGB_WR32(VPU_40XX_BUTTRESS_CFI0_ERR_CLEAR, 0x1); - schedule_recovery = true; - } - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, CFI1_ERR, status)) { - ivpu_err(vdev, "CFI1_ERR 0x%08x", REGB_RD32(VPU_40XX_BUTTRESS_CFI1_ERR_LOG)); - REGB_WR32(VPU_40XX_BUTTRESS_CFI1_ERR_CLEAR, 0x1); - schedule_recovery = true; - } - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, IMR0_ERR, status)) { - ivpu_err(vdev, "IMR_ERR_CFI0 LOW: 0x%08x HIGH: 0x%08x", - REGB_RD32(VPU_40XX_BUTTRESS_IMR_ERR_CFI0_LOW), - REGB_RD32(VPU_40XX_BUTTRESS_IMR_ERR_CFI0_HIGH)); - REGB_WR32(VPU_40XX_BUTTRESS_IMR_ERR_CFI0_CLEAR, 0x1); - schedule_recovery = true; - } - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, IMR1_ERR, status)) { - ivpu_err(vdev, "IMR_ERR_CFI1 LOW: 0x%08x HIGH: 0x%08x", - REGB_RD32(VPU_40XX_BUTTRESS_IMR_ERR_CFI1_LOW), - REGB_RD32(VPU_40XX_BUTTRESS_IMR_ERR_CFI1_HIGH)); - REGB_WR32(VPU_40XX_BUTTRESS_IMR_ERR_CFI1_CLEAR, 0x1); - schedule_recovery = true; - } - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, SURV_ERR, status)) { - ivpu_err(vdev, "Survivability error detected\n"); - schedule_recovery = true; - } - - /* This must be done after interrupts are cleared at the source. 
*/ - REGB_WR32(VPU_40XX_BUTTRESS_INTERRUPT_STAT, status); - - if (schedule_recovery) - ivpu_pm_trigger_recovery(vdev, "Buttress IRQ"); - - return true; -} - -static irqreturn_t ivpu_hw_40xx_irq_handler(int irq, void *ptr) -{ - bool irqv_handled, irqb_handled, wake_thread = false; - struct ivpu_device *vdev = ptr; - - REGB_WR32(VPU_40XX_BUTTRESS_GLOBAL_INT_MASK, 0x1); - - irqv_handled = ivpu_hw_40xx_irqv_handler(vdev, irq, &wake_thread); - irqb_handled = ivpu_hw_40xx_irqb_handler(vdev, irq); - - /* Re-enable global interrupts to re-trigger MSI for pending interrupts */ - REGB_WR32(VPU_40XX_BUTTRESS_GLOBAL_INT_MASK, 0x0); - - if (wake_thread) - return IRQ_WAKE_THREAD; - if (irqv_handled || irqb_handled) - return IRQ_HANDLED; - return IRQ_NONE; -} - -static void ivpu_hw_40xx_diagnose_failure(struct ivpu_device *vdev) -{ - u32 irqv = REGV_RD32(VPU_40XX_HOST_SS_ICB_STATUS_0) & ICB_0_IRQ_MASK; - u32 irqb = REGB_RD32(VPU_40XX_BUTTRESS_INTERRUPT_STAT) & BUTTRESS_IRQ_MASK; - - if (ivpu_hw_40xx_reg_ipc_rx_count_get(vdev)) - ivpu_err(vdev, "IPC FIFO queue not empty, missed IPC IRQ"); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT, irqv)) - ivpu_err(vdev, "WDT MSS timeout detected\n"); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT, irqv)) - ivpu_err(vdev, "WDT NCE timeout detected\n"); - - if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT, irqv)) - ivpu_err(vdev, "NOC Firewall irq detected\n"); - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, ATS_ERR, irqb)) { - ivpu_err(vdev, "ATS_ERR_LOG1 0x%08x ATS_ERR_LOG2 0x%08x\n", - REGB_RD32(VPU_40XX_BUTTRESS_ATS_ERR_LOG1), - REGB_RD32(VPU_40XX_BUTTRESS_ATS_ERR_LOG2)); - } - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, CFI0_ERR, irqb)) - ivpu_err(vdev, "CFI0_ERR_LOG 0x%08x\n", REGB_RD32(VPU_40XX_BUTTRESS_CFI0_ERR_LOG)); - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, CFI1_ERR, irqb)) - ivpu_err(vdev, "CFI1_ERR_LOG 0x%08x\n", REGB_RD32(VPU_40XX_BUTTRESS_CFI1_ERR_LOG)); - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, IMR0_ERR, irqb)) - ivpu_err(vdev, "IMR_ERR_CFI0 LOW: 0x%08x HIGH: 0x%08x\n", - REGB_RD32(VPU_40XX_BUTTRESS_IMR_ERR_CFI0_LOW), - REGB_RD32(VPU_40XX_BUTTRESS_IMR_ERR_CFI0_HIGH)); - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, IMR1_ERR, irqb)) - ivpu_err(vdev, "IMR_ERR_CFI1 LOW: 0x%08x HIGH: 0x%08x\n", - REGB_RD32(VPU_40XX_BUTTRESS_IMR_ERR_CFI1_LOW), - REGB_RD32(VPU_40XX_BUTTRESS_IMR_ERR_CFI1_HIGH)); - - if (REG_TEST_FLD(VPU_40XX_BUTTRESS_INTERRUPT_STAT, SURV_ERR, irqb)) - ivpu_err(vdev, "Survivability error detected\n"); -} - -const struct ivpu_hw_ops ivpu_hw_40xx_ops = { - .info_init = ivpu_hw_40xx_info_init, - .power_up = ivpu_hw_40xx_power_up, - .is_idle = ivpu_hw_40xx_is_idle, - .wait_for_idle = ivpu_hw_40xx_wait_for_idle, - .power_down = ivpu_hw_40xx_power_down, - .reset = ivpu_hw_40xx_reset, - .boot_fw = ivpu_hw_40xx_boot_fw, - .wdt_disable = ivpu_hw_40xx_wdt_disable, - .diagnose_failure = ivpu_hw_40xx_diagnose_failure, - .profiling_freq_get = ivpu_hw_40xx_profiling_freq_get, - .profiling_freq_drive = ivpu_hw_40xx_profiling_freq_drive, - .reg_pll_freq_get = ivpu_hw_40xx_reg_pll_freq_get, - .reg_telemetry_offset_get = ivpu_hw_40xx_reg_telemetry_offset_get, - .reg_telemetry_size_get = ivpu_hw_40xx_reg_telemetry_size_get, - .reg_telemetry_enable_get = ivpu_hw_40xx_reg_telemetry_enable_get, - .reg_db_set = ivpu_hw_40xx_reg_db_set, - .reg_ipc_rx_addr_get = ivpu_hw_40xx_reg_ipc_rx_addr_get, - .reg_ipc_rx_count_get = 
ivpu_hw_40xx_reg_ipc_rx_count_get, - .reg_ipc_tx_set = ivpu_hw_40xx_reg_ipc_tx_set, - .irq_clear = ivpu_hw_40xx_irq_clear, - .irq_enable = ivpu_hw_40xx_irq_enable, - .irq_disable = ivpu_hw_40xx_irq_disable, - .irq_handler = ivpu_hw_40xx_irq_handler, -}; diff --git a/drivers/accel/ivpu/ivpu_hw_40xx_reg.h b/drivers/accel/ivpu/ivpu_hw_40xx_reg.h index ff4a5d4f5821..fc0ee8d637f9 100644 --- a/drivers/accel/ivpu/ivpu_hw_40xx_reg.h +++ b/drivers/accel/ivpu/ivpu_hw_40xx_reg.h @@ -8,91 +8,6 @@ #include <linux/bits.h> -#define VPU_40XX_BUTTRESS_INTERRUPT_STAT 0x00000000u -#define VPU_40XX_BUTTRESS_INTERRUPT_STAT_FREQ_CHANGE_MASK BIT_MASK(0) -#define VPU_40XX_BUTTRESS_INTERRUPT_STAT_ATS_ERR_MASK BIT_MASK(1) -#define VPU_40XX_BUTTRESS_INTERRUPT_STAT_CFI0_ERR_MASK BIT_MASK(2) -#define VPU_40XX_BUTTRESS_INTERRUPT_STAT_CFI1_ERR_MASK BIT_MASK(3) -#define VPU_40XX_BUTTRESS_INTERRUPT_STAT_IMR0_ERR_MASK BIT_MASK(4) -#define VPU_40XX_BUTTRESS_INTERRUPT_STAT_IMR1_ERR_MASK BIT_MASK(5) -#define VPU_40XX_BUTTRESS_INTERRUPT_STAT_SURV_ERR_MASK BIT_MASK(6) - -#define VPU_40XX_BUTTRESS_LOCAL_INT_MASK 0x00000004u -#define VPU_40XX_BUTTRESS_GLOBAL_INT_MASK 0x00000008u - -#define VPU_40XX_BUTTRESS_HM_ATS 0x0000000cu - -#define VPU_40XX_BUTTRESS_ATS_ERR_LOG1 0x00000010u -#define VPU_40XX_BUTTRESS_ATS_ERR_LOG2 0x00000014u -#define VPU_40XX_BUTTRESS_ATS_ERR_CLEAR 0x00000018u - -#define VPU_40XX_BUTTRESS_CFI0_ERR_LOG 0x0000001cu -#define VPU_40XX_BUTTRESS_CFI0_ERR_CLEAR 0x00000020u - -#define VPU_40XX_BUTTRESS_PORT_ARBITRATION_WEIGHTS_ATS 0x00000024u - -#define VPU_40XX_BUTTRESS_CFI1_ERR_LOG 0x00000040u -#define VPU_40XX_BUTTRESS_CFI1_ERR_CLEAR 0x00000044u - -#define VPU_40XX_BUTTRESS_IMR_ERR_CFI0_LOW 0x00000048u -#define VPU_40XX_BUTTRESS_IMR_ERR_CFI0_HIGH 0x0000004cu -#define VPU_40XX_BUTTRESS_IMR_ERR_CFI0_CLEAR 0x00000050u - -#define VPU_40XX_BUTTRESS_PORT_ARBITRATION_WEIGHTS 0x00000054u - -#define VPU_40XX_BUTTRESS_IMR_ERR_CFI1_LOW 0x00000058u -#define VPU_40XX_BUTTRESS_IMR_ERR_CFI1_HIGH 0x0000005cu -#define VPU_40XX_BUTTRESS_IMR_ERR_CFI1_CLEAR 0x00000060u - -#define VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD0 0x00000130u -#define VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD0_MIN_RATIO_MASK GENMASK(15, 0) -#define VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD0_MAX_RATIO_MASK GENMASK(31, 16) - -#define VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD1 0x00000134u -#define VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD1_TARGET_RATIO_MASK GENMASK(15, 0) -#define VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD1_EPP_MASK GENMASK(31, 16) - -#define VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD2 0x00000138u -#define VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD2_CONFIG_MASK GENMASK(15, 0) -#define VPU_40XX_BUTTRESS_WP_REQ_PAYLOAD2_CDYN_MASK GENMASK(31, 16) - -#define VPU_40XX_BUTTRESS_WP_REQ_CMD 0x0000013cu -#define VPU_40XX_BUTTRESS_WP_REQ_CMD_SEND_MASK BIT_MASK(0) - -#define VPU_40XX_BUTTRESS_PLL_FREQ 0x00000148u -#define VPU_40XX_BUTTRESS_PLL_FREQ_RATIO_MASK GENMASK(15, 0) - -#define VPU_40XX_BUTTRESS_TILE_FUSE 0x00000150u -#define VPU_40XX_BUTTRESS_TILE_FUSE_VALID_MASK BIT_MASK(0) -#define VPU_40XX_BUTTRESS_TILE_FUSE_CONFIG_MASK GENMASK(6, 1) - -#define VPU_40XX_BUTTRESS_VPU_STATUS 0x00000154u -#define VPU_40XX_BUTTRESS_VPU_STATUS_READY_MASK BIT_MASK(0) -#define VPU_40XX_BUTTRESS_VPU_STATUS_IDLE_MASK BIT_MASK(1) -#define VPU_40XX_BUTTRESS_VPU_STATUS_DUP_IDLE_MASK BIT_MASK(2) -#define VPU_40XX_BUTTRESS_VPU_STATUS_CLOCK_RESOURCE_OWN_ACK_MASK BIT_MASK(6) -#define VPU_40XX_BUTTRESS_VPU_STATUS_POWER_RESOURCE_OWN_ACK_MASK BIT_MASK(7) -#define VPU_40XX_BUTTRESS_VPU_STATUS_PERF_CLK_MASK BIT_MASK(11) -#define 
VPU_40XX_BUTTRESS_VPU_STATUS_DISABLE_CLK_RELINQUISH_MASK BIT_MASK(12) - -#define VPU_40XX_BUTTRESS_IP_RESET 0x00000160u -#define VPU_40XX_BUTTRESS_IP_RESET_TRIGGER_MASK BIT_MASK(0) - -#define VPU_40XX_BUTTRESS_D0I3_CONTROL 0x00000164u -#define VPU_40XX_BUTTRESS_D0I3_CONTROL_INPROGRESS_MASK BIT_MASK(0) -#define VPU_40XX_BUTTRESS_D0I3_CONTROL_I3_MASK BIT_MASK(2) - -#define VPU_40XX_BUTTRESS_VPU_TELEMETRY_OFFSET 0x00000168u -#define VPU_40XX_BUTTRESS_VPU_TELEMETRY_SIZE 0x0000016cu -#define VPU_40XX_BUTTRESS_VPU_TELEMETRY_ENABLE 0x00000170u - -#define VPU_40XX_BUTTRESS_FMIN_FUSE 0x00000174u -#define VPU_40XX_BUTTRESS_FMIN_FUSE_MIN_RATIO_MASK GENMASK(7, 0) -#define VPU_40XX_BUTTRESS_FMIN_FUSE_PN_RATIO_MASK GENMASK(15, 8) - -#define VPU_40XX_BUTTRESS_FMAX_FUSE 0x00000178u -#define VPU_40XX_BUTTRESS_FMAX_FUSE_MAX_RATIO_MASK GENMASK(7, 0) - #define VPU_40XX_HOST_SS_CPR_CLK_EN 0x00000080u #define VPU_40XX_HOST_SS_CPR_CLK_EN_TOP_NOC_MASK BIT_MASK(1) #define VPU_40XX_HOST_SS_CPR_CLK_EN_DSS_MAS_MASK BIT_MASK(10) @@ -198,6 +113,14 @@ #define VPU_40XX_HOST_SS_AON_PWR_ISLAND_STATUS0 0x0003002cu #define VPU_40XX_HOST_SS_AON_PWR_ISLAND_STATUS0_CSS_CPU_MASK BIT_MASK(3) +#define VPU_50XX_HOST_SS_AON_PWR_ISLAND_EN_POST_DLY 0x00030068u +#define VPU_50XX_HOST_SS_AON_PWR_ISLAND_EN_POST_DLY_POST_DLY_MASK GENMASK(7, 0) +#define VPU_50XX_HOST_SS_AON_PWR_ISLAND_EN_POST_DLY_POST1_DLY_MASK GENMASK(15, 8) +#define VPU_50XX_HOST_SS_AON_PWR_ISLAND_EN_POST_DLY_POST2_DLY_MASK GENMASK(23, 16) + +#define VPU_50XX_HOST_SS_AON_PWR_ISLAND_STATUS_DLY 0x0003006cu +#define VPU_50XX_HOST_SS_AON_PWR_ISLAND_STATUS_DLY_STATUS_DLY_MASK GENMASK(7, 0) + #define VPU_40XX_HOST_SS_AON_IDLE_GEN 0x00030200u #define VPU_40XX_HOST_SS_AON_IDLE_GEN_EN_MASK BIT_MASK(0) #define VPU_40XX_HOST_SS_AON_IDLE_GEN_HW_PG_EN_MASK BIT_MASK(1) @@ -205,6 +128,9 @@ #define VPU_40XX_HOST_SS_AON_DPU_ACTIVE 0x00030204u #define VPU_40XX_HOST_SS_AON_DPU_ACTIVE_DPU_ACTIVE_MASK BIT_MASK(0) +#define VPU_50XX_HOST_SS_AON_FABRIC_REQ_OVERRIDE 0x00030210u +#define VPU_50XX_HOST_SS_AON_FABRIC_REQ_OVERRIDE_REQ_OVERRIDE_MASK BIT_MASK(0) + #define VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO 0x00040040u #define VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO_DONE_MASK BIT_MASK(0) #define VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO_IOSF_RS_ID_MASK GENMASK(2, 1) diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.c b/drivers/accel/ivpu/ivpu_hw_btrs.c new file mode 100644 index 000000000000..3212c99f3682 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_hw_btrs.c @@ -0,0 +1,890 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#include "ivpu_drv.h" +#include "ivpu_hw.h" +#include "ivpu_hw_btrs.h" +#include "ivpu_hw_btrs_lnl_reg.h" +#include "ivpu_hw_btrs_mtl_reg.h" +#include "ivpu_hw_reg_io.h" +#include "ivpu_pm.h" + +#define BTRS_MTL_IRQ_MASK ((REG_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, ATS_ERR)) | \ + (REG_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, UFI_ERR))) + +#define BTRS_LNL_IRQ_MASK ((REG_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, ATS_ERR)) | \ + (REG_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, CFI0_ERR)) | \ + (REG_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, CFI1_ERR)) | \ + (REG_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, IMR0_ERR)) | \ + (REG_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, IMR1_ERR)) | \ + (REG_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, SURV_ERR))) + +#define BTRS_MTL_ALL_IRQ_MASK (BTRS_MTL_IRQ_MASK | (REG_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, \ + FREQ_CHANGE))) + +#define BTRS_IRQ_DISABLE_MASK ((u32)-1) + +#define BTRS_LNL_ALL_IRQ_MASK ((u32)-1) + +#define 
BTRS_MTL_WP_CONFIG_1_TILE_5_3_RATIO WP_CONFIG(MTL_CONFIG_1_TILE, MTL_PLL_RATIO_5_3) +#define BTRS_MTL_WP_CONFIG_1_TILE_4_3_RATIO WP_CONFIG(MTL_CONFIG_1_TILE, MTL_PLL_RATIO_4_3) +#define BTRS_MTL_WP_CONFIG_2_TILE_5_3_RATIO WP_CONFIG(MTL_CONFIG_2_TILE, MTL_PLL_RATIO_5_3) +#define BTRS_MTL_WP_CONFIG_2_TILE_4_3_RATIO WP_CONFIG(MTL_CONFIG_2_TILE, MTL_PLL_RATIO_4_3) +#define BTRS_MTL_WP_CONFIG_0_TILE_PLL_OFF WP_CONFIG(0, 0) + +#define PLL_CDYN_DEFAULT 0x80 +#define PLL_EPP_DEFAULT 0x80 +#define PLL_CONFIG_DEFAULT 0x0 +#define PLL_SIMULATION_FREQ 10000000 +#define PLL_REF_CLK_FREQ 50000000 +#define PLL_TIMEOUT_US (1500 * USEC_PER_MSEC) +#define IDLE_TIMEOUT_US (5 * USEC_PER_MSEC) +#define TIMEOUT_US (150 * USEC_PER_MSEC) + +/* Work point configuration values */ +#define WP_CONFIG(tile, ratio) (((tile) << 8) | (ratio)) +#define MTL_CONFIG_1_TILE 0x01 +#define MTL_CONFIG_2_TILE 0x02 +#define MTL_PLL_RATIO_5_3 0x01 +#define MTL_PLL_RATIO_4_3 0x02 +#define BTRS_MTL_TILE_FUSE_ENABLE_BOTH 0x0 +#define BTRS_MTL_TILE_SKU_BOTH 0x3630 + +#define BTRS_LNL_TILE_MAX_NUM 6 +#define BTRS_LNL_TILE_MAX_MASK 0x3f + +#define WEIGHTS_DEFAULT 0xf711f711u +#define WEIGHTS_ATS_DEFAULT 0x0000f711u + +#define DCT_REQ 0x2 +#define DCT_ENABLE 0x1 +#define DCT_DISABLE 0x0 + +int ivpu_hw_btrs_irqs_clear_with_0_mtl(struct ivpu_device *vdev) +{ + REGB_WR32(VPU_HW_BTRS_MTL_INTERRUPT_STAT, BTRS_MTL_ALL_IRQ_MASK); + if (REGB_RD32(VPU_HW_BTRS_MTL_INTERRUPT_STAT) == BTRS_MTL_ALL_IRQ_MASK) { + /* Writing 1s does not clear the interrupt status register */ + REGB_WR32(VPU_HW_BTRS_MTL_INTERRUPT_STAT, 0x0); + return true; + } + + return false; +} + +static void freq_ratios_init_mtl(struct ivpu_device *vdev) +{ + struct ivpu_hw_info *hw = vdev->hw; + u32 fmin_fuse, fmax_fuse; + + fmin_fuse = REGB_RD32(VPU_HW_BTRS_MTL_FMIN_FUSE); + hw->pll.min_ratio = REG_GET_FLD(VPU_HW_BTRS_MTL_FMIN_FUSE, MIN_RATIO, fmin_fuse); + hw->pll.pn_ratio = REG_GET_FLD(VPU_HW_BTRS_MTL_FMIN_FUSE, PN_RATIO, fmin_fuse); + + fmax_fuse = REGB_RD32(VPU_HW_BTRS_MTL_FMAX_FUSE); + hw->pll.max_ratio = REG_GET_FLD(VPU_HW_BTRS_MTL_FMAX_FUSE, MAX_RATIO, fmax_fuse); +} + +static void freq_ratios_init_lnl(struct ivpu_device *vdev) +{ + struct ivpu_hw_info *hw = vdev->hw; + u32 fmin_fuse, fmax_fuse; + + fmin_fuse = REGB_RD32(VPU_HW_BTRS_LNL_FMIN_FUSE); + hw->pll.min_ratio = REG_GET_FLD(VPU_HW_BTRS_LNL_FMIN_FUSE, MIN_RATIO, fmin_fuse); + hw->pll.pn_ratio = REG_GET_FLD(VPU_HW_BTRS_LNL_FMIN_FUSE, PN_RATIO, fmin_fuse); + + fmax_fuse = REGB_RD32(VPU_HW_BTRS_LNL_FMAX_FUSE); + hw->pll.max_ratio = REG_GET_FLD(VPU_HW_BTRS_LNL_FMAX_FUSE, MAX_RATIO, fmax_fuse); +} + +void ivpu_hw_btrs_freq_ratios_init(struct ivpu_device *vdev) +{ + struct ivpu_hw_info *hw = vdev->hw; + + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + freq_ratios_init_mtl(vdev); + else + freq_ratios_init_lnl(vdev); + + hw->pll.min_ratio = clamp_t(u8, ivpu_pll_min_ratio, hw->pll.min_ratio, hw->pll.max_ratio); + hw->pll.max_ratio = clamp_t(u8, ivpu_pll_max_ratio, hw->pll.min_ratio, hw->pll.max_ratio); + hw->pll.pn_ratio = clamp_t(u8, hw->pll.pn_ratio, hw->pll.min_ratio, hw->pll.max_ratio); +} + +static bool tile_disable_check(u32 config) +{ + /* Allowed values: 0 or one bit from range 0-5 (6 tiles) */ + if (config == 0) + return true; + + if (config > BIT(BTRS_LNL_TILE_MAX_NUM - 1)) + return false; + + if ((config & (config - 1)) == 0) + return true; + + return false; +} + +static int read_tile_config_fuse(struct ivpu_device *vdev, u32 *tile_fuse_config) +{ + u32 fuse; + u32 config; + + fuse = 
REGB_RD32(VPU_HW_BTRS_LNL_TILE_FUSE); + if (!REG_TEST_FLD(VPU_HW_BTRS_LNL_TILE_FUSE, VALID, fuse)) { + ivpu_err(vdev, "Fuse: invalid (0x%x)\n", fuse); + return -EIO; + } + + config = REG_GET_FLD(VPU_HW_BTRS_LNL_TILE_FUSE, CONFIG, fuse); + if (!tile_disable_check(config)) + ivpu_warn(vdev, "More than 1 tile disabled, tile fuse config mask: 0x%x\n", config); + + ivpu_dbg(vdev, MISC, "Tile disable config mask: 0x%x\n", config); + + *tile_fuse_config = config; + return 0; +} + +static int info_init_mtl(struct ivpu_device *vdev) +{ + struct ivpu_hw_info *hw = vdev->hw; + + hw->tile_fuse = BTRS_MTL_TILE_FUSE_ENABLE_BOTH; + hw->sku = BTRS_MTL_TILE_SKU_BOTH; + hw->config = BTRS_MTL_WP_CONFIG_2_TILE_4_3_RATIO; + + return 0; +} + +static int info_init_lnl(struct ivpu_device *vdev) +{ + struct ivpu_hw_info *hw = vdev->hw; + u32 tile_fuse_config; + int ret; + + ret = read_tile_config_fuse(vdev, &tile_fuse_config); + if (ret) + return ret; + + hw->tile_fuse = tile_fuse_config; + hw->pll.profiling_freq = PLL_PROFILING_FREQ_DEFAULT; + + return 0; +} + +int ivpu_hw_btrs_info_init(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return info_init_mtl(vdev); + else + return info_init_lnl(vdev); +} + +static int wp_request_sync(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return REGB_POLL_FLD(VPU_HW_BTRS_MTL_WP_REQ_CMD, SEND, 0, PLL_TIMEOUT_US); + else + return REGB_POLL_FLD(VPU_HW_BTRS_LNL_WP_REQ_CMD, SEND, 0, PLL_TIMEOUT_US); +} + +static int wait_for_status_ready(struct ivpu_device *vdev, bool enable) +{ + u32 exp_val = enable ? 0x1 : 0x0; + + if (IVPU_WA(punit_disabled)) + return 0; + + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return REGB_POLL_FLD(VPU_HW_BTRS_MTL_VPU_STATUS, READY, exp_val, PLL_TIMEOUT_US); + else + return REGB_POLL_FLD(VPU_HW_BTRS_LNL_VPU_STATUS, READY, exp_val, PLL_TIMEOUT_US); +} + +struct wp_request { + u16 min; + u16 max; + u16 target; + u16 cfg; + u16 epp; + u16 cdyn; +}; + +static void wp_request_mtl(struct ivpu_device *vdev, struct wp_request *wp) +{ + u32 val; + + val = REGB_RD32(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD0); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD0, MIN_RATIO, wp->min, val); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD0, MAX_RATIO, wp->max, val); + REGB_WR32(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD0, val); + + val = REGB_RD32(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD1); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD1, TARGET_RATIO, wp->target, val); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD1, EPP, PLL_EPP_DEFAULT, val); + REGB_WR32(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD1, val); + + val = REGB_RD32(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD2); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD2, CONFIG, wp->cfg, val); + REGB_WR32(VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD2, val); + + val = REGB_RD32(VPU_HW_BTRS_MTL_WP_REQ_CMD); + val = REG_SET_FLD(VPU_HW_BTRS_MTL_WP_REQ_CMD, SEND, val); + REGB_WR32(VPU_HW_BTRS_MTL_WP_REQ_CMD, val); +} + +static void wp_request_lnl(struct ivpu_device *vdev, struct wp_request *wp) +{ + u32 val; + + val = REGB_RD32(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD0); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD0, MIN_RATIO, wp->min, val); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD0, MAX_RATIO, wp->max, val); + REGB_WR32(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD0, val); + + val = REGB_RD32(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD1); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD1, TARGET_RATIO, wp->target, val); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD1, EPP, wp->epp, 
val); + REGB_WR32(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD1, val); + + val = REGB_RD32(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD2); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD2, CONFIG, wp->cfg, val); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD2, CDYN, wp->cdyn, val); + REGB_WR32(VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD2, val); + + val = REGB_RD32(VPU_HW_BTRS_LNL_WP_REQ_CMD); + val = REG_SET_FLD(VPU_HW_BTRS_LNL_WP_REQ_CMD, SEND, val); + REGB_WR32(VPU_HW_BTRS_LNL_WP_REQ_CMD, val); +} + +static void wp_request(struct ivpu_device *vdev, struct wp_request *wp) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + wp_request_mtl(vdev, wp); + else + wp_request_lnl(vdev, wp); +} + +static int wp_request_send(struct ivpu_device *vdev, struct wp_request *wp) +{ + int ret; + + ret = wp_request_sync(vdev); + if (ret) { + ivpu_err(vdev, "Failed to sync before workpoint request: %d\n", ret); + return ret; + } + + wp_request(vdev, wp); + + ret = wp_request_sync(vdev); + if (ret) + ivpu_err(vdev, "Failed to sync after workpoint request: %d\n", ret); + + return ret; +} + +static void prepare_wp_request(struct ivpu_device *vdev, struct wp_request *wp, bool enable) +{ + struct ivpu_hw_info *hw = vdev->hw; + + wp->min = hw->pll.min_ratio; + wp->max = hw->pll.max_ratio; + + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) { + wp->target = enable ? hw->pll.pn_ratio : 0; + wp->cfg = enable ? hw->config : 0; + wp->cdyn = 0; + wp->epp = 0; + } else { + wp->target = hw->pll.pn_ratio; + wp->cfg = enable ? PLL_CONFIG_DEFAULT : 0; + wp->cdyn = enable ? PLL_CDYN_DEFAULT : 0; + wp->epp = enable ? PLL_EPP_DEFAULT : 0; + } +} + +static int wait_for_pll_lock(struct ivpu_device *vdev, bool enable) +{ + u32 exp_val = enable ? 0x1 : 0x0; + + if (ivpu_hw_btrs_gen(vdev) != IVPU_HW_BTRS_MTL) + return 0; + + if (IVPU_WA(punit_disabled)) + return 0; + + return REGB_POLL_FLD(VPU_HW_BTRS_MTL_PLL_STATUS, LOCK, exp_val, PLL_TIMEOUT_US); +} + +int ivpu_hw_btrs_wp_drive(struct ivpu_device *vdev, bool enable) +{ + struct wp_request wp; + int ret; + + if (IVPU_WA(punit_disabled)) { + ivpu_dbg(vdev, PM, "Skipping workpoint request\n"); + return 0; + } + + prepare_wp_request(vdev, &wp, enable); + + ivpu_dbg(vdev, PM, "PLL workpoint request: %u Hz, config: 0x%x, epp: 0x%x, cdyn: 0x%x\n", + PLL_RATIO_TO_FREQ(wp.target), wp.cfg, wp.epp, wp.cdyn); + + ret = wp_request_send(vdev, &wp); + if (ret) { + ivpu_err(vdev, "Failed to send workpoint request: %d\n", ret); + return ret; + } + + ret = wait_for_pll_lock(vdev, enable); + if (ret) { + ivpu_err(vdev, "Timed out waiting for PLL lock\n"); + return ret; + } + + ret = wait_for_status_ready(vdev, enable); + if (ret) { + ivpu_err(vdev, "Timed out waiting for NPU ready status\n"); + return ret; + } + + return 0; +} + +static int d0i3_drive_mtl(struct ivpu_device *vdev, bool enable) +{ + int ret; + u32 val; + + ret = REGB_POLL_FLD(VPU_HW_BTRS_MTL_VPU_D0I3_CONTROL, INPROGRESS, 0, TIMEOUT_US); + if (ret) { + ivpu_err(vdev, "Failed to sync before D0i3 transition: %d\n", ret); + return ret; + } + + val = REGB_RD32(VPU_HW_BTRS_MTL_VPU_D0I3_CONTROL); + if (enable) + val = REG_SET_FLD(VPU_HW_BTRS_MTL_VPU_D0I3_CONTROL, I3, val); + else + val = REG_CLR_FLD(VPU_HW_BTRS_MTL_VPU_D0I3_CONTROL, I3, val); + REGB_WR32(VPU_HW_BTRS_MTL_VPU_D0I3_CONTROL, val); + + ret = REGB_POLL_FLD(VPU_HW_BTRS_MTL_VPU_D0I3_CONTROL, INPROGRESS, 0, TIMEOUT_US); + if (ret) + ivpu_err(vdev, "Failed to sync after D0i3 transition: %d\n", ret); + + return ret; +} + +static int d0i3_drive_lnl(struct ivpu_device *vdev, bool enable) +{ + int ret; + u32 
val; + + ret = REGB_POLL_FLD(VPU_HW_BTRS_LNL_D0I3_CONTROL, INPROGRESS, 0, TIMEOUT_US); + if (ret) { + ivpu_err(vdev, "Failed to sync before D0i3 transition: %d\n", ret); + return ret; + } + + val = REGB_RD32(VPU_HW_BTRS_LNL_D0I3_CONTROL); + if (enable) + val = REG_SET_FLD(VPU_HW_BTRS_LNL_D0I3_CONTROL, I3, val); + else + val = REG_CLR_FLD(VPU_HW_BTRS_LNL_D0I3_CONTROL, I3, val); + REGB_WR32(VPU_HW_BTRS_LNL_D0I3_CONTROL, val); + + ret = REGB_POLL_FLD(VPU_HW_BTRS_LNL_D0I3_CONTROL, INPROGRESS, 0, TIMEOUT_US); + if (ret) { + ivpu_err(vdev, "Failed to sync after D0i3 transition: %d\n", ret); + return ret; + } + + return 0; +} + +static int d0i3_drive(struct ivpu_device *vdev, bool enable) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return d0i3_drive_mtl(vdev, enable); + else + return d0i3_drive_lnl(vdev, enable); +} + +int ivpu_hw_btrs_d0i3_enable(struct ivpu_device *vdev) +{ + int ret; + + if (IVPU_WA(punit_disabled)) + return 0; + + ret = d0i3_drive(vdev, true); + if (ret) + ivpu_err(vdev, "Failed to enable D0i3: %d\n", ret); + + udelay(5); /* VPU requires 5 us to complete the transition */ + + return ret; +} + +int ivpu_hw_btrs_d0i3_disable(struct ivpu_device *vdev) +{ + int ret; + + if (IVPU_WA(punit_disabled)) + return 0; + + ret = d0i3_drive(vdev, false); + if (ret) + ivpu_err(vdev, "Failed to disable D0i3: %d\n", ret); + + return ret; +} + +int ivpu_hw_btrs_wait_for_clock_res_own_ack(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return 0; + + return REGB_POLL_FLD(VPU_HW_BTRS_LNL_VPU_STATUS, CLOCK_RESOURCE_OWN_ACK, 1, TIMEOUT_US); +} + +void ivpu_hw_btrs_set_port_arbitration_weights_lnl(struct ivpu_device *vdev) +{ + REGB_WR32(VPU_HW_BTRS_LNL_PORT_ARBITRATION_WEIGHTS, WEIGHTS_DEFAULT); + REGB_WR32(VPU_HW_BTRS_LNL_PORT_ARBITRATION_WEIGHTS_ATS, WEIGHTS_ATS_DEFAULT); +} + +static int ip_reset_mtl(struct ivpu_device *vdev) +{ + int ret; + u32 val; + + ret = REGB_POLL_FLD(VPU_HW_BTRS_MTL_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US); + if (ret) { + ivpu_err(vdev, "Timed out waiting for TRIGGER bit\n"); + return ret; + } + + val = REGB_RD32(VPU_HW_BTRS_MTL_VPU_IP_RESET); + val = REG_SET_FLD(VPU_HW_BTRS_MTL_VPU_IP_RESET, TRIGGER, val); + REGB_WR32(VPU_HW_BTRS_MTL_VPU_IP_RESET, val); + + ret = REGB_POLL_FLD(VPU_HW_BTRS_MTL_VPU_IP_RESET, TRIGGER, 0, TIMEOUT_US); + if (ret) + ivpu_err(vdev, "Timed out waiting for RESET completion\n"); + + return ret; +} + +static int ip_reset_lnl(struct ivpu_device *vdev) +{ + int ret; + u32 val; + + ivpu_hw_btrs_clock_relinquish_disable_lnl(vdev); + + ret = REGB_POLL_FLD(VPU_HW_BTRS_LNL_IP_RESET, TRIGGER, 0, TIMEOUT_US); + if (ret) { + ivpu_err(vdev, "Wait for *_TRIGGER timed out\n"); + return ret; + } + + val = REGB_RD32(VPU_HW_BTRS_LNL_IP_RESET); + val = REG_SET_FLD(VPU_HW_BTRS_LNL_IP_RESET, TRIGGER, val); + REGB_WR32(VPU_HW_BTRS_LNL_IP_RESET, val); + + ret = REGB_POLL_FLD(VPU_HW_BTRS_LNL_IP_RESET, TRIGGER, 0, TIMEOUT_US); + if (ret) + ivpu_err(vdev, "Timed out waiting for RESET completion\n"); + + return ret; +} + +int ivpu_hw_btrs_ip_reset(struct ivpu_device *vdev) +{ + if (IVPU_WA(punit_disabled)) + return 0; + + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return ip_reset_mtl(vdev); + else + return ip_reset_lnl(vdev); +} + +void ivpu_hw_btrs_profiling_freq_reg_set_lnl(struct ivpu_device *vdev) +{ + u32 val = REGB_RD32(VPU_HW_BTRS_LNL_VPU_STATUS); + + if (vdev->hw->pll.profiling_freq == PLL_PROFILING_FREQ_DEFAULT) + val = REG_CLR_FLD(VPU_HW_BTRS_LNL_VPU_STATUS, PERF_CLK, val); + else + val = 
REG_SET_FLD(VPU_HW_BTRS_LNL_VPU_STATUS, PERF_CLK, val); + + REGB_WR32(VPU_HW_BTRS_LNL_VPU_STATUS, val); +} + +void ivpu_hw_btrs_ats_print_lnl(struct ivpu_device *vdev) +{ + ivpu_dbg(vdev, MISC, "Buttress ATS: %s\n", + REGB_RD32(VPU_HW_BTRS_LNL_HM_ATS) ? "Enable" : "Disable"); +} + +void ivpu_hw_btrs_clock_relinquish_disable_lnl(struct ivpu_device *vdev) +{ + u32 val = REGB_RD32(VPU_HW_BTRS_LNL_VPU_STATUS); + + val = REG_SET_FLD(VPU_HW_BTRS_LNL_VPU_STATUS, DISABLE_CLK_RELINQUISH, val); + REGB_WR32(VPU_HW_BTRS_LNL_VPU_STATUS, val); +} + +bool ivpu_hw_btrs_is_idle(struct ivpu_device *vdev) +{ + u32 val; + + if (IVPU_WA(punit_disabled)) + return true; + + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) { + val = REGB_RD32(VPU_HW_BTRS_MTL_VPU_STATUS); + + return REG_TEST_FLD(VPU_HW_BTRS_MTL_VPU_STATUS, READY, val) && + REG_TEST_FLD(VPU_HW_BTRS_MTL_VPU_STATUS, IDLE, val); + } else { + val = REGB_RD32(VPU_HW_BTRS_LNL_VPU_STATUS); + + return REG_TEST_FLD(VPU_HW_BTRS_LNL_VPU_STATUS, READY, val) && + REG_TEST_FLD(VPU_HW_BTRS_LNL_VPU_STATUS, IDLE, val); + } +} + +int ivpu_hw_btrs_wait_for_idle(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return REGB_POLL_FLD(VPU_HW_BTRS_MTL_VPU_STATUS, IDLE, 0x1, IDLE_TIMEOUT_US); + else + return REGB_POLL_FLD(VPU_HW_BTRS_LNL_VPU_STATUS, IDLE, 0x1, IDLE_TIMEOUT_US); +} + +/* Handler for IRQs from Buttress core (irqB) */ +bool ivpu_hw_btrs_irq_handler_mtl(struct ivpu_device *vdev, int irq) +{ + u32 status = REGB_RD32(VPU_HW_BTRS_MTL_INTERRUPT_STAT) & BTRS_MTL_IRQ_MASK; + bool schedule_recovery = false; + + if (!status) + return false; + + if (REG_TEST_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, FREQ_CHANGE, status)) + ivpu_dbg(vdev, IRQ, "FREQ_CHANGE irq: %08x", + REGB_RD32(VPU_HW_BTRS_MTL_CURRENT_PLL)); + + if (REG_TEST_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, ATS_ERR, status)) { + ivpu_err(vdev, "ATS_ERR irq 0x%016llx", REGB_RD64(VPU_HW_BTRS_MTL_ATS_ERR_LOG_0)); + REGB_WR32(VPU_HW_BTRS_MTL_ATS_ERR_CLEAR, 0x1); + schedule_recovery = true; + } + + if (REG_TEST_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, UFI_ERR, status)) { + u32 ufi_log = REGB_RD32(VPU_HW_BTRS_MTL_UFI_ERR_LOG); + + ivpu_err(vdev, "UFI_ERR irq (0x%08x) opcode: 0x%02lx axi_id: 0x%02lx cq_id: 0x%03lx", + ufi_log, REG_GET_FLD(VPU_HW_BTRS_MTL_UFI_ERR_LOG, OPCODE, ufi_log), + REG_GET_FLD(VPU_HW_BTRS_MTL_UFI_ERR_LOG, AXI_ID, ufi_log), + REG_GET_FLD(VPU_HW_BTRS_MTL_UFI_ERR_LOG, CQ_ID, ufi_log)); + REGB_WR32(VPU_HW_BTRS_MTL_UFI_ERR_CLEAR, 0x1); + schedule_recovery = true; + } + + /* This must be done after interrupts are cleared at the source. */ + if (IVPU_WA(interrupt_clear_with_0)) + /* + * Writing 1 triggers an interrupt, so we can't perform read update write. + * Clear local interrupt status by writing 0 to all bits. 
+ */ + REGB_WR32(VPU_HW_BTRS_MTL_INTERRUPT_STAT, 0x0); + else + REGB_WR32(VPU_HW_BTRS_MTL_INTERRUPT_STAT, status); + + if (schedule_recovery) + ivpu_pm_trigger_recovery(vdev, "Buttress IRQ"); + + return true; +} + +/* Handler for IRQs from Buttress core (irqB) */ +bool ivpu_hw_btrs_irq_handler_lnl(struct ivpu_device *vdev, int irq) +{ + u32 status = REGB_RD32(VPU_HW_BTRS_LNL_INTERRUPT_STAT) & BTRS_LNL_IRQ_MASK; + bool schedule_recovery = false; + + if (!status) + return false; + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, SURV_ERR, status)) { + ivpu_dbg(vdev, IRQ, "Survivability IRQ\n"); + if (!kfifo_put(&vdev->hw->irq.fifo, IVPU_HW_IRQ_SRC_DCT)) + ivpu_err_ratelimited(vdev, "IRQ FIFO full\n"); + } + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, FREQ_CHANGE, status)) + ivpu_dbg(vdev, IRQ, "FREQ_CHANGE irq: %08x", REGB_RD32(VPU_HW_BTRS_LNL_PLL_FREQ)); + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, ATS_ERR, status)) { + ivpu_err(vdev, "ATS_ERR LOG1 0x%08x ATS_ERR_LOG2 0x%08x\n", + REGB_RD32(VPU_HW_BTRS_LNL_ATS_ERR_LOG1), + REGB_RD32(VPU_HW_BTRS_LNL_ATS_ERR_LOG2)); + REGB_WR32(VPU_HW_BTRS_LNL_ATS_ERR_CLEAR, 0x1); + schedule_recovery = true; + } + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, CFI0_ERR, status)) { + ivpu_err(vdev, "CFI0_ERR 0x%08x", REGB_RD32(VPU_HW_BTRS_LNL_CFI0_ERR_LOG)); + REGB_WR32(VPU_HW_BTRS_LNL_CFI0_ERR_CLEAR, 0x1); + schedule_recovery = true; + } + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, CFI1_ERR, status)) { + ivpu_err(vdev, "CFI1_ERR 0x%08x", REGB_RD32(VPU_HW_BTRS_LNL_CFI1_ERR_LOG)); + REGB_WR32(VPU_HW_BTRS_LNL_CFI1_ERR_CLEAR, 0x1); + schedule_recovery = true; + } + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, IMR0_ERR, status)) { + ivpu_err(vdev, "IMR_ERR_CFI0 LOW: 0x%08x HIGH: 0x%08x", + REGB_RD32(VPU_HW_BTRS_LNL_IMR_ERR_CFI0_LOW), + REGB_RD32(VPU_HW_BTRS_LNL_IMR_ERR_CFI0_HIGH)); + REGB_WR32(VPU_HW_BTRS_LNL_IMR_ERR_CFI0_CLEAR, 0x1); + schedule_recovery = true; + } + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, IMR1_ERR, status)) { + ivpu_err(vdev, "IMR_ERR_CFI1 LOW: 0x%08x HIGH: 0x%08x", + REGB_RD32(VPU_HW_BTRS_LNL_IMR_ERR_CFI1_LOW), + REGB_RD32(VPU_HW_BTRS_LNL_IMR_ERR_CFI1_HIGH)); + REGB_WR32(VPU_HW_BTRS_LNL_IMR_ERR_CFI1_CLEAR, 0x1); + schedule_recovery = true; + } + + /* This must be done after interrupts are cleared at the source. */ + REGB_WR32(VPU_HW_BTRS_LNL_INTERRUPT_STAT, status); + + if (schedule_recovery) + ivpu_pm_trigger_recovery(vdev, "Buttress IRQ"); + + return true; +} + +int ivpu_hw_btrs_dct_get_request(struct ivpu_device *vdev, bool *enable) +{ + u32 val = REGB_RD32(VPU_HW_BTRS_LNL_PCODE_MAILBOX_SHADOW); + u32 cmd = REG_GET_FLD(VPU_HW_BTRS_LNL_PCODE_MAILBOX_SHADOW, CMD, val); + u32 param1 = REG_GET_FLD(VPU_HW_BTRS_LNL_PCODE_MAILBOX_SHADOW, PARAM1, val); + + if (cmd != DCT_REQ) { + ivpu_err_ratelimited(vdev, "Unsupported PCODE command: 0x%x\n", cmd); + return -EBADR; + } + + switch (param1) { + case DCT_ENABLE: + *enable = true; + return 0; + case DCT_DISABLE: + *enable = false; + return 0; + default: + ivpu_err_ratelimited(vdev, "Invalid PARAM1 value: %u\n", param1); + return -EINVAL; + } +} + +void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u32 active_percent) +{ + u32 val = 0; + u32 cmd = enable ? 
DCT_ENABLE : DCT_DISABLE; + + val = REG_SET_FLD_NUM(VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS, CMD, DCT_REQ, val); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS, PARAM1, cmd, val); + val = REG_SET_FLD_NUM(VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS, PARAM2, active_percent, val); + + REGB_WR32(VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS, val); +} + +static u32 pll_ratio_to_freq_mtl(u32 ratio, u32 config) +{ + u32 pll_clock = PLL_REF_CLK_FREQ * ratio; + u32 cpu_clock; + + if ((config & 0xff) == MTL_PLL_RATIO_4_3) + cpu_clock = pll_clock * 2 / 4; + else + cpu_clock = pll_clock * 2 / 5; + + return cpu_clock; +} + +u32 ivpu_hw_btrs_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) +{ + struct ivpu_hw_info *hw = vdev->hw; + + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return pll_ratio_to_freq_mtl(ratio, hw->config); + else + return PLL_RATIO_TO_FREQ(ratio); +} + +static u32 pll_freq_get_mtl(struct ivpu_device *vdev) +{ + u32 pll_curr_ratio; + + pll_curr_ratio = REGB_RD32(VPU_HW_BTRS_MTL_CURRENT_PLL); + pll_curr_ratio &= VPU_HW_BTRS_MTL_CURRENT_PLL_RATIO_MASK; + + if (!ivpu_is_silicon(vdev)) + return PLL_SIMULATION_FREQ; + + return pll_ratio_to_freq_mtl(pll_curr_ratio, vdev->hw->config); +} + +static u32 pll_freq_get_lnl(struct ivpu_device *vdev) +{ + u32 pll_curr_ratio; + + pll_curr_ratio = REGB_RD32(VPU_HW_BTRS_LNL_PLL_FREQ); + pll_curr_ratio &= VPU_HW_BTRS_LNL_PLL_FREQ_RATIO_MASK; + + return PLL_RATIO_TO_FREQ(pll_curr_ratio); +} + +u32 ivpu_hw_btrs_pll_freq_get(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return pll_freq_get_mtl(vdev); + else + return pll_freq_get_lnl(vdev); +} + +u32 ivpu_hw_btrs_telemetry_offset_get(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return REGB_RD32(VPU_HW_BTRS_MTL_VPU_TELEMETRY_OFFSET); + else + return REGB_RD32(VPU_HW_BTRS_LNL_VPU_TELEMETRY_OFFSET); +} + +u32 ivpu_hw_btrs_telemetry_size_get(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return REGB_RD32(VPU_HW_BTRS_MTL_VPU_TELEMETRY_SIZE); + else + return REGB_RD32(VPU_HW_BTRS_LNL_VPU_TELEMETRY_SIZE); +} + +u32 ivpu_hw_btrs_telemetry_enable_get(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return REGB_RD32(VPU_HW_BTRS_MTL_VPU_TELEMETRY_ENABLE); + else + return REGB_RD32(VPU_HW_BTRS_LNL_VPU_TELEMETRY_ENABLE); +} + +void ivpu_hw_btrs_global_int_disable(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + REGB_WR32(VPU_HW_BTRS_MTL_GLOBAL_INT_MASK, 0x1); + else + REGB_WR32(VPU_HW_BTRS_LNL_GLOBAL_INT_MASK, 0x1); +} + +void ivpu_hw_btrs_global_int_enable(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + REGB_WR32(VPU_HW_BTRS_MTL_GLOBAL_INT_MASK, 0x0); + else + REGB_WR32(VPU_HW_BTRS_LNL_GLOBAL_INT_MASK, 0x0); +} + +void ivpu_hw_btrs_irq_enable(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) { + REGB_WR32(VPU_HW_BTRS_MTL_LOCAL_INT_MASK, (u32)(~BTRS_MTL_IRQ_MASK)); + REGB_WR32(VPU_HW_BTRS_MTL_GLOBAL_INT_MASK, 0x0); + } else { + REGB_WR32(VPU_HW_BTRS_LNL_LOCAL_INT_MASK, (u32)(~BTRS_LNL_IRQ_MASK)); + REGB_WR32(VPU_HW_BTRS_LNL_GLOBAL_INT_MASK, 0x0); + } +} + +void ivpu_hw_btrs_irq_disable(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) { + REGB_WR32(VPU_HW_BTRS_MTL_GLOBAL_INT_MASK, 0x1); + REGB_WR32(VPU_HW_BTRS_MTL_LOCAL_INT_MASK, BTRS_IRQ_DISABLE_MASK); + } else { + REGB_WR32(VPU_HW_BTRS_LNL_GLOBAL_INT_MASK, 0x1); + REGB_WR32(VPU_HW_BTRS_LNL_LOCAL_INT_MASK, 
BTRS_IRQ_DISABLE_MASK); + } +} + +static void diagnose_failure_mtl(struct ivpu_device *vdev) +{ + u32 reg = REGB_RD32(VPU_HW_BTRS_MTL_INTERRUPT_STAT) & BTRS_MTL_IRQ_MASK; + + if (REG_TEST_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, ATS_ERR, reg)) + ivpu_err(vdev, "ATS_ERR irq 0x%016llx", REGB_RD64(VPU_HW_BTRS_MTL_ATS_ERR_LOG_0)); + + if (REG_TEST_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, UFI_ERR, reg)) { + u32 log = REGB_RD32(VPU_HW_BTRS_MTL_UFI_ERR_LOG); + + ivpu_err(vdev, "UFI_ERR irq (0x%08x) opcode: 0x%02lx axi_id: 0x%02lx cq_id: 0x%03lx", + log, REG_GET_FLD(VPU_HW_BTRS_MTL_UFI_ERR_LOG, OPCODE, log), + REG_GET_FLD(VPU_HW_BTRS_MTL_UFI_ERR_LOG, AXI_ID, log), + REG_GET_FLD(VPU_HW_BTRS_MTL_UFI_ERR_LOG, CQ_ID, log)); + } +} + +static void diagnose_failure_lnl(struct ivpu_device *vdev) +{ + u32 reg = REGB_RD32(VPU_HW_BTRS_LNL_INTERRUPT_STAT) & BTRS_LNL_IRQ_MASK; + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, ATS_ERR, reg)) { + ivpu_err(vdev, "ATS_ERR_LOG1 0x%08x ATS_ERR_LOG2 0x%08x\n", + REGB_RD32(VPU_HW_BTRS_LNL_ATS_ERR_LOG1), + REGB_RD32(VPU_HW_BTRS_LNL_ATS_ERR_LOG2)); + } + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, CFI0_ERR, reg)) + ivpu_err(vdev, "CFI0_ERR_LOG 0x%08x\n", REGB_RD32(VPU_HW_BTRS_LNL_CFI0_ERR_LOG)); + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, CFI1_ERR, reg)) + ivpu_err(vdev, "CFI1_ERR_LOG 0x%08x\n", REGB_RD32(VPU_HW_BTRS_LNL_CFI1_ERR_LOG)); + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, IMR0_ERR, reg)) + ivpu_err(vdev, "IMR_ERR_CFI0 LOW: 0x%08x HIGH: 0x%08x\n", + REGB_RD32(VPU_HW_BTRS_LNL_IMR_ERR_CFI0_LOW), + REGB_RD32(VPU_HW_BTRS_LNL_IMR_ERR_CFI0_HIGH)); + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, IMR1_ERR, reg)) + ivpu_err(vdev, "IMR_ERR_CFI1 LOW: 0x%08x HIGH: 0x%08x\n", + REGB_RD32(VPU_HW_BTRS_LNL_IMR_ERR_CFI1_LOW), + REGB_RD32(VPU_HW_BTRS_LNL_IMR_ERR_CFI1_HIGH)); + + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, SURV_ERR, reg)) + ivpu_err(vdev, "Survivability IRQ\n"); +} + +void ivpu_hw_btrs_diagnose_failure(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return diagnose_failure_mtl(vdev); + else + return diagnose_failure_lnl(vdev); +} diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.h b/drivers/accel/ivpu/ivpu_hw_btrs.h new file mode 100644 index 000000000000..04f14f50fed6 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_hw_btrs.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#ifndef __IVPU_HW_BTRS_H__ +#define __IVPU_HW_BTRS_H__ + +#include "ivpu_drv.h" +#include "ivpu_hw_37xx_reg.h" +#include "ivpu_hw_40xx_reg.h" +#include "ivpu_hw_reg_io.h" + +#define PLL_PROFILING_FREQ_DEFAULT 38400000 +#define PLL_PROFILING_FREQ_HIGH 400000000 +#define PLL_RATIO_TO_FREQ(x) ((x) * PLL_REF_CLK_FREQ) + +#define DCT_DEFAULT_ACTIVE_PERCENT 15u +#define DCT_PERIOD_US 35300u + +int ivpu_hw_btrs_info_init(struct ivpu_device *vdev); +void ivpu_hw_btrs_freq_ratios_init(struct ivpu_device *vdev); +int ivpu_hw_btrs_irqs_clear_with_0_mtl(struct ivpu_device *vdev); +int ivpu_hw_btrs_wp_drive(struct ivpu_device *vdev, bool enable); +int ivpu_hw_btrs_wait_for_clock_res_own_ack(struct ivpu_device *vdev); +int ivpu_hw_btrs_d0i3_enable(struct ivpu_device *vdev); +int ivpu_hw_btrs_d0i3_disable(struct ivpu_device *vdev); +void ivpu_hw_btrs_set_port_arbitration_weights_lnl(struct ivpu_device *vdev); +bool ivpu_hw_btrs_is_idle(struct ivpu_device *vdev); +int ivpu_hw_btrs_wait_for_idle(struct ivpu_device *vdev); +int ivpu_hw_btrs_ip_reset(struct ivpu_device *vdev); +void
ivpu_hw_btrs_profiling_freq_reg_set_lnl(struct ivpu_device *vdev); +void ivpu_hw_btrs_ats_print_lnl(struct ivpu_device *vdev); +void ivpu_hw_btrs_clock_relinquish_disable_lnl(struct ivpu_device *vdev); +bool ivpu_hw_btrs_irq_handler_mtl(struct ivpu_device *vdev, int irq); +bool ivpu_hw_btrs_irq_handler_lnl(struct ivpu_device *vdev, int irq); +int ivpu_hw_btrs_dct_get_request(struct ivpu_device *vdev, bool *enable); +void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u32 dct_percent); +u32 ivpu_hw_btrs_pll_freq_get(struct ivpu_device *vdev); +u32 ivpu_hw_btrs_ratio_to_freq(struct ivpu_device *vdev, u32 ratio); +u32 ivpu_hw_btrs_telemetry_offset_get(struct ivpu_device *vdev); +u32 ivpu_hw_btrs_telemetry_size_get(struct ivpu_device *vdev); +u32 ivpu_hw_btrs_telemetry_enable_get(struct ivpu_device *vdev); +void ivpu_hw_btrs_global_int_enable(struct ivpu_device *vdev); +void ivpu_hw_btrs_global_int_disable(struct ivpu_device *vdev); +void ivpu_hw_btrs_irq_enable(struct ivpu_device *vdev); +void ivpu_hw_btrs_irq_disable(struct ivpu_device *vdev); +void ivpu_hw_btrs_diagnose_failure(struct ivpu_device *vdev); + +#endif /* __IVPU_HW_BTRS_H__ */ diff --git a/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h b/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h new file mode 100644 index 000000000000..fc51f3098f97 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_hw_btrs_lnl_reg.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#ifndef __IVPU_HW_BTRS_LNL_REG_H__ +#define __IVPU_HW_BTRS_LNL_REG_H__ + +#include <linux/bits.h> + +#define VPU_HW_BTRS_LNL_INTERRUPT_STAT 0x00000000u +#define VPU_HW_BTRS_LNL_INTERRUPT_STAT_FREQ_CHANGE_MASK BIT_MASK(0) +#define VPU_HW_BTRS_LNL_INTERRUPT_STAT_ATS_ERR_MASK BIT_MASK(1) +#define VPU_HW_BTRS_LNL_INTERRUPT_STAT_CFI0_ERR_MASK BIT_MASK(2) +#define VPU_HW_BTRS_LNL_INTERRUPT_STAT_CFI1_ERR_MASK BIT_MASK(3) +#define VPU_HW_BTRS_LNL_INTERRUPT_STAT_IMR0_ERR_MASK BIT_MASK(4) +#define VPU_HW_BTRS_LNL_INTERRUPT_STAT_IMR1_ERR_MASK BIT_MASK(5) +#define VPU_HW_BTRS_LNL_INTERRUPT_STAT_SURV_ERR_MASK BIT_MASK(6) + +#define VPU_HW_BTRS_LNL_LOCAL_INT_MASK 0x00000004u +#define VPU_HW_BTRS_LNL_GLOBAL_INT_MASK 0x00000008u + +#define VPU_HW_BTRS_LNL_HM_ATS 0x0000000cu + +#define VPU_HW_BTRS_LNL_ATS_ERR_LOG1 0x00000010u +#define VPU_HW_BTRS_LNL_ATS_ERR_LOG2 0x00000014u +#define VPU_HW_BTRS_LNL_ATS_ERR_CLEAR 0x00000018u + +#define VPU_HW_BTRS_LNL_CFI0_ERR_LOG 0x0000001cu +#define VPU_HW_BTRS_LNL_CFI0_ERR_CLEAR 0x00000020u + +#define VPU_HW_BTRS_LNL_PORT_ARBITRATION_WEIGHTS_ATS 0x00000024u + +#define VPU_HW_BTRS_LNL_CFI1_ERR_LOG 0x00000040u +#define VPU_HW_BTRS_LNL_CFI1_ERR_CLEAR 0x00000044u + +#define VPU_HW_BTRS_LNL_IMR_ERR_CFI0_LOW 0x00000048u +#define VPU_HW_BTRS_LNL_IMR_ERR_CFI0_HIGH 0x0000004cu +#define VPU_HW_BTRS_LNL_IMR_ERR_CFI0_CLEAR 0x00000050u + +#define VPU_HW_BTRS_LNL_PORT_ARBITRATION_WEIGHTS 0x00000054u + +#define VPU_HW_BTRS_LNL_IMR_ERR_CFI1_LOW 0x00000058u +#define VPU_HW_BTRS_LNL_IMR_ERR_CFI1_HIGH 0x0000005cu +#define VPU_HW_BTRS_LNL_IMR_ERR_CFI1_CLEAR 0x00000060u + +#define VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS 0x00000070u +#define VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS_CMD_MASK GENMASK(7, 0) +#define VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS_PARAM1_MASK GENMASK(15, 8) +#define VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS_PARAM2_MASK GENMASK(23, 16) +#define VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS_PARAM3_MASK GENMASK(31, 24) + +#define VPU_HW_BTRS_LNL_PCODE_MAILBOX_SHADOW 0x00000074u +#define 
VPU_HW_BTRS_LNL_PCODE_MAILBOX_SHADOW_CMD_MASK GENMASK(7, 0) +#define VPU_HW_BTRS_LNL_PCODE_MAILBOX_SHADOW_PARAM1_MASK GENMASK(15, 8) +#define VPU_HW_BTRS_LNL_PCODE_MAILBOX_SHADOW_PARAM2_MASK GENMASK(23, 16) +#define VPU_HW_BTRS_LNL_PCODE_MAILBOX_SHADOW_PARAM3_MASK GENMASK(31, 24) + +#define VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD0 0x00000130u +#define VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD0_MIN_RATIO_MASK GENMASK(15, 0) +#define VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD0_MAX_RATIO_MASK GENMASK(31, 16) + +#define VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD1 0x00000134u +#define VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD1_TARGET_RATIO_MASK GENMASK(15, 0) +#define VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD1_EPP_MASK GENMASK(31, 16) + +#define VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD2 0x00000138u +#define VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD2_CONFIG_MASK GENMASK(15, 0) +#define VPU_HW_BTRS_LNL_WP_REQ_PAYLOAD2_CDYN_MASK GENMASK(31, 16) + +#define VPU_HW_BTRS_LNL_WP_REQ_CMD 0x0000013cu +#define VPU_HW_BTRS_LNL_WP_REQ_CMD_SEND_MASK BIT_MASK(0) + +#define VPU_HW_BTRS_LNL_PLL_FREQ 0x00000148u +#define VPU_HW_BTRS_LNL_PLL_FREQ_RATIO_MASK GENMASK(15, 0) + +#define VPU_HW_BTRS_LNL_TILE_FUSE 0x00000150u +#define VPU_HW_BTRS_LNL_TILE_FUSE_VALID_MASK BIT_MASK(0) +#define VPU_HW_BTRS_LNL_TILE_FUSE_CONFIG_MASK GENMASK(6, 1) + +#define VPU_HW_BTRS_LNL_VPU_STATUS 0x00000154u +#define VPU_HW_BTRS_LNL_VPU_STATUS_READY_MASK BIT_MASK(0) +#define VPU_HW_BTRS_LNL_VPU_STATUS_IDLE_MASK BIT_MASK(1) +#define VPU_HW_BTRS_LNL_VPU_STATUS_DUP_IDLE_MASK BIT_MASK(2) +#define VPU_HW_BTRS_LNL_VPU_STATUS_CLOCK_RESOURCE_OWN_ACK_MASK BIT_MASK(6) +#define VPU_HW_BTRS_LNL_VPU_STATUS_POWER_RESOURCE_OWN_ACK_MASK BIT_MASK(7) +#define VPU_HW_BTRS_LNL_VPU_STATUS_PERF_CLK_MASK BIT_MASK(11) +#define VPU_HW_BTRS_LNL_VPU_STATUS_DISABLE_CLK_RELINQUISH_MASK BIT_MASK(12) + +#define VPU_HW_BTRS_LNL_IP_RESET 0x00000160u +#define VPU_HW_BTRS_LNL_IP_RESET_TRIGGER_MASK BIT_MASK(0) + +#define VPU_HW_BTRS_LNL_D0I3_CONTROL 0x00000164u +#define VPU_HW_BTRS_LNL_D0I3_CONTROL_INPROGRESS_MASK BIT_MASK(0) +#define VPU_HW_BTRS_LNL_D0I3_CONTROL_I3_MASK BIT_MASK(2) + +#define VPU_HW_BTRS_LNL_VPU_TELEMETRY_OFFSET 0x00000168u +#define VPU_HW_BTRS_LNL_VPU_TELEMETRY_SIZE 0x0000016cu +#define VPU_HW_BTRS_LNL_VPU_TELEMETRY_ENABLE 0x00000170u + +#define VPU_HW_BTRS_LNL_FMIN_FUSE 0x00000174u +#define VPU_HW_BTRS_LNL_FMIN_FUSE_MIN_RATIO_MASK GENMASK(7, 0) +#define VPU_HW_BTRS_LNL_FMIN_FUSE_PN_RATIO_MASK GENMASK(15, 8) + +#define VPU_HW_BTRS_LNL_FMAX_FUSE 0x00000178u +#define VPU_HW_BTRS_LNL_FMAX_FUSE_MAX_RATIO_MASK GENMASK(7, 0) + +#endif /* __IVPU_HW_BTRS_LNL_REG_H__ */ diff --git a/drivers/accel/ivpu/ivpu_hw_btrs_mtl_reg.h b/drivers/accel/ivpu/ivpu_hw_btrs_mtl_reg.h new file mode 100644 index 000000000000..e93d539e066f --- /dev/null +++ b/drivers/accel/ivpu/ivpu_hw_btrs_mtl_reg.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020-2023 Intel Corporation + */ + +#ifndef __IVPU_HW_BTRS_MTL_REG_H__ +#define __IVPU_HW_BTRS_MTL_REG_H__ + +#include <linux/bits.h> + +#define VPU_HW_BTRS_MTL_INTERRUPT_TYPE 0x00000000u + +#define VPU_HW_BTRS_MTL_INTERRUPT_STAT 0x00000004u +#define VPU_HW_BTRS_MTL_INTERRUPT_STAT_FREQ_CHANGE_MASK BIT_MASK(0) +#define VPU_HW_BTRS_MTL_INTERRUPT_STAT_ATS_ERR_MASK BIT_MASK(1) +#define VPU_HW_BTRS_MTL_INTERRUPT_STAT_UFI_ERR_MASK BIT_MASK(2) + +#define VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD0 0x00000008u +#define VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD0_MIN_RATIO_MASK GENMASK(15, 0) +#define VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD0_MAX_RATIO_MASK GENMASK(31, 16) + +#define VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD1 0x0000000cu 
+#define VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD1_TARGET_RATIO_MASK GENMASK(15, 0) +#define VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD1_EPP_MASK GENMASK(31, 16) + +#define VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD2 0x00000010u +#define VPU_HW_BTRS_MTL_WP_REQ_PAYLOAD2_CONFIG_MASK GENMASK(15, 0) + +#define VPU_HW_BTRS_MTL_WP_REQ_CMD 0x00000014u +#define VPU_HW_BTRS_MTL_WP_REQ_CMD_SEND_MASK BIT_MASK(0) + +#define VPU_HW_BTRS_MTL_WP_DOWNLOAD 0x00000018u +#define VPU_HW_BTRS_MTL_WP_DOWNLOAD_TARGET_RATIO_MASK GENMASK(15, 0) + +#define VPU_HW_BTRS_MTL_CURRENT_PLL 0x0000001cu +#define VPU_HW_BTRS_MTL_CURRENT_PLL_RATIO_MASK GENMASK(15, 0) + +#define VPU_HW_BTRS_MTL_PLL_ENABLE 0x00000020u + +#define VPU_HW_BTRS_MTL_FMIN_FUSE 0x00000024u +#define VPU_HW_BTRS_MTL_FMIN_FUSE_MIN_RATIO_MASK GENMASK(7, 0) +#define VPU_HW_BTRS_MTL_FMIN_FUSE_PN_RATIO_MASK GENMASK(15, 8) + +#define VPU_HW_BTRS_MTL_FMAX_FUSE 0x00000028u +#define VPU_HW_BTRS_MTL_FMAX_FUSE_MAX_RATIO_MASK GENMASK(7, 0) + +#define VPU_HW_BTRS_MTL_TILE_FUSE 0x0000002cu +#define VPU_HW_BTRS_MTL_TILE_FUSE_VALID_MASK BIT_MASK(0) +#define VPU_HW_BTRS_MTL_TILE_FUSE_SKU_MASK GENMASK(3, 2) + +#define VPU_HW_BTRS_MTL_LOCAL_INT_MASK 0x00000030u +#define VPU_HW_BTRS_MTL_GLOBAL_INT_MASK 0x00000034u + +#define VPU_HW_BTRS_MTL_PLL_STATUS 0x00000040u +#define VPU_HW_BTRS_MTL_PLL_STATUS_LOCK_MASK BIT_MASK(1) + +#define VPU_HW_BTRS_MTL_VPU_STATUS 0x00000044u +#define VPU_HW_BTRS_MTL_VPU_STATUS_READY_MASK BIT_MASK(0) +#define VPU_HW_BTRS_MTL_VPU_STATUS_IDLE_MASK BIT_MASK(1) + +#define VPU_HW_BTRS_MTL_VPU_D0I3_CONTROL 0x00000060u +#define VPU_HW_BTRS_MTL_VPU_D0I3_CONTROL_INPROGRESS_MASK BIT_MASK(0) +#define VPU_HW_BTRS_MTL_VPU_D0I3_CONTROL_I3_MASK BIT_MASK(2) + +#define VPU_HW_BTRS_MTL_VPU_IP_RESET 0x00000050u +#define VPU_HW_BTRS_MTL_VPU_IP_RESET_TRIGGER_MASK BIT_MASK(0) + +#define VPU_HW_BTRS_MTL_VPU_TELEMETRY_OFFSET 0x00000080u +#define VPU_HW_BTRS_MTL_VPU_TELEMETRY_SIZE 0x00000084u +#define VPU_HW_BTRS_MTL_VPU_TELEMETRY_ENABLE 0x00000088u + +#define VPU_HW_BTRS_MTL_ATS_ERR_LOG_0 0x000000a0u +#define VPU_HW_BTRS_MTL_ATS_ERR_LOG_1 0x000000a4u +#define VPU_HW_BTRS_MTL_ATS_ERR_CLEAR 0x000000a8u + +#define VPU_HW_BTRS_MTL_UFI_ERR_LOG 0x000000b0u +#define VPU_HW_BTRS_MTL_UFI_ERR_LOG_CQ_ID_MASK GENMASK(11, 0) +#define VPU_HW_BTRS_MTL_UFI_ERR_LOG_AXI_ID_MASK GENMASK(19, 12) +#define VPU_HW_BTRS_MTL_UFI_ERR_LOG_OPCODE_MASK GENMASK(24, 20) + +#define VPU_HW_BTRS_MTL_UFI_ERR_CLEAR 0x000000b4u + +#endif /* __IVPU_HW_BTRS_MTL_REG_H__ */ diff --git a/drivers/accel/ivpu/ivpu_hw_ip.c b/drivers/accel/ivpu/ivpu_hw_ip.c new file mode 100644 index 000000000000..029dd065614b --- /dev/null +++ b/drivers/accel/ivpu/ivpu_hw_ip.c @@ -0,0 +1,1188 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#include "ivpu_drv.h" +#include "ivpu_fw.h" +#include "ivpu_hw.h" +#include "ivpu_hw_37xx_reg.h" +#include "ivpu_hw_40xx_reg.h" +#include "ivpu_hw_btrs.h" +#include "ivpu_hw_ip.h" +#include "ivpu_hw_reg_io.h" +#include "ivpu_mmu.h" +#include "ivpu_pm.h" + +#define PWR_ISLAND_STATUS_TIMEOUT_US (5 * USEC_PER_MSEC) + +#define TIM_SAFE_ENABLE 0xf1d0dead +#define TIM_WATCHDOG_RESET_VALUE 0xffffffff + +#define ICB_0_IRQ_MASK_37XX ((REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, HOST_IPC_FIFO_INT)) | \ + (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_0_INT)) | \ + (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_1_INT)) | \ + (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_2_INT)) | \ + (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT)) | \ + (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, 
CPU_INT_REDIRECT_0_INT)) | \ + (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT))) + +#define ICB_1_IRQ_MASK_37XX ((REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_2_INT)) | \ + (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_3_INT)) | \ + (REG_FLD(VPU_37XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_4_INT))) + +#define ICB_0_1_IRQ_MASK_37XX ((((u64)ICB_1_IRQ_MASK_37XX) << 32) | ICB_0_IRQ_MASK_37XX) + +#define ICB_0_IRQ_MASK_40XX ((REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, HOST_IPC_FIFO_INT)) | \ + (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_0_INT)) | \ + (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_1_INT)) | \ + (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_2_INT)) | \ + (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT)) | \ + (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT)) | \ + (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT))) + +#define ICB_1_IRQ_MASK_40XX ((REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_2_INT)) | \ + (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_3_INT)) | \ + (REG_FLD(VPU_40XX_HOST_SS_ICB_STATUS_1, CPU_INT_REDIRECT_4_INT))) + +#define ICB_0_1_IRQ_MASK_40XX ((((u64)ICB_1_IRQ_MASK_40XX) << 32) | ICB_0_IRQ_MASK_40XX) + +#define ITF_FIREWALL_VIOLATION_MASK_37XX ((REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, CSS_ROM_CMX)) | \ + (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, CSS_DBG)) | \ + (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, CSS_CTRL)) | \ + (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, DEC400)) | \ + (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, MSS_NCE)) | \ + (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, MSS_MBI)) | \ + (REG_FLD(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, MSS_MBI_CMX))) + +#define ITF_FIREWALL_VIOLATION_MASK_40XX ((REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, CSS_ROM_CMX)) | \ + (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, CSS_DBG)) | \ + (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, CSS_CTRL)) | \ + (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, DEC400)) | \ + (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, MSS_NCE)) | \ + (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, MSS_MBI)) | \ + (REG_FLD(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, MSS_MBI_CMX))) + +static int wait_for_ip_bar(struct ivpu_device *vdev) +{ + return REGV_POLL_FLD(VPU_37XX_HOST_SS_CPR_RST_CLR, AON, 0, 100); +} + +static void host_ss_rst_clr(struct ivpu_device *vdev) +{ + u32 val = 0; + + val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_CLR, TOP_NOC, val); + val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_CLR, DSS_MAS, val); + val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_CLR, MSS_MAS, val); + + REGV_WR32(VPU_37XX_HOST_SS_CPR_RST_CLR, val); +} + +static int host_ss_noc_qreqn_check_37xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_NOC_QREQN); + + if (!REG_TEST_FLD_NUM(VPU_37XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, exp_val, val)) + return -EIO; + + return 0; +} + +static int host_ss_noc_qreqn_check_40xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_NOC_QREQN); + + if (!REG_TEST_FLD_NUM(VPU_40XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, exp_val, val)) + return -EIO; + + return 0; +} + +static int host_ss_noc_qreqn_check(struct ivpu_device *vdev, u32 exp_val) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return host_ss_noc_qreqn_check_37xx(vdev, exp_val); + else + return host_ss_noc_qreqn_check_40xx(vdev, exp_val); +} + +static int host_ss_noc_qacceptn_check_37xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_NOC_QACCEPTN); + + if (!REG_TEST_FLD_NUM(VPU_37XX_HOST_SS_NOC_QACCEPTN, TOP_SOCMMIO, 
exp_val, val)) + return -EIO; + + return 0; +} + +static int host_ss_noc_qacceptn_check_40xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_NOC_QACCEPTN); + + if (!REG_TEST_FLD_NUM(VPU_40XX_HOST_SS_NOC_QACCEPTN, TOP_SOCMMIO, exp_val, val)) + return -EIO; + + return 0; +} + +static int host_ss_noc_qacceptn_check(struct ivpu_device *vdev, u32 exp_val) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return host_ss_noc_qacceptn_check_37xx(vdev, exp_val); + else + return host_ss_noc_qacceptn_check_40xx(vdev, exp_val); +} + +static int host_ss_noc_qdeny_check_37xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_NOC_QDENY); + + if (!REG_TEST_FLD_NUM(VPU_37XX_HOST_SS_NOC_QDENY, TOP_SOCMMIO, exp_val, val)) + return -EIO; + + return 0; +} + +static int host_ss_noc_qdeny_check_40xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_NOC_QDENY); + + if (!REG_TEST_FLD_NUM(VPU_40XX_HOST_SS_NOC_QDENY, TOP_SOCMMIO, exp_val, val)) + return -EIO; + + return 0; +} + +static int host_ss_noc_qdeny_check(struct ivpu_device *vdev, u32 exp_val) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return host_ss_noc_qdeny_check_37xx(vdev, exp_val); + else + return host_ss_noc_qdeny_check_40xx(vdev, exp_val); +} + +static int top_noc_qrenqn_check_37xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_37XX_TOP_NOC_QREQN); + + if (!REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QREQN, CPU_CTRL, exp_val, val) || + !REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, exp_val, val)) + return -EIO; + + return 0; +} + +static int top_noc_qrenqn_check_40xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_40XX_TOP_NOC_QREQN); + + if (!REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QREQN, CPU_CTRL, exp_val, val) || + !REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, exp_val, val)) + return -EIO; + + return 0; +} + +static int top_noc_qreqn_check(struct ivpu_device *vdev, u32 exp_val) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return top_noc_qrenqn_check_37xx(vdev, exp_val); + else + return top_noc_qrenqn_check_40xx(vdev, exp_val); +} + +int ivpu_hw_ip_host_ss_configure(struct ivpu_device *vdev) +{ + int ret; + + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) { + ret = wait_for_ip_bar(vdev); + if (ret) { + ivpu_err(vdev, "Timed out waiting for NPU IP bar\n"); + return ret; + } + host_ss_rst_clr(vdev); + } + + ret = host_ss_noc_qreqn_check(vdev, 0x0); + if (ret) { + ivpu_err(vdev, "Failed qreqn check: %d\n", ret); + return ret; + } + + ret = host_ss_noc_qacceptn_check(vdev, 0x0); + if (ret) { + ivpu_err(vdev, "Failed qacceptn check: %d\n", ret); + return ret; + } + + ret = host_ss_noc_qdeny_check(vdev, 0x0); + if (ret) + ivpu_err(vdev, "Failed qdeny check %d\n", ret); + + return ret; +} + +static void idle_gen_drive_37xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_AON_VPU_IDLE_GEN); + + if (enable) + val = REG_SET_FLD(VPU_37XX_HOST_SS_AON_VPU_IDLE_GEN, EN, val); + else + val = REG_CLR_FLD(VPU_37XX_HOST_SS_AON_VPU_IDLE_GEN, EN, val); + + REGV_WR32(VPU_37XX_HOST_SS_AON_VPU_IDLE_GEN, val); +} + +static void idle_gen_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_AON_IDLE_GEN); + + if (enable) + val = REG_SET_FLD(VPU_40XX_HOST_SS_AON_IDLE_GEN, EN, val); + else + val = REG_CLR_FLD(VPU_40XX_HOST_SS_AON_IDLE_GEN, EN, val); + + REGV_WR32(VPU_40XX_HOST_SS_AON_IDLE_GEN, val); +} + +void ivpu_hw_ip_idle_gen_enable(struct 
ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + idle_gen_drive_37xx(vdev, true); + else + idle_gen_drive_40xx(vdev, true); +} + +void ivpu_hw_ip_idle_gen_disable(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + idle_gen_drive_37xx(vdev, false); + else + idle_gen_drive_40xx(vdev, false); +} + +static void +pwr_island_delay_set_50xx(struct ivpu_device *vdev, u32 post, u32 post1, u32 post2, u32 status) +{ + u32 val; + + val = REGV_RD32(VPU_50XX_HOST_SS_AON_PWR_ISLAND_EN_POST_DLY); + val = REG_SET_FLD_NUM(VPU_50XX_HOST_SS_AON_PWR_ISLAND_EN_POST_DLY, POST_DLY, post, val); + val = REG_SET_FLD_NUM(VPU_50XX_HOST_SS_AON_PWR_ISLAND_EN_POST_DLY, POST1_DLY, post1, val); + val = REG_SET_FLD_NUM(VPU_50XX_HOST_SS_AON_PWR_ISLAND_EN_POST_DLY, POST2_DLY, post2, val); + REGV_WR32(VPU_50XX_HOST_SS_AON_PWR_ISLAND_EN_POST_DLY, val); + + val = REGV_RD32(VPU_50XX_HOST_SS_AON_PWR_ISLAND_STATUS_DLY); + val = REG_SET_FLD_NUM(VPU_50XX_HOST_SS_AON_PWR_ISLAND_STATUS_DLY, STATUS_DLY, status, val); + REGV_WR32(VPU_50XX_HOST_SS_AON_PWR_ISLAND_STATUS_DLY, val); +} + +static void pwr_island_trickle_drive_37xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0); + + if (enable) + val = REG_SET_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, MSS_CPU, val); + else + val = REG_CLR_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, MSS_CPU, val); + + REGV_WR32(VPU_37XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, val); +} + +static void pwr_island_trickle_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0); + + if (enable) + val = REG_SET_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, CSS_CPU, val); + else + val = REG_CLR_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, CSS_CPU, val); + + REGV_WR32(VPU_40XX_HOST_SS_AON_PWR_ISLAND_TRICKLE_EN0, val); +} + +static void pwr_island_drive_37xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_AON_PWR_ISLAND_EN0); + + if (enable) + val = REG_SET_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_EN0, MSS_CPU, val); + else + val = REG_CLR_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_EN0, MSS_CPU, val); + + REGV_WR32(VPU_37XX_HOST_SS_AON_PWR_ISLAND_EN0, val); +} + +static void pwr_island_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_AON_PWR_ISLAND_EN0); + + if (enable) + val = REG_SET_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_EN0, CSS_CPU, val); + else + val = REG_CLR_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_EN0, CSS_CPU, val); + + REGV_WR32(VPU_40XX_HOST_SS_AON_PWR_ISLAND_EN0, val); +} + +static void pwr_island_enable(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) { + pwr_island_trickle_drive_37xx(vdev, true); + ndelay(500); + pwr_island_drive_37xx(vdev, true); + } else { + pwr_island_trickle_drive_40xx(vdev, true); + ndelay(500); + pwr_island_drive_40xx(vdev, true); + } +} + +static int wait_for_pwr_island_status(struct ivpu_device *vdev, u32 exp_val) +{ + if (IVPU_WA(punit_disabled)) + return 0; + + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return REGV_POLL_FLD(VPU_37XX_HOST_SS_AON_PWR_ISLAND_STATUS0, MSS_CPU, exp_val, + PWR_ISLAND_STATUS_TIMEOUT_US); + else + return REGV_POLL_FLD(VPU_40XX_HOST_SS_AON_PWR_ISLAND_STATUS0, CSS_CPU, exp_val, + PWR_ISLAND_STATUS_TIMEOUT_US); +} + +static void pwr_island_isolation_drive_37xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_AON_PWR_ISO_EN0); + + if (enable) + val =
REG_SET_FLD(VPU_37XX_HOST_SS_AON_PWR_ISO_EN0, MSS_CPU, val); + else + val = REG_CLR_FLD(VPU_37XX_HOST_SS_AON_PWR_ISO_EN0, MSS_CPU, val); + + REGV_WR32(VPU_37XX_HOST_SS_AON_PWR_ISO_EN0, val); +} + +static void pwr_island_isolation_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_AON_PWR_ISO_EN0); + + if (enable) + val = REG_SET_FLD(VPU_40XX_HOST_SS_AON_PWR_ISO_EN0, CSS_CPU, val); + else + val = REG_CLR_FLD(VPU_40XX_HOST_SS_AON_PWR_ISO_EN0, CSS_CPU, val); + + REGV_WR32(VPU_40XX_HOST_SS_AON_PWR_ISO_EN0, val); +} + +static void pwr_island_isolation_drive(struct ivpu_device *vdev, bool enable) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + pwr_island_isolation_drive_37xx(vdev, enable); + else + pwr_island_isolation_drive_40xx(vdev, enable); +} + +static void pwr_island_isolation_disable(struct ivpu_device *vdev) +{ + pwr_island_isolation_drive(vdev, false); +} + +static void host_ss_clk_drive_37xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_CPR_CLK_SET); + + if (enable) { + val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, TOP_NOC, val); + val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, DSS_MAS, val); + val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, MSS_MAS, val); + } else { + val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, TOP_NOC, val); + val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, DSS_MAS, val); + val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_CLK_SET, MSS_MAS, val); + } + + REGV_WR32(VPU_37XX_HOST_SS_CPR_CLK_SET, val); +} + +static void host_ss_clk_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_CPR_CLK_EN); + + if (enable) { + val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, TOP_NOC, val); + val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, DSS_MAS, val); + val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, CSS_MAS, val); + } else { + val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, TOP_NOC, val); + val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, DSS_MAS, val); + val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_CLK_EN, CSS_MAS, val); + } + + REGV_WR32(VPU_40XX_HOST_SS_CPR_CLK_EN, val); +} + +static void host_ss_clk_drive(struct ivpu_device *vdev, bool enable) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + host_ss_clk_drive_37xx(vdev, enable); + else + host_ss_clk_drive_40xx(vdev, enable); +} + +static void host_ss_clk_enable(struct ivpu_device *vdev) +{ + host_ss_clk_drive(vdev, true); +} + +static void host_ss_rst_drive_37xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_CPR_RST_SET); + + if (enable) { + val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, TOP_NOC, val); + val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, DSS_MAS, val); + val = REG_SET_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, MSS_MAS, val); + } else { + val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, TOP_NOC, val); + val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, DSS_MAS, val); + val = REG_CLR_FLD(VPU_37XX_HOST_SS_CPR_RST_SET, MSS_MAS, val); + } + + REGV_WR32(VPU_37XX_HOST_SS_CPR_RST_SET, val); +} + +static void host_ss_rst_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_CPR_RST_EN); + + if (enable) { + val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, TOP_NOC, val); + val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, DSS_MAS, val); + val = REG_SET_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, CSS_MAS, val); + } else { + val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, TOP_NOC, val); + val = REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, DSS_MAS, val); + val = 
REG_CLR_FLD(VPU_40XX_HOST_SS_CPR_RST_EN, CSS_MAS, val); + } + + REGV_WR32(VPU_40XX_HOST_SS_CPR_RST_EN, val); +} + +static void host_ss_rst_drive(struct ivpu_device *vdev, bool enable) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + host_ss_rst_drive_37xx(vdev, enable); + else + host_ss_rst_drive_40xx(vdev, enable); +} + +static void host_ss_rst_enable(struct ivpu_device *vdev) +{ + host_ss_rst_drive(vdev, true); +} + +static void host_ss_noc_qreqn_top_socmmio_drive_37xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_NOC_QREQN); + + if (enable) + val = REG_SET_FLD(VPU_37XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, val); + else + val = REG_CLR_FLD(VPU_37XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, val); + REGV_WR32(VPU_37XX_HOST_SS_NOC_QREQN, val); +} + +static void host_ss_noc_qreqn_top_socmmio_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_SS_NOC_QREQN); + + if (enable) + val = REG_SET_FLD(VPU_40XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, val); + else + val = REG_CLR_FLD(VPU_40XX_HOST_SS_NOC_QREQN, TOP_SOCMMIO, val); + REGV_WR32(VPU_40XX_HOST_SS_NOC_QREQN, val); +} + +static void host_ss_noc_qreqn_top_socmmio_drive(struct ivpu_device *vdev, bool enable) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + host_ss_noc_qreqn_top_socmmio_drive_37xx(vdev, enable); + else + host_ss_noc_qreqn_top_socmmio_drive_40xx(vdev, enable); +} + +static int host_ss_axi_drive(struct ivpu_device *vdev, bool enable) +{ + int ret; + + host_ss_noc_qreqn_top_socmmio_drive(vdev, enable); + + ret = host_ss_noc_qacceptn_check(vdev, enable ? 0x1 : 0x0); + if (ret) { + ivpu_err(vdev, "Failed HOST SS NOC QACCEPTN check: %d\n", ret); + return ret; + } + + ret = host_ss_noc_qdeny_check(vdev, 0x0); + if (ret) + ivpu_err(vdev, "Failed HOST SS NOC QDENY check: %d\n", ret); + + return ret; +} + +static void top_noc_qreqn_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_40XX_TOP_NOC_QREQN); + + if (enable) { + val = REG_SET_FLD(VPU_40XX_TOP_NOC_QREQN, CPU_CTRL, val); + val = REG_SET_FLD(VPU_40XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, val); + } else { + val = REG_CLR_FLD(VPU_40XX_TOP_NOC_QREQN, CPU_CTRL, val); + val = REG_CLR_FLD(VPU_40XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, val); + } + + REGV_WR32(VPU_40XX_TOP_NOC_QREQN, val); +} + +static void top_noc_qreqn_drive_37xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_37XX_TOP_NOC_QREQN); + + if (enable) { + val = REG_SET_FLD(VPU_37XX_TOP_NOC_QREQN, CPU_CTRL, val); + val = REG_SET_FLD(VPU_37XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, val); + } else { + val = REG_CLR_FLD(VPU_37XX_TOP_NOC_QREQN, CPU_CTRL, val); + val = REG_CLR_FLD(VPU_37XX_TOP_NOC_QREQN, HOSTIF_L2CACHE, val); + } + + REGV_WR32(VPU_37XX_TOP_NOC_QREQN, val); +} + +static void top_noc_qreqn_drive(struct ivpu_device *vdev, bool enable) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + top_noc_qreqn_drive_37xx(vdev, enable); + else + top_noc_qreqn_drive_40xx(vdev, enable); +} + +int ivpu_hw_ip_host_ss_axi_enable(struct ivpu_device *vdev) +{ + return host_ss_axi_drive(vdev, true); +} + +static int top_noc_qacceptn_check_37xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_37XX_TOP_NOC_QACCEPTN); + + if (!REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QACCEPTN, CPU_CTRL, exp_val, val) || + !REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QACCEPTN, HOSTIF_L2CACHE, exp_val, val)) + return -EIO; + + return 0; +} + +static int top_noc_qacceptn_check_40xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_40XX_TOP_NOC_QACCEPTN); + + if 
(!REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QACCEPTN, CPU_CTRL, exp_val, val) || + !REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QACCEPTN, HOSTIF_L2CACHE, exp_val, val)) + return -EIO; + + return 0; +} + +static int top_noc_qacceptn_check(struct ivpu_device *vdev, u32 exp_val) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return top_noc_qacceptn_check_37xx(vdev, exp_val); + else + return top_noc_qacceptn_check_40xx(vdev, exp_val); +} + +static int top_noc_qdeny_check_37xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_37XX_TOP_NOC_QDENY); + + if (!REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QDENY, CPU_CTRL, exp_val, val) || + !REG_TEST_FLD_NUM(VPU_37XX_TOP_NOC_QDENY, HOSTIF_L2CACHE, exp_val, val)) + return -EIO; + + return 0; +} + +static int top_noc_qdeny_check_40xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_40XX_TOP_NOC_QDENY); + + if (!REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QDENY, CPU_CTRL, exp_val, val) || + !REG_TEST_FLD_NUM(VPU_40XX_TOP_NOC_QDENY, HOSTIF_L2CACHE, exp_val, val)) + return -EIO; + + return 0; +} + +static int top_noc_qdeny_check(struct ivpu_device *vdev, u32 exp_val) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return top_noc_qdeny_check_37xx(vdev, exp_val); + else + return top_noc_qdeny_check_40xx(vdev, exp_val); +} + +static int top_noc_drive(struct ivpu_device *vdev, bool enable) +{ + int ret; + + top_noc_qreqn_drive(vdev, enable); + + ret = top_noc_qacceptn_check(vdev, enable ? 0x1 : 0x0); + if (ret) { + ivpu_err(vdev, "Failed TOP NOC QACCEPTN check: %d\n", ret); + return ret; + } + + ret = top_noc_qdeny_check(vdev, 0x0); + if (ret) + ivpu_err(vdev, "Failed TOP NOC QDENY check: %d\n", ret); + + return ret; +} + +int ivpu_hw_ip_top_noc_enable(struct ivpu_device *vdev) +{ + return top_noc_drive(vdev, true); +} + +static void dpu_active_drive_37xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_SS_AON_DPU_ACTIVE); + + if (enable) + val = REG_SET_FLD(VPU_37XX_HOST_SS_AON_DPU_ACTIVE, DPU_ACTIVE, val); + else + val = REG_CLR_FLD(VPU_37XX_HOST_SS_AON_DPU_ACTIVE, DPU_ACTIVE, val); + + REGV_WR32(VPU_37XX_HOST_SS_AON_DPU_ACTIVE, val); +} + +static void pwr_island_delay_set(struct ivpu_device *vdev) +{ + bool high = vdev->hw->pll.profiling_freq == PLL_PROFILING_FREQ_HIGH; + u32 post, post1, post2, status; + + if (ivpu_hw_ip_gen(vdev) < IVPU_HW_IP_50XX) + return; + + switch (ivpu_device_id(vdev)) { + case PCI_DEVICE_ID_PTL_P: + post = high ? 18 : 0; + post1 = 0; + post2 = 0; + status = high ? 
46 : 3; + break; + + default: + dump_stack(); + ivpu_err(vdev, "Unknown device ID\n"); + return; + } + + pwr_island_delay_set_50xx(vdev, post, post1, post2, status); +} + +int ivpu_hw_ip_pwr_domain_enable(struct ivpu_device *vdev) +{ + int ret; + + pwr_island_delay_set(vdev); + pwr_island_enable(vdev); + + ret = wait_for_pwr_island_status(vdev, 0x1); + if (ret) { + ivpu_err(vdev, "Timed out waiting for power island status\n"); + return ret; + } + + ret = top_noc_qreqn_check(vdev, 0x0); + if (ret) { + ivpu_err(vdev, "Failed TOP NOC QREQN check %d\n", ret); + return ret; + } + + host_ss_clk_enable(vdev); + pwr_island_isolation_disable(vdev); + host_ss_rst_enable(vdev); + + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + dpu_active_drive_37xx(vdev, true); + + return ret; +} + +u64 ivpu_hw_ip_read_perf_timer_counter(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return REGV_RD64(VPU_37XX_CPU_SS_TIM_PERF_FREE_CNT); + else + return REGV_RD64(VPU_40XX_CPU_SS_TIM_PERF_EXT_FREE_CNT); +} + +static void ivpu_hw_ip_snoop_disable_37xx(struct ivpu_device *vdev) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES); + + val = REG_SET_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, NOSNOOP_OVERRIDE_EN, val); + val = REG_CLR_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, AW_NOSNOOP_OVERRIDE, val); + + if (ivpu_is_force_snoop_enabled(vdev)) + val = REG_CLR_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, AR_NOSNOOP_OVERRIDE, val); + else + val = REG_SET_FLD(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, AR_NOSNOOP_OVERRIDE, val); + + REGV_WR32(VPU_37XX_HOST_IF_TCU_PTW_OVERRIDES, val); +} + +static void ivpu_hw_ip_snoop_disable_40xx(struct ivpu_device *vdev) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES); + + val = REG_SET_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, SNOOP_OVERRIDE_EN, val); + val = REG_SET_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, AW_SNOOP_OVERRIDE, val); + + if (ivpu_is_force_snoop_enabled(vdev)) + val = REG_SET_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, AR_SNOOP_OVERRIDE, val); + else + val = REG_CLR_FLD(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, AR_SNOOP_OVERRIDE, val); + + REGV_WR32(VPU_40XX_HOST_IF_TCU_PTW_OVERRIDES, val); +} + +void ivpu_hw_ip_snoop_disable(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return ivpu_hw_ip_snoop_disable_37xx(vdev); + else + return ivpu_hw_ip_snoop_disable_40xx(vdev); +} + +static void ivpu_hw_ip_tbu_mmu_enable_37xx(struct ivpu_device *vdev) +{ + u32 val = REGV_RD32(VPU_37XX_HOST_IF_TBU_MMUSSIDV); + + val = REG_SET_FLD(VPU_37XX_HOST_IF_TBU_MMUSSIDV, TBU0_AWMMUSSIDV, val); + val = REG_SET_FLD(VPU_37XX_HOST_IF_TBU_MMUSSIDV, TBU0_ARMMUSSIDV, val); + val = REG_SET_FLD(VPU_37XX_HOST_IF_TBU_MMUSSIDV, TBU2_AWMMUSSIDV, val); + val = REG_SET_FLD(VPU_37XX_HOST_IF_TBU_MMUSSIDV, TBU2_ARMMUSSIDV, val); + + REGV_WR32(VPU_37XX_HOST_IF_TBU_MMUSSIDV, val); +} + +static void ivpu_hw_ip_tbu_mmu_enable_40xx(struct ivpu_device *vdev) +{ + u32 val = REGV_RD32(VPU_40XX_HOST_IF_TBU_MMUSSIDV); + + val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU0_AWMMUSSIDV, val); + val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU0_ARMMUSSIDV, val); + val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU1_AWMMUSSIDV, val); + val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU1_ARMMUSSIDV, val); + val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU2_AWMMUSSIDV, val); + val = REG_SET_FLD(VPU_40XX_HOST_IF_TBU_MMUSSIDV, TBU2_ARMMUSSIDV, val); + + REGV_WR32(VPU_40XX_HOST_IF_TBU_MMUSSIDV, val); +} + +void ivpu_hw_ip_tbu_mmu_enable(struct ivpu_device *vdev) +{ + 
if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return ivpu_hw_ip_tbu_mmu_enable_37xx(vdev); + else + return ivpu_hw_ip_tbu_mmu_enable_40xx(vdev); +} + +static int soc_cpu_boot_37xx(struct ivpu_device *vdev) +{ + u32 val; + + val = REGV_RD32(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC); + val = REG_SET_FLD(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, IRQI_RSTRUN0, val); + + val = REG_CLR_FLD(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, IRQI_RSTVEC, val); + REGV_WR32(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, val); + + val = REG_SET_FLD(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, IRQI_RESUME0, val); + REGV_WR32(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, val); + + val = REG_CLR_FLD(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, IRQI_RESUME0, val); + REGV_WR32(VPU_37XX_CPU_SS_MSSCPU_CPR_LEON_RT_VEC, val); + + val = vdev->fw->entry_point >> 9; + REGV_WR32(VPU_37XX_HOST_SS_LOADING_ADDRESS_LO, val); + + val = REG_SET_FLD(VPU_37XX_HOST_SS_LOADING_ADDRESS_LO, DONE, val); + REGV_WR32(VPU_37XX_HOST_SS_LOADING_ADDRESS_LO, val); + + ivpu_dbg(vdev, PM, "Booting firmware, mode: %s\n", + vdev->fw->entry_point == vdev->fw->cold_boot_entry_point ? "cold boot" : "resume"); + + return 0; +} + +static int cpu_noc_qacceptn_check_40xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_40XX_CPU_SS_CPR_NOC_QACCEPTN); + + if (!REG_TEST_FLD_NUM(VPU_40XX_CPU_SS_CPR_NOC_QACCEPTN, TOP_MMIO, exp_val, val)) + return -EIO; + + return 0; +} + +static int cpu_noc_qdeny_check_40xx(struct ivpu_device *vdev, u32 exp_val) +{ + u32 val = REGV_RD32(VPU_40XX_CPU_SS_CPR_NOC_QDENY); + + if (!REG_TEST_FLD_NUM(VPU_40XX_CPU_SS_CPR_NOC_QDENY, TOP_MMIO, exp_val, val)) + return -EIO; + + return 0; +} + +static void cpu_noc_top_mmio_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + u32 val = REGV_RD32(VPU_40XX_CPU_SS_CPR_NOC_QREQN); + + if (enable) + val = REG_SET_FLD(VPU_40XX_CPU_SS_CPR_NOC_QREQN, TOP_MMIO, val); + else + val = REG_CLR_FLD(VPU_40XX_CPU_SS_CPR_NOC_QREQN, TOP_MMIO, val); + REGV_WR32(VPU_40XX_CPU_SS_CPR_NOC_QREQN, val); +} + +static int soc_cpu_drive_40xx(struct ivpu_device *vdev, bool enable) +{ + int ret; + + cpu_noc_top_mmio_drive_40xx(vdev, enable); + + ret = cpu_noc_qacceptn_check_40xx(vdev, enable ? 0x1 : 0x0); + if (ret) { + ivpu_err(vdev, "Failed qacceptn check: %d\n", ret); + return ret; + } + + ret = cpu_noc_qdeny_check_40xx(vdev, 0x0); + if (ret) + ivpu_err(vdev, "Failed qdeny check: %d\n", ret); + + return ret; +} + +static int soc_cpu_enable(struct ivpu_device *vdev) +{ + return soc_cpu_drive_40xx(vdev, true); +} + +static int soc_cpu_boot_40xx(struct ivpu_device *vdev) +{ + int ret; + u32 val; + u64 val64; + + ret = soc_cpu_enable(vdev); + if (ret) { + ivpu_err(vdev, "Failed to enable SOC CPU: %d\n", ret); + return ret; + } + + val64 = vdev->fw->entry_point; + val64 <<= ffs(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO_IMAGE_LOCATION_MASK) - 1; + REGV_WR64(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO, val64); + + val = REGV_RD32(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO); + val = REG_SET_FLD(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO, DONE, val); + REGV_WR32(VPU_40XX_HOST_SS_VERIFICATION_ADDRESS_LO, val); + + ivpu_dbg(vdev, PM, "Booting firmware, mode: %s\n", + ivpu_fw_is_cold_boot(vdev) ? 
"cold boot" : "resume"); + + return 0; +} + +int ivpu_hw_ip_soc_cpu_boot(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return soc_cpu_boot_37xx(vdev); + else + return soc_cpu_boot_40xx(vdev); +} + +static void wdt_disable_37xx(struct ivpu_device *vdev) +{ + u32 val; + + /* Enable writing and set non-zero WDT value */ + REGV_WR32(VPU_37XX_CPU_SS_TIM_SAFE, TIM_SAFE_ENABLE); + REGV_WR32(VPU_37XX_CPU_SS_TIM_WATCHDOG, TIM_WATCHDOG_RESET_VALUE); + + /* Enable writing and disable watchdog timer */ + REGV_WR32(VPU_37XX_CPU_SS_TIM_SAFE, TIM_SAFE_ENABLE); + REGV_WR32(VPU_37XX_CPU_SS_TIM_WDOG_EN, 0); + + /* Now clear the timeout interrupt */ + val = REGV_RD32(VPU_37XX_CPU_SS_TIM_GEN_CONFIG); + val = REG_CLR_FLD(VPU_37XX_CPU_SS_TIM_GEN_CONFIG, WDOG_TO_INT_CLR, val); + REGV_WR32(VPU_37XX_CPU_SS_TIM_GEN_CONFIG, val); +} + +static void wdt_disable_40xx(struct ivpu_device *vdev) +{ + u32 val; + + REGV_WR32(VPU_40XX_CPU_SS_TIM_SAFE, TIM_SAFE_ENABLE); + REGV_WR32(VPU_40XX_CPU_SS_TIM_WATCHDOG, TIM_WATCHDOG_RESET_VALUE); + + REGV_WR32(VPU_40XX_CPU_SS_TIM_SAFE, TIM_SAFE_ENABLE); + REGV_WR32(VPU_40XX_CPU_SS_TIM_WDOG_EN, 0); + + val = REGV_RD32(VPU_40XX_CPU_SS_TIM_GEN_CONFIG); + val = REG_CLR_FLD(VPU_40XX_CPU_SS_TIM_GEN_CONFIG, WDOG_TO_INT_CLR, val); + REGV_WR32(VPU_40XX_CPU_SS_TIM_GEN_CONFIG, val); +} + +void ivpu_hw_ip_wdt_disable(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return wdt_disable_37xx(vdev); + else + return wdt_disable_40xx(vdev); +} + +static u32 ipc_rx_count_get_37xx(struct ivpu_device *vdev) +{ + u32 count = REGV_RD32_SILENT(VPU_37XX_HOST_SS_TIM_IPC_FIFO_STAT); + + return REG_GET_FLD(VPU_37XX_HOST_SS_TIM_IPC_FIFO_STAT, FILL_LEVEL, count); +} + +static u32 ipc_rx_count_get_40xx(struct ivpu_device *vdev) +{ + u32 count = REGV_RD32_SILENT(VPU_40XX_HOST_SS_TIM_IPC_FIFO_STAT); + + return REG_GET_FLD(VPU_40XX_HOST_SS_TIM_IPC_FIFO_STAT, FILL_LEVEL, count); +} + +u32 ivpu_hw_ip_ipc_rx_count_get(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return ipc_rx_count_get_37xx(vdev); + else + return ipc_rx_count_get_40xx(vdev); +} + +void ivpu_hw_ip_irq_enable(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) { + REGV_WR32(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, ITF_FIREWALL_VIOLATION_MASK_37XX); + REGV_WR64(VPU_37XX_HOST_SS_ICB_ENABLE_0, ICB_0_1_IRQ_MASK_37XX); + } else { + REGV_WR32(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, ITF_FIREWALL_VIOLATION_MASK_40XX); + REGV_WR64(VPU_40XX_HOST_SS_ICB_ENABLE_0, ICB_0_1_IRQ_MASK_40XX); + } +} + +void ivpu_hw_ip_irq_disable(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) { + REGV_WR64(VPU_37XX_HOST_SS_ICB_ENABLE_0, 0x0ull); + REGV_WR32(VPU_37XX_HOST_SS_FW_SOC_IRQ_EN, 0x0); + } else { + REGV_WR64(VPU_40XX_HOST_SS_ICB_ENABLE_0, 0x0ull); + REGV_WR32(VPU_40XX_HOST_SS_FW_SOC_IRQ_EN, 0x0ul); + } +} + +static void diagnose_failure_37xx(struct ivpu_device *vdev) +{ + u32 reg = REGV_RD32(VPU_37XX_HOST_SS_ICB_STATUS_0) & ICB_0_IRQ_MASK_37XX; + + if (ipc_rx_count_get_37xx(vdev)) + ivpu_err(vdev, "IPC FIFO queue not empty, missed IPC IRQ"); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT, reg)) + ivpu_err(vdev, "WDT MSS timeout detected\n"); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT, reg)) + ivpu_err(vdev, "WDT NCE timeout detected\n"); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT, reg)) + ivpu_err(vdev, "NOC Firewall irq detected\n"); +} + +static void 
diagnose_failure_40xx(struct ivpu_device *vdev) +{ + u32 reg = REGV_RD32(VPU_40XX_HOST_SS_ICB_STATUS_0) & ICB_0_IRQ_MASK_40XX; + + if (ipc_rx_count_get_40xx(vdev)) + ivpu_err(vdev, "IPC FIFO queue not empty, missed IPC IRQ"); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT, reg)) + ivpu_err(vdev, "WDT MSS timeout detected\n"); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT, reg)) + ivpu_err(vdev, "WDT NCE timeout detected\n"); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT, reg)) + ivpu_err(vdev, "NOC Firewall irq detected\n"); +} + +void ivpu_hw_ip_diagnose_failure(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + diagnose_failure_37xx(vdev); + else + diagnose_failure_40xx(vdev); +} + +void ivpu_hw_ip_irq_clear(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + REGV_WR64(VPU_37XX_HOST_SS_ICB_CLEAR_0, ICB_0_1_IRQ_MASK_37XX); + else + REGV_WR64(VPU_40XX_HOST_SS_ICB_CLEAR_0, ICB_0_1_IRQ_MASK_40XX); +} + +static void irq_wdt_nce_handler(struct ivpu_device *vdev) +{ + ivpu_pm_trigger_recovery(vdev, "WDT NCE IRQ"); +} + +static void irq_wdt_mss_handler(struct ivpu_device *vdev) +{ + ivpu_hw_ip_wdt_disable(vdev); + ivpu_pm_trigger_recovery(vdev, "WDT MSS IRQ"); +} + +static void irq_noc_firewall_handler(struct ivpu_device *vdev) +{ + atomic_inc(&vdev->hw->firewall_irq_counter); + + ivpu_dbg(vdev, IRQ, "NOC Firewall interrupt detected, counter %d\n", + atomic_read(&vdev->hw->firewall_irq_counter)); +} + +/* Handler for IRQs from NPU core */ +bool ivpu_hw_ip_irq_handler_37xx(struct ivpu_device *vdev, int irq) +{ + u32 status = REGV_RD32(VPU_37XX_HOST_SS_ICB_STATUS_0) & ICB_0_IRQ_MASK_37XX; + + if (!status) + return false; + + REGV_WR32(VPU_37XX_HOST_SS_ICB_CLEAR_0, status); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_0_INT, status)) + ivpu_mmu_irq_evtq_handler(vdev); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, HOST_IPC_FIFO_INT, status)) + ivpu_ipc_irq_handler(vdev); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_1_INT, status)) + ivpu_dbg(vdev, IRQ, "MMU sync complete\n"); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_2_INT, status)) + ivpu_mmu_irq_gerr_handler(vdev); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT, status)) + irq_wdt_mss_handler(vdev); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_1_INT, status)) + irq_wdt_nce_handler(vdev); + + if (REG_TEST_FLD(VPU_37XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT, status)) + irq_noc_firewall_handler(vdev); + + return true; +} + +/* Handler for IRQs from NPU core */ +bool ivpu_hw_ip_irq_handler_40xx(struct ivpu_device *vdev, int irq) +{ + u32 status = REGV_RD32(VPU_40XX_HOST_SS_ICB_STATUS_0) & ICB_0_IRQ_MASK_40XX; + + if (!status) + return false; + + REGV_WR32(VPU_40XX_HOST_SS_ICB_CLEAR_0, status); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_0_INT, status)) + ivpu_mmu_irq_evtq_handler(vdev); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, HOST_IPC_FIFO_INT, status)) + ivpu_ipc_irq_handler(vdev); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_1_INT, status)) + ivpu_dbg(vdev, IRQ, "MMU sync complete\n"); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, MMU_IRQ_2_INT, status)) + ivpu_mmu_irq_gerr_handler(vdev); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, CPU_INT_REDIRECT_0_INT, status)) + irq_wdt_mss_handler(vdev); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, 
CPU_INT_REDIRECT_1_INT, status)) + irq_wdt_nce_handler(vdev); + + if (REG_TEST_FLD(VPU_40XX_HOST_SS_ICB_STATUS_0, NOC_FIREWALL_INT, status)) + irq_noc_firewall_handler(vdev); + + return true; +} + +static void db_set_37xx(struct ivpu_device *vdev, u32 db_id) +{ + u32 reg_stride = VPU_37XX_CPU_SS_DOORBELL_1 - VPU_37XX_CPU_SS_DOORBELL_0; + u32 val = REG_FLD(VPU_37XX_CPU_SS_DOORBELL_0, SET); + + REGV_WR32I(VPU_37XX_CPU_SS_DOORBELL_0, reg_stride, db_id, val); +} + +static void db_set_40xx(struct ivpu_device *vdev, u32 db_id) +{ + u32 reg_stride = VPU_40XX_CPU_SS_DOORBELL_1 - VPU_40XX_CPU_SS_DOORBELL_0; + u32 val = REG_FLD(VPU_40XX_CPU_SS_DOORBELL_0, SET); + + REGV_WR32I(VPU_40XX_CPU_SS_DOORBELL_0, reg_stride, db_id, val); +} + +void ivpu_hw_ip_db_set(struct ivpu_device *vdev, u32 db_id) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + db_set_37xx(vdev, db_id); + else + db_set_40xx(vdev, db_id); +} + +u32 ivpu_hw_ip_ipc_rx_addr_get(struct ivpu_device *vdev) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + return REGV_RD32(VPU_37XX_HOST_SS_TIM_IPC_FIFO_ATM); + else + return REGV_RD32(VPU_40XX_HOST_SS_TIM_IPC_FIFO_ATM); +} + +void ivpu_hw_ip_ipc_tx_set(struct ivpu_device *vdev, u32 vpu_addr) +{ + if (ivpu_hw_ip_gen(vdev) == IVPU_HW_IP_37XX) + REGV_WR32(VPU_37XX_CPU_SS_TIM_IPC_FIFO, vpu_addr); + else + REGV_WR32(VPU_40XX_CPU_SS_TIM_IPC_FIFO, vpu_addr); +} diff --git a/drivers/accel/ivpu/ivpu_hw_ip.h b/drivers/accel/ivpu/ivpu_hw_ip.h new file mode 100644 index 000000000000..5b1b391aa577 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_hw_ip.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#ifndef __IVPU_HW_IP_H__ +#define __IVPU_HW_IP_H__ + +#include "ivpu_drv.h" + +int ivpu_hw_ip_host_ss_configure(struct ivpu_device *vdev); +void ivpu_hw_ip_idle_gen_enable(struct ivpu_device *vdev); +void ivpu_hw_ip_idle_gen_disable(struct ivpu_device *vdev); +int ivpu_hw_ip_pwr_domain_enable(struct ivpu_device *vdev); +int ivpu_hw_ip_host_ss_axi_enable(struct ivpu_device *vdev); +int ivpu_hw_ip_top_noc_enable(struct ivpu_device *vdev); +u64 ivpu_hw_ip_read_perf_timer_counter(struct ivpu_device *vdev); +void ivpu_hw_ip_snoop_disable(struct ivpu_device *vdev); +void ivpu_hw_ip_tbu_mmu_enable(struct ivpu_device *vdev); +int ivpu_hw_ip_soc_cpu_boot(struct ivpu_device *vdev); +void ivpu_hw_ip_wdt_disable(struct ivpu_device *vdev); +void ivpu_hw_ip_diagnose_failure(struct ivpu_device *vdev); +u32 ivpu_hw_ip_ipc_rx_count_get(struct ivpu_device *vdev); +void ivpu_hw_ip_irq_clear(struct ivpu_device *vdev); +bool ivpu_hw_ip_irq_handler_37xx(struct ivpu_device *vdev, int irq); +bool ivpu_hw_ip_irq_handler_40xx(struct ivpu_device *vdev, int irq); +void ivpu_hw_ip_db_set(struct ivpu_device *vdev, u32 db_id); +u32 ivpu_hw_ip_ipc_rx_addr_get(struct ivpu_device *vdev); +void ivpu_hw_ip_ipc_tx_set(struct ivpu_device *vdev, u32 vpu_addr); +void ivpu_hw_ip_irq_enable(struct ivpu_device *vdev); +void ivpu_hw_ip_irq_disable(struct ivpu_device *vdev); +void ivpu_hw_ip_diagnose_failure(struct ivpu_device *vdev); +void ivpu_hw_ip_fabric_req_override_enable_50xx(struct ivpu_device *vdev); +void ivpu_hw_ip_fabric_req_override_disable_50xx(struct ivpu_device *vdev); + +#endif /* __IVPU_HW_IP_H__ */ diff --git a/drivers/accel/ivpu/ivpu_ipc.c b/drivers/accel/ivpu/ivpu_ipc.c index fa66c39b57ec..01ebf88fe6ef 100644 --- a/drivers/accel/ivpu/ivpu_ipc.c +++ b/drivers/accel/ivpu/ivpu_ipc.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 
2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include <linux/genalloc.h> @@ -15,6 +15,7 @@ #include "ivpu_ipc.h" #include "ivpu_jsm_msg.h" #include "ivpu_pm.h" +#include "ivpu_trace.h" #define IPC_MAX_RX_MSG 128 @@ -58,8 +59,8 @@ static void ivpu_ipc_mem_fini(struct ivpu_device *vdev) { struct ivpu_ipc_info *ipc = vdev->ipc; - ivpu_bo_free_internal(ipc->mem_rx); - ivpu_bo_free_internal(ipc->mem_tx); + ivpu_bo_free(ipc->mem_rx); + ivpu_bo_free(ipc->mem_tx); } static int @@ -129,7 +130,7 @@ static void ivpu_ipc_tx_release(struct ivpu_device *vdev, u32 vpu_addr) static void ivpu_ipc_tx(struct ivpu_device *vdev, u32 vpu_addr) { - ivpu_hw_reg_ipc_tx_set(vdev, vpu_addr); + ivpu_hw_ipc_tx_set(vdev, vpu_addr); } static void @@ -210,8 +211,7 @@ void ivpu_ipc_consumer_del(struct ivpu_device *vdev, struct ivpu_ipc_consumer *c ivpu_ipc_tx_release(vdev, cons->tx_vpu_addr); } -static int -ivpu_ipc_send(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons, struct vpu_jsm_msg *req) +int ivpu_ipc_send(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons, struct vpu_jsm_msg *req) { struct ivpu_ipc_info *ipc = vdev->ipc; int ret; @@ -228,6 +228,7 @@ ivpu_ipc_send(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons, struct v goto unlock; ivpu_ipc_tx(vdev, cons->tx_vpu_addr); + trace_jsm("[tx]", req); unlock: mutex_unlock(&ipc->lock); @@ -279,12 +280,13 @@ int ivpu_ipc_receive(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons, u32 size = min_t(int, rx_msg->ipc_hdr->data_size, sizeof(*jsm_msg)); if (rx_msg->jsm_msg->result != VPU_JSM_STATUS_SUCCESS) { - ivpu_dbg(vdev, IPC, "IPC resp result error: %d\n", rx_msg->jsm_msg->result); + ivpu_err(vdev, "IPC resp result error: %d\n", rx_msg->jsm_msg->result); ret = -EBADMSG; } if (jsm_msg) memcpy(jsm_msg, rx_msg->jsm_msg, size); + trace_jsm("[rx]", rx_msg->jsm_msg); } ivpu_ipc_rx_msg_del(vdev, rx_msg); @@ -292,15 +294,16 @@ int ivpu_ipc_receive(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons, return ret; } -static int +int ivpu_ipc_send_receive_internal(struct ivpu_device *vdev, struct vpu_jsm_msg *req, enum vpu_ipc_msg_type expected_resp_type, - struct vpu_jsm_msg *resp, u32 channel, - unsigned long timeout_ms) + struct vpu_jsm_msg *resp, u32 channel, unsigned long timeout_ms) { struct ivpu_ipc_consumer cons; int ret; + drm_WARN_ON(&vdev->drm, pm_runtime_status_suspended(vdev->drm.dev)); + ivpu_ipc_consumer_add(vdev, &cons, channel, NULL); ret = ivpu_ipc_send(vdev, &cons, req); @@ -326,19 +329,21 @@ consumer_del: return ret; } -int ivpu_ipc_send_receive_active(struct ivpu_device *vdev, struct vpu_jsm_msg *req, - enum vpu_ipc_msg_type expected_resp, struct vpu_jsm_msg *resp, - u32 channel, unsigned long timeout_ms) +int ivpu_ipc_send_receive(struct ivpu_device *vdev, struct vpu_jsm_msg *req, + enum vpu_ipc_msg_type expected_resp, struct vpu_jsm_msg *resp, + u32 channel, unsigned long timeout_ms) { struct vpu_jsm_msg hb_req = { .type = VPU_JSM_MSG_QUERY_ENGINE_HB }; struct vpu_jsm_msg hb_resp; int ret, hb_ret; - drm_WARN_ON(&vdev->drm, pm_runtime_status_suspended(vdev->drm.dev)); + ret = ivpu_rpm_get(vdev); + if (ret < 0) + return ret; ret = ivpu_ipc_send_receive_internal(vdev, req, expected_resp, resp, channel, timeout_ms); if (ret != -ETIMEDOUT) - return ret; + goto rpm_put; hb_ret = ivpu_ipc_send_receive_internal(vdev, &hb_req, VPU_JSM_MSG_QUERY_ENGINE_HB_DONE, &hb_resp, VPU_IPC_CHAN_ASYNC_CMD, @@ -346,21 +351,33 @@ int ivpu_ipc_send_receive_active(struct ivpu_device *vdev, struct vpu_jsm_msg *r if (hb_ret 
== -ETIMEDOUT) ivpu_pm_trigger_recovery(vdev, "IPC timeout"); +rpm_put: + ivpu_rpm_put(vdev); return ret; } -int ivpu_ipc_send_receive(struct ivpu_device *vdev, struct vpu_jsm_msg *req, - enum vpu_ipc_msg_type expected_resp, struct vpu_jsm_msg *resp, - u32 channel, unsigned long timeout_ms) +int ivpu_ipc_send_and_wait(struct ivpu_device *vdev, struct vpu_jsm_msg *req, + u32 channel, unsigned long timeout_ms) { + struct ivpu_ipc_consumer cons; int ret; ret = ivpu_rpm_get(vdev); if (ret < 0) return ret; - ret = ivpu_ipc_send_receive_active(vdev, req, expected_resp, resp, channel, timeout_ms); + ivpu_ipc_consumer_add(vdev, &cons, channel, NULL); + + ret = ivpu_ipc_send(vdev, &cons, req); + if (ret) { + ivpu_warn_ratelimited(vdev, "IPC send failed: %d\n", ret); + goto consumer_del; + } + + msleep(timeout_ms); +consumer_del: + ivpu_ipc_consumer_del(vdev, &cons); ivpu_rpm_put(vdev); return ret; } @@ -378,7 +395,7 @@ ivpu_ipc_match_consumer(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons return false; } -void ivpu_ipc_irq_handler(struct ivpu_device *vdev, bool *wake_thread) +void ivpu_ipc_irq_handler(struct ivpu_device *vdev) { struct ivpu_ipc_info *ipc = vdev->ipc; struct ivpu_ipc_consumer *cons; @@ -392,8 +409,8 @@ void ivpu_ipc_irq_handler(struct ivpu_device *vdev, bool *wake_thread) * Driver needs to purge all messages from IPC FIFO to clear IPC interrupt. * Without purge IPC FIFO to 0 next IPC interrupts won't be generated. */ - while (ivpu_hw_reg_ipc_rx_count_get(vdev)) { - vpu_addr = ivpu_hw_reg_ipc_rx_addr_get(vdev); + while (ivpu_hw_ipc_rx_count_get(vdev)) { + vpu_addr = ivpu_hw_ipc_rx_addr_get(vdev); if (vpu_addr == REG_IO_ERROR) { ivpu_err_ratelimited(vdev, "Failed to read IPC rx addr register\n"); return; @@ -442,11 +459,12 @@ void ivpu_ipc_irq_handler(struct ivpu_device *vdev, bool *wake_thread) } } - if (wake_thread) - *wake_thread = !list_empty(&ipc->cb_msg_list); + if (!list_empty(&ipc->cb_msg_list)) + if (!kfifo_put(&vdev->hw->irq.fifo, IVPU_HW_IRQ_SRC_IPC)) + ivpu_err_ratelimited(vdev, "IRQ FIFO full\n"); } -irqreturn_t ivpu_ipc_irq_thread_handler(struct ivpu_device *vdev) +void ivpu_ipc_irq_thread_handler(struct ivpu_device *vdev) { struct ivpu_ipc_info *ipc = vdev->ipc; struct ivpu_ipc_rx_msg *rx_msg, *r; @@ -462,8 +480,6 @@ irqreturn_t ivpu_ipc_irq_thread_handler(struct ivpu_device *vdev) rx_msg->callback(vdev, rx_msg->ipc_hdr, rx_msg->jsm_msg); ivpu_ipc_rx_msg_del(vdev, rx_msg); } - - return IRQ_HANDLED; } int ivpu_ipc_init(struct ivpu_device *vdev) @@ -471,13 +487,13 @@ int ivpu_ipc_init(struct ivpu_device *vdev) struct ivpu_ipc_info *ipc = vdev->ipc; int ret; - ipc->mem_tx = ivpu_bo_alloc_internal(vdev, 0, SZ_16K, DRM_IVPU_BO_WC); + ipc->mem_tx = ivpu_bo_create_global(vdev, SZ_16K, DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); if (!ipc->mem_tx) { ivpu_err(vdev, "Failed to allocate mem_tx\n"); return -ENOMEM; } - ipc->mem_rx = ivpu_bo_alloc_internal(vdev, 0, SZ_16K, DRM_IVPU_BO_WC); + ipc->mem_rx = ivpu_bo_create_global(vdev, SZ_16K, DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); if (!ipc->mem_rx) { ivpu_err(vdev, "Failed to allocate mem_rx\n"); ret = -ENOMEM; @@ -501,14 +517,18 @@ int ivpu_ipc_init(struct ivpu_device *vdev) spin_lock_init(&ipc->cons_lock); INIT_LIST_HEAD(&ipc->cons_list); INIT_LIST_HEAD(&ipc->cb_msg_list); - drmm_mutex_init(&vdev->drm, &ipc->lock); + ret = drmm_mutex_init(&vdev->drm, &ipc->lock); + if (ret) { + ivpu_err(vdev, "Failed to initialize ipc->lock, ret %d\n", ret); + goto err_free_rx; + } ivpu_ipc_reset(vdev); return 0; err_free_rx: - 
ivpu_bo_free_internal(ipc->mem_rx); + ivpu_bo_free(ipc->mem_rx); err_free_tx: - ivpu_bo_free_internal(ipc->mem_tx); + ivpu_bo_free(ipc->mem_tx); return ret; } @@ -516,7 +536,6 @@ void ivpu_ipc_fini(struct ivpu_device *vdev) { struct ivpu_ipc_info *ipc = vdev->ipc; - drm_WARN_ON(&vdev->drm, ipc->on); drm_WARN_ON(&vdev->drm, !list_empty(&ipc->cons_list)); drm_WARN_ON(&vdev->drm, !list_empty(&ipc->cb_msg_list)); drm_WARN_ON(&vdev->drm, atomic_read(&ipc->rx_msg_count) > 0); diff --git a/drivers/accel/ivpu/ivpu_ipc.h b/drivers/accel/ivpu/ivpu_ipc.h index 40ca3cc4e61f..b4dfb504679b 100644 --- a/drivers/accel/ivpu/ivpu_ipc.h +++ b/drivers/accel/ivpu/ivpu_ipc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_IPC_H__ @@ -89,22 +89,25 @@ void ivpu_ipc_enable(struct ivpu_device *vdev); void ivpu_ipc_disable(struct ivpu_device *vdev); void ivpu_ipc_reset(struct ivpu_device *vdev); -void ivpu_ipc_irq_handler(struct ivpu_device *vdev, bool *wake_thread); -irqreturn_t ivpu_ipc_irq_thread_handler(struct ivpu_device *vdev); +void ivpu_ipc_irq_handler(struct ivpu_device *vdev); +void ivpu_ipc_irq_thread_handler(struct ivpu_device *vdev); void ivpu_ipc_consumer_add(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons, u32 channel, ivpu_ipc_rx_callback_t callback); void ivpu_ipc_consumer_del(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons); +int ivpu_ipc_send(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons, + struct vpu_jsm_msg *req); int ivpu_ipc_receive(struct ivpu_device *vdev, struct ivpu_ipc_consumer *cons, struct ivpu_ipc_hdr *ipc_buf, struct vpu_jsm_msg *jsm_msg, unsigned long timeout_ms); - -int ivpu_ipc_send_receive_active(struct ivpu_device *vdev, struct vpu_jsm_msg *req, - enum vpu_ipc_msg_type expected_resp, struct vpu_jsm_msg *resp, - u32 channel, unsigned long timeout_ms); +int ivpu_ipc_send_receive_internal(struct ivpu_device *vdev, struct vpu_jsm_msg *req, + enum vpu_ipc_msg_type expected_resp_type, + struct vpu_jsm_msg *resp, u32 channel, unsigned long timeout_ms); int ivpu_ipc_send_receive(struct ivpu_device *vdev, struct vpu_jsm_msg *req, enum vpu_ipc_msg_type expected_resp, struct vpu_jsm_msg *resp, u32 channel, unsigned long timeout_ms); +int ivpu_ipc_send_and_wait(struct ivpu_device *vdev, struct vpu_jsm_msg *req, + u32 channel, unsigned long timeout_ms); #endif /* __IVPU_IPC_H__ */ diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c index e70cfb859339..7149312f16e1 100644 --- a/drivers/accel/ivpu/ivpu_job.c +++ b/drivers/accel/ivpu/ivpu_job.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include <drm/drm_file.h> @@ -12,50 +12,106 @@ #include <uapi/drm/ivpu_accel.h> #include "ivpu_drv.h" +#include "ivpu_fw.h" #include "ivpu_hw.h" #include "ivpu_ipc.h" #include "ivpu_job.h" #include "ivpu_jsm_msg.h" #include "ivpu_pm.h" +#include "ivpu_trace.h" +#include "vpu_boot_api.h" #define CMD_BUF_IDX 0 -#define JOB_ID_JOB_MASK GENMASK(7, 0) -#define JOB_ID_CONTEXT_MASK GENMASK(31, 8) #define JOB_MAX_BUFFER_COUNT 65535 static void ivpu_cmdq_ring_db(struct ivpu_device *vdev, struct ivpu_cmdq *cmdq) { - ivpu_hw_reg_db_set(vdev, cmdq->db_id); + ivpu_hw_db_set(vdev, cmdq->db_id); } -static struct ivpu_cmdq *ivpu_cmdq_alloc(struct ivpu_file_priv *file_priv, u16 engine) +static int ivpu_preemption_buffers_create(struct ivpu_device 
*vdev, + struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq) +{ + u64 primary_size = ALIGN(vdev->fw->primary_preempt_buf_size, PAGE_SIZE); + u64 secondary_size = ALIGN(vdev->fw->secondary_preempt_buf_size, PAGE_SIZE); + + if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW || + ivpu_test_mode & IVPU_TEST_MODE_MIP_DISABLE) + return 0; + + cmdq->primary_preempt_buf = ivpu_bo_create(vdev, &file_priv->ctx, &vdev->hw->ranges.user, + primary_size, DRM_IVPU_BO_WC); + if (!cmdq->primary_preempt_buf) { + ivpu_err(vdev, "Failed to create primary preemption buffer\n"); + return -ENOMEM; + } + + cmdq->secondary_preempt_buf = ivpu_bo_create(vdev, &file_priv->ctx, &vdev->hw->ranges.dma, + secondary_size, DRM_IVPU_BO_WC); + if (!cmdq->secondary_preempt_buf) { + ivpu_err(vdev, "Failed to create secondary preemption buffer\n"); + goto err_free_primary; + } + + return 0; + +err_free_primary: + ivpu_bo_free(cmdq->primary_preempt_buf); + cmdq->primary_preempt_buf = NULL; + return -ENOMEM; +} + +static void ivpu_preemption_buffers_free(struct ivpu_device *vdev, + struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq) +{ + if (vdev->fw->sched_mode != VPU_SCHEDULING_MODE_HW) + return; + + if (cmdq->primary_preempt_buf) + ivpu_bo_free(cmdq->primary_preempt_buf); + if (cmdq->secondary_preempt_buf) + ivpu_bo_free(cmdq->secondary_preempt_buf); +} + +static struct ivpu_cmdq *ivpu_cmdq_alloc(struct ivpu_file_priv *file_priv) { struct ivpu_device *vdev = file_priv->vdev; - struct vpu_job_queue_header *jobq_header; struct ivpu_cmdq *cmdq; + int ret; cmdq = kzalloc(sizeof(*cmdq), GFP_KERNEL); if (!cmdq) return NULL; - cmdq->mem = ivpu_bo_alloc_internal(vdev, 0, SZ_4K, DRM_IVPU_BO_WC); - if (!cmdq->mem) - goto cmdq_free; + ret = xa_alloc_cyclic(&vdev->db_xa, &cmdq->db_id, NULL, vdev->db_limit, &vdev->db_next, + GFP_KERNEL); + if (ret < 0) { + ivpu_err(vdev, "Failed to allocate doorbell id: %d\n", ret); + goto err_free_cmdq; + } - cmdq->db_id = file_priv->ctx.id + engine * ivpu_get_context_count(vdev); - cmdq->entry_count = (u32)((ivpu_bo_size(cmdq->mem) - sizeof(struct vpu_job_queue_header)) / - sizeof(struct vpu_job_queue_entry)); + ret = xa_alloc_cyclic(&file_priv->cmdq_xa, &cmdq->id, cmdq, file_priv->cmdq_limit, + &file_priv->cmdq_id_next, GFP_KERNEL); + if (ret < 0) { + ivpu_err(vdev, "Failed to allocate command queue id: %d\n", ret); + goto err_erase_db_xa; + } - cmdq->jobq = (struct vpu_job_queue *)ivpu_bo_vaddr(cmdq->mem); - jobq_header = &cmdq->jobq->header; - jobq_header->engine_idx = engine; - jobq_header->head = 0; - jobq_header->tail = 0; - wmb(); /* Flush WC buffer for jobq->header */ + cmdq->mem = ivpu_bo_create_global(vdev, SZ_4K, DRM_IVPU_BO_WC | DRM_IVPU_BO_MAPPABLE); + if (!cmdq->mem) + goto err_erase_cmdq_xa; + + ret = ivpu_preemption_buffers_create(vdev, file_priv, cmdq); + if (ret) + ivpu_warn(vdev, "Failed to allocate preemption buffers, preemption limited\n"); return cmdq; -cmdq_free: +err_erase_cmdq_xa: + xa_erase(&file_priv->cmdq_xa, cmdq->id); +err_erase_db_xa: + xa_erase(&vdev->db_xa, cmdq->db_id); +err_free_cmdq: kfree(cmdq); return NULL; } @@ -65,91 +121,173 @@ static void ivpu_cmdq_free(struct ivpu_file_priv *file_priv, struct ivpu_cmdq *c if (!cmdq) return; - ivpu_bo_free_internal(cmdq->mem); + ivpu_preemption_buffers_free(file_priv->vdev, file_priv, cmdq); + ivpu_bo_free(cmdq->mem); + xa_erase(&file_priv->vdev->db_xa, cmdq->db_id); kfree(cmdq); } -static struct ivpu_cmdq *ivpu_cmdq_acquire(struct ivpu_file_priv *file_priv, u16 engine) +static int ivpu_hws_cmdq_init(struct 
ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq, u16 engine, + u8 priority) +{ + struct ivpu_device *vdev = file_priv->vdev; + int ret; + + ret = ivpu_jsm_hws_create_cmdq(vdev, file_priv->ctx.id, file_priv->ctx.id, cmdq->id, + task_pid_nr(current), engine, + cmdq->mem->vpu_addr, ivpu_bo_size(cmdq->mem)); + if (ret) + return ret; + + ret = ivpu_jsm_hws_set_context_sched_properties(vdev, file_priv->ctx.id, cmdq->id, + priority); + if (ret) + return ret; + + return 0; +} + +static int ivpu_register_db(struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq) { struct ivpu_device *vdev = file_priv->vdev; - struct ivpu_cmdq *cmdq = file_priv->cmdq[engine]; + int ret; + + if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) + ret = ivpu_jsm_hws_register_db(vdev, file_priv->ctx.id, cmdq->id, cmdq->db_id, + cmdq->mem->vpu_addr, ivpu_bo_size(cmdq->mem)); + else + ret = ivpu_jsm_register_db(vdev, file_priv->ctx.id, cmdq->db_id, + cmdq->mem->vpu_addr, ivpu_bo_size(cmdq->mem)); + + if (!ret) + ivpu_dbg(vdev, JOB, "DB %d registered to cmdq %d ctx %d\n", + cmdq->db_id, cmdq->id, file_priv->ctx.id); + + return ret; +} + +static int +ivpu_cmdq_init(struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq, u8 priority) +{ + struct ivpu_device *vdev = file_priv->vdev; + struct vpu_job_queue_header *jobq_header; int ret; lockdep_assert_held(&file_priv->lock); - if (!cmdq) { - cmdq = ivpu_cmdq_alloc(file_priv, engine); - if (!cmdq) - return NULL; - file_priv->cmdq[engine] = cmdq; + if (cmdq->db_registered) + return 0; + + cmdq->entry_count = (u32)((ivpu_bo_size(cmdq->mem) - sizeof(struct vpu_job_queue_header)) / + sizeof(struct vpu_job_queue_entry)); + + cmdq->jobq = (struct vpu_job_queue *)ivpu_bo_vaddr(cmdq->mem); + jobq_header = &cmdq->jobq->header; + jobq_header->engine_idx = VPU_ENGINE_COMPUTE; + jobq_header->head = 0; + jobq_header->tail = 0; + if (ivpu_test_mode & IVPU_TEST_MODE_TURBO) { + ivpu_dbg(vdev, JOB, "Turbo mode enabled"); + jobq_header->flags = VPU_JOB_QUEUE_FLAGS_TURBO_MODE; } - if (cmdq->db_registered) - return cmdq; + wmb(); /* Flush WC buffer for jobq->header */ - ret = ivpu_jsm_register_db(vdev, file_priv->ctx.id, cmdq->db_id, - cmdq->mem->vpu_addr, ivpu_bo_size(cmdq->mem)); + if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) { + ret = ivpu_hws_cmdq_init(file_priv, cmdq, VPU_ENGINE_COMPUTE, priority); + if (ret) + return ret; + } + + ret = ivpu_register_db(file_priv, cmdq); if (ret) - return NULL; + return ret; cmdq->db_registered = true; - return cmdq; + return 0; } -static void ivpu_cmdq_release_locked(struct ivpu_file_priv *file_priv, u16 engine) +static int ivpu_cmdq_fini(struct ivpu_file_priv *file_priv, struct ivpu_cmdq *cmdq) { - struct ivpu_cmdq *cmdq = file_priv->cmdq[engine]; + struct ivpu_device *vdev = file_priv->vdev; + int ret; lockdep_assert_held(&file_priv->lock); - if (cmdq) { - file_priv->cmdq[engine] = NULL; - if (cmdq->db_registered) - ivpu_jsm_unregister_db(file_priv->vdev, cmdq->db_id); + if (!cmdq->db_registered) + return 0; - ivpu_cmdq_free(file_priv, cmdq); + cmdq->db_registered = false; + + if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) { + ret = ivpu_jsm_hws_destroy_cmdq(vdev, file_priv->ctx.id, cmdq->id); + if (!ret) + ivpu_dbg(vdev, JOB, "Command queue %d destroyed\n", cmdq->id); } + + ret = ivpu_jsm_unregister_db(vdev, cmdq->db_id); + if (!ret) + ivpu_dbg(vdev, JOB, "DB %d unregistered\n", cmdq->db_id); + + return 0; } -void ivpu_cmdq_release_all_locked(struct ivpu_file_priv *file_priv) +static struct ivpu_cmdq *ivpu_cmdq_acquire(struct 
ivpu_file_priv *file_priv, u8 priority) { - int i; + struct ivpu_cmdq *cmdq; + unsigned long cmdq_id; + int ret; lockdep_assert_held(&file_priv->lock); - for (i = 0; i < IVPU_NUM_ENGINES; i++) - ivpu_cmdq_release_locked(file_priv, i); + xa_for_each(&file_priv->cmdq_xa, cmdq_id, cmdq) + if (cmdq->priority == priority) + break; + + if (!cmdq) { + cmdq = ivpu_cmdq_alloc(file_priv); + if (!cmdq) + return NULL; + cmdq->priority = priority; + } + + ret = ivpu_cmdq_init(file_priv, cmdq, priority); + if (ret) + return NULL; + + return cmdq; } -/* - * Mark the doorbell as unregistered and reset job queue pointers. - * This function needs to be called when the VPU hardware is restarted - * and FW loses job queue state. The next time job queue is used it - * will be registered again. - */ -static void ivpu_cmdq_reset_locked(struct ivpu_file_priv *file_priv, u16 engine) +void ivpu_cmdq_release_all_locked(struct ivpu_file_priv *file_priv) { - struct ivpu_cmdq *cmdq = file_priv->cmdq[engine]; + struct ivpu_cmdq *cmdq; + unsigned long cmdq_id; lockdep_assert_held(&file_priv->lock); - if (cmdq) { - cmdq->db_registered = false; - cmdq->jobq->header.head = 0; - cmdq->jobq->header.tail = 0; - wmb(); /* Flush WC buffer for jobq header */ + xa_for_each(&file_priv->cmdq_xa, cmdq_id, cmdq) { + xa_erase(&file_priv->cmdq_xa, cmdq_id); + ivpu_cmdq_fini(file_priv, cmdq); + ivpu_cmdq_free(file_priv, cmdq); } } -static void ivpu_cmdq_reset_all(struct ivpu_file_priv *file_priv) +/* + * Mark the doorbell as unregistered + * This function needs to be called when the VPU hardware is restarted + * and FW loses job queue state. The next time job queue is used it + * will be registered again. + */ +static void ivpu_cmdq_reset(struct ivpu_file_priv *file_priv) { - int i; + struct ivpu_cmdq *cmdq; + unsigned long cmdq_id; mutex_lock(&file_priv->lock); - for (i = 0; i < IVPU_NUM_ENGINES; i++) - ivpu_cmdq_reset_locked(file_priv, i); + xa_for_each(&file_priv->cmdq_xa, cmdq_id, cmdq) + cmdq->db_registered = false; mutex_unlock(&file_priv->lock); } @@ -162,10 +300,30 @@ void ivpu_cmdq_reset_all_contexts(struct ivpu_device *vdev) mutex_lock(&vdev->context_list_lock); xa_for_each(&vdev->context_xa, ctx_id, file_priv) - ivpu_cmdq_reset_all(file_priv); + ivpu_cmdq_reset(file_priv); mutex_unlock(&vdev->context_list_lock); +} + +static void ivpu_cmdq_fini_all(struct ivpu_file_priv *file_priv) +{ + struct ivpu_cmdq *cmdq; + unsigned long cmdq_id; + xa_for_each(&file_priv->cmdq_xa, cmdq_id, cmdq) + ivpu_cmdq_fini(file_priv, cmdq); +} + +void ivpu_context_abort_locked(struct ivpu_file_priv *file_priv) +{ + struct ivpu_device *vdev = file_priv->vdev; + + lockdep_assert_held(&file_priv->lock); + + ivpu_cmdq_fini_all(file_priv); + + if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_OS) + ivpu_jsm_context_release(vdev, file_priv->ctx.id); } static int ivpu_cmdq_push_job(struct ivpu_cmdq *cmdq, struct ivpu_job *job) @@ -178,17 +336,31 @@ static int ivpu_cmdq_push_job(struct ivpu_cmdq *cmdq, struct ivpu_job *job) /* Check if there is space left in job queue */ if (next_entry == header->head) { - ivpu_dbg(vdev, JOB, "Job queue full: ctx %d engine %d db %d head %d tail %d\n", - job->file_priv->ctx.id, job->engine_idx, cmdq->db_id, header->head, tail); + ivpu_dbg(vdev, JOB, "Job queue full: ctx %d cmdq %d db %d head %d tail %d\n", + job->file_priv->ctx.id, cmdq->id, cmdq->db_id, header->head, tail); return -EBUSY; } - entry = &cmdq->jobq->job[tail]; + entry = &cmdq->jobq->slot[tail].job; entry->batch_buf_addr = job->cmd_buf_vpu_addr; entry->job_id = 
job->job_id; entry->flags = 0; if (unlikely(ivpu_test_mode & IVPU_TEST_MODE_NULL_SUBMISSION)) entry->flags = VPU_JOB_FLAGS_NULL_SUBMISSION_MASK; + + if (vdev->fw->sched_mode == VPU_SCHEDULING_MODE_HW) { + if (cmdq->primary_preempt_buf) { + entry->primary_preempt_buf_addr = cmdq->primary_preempt_buf->vpu_addr; + entry->primary_preempt_buf_size = ivpu_bo_size(cmdq->primary_preempt_buf); + } + + if (cmdq->secondary_preempt_buf) { + entry->secondary_preempt_buf_addr = cmdq->secondary_preempt_buf->vpu_addr; + entry->secondary_preempt_buf_size = + ivpu_bo_size(cmdq->secondary_preempt_buf); + } + } + wmb(); /* Ensure that tail is updated after filling entry */ header->tail = next_entry; wmb(); /* Flush WC buffer for jobq header */ @@ -277,6 +449,7 @@ ivpu_job_create(struct ivpu_file_priv *file_priv, u32 engine_idx, u32 bo_count) job->file_priv = ivpu_file_priv_get(file_priv); + trace_job("create", job); ivpu_dbg(vdev, JOB, "Job created: ctx %2d engine %d", file_priv->ctx.id, job->engine_idx); return job; @@ -285,11 +458,28 @@ err_free_job: return NULL; } +static struct ivpu_job *ivpu_job_remove_from_submitted_jobs(struct ivpu_device *vdev, u32 job_id) +{ + struct ivpu_job *job; + + xa_lock(&vdev->submitted_jobs_xa); + job = __xa_erase(&vdev->submitted_jobs_xa, job_id); + + if (xa_empty(&vdev->submitted_jobs_xa) && job) { + vdev->busy_time = ktime_add(ktime_sub(ktime_get(), vdev->busy_start_ts), + vdev->busy_time); + } + + xa_unlock(&vdev->submitted_jobs_xa); + + return job; +} + static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 job_status) { struct ivpu_job *job; - job = xa_erase(&vdev->submitted_jobs_xa, job_id); + job = ivpu_job_remove_from_submitted_jobs(vdev, job_id); if (!job) return -ENOENT; @@ -299,6 +489,7 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 job->bos[CMD_BUF_IDX]->job_status = job_status; dma_fence_signal(job->done_fence); + trace_job("done", job); ivpu_dbg(vdev, JOB, "Job complete: id %3u ctx %2d engine %d status 0x%x\n", job->job_id, job->file_priv->ctx.id, job->engine_idx, job_status); @@ -318,12 +509,12 @@ void ivpu_jobs_abort_all(struct ivpu_device *vdev) ivpu_job_signal_and_destroy(vdev, id, DRM_IVPU_JOB_STATUS_ABORTED); } -static int ivpu_job_submit(struct ivpu_job *job) +static int ivpu_job_submit(struct ivpu_job *job, u8 priority) { struct ivpu_file_priv *file_priv = job->file_priv; struct ivpu_device *vdev = job->vdev; - struct xa_limit job_id_range; struct ivpu_cmdq *cmdq; + bool is_first_job; int ret; ret = ivpu_rpm_get(vdev); @@ -332,20 +523,19 @@ static int ivpu_job_submit(struct ivpu_job *job) mutex_lock(&file_priv->lock); - cmdq = ivpu_cmdq_acquire(job->file_priv, job->engine_idx); + cmdq = ivpu_cmdq_acquire(file_priv, priority); if (!cmdq) { - ivpu_warn_ratelimited(vdev, "Failed get job queue, ctx %d engine %d\n", - file_priv->ctx.id, job->engine_idx); + ivpu_warn_ratelimited(vdev, "Failed to get job queue, ctx %d engine %d prio %d\n", + file_priv->ctx.id, job->engine_idx, priority); ret = -EINVAL; goto err_unlock_file_priv; } - job_id_range.min = FIELD_PREP(JOB_ID_CONTEXT_MASK, (file_priv->ctx.id - 1)); - job_id_range.max = job_id_range.min | JOB_ID_JOB_MASK; - xa_lock(&vdev->submitted_jobs_xa); - ret = __xa_alloc(&vdev->submitted_jobs_xa, &job->job_id, job, job_id_range, GFP_KERNEL); - if (ret) { + is_first_job = xa_empty(&vdev->submitted_jobs_xa); + ret = __xa_alloc_cyclic(&vdev->submitted_jobs_xa, &job->job_id, job, file_priv->job_limit, + &file_priv->job_id_next, GFP_KERNEL); + if (ret < 
0) { ivpu_dbg(vdev, JOB, "Too many active jobs in ctx %d\n", file_priv->ctx.id); ret = -EBUSY; @@ -363,10 +553,13 @@ static int ivpu_job_submit(struct ivpu_job *job) wmb(); /* Flush WC buffer for jobq header */ } else { ivpu_cmdq_ring_db(vdev, cmdq); + if (is_first_job) + vdev->busy_start_ts = ktime_get(); } - ivpu_dbg(vdev, JOB, "Job submitted: id %3u ctx %2d engine %d addr 0x%llx next %d\n", - job->job_id, file_priv->ctx.id, job->engine_idx, + trace_job("submit", job); + ivpu_dbg(vdev, JOB, "Job submitted: id %3u ctx %2d engine %d prio %d addr 0x%llx next %d\n", + job->job_id, file_priv->ctx.id, job->engine_idx, priority, job->cmd_buf_vpu_addr, cmdq->jobq->header.tail); xa_unlock(&vdev->submitted_jobs_xa); @@ -454,6 +647,14 @@ unlock_reservations: return ret; } +static inline u8 ivpu_job_to_hws_priority(struct ivpu_file_priv *file_priv, u8 priority) +{ + if (priority == DRM_IVPU_JOB_PRIORITY_DEFAULT) + return DRM_IVPU_JOB_PRIORITY_NORMAL; + + return priority - 1; +} + int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct ivpu_file_priv *file_priv = file->driver_priv; @@ -462,8 +663,9 @@ int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file) struct ivpu_job *job; u32 *buf_handles; int idx, ret; + u8 priority; - if (params->engine > DRM_IVPU_ENGINE_COPY) + if (params->engine != DRM_IVPU_ENGINE_COMPUTE) return -EINVAL; if (params->priority > DRM_IVPU_JOB_PRIORITY_REALTIME) @@ -515,8 +717,10 @@ int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file) goto err_destroy_job; } + priority = ivpu_job_to_hws_priority(file_priv, params->priority); + down_read(&vdev->pm->reset_lock); - ret = ivpu_job_submit(job); + ret = ivpu_job_submit(job, priority); up_read(&vdev->pm->reset_lock); if (ret) goto err_signal_fence; diff --git a/drivers/accel/ivpu/ivpu_job.h b/drivers/accel/ivpu/ivpu_job.h index ca4984071cc7..8b19e3f8b4cf 100644 --- a/drivers/accel/ivpu/ivpu_job.h +++ b/drivers/accel/ivpu/ivpu_job.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_JOB_H__ @@ -24,10 +24,14 @@ struct ivpu_file_priv; */ struct ivpu_cmdq { struct vpu_job_queue *jobq; + struct ivpu_bo *primary_preempt_buf; + struct ivpu_bo *secondary_preempt_buf; struct ivpu_bo *mem; u32 entry_count; + u32 id; u32 db_id; bool db_registered; + u8 priority; }; /** @@ -55,6 +59,8 @@ struct ivpu_job { int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file); +void ivpu_context_abort_locked(struct ivpu_file_priv *file_priv); + void ivpu_cmdq_release_all_locked(struct ivpu_file_priv *file_priv); void ivpu_cmdq_reset_all_contexts(struct ivpu_device *vdev); diff --git a/drivers/accel/ivpu/ivpu_jsm_msg.c b/drivers/accel/ivpu/ivpu_jsm_msg.c index 8cea0dd731b9..30a40be76930 100644 --- a/drivers/accel/ivpu/ivpu_jsm_msg.c +++ b/drivers/accel/ivpu/ivpu_jsm_msg.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include "ivpu_drv.h" @@ -48,9 +48,10 @@ const char *ivpu_jsm_msg_type_to_str(enum vpu_ipc_msg_type type) IVPU_CASE_TO_STR(VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE); IVPU_CASE_TO_STR(VPU_JSM_MSG_STATE_DUMP); IVPU_CASE_TO_STR(VPU_JSM_MSG_STATE_DUMP_RSP); - IVPU_CASE_TO_STR(VPU_JSM_MSG_BLOB_DEINIT); + IVPU_CASE_TO_STR(VPU_JSM_MSG_BLOB_DEINIT_DEPRECATED); IVPU_CASE_TO_STR(VPU_JSM_MSG_DYNDBG_CONTROL); 
IVPU_CASE_TO_STR(VPU_JSM_MSG_JOB_DONE); + IVPU_CASE_TO_STR(VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED); IVPU_CASE_TO_STR(VPU_JSM_MSG_ENGINE_RESET_DONE); IVPU_CASE_TO_STR(VPU_JSM_MSG_ENGINE_PREEMPT_DONE); IVPU_CASE_TO_STR(VPU_JSM_MSG_REGISTER_DB_DONE); @@ -103,14 +104,10 @@ int ivpu_jsm_register_db(struct ivpu_device *vdev, u32 ctx_id, u32 db_id, ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_REGISTER_DB_DONE, &resp, VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); - if (ret) { - ivpu_err_ratelimited(vdev, "Failed to register doorbell %d: %d\n", db_id, ret); - return ret; - } - - ivpu_dbg(vdev, JSM, "Doorbell %d registered to context %d\n", db_id, ctx_id); + if (ret) + ivpu_err_ratelimited(vdev, "Failed to register doorbell %u: %d\n", db_id, ret); - return 0; + return ret; } int ivpu_jsm_unregister_db(struct ivpu_device *vdev, u32 db_id) @@ -123,14 +120,10 @@ int ivpu_jsm_unregister_db(struct ivpu_device *vdev, u32 db_id) ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_UNREGISTER_DB_DONE, &resp, VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); - if (ret) { - ivpu_warn_ratelimited(vdev, "Failed to unregister doorbell %d: %d\n", db_id, ret); - return ret; - } - - ivpu_dbg(vdev, JSM, "Doorbell %d unregistered\n", db_id); + if (ret) + ivpu_warn_ratelimited(vdev, "Failed to unregister doorbell %u: %d\n", db_id, ret); - return 0; + return ret; } int ivpu_jsm_get_heartbeat(struct ivpu_device *vdev, u32 engine, u64 *heartbeat) @@ -139,7 +132,7 @@ int ivpu_jsm_get_heartbeat(struct ivpu_device *vdev, u32 engine, u64 *heartbeat) struct vpu_jsm_msg resp; int ret; - if (engine > VPU_ENGINE_COPY) + if (engine != VPU_ENGINE_COMPUTE) return -EINVAL; req.payload.query_engine_hb.engine_idx = engine; @@ -162,7 +155,7 @@ int ivpu_jsm_reset_engine(struct ivpu_device *vdev, u32 engine) struct vpu_jsm_msg resp; int ret; - if (engine > VPU_ENGINE_COPY) + if (engine != VPU_ENGINE_COMPUTE) return -EINVAL; req.payload.engine_reset.engine_idx = engine; @@ -181,7 +174,7 @@ int ivpu_jsm_preempt_engine(struct ivpu_device *vdev, u32 engine, u32 preempt_id struct vpu_jsm_msg resp; int ret; - if (engine > VPU_ENGINE_COPY) + if (engine != VPU_ENGINE_COMPUTE) return -EINVAL; req.payload.engine_preempt.engine_idx = engine; @@ -204,7 +197,7 @@ int ivpu_jsm_dyndbg_control(struct ivpu_device *vdev, char *command, size_t size strscpy(req.payload.dyndbg_control.dyndbg_cmd, command, VPU_DYNDBG_CMD_MAX_LEN); ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_DYNDBG_CONTROL_RSP, &resp, - VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + VPU_IPC_CHAN_GEN_CMD, vdev->timeout.jsm); if (ret) ivpu_warn_ratelimited(vdev, "Failed to send command \"%s\": ret %d\n", command, ret); @@ -255,11 +248,16 @@ int ivpu_jsm_context_release(struct ivpu_device *vdev, u32 host_ssid) { struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_SSID_RELEASE }; struct vpu_jsm_msg resp; + int ret; req.payload.ssid_release.host_ssid = host_ssid; - return ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_SSID_RELEASE_DONE, &resp, - VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_SSID_RELEASE_DONE, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) + ivpu_warn_ratelimited(vdev, "Failed to release context: %d\n", ret); + + return ret; } int ivpu_jsm_pwr_d0i3_enter(struct ivpu_device *vdev) @@ -273,11 +271,294 @@ int ivpu_jsm_pwr_d0i3_enter(struct ivpu_device *vdev) req.payload.pwr_d0i3_enter.send_response = 1; - ret = ivpu_ipc_send_receive_active(vdev, &req, VPU_JSM_MSG_PWR_D0I3_ENTER_DONE, - &resp, VPU_IPC_CHAN_GEN_CMD, - 
vdev->timeout.d0i3_entry_msg); + ret = ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_PWR_D0I3_ENTER_DONE, &resp, + VPU_IPC_CHAN_GEN_CMD, vdev->timeout.d0i3_entry_msg); if (ret) return ret; return ivpu_hw_wait_for_idle(vdev); } + +int ivpu_jsm_hws_create_cmdq(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_group, u32 cmdq_id, + u32 pid, u32 engine, u64 cmdq_base, u32 cmdq_size) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_CREATE_CMD_QUEUE }; + struct vpu_jsm_msg resp; + int ret; + + req.payload.hws_create_cmdq.host_ssid = ctx_id; + req.payload.hws_create_cmdq.process_id = pid; + req.payload.hws_create_cmdq.engine_idx = engine; + req.payload.hws_create_cmdq.cmdq_group = cmdq_group; + req.payload.hws_create_cmdq.cmdq_id = cmdq_id; + req.payload.hws_create_cmdq.cmdq_base = cmdq_base; + req.payload.hws_create_cmdq.cmdq_size = cmdq_size; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_CREATE_CMD_QUEUE_RSP, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) + ivpu_warn_ratelimited(vdev, "Failed to create command queue: %d\n", ret); + + return ret; +} + +int ivpu_jsm_hws_destroy_cmdq(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_DESTROY_CMD_QUEUE }; + struct vpu_jsm_msg resp; + int ret; + + req.payload.hws_destroy_cmdq.host_ssid = ctx_id; + req.payload.hws_destroy_cmdq.cmdq_id = cmdq_id; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_DESTROY_CMD_QUEUE_RSP, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) + ivpu_warn_ratelimited(vdev, "Failed to destroy command queue: %d\n", ret); + + return ret; +} + +int ivpu_jsm_hws_register_db(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id, u32 db_id, + u64 cmdq_base, u32 cmdq_size) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_HWS_REGISTER_DB }; + struct vpu_jsm_msg resp; + int ret = 0; + + req.payload.hws_register_db.db_id = db_id; + req.payload.hws_register_db.host_ssid = ctx_id; + req.payload.hws_register_db.cmdq_id = cmdq_id; + req.payload.hws_register_db.cmdq_base = cmdq_base; + req.payload.hws_register_db.cmdq_size = cmdq_size; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_REGISTER_DB_DONE, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) + ivpu_err_ratelimited(vdev, "Failed to register doorbell %u: %d\n", db_id, ret); + + return ret; +} + +int ivpu_jsm_hws_resume_engine(struct ivpu_device *vdev, u32 engine) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_HWS_ENGINE_RESUME }; + struct vpu_jsm_msg resp; + int ret; + + if (engine != VPU_ENGINE_COMPUTE) + return -EINVAL; + + req.payload.hws_resume_engine.engine_idx = engine; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_HWS_RESUME_ENGINE_DONE, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) + ivpu_err_ratelimited(vdev, "Failed to resume engine %d: %d\n", engine, ret); + + return ret; +} + +int ivpu_jsm_hws_set_context_sched_properties(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id, + u32 priority) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES }; + struct vpu_jsm_msg resp; + int ret; + + req.payload.hws_set_context_sched_properties.host_ssid = ctx_id; + req.payload.hws_set_context_sched_properties.cmdq_id = cmdq_id; + req.payload.hws_set_context_sched_properties.priority_band = priority; + req.payload.hws_set_context_sched_properties.realtime_priority_level = 0; + req.payload.hws_set_context_sched_properties.in_process_priority = 0; + 
req.payload.hws_set_context_sched_properties.context_quantum = 20000; + req.payload.hws_set_context_sched_properties.grace_period_same_priority = 10000; + req.payload.hws_set_context_sched_properties.grace_period_lower_priority = 0; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_SET_CONTEXT_SCHED_PROPERTIES_RSP, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) + ivpu_warn_ratelimited(vdev, "Failed to set context sched properties: %d\n", ret); + + return ret; +} + +int ivpu_jsm_hws_set_scheduling_log(struct ivpu_device *vdev, u32 engine_idx, u32 host_ssid, + u64 vpu_log_buffer_va) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_HWS_SET_SCHEDULING_LOG }; + struct vpu_jsm_msg resp; + int ret; + + req.payload.hws_set_scheduling_log.engine_idx = engine_idx; + req.payload.hws_set_scheduling_log.host_ssid = host_ssid; + req.payload.hws_set_scheduling_log.vpu_log_buffer_va = vpu_log_buffer_va; + req.payload.hws_set_scheduling_log.notify_index = 0; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_HWS_SET_SCHEDULING_LOG_RSP, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) + ivpu_warn_ratelimited(vdev, "Failed to set scheduling log: %d\n", ret); + + return ret; +} + +int ivpu_jsm_hws_setup_priority_bands(struct ivpu_device *vdev) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_SET_PRIORITY_BAND_SETUP }; + struct vpu_jsm_msg resp; + int ret; + + /* Idle */ + req.payload.hws_priority_band_setup.grace_period[0] = 0; + req.payload.hws_priority_band_setup.process_grace_period[0] = 50000; + req.payload.hws_priority_band_setup.process_quantum[0] = 160000; + /* Normal */ + req.payload.hws_priority_band_setup.grace_period[1] = 50000; + req.payload.hws_priority_band_setup.process_grace_period[1] = 50000; + req.payload.hws_priority_band_setup.process_quantum[1] = 300000; + /* Focus */ + req.payload.hws_priority_band_setup.grace_period[2] = 50000; + req.payload.hws_priority_band_setup.process_grace_period[2] = 50000; + req.payload.hws_priority_band_setup.process_quantum[2] = 200000; + /* Realtime */ + req.payload.hws_priority_band_setup.grace_period[3] = 0; + req.payload.hws_priority_band_setup.process_grace_period[3] = 50000; + req.payload.hws_priority_band_setup.process_quantum[3] = 200000; + + req.payload.hws_priority_band_setup.normal_band_percentage = 10; + + ret = ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_SET_PRIORITY_BAND_SETUP_RSP, + &resp, VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) + ivpu_warn_ratelimited(vdev, "Failed to set priority bands: %d\n", ret); + + return ret; +} + +int ivpu_jsm_metric_streamer_start(struct ivpu_device *vdev, u64 metric_group_mask, + u64 sampling_rate, u64 buffer_addr, u64 buffer_size) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_METRIC_STREAMER_START }; + struct vpu_jsm_msg resp; + int ret; + + req.payload.metric_streamer_start.metric_group_mask = metric_group_mask; + req.payload.metric_streamer_start.sampling_rate = sampling_rate; + req.payload.metric_streamer_start.buffer_addr = buffer_addr; + req.payload.metric_streamer_start.buffer_size = buffer_size; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_METRIC_STREAMER_START_DONE, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) { + ivpu_warn_ratelimited(vdev, "Failed to start metric streamer: ret %d\n", ret); + return ret; + } + + return ret; +} + +int ivpu_jsm_metric_streamer_stop(struct ivpu_device *vdev, u64 metric_group_mask) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_METRIC_STREAMER_STOP }; + struct 
vpu_jsm_msg resp; + int ret; + + req.payload.metric_streamer_stop.metric_group_mask = metric_group_mask; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_METRIC_STREAMER_STOP_DONE, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) + ivpu_warn_ratelimited(vdev, "Failed to stop metric streamer: ret %d\n", ret); + + return ret; +} + +int ivpu_jsm_metric_streamer_update(struct ivpu_device *vdev, u64 metric_group_mask, + u64 buffer_addr, u64 buffer_size, u64 *bytes_written) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_METRIC_STREAMER_UPDATE }; + struct vpu_jsm_msg resp; + int ret; + + req.payload.metric_streamer_update.metric_group_mask = metric_group_mask; + req.payload.metric_streamer_update.buffer_addr = buffer_addr; + req.payload.metric_streamer_update.buffer_size = buffer_size; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_METRIC_STREAMER_UPDATE_DONE, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) { + ivpu_warn_ratelimited(vdev, "Failed to update metric streamer: ret %d\n", ret); + return ret; + } + + if (buffer_size && resp.payload.metric_streamer_done.bytes_written > buffer_size) { + ivpu_warn_ratelimited(vdev, "MS buffer overflow: bytes_written %#llx > buffer_size %#llx\n", + resp.payload.metric_streamer_done.bytes_written, buffer_size); + return -EOVERFLOW; + } + + *bytes_written = resp.payload.metric_streamer_done.bytes_written; + + return ret; +} + +int ivpu_jsm_metric_streamer_info(struct ivpu_device *vdev, u64 metric_group_mask, u64 buffer_addr, + u64 buffer_size, u32 *sample_size, u64 *info_size) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_METRIC_STREAMER_INFO }; + struct vpu_jsm_msg resp; + int ret; + + req.payload.metric_streamer_start.metric_group_mask = metric_group_mask; + req.payload.metric_streamer_start.buffer_addr = buffer_addr; + req.payload.metric_streamer_start.buffer_size = buffer_size; + + ret = ivpu_ipc_send_receive(vdev, &req, VPU_JSM_MSG_METRIC_STREAMER_INFO_DONE, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); + if (ret) { + ivpu_warn_ratelimited(vdev, "Failed to get metric streamer info: ret %d\n", ret); + return ret; + } + + if (!resp.payload.metric_streamer_done.sample_size) { + ivpu_warn_ratelimited(vdev, "Invalid sample size\n"); + return -EBADMSG; + } + + if (sample_size) + *sample_size = resp.payload.metric_streamer_done.sample_size; + if (info_size) + *info_size = resp.payload.metric_streamer_done.bytes_written; + + return ret; +} + +int ivpu_jsm_dct_enable(struct ivpu_device *vdev, u32 active_us, u32 inactive_us) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_DCT_ENABLE }; + struct vpu_jsm_msg resp; + + req.payload.pwr_dct_control.dct_active_us = active_us; + req.payload.pwr_dct_control.dct_inactive_us = inactive_us; + + return ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_DCT_ENABLE_DONE, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); +} + +int ivpu_jsm_dct_disable(struct ivpu_device *vdev) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_DCT_DISABLE }; + struct vpu_jsm_msg resp; + + return ivpu_ipc_send_receive_internal(vdev, &req, VPU_JSM_MSG_DCT_DISABLE_DONE, &resp, + VPU_IPC_CHAN_ASYNC_CMD, vdev->timeout.jsm); +} + +int ivpu_jsm_state_dump(struct ivpu_device *vdev) +{ + struct vpu_jsm_msg req = { .type = VPU_JSM_MSG_STATE_DUMP }; + + return ivpu_ipc_send_and_wait(vdev, &req, VPU_IPC_CHAN_ASYNC_CMD, + vdev->timeout.state_dump_msg); +} diff --git a/drivers/accel/ivpu/ivpu_jsm_msg.h b/drivers/accel/ivpu/ivpu_jsm_msg.h index ae75e5dbcc41..9e84d3526a14 100644 --- 
a/drivers/accel/ivpu/ivpu_jsm_msg.h +++ b/drivers/accel/ivpu/ivpu_jsm_msg.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_JSM_MSG_H__ @@ -23,4 +23,26 @@ int ivpu_jsm_trace_set_config(struct ivpu_device *vdev, u32 trace_level, u32 tra u64 trace_hw_component_mask); int ivpu_jsm_context_release(struct ivpu_device *vdev, u32 host_ssid); int ivpu_jsm_pwr_d0i3_enter(struct ivpu_device *vdev); +int ivpu_jsm_hws_create_cmdq(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_group, u32 cmdq_id, + u32 pid, u32 engine, u64 cmdq_base, u32 cmdq_size); +int ivpu_jsm_hws_destroy_cmdq(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id); +int ivpu_jsm_hws_register_db(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id, u32 db_id, + u64 cmdq_base, u32 cmdq_size); +int ivpu_jsm_hws_resume_engine(struct ivpu_device *vdev, u32 engine); +int ivpu_jsm_hws_set_context_sched_properties(struct ivpu_device *vdev, u32 ctx_id, u32 cmdq_id, + u32 priority); +int ivpu_jsm_hws_set_scheduling_log(struct ivpu_device *vdev, u32 engine_idx, u32 host_ssid, + u64 vpu_log_buffer_va); +int ivpu_jsm_hws_setup_priority_bands(struct ivpu_device *vdev); +int ivpu_jsm_metric_streamer_start(struct ivpu_device *vdev, u64 metric_group_mask, + u64 sampling_rate, u64 buffer_addr, u64 buffer_size); +int ivpu_jsm_metric_streamer_stop(struct ivpu_device *vdev, u64 metric_group_mask); +int ivpu_jsm_metric_streamer_update(struct ivpu_device *vdev, u64 metric_group_mask, + u64 buffer_addr, u64 buffer_size, u64 *bytes_written); +int ivpu_jsm_metric_streamer_info(struct ivpu_device *vdev, u64 metric_group_mask, u64 buffer_addr, + u64 buffer_size, u32 *sample_size, u64 *info_size); +int ivpu_jsm_dct_enable(struct ivpu_device *vdev, u32 active_us, u32 inactive_us); +int ivpu_jsm_dct_disable(struct ivpu_device *vdev); +int ivpu_jsm_state_dump(struct ivpu_device *vdev); + #endif diff --git a/drivers/accel/ivpu/ivpu_mmu.c b/drivers/accel/ivpu/ivpu_mmu.c index 91bd640655ab..26ef52fbb93e 100644 --- a/drivers/accel/ivpu/ivpu_mmu.c +++ b/drivers/accel/ivpu/ivpu_mmu.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include <linux/circ_buf.h> @@ -278,7 +278,7 @@ static const char *ivpu_mmu_event_to_str(u32 cmd) case IVPU_MMU_EVT_F_VMS_FETCH: return "Fetch of VMS caused external abort"; default: - return "Unknown CMDQ command"; + return "Unknown event"; } } @@ -286,15 +286,15 @@ static const char *ivpu_mmu_cmdq_err_to_str(u32 err) { switch (err) { case IVPU_MMU_CERROR_NONE: - return "No CMDQ Error"; + return "No error"; case IVPU_MMU_CERROR_ILL: return "Illegal command"; case IVPU_MMU_CERROR_ABT: - return "External abort on CMDQ read"; + return "External abort on command queue read"; case IVPU_MMU_CERROR_ATC_INV_SYNC: return "Sync failed to complete ATS invalidation"; default: - return "Unknown CMDQ Error"; + return "Unknown error"; } } @@ -519,7 +519,8 @@ static int ivpu_mmu_cmdq_sync(struct ivpu_device *vdev) if (ret) return ret; - clflush_cache_range(q->base, IVPU_MMU_CMDQ_SIZE); + if (!ivpu_is_force_snoop_enabled(vdev)) + clflush_cache_range(q->base, IVPU_MMU_CMDQ_SIZE); REGV_WR32(IVPU_MMU_REG_CMDQ_PROD, q->prod); ret = ivpu_mmu_cmdq_wait_for_cons(vdev); @@ -567,7 +568,8 @@ static int ivpu_mmu_reset(struct ivpu_device *vdev) int ret; memset(mmu->cmdq.base, 0, IVPU_MMU_CMDQ_SIZE); - clflush_cache_range(mmu->cmdq.base, 
IVPU_MMU_CMDQ_SIZE); + if (!ivpu_is_force_snoop_enabled(vdev)) + clflush_cache_range(mmu->cmdq.base, IVPU_MMU_CMDQ_SIZE); mmu->cmdq.prod = 0; mmu->cmdq.cons = 0; @@ -661,7 +663,8 @@ static void ivpu_mmu_strtab_link_cd(struct ivpu_device *vdev, u32 sid) WRITE_ONCE(entry[1], str[1]); WRITE_ONCE(entry[0], str[0]); - clflush_cache_range(entry, IVPU_MMU_STRTAB_ENT_SIZE); + if (!ivpu_is_force_snoop_enabled(vdev)) + clflush_cache_range(entry, IVPU_MMU_STRTAB_ENT_SIZE); ivpu_dbg(vdev, MMU, "STRTAB write entry (SSID=%u): 0x%llx, 0x%llx\n", sid, str[0], str[1]); } @@ -693,7 +696,7 @@ unlock: return ret; } -static int ivpu_mmu_cd_add(struct ivpu_device *vdev, u32 ssid, u64 cd_dma) +static int ivpu_mmu_cdtab_entry_set(struct ivpu_device *vdev, u32 ssid, u64 cd_dma, bool valid) { struct ivpu_mmu_info *mmu = vdev->mmu; struct ivpu_mmu_cdtab *cdtab = &mmu->cdtab; @@ -705,40 +708,40 @@ static int ivpu_mmu_cd_add(struct ivpu_device *vdev, u32 ssid, u64 cd_dma) return -EINVAL; entry = cdtab->base + (ssid * IVPU_MMU_CDTAB_ENT_SIZE); - - if (cd_dma != 0) { - cd[0] = FIELD_PREP(IVPU_MMU_CD_0_TCR_T0SZ, IVPU_MMU_T0SZ_48BIT) | - FIELD_PREP(IVPU_MMU_CD_0_TCR_TG0, 0) | - FIELD_PREP(IVPU_MMU_CD_0_TCR_IRGN0, 0) | - FIELD_PREP(IVPU_MMU_CD_0_TCR_ORGN0, 0) | - FIELD_PREP(IVPU_MMU_CD_0_TCR_SH0, 0) | - FIELD_PREP(IVPU_MMU_CD_0_TCR_IPS, IVPU_MMU_IPS_48BIT) | - FIELD_PREP(IVPU_MMU_CD_0_ASID, ssid) | - IVPU_MMU_CD_0_TCR_EPD1 | - IVPU_MMU_CD_0_AA64 | - IVPU_MMU_CD_0_R | - IVPU_MMU_CD_0_ASET | - IVPU_MMU_CD_0_V; - cd[1] = cd_dma & IVPU_MMU_CD_1_TTB0_MASK; - cd[2] = 0; - cd[3] = 0x0000000000007444; - - /* For global context generate memory fault on VPU */ - if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) - cd[0] |= IVPU_MMU_CD_0_A; - } else { - memset(cd, 0, sizeof(cd)); - } + drm_WARN_ON(&vdev->drm, (entry[0] & IVPU_MMU_CD_0_V) == valid); + + cd[0] = FIELD_PREP(IVPU_MMU_CD_0_TCR_T0SZ, IVPU_MMU_T0SZ_48BIT) | + FIELD_PREP(IVPU_MMU_CD_0_TCR_TG0, 0) | + FIELD_PREP(IVPU_MMU_CD_0_TCR_IRGN0, 0) | + FIELD_PREP(IVPU_MMU_CD_0_TCR_ORGN0, 0) | + FIELD_PREP(IVPU_MMU_CD_0_TCR_SH0, 0) | + FIELD_PREP(IVPU_MMU_CD_0_TCR_IPS, IVPU_MMU_IPS_48BIT) | + FIELD_PREP(IVPU_MMU_CD_0_ASID, ssid) | + IVPU_MMU_CD_0_TCR_EPD1 | + IVPU_MMU_CD_0_AA64 | + IVPU_MMU_CD_0_R | + IVPU_MMU_CD_0_ASET; + cd[1] = cd_dma & IVPU_MMU_CD_1_TTB0_MASK; + cd[2] = 0; + cd[3] = 0x0000000000007444; + + /* For global context generate memory fault on VPU */ + if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) + cd[0] |= IVPU_MMU_CD_0_A; + + if (valid) + cd[0] |= IVPU_MMU_CD_0_V; WRITE_ONCE(entry[1], cd[1]); WRITE_ONCE(entry[2], cd[2]); WRITE_ONCE(entry[3], cd[3]); WRITE_ONCE(entry[0], cd[0]); - clflush_cache_range(entry, IVPU_MMU_CDTAB_ENT_SIZE); + if (!ivpu_is_force_snoop_enabled(vdev)) + clflush_cache_range(entry, IVPU_MMU_CDTAB_ENT_SIZE); - ivpu_dbg(vdev, MMU, "CDTAB %s entry (SSID=%u, dma=%pad): 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", - cd_dma ? "write" : "clear", ssid, &cd_dma, cd[0], cd[1], cd[2], cd[3]); + ivpu_dbg(vdev, MMU, "CDTAB set %s entry (SSID=%u, dma=%pad): 0x%llx, 0x%llx, 0x%llx, 0x%llx\n", + valid ? 
"valid" : "invalid", ssid, &cd_dma, cd[0], cd[1], cd[2], cd[3]); mutex_lock(&mmu->lock); if (!mmu->on) @@ -746,38 +749,18 @@ static int ivpu_mmu_cd_add(struct ivpu_device *vdev, u32 ssid, u64 cd_dma) ret = ivpu_mmu_cmdq_write_cfgi_all(vdev); if (ret) - goto unlock; + goto err_invalidate; ret = ivpu_mmu_cmdq_sync(vdev); + if (ret) + goto err_invalidate; unlock: mutex_unlock(&mmu->lock); - return ret; -} - -static int ivpu_mmu_cd_add_gbl(struct ivpu_device *vdev) -{ - int ret; - - ret = ivpu_mmu_cd_add(vdev, 0, vdev->gctx.pgtable.pgd_dma); - if (ret) - ivpu_err(vdev, "Failed to add global CD entry: %d\n", ret); - - return ret; -} - -static int ivpu_mmu_cd_add_user(struct ivpu_device *vdev, u32 ssid, dma_addr_t cd_dma) -{ - int ret; - - if (ssid == 0) { - ivpu_err(vdev, "Invalid SSID: %u\n", ssid); - return -EINVAL; - } - - ret = ivpu_mmu_cd_add(vdev, ssid, cd_dma); - if (ret) - ivpu_err(vdev, "Failed to add CD entry SSID=%u: %d\n", ssid, ret); + return 0; +err_invalidate: + WRITE_ONCE(entry[0], 0); + mutex_unlock(&mmu->lock); return ret; } @@ -804,12 +787,6 @@ int ivpu_mmu_init(struct ivpu_device *vdev) return ret; } - ret = ivpu_mmu_cd_add_gbl(vdev); - if (ret) { - ivpu_err(vdev, "Failed to initialize strtab: %d\n", ret); - return ret; - } - ret = ivpu_mmu_enable(vdev); if (ret) { ivpu_err(vdev, "Failed to resume MMU: %d\n", ret); @@ -874,8 +851,9 @@ static void ivpu_mmu_dump_event(struct ivpu_device *vdev, u32 *event) u64 in_addr = ((u64)event[5]) << 32 | event[4]; u32 sid = event[1]; - ivpu_err(vdev, "MMU EVTQ: 0x%x (%s) SSID: %d SID: %d, e[2] %08x, e[3] %08x, in addr: 0x%llx, fetch addr: 0x%llx\n", - op, ivpu_mmu_event_to_str(op), ssid, sid, event[2], event[3], in_addr, fetch_addr); + ivpu_err_ratelimited(vdev, "MMU EVTQ: 0x%x (%s) SSID: %d SID: %d, e[2] %08x, e[3] %08x, in addr: 0x%llx, fetch addr: 0x%llx\n", + op, ivpu_mmu_event_to_str(op), ssid, sid, + event[2], event[3], in_addr, fetch_addr); } static u32 *ivpu_mmu_get_event(struct ivpu_device *vdev) @@ -911,6 +889,9 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev) ivpu_mmu_user_context_mark_invalid(vdev, ssid); REGV_WR32(IVPU_MMU_REG_EVTQ_CONS_SEC, vdev->mmu->evtq.cons); } + + if (!kfifo_put(&vdev->hw->irq.fifo, IVPU_HW_IRQ_SRC_MMU_EVTQ)) + ivpu_err_ratelimited(vdev, "IRQ FIFO full\n"); } void ivpu_mmu_evtq_dump(struct ivpu_device *vdev) @@ -958,12 +939,12 @@ void ivpu_mmu_irq_gerr_handler(struct ivpu_device *vdev) REGV_WR32(IVPU_MMU_REG_GERRORN, gerror_val); } -int ivpu_mmu_set_pgtable(struct ivpu_device *vdev, int ssid, struct ivpu_mmu_pgtable *pgtable) +int ivpu_mmu_cd_set(struct ivpu_device *vdev, int ssid, struct ivpu_mmu_pgtable *pgtable) { - return ivpu_mmu_cd_add_user(vdev, ssid, pgtable->pgd_dma); + return ivpu_mmu_cdtab_entry_set(vdev, ssid, pgtable->pgd_dma, true); } -void ivpu_mmu_clear_pgtable(struct ivpu_device *vdev, int ssid) +void ivpu_mmu_cd_clear(struct ivpu_device *vdev, int ssid) { - ivpu_mmu_cd_add_user(vdev, ssid, 0); /* 0 will clear CD entry */ + ivpu_mmu_cdtab_entry_set(vdev, ssid, 0, false); } diff --git a/drivers/accel/ivpu/ivpu_mmu.h b/drivers/accel/ivpu/ivpu_mmu.h index 6fa35c240710..7afea9cd8731 100644 --- a/drivers/accel/ivpu/ivpu_mmu.h +++ b/drivers/accel/ivpu/ivpu_mmu.h @@ -40,8 +40,8 @@ struct ivpu_mmu_info { int ivpu_mmu_init(struct ivpu_device *vdev); void ivpu_mmu_disable(struct ivpu_device *vdev); int ivpu_mmu_enable(struct ivpu_device *vdev); -int ivpu_mmu_set_pgtable(struct ivpu_device *vdev, int ssid, struct ivpu_mmu_pgtable *pgtable); -void ivpu_mmu_clear_pgtable(struct 
ivpu_device *vdev, int ssid); +int ivpu_mmu_cd_set(struct ivpu_device *vdev, int ssid, struct ivpu_mmu_pgtable *pgtable); +void ivpu_mmu_cd_clear(struct ivpu_device *vdev, int ssid); int ivpu_mmu_invalidate_tlb(struct ivpu_device *vdev, u16 ssid); void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev); diff --git a/drivers/accel/ivpu/ivpu_mmu_context.c b/drivers/accel/ivpu/ivpu_mmu_context.c index fe6161299236..0af614dfb6f9 100644 --- a/drivers/accel/ivpu/ivpu_mmu_context.c +++ b/drivers/accel/ivpu/ivpu_mmu_context.c @@ -6,6 +6,7 @@ #include <linux/bitfield.h> #include <linux/highmem.h> #include <linux/set_memory.h> +#include <linux/vmalloc.h> #include <drm/drm_cache.h> @@ -23,6 +24,7 @@ #define IVPU_MMU_ENTRY_FLAG_CONT BIT(52) #define IVPU_MMU_ENTRY_FLAG_NG BIT(11) #define IVPU_MMU_ENTRY_FLAG_AF BIT(10) +#define IVPU_MMU_ENTRY_FLAG_RO BIT(7) #define IVPU_MMU_ENTRY_FLAG_USER BIT(6) #define IVPU_MMU_ENTRY_FLAG_LLC_COHERENT BIT(2) #define IVPU_MMU_ENTRY_FLAG_TYPE_PAGE BIT(1) @@ -88,19 +90,6 @@ static void ivpu_pgtable_free_page(struct ivpu_device *vdev, u64 *cpu_addr, dma_ } } -static int ivpu_mmu_pgtable_init(struct ivpu_device *vdev, struct ivpu_mmu_pgtable *pgtable) -{ - dma_addr_t pgd_dma; - - pgtable->pgd_dma_ptr = ivpu_pgtable_alloc_page(vdev, &pgd_dma); - if (!pgtable->pgd_dma_ptr) - return -ENOMEM; - - pgtable->pgd_dma = pgd_dma; - - return 0; -} - static void ivpu_mmu_pgtables_free(struct ivpu_device *vdev, struct ivpu_mmu_pgtable *pgtable) { int pgd_idx, pud_idx, pmd_idx; @@ -138,6 +127,27 @@ static void ivpu_mmu_pgtables_free(struct ivpu_device *vdev, struct ivpu_mmu_pgt } ivpu_pgtable_free_page(vdev, pgtable->pgd_dma_ptr, pgtable->pgd_dma); + pgtable->pgd_dma_ptr = NULL; + pgtable->pgd_dma = 0; +} + +static u64* +ivpu_mmu_ensure_pgd(struct ivpu_device *vdev, struct ivpu_mmu_pgtable *pgtable) +{ + u64 *pgd_dma_ptr = pgtable->pgd_dma_ptr; + dma_addr_t pgd_dma; + + if (pgd_dma_ptr) + return pgd_dma_ptr; + + pgd_dma_ptr = ivpu_pgtable_alloc_page(vdev, &pgd_dma); + if (!pgd_dma_ptr) + return NULL; + + pgtable->pgd_dma_ptr = pgd_dma_ptr; + pgtable->pgd_dma = pgd_dma; + + return pgd_dma_ptr; } static u64* @@ -235,6 +245,12 @@ ivpu_mmu_context_map_page(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx int pmd_idx = FIELD_GET(IVPU_MMU_PMD_INDEX_MASK, vpu_addr); int pte_idx = FIELD_GET(IVPU_MMU_PTE_INDEX_MASK, vpu_addr); + drm_WARN_ON(&vdev->drm, ctx->id == IVPU_RESERVED_CONTEXT_MMU_SSID); + + /* Allocate PGD - first level page table if needed */ + if (!ivpu_mmu_ensure_pgd(vdev, &ctx->pgtable)) + return -ENOMEM; + /* Allocate PUD - second level page table if needed */ if (!ivpu_mmu_ensure_pud(vdev, &ctx->pgtable, pgd_idx)) return -ENOMEM; @@ -318,6 +334,91 @@ ivpu_mmu_context_map_pages(struct ivpu_device *vdev, struct ivpu_mmu_context *ct return 0; } +static void ivpu_mmu_context_set_page_ro(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, + u64 vpu_addr) +{ + int pgd_idx = FIELD_GET(IVPU_MMU_PGD_INDEX_MASK, vpu_addr); + int pud_idx = FIELD_GET(IVPU_MMU_PUD_INDEX_MASK, vpu_addr); + int pmd_idx = FIELD_GET(IVPU_MMU_PMD_INDEX_MASK, vpu_addr); + int pte_idx = FIELD_GET(IVPU_MMU_PTE_INDEX_MASK, vpu_addr); + + ctx->pgtable.pte_ptrs[pgd_idx][pud_idx][pmd_idx][pte_idx] |= IVPU_MMU_ENTRY_FLAG_RO; +} + +static void ivpu_mmu_context_split_page(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, + u64 vpu_addr) +{ + int pgd_idx = FIELD_GET(IVPU_MMU_PGD_INDEX_MASK, vpu_addr); + int pud_idx = FIELD_GET(IVPU_MMU_PUD_INDEX_MASK, vpu_addr); + int pmd_idx = 
FIELD_GET(IVPU_MMU_PMD_INDEX_MASK, vpu_addr); + int pte_idx = FIELD_GET(IVPU_MMU_PTE_INDEX_MASK, vpu_addr); + + ctx->pgtable.pte_ptrs[pgd_idx][pud_idx][pmd_idx][pte_idx] &= ~IVPU_MMU_ENTRY_FLAG_CONT; +} + +static void ivpu_mmu_context_split_64k_page(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, + u64 vpu_addr) +{ + u64 start = ALIGN_DOWN(vpu_addr, IVPU_MMU_CONT_PAGES_SIZE); + u64 end = ALIGN(vpu_addr, IVPU_MMU_CONT_PAGES_SIZE); + u64 offset = 0; + + ivpu_dbg(vdev, MMU_MAP, "Split 64K page ctx: %u vpu_addr: 0x%llx\n", ctx->id, vpu_addr); + + while (start + offset < end) { + ivpu_mmu_context_split_page(vdev, ctx, start + offset); + offset += IVPU_MMU_PAGE_SIZE; + } +} + +int +ivpu_mmu_context_set_pages_ro(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u64 vpu_addr, + size_t size) +{ + u64 end = vpu_addr + size; + size_t size_left = size; + int ret; + + if (size == 0) + return 0; + + if (drm_WARN_ON(&vdev->drm, !IS_ALIGNED(vpu_addr | size, IVPU_MMU_PAGE_SIZE))) + return -EINVAL; + + mutex_lock(&ctx->lock); + + ivpu_dbg(vdev, MMU_MAP, "Set read-only pages ctx: %u vpu_addr: 0x%llx size: %lu\n", + ctx->id, vpu_addr, size); + + if (!ivpu_disable_mmu_cont_pages) { + /* Split 64K contiguous page at the beginning if needed */ + if (!IS_ALIGNED(vpu_addr, IVPU_MMU_CONT_PAGES_SIZE)) + ivpu_mmu_context_split_64k_page(vdev, ctx, vpu_addr); + + /* Split 64K contiguous page at the end if needed */ + if (!IS_ALIGNED(vpu_addr + size, IVPU_MMU_CONT_PAGES_SIZE)) + ivpu_mmu_context_split_64k_page(vdev, ctx, vpu_addr + size); + } + + while (size_left) { + if (vpu_addr < end) + ivpu_mmu_context_set_page_ro(vdev, ctx, vpu_addr); + + vpu_addr += IVPU_MMU_PAGE_SIZE; + size_left -= IVPU_MMU_PAGE_SIZE; + } + + /* Ensure page table modifications are flushed from wc buffers to memory */ + wmb(); + + mutex_unlock(&ctx->lock); + ret = ivpu_mmu_invalidate_tlb(vdev, ctx->id); + if (ret) + ivpu_err(vdev, "Failed to invalidate TLB for ctx %u: %d\n", ctx->id, ret); + + return 0; +} + static void ivpu_mmu_context_unmap_pages(struct ivpu_mmu_context *ctx, u64 vpu_addr, size_t size) { while (size) { @@ -331,6 +432,7 @@ int ivpu_mmu_context_map_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u64 vpu_addr, struct sg_table *sgt, bool llc_coherent) { + size_t start_vpu_addr = vpu_addr; struct scatterlist *sg; int ret; u64 prot; @@ -361,20 +463,36 @@ ivpu_mmu_context_map_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, ret = ivpu_mmu_context_map_pages(vdev, ctx, vpu_addr, dma_addr, size, prot); if (ret) { ivpu_err(vdev, "Failed to map context pages\n"); - mutex_unlock(&ctx->lock); - return ret; + goto err_unmap_pages; } vpu_addr += size; } + if (!ctx->is_cd_valid) { + ret = ivpu_mmu_cd_set(vdev, ctx->id, &ctx->pgtable); + if (ret) { + ivpu_err(vdev, "Failed to set context descriptor for context %u: %d\n", + ctx->id, ret); + goto err_unmap_pages; + } + ctx->is_cd_valid = true; + } + /* Ensure page table modifications are flushed from wc buffers to memory */ wmb(); - mutex_unlock(&ctx->lock); - ret = ivpu_mmu_invalidate_tlb(vdev, ctx->id); - if (ret) + if (ret) { ivpu_err(vdev, "Failed to invalidate TLB for ctx %u: %d\n", ctx->id, ret); + goto err_unmap_pages; + } + + mutex_unlock(&ctx->lock); + return 0; + +err_unmap_pages: + ivpu_mmu_context_unmap_pages(ctx, start_vpu_addr, vpu_addr - start_vpu_addr); + mutex_unlock(&ctx->lock); return ret; } @@ -443,65 +561,79 @@ ivpu_mmu_context_remove_node(struct ivpu_mmu_context *ctx, struct drm_mm_node *n mutex_unlock(&ctx->lock); } -static int 
-ivpu_mmu_context_init(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u32 context_id) +void ivpu_mmu_context_init(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u32 context_id) { u64 start, end; - int ret; mutex_init(&ctx->lock); - ret = ivpu_mmu_pgtable_init(vdev, &ctx->pgtable); - if (ret) { - ivpu_err(vdev, "Failed to initialize pgtable for ctx %u: %d\n", context_id, ret); - return ret; - } - if (!context_id) { start = vdev->hw->ranges.global.start; end = vdev->hw->ranges.shave.end; } else { - start = vdev->hw->ranges.user.start; - end = vdev->hw->ranges.dma.end; + start = min_t(u64, vdev->hw->ranges.user.start, vdev->hw->ranges.shave.start); + end = max_t(u64, vdev->hw->ranges.user.end, vdev->hw->ranges.dma.end); } drm_mm_init(&ctx->mm, start, end - start); ctx->id = context_id; - - return 0; } -static void ivpu_mmu_context_fini(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx) +void ivpu_mmu_context_fini(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx) { - if (drm_WARN_ON(&vdev->drm, !ctx->pgtable.pgd_dma_ptr)) - return; + if (ctx->is_cd_valid) { + ivpu_mmu_cd_clear(vdev, ctx->id); + ctx->is_cd_valid = false; + } mutex_destroy(&ctx->lock); ivpu_mmu_pgtables_free(vdev, &ctx->pgtable); drm_mm_takedown(&ctx->mm); - - ctx->pgtable.pgd_dma_ptr = NULL; - ctx->pgtable.pgd_dma = 0; } -int ivpu_mmu_global_context_init(struct ivpu_device *vdev) +void ivpu_mmu_global_context_init(struct ivpu_device *vdev) { - return ivpu_mmu_context_init(vdev, &vdev->gctx, IVPU_GLOBAL_CONTEXT_MMU_SSID); + ivpu_mmu_context_init(vdev, &vdev->gctx, IVPU_GLOBAL_CONTEXT_MMU_SSID); } void ivpu_mmu_global_context_fini(struct ivpu_device *vdev) { - return ivpu_mmu_context_fini(vdev, &vdev->gctx); + ivpu_mmu_context_fini(vdev, &vdev->gctx); } int ivpu_mmu_reserved_context_init(struct ivpu_device *vdev) { - return ivpu_mmu_user_context_init(vdev, &vdev->rctx, IVPU_RESERVED_CONTEXT_MMU_SSID); + int ret; + + ivpu_mmu_context_init(vdev, &vdev->rctx, IVPU_RESERVED_CONTEXT_MMU_SSID); + + mutex_lock(&vdev->rctx.lock); + + if (!ivpu_mmu_ensure_pgd(vdev, &vdev->rctx.pgtable)) { + ivpu_err(vdev, "Failed to allocate root page table for reserved context\n"); + ret = -ENOMEM; + goto err_ctx_fini; + } + + ret = ivpu_mmu_cd_set(vdev, vdev->rctx.id, &vdev->rctx.pgtable); + if (ret) { + ivpu_err(vdev, "Failed to set context descriptor for reserved context\n"); + goto err_ctx_fini; + } + + mutex_unlock(&vdev->rctx.lock); + return ret; + +err_ctx_fini: + mutex_unlock(&vdev->rctx.lock); + ivpu_mmu_context_fini(vdev, &vdev->rctx); + return ret; } void ivpu_mmu_reserved_context_fini(struct ivpu_device *vdev) { - return ivpu_mmu_user_context_fini(vdev, &vdev->rctx); + ivpu_mmu_cd_clear(vdev, vdev->rctx.id); + ivpu_mmu_context_fini(vdev, &vdev->rctx); } void ivpu_mmu_user_context_mark_invalid(struct ivpu_device *vdev, u32 ssid) @@ -516,36 +648,3 @@ void ivpu_mmu_user_context_mark_invalid(struct ivpu_device *vdev, u32 ssid) xa_unlock(&vdev->context_xa); } - -int ivpu_mmu_user_context_init(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u32 ctx_id) -{ - int ret; - - drm_WARN_ON(&vdev->drm, !ctx_id); - - ret = ivpu_mmu_context_init(vdev, ctx, ctx_id); - if (ret) { - ivpu_err(vdev, "Failed to initialize context %u: %d\n", ctx_id, ret); - return ret; - } - - ret = ivpu_mmu_set_pgtable(vdev, ctx_id, &ctx->pgtable); - if (ret) { - ivpu_err(vdev, "Failed to set page table for context %u: %d\n", ctx_id, ret); - goto err_context_fini; - } - - return 0; - -err_context_fini: - ivpu_mmu_context_fini(vdev, ctx); - 
return ret; -} - -void ivpu_mmu_user_context_fini(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx) -{ - drm_WARN_ON(&vdev->drm, !ctx->id); - - ivpu_mmu_clear_pgtable(vdev, ctx->id); - ivpu_mmu_context_fini(vdev, ctx); -} diff --git a/drivers/accel/ivpu/ivpu_mmu_context.h b/drivers/accel/ivpu/ivpu_mmu_context.h index 535db3a1fc74..8042fc067062 100644 --- a/drivers/accel/ivpu/ivpu_mmu_context.h +++ b/drivers/accel/ivpu/ivpu_mmu_context.h @@ -23,19 +23,20 @@ struct ivpu_mmu_pgtable { }; struct ivpu_mmu_context { - struct mutex lock; /* Protects: mm, pgtable */ + struct mutex lock; /* Protects: mm, pgtable, is_cd_valid */ struct drm_mm mm; struct ivpu_mmu_pgtable pgtable; + bool is_cd_valid; u32 id; }; -int ivpu_mmu_global_context_init(struct ivpu_device *vdev); +void ivpu_mmu_context_init(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u32 context_id); +void ivpu_mmu_context_fini(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx); +void ivpu_mmu_global_context_init(struct ivpu_device *vdev); void ivpu_mmu_global_context_fini(struct ivpu_device *vdev); int ivpu_mmu_reserved_context_init(struct ivpu_device *vdev); void ivpu_mmu_reserved_context_fini(struct ivpu_device *vdev); -int ivpu_mmu_user_context_init(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u32 ctx_id); -void ivpu_mmu_user_context_fini(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx); void ivpu_mmu_user_context_mark_invalid(struct ivpu_device *vdev, u32 ssid); int ivpu_mmu_context_insert_node(struct ivpu_mmu_context *ctx, const struct ivpu_addr_range *range, @@ -46,5 +47,7 @@ int ivpu_mmu_context_map_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context * u64 vpu_addr, struct sg_table *sgt, bool llc_coherent); void ivpu_mmu_context_unmap_sgt(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, u64 vpu_addr, struct sg_table *sgt); +int ivpu_mmu_context_set_pages_ro(struct ivpu_device *vdev, struct ivpu_mmu_context *ctx, + u64 vpu_addr, size_t size); #endif /* __IVPU_MMU_CONTEXT_H__ */ diff --git a/drivers/accel/ivpu/ivpu_ms.c b/drivers/accel/ivpu/ivpu_ms.c new file mode 100644 index 000000000000..ffe7b10f8a76 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_ms.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0-only OR MIT +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#include <drm/drm_file.h> + +#include "ivpu_drv.h" +#include "ivpu_gem.h" +#include "ivpu_jsm_msg.h" +#include "ivpu_ms.h" +#include "ivpu_pm.h" + +#define MS_INFO_BUFFER_SIZE SZ_64K +#define MS_NUM_BUFFERS 2 +#define MS_READ_PERIOD_MULTIPLIER 2 +#define MS_MIN_SAMPLE_PERIOD_NS 1000000 + +static struct ivpu_ms_instance * +get_instance_by_mask(struct ivpu_file_priv *file_priv, u64 metric_mask) +{ + struct ivpu_ms_instance *ms; + + lockdep_assert_held(&file_priv->ms_lock); + + list_for_each_entry(ms, &file_priv->ms_instance_list, ms_instance_node) + if (ms->mask == metric_mask) + return ms; + + return NULL; +} + +int ivpu_ms_start_ioctl(struct drm_device *dev, void *data, struct drm_file *file) +{ + struct ivpu_file_priv *file_priv = file->driver_priv; + struct drm_ivpu_metric_streamer_start *args = data; + struct ivpu_device *vdev = file_priv->vdev; + struct ivpu_ms_instance *ms; + u64 single_buff_size; + u32 sample_size; + int ret; + + if (!args->metric_group_mask || !args->read_period_samples || + args->sampling_period_ns < MS_MIN_SAMPLE_PERIOD_NS) + return -EINVAL; + + mutex_lock(&file_priv->ms_lock); + + if (get_instance_by_mask(file_priv, args->metric_group_mask)) { + ivpu_err(vdev, "Instance already exists (mask 
%#llx)\n", args->metric_group_mask); + ret = -EALREADY; + goto unlock; + } + + ms = kzalloc(sizeof(*ms), GFP_KERNEL); + if (!ms) { + ret = -ENOMEM; + goto unlock; + } + + ms->mask = args->metric_group_mask; + + ret = ivpu_jsm_metric_streamer_info(vdev, ms->mask, 0, 0, &sample_size, NULL); + if (ret) + goto err_free_ms; + + single_buff_size = sample_size * + ((u64)args->read_period_samples * MS_READ_PERIOD_MULTIPLIER); + ms->bo = ivpu_bo_create_global(vdev, PAGE_ALIGN(single_buff_size * MS_NUM_BUFFERS), + DRM_IVPU_BO_CACHED | DRM_IVPU_BO_MAPPABLE); + if (!ms->bo) { + ivpu_err(vdev, "Failed to allocate MS buffer (size %llu)\n", single_buff_size); + ret = -ENOMEM; + goto err_free_ms; + } + + ms->buff_size = ivpu_bo_size(ms->bo) / MS_NUM_BUFFERS; + ms->active_buff_vpu_addr = ms->bo->vpu_addr; + ms->inactive_buff_vpu_addr = ms->bo->vpu_addr + ms->buff_size; + ms->active_buff_ptr = ivpu_bo_vaddr(ms->bo); + ms->inactive_buff_ptr = ivpu_bo_vaddr(ms->bo) + ms->buff_size; + + ret = ivpu_jsm_metric_streamer_start(vdev, ms->mask, args->sampling_period_ns, + ms->active_buff_vpu_addr, ms->buff_size); + if (ret) + goto err_free_bo; + + args->sample_size = sample_size; + args->max_data_size = ivpu_bo_size(ms->bo); + list_add_tail(&ms->ms_instance_node, &file_priv->ms_instance_list); + goto unlock; + +err_free_bo: + ivpu_bo_free(ms->bo); +err_free_ms: + kfree(ms); +unlock: + mutex_unlock(&file_priv->ms_lock); + return ret; +} + +static int +copy_leftover_bytes(struct ivpu_ms_instance *ms, + void __user *user_ptr, u64 user_size, u64 *user_bytes_copied) +{ + u64 copy_bytes; + + if (ms->leftover_bytes) { + copy_bytes = min(user_size - *user_bytes_copied, ms->leftover_bytes); + if (copy_to_user(user_ptr + *user_bytes_copied, ms->leftover_addr, copy_bytes)) + return -EFAULT; + + ms->leftover_bytes -= copy_bytes; + ms->leftover_addr += copy_bytes; + *user_bytes_copied += copy_bytes; + } + + return 0; +} + +static int +copy_samples_to_user(struct ivpu_device *vdev, struct ivpu_ms_instance *ms, + void __user *user_ptr, u64 user_size, u64 *user_bytes_copied) +{ + u64 bytes_written; + int ret; + + *user_bytes_copied = 0; + + ret = copy_leftover_bytes(ms, user_ptr, user_size, user_bytes_copied); + if (ret) + return ret; + + if (*user_bytes_copied == user_size) + return 0; + + ret = ivpu_jsm_metric_streamer_update(vdev, ms->mask, ms->inactive_buff_vpu_addr, + ms->buff_size, &bytes_written); + if (ret) + return ret; + + swap(ms->active_buff_vpu_addr, ms->inactive_buff_vpu_addr); + swap(ms->active_buff_ptr, ms->inactive_buff_ptr); + + ms->leftover_bytes = bytes_written; + ms->leftover_addr = ms->inactive_buff_ptr; + + return copy_leftover_bytes(ms, user_ptr, user_size, user_bytes_copied); +} + +int ivpu_ms_get_data_ioctl(struct drm_device *dev, void *data, struct drm_file *file) +{ + struct drm_ivpu_metric_streamer_get_data *args = data; + struct ivpu_file_priv *file_priv = file->driver_priv; + struct ivpu_device *vdev = file_priv->vdev; + struct ivpu_ms_instance *ms; + u64 bytes_written; + int ret; + + if (!args->metric_group_mask) + return -EINVAL; + + mutex_lock(&file_priv->ms_lock); + + ms = get_instance_by_mask(file_priv, args->metric_group_mask); + if (!ms) { + ivpu_err(vdev, "Instance doesn't exist for mask: %#llx\n", args->metric_group_mask); + ret = -EINVAL; + goto unlock; + } + + if (!args->buffer_size) { + ret = ivpu_jsm_metric_streamer_update(vdev, ms->mask, 0, 0, &bytes_written); + if (ret) + goto unlock; + args->data_size = bytes_written + ms->leftover_bytes; + goto unlock; + } + + if (!args->buffer_ptr) { 
+ ret = -EINVAL; + goto unlock; + } + + ret = copy_samples_to_user(vdev, ms, u64_to_user_ptr(args->buffer_ptr), + args->buffer_size, &args->data_size); +unlock: + mutex_unlock(&file_priv->ms_lock); + + return ret; +} + +static void free_instance(struct ivpu_file_priv *file_priv, struct ivpu_ms_instance *ms) +{ + lockdep_assert_held(&file_priv->ms_lock); + + list_del(&ms->ms_instance_node); + ivpu_jsm_metric_streamer_stop(file_priv->vdev, ms->mask); + ivpu_bo_free(ms->bo); + kfree(ms); +} + +int ivpu_ms_stop_ioctl(struct drm_device *dev, void *data, struct drm_file *file) +{ + struct ivpu_file_priv *file_priv = file->driver_priv; + struct drm_ivpu_metric_streamer_stop *args = data; + struct ivpu_ms_instance *ms; + + if (!args->metric_group_mask) + return -EINVAL; + + mutex_lock(&file_priv->ms_lock); + + ms = get_instance_by_mask(file_priv, args->metric_group_mask); + if (ms) + free_instance(file_priv, ms); + + mutex_unlock(&file_priv->ms_lock); + + return ms ? 0 : -EINVAL; +} + +static inline struct ivpu_bo *get_ms_info_bo(struct ivpu_file_priv *file_priv) +{ + lockdep_assert_held(&file_priv->ms_lock); + + if (file_priv->ms_info_bo) + return file_priv->ms_info_bo; + + file_priv->ms_info_bo = ivpu_bo_create_global(file_priv->vdev, MS_INFO_BUFFER_SIZE, + DRM_IVPU_BO_CACHED | DRM_IVPU_BO_MAPPABLE); + return file_priv->ms_info_bo; +} + +int ivpu_ms_get_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file) +{ + struct drm_ivpu_metric_streamer_get_data *args = data; + struct ivpu_file_priv *file_priv = file->driver_priv; + struct ivpu_device *vdev = file_priv->vdev; + struct ivpu_bo *bo; + u64 info_size; + int ret; + + if (!args->metric_group_mask) + return -EINVAL; + + if (!args->buffer_size) + return ivpu_jsm_metric_streamer_info(vdev, args->metric_group_mask, + 0, 0, NULL, &args->data_size); + if (!args->buffer_ptr) + return -EINVAL; + + mutex_lock(&file_priv->ms_lock); + + bo = get_ms_info_bo(file_priv); + if (!bo) { + ret = -ENOMEM; + goto unlock; + } + + ret = ivpu_jsm_metric_streamer_info(vdev, args->metric_group_mask, bo->vpu_addr, + ivpu_bo_size(bo), NULL, &info_size); + if (ret) + goto unlock; + + if (args->buffer_size < info_size) { + ret = -ENOSPC; + goto unlock; + } + + if (copy_to_user(u64_to_user_ptr(args->buffer_ptr), ivpu_bo_vaddr(bo), info_size)) + ret = -EFAULT; + + args->data_size = info_size; +unlock: + mutex_unlock(&file_priv->ms_lock); + + return ret; +} + +void ivpu_ms_cleanup(struct ivpu_file_priv *file_priv) +{ + struct ivpu_ms_instance *ms, *tmp; + + mutex_lock(&file_priv->ms_lock); + + if (file_priv->ms_info_bo) { + ivpu_bo_free(file_priv->ms_info_bo); + file_priv->ms_info_bo = NULL; + } + + list_for_each_entry_safe(ms, tmp, &file_priv->ms_instance_list, ms_instance_node) + free_instance(file_priv, ms); + + mutex_unlock(&file_priv->ms_lock); +} + +void ivpu_ms_cleanup_all(struct ivpu_device *vdev) +{ + struct ivpu_file_priv *file_priv; + unsigned long ctx_id; + + mutex_lock(&vdev->context_list_lock); + + xa_for_each(&vdev->context_xa, ctx_id, file_priv) + ivpu_ms_cleanup(file_priv); + + mutex_unlock(&vdev->context_list_lock); +} diff --git a/drivers/accel/ivpu/ivpu_ms.h b/drivers/accel/ivpu/ivpu_ms.h new file mode 100644 index 000000000000..fbd5ebebc3d9 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_ms.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only OR MIT */ +/* + * Copyright (C) 2020-2024 Intel Corporation + */ +#ifndef __IVPU_MS_H__ +#define __IVPU_MS_H__ + +#include <linux/list.h> + +struct drm_device; +struct drm_file; +struct ivpu_bo; 
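(A quick worked sizing example for the metric-streamer buffers allocated in ivpu_ms_start_ioctl() above; the sample_size and read_period_samples values below are hypothetical, not taken from the patch.)

/*
 * Assuming sample_size = 64 bytes and args->read_period_samples = 1000:
 *   single_buff_size = 64 * (1000 * MS_READ_PERIOD_MULTIPLIER(2)) = 128000 bytes
 *   BO size          = PAGE_ALIGN(128000 * MS_NUM_BUFFERS(2))     = 258048 bytes (63 x 4 KiB pages)
 *   ms->buff_size    = 258048 / 2                                 = 129024 bytes per half
 * The two halves act as the active/inactive ping-pong buffers that
 * copy_samples_to_user() swaps on every read before copying leftover bytes.
 */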
+struct ivpu_device; +struct ivpu_file_priv; + +struct ivpu_ms_instance { + struct ivpu_bo *bo; + struct list_head ms_instance_node; + u64 mask; + u64 buff_size; + u64 active_buff_vpu_addr; + u64 inactive_buff_vpu_addr; + void *active_buff_ptr; + void *inactive_buff_ptr; + u64 leftover_bytes; + void *leftover_addr; +}; + +int ivpu_ms_start_ioctl(struct drm_device *dev, void *data, struct drm_file *file); +int ivpu_ms_stop_ioctl(struct drm_device *dev, void *data, struct drm_file *file); +int ivpu_ms_get_data_ioctl(struct drm_device *dev, void *data, struct drm_file *file); +int ivpu_ms_get_info_ioctl(struct drm_device *dev, void *data, struct drm_file *file); +void ivpu_ms_cleanup(struct ivpu_file_priv *file_priv); +void ivpu_ms_cleanup_all(struct ivpu_device *vdev); + +#endif /* __IVPU_MS_H__ */ diff --git a/drivers/accel/ivpu/ivpu_pm.c b/drivers/accel/ivpu/ivpu_pm.c index 5f73854234ba..5060c5dd40d1 100644 --- a/drivers/accel/ivpu/ivpu_pm.c +++ b/drivers/accel/ivpu/ivpu_pm.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #include <linux/highmem.h> @@ -9,20 +9,25 @@ #include <linux/pm_runtime.h> #include <linux/reboot.h> -#include "vpu_boot_api.h" +#include "ivpu_coredump.h" #include "ivpu_drv.h" -#include "ivpu_hw.h" #include "ivpu_fw.h" #include "ivpu_fw_log.h" +#include "ivpu_hw.h" #include "ivpu_ipc.h" #include "ivpu_job.h" #include "ivpu_jsm_msg.h" #include "ivpu_mmu.h" +#include "ivpu_ms.h" #include "ivpu_pm.h" +#include "ivpu_trace.h" +#include "vpu_boot_api.h" static bool ivpu_disable_recovery; +#if IS_ENABLED(CONFIG_DRM_ACCEL_IVPU_DEBUG) module_param_named_unsafe(disable_recovery, ivpu_disable_recovery, bool, 0644); -MODULE_PARM_DESC(disable_recovery, "Disables recovery when VPU hang is detected"); +MODULE_PARM_DESC(disable_recovery, "Disables recovery when NPU hang is detected"); +#endif static unsigned long ivpu_tdr_timeout_ms; module_param_named(tdr_timeout_ms, ivpu_tdr_timeout_ms, ulong, 0644); @@ -36,6 +41,7 @@ static void ivpu_pm_prepare_cold_boot(struct ivpu_device *vdev) ivpu_cmdq_reset_all_contexts(vdev); ivpu_ipc_reset(vdev); + ivpu_fw_log_reset(vdev); ivpu_fw_load(vdev); fw->entry_point = fw->cold_boot_entry_point; } @@ -58,14 +64,11 @@ static int ivpu_suspend(struct ivpu_device *vdev) { int ret; - /* Save PCI state before powering down as it sometimes gets corrupted if NPU hangs */ - pci_save_state(to_pci_dev(vdev->drm.dev)); + ivpu_prepare_for_reset(vdev); ret = ivpu_shutdown(vdev); if (ret) - ivpu_err(vdev, "Failed to shutdown VPU: %d\n", ret); - - pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); + ivpu_err(vdev, "Failed to shutdown NPU: %d\n", ret); return ret; } @@ -74,10 +77,10 @@ static int ivpu_resume(struct ivpu_device *vdev) { int ret; +retry: pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D0); pci_restore_state(to_pci_dev(vdev->drm.dev)); -retry: ret = ivpu_hw_power_up(vdev); if (ret) { ivpu_err(vdev, "Failed to power up HW: %d\n", ret); @@ -100,6 +103,7 @@ err_mmu_disable: ivpu_mmu_disable(vdev); err_power_down: ivpu_hw_power_down(vdev); + pci_set_power_state(to_pci_dev(vdev->drm.dev), PCI_D3hot); if (!ivpu_fw_is_cold_boot(vdev)) { ivpu_pm_prepare_cold_boot(vdev); @@ -111,39 +115,57 @@ err_power_down: return ret; } -static void ivpu_pm_recovery_work(struct work_struct *work) +static void ivpu_pm_reset_begin(struct ivpu_device *vdev) { - struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work); - struct ivpu_device 
*vdev = pm->vdev; - char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL}; - int ret; - - ivpu_err(vdev, "Recovering the VPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter)); - - ret = pm_runtime_resume_and_get(vdev->drm.dev); - if (ret) - ivpu_err(vdev, "Failed to resume VPU: %d\n", ret); - - ivpu_fw_log_dump(vdev); + pm_runtime_disable(vdev->drm.dev); atomic_inc(&vdev->pm->reset_counter); atomic_set(&vdev->pm->reset_pending, 1); down_write(&vdev->pm->reset_lock); +} + +static void ivpu_pm_reset_complete(struct ivpu_device *vdev) +{ + int ret; - ivpu_suspend(vdev); ivpu_pm_prepare_cold_boot(vdev); ivpu_jobs_abort_all(vdev); + ivpu_ms_cleanup_all(vdev); ret = ivpu_resume(vdev); - if (ret) + if (ret) { ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); + pm_runtime_set_suspended(vdev->drm.dev); + } else { + pm_runtime_set_active(vdev->drm.dev); + } up_write(&vdev->pm->reset_lock); atomic_set(&vdev->pm->reset_pending, 0); - kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt); pm_runtime_mark_last_busy(vdev->drm.dev); - pm_runtime_put_autosuspend(vdev->drm.dev); + pm_runtime_enable(vdev->drm.dev); +} + +static void ivpu_pm_recovery_work(struct work_struct *work) +{ + struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, recovery_work); + struct ivpu_device *vdev = pm->vdev; + char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL}; + + ivpu_err(vdev, "Recovering the NPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter)); + + ivpu_pm_reset_begin(vdev); + + if (!pm_runtime_status_suspended(vdev->drm.dev)) { + ivpu_jsm_state_dump(vdev); + ivpu_dev_coredump(vdev); + ivpu_suspend(vdev); + } + + ivpu_pm_reset_complete(vdev); + + kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt); } void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason) @@ -195,6 +217,7 @@ int ivpu_pm_suspend_cb(struct device *dev) struct ivpu_device *vdev = to_ivpu_device(drm); unsigned long timeout; + trace_pm("suspend"); ivpu_dbg(vdev, PM, "Suspend..\n"); timeout = jiffies + msecs_to_jiffies(vdev->timeout.tdr); @@ -212,6 +235,7 @@ int ivpu_pm_suspend_cb(struct device *dev) ivpu_pm_prepare_warm_boot(vdev); ivpu_dbg(vdev, PM, "Suspend done.\n"); + trace_pm("suspend done"); return 0; } @@ -222,6 +246,7 @@ int ivpu_pm_resume_cb(struct device *dev) struct ivpu_device *vdev = to_ivpu_device(drm); int ret; + trace_pm("resume"); ivpu_dbg(vdev, PM, "Resume..\n"); ret = ivpu_resume(vdev); @@ -229,6 +254,7 @@ int ivpu_pm_resume_cb(struct device *dev) ivpu_err(vdev, "Failed to resume: %d\n", ret); ivpu_dbg(vdev, PM, "Resume done.\n"); + trace_pm("resume done"); return ret; } @@ -237,42 +263,40 @@ int ivpu_pm_runtime_suspend_cb(struct device *dev) { struct drm_device *drm = dev_get_drvdata(dev); struct ivpu_device *vdev = to_ivpu_device(drm); - bool hw_is_idle = true; - int ret; + int ret, ret_d0i3; + bool is_idle; drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa)); drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work)); + trace_pm("runtime suspend"); ivpu_dbg(vdev, PM, "Runtime suspend..\n"); - if (!ivpu_hw_is_idle(vdev) && vdev->pm->suspend_reschedule_counter) { - ivpu_dbg(vdev, PM, "Failed to enter idle, rescheduling suspend, retries left %d\n", - vdev->pm->suspend_reschedule_counter); - pm_schedule_suspend(dev, vdev->timeout.reschedule_suspend); - vdev->pm->suspend_reschedule_counter--; - return -EAGAIN; - } + ivpu_mmu_disable(vdev); + + is_idle = ivpu_hw_is_idle(vdev) || vdev->pm->dct_active_percent; + if (!is_idle) + ivpu_err(vdev, "NPU is not idle before 
autosuspend\n"); - if (!vdev->pm->suspend_reschedule_counter) - hw_is_idle = false; - else if (ivpu_jsm_pwr_d0i3_enter(vdev)) - hw_is_idle = false; + ret_d0i3 = ivpu_jsm_pwr_d0i3_enter(vdev); + if (ret_d0i3) + ivpu_err(vdev, "Failed to prepare for d0i3: %d\n", ret_d0i3); ret = ivpu_suspend(vdev); if (ret) - ivpu_err(vdev, "Failed to set suspend VPU: %d\n", ret); + ivpu_err(vdev, "Failed to suspend NPU: %d\n", ret); - if (!hw_is_idle) { - ivpu_err(vdev, "VPU failed to enter idle, force suspended.\n"); - ivpu_fw_log_dump(vdev); + if (!is_idle || ret_d0i3) { + ivpu_err(vdev, "Forcing cold boot due to previous errors\n"); + atomic_inc(&vdev->pm->reset_counter); + ivpu_dev_coredump(vdev); ivpu_pm_prepare_cold_boot(vdev); } else { ivpu_pm_prepare_warm_boot(vdev); } - vdev->pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT; - ivpu_dbg(vdev, PM, "Runtime suspend done.\n"); + trace_pm("runtime suspend done"); return 0; } @@ -283,6 +307,7 @@ int ivpu_pm_runtime_resume_cb(struct device *dev) struct ivpu_device *vdev = to_ivpu_device(drm); int ret; + trace_pm("runtime resume"); ivpu_dbg(vdev, PM, "Runtime resume..\n"); ret = ivpu_resume(vdev); @@ -290,6 +315,7 @@ int ivpu_pm_runtime_resume_cb(struct device *dev) ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret); ivpu_dbg(vdev, PM, "Runtime resume done.\n"); + trace_pm("runtime resume done"); return ret; } @@ -299,18 +325,10 @@ int ivpu_rpm_get(struct ivpu_device *vdev) int ret; ret = pm_runtime_resume_and_get(vdev->drm.dev); - if (!drm_WARN_ON(&vdev->drm, ret < 0)) - vdev->pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT; - - return ret; -} - -int ivpu_rpm_get_if_active(struct ivpu_device *vdev) -{ - int ret; - - ret = pm_runtime_get_if_active(vdev->drm.dev, false); - drm_WARN_ON(&vdev->drm, ret < 0); + if (ret < 0) { + ivpu_err(vdev, "Failed to resume NPU: %d\n", ret); + pm_runtime_set_suspended(vdev->drm.dev); + } return ret; } @@ -326,33 +344,26 @@ void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev) struct ivpu_device *vdev = pci_get_drvdata(pdev); ivpu_dbg(vdev, PM, "Pre-reset..\n"); - atomic_inc(&vdev->pm->reset_counter); - atomic_set(&vdev->pm->reset_pending, 1); - pm_runtime_get_sync(vdev->drm.dev); - down_write(&vdev->pm->reset_lock); - ivpu_prepare_for_reset(vdev); - ivpu_hw_reset(vdev); - ivpu_pm_prepare_cold_boot(vdev); - ivpu_jobs_abort_all(vdev); + ivpu_pm_reset_begin(vdev); + + if (!pm_runtime_status_suspended(vdev->drm.dev)) { + ivpu_prepare_for_reset(vdev); + ivpu_hw_reset(vdev); + } + ivpu_dbg(vdev, PM, "Pre-reset done.\n"); } void ivpu_pm_reset_done_cb(struct pci_dev *pdev) { struct ivpu_device *vdev = pci_get_drvdata(pdev); - int ret; ivpu_dbg(vdev, PM, "Post-reset..\n"); - ret = ivpu_resume(vdev); - if (ret) - ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret); - up_write(&vdev->pm->reset_lock); - atomic_set(&vdev->pm->reset_pending, 0); - ivpu_dbg(vdev, PM, "Post-reset done.\n"); - pm_runtime_mark_last_busy(vdev->drm.dev); - pm_runtime_put_autosuspend(vdev->drm.dev); + ivpu_pm_reset_complete(vdev); + + ivpu_dbg(vdev, PM, "Post-reset done.\n"); } void ivpu_pm_init(struct ivpu_device *vdev) @@ -362,7 +373,6 @@ void ivpu_pm_init(struct ivpu_device *vdev) int delay; pm->vdev = vdev; - pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT; init_rwsem(&pm->reset_lock); atomic_set(&pm->reset_pending, 0); @@ -378,6 +388,7 @@ void ivpu_pm_init(struct ivpu_device *vdev) pm_runtime_use_autosuspend(dev); pm_runtime_set_autosuspend_delay(dev, delay); + pm_runtime_set_active(dev); ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", 
delay); } @@ -392,7 +403,6 @@ void ivpu_pm_enable(struct ivpu_device *vdev) { struct device *dev = vdev->drm.dev; - pm_runtime_set_active(dev); pm_runtime_allow(dev); pm_runtime_mark_last_busy(dev); pm_runtime_put_autosuspend(dev); @@ -403,3 +413,68 @@ void ivpu_pm_disable(struct ivpu_device *vdev) pm_runtime_get_noresume(vdev->drm.dev); pm_runtime_forbid(vdev->drm.dev); } + +int ivpu_pm_dct_init(struct ivpu_device *vdev) +{ + if (vdev->pm->dct_active_percent) + return ivpu_pm_dct_enable(vdev, vdev->pm->dct_active_percent); + + return 0; +} + +int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent) +{ + u32 active_us, inactive_us; + int ret; + + if (active_percent == 0 || active_percent > 100) + return -EINVAL; + + active_us = (DCT_PERIOD_US * active_percent) / 100; + inactive_us = DCT_PERIOD_US - active_us; + + ret = ivpu_jsm_dct_enable(vdev, active_us, inactive_us); + if (ret) { + ivpu_err_ratelimited(vdev, "Failed to enable DCT: %d\n", ret); + return ret; + } + + vdev->pm->dct_active_percent = active_percent; + + ivpu_dbg(vdev, PM, "DCT set to %u%% (D0: %uus, D0i2: %uus)\n", + active_percent, active_us, inactive_us); + return 0; +} + +int ivpu_pm_dct_disable(struct ivpu_device *vdev) +{ + int ret; + + ret = ivpu_jsm_dct_disable(vdev); + if (ret) { + ivpu_err_ratelimited(vdev, "Failed to disable DCT: %d\n", ret); + return ret; + } + + vdev->pm->dct_active_percent = 0; + + ivpu_dbg(vdev, PM, "DCT disabled\n"); + return 0; +} + +void ivpu_pm_dct_irq_thread_handler(struct ivpu_device *vdev) +{ + bool enable; + int ret; + + if (ivpu_hw_btrs_dct_get_request(vdev, &enable)) + return; + + if (vdev->pm->dct_active_percent) + ret = ivpu_pm_dct_enable(vdev, DCT_DEFAULT_ACTIVE_PERCENT); + else + ret = ivpu_pm_dct_disable(vdev); + + if (!ret) + ivpu_hw_btrs_dct_set_status(vdev, enable, vdev->pm->dct_active_percent); +} diff --git a/drivers/accel/ivpu/ivpu_pm.h b/drivers/accel/ivpu/ivpu_pm.h index ec60fbeefefc..b70efe6c36e4 100644 --- a/drivers/accel/ivpu/ivpu_pm.h +++ b/drivers/accel/ivpu/ivpu_pm.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation */ #ifndef __IVPU_PM_H__ @@ -19,7 +19,7 @@ struct ivpu_pm_info { atomic_t reset_counter; atomic_t reset_pending; bool is_warmboot; - u32 suspend_reschedule_counter; + u8 dct_active_percent; }; void ivpu_pm_init(struct ivpu_device *vdev); @@ -36,11 +36,15 @@ void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev); void ivpu_pm_reset_done_cb(struct pci_dev *pdev); int __must_check ivpu_rpm_get(struct ivpu_device *vdev); -int __must_check ivpu_rpm_get_if_active(struct ivpu_device *vdev); void ivpu_rpm_put(struct ivpu_device *vdev); void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason); void ivpu_start_job_timeout_detection(struct ivpu_device *vdev); void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev); +int ivpu_pm_dct_init(struct ivpu_device *vdev); +int ivpu_pm_dct_enable(struct ivpu_device *vdev, u8 active_percent); +int ivpu_pm_dct_disable(struct ivpu_device *vdev); +void ivpu_pm_dct_irq_thread_handler(struct ivpu_device *vdev); + #endif /* __IVPU_PM_H__ */ diff --git a/drivers/accel/ivpu/ivpu_sysfs.c b/drivers/accel/ivpu/ivpu_sysfs.c new file mode 100644 index 000000000000..616477fc17fa --- /dev/null +++ b/drivers/accel/ivpu/ivpu_sysfs.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2024 Intel Corporation + */ + +#include <linux/device.h> +#include <linux/err.h> + +#include 
"ivpu_drv.h" +#include "ivpu_fw.h" +#include "ivpu_hw.h" +#include "ivpu_sysfs.h" + +/* + * npu_busy_time_us is the time that the device spent executing jobs. + * The time is counted when and only when there are jobs submitted to firmware. + * + * This time can be used to measure the utilization of NPU, either by calculating + * npu_busy_time_us difference between two timepoints (i.e. measuring the time + * that the NPU was active during some workload) or monitoring utilization percentage + * by reading npu_busy_time_us periodically. + * + * When reading the value periodically, it shouldn't be read too often as it may have + * an impact on job submission performance. Recommended period is 1 second. + */ +static ssize_t +npu_busy_time_us_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct drm_device *drm = dev_get_drvdata(dev); + struct ivpu_device *vdev = to_ivpu_device(drm); + ktime_t total, now = 0; + + xa_lock(&vdev->submitted_jobs_xa); + total = vdev->busy_time; + if (!xa_empty(&vdev->submitted_jobs_xa)) + now = ktime_sub(ktime_get(), vdev->busy_start_ts); + xa_unlock(&vdev->submitted_jobs_xa); + + return sysfs_emit(buf, "%lld\n", ktime_to_us(ktime_add(total, now))); +} + +static DEVICE_ATTR_RO(npu_busy_time_us); + +/** + * DOC: sched_mode + * + * The sched_mode is used to report current NPU scheduling mode. + * + * It returns following strings: + * - "HW" - Hardware Scheduler mode + * - "OS" - Operating System Scheduler mode + * + */ +static ssize_t +sched_mode_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct drm_device *drm = dev_get_drvdata(dev); + struct ivpu_device *vdev = to_ivpu_device(drm); + + return sysfs_emit(buf, "%s\n", vdev->fw->sched_mode ? "HW" : "OS"); +} + +static DEVICE_ATTR_RO(sched_mode); + +static struct attribute *ivpu_dev_attrs[] = { + &dev_attr_npu_busy_time_us.attr, + &dev_attr_sched_mode.attr, + NULL, +}; + +static struct attribute_group ivpu_dev_attr_group = { + .attrs = ivpu_dev_attrs, +}; + +void ivpu_sysfs_init(struct ivpu_device *vdev) +{ + int ret; + + ret = devm_device_add_group(vdev->drm.dev, &ivpu_dev_attr_group); + if (ret) + ivpu_warn(vdev, "Failed to add group to device, ret %d", ret); +} diff --git a/drivers/accel/ivpu/ivpu_sysfs.h b/drivers/accel/ivpu/ivpu_sysfs.h new file mode 100644 index 000000000000..9836f09b35a3 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_sysfs.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 Intel Corporation + */ + +#ifndef __IVPU_SYSFS_H__ +#define __IVPU_SYSFS_H__ + +#include "ivpu_drv.h" + +void ivpu_sysfs_init(struct ivpu_device *vdev); + +#endif /* __IVPU_SYSFS_H__ */ diff --git a/drivers/accel/ivpu/ivpu_trace.h b/drivers/accel/ivpu/ivpu_trace.h new file mode 100644 index 000000000000..eb792038e701 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_trace.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#if !defined(__IVPU_TRACE_H__) || defined(TRACE_HEADER_MULTI_READ) +#define __IVPU_TRACE_H__ + +#include <linux/tracepoint.h> +#include "ivpu_drv.h" +#include "ivpu_job.h" +#include "vpu_jsm_api.h" +#include "ivpu_jsm_msg.h" +#include "ivpu_ipc.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vpu +#define TRACE_INCLUDE_FILE ivpu_trace + +TRACE_EVENT(pm, + TP_PROTO(const char *event), + TP_ARGS(event), + TP_STRUCT__entry(__field(const char *, event)), + TP_fast_assign(__entry->event = event;), + TP_printk("%s", __entry->event) +); + +TRACE_EVENT(job, + 
TP_PROTO(const char *event, struct ivpu_job *job), + TP_ARGS(event, job), + TP_STRUCT__entry(__field(const char *, event) + __field(u32, ctx_id) + __field(u32, engine_id) + __field(u32, job_id) + ), + TP_fast_assign(__entry->event = event; + __entry->ctx_id = job->file_priv->ctx.id; + __entry->engine_id = job->engine_idx; + __entry->job_id = job->job_id;), + TP_printk("%s context:%d engine:%d job:%d", + __entry->event, + __entry->ctx_id, + __entry->engine_id, + __entry->job_id) +); + +TRACE_EVENT(jsm, + TP_PROTO(const char *event, struct vpu_jsm_msg *msg), + TP_ARGS(event, msg), + TP_STRUCT__entry(__field(const char *, event) + __field(const char *, type) + __field(enum vpu_ipc_msg_status, status) + __field(u32, request_id) + __field(u32, result) + ), + TP_fast_assign(__entry->event = event; + __entry->type = ivpu_jsm_msg_type_to_str(msg->type); + __entry->status = msg->status; + __entry->request_id = msg->request_id; + __entry->result = msg->result;), + TP_printk("%s type:%s, status:%#x, id:%#x, result:%#x", + __entry->event, + __entry->type, + __entry->status, + __entry->request_id, + __entry->result) +); + +#endif /* __IVPU_TRACE_H__ */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#include <trace/define_trace.h> diff --git a/drivers/accel/ivpu/ivpu_trace_points.c b/drivers/accel/ivpu/ivpu_trace_points.c new file mode 100644 index 000000000000..f8fb99de0de3 --- /dev/null +++ b/drivers/accel/ivpu/ivpu_trace_points.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020-2024 Intel Corporation + */ + +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "ivpu_trace.h" +#endif diff --git a/drivers/accel/ivpu/vpu_boot_api.h b/drivers/accel/ivpu/vpu_boot_api.h index 04c954258563..908e68ea1c39 100644 --- a/drivers/accel/ivpu/vpu_boot_api.h +++ b/drivers/accel/ivpu/vpu_boot_api.h @@ -1,14 +1,13 @@ /* SPDX-License-Identifier: MIT */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (c) 2020-2024, Intel Corporation. */ #ifndef VPU_BOOT_API_H #define VPU_BOOT_API_H /* - * =========== FW API version information beginning ================ - * The bellow values will be used to construct the version info this way: + * The below values will be used to construct the version info this way: * fw_bin_header->api_version[VPU_BOOT_API_VER_ID] = (VPU_BOOT_API_VER_MAJOR << 16) | * VPU_BOOT_API_VER_MINOR; * VPU_BOOT_API_VER_PATCH will be ignored. KMD and compatibility is not affected if this changes @@ -27,21 +26,20 @@ * Minor version changes when API backward compatibility is preserved. * Resets to 0 if Major version is incremented. */ -#define VPU_BOOT_API_VER_MINOR 20 +#define VPU_BOOT_API_VER_MINOR 26 /* * API header changed (field names, documentation, formatting) but API itself has not been changed */ -#define VPU_BOOT_API_VER_PATCH 4 +#define VPU_BOOT_API_VER_PATCH 3 /* * Index in the API version table * Must be unique for each API */ #define VPU_BOOT_API_VER_INDEX 0 -/* ------------ FW API version information end ---------------------*/ -#pragma pack(push, 1) +#pragma pack(push, 4) /* * Firmware image header format @@ -66,12 +64,25 @@ struct vpu_firmware_header { /* Size of memory require for firmware execution */ u32 runtime_size; u32 shave_nn_fw_size; - /* Size of primary preemption buffer. */ + /* + * Size of primary preemption buffer, assuming a 2-job submission queue. + * NOTE: host driver is expected to adapt size accordingly to actual + * submission queue size and device capabilities. 
+ */ u32 preemption_buffer_1_size; - /* Size of secondary preemption buffer. */ + /* + * Size of secondary preemption buffer, assuming a 2-job submission queue. + * NOTE: host driver is expected to adapt size accordingly to actual + * submission queue size and device capabilities. + */ u32 preemption_buffer_2_size; /* Space reserved for future preemption-related fields. */ u32 preemption_reserved[6]; + /* FW image read only section start address, 4KB aligned */ + u64 ro_section_start_address; + /* FW image read only section size, 4KB aligned */ + u32 ro_section_size; + u32 reserved; }; /* @@ -151,8 +162,6 @@ enum vpu_trace_destination { /* VPU 30xx HW component IDs are sequential, so define first and last IDs. */ #define VPU_TRACE_PROC_BIT_30XX_FIRST VPU_TRACE_PROC_BIT_LRT #define VPU_TRACE_PROC_BIT_30XX_LAST VPU_TRACE_PROC_BIT_SHV_15 -#define VPU_TRACE_PROC_BIT_KMB_FIRST VPU_TRACE_PROC_BIT_30XX_FIRST -#define VPU_TRACE_PROC_BIT_KMB_LAST VPU_TRACE_PROC_BIT_30XX_LAST struct vpu_boot_l2_cache_config { u8 use; @@ -181,10 +190,21 @@ struct vpu_warm_boot_section { #define VPU_PRESENT_CALL_PERIOD_MS_MAX 10000 /** - * Macros to enable various operation modes within the VPU. + * Macros to enable various power profiles within the NPU. * To be defined as part of 32 bit mask. */ -#define VPU_OP_MODE_SURVIVABILITY 0x1 +#define POWER_PROFILE_SURVIVABILITY 0x1 + +/** + * Enum for dvfs_mode boot param. + */ +enum vpu_governor { + VPU_GOV_DEFAULT = 0, /* Default Governor for the system */ + VPU_GOV_MAX_PERFORMANCE = 1, /* Maximum performance governor */ + VPU_GOV_ON_DEMAND = 2, /* On Demand frequency control governor */ + VPU_GOV_POWER_SAVE = 3, /* Power save governor */ + VPU_GOV_ON_DEMAND_PRIORITY_AWARE = 4 /* On Demand priority based governor */ +}; struct vpu_boot_params { u32 magic; @@ -288,7 +308,14 @@ struct vpu_boot_params { u32 temp_sensor_period_ms; /** PLL ratio for efficient clock frequency */ u32 pn_freq_pll_ratio; - /** DVFS Mode: Default: 0, Max Performance: 1, On Demand: 2, Power Save: 3 */ + /** + * DVFS Mode: + * 0 - Default, DVFS mode selected by the firmware + * 1 - Max Performance + * 2 - On Demand + * 3 - Power Save + * 4 - On Demand Priority Aware + */ u32 dvfs_mode; /** * Depending on DVFS Mode: @@ -317,7 +344,22 @@ struct vpu_boot_params { u64 d0i3_residency_time_us; /* Value of VPU perf counter at the time of entering D0i3 state . */ u64 d0i3_entry_vpu_ts; - u32 pad4[20]; + /* + * The system time of the host operating system in microseconds. + * E.g the number of microseconds since 1st of January 1970, or whatever + * date the host operating system uses to maintain system time. + * This value will be used to track system time on the VPU. + * The KMD is required to update this value on every VPU reset. + */ + u64 system_time_us; + u32 pad4[2]; + /* + * The delta between device monotonic time and the current value of the + * HW timestamp register, in ticks. Written by the firmware during boot. + * Can be used by the KMD to calculate device time. 
+ */ + u64 device_time_delta_ticks; + u32 pad7[14]; /* Warm boot information: 0x400 - 0x43F */ u32 warm_boot_sections_count; u32 warm_boot_start_address_reference; @@ -344,16 +386,17 @@ struct vpu_boot_params { u32 vpu_focus_present_timer_ms; /* VPU ECC Signaling */ u32 vpu_uses_ecc_mca_signal; - /* Values defined by VPU_OP_MODE* macros */ - u32 vpu_operation_mode; - /* Unused/reserved: 0x480 - 0xFFF */ - u32 pad6[736]; + /* Values defined by POWER_PROFILE* macros */ + u32 power_profile; + /* Microsecond value for DCT active cycle */ + u32 dct_active_us; + /* Microsecond value for DCT inactive cycle */ + u32 dct_inactive_us; + /* Unused/reserved: 0x488 - 0xFFF */ + u32 pad6[734]; }; -/* - * Magic numbers set between host and vpu to detect corruptio of tracing init - */ - +/* Magic numbers set between host and vpu to detect corruption of tracing init */ #define VPU_TRACING_BUFFER_CANARY (0xCAFECAFE) /* Tracing buffer message format definitions */ @@ -373,7 +416,9 @@ struct vpu_tracing_buffer_header { u32 host_canary_start; /* offset from start of buffer for trace entries */ u32 read_index; - u32 pad_to_cache_line_size_0[14]; + /* keeps track of wrapping on the reader side */ + u32 read_wrap_count; + u32 pad_to_cache_line_size_0[13]; /* End of first cache line */ /** diff --git a/drivers/accel/ivpu/vpu_jsm_api.h b/drivers/accel/ivpu/vpu_jsm_api.h index 7da7622742be..7215c144158c 100644 --- a/drivers/accel/ivpu/vpu_jsm_api.h +++ b/drivers/accel/ivpu/vpu_jsm_api.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: MIT */ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (c) 2020-2024, Intel Corporation. */ /** @@ -22,7 +22,7 @@ /* * Minor version changes when API backward compatibility is preserved. */ -#define VPU_JSM_API_VER_MINOR 15 +#define VPU_JSM_API_VER_MINOR 25 /* * API header changed (field names, documentation, formatting) but API itself has not been changed @@ -36,15 +36,18 @@ /* * Number of Priority Bands for Hardware Scheduling - * Bands: RealTime, Focus, Normal, Idle + * Bands: Idle(0), Normal(1), Focus(2), RealTime(3) */ #define VPU_HWS_NUM_PRIORITY_BANDS 4 /* Max number of impacted contexts that can be dealt with the engine reset command */ #define VPU_MAX_ENGINE_RESET_IMPACTED_CONTEXTS 3 -/** Pack the API structures for now, once alignment issues are fixed this can be removed */ -#pragma pack(push, 1) +/* + * Pack the API structures to enforce binary compatibility + * Align to 8 bytes for optimal performance + */ +#pragma pack(push, 8) /* * Engine indexes. @@ -71,6 +74,7 @@ #define VPU_JSM_STATUS_MVNCI_INTERNAL_ERROR 0xCU /* Job status returned when the job was preempted mid-inference */ #define VPU_JSM_STATUS_PREEMPTED_MID_INFERENCE 0xDU +#define VPU_JSM_STATUS_MVNCI_CONTEXT_VIOLATION_HW 0xEU /* * Host <-> VPU IPC channels. @@ -83,18 +87,58 @@ /* * Job flags bit masks. */ -#define VPU_JOB_FLAGS_NULL_SUBMISSION_MASK 0x00000001 -#define VPU_JOB_FLAGS_PRIVATE_DATA_MASK 0xFF000000 +enum { + /* + * Null submission mask. + * When set, batch buffer's commands are not processed but returned as + * successful immediately, except fences and timestamps. + * When cleared, batch buffer's commands are processed normally. + * Used for testing and profiling purposes. + */ + VPU_JOB_FLAGS_NULL_SUBMISSION_MASK = (1 << 0U), + /* + * Inline command mask. + * When set, the object in job queue is an inline command (see struct vpu_inline_cmd below). + * When cleared, the object in job queue is a job (see struct vpu_job_queue_entry below). 
+ */ + VPU_JOB_FLAGS_INLINE_CMD_MASK = (1 << 1U), + /* + * VPU private data mask. + * Reserved for the VPU to store private data about the job (or inline command) + * while being processed. + */ + VPU_JOB_FLAGS_PRIVATE_DATA_MASK = 0xFFFF0000U +}; /* - * Sizes of the reserved areas in jobs, in bytes. + * Job queue flags bit masks. */ -#define VPU_JOB_RESERVED_BYTES 8 +enum { + /* + * No job done notification mask. + * When set, indicates that no job done notification should be sent for any + * job from this queue. When cleared, indicates that job done notification + * should be sent for every job completed from this queue. + */ + VPU_JOB_QUEUE_FLAGS_NO_JOB_DONE_MASK = (1 << 0U), + /* + * Native fence usage mask. + * When set, indicates that job queue uses native fences (as inline commands + * in job queue). Such queues may also use legacy fences (as commands in batch buffers). + * When cleared, indicates the job queue only uses legacy fences. + * NOTE: For queues using native fences, VPU expects that all jobs in the queue + * are immediately followed by an inline command object. This object is expected + * to be a fence signal command in most cases, but can also be a NOP in case the host + * does not need per-job fence signalling. Other inline commands objects can be + * inserted between "job and inline command" pairs. + */ + VPU_JOB_QUEUE_FLAGS_USE_NATIVE_FENCE_MASK = (1 << 1U), -/* - * Sizes of the reserved areas in job queues, in bytes. - */ -#define VPU_JOB_QUEUE_RESERVED_BYTES 52 + /* + * Enable turbo mode for testing NPU performance; not recommended for regular usage. + */ + VPU_JOB_QUEUE_FLAGS_TURBO_MODE = (1 << 2U) +}; /* * Max length (including trailing NULL char) of trace entity name (e.g., the @@ -125,23 +169,125 @@ #define VPU_HWS_MAX_REALTIME_PRIORITY_LEVEL 31U /* + * vpu_jsm_engine_reset_context flag definitions + */ +#define VPU_ENGINE_RESET_CONTEXT_FLAG_COLLATERAL_DAMAGE_MASK BIT(0) +#define VPU_ENGINE_RESET_CONTEXT_HANG_PRIMARY_CAUSE 0 +#define VPU_ENGINE_RESET_CONTEXT_COLLATERAL_DAMAGE 1 + +/* + * Invalid command queue handle identifier. Applies to cmdq_id and cmdq_group + * in this API. + */ +#define VPU_HWS_INVALID_CMDQ_HANDLE 0ULL + +/* + * Inline commands types. + */ +/* + * NOP. + * VPU does nothing other than consuming the inline command object. + */ +#define VPU_INLINE_CMD_TYPE_NOP 0x0 +/* + * Fence wait. + * VPU waits for the fence current value to reach monitored value. + * Fence wait operations are executed upon job dispatching. While waiting for + * the fence to be satisfied, VPU blocks fetching of the next objects in the queue. + * Jobs present in the queue prior to the fence wait object may be processed + * concurrently. + */ +#define VPU_INLINE_CMD_TYPE_FENCE_WAIT 0x1 +/* + * Fence signal. + * VPU sets the fence current value to the provided value. If new current value + * is equal to or higher than monitored value, VPU sends fence signalled notification + * to the host. Fence signal operations are executed upon completion of all the jobs + * present in the queue prior to them, and in-order relative to each other in the queue. + * But jobs in-between them may be processed concurrently and may complete out-of-order. + */ +#define VPU_INLINE_CMD_TYPE_FENCE_SIGNAL 0x2 + +/* + * Job scheduling priority bands for both hardware scheduling and OS scheduling. 
+ */ +enum vpu_job_scheduling_priority_band { + VPU_JOB_SCHEDULING_PRIORITY_BAND_IDLE = 0, + VPU_JOB_SCHEDULING_PRIORITY_BAND_NORMAL = 1, + VPU_JOB_SCHEDULING_PRIORITY_BAND_FOCUS = 2, + VPU_JOB_SCHEDULING_PRIORITY_BAND_REALTIME = 3, + VPU_JOB_SCHEDULING_PRIORITY_BAND_COUNT = 4, +}; + +/* * Job format. + * Jobs defines the actual workloads to be executed by a given engine. */ struct vpu_job_queue_entry { - u64 batch_buf_addr; /**< Address of VPU commands batch buffer */ - u32 job_id; /**< Job ID */ - u32 flags; /**< Flags bit field, see VPU_JOB_FLAGS_* above */ - u64 root_page_table_addr; /**< Address of root page table to use for this job */ - u64 root_page_table_update_counter; /**< Page tables update events counter */ - u64 primary_preempt_buf_addr; + /**< Address of VPU commands batch buffer */ + u64 batch_buf_addr; + /**< Job ID */ + u32 job_id; + /**< Flags bit field, see VPU_JOB_FLAGS_* above */ + u32 flags; + /** + * Doorbell ring timestamp taken by KMD from SoC's global system clock, in + * microseconds. NPU can convert this value to its own fixed clock's timebase, + * to match other profiling timestamps. + */ + u64 doorbell_timestamp; + /**< Extra id for job tracking, used only in the firmware perf traces */ + u64 host_tracking_id; /**< Address of the primary preemption buffer to use for this job */ - u32 primary_preempt_buf_size; + u64 primary_preempt_buf_addr; /**< Size of the primary preemption buffer to use for this job */ - u32 secondary_preempt_buf_size; + u32 primary_preempt_buf_size; /**< Size of secondary preemption buffer to use for this job */ - u64 secondary_preempt_buf_addr; + u32 secondary_preempt_buf_size; /**< Address of secondary preemption buffer to use for this job */ - u8 reserved_0[VPU_JOB_RESERVED_BYTES]; + u64 secondary_preempt_buf_addr; + u64 reserved_0; +}; + +/* + * Inline command format. + * Inline commands are the commands executed at scheduler level (typically, + * synchronization directives). Inline command and job objects must be of + * the same size and have flags field at same offset. + */ +struct vpu_inline_cmd { + u64 reserved_0; + /* Inline command type, see VPU_INLINE_CMD_TYPE_* defines. */ + u32 type; + /* Flags bit field, see VPU_JOB_FLAGS_* above. */ + u32 flags; + /* Inline command payload. Depends on inline command type. */ + union { + /* Fence (wait and signal) commands' payload. */ + struct { + /* Fence object handle. */ + u64 fence_handle; + /* User VA of the current fence value. */ + u64 current_value_va; + /* User VA of the monitored fence value (read-only). */ + u64 monitored_value_va; + /* Value to wait for or write in fence location. */ + u64 value; + /* User VA of the log buffer in which to add log entry on completion. */ + u64 log_buffer_va; + } fence; + /* Other commands do not have a payload. */ + /* Payload definition for future inline commands can be inserted here. */ + u64 reserved_1[6]; + } payload; +}; + +/* + * Job queue slots can be populated either with job objects or inline command objects. + */ +union vpu_jobq_slot { + struct vpu_job_queue_entry job; + struct vpu_inline_cmd inline_cmd; }; /* @@ -151,7 +297,21 @@ struct vpu_job_queue_header { u32 engine_idx; u32 head; u32 tail; - u8 reserved_0[VPU_JOB_QUEUE_RESERVED_BYTES]; + u32 flags; + /* Set to 1 to indicate priority_band field is valid */ + u32 priority_band_valid; + /* + * Priority for the work of this job queue, valid only if the HWS is NOT used + * and the `priority_band_valid` is set to 1. 
It is applied only during + * the VPU_JSM_MSG_REGISTER_DB message processing. + * The device firmware might use the `priority_band` to optimize the power + * management logic, but it will not affect the order of jobs. + * Available priority bands: @see enum vpu_job_scheduling_priority_band + */ + u32 priority_band; + /* Inside realtime band assigns a further priority, limited to 0..31 range */ + u32 realtime_priority_level; + u32 reserved_0[9]; }; /* @@ -159,7 +319,7 @@ struct vpu_job_queue_header { */ struct vpu_job_queue { struct vpu_job_queue_header header; - struct vpu_job_queue_entry job[]; + union vpu_jobq_slot slot[]; }; /** @@ -181,9 +341,7 @@ enum vpu_trace_entity_type { struct vpu_hws_log_buffer_header { /* Written by VPU after adding a log entry. Initialised by host to 0. */ u32 first_free_entry_index; - /* Incremented by VPU every time the VPU overwrites the 0th entry; - * initialised by host to 0. - */ + /* Incremented by VPU every time the VPU writes the 0th entry; initialised by host to 0. */ u32 wraparound_count; /* * This is the number of buffers that can be stored in the log buffer provided by the host. @@ -214,14 +372,80 @@ struct vpu_hws_log_buffer_entry { u64 operation_data[2]; }; +/* Native fence log buffer types. */ +enum vpu_hws_native_fence_log_type { + VPU_HWS_NATIVE_FENCE_LOG_TYPE_WAITS = 1, + VPU_HWS_NATIVE_FENCE_LOG_TYPE_SIGNALS = 2 +}; + +/* HWS native fence log buffer header. */ +struct vpu_hws_native_fence_log_header { + union { + struct { + /* Index of the first free entry in buffer. */ + u32 first_free_entry_idx; + /* Incremented each time NPU wraps around the buffer to write next entry. */ + u32 wraparound_count; + }; + /* Field allowing atomic update of both fields above. */ + u64 atomic_wraparound_and_entry_idx; + }; + /* Log buffer type, see enum vpu_hws_native_fence_log_type. */ + u64 type; + /* Allocated number of entries in the log buffer. */ + u64 entry_nb; + u64 reserved[2]; +}; + +/* Native fence log operation types. */ +enum vpu_hws_native_fence_log_op { + VPU_HWS_NATIVE_FENCE_LOG_OP_SIGNAL_EXECUTED = 0, + VPU_HWS_NATIVE_FENCE_LOG_OP_WAIT_UNBLOCKED = 1 +}; + +/* HWS native fence log entry. */ +struct vpu_hws_native_fence_log_entry { + /* Newly signaled/unblocked fence value. */ + u64 fence_value; + /* Native fence object handle to which this operation belongs. */ + u64 fence_handle; + /* Operation type, see enum vpu_hws_native_fence_log_op. */ + u64 op_type; + u64 reserved_0; + /* + * VPU_HWS_NATIVE_FENCE_LOG_OP_WAIT_UNBLOCKED only: Timestamp at which fence + * wait was started (in NPU SysTime). + */ + u64 fence_wait_start_ts; + u64 reserved_1; + /* Timestamp at which fence operation was completed (in NPU SysTime). */ + u64 fence_end_ts; +}; + +/* Native fence log buffer. */ +struct vpu_hws_native_fence_log_buffer { + struct vpu_hws_native_fence_log_header header; + struct vpu_hws_native_fence_log_entry entry[]; +}; + /* * Host <-> VPU IPC messages types. */ enum vpu_ipc_msg_type { VPU_JSM_MSG_UNKNOWN = 0xFFFFFFFF, + /* IPC Host -> Device, Async commands */ VPU_JSM_MSG_ASYNC_CMD = 0x1100, VPU_JSM_MSG_ENGINE_RESET = VPU_JSM_MSG_ASYNC_CMD, + /** + * Preempt engine. The NPU stops (preempts) all the jobs currently + * executing on the target engine making the engine become idle and ready to + * execute new jobs. 
+ * NOTE: The NPU does not remove unstarted jobs (if any) from job queues of + * the target engine, but it stops processing them (until the queue doorbell + * is rung again); the host is responsible to reset the job queue, either + * after preemption or when resubmitting jobs to the queue. + */ VPU_JSM_MSG_ENGINE_PREEMPT = 0x1101, VPU_JSM_MSG_REGISTER_DB = 0x1102, VPU_JSM_MSG_UNREGISTER_DB = 0x1103, @@ -307,9 +531,10 @@ enum vpu_ipc_msg_type { * NOTE: Please introduce new ASYNC commands before this one. * */ VPU_JSM_MSG_STATE_DUMP = 0x11FF, + /* IPC Host -> Device, General commands */ VPU_JSM_MSG_GENERAL_CMD = 0x1200, - VPU_JSM_MSG_BLOB_DEINIT = VPU_JSM_MSG_GENERAL_CMD, + VPU_JSM_MSG_BLOB_DEINIT_DEPRECATED = VPU_JSM_MSG_GENERAL_CMD, /** * Control dyndbg behavior by executing a dyndbg command; equivalent to * Linux command: `echo '<dyndbg_cmd>' > <debugfs>/dynamic_debug/control`. @@ -319,8 +544,12 @@ enum vpu_ipc_msg_type { * Perform the save procedure for the D0i3 entry */ VPU_JSM_MSG_PWR_D0I3_ENTER = 0x1202, + /* IPC Device -> Host, Job completion */ VPU_JSM_MSG_JOB_DONE = 0x2100, + /* IPC Device -> Host, Fence signalled */ + VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED = 0x2101, + /* IPC Device -> Host, Async command completion */ VPU_JSM_MSG_ASYNC_CMD_DONE = 0x2200, VPU_JSM_MSG_ENGINE_RESET_DONE = VPU_JSM_MSG_ASYNC_CMD_DONE, @@ -406,6 +635,7 @@ enum vpu_ipc_msg_type { * NOTE: Please introduce new ASYNC responses before this one. * */ VPU_JSM_MSG_STATE_DUMP_RSP = 0x22FF, + /* IPC Device -> Host, General command completion */ VPU_JSM_MSG_GENERAL_CMD_DONE = 0x2300, VPU_JSM_MSG_BLOB_DEINIT_DONE = VPU_JSM_MSG_GENERAL_CMD_DONE, @@ -584,11 +814,6 @@ struct vpu_jsm_metric_streamer_update { u64 next_buffer_size; }; -struct vpu_ipc_msg_payload_blob_deinit { - /* 64-bit unique ID for the blob to be de-initialized. */ - u64 blob_id; -}; - struct vpu_ipc_msg_payload_job_done { /* Engine to which the job was submitted. */ u32 engine_idx; @@ -606,6 +831,21 @@ struct vpu_ipc_msg_payload_job_done { u64 cmdq_id; }; +/* + * Notification message upon native fence signalling. + * @see VPU_JSM_MSG_NATIVE_FENCE_SIGNALLED + */ +struct vpu_ipc_msg_payload_native_fence_signalled { + /* Engine ID. */ + u32 engine_idx; + /* Host SSID. */ + u32 host_ssid; + /* CMDQ ID */ + u64 cmdq_id; + /* Fence object handle. */ + u64 fence_handle; +}; + struct vpu_jsm_engine_reset_context { /* Host SSID */ u32 host_ssid; @@ -613,7 +853,7 @@ struct vpu_jsm_engine_reset_context { u32 reserved_0; /* Command queue id */ u64 cmdq_id; - /* Flags: 0: cause of hang; 1: collateral damage of reset */ + /* See VPU_ENGINE_RESET_CONTEXT_* defines */ u64 flags; }; @@ -684,11 +924,6 @@ struct vpu_ipc_msg_payload_get_power_level_count_done { u8 power_limit[16]; }; -struct vpu_ipc_msg_payload_blob_deinit_done { - /* 64-bit unique ID for the blob de-initialized. */ - u64 blob_id; -}; - /* HWS priority band setup request / response */ struct vpu_ipc_msg_payload_hws_priority_band_setup { /* @@ -730,11 +965,7 @@ struct vpu_ipc_msg_payload_hws_create_cmdq { u32 host_ssid; /* Engine for which queue is being created */ u32 engine_idx; - /* - * Cmdq group may be set to 0 or equal to - * cmdq_id while each priority band contains - * only single engine instances. 
- */ + /* Cmdq group: only used for HWS logging of state changes */ u64 cmdq_group; /* Command queue id */ u64 cmdq_id; @@ -782,7 +1013,10 @@ struct vpu_ipc_msg_payload_hws_set_context_sched_properties { u32 reserved_0; /* Command queue id */ u64 cmdq_id; - /* Priority band to assign to work of this context */ + /* + * Priority band to assign to work of this context. + * Available priority bands: @see enum vpu_job_scheduling_priority_band + */ u32 priority_band; /* Inside realtime band assigns a further priority */ u32 realtime_priority_level; @@ -856,6 +1090,12 @@ struct vpu_ipc_msg_payload_hws_set_scheduling_log { * is generated when an event log is written to this index. */ u64 notify_index; + /* + * Field is now deprecated, will be removed when KMD is updated to support removal + */ + u32 enable_extra_events; + /* Zero Padding */ + u32 reserved_0; }; /* @@ -1223,10 +1463,10 @@ union vpu_ipc_msg_payload { struct vpu_jsm_metric_streamer_start metric_streamer_start; struct vpu_jsm_metric_streamer_stop metric_streamer_stop; struct vpu_jsm_metric_streamer_update metric_streamer_update; - struct vpu_ipc_msg_payload_blob_deinit blob_deinit; struct vpu_ipc_msg_payload_ssid_release ssid_release; struct vpu_jsm_hws_register_db hws_register_db; struct vpu_ipc_msg_payload_job_done job_done; + struct vpu_ipc_msg_payload_native_fence_signalled native_fence_signalled; struct vpu_ipc_msg_payload_engine_reset_done engine_reset_done; struct vpu_ipc_msg_payload_engine_preempt_done engine_preempt_done; struct vpu_ipc_msg_payload_register_db_done register_db_done; @@ -1234,7 +1474,6 @@ union vpu_ipc_msg_payload { struct vpu_ipc_msg_payload_query_engine_hb_done query_engine_hb_done; struct vpu_ipc_msg_payload_get_power_level_count_done get_power_level_count_done; struct vpu_jsm_metric_streamer_done metric_streamer_done; - struct vpu_ipc_msg_payload_blob_deinit_done blob_deinit_done; struct vpu_ipc_msg_payload_trace_config trace_config; struct vpu_ipc_msg_payload_trace_capability_rsp trace_capability; struct vpu_ipc_msg_payload_trace_get_name trace_get_name; diff --git a/drivers/accel/qaic/Makefile b/drivers/accel/qaic/Makefile index 3f7f6dfde7f2..35e883515629 100644 --- a/drivers/accel/qaic/Makefile +++ b/drivers/accel/qaic/Makefile @@ -10,4 +10,7 @@ qaic-y := \ qaic_control.o \ qaic_data.o \ qaic_drv.o \ - qaic_timesync.o + qaic_timesync.o \ + sahara.o + +qaic-$(CONFIG_DEBUG_FS) += qaic_debugfs.o diff --git a/drivers/accel/qaic/mhi_controller.c b/drivers/accel/qaic/mhi_controller.c index cb77d048ed54..8ab82e78dd94 100644 --- a/drivers/accel/qaic/mhi_controller.c +++ b/drivers/accel/qaic/mhi_controller.c @@ -20,7 +20,7 @@ static unsigned int mhi_timeout_ms = 2000; /* 2 sec default */ module_param(mhi_timeout_ms, uint, 0600); MODULE_PARM_DESC(mhi_timeout_ms, "MHI controller timeout value"); -static struct mhi_channel_config aic100_channels[] = { +static const struct mhi_channel_config aic100_channels[] = { { .name = "QAIC_LOOPBACK", .num = 0, @@ -358,8 +358,8 @@ static struct mhi_channel_config aic100_channels[] = { .wake_capable = false, }, { - .num = 21, .name = "QAIC_TIMESYNC", + .num = 21, .num_elements = 32, .local_elements = 0, .event_ring = 0, @@ -390,8 +390,8 @@ static struct mhi_channel_config aic100_channels[] = { .wake_capable = false, }, { - .num = 23, .name = "QAIC_TIMESYNC_PERIODIC", + .num = 23, .num_elements = 32, .local_elements = 0, .event_ring = 0, @@ -405,6 +405,38 @@ static struct mhi_channel_config aic100_channels[] = { .auto_queue = false, .wake_capable = false, }, + { + .name = 
"IPCR", + .num = 24, + .num_elements = 32, + .local_elements = 0, + .event_ring = 0, + .dir = DMA_TO_DEVICE, + .ee_mask = MHI_CH_EE_AMSS, + .pollcfg = 0, + .doorbell = MHI_DB_BRST_DISABLE, + .lpm_notify = false, + .offload_channel = false, + .doorbell_mode_switch = false, + .auto_queue = false, + .wake_capable = false, + }, + { + .name = "IPCR", + .num = 25, + .num_elements = 32, + .local_elements = 0, + .event_ring = 0, + .dir = DMA_FROM_DEVICE, + .ee_mask = MHI_CH_EE_AMSS, + .pollcfg = 0, + .doorbell = MHI_DB_BRST_DISABLE, + .lpm_notify = false, + .offload_channel = false, + .doorbell_mode_switch = false, + .auto_queue = true, + .wake_capable = false, + }, }; static struct mhi_event_config aic100_events[] = { diff --git a/drivers/accel/qaic/qaic.h b/drivers/accel/qaic/qaic.h index 582836f9538f..02561b6cecc6 100644 --- a/drivers/accel/qaic/qaic.h +++ b/drivers/accel/qaic/qaic.h @@ -30,6 +30,7 @@ #define to_qaic_drm_device(dev) container_of(dev, struct qaic_drm_device, drm) #define to_drm(qddev) (&(qddev)->drm) #define to_accel_kdev(qddev) (to_drm(qddev)->accel->kdev) /* Return Linux device of accel node */ +#define to_qaic_device(dev) (to_qaic_drm_device((dev))->qdev) enum __packed dev_states { /* Device is offline or will be very soon */ @@ -152,6 +153,14 @@ struct qaic_device { struct mhi_device *qts_ch; /* Work queue for tasks related to MHI "QAIC_TIMESYNC" channel */ struct workqueue_struct *qts_wq; + /* Head of list of page allocated by MHI bootlog device */ + struct list_head bootlog; + /* MHI bootlog channel device */ + struct mhi_device *bootlog_ch; + /* Work queue for tasks related to MHI bootlog device */ + struct workqueue_struct *bootlog_wq; + /* Synchronizes access of pages in MHI bootlog device */ + struct mutex bootlog_mutex; }; struct qaic_drm_device { @@ -191,8 +200,6 @@ struct qaic_bo { u32 nr_slice; /* Number of slice that have been transferred by DMA engine */ u32 nr_slice_xfer_done; - /* true = BO is queued for execution, true = BO is not queued */ - bool queued; /* * If true then user has attached slicing information to this BO by * calling DRM_IOCTL_QAIC_ATTACH_SLICE_BO ioctl. @@ -281,6 +288,7 @@ int disable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr); void enable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr); void wakeup_dbc(struct qaic_device *qdev, u32 dbc_id); void release_dbc(struct qaic_device *qdev, u32 dbc_id); +void qaic_data_get_fifo_info(struct dma_bridge_chan *dbc, u32 *head, u32 *tail); void wake_all_cntl(struct qaic_device *qdev); void qaic_dev_reset_clean_local_state(struct qaic_device *qdev); diff --git a/drivers/accel/qaic/qaic_control.c b/drivers/accel/qaic/qaic_control.c index 9e8a8cbadf6b..d8bdab69f800 100644 --- a/drivers/accel/qaic/qaic_control.c +++ b/drivers/accel/qaic/qaic_control.c @@ -496,7 +496,7 @@ static int encode_addr_size_pairs(struct dma_xfer *xfer, struct wrapper_list *wr nents = sgt->nents; nents_dma = nents; *size = QAIC_MANAGE_EXT_MSG_LENGTH - msg_hdr_len - sizeof(**out_trans); - for_each_sgtable_sg(sgt, sg, i) { + for_each_sgtable_dma_sg(sgt, sg, i) { *size -= sizeof(*asp); /* Save 1K for possible follow-up transactions. 
*/ if (*size < SZ_1K) { diff --git a/drivers/accel/qaic/qaic_data.c b/drivers/accel/qaic/qaic_data.c index 03c9a793da35..43aba57b48f0 100644 --- a/drivers/accel/qaic/qaic_data.c +++ b/drivers/accel/qaic/qaic_data.c @@ -141,6 +141,11 @@ struct dbc_rsp { __le16 status; } __packed; +static inline bool bo_queued(struct qaic_bo *bo) +{ + return !list_empty(&bo->xfer_list); +} + inline int get_dbc_req_elem_size(void) { return sizeof(struct dbc_req); @@ -167,9 +172,10 @@ static void free_slice(struct kref *kref) static int clone_range_of_sgt_for_slice(struct qaic_device *qdev, struct sg_table **sgt_out, struct sg_table *sgt_in, u64 size, u64 offset) { - int total_len, len, nents, offf = 0, offl = 0; struct scatterlist *sg, *sgn, *sgf, *sgl; + unsigned int len, nents, offf, offl; struct sg_table *sgt; + size_t total_len; int ret, j; /* find out number of relevant nents needed for this mem */ @@ -177,9 +183,11 @@ static int clone_range_of_sgt_for_slice(struct qaic_device *qdev, struct sg_tabl sgf = NULL; sgl = NULL; nents = 0; + offf = 0; + offl = 0; size = size ? size : PAGE_SIZE; - for (sg = sgt_in->sgl; sg; sg = sg_next(sg)) { + for_each_sgtable_dma_sg(sgt_in, sg, j) { len = sg_dma_len(sg); if (!len) @@ -216,7 +224,7 @@ static int clone_range_of_sgt_for_slice(struct qaic_device *qdev, struct sg_tabl /* copy relevant sg node and fix page and length */ sgn = sgf; - for_each_sgtable_sg(sgt, sg, j) { + for_each_sgtable_dma_sg(sgt, sg, j) { memcpy(sg, sgn, sizeof(*sg)); if (sgn == sgf) { sg_dma_address(sg) += offf; @@ -296,7 +304,7 @@ static int encode_reqs(struct qaic_device *qdev, struct bo_slice *slice, * fence. */ dev_addr = req->dev_addr; - for_each_sgtable_sg(slice->sgt, sg, i) { + for_each_sgtable_dma_sg(slice->sgt, sg, i) { slice->reqs[i].cmd = cmd; slice->reqs[i].src_addr = cpu_to_le64(slice->dir == DMA_TO_DEVICE ? 
sg_dma_address(sg) : dev_addr); @@ -549,6 +557,7 @@ static bool invalid_sem(struct qaic_sem *sem) static int qaic_validate_req(struct qaic_device *qdev, struct qaic_attach_slice_entry *slice_ent, u32 count, u64 total_size) { + u64 total; int i; for (i = 0; i < count; i++) { @@ -558,7 +567,8 @@ static int qaic_validate_req(struct qaic_device *qdev, struct qaic_attach_slice_ invalid_sem(&slice_ent[i].sem2) || invalid_sem(&slice_ent[i].sem3)) return -EINVAL; - if (slice_ent[i].offset + slice_ent[i].size > total_size) + if (check_add_overflow(slice_ent[i].offset, slice_ent[i].size, &total) || + total > total_size) return -EINVAL; } @@ -569,6 +579,9 @@ static void qaic_free_sgt(struct sg_table *sgt) { struct scatterlist *sg; + if (!sgt) + return; + for (sg = sgt->sgl; sg; sg = sg_next(sg)) if (sg_page(sg)) __free_pages(sg_page(sg), get_order(sg->length)); @@ -648,6 +661,7 @@ static void qaic_init_bo(struct qaic_bo *bo, bool reinit) } complete_all(&bo->xfer_done); INIT_LIST_HEAD(&bo->slices); + INIT_LIST_HEAD(&bo->xfer_list); } static struct qaic_bo *qaic_alloc_init_bo(void) @@ -709,9 +723,13 @@ int qaic_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *fi if (ret) goto free_bo; + ret = drm_gem_create_mmap_offset(obj); + if (ret) + goto free_bo; + ret = drm_gem_handle_create(file_priv, obj, &args->handle); if (ret) - goto free_sgt; + goto free_bo; bo->handle = args->handle; drm_gem_object_put(obj); @@ -720,10 +738,8 @@ int qaic_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *fi return 0; -free_sgt: - qaic_free_sgt(bo->sgt); free_bo: - kfree(bo); + drm_gem_object_put(obj); unlock_dev_srcu: srcu_read_unlock(&qdev->dev_lock, qdev_rcu_id); unlock_usr_srcu: @@ -738,7 +754,7 @@ int qaic_mmap_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file struct drm_gem_object *obj; struct qaic_device *qdev; struct qaic_user *usr; - int ret; + int ret = 0; usr = file_priv->driver_priv; usr_rcu_id = srcu_read_lock(&usr->qddev_lock); @@ -760,9 +776,7 @@ int qaic_mmap_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file goto unlock_dev_srcu; } - ret = drm_gem_create_mmap_offset(obj); - if (ret == 0) - args->offset = drm_vma_node_offset_addr(&obj->vma_node); + args->offset = drm_vma_node_offset_addr(&obj->vma_node); drm_gem_object_put(obj); @@ -828,9 +842,6 @@ static int qaic_prepare_import_bo(struct qaic_bo *bo, struct qaic_attach_slice_h struct sg_table *sgt; int ret; - if (obj->import_attach->dmabuf->size < hdr->size) - return -EINVAL; - sgt = dma_buf_map_attachment(obj->import_attach, hdr->dir); if (IS_ERR(sgt)) { ret = PTR_ERR(sgt); @@ -847,9 +858,6 @@ static int qaic_prepare_export_bo(struct qaic_device *qdev, struct qaic_bo *bo, { int ret; - if (bo->base.size < hdr->size) - return -EINVAL; - ret = dma_map_sgtable(&qdev->pdev->dev, bo->sgt, hdr->dir, 0); if (ret) return -EFAULT; @@ -950,9 +958,6 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi if (arg_size / args->hdr.count != sizeof(*slice_ent)) return -EINVAL; - if (args->hdr.size == 0) - return -EINVAL; - if (!(args->hdr.dir == DMA_TO_DEVICE || args->hdr.dir == DMA_FROM_DEVICE)) return -EINVAL; @@ -992,16 +997,16 @@ int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi goto free_slice_ent; } - ret = qaic_validate_req(qdev, slice_ent, args->hdr.count, args->hdr.size); - if (ret) - goto free_slice_ent; - obj = drm_gem_object_lookup(file_priv, args->hdr.handle); if (!obj) { ret = -ENOENT; goto free_slice_ent; } + ret = 
qaic_validate_req(qdev, slice_ent, args->hdr.count, obj->size); + if (ret) + goto put_bo; + bo = to_qaic_bo(obj); ret = mutex_lock_interruptible(&bo->lock); if (ret) @@ -1173,7 +1178,6 @@ static int send_bo_list_to_device(struct qaic_device *qdev, struct drm_file *fil struct bo_slice *slice; unsigned long flags; struct qaic_bo *bo; - bool queued; int i, j; int ret; @@ -1205,9 +1209,7 @@ static int send_bo_list_to_device(struct qaic_device *qdev, struct drm_file *fil } spin_lock_irqsave(&dbc->xfer_lock, flags); - queued = bo->queued; - bo->queued = true; - if (queued) { + if (bo_queued(bo)) { spin_unlock_irqrestore(&dbc->xfer_lock, flags); ret = -EINVAL; goto unlock_bo; @@ -1230,7 +1232,6 @@ static int send_bo_list_to_device(struct qaic_device *qdev, struct drm_file *fil else ret = copy_exec_reqs(qdev, slice, dbc->id, head, tail); if (ret) { - bo->queued = false; spin_unlock_irqrestore(&dbc->xfer_lock, flags); goto unlock_bo; } @@ -1253,8 +1254,7 @@ failed_to_send_bo: spin_lock_irqsave(&dbc->xfer_lock, flags); bo = list_last_entry(&dbc->xfer_list, struct qaic_bo, xfer_list); obj = &bo->base; - bo->queued = false; - list_del(&bo->xfer_list); + list_del_init(&bo->xfer_list); spin_unlock_irqrestore(&dbc->xfer_lock, flags); dma_sync_sgtable_for_cpu(&qdev->pdev->dev, bo->sgt, bo->dir); drm_gem_object_put(obj); @@ -1615,8 +1615,7 @@ read_fifo: */ dma_sync_sgtable_for_cpu(&qdev->pdev->dev, bo->sgt, bo->dir); bo->nr_slice_xfer_done = 0; - bo->queued = false; - list_del(&bo->xfer_list); + list_del_init(&bo->xfer_list); bo->perf_stats.req_processed_ts = ktime_get_ns(); complete_all(&bo->xfer_done); drm_gem_object_put(&bo->base); @@ -1875,7 +1874,7 @@ int qaic_detach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_fi /* Check if BO is committed to H/W for DMA */ spin_lock_irqsave(&dbc->xfer_lock, flags); - if (bo->queued) { + if (bo_queued(bo)) { spin_unlock_irqrestore(&dbc->xfer_lock, flags); ret = -EBUSY; goto unlock_ch_srcu; @@ -1905,8 +1904,7 @@ static void empty_xfer_list(struct qaic_device *qdev, struct dma_bridge_chan *db spin_lock_irqsave(&dbc->xfer_lock, flags); while (!list_empty(&dbc->xfer_list)) { bo = list_first_entry(&dbc->xfer_list, typeof(*bo), xfer_list); - bo->queued = false; - list_del(&bo->xfer_list); + list_del_init(&bo->xfer_list); spin_unlock_irqrestore(&dbc->xfer_lock, flags); bo->nr_slice_xfer_done = 0; bo->req_id = 0; @@ -1988,3 +1986,12 @@ void release_dbc(struct qaic_device *qdev, u32 dbc_id) dbc->in_use = false; wake_up(&dbc->dbc_release); } + +void qaic_data_get_fifo_info(struct dma_bridge_chan *dbc, u32 *head, u32 *tail) +{ + if (!dbc || !head || !tail) + return; + + *head = readl(dbc->dbc_base + REQHP_OFF); + *tail = readl(dbc->dbc_base + REQTP_OFF); +} diff --git a/drivers/accel/qaic/qaic_debugfs.c b/drivers/accel/qaic/qaic_debugfs.c new file mode 100644 index 000000000000..ba0cf2f94732 --- /dev/null +++ b/drivers/accel/qaic/qaic_debugfs.c @@ -0,0 +1,305 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Copyright (c) 2020, The Linux Foundation. All rights reserved. */ +/* Copyright (c) 2021-2024 Qualcomm Innovation Center, Inc. All rights reserved. 
*/ + +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/fs.h> +#include <linux/list.h> +#include <linux/mhi.h> +#include <linux/mutex.h> +#include <linux/overflow.h> +#include <linux/pci.h> +#include <linux/seq_file.h> +#include <linux/sprintf.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> + +#include "qaic.h" +#include "qaic_debugfs.h" + +#define BOOTLOG_POOL_SIZE 16 +#define BOOTLOG_MSG_SIZE 512 +#define QAIC_DBC_DIR_NAME 9 + +struct bootlog_msg { + /* Buffer for bootlog messages */ + char str[BOOTLOG_MSG_SIZE]; + /* Root struct of device, used to access device resources */ + struct qaic_device *qdev; + /* Work struct to schedule work coming on QAIC_LOGGING channel */ + struct work_struct work; +}; + +struct bootlog_page { + /* Node in list of bootlog pages maintained by root device struct */ + struct list_head node; + /* Total size of the buffer that holds the bootlogs. It is PAGE_SIZE */ + unsigned int size; + /* Offset for the next bootlog */ + unsigned int offset; +}; + +static int bootlog_show(struct seq_file *s, void *unused) +{ + struct bootlog_page *page; + struct qaic_device *qdev; + void *page_end; + void *log; + + qdev = s->private; + mutex_lock(&qdev->bootlog_mutex); + list_for_each_entry(page, &qdev->bootlog, node) { + log = page + 1; + page_end = (void *)page + page->offset; + while (log < page_end) { + seq_printf(s, "%s", (char *)log); + log += strlen(log) + 1; + } + } + mutex_unlock(&qdev->bootlog_mutex); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(bootlog); + +static int fifo_size_show(struct seq_file *s, void *unused) +{ + struct dma_bridge_chan *dbc = s->private; + + seq_printf(s, "%u\n", dbc->nelem); + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(fifo_size); + +static int queued_show(struct seq_file *s, void *unused) +{ + struct dma_bridge_chan *dbc = s->private; + u32 tail = 0, head = 0; + + qaic_data_get_fifo_info(dbc, &head, &tail); + + if (head == U32_MAX || tail == U32_MAX) + seq_printf(s, "%u\n", 0); + else if (head > tail) + seq_printf(s, "%u\n", dbc->nelem - head + tail); + else + seq_printf(s, "%u\n", tail - head); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(queued); + +void qaic_debugfs_init(struct qaic_drm_device *qddev) +{ + struct qaic_device *qdev = qddev->qdev; + struct dentry *debugfs_root; + struct dentry *debugfs_dir; + char name[QAIC_DBC_DIR_NAME]; + u32 i; + + debugfs_root = to_drm(qddev)->debugfs_root; + + debugfs_create_file("bootlog", 0400, debugfs_root, qdev, &bootlog_fops); + /* + * 256 dbcs per device is likely the max we will ever see and lets static checking see a + * reasonable range. 
+ */ + for (i = 0; i < qdev->num_dbc && i < 256; ++i) { + snprintf(name, QAIC_DBC_DIR_NAME, "dbc%03u", i); + debugfs_dir = debugfs_create_dir(name, debugfs_root); + debugfs_create_file("fifo_size", 0400, debugfs_dir, &qdev->dbc[i], &fifo_size_fops); + debugfs_create_file("queued", 0400, debugfs_dir, &qdev->dbc[i], &queued_fops); + } +} + +static struct bootlog_page *alloc_bootlog_page(struct qaic_device *qdev) +{ + struct bootlog_page *page; + + page = (struct bootlog_page *)devm_get_free_pages(&qdev->pdev->dev, GFP_KERNEL, 0); + if (!page) + return page; + + page->size = PAGE_SIZE; + page->offset = sizeof(*page); + list_add_tail(&page->node, &qdev->bootlog); + + return page; +} + +static int reset_bootlog(struct qaic_device *qdev) +{ + struct bootlog_page *page; + struct bootlog_page *i; + + mutex_lock(&qdev->bootlog_mutex); + list_for_each_entry_safe(page, i, &qdev->bootlog, node) { + list_del(&page->node); + devm_free_pages(&qdev->pdev->dev, (unsigned long)page); + } + + page = alloc_bootlog_page(qdev); + mutex_unlock(&qdev->bootlog_mutex); + if (!page) + return -ENOMEM; + + return 0; +} + +static void *bootlog_get_space(struct qaic_device *qdev, unsigned int size) +{ + struct bootlog_page *page; + + page = list_last_entry(&qdev->bootlog, struct bootlog_page, node); + + if (size_add(size, sizeof(*page)) > page->size) + return NULL; + + if (page->offset + size > page->size) { + page = alloc_bootlog_page(qdev); + if (!page) + return NULL; + } + + return (void *)page + page->offset; +} + +static void bootlog_commit(struct qaic_device *qdev, unsigned int size) +{ + struct bootlog_page *page; + + page = list_last_entry(&qdev->bootlog, struct bootlog_page, node); + + page->offset += size; +} + +static void bootlog_log(struct work_struct *work) +{ + struct bootlog_msg *msg = container_of(work, struct bootlog_msg, work); + unsigned int len = strlen(msg->str) + 1; + struct qaic_device *qdev = msg->qdev; + void *log; + + mutex_lock(&qdev->bootlog_mutex); + log = bootlog_get_space(qdev, len); + if (log) { + memcpy(log, msg, len); + bootlog_commit(qdev, len); + } + mutex_unlock(&qdev->bootlog_mutex); + + if (mhi_queue_buf(qdev->bootlog_ch, DMA_FROM_DEVICE, msg, BOOTLOG_MSG_SIZE, MHI_EOT)) + devm_kfree(&qdev->pdev->dev, msg); +} + +static int qaic_bootlog_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) +{ + struct qaic_device *qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev)); + struct bootlog_msg *msg; + int i, ret; + + qdev->bootlog_wq = alloc_ordered_workqueue("qaic_bootlog", 0); + if (!qdev->bootlog_wq) { + ret = -ENOMEM; + goto out; + } + + ret = reset_bootlog(qdev); + if (ret) + goto destroy_workqueue; + + ret = mhi_prepare_for_transfer(mhi_dev); + if (ret) + goto destroy_workqueue; + + for (i = 0; i < BOOTLOG_POOL_SIZE; i++) { + msg = devm_kzalloc(&qdev->pdev->dev, sizeof(*msg), GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto mhi_unprepare; + } + + msg->qdev = qdev; + INIT_WORK(&msg->work, bootlog_log); + + ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, msg, BOOTLOG_MSG_SIZE, MHI_EOT); + if (ret) + goto mhi_unprepare; + } + + dev_set_drvdata(&mhi_dev->dev, qdev); + qdev->bootlog_ch = mhi_dev; + return 0; + +mhi_unprepare: + mhi_unprepare_from_transfer(mhi_dev); +destroy_workqueue: + flush_workqueue(qdev->bootlog_wq); + destroy_workqueue(qdev->bootlog_wq); +out: + return ret; +} + +static void qaic_bootlog_mhi_remove(struct mhi_device *mhi_dev) +{ + struct qaic_device *qdev; + + qdev = dev_get_drvdata(&mhi_dev->dev); + + 
mhi_unprepare_from_transfer(qdev->bootlog_ch); + flush_workqueue(qdev->bootlog_wq); + destroy_workqueue(qdev->bootlog_wq); + qdev->bootlog_ch = NULL; +} + +static void qaic_bootlog_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) +{ +} + +static void qaic_bootlog_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) +{ + struct qaic_device *qdev = dev_get_drvdata(&mhi_dev->dev); + struct bootlog_msg *msg = mhi_result->buf_addr; + + if (mhi_result->transaction_status) { + devm_kfree(&qdev->pdev->dev, msg); + return; + } + + /* Force a null at the end of the transferred string */ + msg->str[mhi_result->bytes_xferd - 1] = 0; + + queue_work(qdev->bootlog_wq, &msg->work); +} + +static const struct mhi_device_id qaic_bootlog_mhi_match_table[] = { + { .chan = "QAIC_LOGGING", }, + {}, +}; + +static struct mhi_driver qaic_bootlog_mhi_driver = { + .id_table = qaic_bootlog_mhi_match_table, + .remove = qaic_bootlog_mhi_remove, + .probe = qaic_bootlog_mhi_probe, + .ul_xfer_cb = qaic_bootlog_mhi_ul_xfer_cb, + .dl_xfer_cb = qaic_bootlog_mhi_dl_xfer_cb, + .driver = { + .name = "qaic_bootlog", + }, +}; + +int qaic_bootlog_register(void) +{ + return mhi_driver_register(&qaic_bootlog_mhi_driver); +} + +void qaic_bootlog_unregister(void) +{ + mhi_driver_unregister(&qaic_bootlog_mhi_driver); +} diff --git a/drivers/accel/qaic/qaic_debugfs.h b/drivers/accel/qaic/qaic_debugfs.h new file mode 100644 index 000000000000..05e74f84cf9f --- /dev/null +++ b/drivers/accel/qaic/qaic_debugfs.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* Copyright (c) 2020, The Linux Foundation. All rights reserved. */ +/* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. */ + +#ifndef __QAIC_DEBUGFS_H__ +#define __QAIC_DEBUGFS_H__ + +#include <drm/drm_file.h> + +#ifdef CONFIG_DEBUG_FS +int qaic_bootlog_register(void); +void qaic_bootlog_unregister(void); +void qaic_debugfs_init(struct qaic_drm_device *qddev); +#else +static inline int qaic_bootlog_register(void) { return 0; } +static inline void qaic_bootlog_unregister(void) {} +static inline void qaic_debugfs_init(struct qaic_drm_device *qddev) {} +#endif /* CONFIG_DEBUG_FS */ +#endif /* __QAIC_DEBUGFS_H__ */ diff --git a/drivers/accel/qaic/qaic_drv.c b/drivers/accel/qaic/qaic_drv.c index 2a313eb69b12..81819b9ef8d4 100644 --- a/drivers/accel/qaic/qaic_drv.c +++ b/drivers/accel/qaic/qaic_drv.c @@ -28,10 +28,13 @@ #include "mhi_controller.h" #include "qaic.h" +#include "qaic_debugfs.h" #include "qaic_timesync.h" +#include "sahara.h" -MODULE_IMPORT_NS(DMA_BUF); +MODULE_IMPORT_NS("DMA_BUF"); +#define PCI_DEV_AIC080 0xa080 #define PCI_DEV_AIC100 0xa100 #define QAIC_NAME "qaic" #define QAIC_DESC "Qualcomm Cloud AI Accelerators" @@ -44,6 +47,53 @@ MODULE_PARM_DESC(datapath_polling, "Operate the datapath in polling mode"); static bool link_up; static DEFINE_IDA(qaic_usrs); +static void qaicm_wq_release(struct drm_device *dev, void *res) +{ + struct workqueue_struct *wq = res; + + destroy_workqueue(wq); +} + +static struct workqueue_struct *qaicm_wq_init(struct drm_device *dev, const char *name) +{ + struct workqueue_struct *wq; + int ret; + + wq = alloc_workqueue("%s", WQ_UNBOUND, 0, name); + if (!wq) + return ERR_PTR(-ENOMEM); + ret = drmm_add_action_or_reset(dev, qaicm_wq_release, wq); + if (ret) + return ERR_PTR(ret); + + return wq; +} + +static void qaicm_srcu_release(struct drm_device *dev, void *res) +{ + struct srcu_struct *lock = res; + + cleanup_srcu_struct(lock); +} + +static int 
qaicm_srcu_init(struct drm_device *dev, struct srcu_struct *lock) +{ + int ret; + + ret = init_srcu_struct(lock); + if (ret) + return ret; + + return drmm_add_action_or_reset(dev, qaicm_srcu_release, lock); +} + +static void qaicm_pci_release(struct drm_device *dev, void *res) +{ + struct qaic_device *qdev = to_qaic_device(dev); + + pci_set_drvdata(qdev->pdev, NULL); +} + static void free_usr(struct kref *kref) { struct qaic_user *usr = container_of(kref, struct qaic_user, ref_count); @@ -158,7 +208,6 @@ static const struct drm_driver qaic_accel_driver = { .name = QAIC_NAME, .desc = QAIC_DESC, - .date = "20190618", .fops = &qaic_accel_fops, .open = qaic_open, @@ -182,8 +231,12 @@ static int qaic_create_drm_device(struct qaic_device *qdev, s32 partition_id) qddev->partition_id = partition_id; ret = drm_dev_register(drm, 0); - if (ret) + if (ret) { pci_dbg(qdev->pdev, "drm_dev_register failed %d\n", ret); + return ret; + } + + qaic_debugfs_init(qddev); return ret; } @@ -299,74 +352,77 @@ void qaic_dev_reset_clean_local_state(struct qaic_device *qdev) release_dbc(qdev, i); } -static void cleanup_qdev(struct qaic_device *qdev) -{ - int i; - - for (i = 0; i < qdev->num_dbc; ++i) - cleanup_srcu_struct(&qdev->dbc[i].ch_lock); - cleanup_srcu_struct(&qdev->dev_lock); - pci_set_drvdata(qdev->pdev, NULL); - destroy_workqueue(qdev->cntl_wq); - destroy_workqueue(qdev->qts_wq); -} - static struct qaic_device *create_qdev(struct pci_dev *pdev, const struct pci_device_id *id) { + struct device *dev = &pdev->dev; struct qaic_drm_device *qddev; struct qaic_device *qdev; - int i; + struct drm_device *drm; + int i, ret; - qdev = devm_kzalloc(&pdev->dev, sizeof(*qdev), GFP_KERNEL); + qdev = devm_kzalloc(dev, sizeof(*qdev), GFP_KERNEL); if (!qdev) return NULL; qdev->dev_state = QAIC_OFFLINE; - if (id->device == PCI_DEV_AIC100) { + if (id->device == PCI_DEV_AIC080 || id->device == PCI_DEV_AIC100) { qdev->num_dbc = 16; - qdev->dbc = devm_kcalloc(&pdev->dev, qdev->num_dbc, sizeof(*qdev->dbc), GFP_KERNEL); + qdev->dbc = devm_kcalloc(dev, qdev->num_dbc, sizeof(*qdev->dbc), GFP_KERNEL); if (!qdev->dbc) return NULL; } - qdev->cntl_wq = alloc_workqueue("qaic_cntl", WQ_UNBOUND, 0); - if (!qdev->cntl_wq) + qddev = devm_drm_dev_alloc(&pdev->dev, &qaic_accel_driver, struct qaic_drm_device, drm); + if (IS_ERR(qddev)) return NULL; - qdev->qts_wq = alloc_workqueue("qaic_ts", WQ_UNBOUND, 0); - if (!qdev->qts_wq) { - destroy_workqueue(qdev->cntl_wq); + drm = to_drm(qddev); + pci_set_drvdata(pdev, qdev); + + ret = drmm_mutex_init(drm, &qddev->users_mutex); + if (ret) + return NULL; + ret = drmm_add_action_or_reset(drm, qaicm_pci_release, NULL); + if (ret) + return NULL; + ret = drmm_mutex_init(drm, &qdev->cntl_mutex); + if (ret) + return NULL; + ret = drmm_mutex_init(drm, &qdev->bootlog_mutex); + if (ret) return NULL; - } - pci_set_drvdata(pdev, qdev); + qdev->cntl_wq = qaicm_wq_init(drm, "qaic_cntl"); + if (IS_ERR(qdev->cntl_wq)) + return NULL; + qdev->qts_wq = qaicm_wq_init(drm, "qaic_ts"); + if (IS_ERR(qdev->qts_wq)) + return NULL; + + ret = qaicm_srcu_init(drm, &qdev->dev_lock); + if (ret) + return NULL; + + qdev->qddev = qddev; qdev->pdev = pdev; + qddev->qdev = qdev; - mutex_init(&qdev->cntl_mutex); INIT_LIST_HEAD(&qdev->cntl_xfer_list); - init_srcu_struct(&qdev->dev_lock); + INIT_LIST_HEAD(&qdev->bootlog); + INIT_LIST_HEAD(&qddev->users); for (i = 0; i < qdev->num_dbc; ++i) { spin_lock_init(&qdev->dbc[i].xfer_lock); qdev->dbc[i].qdev = qdev; qdev->dbc[i].id = i; INIT_LIST_HEAD(&qdev->dbc[i].xfer_list); - 
init_srcu_struct(&qdev->dbc[i].ch_lock); + ret = qaicm_srcu_init(drm, &qdev->dbc[i].ch_lock); + if (ret) + return NULL; init_waitqueue_head(&qdev->dbc[i].dbc_release); INIT_LIST_HEAD(&qdev->dbc[i].bo_lists); } - qddev = devm_drm_dev_alloc(&pdev->dev, &qaic_accel_driver, struct qaic_drm_device, drm); - if (IS_ERR(qddev)) { - cleanup_qdev(qdev); - return NULL; - } - - drmm_mutex_init(to_drm(qddev), &qddev->users_mutex); - INIT_LIST_HEAD(&qddev->users); - qddev->qdev = qdev; - qdev->qddev = qddev; - return qdev; } @@ -391,9 +447,7 @@ static int init_pci(struct qaic_device *qdev, struct pci_dev *pdev) ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (ret) return ret; - ret = dma_set_max_seg_size(&pdev->dev, UINT_MAX); - if (ret) - return ret; + dma_set_max_seg_size(&pdev->dev, UINT_MAX); qdev->bar_0 = devm_ioremap_resource(&pdev->dev, &pdev->resource[0]); if (IS_ERR(qdev->bar_0)) @@ -472,35 +526,28 @@ static int qaic_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = init_pci(qdev, pdev); if (ret) - goto cleanup_qdev; + return ret; for (i = 0; i < qdev->num_dbc; ++i) qdev->dbc[i].dbc_base = qdev->bar_2 + QAIC_DBC_OFF(i); mhi_irq = init_msi(qdev, pdev); - if (mhi_irq < 0) { - ret = mhi_irq; - goto cleanup_qdev; - } + if (mhi_irq < 0) + return mhi_irq; ret = qaic_create_drm_device(qdev, QAIC_NO_PARTITION); if (ret) - goto cleanup_qdev; + return ret; qdev->mhi_cntrl = qaic_mhi_register_controller(pdev, qdev->bar_0, mhi_irq, qdev->single_msi); if (IS_ERR(qdev->mhi_cntrl)) { ret = PTR_ERR(qdev->mhi_cntrl); - goto cleanup_drm_dev; + qaic_destroy_drm_device(qdev, QAIC_NO_PARTITION); + return ret; } return 0; - -cleanup_drm_dev: - qaic_destroy_drm_device(qdev, QAIC_NO_PARTITION); -cleanup_qdev: - cleanup_qdev(qdev); - return ret; } static void qaic_pci_remove(struct pci_dev *pdev) @@ -511,9 +558,8 @@ static void qaic_pci_remove(struct pci_dev *pdev) return; qaic_dev_reset_clean_local_state(qdev); - qaic_destroy_drm_device(qdev, QAIC_NO_PARTITION); qaic_mhi_free_controller(qdev->mhi_cntrl, link_up); - cleanup_qdev(qdev); + qaic_destroy_drm_device(qdev, QAIC_NO_PARTITION); } static void qaic_pci_shutdown(struct pci_dev *pdev) @@ -561,6 +607,7 @@ static struct mhi_driver qaic_mhi_driver = { }; static const struct pci_device_id qaic_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_QCOM, PCI_DEV_AIC080), }, { PCI_DEVICE(PCI_VENDOR_ID_QCOM, PCI_DEV_AIC100), }, { } }; @@ -597,12 +644,24 @@ static int __init qaic_init(void) goto free_pci; } + ret = sahara_register(); + if (ret) { + pr_debug("qaic: sahara_register failed %d\n", ret); + goto free_mhi; + } + ret = qaic_timesync_init(); if (ret) pr_debug("qaic: qaic_timesync_init failed %d\n", ret); + ret = qaic_bootlog_register(); + if (ret) + pr_debug("qaic: qaic_bootlog_register failed %d\n", ret); + return 0; +free_mhi: + mhi_driver_unregister(&qaic_mhi_driver); free_pci: pci_unregister_driver(&qaic_pci_driver); return ret; @@ -626,7 +685,9 @@ static void __exit qaic_exit(void) * reinitializing the link_up state after the cleanup is done. */ link_up = true; + qaic_bootlog_unregister(); qaic_timesync_deinit(); + sahara_unregister(); mhi_driver_unregister(&qaic_mhi_driver); pci_unregister_driver(&qaic_pci_driver); } diff --git a/drivers/accel/qaic/sahara.c b/drivers/accel/qaic/sahara.c new file mode 100644 index 000000000000..21d58aed0deb --- /dev/null +++ b/drivers/accel/qaic/sahara.c @@ -0,0 +1,822 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. 
*/ + +#include <linux/devcoredump.h> +#include <linux/firmware.h> +#include <linux/limits.h> +#include <linux/mhi.h> +#include <linux/minmax.h> +#include <linux/mod_devicetable.h> +#include <linux/overflow.h> +#include <linux/types.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> + +#include "sahara.h" + +#define SAHARA_HELLO_CMD 0x1 /* Min protocol version 1.0 */ +#define SAHARA_HELLO_RESP_CMD 0x2 /* Min protocol version 1.0 */ +#define SAHARA_READ_DATA_CMD 0x3 /* Min protocol version 1.0 */ +#define SAHARA_END_OF_IMAGE_CMD 0x4 /* Min protocol version 1.0 */ +#define SAHARA_DONE_CMD 0x5 /* Min protocol version 1.0 */ +#define SAHARA_DONE_RESP_CMD 0x6 /* Min protocol version 1.0 */ +#define SAHARA_RESET_CMD 0x7 /* Min protocol version 1.0 */ +#define SAHARA_RESET_RESP_CMD 0x8 /* Min protocol version 1.0 */ +#define SAHARA_MEM_DEBUG_CMD 0x9 /* Min protocol version 2.0 */ +#define SAHARA_MEM_READ_CMD 0xa /* Min protocol version 2.0 */ +#define SAHARA_CMD_READY_CMD 0xb /* Min protocol version 2.1 */ +#define SAHARA_SWITCH_MODE_CMD 0xc /* Min protocol version 2.1 */ +#define SAHARA_EXECUTE_CMD 0xd /* Min protocol version 2.1 */ +#define SAHARA_EXECUTE_RESP_CMD 0xe /* Min protocol version 2.1 */ +#define SAHARA_EXECUTE_DATA_CMD 0xf /* Min protocol version 2.1 */ +#define SAHARA_MEM_DEBUG64_CMD 0x10 /* Min protocol version 2.5 */ +#define SAHARA_MEM_READ64_CMD 0x11 /* Min protocol version 2.5 */ +#define SAHARA_READ_DATA64_CMD 0x12 /* Min protocol version 2.8 */ +#define SAHARA_RESET_STATE_CMD 0x13 /* Min protocol version 2.9 */ +#define SAHARA_WRITE_DATA_CMD 0x14 /* Min protocol version 3.0 */ + +#define SAHARA_PACKET_MAX_SIZE 0xffffU /* MHI_MAX_MTU */ +#define SAHARA_TRANSFER_MAX_SIZE 0x80000 +#define SAHARA_READ_MAX_SIZE 0xfff0U /* Avoid unaligned requests */ +#define SAHARA_NUM_TX_BUF DIV_ROUND_UP(SAHARA_TRANSFER_MAX_SIZE,\ + SAHARA_PACKET_MAX_SIZE) +#define SAHARA_IMAGE_ID_NONE U32_MAX + +#define SAHARA_VERSION 2 +#define SAHARA_SUCCESS 0 +#define SAHARA_TABLE_ENTRY_STR_LEN 20 + +#define SAHARA_MODE_IMAGE_TX_PENDING 0x0 +#define SAHARA_MODE_IMAGE_TX_COMPLETE 0x1 +#define SAHARA_MODE_MEMORY_DEBUG 0x2 +#define SAHARA_MODE_COMMAND 0x3 + +#define SAHARA_HELLO_LENGTH 0x30 +#define SAHARA_READ_DATA_LENGTH 0x14 +#define SAHARA_END_OF_IMAGE_LENGTH 0x10 +#define SAHARA_DONE_LENGTH 0x8 +#define SAHARA_RESET_LENGTH 0x8 +#define SAHARA_MEM_DEBUG64_LENGTH 0x18 +#define SAHARA_MEM_READ64_LENGTH 0x18 + +struct sahara_packet { + __le32 cmd; + __le32 length; + + union { + struct { + __le32 version; + __le32 version_compat; + __le32 max_length; + __le32 mode; + } hello; + struct { + __le32 version; + __le32 version_compat; + __le32 status; + __le32 mode; + } hello_resp; + struct { + __le32 image; + __le32 offset; + __le32 length; + } read_data; + struct { + __le32 image; + __le32 status; + } end_of_image; + struct { + __le64 table_address; + __le64 table_length; + } memory_debug64; + struct { + __le64 memory_address; + __le64 memory_length; + } memory_read64; + }; +}; + +struct sahara_debug_table_entry64 { + __le64 type; + __le64 address; + __le64 length; + char description[SAHARA_TABLE_ENTRY_STR_LEN]; + char filename[SAHARA_TABLE_ENTRY_STR_LEN]; +}; + +struct sahara_dump_table_entry { + u64 type; + u64 address; + u64 length; + char description[SAHARA_TABLE_ENTRY_STR_LEN]; + char filename[SAHARA_TABLE_ENTRY_STR_LEN]; +}; + +#define SAHARA_DUMP_V1_MAGIC 0x1234567890abcdef +#define SAHARA_DUMP_V1_VER 1 +struct sahara_memory_dump_meta_v1 { + u64 magic; + u64 version; + u64 dump_size; + u64 
table_size; +}; + +/* + * Layout of crashdump provided to user via devcoredump + * +------------------------------------------+ + * | Crashdump Meta structure | + * | type: struct sahara_memory_dump_meta_v1 | + * +------------------------------------------+ + * | Crashdump Table | + * | type: array of struct | + * | sahara_dump_table_entry | + * | | + * | | + * +------------------------------------------+ + * | Crashdump | + * | | + * | | + * | | + * | | + * | | + * +------------------------------------------+ + * + * First is the metadata header. Userspace can use the magic number to verify + * the content type, and then check the version for the rest of the format. + * New versions should keep the magic number location/value, and version + * location, but increment the version value. + * + * For v1, the metadata lists the size of the entire dump (header + table + + * dump) and the size of the table. Then the dump image table, which describes + * the contents of the dump. Finally all the images are listed in order, with + * no deadspace in between. Userspace can use the sizes listed in the image + * table to reconstruct the individual images. + */ + +struct sahara_context { + struct sahara_packet *tx[SAHARA_NUM_TX_BUF]; + struct sahara_packet *rx; + struct work_struct fw_work; + struct work_struct dump_work; + struct mhi_device *mhi_dev; + const char **image_table; + u32 table_size; + u32 active_image_id; + const struct firmware *firmware; + u64 dump_table_address; + u64 dump_table_length; + size_t rx_size; + size_t rx_size_requested; + void *mem_dump; + size_t mem_dump_sz; + struct sahara_dump_table_entry *dump_image; + u64 dump_image_offset; + void *mem_dump_freespace; + u64 dump_images_left; + bool is_mem_dump_mode; +}; + +static const char *aic100_image_table[] = { + [1] = "qcom/aic100/fw1.bin", + [2] = "qcom/aic100/fw2.bin", + [4] = "qcom/aic100/fw4.bin", + [5] = "qcom/aic100/fw5.bin", + [6] = "qcom/aic100/fw6.bin", + [8] = "qcom/aic100/fw8.bin", + [9] = "qcom/aic100/fw9.bin", + [10] = "qcom/aic100/fw10.bin", +}; + +static int sahara_find_image(struct sahara_context *context, u32 image_id) +{ + int ret; + + if (image_id == context->active_image_id) + return 0; + + if (context->active_image_id != SAHARA_IMAGE_ID_NONE) { + dev_err(&context->mhi_dev->dev, "image id %d is not valid as %d is active\n", + image_id, context->active_image_id); + return -EINVAL; + } + + if (image_id >= context->table_size || !context->image_table[image_id]) { + dev_err(&context->mhi_dev->dev, "request for unknown image: %d\n", image_id); + return -EINVAL; + } + + /* + * This image might be optional. The device may continue without it. + * Only the device knows. Suppress error messages that could suggest an + * a problem when we were actually able to continue. 
+ */ + ret = firmware_request_nowarn(&context->firmware, + context->image_table[image_id], + &context->mhi_dev->dev); + if (ret) { + dev_dbg(&context->mhi_dev->dev, "request for image id %d / file %s failed %d\n", + image_id, context->image_table[image_id], ret); + return ret; + } + + context->active_image_id = image_id; + + return 0; +} + +static void sahara_release_image(struct sahara_context *context) +{ + if (context->active_image_id != SAHARA_IMAGE_ID_NONE) + release_firmware(context->firmware); + context->active_image_id = SAHARA_IMAGE_ID_NONE; +} + +static void sahara_send_reset(struct sahara_context *context) +{ + int ret; + + context->is_mem_dump_mode = false; + + context->tx[0]->cmd = cpu_to_le32(SAHARA_RESET_CMD); + context->tx[0]->length = cpu_to_le32(SAHARA_RESET_LENGTH); + + ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, context->tx[0], + SAHARA_RESET_LENGTH, MHI_EOT); + if (ret) + dev_err(&context->mhi_dev->dev, "Unable to send reset response %d\n", ret); +} + +static void sahara_hello(struct sahara_context *context) +{ + int ret; + + dev_dbg(&context->mhi_dev->dev, + "HELLO cmd received. length:%d version:%d version_compat:%d max_length:%d mode:%d\n", + le32_to_cpu(context->rx->length), + le32_to_cpu(context->rx->hello.version), + le32_to_cpu(context->rx->hello.version_compat), + le32_to_cpu(context->rx->hello.max_length), + le32_to_cpu(context->rx->hello.mode)); + + if (le32_to_cpu(context->rx->length) != SAHARA_HELLO_LENGTH) { + dev_err(&context->mhi_dev->dev, "Malformed hello packet - length %d\n", + le32_to_cpu(context->rx->length)); + return; + } + if (le32_to_cpu(context->rx->hello.version) != SAHARA_VERSION) { + dev_err(&context->mhi_dev->dev, "Unsupported hello packet - version %d\n", + le32_to_cpu(context->rx->hello.version)); + return; + } + + if (le32_to_cpu(context->rx->hello.mode) != SAHARA_MODE_IMAGE_TX_PENDING && + le32_to_cpu(context->rx->hello.mode) != SAHARA_MODE_IMAGE_TX_COMPLETE && + le32_to_cpu(context->rx->hello.mode) != SAHARA_MODE_MEMORY_DEBUG) { + dev_err(&context->mhi_dev->dev, "Unsupported hello packet - mode %d\n", + le32_to_cpu(context->rx->hello.mode)); + return; + } + + context->tx[0]->cmd = cpu_to_le32(SAHARA_HELLO_RESP_CMD); + context->tx[0]->length = cpu_to_le32(SAHARA_HELLO_LENGTH); + context->tx[0]->hello_resp.version = cpu_to_le32(SAHARA_VERSION); + context->tx[0]->hello_resp.version_compat = cpu_to_le32(SAHARA_VERSION); + context->tx[0]->hello_resp.status = cpu_to_le32(SAHARA_SUCCESS); + context->tx[0]->hello_resp.mode = context->rx->hello_resp.mode; + + ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, context->tx[0], + SAHARA_HELLO_LENGTH, MHI_EOT); + if (ret) + dev_err(&context->mhi_dev->dev, "Unable to send hello response %d\n", ret); +} + +static void sahara_read_data(struct sahara_context *context) +{ + u32 image_id, data_offset, data_len, pkt_data_len; + int ret; + int i; + + dev_dbg(&context->mhi_dev->dev, + "READ_DATA cmd received. 
length:%d image:%d offset:%d data_length:%d\n", + le32_to_cpu(context->rx->length), + le32_to_cpu(context->rx->read_data.image), + le32_to_cpu(context->rx->read_data.offset), + le32_to_cpu(context->rx->read_data.length)); + + if (le32_to_cpu(context->rx->length) != SAHARA_READ_DATA_LENGTH) { + dev_err(&context->mhi_dev->dev, "Malformed read_data packet - length %d\n", + le32_to_cpu(context->rx->length)); + return; + } + + image_id = le32_to_cpu(context->rx->read_data.image); + data_offset = le32_to_cpu(context->rx->read_data.offset); + data_len = le32_to_cpu(context->rx->read_data.length); + + ret = sahara_find_image(context, image_id); + if (ret) { + sahara_send_reset(context); + return; + } + + /* + * Image is released when the device is done with it via + * SAHARA_END_OF_IMAGE_CMD. sahara_send_reset() will either cause the + * device to retry the operation with a modification, or decide to be + * done with the image and trigger SAHARA_END_OF_IMAGE_CMD. + * release_image() is called from SAHARA_END_OF_IMAGE_CMD. processing + * and is not needed here on error. + */ + + if (data_len > SAHARA_TRANSFER_MAX_SIZE) { + dev_err(&context->mhi_dev->dev, "Malformed read_data packet - data len %d exceeds max xfer size %d\n", + data_len, SAHARA_TRANSFER_MAX_SIZE); + sahara_send_reset(context); + return; + } + + if (data_offset >= context->firmware->size) { + dev_err(&context->mhi_dev->dev, "Malformed read_data packet - data offset %d exceeds file size %zu\n", + data_offset, context->firmware->size); + sahara_send_reset(context); + return; + } + + if (size_add(data_offset, data_len) > context->firmware->size) { + dev_err(&context->mhi_dev->dev, "Malformed read_data packet - data offset %d and length %d exceeds file size %zu\n", + data_offset, data_len, context->firmware->size); + sahara_send_reset(context); + return; + } + + for (i = 0; i < SAHARA_NUM_TX_BUF && data_len; ++i) { + pkt_data_len = min(data_len, SAHARA_PACKET_MAX_SIZE); + + memcpy(context->tx[i], &context->firmware->data[data_offset], pkt_data_len); + + data_offset += pkt_data_len; + data_len -= pkt_data_len; + + ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, + context->tx[i], pkt_data_len, + !data_len ? MHI_EOT : MHI_CHAIN); + if (ret) { + dev_err(&context->mhi_dev->dev, "Unable to send read_data response %d\n", + ret); + return; + } + } +} + +static void sahara_end_of_image(struct sahara_context *context) +{ + int ret; + + dev_dbg(&context->mhi_dev->dev, + "END_OF_IMAGE cmd received. 
length:%d image:%d status:%d\n", + le32_to_cpu(context->rx->length), + le32_to_cpu(context->rx->end_of_image.image), + le32_to_cpu(context->rx->end_of_image.status)); + + if (le32_to_cpu(context->rx->length) != SAHARA_END_OF_IMAGE_LENGTH) { + dev_err(&context->mhi_dev->dev, "Malformed end_of_image packet - length %d\n", + le32_to_cpu(context->rx->length)); + return; + } + + if (context->active_image_id != SAHARA_IMAGE_ID_NONE && + le32_to_cpu(context->rx->end_of_image.image) != context->active_image_id) { + dev_err(&context->mhi_dev->dev, "Malformed end_of_image packet - image %d is not the active image\n", + le32_to_cpu(context->rx->end_of_image.image)); + return; + } + + sahara_release_image(context); + + if (le32_to_cpu(context->rx->end_of_image.status)) + return; + + context->tx[0]->cmd = cpu_to_le32(SAHARA_DONE_CMD); + context->tx[0]->length = cpu_to_le32(SAHARA_DONE_LENGTH); + + ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, context->tx[0], + SAHARA_DONE_LENGTH, MHI_EOT); + if (ret) + dev_dbg(&context->mhi_dev->dev, "Unable to send done response %d\n", ret); +} + +static void sahara_memory_debug64(struct sahara_context *context) +{ + int ret; + + dev_dbg(&context->mhi_dev->dev, + "MEMORY DEBUG64 cmd received. length:%d table_address:%#llx table_length:%#llx\n", + le32_to_cpu(context->rx->length), + le64_to_cpu(context->rx->memory_debug64.table_address), + le64_to_cpu(context->rx->memory_debug64.table_length)); + + if (le32_to_cpu(context->rx->length) != SAHARA_MEM_DEBUG64_LENGTH) { + dev_err(&context->mhi_dev->dev, "Malformed memory debug64 packet - length %d\n", + le32_to_cpu(context->rx->length)); + return; + } + + context->dump_table_address = le64_to_cpu(context->rx->memory_debug64.table_address); + context->dump_table_length = le64_to_cpu(context->rx->memory_debug64.table_length); + + if (context->dump_table_length % sizeof(struct sahara_debug_table_entry64) != 0 || + !context->dump_table_length) { + dev_err(&context->mhi_dev->dev, "Malformed memory debug64 packet - table length %lld\n", + context->dump_table_length); + return; + } + + /* + * From this point, the protocol flips. We make memory_read requests to + * the device, and the device responds with the raw data. If the device + * has an error, it will send an End of Image command. First we need to + * request the memory dump table so that we know where all the pieces + * of the dump are that we can consume. + */ + + context->is_mem_dump_mode = true; + + /* + * Assume that the table is smaller than our MTU so that we can read it + * in one shot. The spec does not put an upper limit on the table, but + * no known device will exceed this. + */ + if (context->dump_table_length > SAHARA_PACKET_MAX_SIZE) { + dev_err(&context->mhi_dev->dev, "Memory dump table length %lld exceeds supported size. 
Discarding dump\n", + context->dump_table_length); + sahara_send_reset(context); + return; + } + + context->tx[0]->cmd = cpu_to_le32(SAHARA_MEM_READ64_CMD); + context->tx[0]->length = cpu_to_le32(SAHARA_MEM_READ64_LENGTH); + context->tx[0]->memory_read64.memory_address = cpu_to_le64(context->dump_table_address); + context->tx[0]->memory_read64.memory_length = cpu_to_le64(context->dump_table_length); + + context->rx_size_requested = context->dump_table_length; + + ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, context->tx[0], + SAHARA_MEM_READ64_LENGTH, MHI_EOT); + if (ret) + dev_err(&context->mhi_dev->dev, "Unable to send read for dump table %d\n", ret); +} + +static void sahara_processing(struct work_struct *work) +{ + struct sahara_context *context = container_of(work, struct sahara_context, fw_work); + int ret; + + switch (le32_to_cpu(context->rx->cmd)) { + case SAHARA_HELLO_CMD: + sahara_hello(context); + break; + case SAHARA_READ_DATA_CMD: + sahara_read_data(context); + break; + case SAHARA_END_OF_IMAGE_CMD: + sahara_end_of_image(context); + break; + case SAHARA_DONE_RESP_CMD: + /* Intentional do nothing as we don't need to exit an app */ + break; + case SAHARA_RESET_RESP_CMD: + /* Intentional do nothing as we don't need to exit an app */ + break; + case SAHARA_MEM_DEBUG64_CMD: + sahara_memory_debug64(context); + break; + default: + dev_err(&context->mhi_dev->dev, "Unknown command %d\n", + le32_to_cpu(context->rx->cmd)); + break; + } + + ret = mhi_queue_buf(context->mhi_dev, DMA_FROM_DEVICE, context->rx, + SAHARA_PACKET_MAX_SIZE, MHI_EOT); + if (ret) + dev_err(&context->mhi_dev->dev, "Unable to requeue rx buf %d\n", ret); +} + +static void sahara_parse_dump_table(struct sahara_context *context) +{ + struct sahara_dump_table_entry *image_out_table; + struct sahara_debug_table_entry64 *dev_table; + struct sahara_memory_dump_meta_v1 *dump_meta; + u64 table_nents; + u64 dump_length; + int ret; + u64 i; + + table_nents = context->dump_table_length / sizeof(*dev_table); + context->dump_images_left = table_nents; + dump_length = 0; + + dev_table = (struct sahara_debug_table_entry64 *)(context->rx); + for (i = 0; i < table_nents; ++i) { + /* Do not trust the device, ensure the strings are terminated */ + dev_table[i].description[SAHARA_TABLE_ENTRY_STR_LEN - 1] = 0; + dev_table[i].filename[SAHARA_TABLE_ENTRY_STR_LEN - 1] = 0; + + dump_length = size_add(dump_length, le64_to_cpu(dev_table[i].length)); + if (dump_length == SIZE_MAX) { + /* Discard the dump */ + sahara_send_reset(context); + return; + } + + dev_dbg(&context->mhi_dev->dev, + "Memory dump table entry %lld type: %lld address: %#llx length: %#llx description: \"%s\" filename \"%s\"\n", + i, + le64_to_cpu(dev_table[i].type), + le64_to_cpu(dev_table[i].address), + le64_to_cpu(dev_table[i].length), + dev_table[i].description, + dev_table[i].filename); + } + + dump_length = size_add(dump_length, sizeof(*dump_meta)); + if (dump_length == SIZE_MAX) { + /* Discard the dump */ + sahara_send_reset(context); + return; + } + dump_length = size_add(dump_length, size_mul(sizeof(*image_out_table), table_nents)); + if (dump_length == SIZE_MAX) { + /* Discard the dump */ + sahara_send_reset(context); + return; + } + + context->mem_dump_sz = dump_length; + context->mem_dump = vzalloc(dump_length); + if (!context->mem_dump) { + /* Discard the dump */ + sahara_send_reset(context); + return; + } + + /* Populate the dump metadata and table for userspace */ + dump_meta = context->mem_dump; + dump_meta->magic = SAHARA_DUMP_V1_MAGIC; + 
dump_meta->version = SAHARA_DUMP_V1_VER; + dump_meta->dump_size = dump_length; + dump_meta->table_size = context->dump_table_length; + + image_out_table = context->mem_dump + sizeof(*dump_meta); + for (i = 0; i < table_nents; ++i) { + image_out_table[i].type = le64_to_cpu(dev_table[i].type); + image_out_table[i].address = le64_to_cpu(dev_table[i].address); + image_out_table[i].length = le64_to_cpu(dev_table[i].length); + strscpy(image_out_table[i].description, dev_table[i].description, + SAHARA_TABLE_ENTRY_STR_LEN); + strscpy(image_out_table[i].filename, + dev_table[i].filename, + SAHARA_TABLE_ENTRY_STR_LEN); + } + + context->mem_dump_freespace = &image_out_table[i]; + + /* Done parsing the table, switch to image dump mode */ + context->dump_table_length = 0; + + /* Request the first chunk of the first image */ + context->dump_image = &image_out_table[0]; + dump_length = min(context->dump_image->length, SAHARA_READ_MAX_SIZE); + /* Avoid requesting EOI sized data so that we can identify errors */ + if (dump_length == SAHARA_END_OF_IMAGE_LENGTH) + dump_length = SAHARA_END_OF_IMAGE_LENGTH / 2; + + context->dump_image_offset = dump_length; + + context->tx[0]->cmd = cpu_to_le32(SAHARA_MEM_READ64_CMD); + context->tx[0]->length = cpu_to_le32(SAHARA_MEM_READ64_LENGTH); + context->tx[0]->memory_read64.memory_address = cpu_to_le64(context->dump_image->address); + context->tx[0]->memory_read64.memory_length = cpu_to_le64(dump_length); + + context->rx_size_requested = dump_length; + + ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, context->tx[0], + SAHARA_MEM_READ64_LENGTH, MHI_EOT); + if (ret) + dev_err(&context->mhi_dev->dev, "Unable to send read for dump content %d\n", ret); +} + +static void sahara_parse_dump_image(struct sahara_context *context) +{ + u64 dump_length; + int ret; + + memcpy(context->mem_dump_freespace, context->rx, context->rx_size); + context->mem_dump_freespace += context->rx_size; + + if (context->dump_image_offset >= context->dump_image->length) { + /* Need to move to next image */ + context->dump_image++; + context->dump_images_left--; + context->dump_image_offset = 0; + + if (!context->dump_images_left) { + /* Dump done */ + dev_coredumpv(context->mhi_dev->mhi_cntrl->cntrl_dev, + context->mem_dump, + context->mem_dump_sz, + GFP_KERNEL); + context->mem_dump = NULL; + sahara_send_reset(context); + return; + } + } + + /* Get next image chunk */ + dump_length = context->dump_image->length - context->dump_image_offset; + dump_length = min(dump_length, SAHARA_READ_MAX_SIZE); + /* Avoid requesting EOI sized data so that we can identify errors */ + if (dump_length == SAHARA_END_OF_IMAGE_LENGTH) + dump_length = SAHARA_END_OF_IMAGE_LENGTH / 2; + + context->tx[0]->cmd = cpu_to_le32(SAHARA_MEM_READ64_CMD); + context->tx[0]->length = cpu_to_le32(SAHARA_MEM_READ64_LENGTH); + context->tx[0]->memory_read64.memory_address = + cpu_to_le64(context->dump_image->address + context->dump_image_offset); + context->tx[0]->memory_read64.memory_length = cpu_to_le64(dump_length); + + context->dump_image_offset += dump_length; + context->rx_size_requested = dump_length; + + ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, context->tx[0], + SAHARA_MEM_READ64_LENGTH, MHI_EOT); + if (ret) + dev_err(&context->mhi_dev->dev, + "Unable to send read for dump content %d\n", ret); +} + +static void sahara_dump_processing(struct work_struct *work) +{ + struct sahara_context *context = container_of(work, struct sahara_context, dump_work); + int ret; + + /* + * We should get the expected raw data, but if 
the device has an error + * it is supposed to send EOI with an error code. + */ + if (context->rx_size != context->rx_size_requested && + context->rx_size != SAHARA_END_OF_IMAGE_LENGTH) { + dev_err(&context->mhi_dev->dev, + "Unexpected response to read_data. Expected size: %#zx got: %#zx\n", + context->rx_size_requested, + context->rx_size); + goto error; + } + + if (context->rx_size == SAHARA_END_OF_IMAGE_LENGTH && + le32_to_cpu(context->rx->cmd) == SAHARA_END_OF_IMAGE_CMD) { + dev_err(&context->mhi_dev->dev, + "Unexpected EOI response to read_data. Status: %d\n", + le32_to_cpu(context->rx->end_of_image.status)); + goto error; + } + + if (context->rx_size == SAHARA_END_OF_IMAGE_LENGTH && + le32_to_cpu(context->rx->cmd) != SAHARA_END_OF_IMAGE_CMD) { + dev_err(&context->mhi_dev->dev, + "Invalid EOI response to read_data. CMD: %d\n", + le32_to_cpu(context->rx->cmd)); + goto error; + } + + /* + * Need to know if we received the dump table, or part of a dump image. + * Since we get raw data, we cannot tell from the data itself. Instead, + * we use the stored dump_table_length, which we zero after we read and + * process the entire table. + */ + if (context->dump_table_length) + sahara_parse_dump_table(context); + else + sahara_parse_dump_image(context); + + ret = mhi_queue_buf(context->mhi_dev, DMA_FROM_DEVICE, context->rx, + SAHARA_PACKET_MAX_SIZE, MHI_EOT); + if (ret) + dev_err(&context->mhi_dev->dev, "Unable to requeue rx buf %d\n", ret); + + return; + +error: + vfree(context->mem_dump); + context->mem_dump = NULL; + sahara_send_reset(context); +} + +static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) +{ + struct sahara_context *context; + int ret; + int i; + + context = devm_kzalloc(&mhi_dev->dev, sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->rx = devm_kzalloc(&mhi_dev->dev, SAHARA_PACKET_MAX_SIZE, GFP_KERNEL); + if (!context->rx) + return -ENOMEM; + + /* + * AIC100 defines SAHARA_TRANSFER_MAX_SIZE as the largest value it + * will request for READ_DATA. This is larger than + * SAHARA_PACKET_MAX_SIZE, and we need 9x SAHARA_PACKET_MAX_SIZE to + * cover SAHARA_TRANSFER_MAX_SIZE. When the remote side issues a + * READ_DATA, it requires a transfer of the exact size requested. We + * can use MHI_CHAIN to link multiple buffers into a single transfer + * but the remote side will not consume the buffers until it sees an + * EOT, thus we need to allocate enough buffers to put in the tx fifo + * to cover an entire READ_DATA request of the max size. 
+ */ + for (i = 0; i < SAHARA_NUM_TX_BUF; ++i) { + context->tx[i] = devm_kzalloc(&mhi_dev->dev, SAHARA_PACKET_MAX_SIZE, GFP_KERNEL); + if (!context->tx[i]) + return -ENOMEM; + } + + context->mhi_dev = mhi_dev; + INIT_WORK(&context->fw_work, sahara_processing); + INIT_WORK(&context->dump_work, sahara_dump_processing); + context->image_table = aic100_image_table; + context->table_size = ARRAY_SIZE(aic100_image_table); + context->active_image_id = SAHARA_IMAGE_ID_NONE; + dev_set_drvdata(&mhi_dev->dev, context); + + ret = mhi_prepare_for_transfer(mhi_dev); + if (ret) + return ret; + + ret = mhi_queue_buf(mhi_dev, DMA_FROM_DEVICE, context->rx, SAHARA_PACKET_MAX_SIZE, MHI_EOT); + if (ret) { + mhi_unprepare_from_transfer(mhi_dev); + return ret; + } + + return 0; +} + +static void sahara_mhi_remove(struct mhi_device *mhi_dev) +{ + struct sahara_context *context = dev_get_drvdata(&mhi_dev->dev); + + cancel_work_sync(&context->fw_work); + cancel_work_sync(&context->dump_work); + vfree(context->mem_dump); + sahara_release_image(context); + mhi_unprepare_from_transfer(mhi_dev); +} + +static void sahara_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) +{ +} + +static void sahara_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result) +{ + struct sahara_context *context = dev_get_drvdata(&mhi_dev->dev); + + if (!mhi_result->transaction_status) { + context->rx_size = mhi_result->bytes_xferd; + if (context->is_mem_dump_mode) + schedule_work(&context->dump_work); + else + schedule_work(&context->fw_work); + } + +} + +static const struct mhi_device_id sahara_mhi_match_table[] = { + { .chan = "QAIC_SAHARA", }, + {}, +}; + +static struct mhi_driver sahara_mhi_driver = { + .id_table = sahara_mhi_match_table, + .remove = sahara_mhi_remove, + .probe = sahara_mhi_probe, + .ul_xfer_cb = sahara_mhi_ul_xfer_cb, + .dl_xfer_cb = sahara_mhi_dl_xfer_cb, + .driver = { + .name = "sahara", + }, +}; + +int sahara_register(void) +{ + return mhi_driver_register(&sahara_mhi_driver); +} + +void sahara_unregister(void) +{ + mhi_driver_unregister(&sahara_mhi_driver); +} diff --git a/drivers/accel/qaic/sahara.h b/drivers/accel/qaic/sahara.h new file mode 100644 index 000000000000..640208acc0d1 --- /dev/null +++ b/drivers/accel/qaic/sahara.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. */ + +#ifndef __SAHARA_H__ +#define __SAHARA_H__ + +int sahara_register(void); +void sahara_unregister(void); +#endif /* __SAHARA_H__ */ |
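A note on the qaic_validate_req() change above: slice offsets and sizes arrive from userspace as u64 values, so the old bound check of "offset + size > total_size" could wrap around and accept a slice that extends far past the buffer object. The patch validates against the gem object's actual size and uses check_add_overflow() so the wrap is caught. A minimal userspace sketch of the failure mode, built on the __builtin_add_overflow() primitive that the kernel helper is based on (illustrative only, not part of the patch):

/*
 * Illustrative only. Shows how "offset + size > total_size" wraps for
 * userspace-controlled u64 values, which is what the switch to
 * check_add_overflow() in qaic_validate_req() guards against.
 */
#include <stdint.h>
#include <stdio.h>

static int slice_in_bounds(uint64_t offset, uint64_t size, uint64_t total_size)
{
	uint64_t end;

	if (__builtin_add_overflow(offset, size, &end))
		return 0;		/* offset + size wrapped past U64_MAX */
	return end <= total_size;
}

int main(void)
{
	uint64_t offset = UINT64_MAX - 4, size = 8, total_size = 4096;

	/* The unchecked sum wraps to 3 and would pass "sum > total_size". */
	printf("wrapped sum: %llu\n", (unsigned long long)(offset + size));
	printf("accepted: %d\n", slice_in_bounds(offset, size, total_size));
	return 0;
}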
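The new per-DBC "queued" debugfs file is backed by qaic_data_get_fifo_info(), which reads the request FIFO head and tail registers (REQHP_OFF and REQTP_OFF); queued_show() then reports ring occupancy, treating an all-ones readback of either register as nothing queued. The wrap-around case is the only subtle part. A small illustration with made-up ring size and register values (illustrative only):

/* Illustrative only: the occupancy math used by the "queued" debugfs file. */
#include <stdint.h>
#include <stdio.h>

static uint32_t fifo_queued(uint32_t nelem, uint32_t head, uint32_t tail)
{
	return head > tail ? nelem - head + tail : tail - head;
}

int main(void)
{
	/* No wrap: 10 requests between head and tail. */
	printf("%u\n", fifo_queued(4096, 100, 110));
	/* Tail wrapped past the end of the ring: still 16 outstanding. */
	printf("%u\n", fifo_queued(4096, 4090, 10));
	return 0;
}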
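For the buffer pool sized by SAHARA_NUM_TX_BUF and allocated in sahara_mhi_probe(): with SAHARA_TRANSFER_MAX_SIZE = 0x80000 (524288 bytes) and SAHARA_PACKET_MAX_SIZE = 0xffff (65535 bytes), DIV_ROUND_UP(524288, 65535) = 9, since 8 * 65535 = 524280 falls just short of a full transfer. Nine preallocated TX buffers therefore cover one maximal READ_DATA request when the chunks are chained with MHI_CHAIN and terminated with MHI_EOT, as the comment above the allocation loop describes.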
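The crashdump layout documented in sahara.c (metadata header, image table, then the images back to back with no deadspace) is what lands in the devcoredump blob once sahara_parse_dump_image() finishes. A minimal userspace reader is sketched below; the struct layouts mirror the v1 format in sahara.c, while the input path, naming, and error handling are the example's own (the dump would typically be read from a devcoredump data node such as /sys/class/devcoredump/devcdN/data):

/*
 * Illustrative only: parses a v1 Sahara crashdump as laid out by
 * sahara_parse_dump_table()/sahara_parse_dump_image(). Struct layouts mirror
 * sahara.c; everything else (file handling, output) is example scaffolding.
 */
#include <stdint.h>
#include <stdio.h>

#define SAHARA_DUMP_V1_MAGIC 0x1234567890abcdefULL
#define SAHARA_DUMP_V1_VER 1

struct dump_meta_v1 {
	uint64_t magic;
	uint64_t version;
	uint64_t dump_size;
	uint64_t table_size;
};

struct dump_table_entry {
	uint64_t type;
	uint64_t address;
	uint64_t length;
	char description[20];
	char filename[20];
};

int main(int argc, char **argv)
{
	struct dump_table_entry ent;
	struct dump_meta_v1 meta;
	uint64_t nents, offset, i;
	FILE *f;

	f = fopen(argc > 1 ? argv[1] : "devcd.bin", "rb");
	if (!f || fread(&meta, sizeof(meta), 1, f) != 1)
		return 1;
	if (meta.magic != SAHARA_DUMP_V1_MAGIC || meta.version != SAHARA_DUMP_V1_VER)
		return 1;

	/* Both the device table and this table use 64-byte entries. */
	nents = meta.table_size / sizeof(ent);
	/* Images follow the header and table back to back, in table order. */
	offset = sizeof(meta) + nents * sizeof(ent);
	for (i = 0; i < nents; i++) {
		if (fread(&ent, sizeof(ent), 1, f) != 1)
			return 1;
		printf("%s: %llu bytes at dump offset %llu\n", ent.filename,
		       (unsigned long long)ent.length, (unsigned long long)offset);
		offset += ent.length;
	}
	fclose(f);
	return 0;
}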