16 files changed, 1813 insertions, 1187 deletions
diff --git a/drivers/misc/habanalabs/common/Makefile b/drivers/misc/habanalabs/common/Makefile
index 6ebe3c7001ff..934a3a4aedc9 100644
--- a/drivers/misc/habanalabs/common/Makefile
+++ b/drivers/misc/habanalabs/common/Makefile
@@ -11,4 +11,4 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
 		common/command_buffer.o common/hw_queue.o common/irq.o \
 		common/sysfs.o common/hwmon.o common/memory.o \
 		common/command_submission.o common/firmware_if.o \
-		common/state_dump.o
+		common/state_dump.o common/memory_mgr.o
diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c
index a507110f6443..e13b2b39c058 100644
--- a/drivers/misc/habanalabs/common/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -160,24 +160,6 @@ static void cb_do_release(struct hl_device *hdev, struct hl_cb *cb)
 	}
 }
 
-static void cb_release(struct kref *ref)
-{
-	struct hl_device *hdev;
-	struct hl_cb *cb;
-
-	cb = container_of(ref, struct hl_cb, refcount);
-	hdev = cb->hdev;
-
-	hl_debugfs_remove_cb(cb);
-
-	if (cb->is_mmu_mapped)
-		cb_unmap_mem(cb->ctx, cb);
-
-	hl_ctx_put(cb->ctx);
-
-	cb_do_release(hdev, cb);
-}
-
 static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
 					int ctx_id, bool internal_cb)
 {
@@ -238,168 +220,175 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
 	return cb;
 }
 
-int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
-			struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
-			bool map_cb, u64 *handle)
+struct hl_cb_mmap_mem_alloc_args {
+	struct hl_device *hdev;
+	struct hl_ctx *ctx;
+	u32 cb_size;
+	bool internal_cb;
+	bool map_cb;
+};
+
+static void hl_cb_mmap_mem_release(struct hl_mmap_mem_buf *buf)
 {
-	struct hl_cb *cb;
-	bool alloc_new_cb = true;
-	int rc, ctx_id = ctx->asid;
+	struct hl_cb *cb = buf->private;
 
-	/*
-	 * Can't use generic function to check this because of special case
-	 * where we create a CB as part of the reset process
-	 */
-	if ((hdev->disabled) || (hdev->reset_info.in_reset && (ctx_id != HL_KERNEL_ASID_ID))) {
-		dev_warn_ratelimited(hdev->dev,
-			"Device is disabled or in reset. Can't create new CBs\n");
-		rc = -EBUSY;
-		goto out_err;
-	}
+	hl_debugfs_remove_cb(cb);
 
-	if (cb_size > SZ_2M) {
-		dev_err(hdev->dev, "CB size %d must be less than %d\n",
-			cb_size, SZ_2M);
-		rc = -EINVAL;
-		goto out_err;
-	}
+	if (cb->is_mmu_mapped)
+		cb_unmap_mem(cb->ctx, cb);
+
+	hl_ctx_put(cb->ctx);
 
-	if (!internal_cb) {
+	cb_do_release(cb->hdev, cb);
+}
+
+static int hl_cb_mmap_mem_alloc(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args)
+{
+	struct hl_cb_mmap_mem_alloc_args *cb_args = args;
+	struct hl_cb *cb;
+	int rc, ctx_id = cb_args->ctx->asid;
+	bool alloc_new_cb = true;
+
+	if (!cb_args->internal_cb) {
 		/* Minimum allocation must be PAGE SIZE */
-		if (cb_size < PAGE_SIZE)
-			cb_size = PAGE_SIZE;
+		if (cb_args->cb_size < PAGE_SIZE)
+			cb_args->cb_size = PAGE_SIZE;
 
 		if (ctx_id == HL_KERNEL_ASID_ID &&
-				cb_size <= hdev->asic_prop.cb_pool_cb_size) {
+				cb_args->cb_size <= cb_args->hdev->asic_prop.cb_pool_cb_size) {
 
-			spin_lock(&hdev->cb_pool_lock);
-			if (!list_empty(&hdev->cb_pool)) {
-				cb = list_first_entry(&hdev->cb_pool,
+			spin_lock(&cb_args->hdev->cb_pool_lock);
+			if (!list_empty(&cb_args->hdev->cb_pool)) {
+				cb = list_first_entry(&cb_args->hdev->cb_pool,
 						typeof(*cb), pool_list);
 				list_del(&cb->pool_list);
-				spin_unlock(&hdev->cb_pool_lock);
+				spin_unlock(&cb_args->hdev->cb_pool_lock);
 				alloc_new_cb = false;
 			} else {
-				spin_unlock(&hdev->cb_pool_lock);
-				dev_dbg(hdev->dev, "CB pool is empty\n");
+				spin_unlock(&cb_args->hdev->cb_pool_lock);
+				dev_dbg(cb_args->hdev->dev, "CB pool is empty\n");
 			}
 		}
 	}
 
 	if (alloc_new_cb) {
-		cb = hl_cb_alloc(hdev, cb_size, ctx_id, internal_cb);
-		if (!cb) {
-			rc = -ENOMEM;
-			goto out_err;
-		}
+		cb = hl_cb_alloc(cb_args->hdev, cb_args->cb_size, ctx_id, cb_args->internal_cb);
+		if (!cb)
+			return -ENOMEM;
 	}
 
-	cb->hdev = hdev;
-	cb->ctx = ctx;
-	hl_ctx_get(hdev, cb->ctx);
+	cb->hdev = cb_args->hdev;
+	cb->ctx = cb_args->ctx;
+	cb->buf = buf;
+	cb->buf->mappable_size = cb->size;
+	cb->buf->private = cb;
+
+	hl_ctx_get(cb->ctx);
 
-	if (map_cb) {
+	if (cb_args->map_cb) {
 		if (ctx_id == HL_KERNEL_ASID_ID) {
-			dev_err(hdev->dev,
+			dev_err(cb_args->hdev->dev,
 				"CB mapping is not supported for kernel context\n");
 			rc = -EINVAL;
 			goto release_cb;
 		}
 
-		rc = cb_map_mem(ctx, cb);
+		rc = cb_map_mem(cb_args->ctx, cb);
 		if (rc)
 			goto release_cb;
 	}
 
-	spin_lock(&mgr->cb_lock);
-	rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_ATOMIC);
-	spin_unlock(&mgr->cb_lock);
-
-	if (rc < 0) {
-		dev_err(hdev->dev, "Failed to allocate IDR for a new CB\n");
-		goto unmap_mem;
-	}
-
-	cb->id = (u64) rc;
-
-	kref_init(&cb->refcount);
-	spin_lock_init(&cb->lock);
-
-	/*
-	 * idr is 32-bit so we can safely OR it with a mask that is above
-	 * 32 bit
-	 */
-	*handle = cb->id | HL_MMAP_TYPE_CB;
-	*handle <<= PAGE_SHIFT;
-
 	hl_debugfs_add_cb(cb);
 
 	return 0;
 
-unmap_mem:
-	if (cb->is_mmu_mapped)
-		cb_unmap_mem(cb->ctx, cb);
 release_cb:
 	hl_ctx_put(cb->ctx);
-	cb_do_release(hdev, cb);
-out_err:
-	*handle = 0;
+	cb_do_release(cb_args->hdev, cb);
 
 	return rc;
 }
 
-int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle)
+static int hl_cb_mmap(struct hl_mmap_mem_buf *buf,
+				      struct vm_area_struct *vma, void *args)
 {
-	struct hl_cb *cb;
-	u32 handle;
-	int rc = 0;
+	struct hl_cb *cb = buf->private;
 
-	/*
-	 * handle was given to user to do mmap, I need to shift it back to
-	 * how the idr module gave it to me
-	 */
-	cb_handle >>= PAGE_SHIFT;
-	handle = (u32) cb_handle;
+	return cb->hdev->asic_funcs->mmap(cb->hdev, vma, cb->kernel_address,
+					cb->bus_address, cb->size);
+}
 
-	spin_lock(&mgr->cb_lock);
+static struct hl_mmap_mem_buf_behavior cb_behavior = {
+	.topic = "CB",
+	.mem_id = HL_MMAP_TYPE_CB,
+	.alloc = hl_cb_mmap_mem_alloc,
+	.release = hl_cb_mmap_mem_release,
+	.mmap = hl_cb_mmap,
+};
 
-	cb = idr_find(&mgr->cb_handles, handle);
-	if (cb) {
-		idr_remove(&mgr->cb_handles, handle);
-		spin_unlock(&mgr->cb_lock);
-		kref_put(&cb->refcount, cb_release);
-	} else {
-		spin_unlock(&mgr->cb_lock);
-		dev_err(hdev->dev,
-			"CB destroy failed, no match to handle 0x%x\n", handle);
-		rc = -EINVAL;
+int hl_cb_create(struct hl_device *hdev, struct hl_mem_mgr *mmg,
+			struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
+			bool map_cb, u64 *handle)
+{
+	struct hl_cb_mmap_mem_alloc_args args = {
+		.hdev = hdev,
+		.ctx = ctx,
+		.cb_size = cb_size,
+		.internal_cb = internal_cb,
+		.map_cb = map_cb,
+	};
+	struct hl_mmap_mem_buf *buf;
+	int ctx_id = ctx->asid;
+
+	if ((hdev->disabled) || (hdev->reset_info.in_reset && (ctx_id != HL_KERNEL_ASID_ID))) {
+		dev_warn_ratelimited(hdev->dev,
+			"Device is disabled or in reset. Can't create new CBs\n");
+		return -EBUSY;
 	}
 
-	return rc;
+	if (cb_size > SZ_2M) {
+		dev_err(hdev->dev, "CB size %d must be less than %d\n",
+			cb_size, SZ_2M);
+		return -EINVAL;
+	}
+
+	buf = hl_mmap_mem_buf_alloc(
+		mmg, &cb_behavior,
+		ctx_id == HL_KERNEL_ASID_ID ? GFP_ATOMIC : GFP_KERNEL, &args);
+	if (!buf)
+		return -ENOMEM;
+
+	*handle = buf->handle;
+
+	return 0;
+}
+
+int hl_cb_destroy(struct hl_mem_mgr *mmg, u64 cb_handle)
+{
+	int rc;
+
+	rc = hl_mmap_mem_buf_put_handle(mmg, cb_handle);
+	if (rc < 0)
+		return rc; /* Invalid handle */
+
+	if (rc == 0)
+		dev_dbg(mmg->dev, "CB 0x%llx is destroyed while still in use\n", cb_handle);
+
+	return 0;
 }
 
-static int hl_cb_info(struct hl_device *hdev, struct hl_cb_mgr *mgr,
-			u64 cb_handle, u32 flags, u32 *usage_cnt, u64 *device_va)
+static int hl_cb_info(struct hl_mem_mgr *mmg,
+			u64 handle, u32 flags, u32 *usage_cnt, u64 *device_va)
 {
 	struct hl_vm_va_block *va_block;
 	struct hl_cb *cb;
-	u32 handle;
 	int rc = 0;
 
-	/* The CB handle was given to user to do mmap, so need to shift it back
-	 * to the value which was allocated by the IDR module.
-	 */
-	cb_handle >>= PAGE_SHIFT;
-	handle = (u32) cb_handle;
-
-	spin_lock(&mgr->cb_lock);
-
-	cb = idr_find(&mgr->cb_handles, handle);
+	cb = hl_cb_get(mmg, handle);
 	if (!cb) {
-		dev_err(hdev->dev,
-			"CB info failed, no match to handle 0x%x\n", handle);
-		rc = -EINVAL;
-		goto out;
+		dev_err(mmg->dev,
+			"CB info failed, no match to handle 0x%llx\n", handle);
+		return -EINVAL;
 	}
 
 	if (flags & HL_CB_FLAGS_GET_DEVICE_VA) {
@@ -407,7 +396,7 @@ static int hl_cb_info(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 		if (va_block) {
 			*device_va = va_block->start;
 		} else {
-			dev_err(hdev->dev, "CB is not mapped to the device's MMU\n");
+			dev_err(mmg->dev, "CB is not mapped to the device's MMU\n");
 			rc = -EINVAL;
 			goto out;
 		}
@@ -416,7 +405,7 @@ static int hl_cb_info(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 	}
 
 out:
-	spin_unlock(&mgr->cb_lock);
+	hl_cb_put(cb);
 	return rc;
 }
 
@@ -444,7 +433,7 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 				args->in.cb_size, HL_MAX_CB_SIZE);
 			rc = -EINVAL;
 		} else {
-			rc = hl_cb_create(hdev, &hpriv->cb_mgr, hpriv->ctx,
+			rc = hl_cb_create(hdev, &hpriv->mem_mgr, hpriv->ctx,
 					args->in.cb_size, false,
 					!!(args->in.flags & HL_CB_FLAGS_MAP),
 					&handle);
@@ -455,12 +444,12 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 		break;
 
 	case HL_CB_OP_DESTROY:
-		rc = hl_cb_destroy(hdev, &hpriv->cb_mgr,
+		rc = hl_cb_destroy(&hpriv->mem_mgr,
 					args->in.cb_handle);
 		break;
 
 	case HL_CB_OP_INFO:
-		rc = hl_cb_info(hdev, &hpriv->cb_mgr, args->in.cb_handle,
+		rc = hl_cb_info(&hpriv->mem_mgr, args->in.cb_handle,
 				args->in.flags,
 				&usage_cnt,
 				&device_va);
@@ -483,163 +472,20 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 	return rc;
 }
 
-static void cb_vm_close(struct vm_area_struct *vma)
+struct hl_cb *hl_cb_get(struct hl_mem_mgr *mmg, u64 handle)
 {
-	struct hl_cb *cb = (struct hl_cb *) vma->vm_private_data;
-	long new_mmap_size;
-
-	new_mmap_size = cb->mmap_size - (vma->vm_end - vma->vm_start);
-
-	if (new_mmap_size > 0) {
-		cb->mmap_size = new_mmap_size;
-		return;
-	}
-
-	spin_lock(&cb->lock);
-	cb->mmap = false;
-	spin_unlock(&cb->lock);
-
-	hl_cb_put(cb);
-	vma->vm_private_data = NULL;
-}
-
-static const struct vm_operations_struct cb_vm_ops = {
-	.close = cb_vm_close
-};
-
-int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
-{
-	struct hl_device *hdev = hpriv->hdev;
-	struct hl_cb *cb;
-	u32 handle, user_cb_size;
-	int rc;
-
-	/* We use the page offset to hold the idr and thus we need to clear
-	 * it before doing the mmap itself
-	 */
-	handle = vma->vm_pgoff;
-	vma->vm_pgoff = 0;
-
-	/* reference was taken here */
-	cb = hl_cb_get(hdev, &hpriv->cb_mgr, handle);
-	if (!cb) {
-		dev_err(hdev->dev,
-			"CB mmap failed, no match to handle 0x%x\n", handle);
-		return -EINVAL;
-	}
-
-	/* Validation check */
-	user_cb_size = vma->vm_end - vma->vm_start;
-	if (user_cb_size != ALIGN(cb->size, PAGE_SIZE)) {
-		dev_err(hdev->dev,
-			"CB mmap failed, mmap size 0x%lx != 0x%x cb size\n",
-			vma->vm_end - vma->vm_start, cb->size);
-		rc = -EINVAL;
-		goto put_cb;
-	}
-
-	if (!access_ok((void __user *) (uintptr_t) vma->vm_start,
-							user_cb_size)) {
-		dev_err(hdev->dev,
-			"user pointer is invalid - 0x%lx\n",
-			vma->vm_start);
-
-		rc = -EINVAL;
-		goto put_cb;
-	}
-
-	spin_lock(&cb->lock);
+	struct hl_mmap_mem_buf *buf;
 
-	if (cb->mmap) {
-		dev_err(hdev->dev,
-			"CB mmap failed, CB already mmaped to user\n");
-		rc = -EINVAL;
-		goto release_lock;
-	}
-
-	cb->mmap = true;
-
-	spin_unlock(&cb->lock);
-
-	vma->vm_ops = &cb_vm_ops;
-
-	/*
-	 * Note: We're transferring the cb reference to
-	 * vma->vm_private_data here.
-	 */
-
-	vma->vm_private_data = cb;
-
-	rc = hdev->asic_funcs->mmap(hdev, vma, cb->kernel_address,
-					cb->bus_address, cb->size);
-	if (rc) {
-		spin_lock(&cb->lock);
-		cb->mmap = false;
-		goto release_lock;
-	}
-
-	cb->mmap_size = cb->size;
-	vma->vm_pgoff = handle;
-
-	return 0;
-
-release_lock:
-	spin_unlock(&cb->lock);
-put_cb:
-	hl_cb_put(cb);
-	return rc;
-}
-
-struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr,
-			u32 handle)
-{
-	struct hl_cb *cb;
-
-	spin_lock(&mgr->cb_lock);
-	cb = idr_find(&mgr->cb_handles, handle);
-
-	if (!cb) {
-		spin_unlock(&mgr->cb_lock);
-		dev_warn(hdev->dev,
-			"CB get failed, no match to handle 0x%x\n", handle);
+	buf = hl_mmap_mem_buf_get(mmg, handle);
+	if (!buf)
 		return NULL;
-	}
-
-	kref_get(&cb->refcount);
-
-	spin_unlock(&mgr->cb_lock);
-
-	return cb;
+	return buf->private;
 
 }
 
 void hl_cb_put(struct hl_cb *cb)
 {
-	kref_put(&cb->refcount, cb_release);
-}
-
-void hl_cb_mgr_init(struct hl_cb_mgr *mgr)
-{
-	spin_lock_init(&mgr->cb_lock);
-	idr_init(&mgr->cb_handles);
-}
-
-void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr)
-{
-	struct hl_cb *cb;
-	struct idr *idp;
-	u32 id;
-
-	idp = &mgr->cb_handles;
-
-	idr_for_each_entry(idp, cb, id) {
-		if (kref_put(&cb->refcount, cb_release) != 1)
-			dev_err(hdev->dev,
-				"CB %d for CTX ID %d is still alive\n",
-				id, cb->ctx->asid);
-	}
-
-	idr_destroy(&mgr->cb_handles);
+	hl_mmap_mem_buf_put(cb->buf);
 }
 
 struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
@@ -649,7 +495,7 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
 	struct hl_cb *cb;
 	int rc;
 
-	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx, cb_size,
+	rc = hl_cb_create(hdev, &hdev->kernel_mem_mgr, hdev->kernel_ctx, cb_size,
 				internal_cb, false, &cb_handle);
 	if (rc) {
 		dev_err(hdev->dev,
@@ -657,8 +503,7 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
 		return NULL;
 	}
 
-	cb_handle >>= PAGE_SHIFT;
-	cb = hl_cb_get(hdev, &hdev->kernel_cb_mgr, (u32) cb_handle);
+	cb = hl_cb_get(&hdev->kernel_mem_mgr, cb_handle);
 	/* hl_cb_get should never fail here */
 	if (!cb) {
 		dev_crit(hdev->dev, "Kernel CB handle invalid 0x%x\n",
@@ -669,7 +514,7 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
 	return cb;
 
 destroy_cb:
-	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb_handle << PAGE_SHIFT);
+	hl_cb_destroy(&hdev->kernel_mem_mgr, cb_handle);
 
 	return NULL;
 }
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index d93ef9f1c45c..fb30b7de4aab 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -407,8 +407,7 @@ static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs)
 
 static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
 {
-	bool next_entry_found = false;
-	struct hl_cs *next, *first_cs;
+	struct hl_cs *next = NULL, *iter, *first_cs;
 
 	if (!cs_needs_timeout(cs))
 		return;
@@ -443,13 +442,13 @@ static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
 	spin_lock(&hdev->cs_mirror_lock);
 
 	/* queue TDR for next CS */
-	list_for_each_entry(next, &hdev->cs_mirror_list, mirror_node)
-		if (cs_needs_timeout(next)) {
-			next_entry_found = true;
+	list_for_each_entry(iter, &hdev->cs_mirror_list, mirror_node)
+		if (cs_needs_timeout(iter)) {
+			next = iter;
 			break;
 		}
 
-	if (next_entry_found && !next->tdr_active) {
+	if (next && !next->tdr_active) {
 		next->tdr_active = true;
 		schedule_delayed_work(&next->work_tdr, next->timeout_jiffies);
 	}
@@ -736,11 +735,10 @@ static void cs_timedout(struct work_struct *work)
 	hdev = cs->ctx->hdev;
 
 	/* Save only the first CS timeout parameters */
-	rc = atomic_cmpxchg(&hdev->last_error.cs_write_disable, 0, 1);
+	rc = atomic_cmpxchg(&hdev->last_error.cs_timeout.write_disable, 0, 1);
 	if (!rc) {
-		hdev->last_error.open_dev_timestamp = hdev->last_successful_open_ktime;
-		hdev->last_error.cs_timeout_timestamp = ktime_get();
-		hdev->last_error.cs_timeout_seq = cs->sequence;
+		hdev->last_error.cs_timeout.timestamp = ktime_get();
+		hdev->last_error.cs_timeout.seq = cs->sequence;
 	}
 
 	switch (cs->type) {
@@ -806,7 +804,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	}
 
 	/* increment refcnt for context */
-	hl_ctx_get(hdev, ctx);
+	hl_ctx_get(ctx);
 
 	cs->ctx = ctx;
 	cs->submitted = false;
@@ -958,9 +956,9 @@ wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
 
 	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
 	list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, wait_list_node) {
-		if (pend->ts_reg_info.ts_buff) {
+		if (pend->ts_reg_info.buf) {
 			list_del(&pend->wait_list_node);
-			hl_ts_put(pend->ts_reg_info.ts_buff);
+			hl_mmap_mem_buf_put(pend->ts_reg_info.buf);
 			hl_cb_put(pend->ts_reg_info.cq_cb);
 		} else {
 			pend->fence.error = -EIO;
@@ -1072,17 +1070,14 @@ static int validate_queue_index(struct hl_device *hdev,
 }
 
 static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
-					struct hl_cb_mgr *cb_mgr,
+					struct hl_mem_mgr *mmg,
 					struct hl_cs_chunk *chunk)
 {
 	struct hl_cb *cb;
-	u32 cb_handle;
 
-	cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
-
-	cb = hl_cb_get(hdev, cb_mgr, cb_handle);
+	cb = hl_cb_get(mmg, chunk->cb_handle);
 	if (!cb) {
-		dev_err(hdev->dev, "CB handle 0x%x invalid\n", cb_handle);
+		dev_err(hdev->dev, "CB handle 0x%llx invalid\n", chunk->cb_handle);
 		return NULL;
 	}
 
@@ -1344,7 +1339,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 		}
 
 		if (is_kernel_allocated_cb) {
-			cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
+			cb = get_cb_from_cs_chunk(hdev, &hpriv->mem_mgr, chunk);
 			if (!cb) {
 				atomic64_inc(
 					&ctx->cs_counters.validation_drop_cnt);
@@ -1772,7 +1767,7 @@ static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
 	 */
 	job->patched_cb = job->user_cb;
 	job->job_cb_size = job->user_cb_size;
-	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
+	hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle);
 
 	/* increment refcount as for external queues we get completion */
 	cs_get(cs);
@@ -1834,7 +1829,7 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
 
 	handle->count = count;
 
-	hl_ctx_get(hdev, hpriv->ctx);
+	hl_ctx_get(hpriv->ctx);
 	handle->ctx = hpriv->ctx;
 	mgr = &hpriv->ctx->sig_mgr;
 
@@ -2528,7 +2523,7 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	if (timestamp)
 		*timestamp = 0;
 
-	hl_ctx_get(hdev, ctx);
+	hl_ctx_get(ctx);
 
 	fence = hl_ctx_get_fence(ctx, seq);
 
@@ -2668,7 +2663,7 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	struct multi_cs_completion *mcs_compl;
 	struct hl_device *hdev = hpriv->hdev;
-	struct multi_cs_data mcs_data = {0};
+	struct multi_cs_data mcs_data = {};
 	union hl_wait_cs_args *args = data;
 	struct hl_ctx *ctx = hpriv->ctx;
 	struct hl_fence **fence_arr;
@@ -2719,7 +2714,7 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	mcs_data.fence_arr = fence_arr;
 	mcs_data.arr_len = seq_arr_len;
 
-	hl_ctx_get(hdev, ctx);
+	hl_ctx_get(ctx);
 
 	/* wait (with timeout) for the first CS to be completed */
 	mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us);
@@ -2868,12 +2863,13 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	return 0;
 }
 
-static int ts_buff_get_kernel_ts_record(struct hl_ts_buff *ts_buff,
+static int ts_buff_get_kernel_ts_record(struct hl_mmap_mem_buf *buf,
 					struct hl_cb *cq_cb,
 					u64 ts_offset, u64 cq_offset, u64 target_value,
 					spinlock_t *wait_list_lock,
 					struct hl_user_pending_interrupt **pend)
 {
+	struct hl_ts_buff *ts_buff = buf->private;
 	struct hl_user_pending_interrupt *requested_offset_record =
 				(struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address +
 				ts_offset;
@@ -2885,7 +2881,7 @@ static int ts_buff_get_kernel_ts_record(struct hl_ts_buff *ts_buff,
 
 	/* Validate ts_offset not exceeding last max */
 	if (requested_offset_record > cb_last) {
-		dev_err(ts_buff->hdev->dev, "Ts offset exceeds max CB offset(0x%llx)\n",
+		dev_err(buf->mmg->dev, "Ts offset exceeds max CB offset(0x%llx)\n",
 								(u64)(uintptr_t)cb_last);
 		return -EINVAL;
 	}
@@ -2904,18 +2900,21 @@ start_over:
 			list_del(&requested_offset_record->wait_list_node);
 			spin_unlock_irqrestore(wait_list_lock, flags);
 
-			hl_ts_put(requested_offset_record->ts_reg_info.ts_buff);
+			hl_mmap_mem_buf_put(requested_offset_record->ts_reg_info.buf);
 			hl_cb_put(requested_offset_record->ts_reg_info.cq_cb);
 
-			dev_dbg(ts_buff->hdev->dev, "ts node removed from interrupt list now can re-use\n");
+			dev_dbg(buf->mmg->dev,
+				"ts node removed from interrupt list now can re-use\n");
 		} else {
-			dev_dbg(ts_buff->hdev->dev, "ts node in middle of irq handling\n");
+			dev_dbg(buf->mmg->dev,
+				"ts node in middle of irq handling\n");
 
 			/* irq handling in the middle give it time to finish */
 			spin_unlock_irqrestore(wait_list_lock, flags);
 			usleep_range(1, 10);
 			if (++iter_counter == MAX_TS_ITER_NUM) {
-				dev_err(ts_buff->hdev->dev, "handling registration interrupt took too long!!\n");
+				dev_err(buf->mmg->dev,
+					"handling registration interrupt took too long!!\n");
 				return -EINVAL;
 			}
 
@@ -2927,7 +2926,7 @@ start_over:
 
 	/* Fill up the new registration node info */
 	requested_offset_record->ts_reg_info.in_use = 1;
-	requested_offset_record->ts_reg_info.ts_buff = ts_buff;
+	requested_offset_record->ts_reg_info.buf = buf;
 	requested_offset_record->ts_reg_info.cq_cb = cq_cb;
 	requested_offset_record->ts_reg_info.timestamp_kernel_addr =
 			(u64 *) ts_buff->user_buff_address + ts_offset;
@@ -2937,21 +2936,20 @@ start_over:
 
 	*pend = requested_offset_record;
 
-	dev_dbg(ts_buff->hdev->dev, "Found available node in TS kernel CB(0x%llx)\n",
+	dev_dbg(buf->mmg->dev, "Found available node in TS kernel CB(0x%llx)\n",
 						(u64)(uintptr_t)requested_offset_record);
 	return 0;
 }
 
 static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
-				struct hl_cb_mgr *cb_mgr, struct hl_ts_mgr *ts_mgr,
+				struct hl_mem_mgr *cb_mmg, struct hl_mem_mgr *mmg,
 				u64 timeout_us, u64 cq_counters_handle,	u64 cq_counters_offset,
 				u64 target_value, struct hl_user_interrupt *interrupt,
 				bool register_ts_record, u64 ts_handle, u64 ts_offset,
 				u32 *status, u64 *timestamp)
 {
-	u32 cq_patched_handle, ts_patched_handle;
 	struct hl_user_pending_interrupt *pend;
-	struct hl_ts_buff *ts_buff;
+	struct hl_mmap_mem_buf *buf;
 	struct hl_cb *cq_cb;
 	unsigned long timeout, flags;
 	long completion_rc;
@@ -2959,10 +2957,9 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 
 	timeout = hl_usecs64_to_jiffies(timeout_us);
 
-	hl_ctx_get(hdev, ctx);
+	hl_ctx_get(ctx);
 
-	cq_patched_handle = lower_32_bits(cq_counters_handle >> PAGE_SHIFT);
-	cq_cb = hl_cb_get(hdev, cb_mgr, cq_patched_handle);
+	cq_cb = hl_cb_get(cb_mmg, cq_counters_handle);
 	if (!cq_cb) {
 		rc = -EINVAL;
 		goto put_ctx;
@@ -2971,16 +2968,14 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	if (register_ts_record) {
 		dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, ts offset: %llu, cq_offset: %llu\n",
 					interrupt->interrupt_id, ts_offset, cq_counters_offset);
-
-		ts_patched_handle = lower_32_bits(ts_handle >> PAGE_SHIFT);
-		ts_buff = hl_ts_get(hdev, ts_mgr, ts_patched_handle);
-		if (!ts_buff) {
+		buf = hl_mmap_mem_buf_get(mmg, ts_handle);
+		if (!buf) {
 			rc = -EINVAL;
 			goto put_cq_cb;
 		}
 
 		/* Find first available record */
-		rc = ts_buff_get_kernel_ts_record(ts_buff, cq_cb, ts_offset,
+		rc = ts_buff_get_kernel_ts_record(buf, cq_cb, ts_offset,
 						cq_counters_offset, target_value,
 						&interrupt->wait_list_lock, &pend);
 		if (rc)
@@ -3087,7 +3082,7 @@ ts_registration_exit:
 	return rc;
 
 put_ts_buff:
-	hl_ts_put(ts_buff);
+	hl_mmap_mem_buf_put(buf);
 put_cq_cb:
 	hl_cb_put(cq_cb);
 put_ctx:
@@ -3111,7 +3106,7 @@ static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_
 
 	timeout = hl_usecs64_to_jiffies(timeout_us);
 
-	hl_ctx_get(hdev, ctx);
+	hl_ctx_get(ctx);
 
 	pend = kzalloc(sizeof(*pend), GFP_KERNEL);
 	if (!pend) {
@@ -3249,7 +3244,7 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 		interrupt = &hdev->user_interrupt[interrupt_id - first_interrupt];
 
 	if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ)
-		rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->cb_mgr, &hpriv->ts_mem_mgr,
+		rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->mem_mgr, &hpriv->mem_mgr,
 				args->in.interrupt_timeout_us, args->in.cq_counters_handle,
 				args->in.cq_counters_offset,
 				args->in.target, interrupt,
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index c6360e33bce8..ed2cfd0c6e99 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -262,7 +262,7 @@ err_hw_block_mem_fini:
 	return rc;
 }
 
-void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx)
+void hl_ctx_get(struct hl_ctx *ctx)
 {
 	kref_get(&ctx->refcount);
 }
@@ -284,7 +284,7 @@ struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev)
 		 * immediately once we find him
 		 */
 		ctx = hpriv->ctx;
-		hl_ctx_get(hdev, ctx);
+		hl_ctx_get(ctx);
 		break;
 	}
 
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index f18495545854..c6744bfc6da4 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
+#include <linux/iommu.h>
 
 #define MMU_ADDR_BUF_SIZE	40
 #define MMU_ASID_BUF_SIZE	10
@@ -125,9 +126,9 @@ static int command_buffers_show(struct seq_file *s, void *data)
 		}
 		seq_printf(s,
 			"   %03llu        %d    0x%08x      %d          %d          %d\n",
-			cb->id, cb->ctx->asid, cb->size,
-			kref_read(&cb->refcount),
-			cb->mmap, atomic_read(&cb->cs_cnt));
+			cb->buf->handle, cb->ctx->asid, cb->size,
+			kref_read(&cb->buf->refcount),
+			atomic_read(&cb->buf->mmap), atomic_read(&cb->cs_cnt));
 	}
 
 	spin_unlock(&dev_entry->cb_spinlock);
@@ -369,8 +370,7 @@ static int userptr_lookup_show(struct seq_file *s, void *data)
 		if (dev_entry->userptr_lookup >= userptr->addr &&
 		dev_entry->userptr_lookup < userptr->addr + userptr->size) {
 			total_npages = 0;
-			for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents,
-					i) {
+			for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
 				npages = hl_get_sg_info(sg, &dma_addr);
 				sg_start = userptr->addr +
 					total_npages * PAGE_SIZE;
@@ -538,6 +538,39 @@ static int engines_show(struct seq_file *s, void *data)
 	return 0;
 }
 
+static ssize_t hl_memory_scrub(struct file *f, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+	struct hl_device *hdev = entry->hdev;
+	u64 val = entry->memory_scrub_val;
+	int rc;
+
+	if (!hl_device_operational(hdev, NULL)) {
+		dev_warn_ratelimited(hdev->dev, "Can't scrub memory, device is not operational\n");
+		return -EIO;
+	}
+
+	mutex_lock(&hdev->fpriv_list_lock);
+	if (hdev->is_compute_ctx_active) {
+		mutex_unlock(&hdev->fpriv_list_lock);
+		dev_err(hdev->dev, "can't scrub dram, context exist\n");
+		return -EBUSY;
+	}
+	hdev->is_in_dram_scrub = true;
+	mutex_unlock(&hdev->fpriv_list_lock);
+
+	rc = hdev->asic_funcs->scrub_device_dram(hdev, val);
+
+	mutex_lock(&hdev->fpriv_list_lock);
+	hdev->is_in_dram_scrub = false;
+	mutex_unlock(&hdev->fpriv_list_lock);
+
+	if (rc)
+		return rc;
+	return count;
+}
+
 static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -647,13 +680,105 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr, u32 size,
 	return rc;
 }
 
+static int hl_access_dev_mem_by_region(struct hl_device *hdev, u64 addr,
+		u64 *val, enum debugfs_access_type acc_type, bool *found)
+{
+	size_t acc_size = (acc_type == DEBUGFS_READ64 || acc_type == DEBUGFS_WRITE64) ?
+		sizeof(u64) : sizeof(u32);
+	struct pci_mem_region *mem_reg;
+	int i;
+
+	for (i = 0; i < PCI_REGION_NUMBER; i++) {
+		mem_reg = &hdev->pci_mem_region[i];
+		if (!mem_reg->used)
+			continue;
+		if (addr >= mem_reg->region_base &&
+			addr <= mem_reg->region_base + mem_reg->region_size - acc_size) {
+			*found = true;
+			return hdev->asic_funcs->access_dev_mem(hdev, mem_reg, i,
+				addr, val, acc_type);
+		}
+	}
+	return 0;
+}
+
+static void hl_access_host_mem(struct hl_device *hdev, u64 addr, u64 *val,
+		enum debugfs_access_type acc_type)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 offset = prop->device_dma_offset_for_host_access;
+
+	switch (acc_type) {
+	case DEBUGFS_READ32:
+		*val = *(u32 *) phys_to_virt(addr - offset);
+		break;
+	case DEBUGFS_WRITE32:
+		*(u32 *) phys_to_virt(addr - offset) = *val;
+		break;
+	case DEBUGFS_READ64:
+		*val = *(u64 *) phys_to_virt(addr - offset);
+		break;
+	case DEBUGFS_WRITE64:
+		*(u64 *) phys_to_virt(addr - offset) = *val;
+		break;
+	default:
+		dev_err(hdev->dev, "hostmem access-type %d id not supported\n", acc_type);
+		break;
+	}
+}
+
+static int hl_access_mem(struct hl_device *hdev, u64 addr, u64 *val,
+	enum debugfs_access_type acc_type)
+{
+	size_t acc_size = (acc_type == DEBUGFS_READ64 || acc_type == DEBUGFS_WRITE64) ?
+		sizeof(u64) : sizeof(u32);
+	u64 host_start = hdev->asic_prop.host_base_address;
+	u64 host_end = hdev->asic_prop.host_end_address;
+	bool user_address, found = false;
+	int rc;
+
+	user_address = hl_is_device_va(hdev, addr);
+	if (user_address) {
+		rc = device_va_to_pa(hdev, addr, acc_size, &addr);
+		if (rc)
+			return rc;
+	}
+
+	rc = hl_access_dev_mem_by_region(hdev, addr, val, acc_type, &found);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed reading addr %#llx from dev mem (%d)\n",
+			addr, rc);
+		return rc;
+	}
+
+	if (found)
+		return 0;
+
+	if (!user_address || device_iommu_mapped(&hdev->pdev->dev)) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	if (addr >= host_start && addr <= host_end - acc_size) {
+		hl_access_host_mem(hdev, addr, val, acc_type);
+	} else {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	return 0;
+err:
+	dev_err(hdev->dev, "invalid addr %#llx\n", addr);
+	return rc;
+}
+
 static ssize_t hl_data_read32(struct file *f, char __user *buf,
 					size_t count, loff_t *ppos)
 {
 	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
 	struct hl_device *hdev = entry->hdev;
-	u64 addr = entry->addr;
-	bool user_address;
+	u64 value64, addr = entry->addr;
 	char tmp_buf[32];
 	ssize_t rc;
 	u32 val;
@@ -666,18 +791,11 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
 	if (*ppos)
 		return 0;
 
-	user_address = hl_is_device_va(hdev, addr);
-	if (user_address) {
-		rc = device_va_to_pa(hdev, addr, sizeof(val), &addr);
-		if (rc)
-			return rc;
-	}
-
-	rc = hdev->asic_funcs->debugfs_read32(hdev, addr, user_address, &val);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to read from 0x%010llx\n", addr);
+	rc = hl_access_mem(hdev, addr, &value64, DEBUGFS_READ32);
+	if (rc)
 		return rc;
-	}
+
+	val = value64; /* downcast back to 32 */
 
 	sprintf(tmp_buf, "0x%08x\n", val);
 	return simple_read_from_buffer(buf, count, ppos, tmp_buf,
@@ -689,8 +807,7 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 {
 	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
 	struct hl_device *hdev = entry->hdev;
-	u64 addr = entry->addr;
-	bool user_address;
+	u64 value64, addr = entry->addr;
 	u32 value;
 	ssize_t rc;
 
@@ -703,19 +820,10 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 	if (rc)
 		return rc;
 
-	user_address = hl_is_device_va(hdev, addr);
-	if (user_address) {
-		rc = device_va_to_pa(hdev, addr, sizeof(value), &addr);
-		if (rc)
-			return rc;
-	}
-
-	rc = hdev->asic_funcs->debugfs_write32(hdev, addr, user_address, value);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to write 0x%08x to 0x%010llx\n",
-			value, addr);
+	value64 = value;
+	rc = hl_access_mem(hdev, addr, &value64, DEBUGFS_WRITE32);
+	if (rc)
 		return rc;
-	}
 
 	return count;
 }
@@ -726,7 +834,6 @@ static ssize_t hl_data_read64(struct file *f, char __user *buf,
 	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
 	struct hl_device *hdev = entry->hdev;
 	u64 addr = entry->addr;
-	bool user_address;
 	char tmp_buf[32];
 	ssize_t rc;
 	u64 val;
@@ -739,18 +846,9 @@ static ssize_t hl_data_read64(struct file *f, char __user *buf,
 	if (*ppos)
 		return 0;
 
-	user_address = hl_is_device_va(hdev, addr);
-	if (user_address) {
-		rc = device_va_to_pa(hdev, addr, sizeof(val), &addr);
-		if (rc)
-			return rc;
-	}
-
-	rc = hdev->asic_funcs->debugfs_read64(hdev, addr, user_address, &val);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to read from 0x%010llx\n", addr);
+	rc = hl_access_mem(hdev, addr, &val, DEBUGFS_READ64);
+	if (rc)
 		return rc;
-	}
 
 	sprintf(tmp_buf, "0x%016llx\n", val);
 	return simple_read_from_buffer(buf, count, ppos, tmp_buf,
@@ -763,7 +861,6 @@ static ssize_t hl_data_write64(struct file *f, const char __user *buf,
 	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
 	struct hl_device *hdev = entry->hdev;
 	u64 addr = entry->addr;
-	bool user_address;
 	u64 value;
 	ssize_t rc;
 
@@ -776,19 +873,9 @@ static ssize_t hl_data_write64(struct file *f, const char __user *buf,
 	if (rc)
 		return rc;
 
-	user_address = hl_is_device_va(hdev, addr);
-	if (user_address) {
-		rc = device_va_to_pa(hdev, addr, sizeof(value), &addr);
-		if (rc)
-			return rc;
-	}
-
-	rc = hdev->asic_funcs->debugfs_write64(hdev, addr, user_address, value);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to write 0x%016llx to 0x%010llx\n",
-			value, addr);
+	rc = hl_access_mem(hdev, addr, &value, DEBUGFS_WRITE64);
+	if (rc)
 		return rc;
-	}
 
 	return count;
 }
@@ -829,23 +916,67 @@ static ssize_t hl_dma_size_write(struct file *f, const char __user *buf,
 	}
 
 	/* Free the previous allocation, if there was any */
-	entry->blob_desc.size = 0;
-	vfree(entry->blob_desc.data);
+	entry->data_dma_blob_desc.size = 0;
+	vfree(entry->data_dma_blob_desc.data);
 
-	entry->blob_desc.data = vmalloc(size);
-	if (!entry->blob_desc.data)
+	entry->data_dma_blob_desc.data = vmalloc(size);
+	if (!entry->data_dma_blob_desc.data)
 		return -ENOMEM;
 
 	rc = hdev->asic_funcs->debugfs_read_dma(hdev, addr, size,
-						entry->blob_desc.data);
+						entry->data_dma_blob_desc.data);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to DMA from 0x%010llx\n", addr);
-		vfree(entry->blob_desc.data);
-		entry->blob_desc.data = NULL;
+		vfree(entry->data_dma_blob_desc.data);
+		entry->data_dma_blob_desc.data = NULL;
+		return -EIO;
+	}
+
+	entry->data_dma_blob_desc.size = size;
+
+	return count;
+}
+
+static ssize_t hl_monitor_dump_trigger(struct file *f, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+	struct hl_device *hdev = entry->hdev;
+	u32 size, trig;
+	ssize_t rc;
+
+	if (hdev->reset_info.in_reset) {
+		dev_warn_ratelimited(hdev->dev, "Can't dump monitors during reset\n");
+		return 0;
+	}
+	rc = kstrtouint_from_user(buf, count, 10, &trig);
+	if (rc)
+		return rc;
+
+	if (trig != 1) {
+		dev_err(hdev->dev, "Must write 1 to trigger monitor dump\n");
+		return -EINVAL;
+	}
+
+	size = sizeof(struct cpucp_monitor_dump);
+
+	/* Free the previous allocation, if there was any */
+	entry->mon_dump_blob_desc.size = 0;
+	vfree(entry->mon_dump_blob_desc.data);
+
+	entry->mon_dump_blob_desc.data = vmalloc(size);
+	if (!entry->mon_dump_blob_desc.data)
+		return -ENOMEM;
+
+	rc = hdev->asic_funcs->get_monitor_dump(hdev, entry->mon_dump_blob_desc.data);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to dump monitors\n");
+		vfree(entry->mon_dump_blob_desc.data);
+		entry->mon_dump_blob_desc.data = NULL;
 		return -EIO;
 	}
 
-	entry->blob_desc.size = size;
+	entry->mon_dump_blob_desc.size = size;
 
 	return count;
 }
@@ -1218,6 +1349,11 @@ static ssize_t hl_timeout_locked_write(struct file *f, const char __user *buf,
 	return count;
 }
 
+static const struct file_operations hl_mem_scrub_fops = {
+	.owner = THIS_MODULE,
+	.write = hl_memory_scrub,
+};
+
 static const struct file_operations hl_data32b_fops = {
 	.owner = THIS_MODULE,
 	.read = hl_data_read32,
@@ -1235,6 +1371,11 @@ static const struct file_operations hl_dma_size_fops = {
 	.write = hl_dma_size_write
 };
 
+static const struct file_operations hl_monitor_dump_fops = {
+	.owner = THIS_MODULE,
+	.write = hl_monitor_dump_trigger
+};
+
 static const struct file_operations hl_i2c_data_fops = {
 	.owner = THIS_MODULE,
 	.read = hl_i2c_data_read,
@@ -1350,8 +1491,10 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 	if (!dev_entry->entry_arr)
 		return;
 
-	dev_entry->blob_desc.size = 0;
-	dev_entry->blob_desc.data = NULL;
+	dev_entry->data_dma_blob_desc.size = 0;
+	dev_entry->data_dma_blob_desc.data = NULL;
+	dev_entry->mon_dump_blob_desc.size = 0;
+	dev_entry->mon_dump_blob_desc.data = NULL;
 
 	INIT_LIST_HEAD(&dev_entry->file_list);
 	INIT_LIST_HEAD(&dev_entry->cb_list);
@@ -1370,6 +1513,17 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 	dev_entry->root = debugfs_create_dir(dev_name(hdev->dev),
 						hl_debug_root);
 
+	debugfs_create_x64("memory_scrub_val",
+				0644,
+				dev_entry->root,
+				&dev_entry->memory_scrub_val);
+
+	debugfs_create_file("memory_scrub",
+				0200,
+				dev_entry->root,
+				dev_entry,
+				&hl_mem_scrub_fops);
+
 	debugfs_create_x64("addr",
 				0644,
 				dev_entry->root,
@@ -1470,7 +1624,18 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 	debugfs_create_blob("data_dma",
 				0400,
 				dev_entry->root,
-				&dev_entry->blob_desc);
+				&dev_entry->data_dma_blob_desc);
+
+	debugfs_create_file("monitor_dump_trig",
+				0200,
+				dev_entry->root,
+				dev_entry,
+				&hl_monitor_dump_fops);
+
+	debugfs_create_blob("monitor_dump",
+				0400,
+				dev_entry->root,
+				&dev_entry->mon_dump_blob_desc);
 
 	debugfs_create_x8("skip_reset_on_timeout",
 				0644,
@@ -1509,7 +1674,8 @@ void hl_debugfs_remove_device(struct hl_device *hdev)
 
 	mutex_destroy(&entry->file_mutex);
 
-	vfree(entry->blob_desc.data);
+	vfree(entry->data_dma_blob_desc.data);
+	vfree(entry->mon_dump_blob_desc.data);
 
 	for (i = 0; i < ARRAY_SIZE(entry->state_dump); ++i)
 		vfree(entry->state_dump[i]);
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index dc9341a64541..b4f14c6d3970 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -15,6 +15,182 @@
 
 #define HL_RESET_DELAY_USEC		10000	/* 10ms */
 
+/*
+ * hl_set_dram_bar- sets the bar to allow later access to address
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @addr: the address the caller wants to access.
+ *
+ * @return: the old BAR base address on success, U64_MAX for failure.
+ *	    The caller should set it back to the old address after use.
+ *
+ * In case the bar space does not cover the whole address space,
+ * the bar base address should be set to allow access to a given address.
+ * This function can be called also if the bar doesn't need to be set,
+ * in that case it just won't change the base.
+ */
+static uint64_t hl_set_dram_bar(struct hl_device *hdev, u64 addr)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 bar_base_addr;
+
+	bar_base_addr = addr & ~(prop->dram_pci_bar_size - 0x1ull);
+
+	return hdev->asic_funcs->set_dram_bar_base(hdev, bar_base_addr);
+}
+
+
+static int hl_access_sram_dram_region(struct hl_device *hdev, u64 addr, u64 *val,
+	enum debugfs_access_type acc_type, enum pci_region region_type)
+{
+	struct pci_mem_region *region = &hdev->pci_mem_region[region_type];
+	u64 old_base, rc;
+
+	if (region_type == PCI_REGION_DRAM) {
+		old_base = hl_set_dram_bar(hdev, addr);
+		if (old_base == U64_MAX)
+			return -EIO;
+	}
+
+	switch (acc_type) {
+	case DEBUGFS_READ8:
+		*val = readb(hdev->pcie_bar[region->bar_id] +
+			addr - region->region_base + region->offset_in_bar);
+		break;
+	case DEBUGFS_WRITE8:
+		writeb(*val, hdev->pcie_bar[region->bar_id] +
+			addr - region->region_base + region->offset_in_bar);
+		break;
+	case DEBUGFS_READ32:
+		*val = readl(hdev->pcie_bar[region->bar_id] +
+			addr - region->region_base + region->offset_in_bar);
+		break;
+	case DEBUGFS_WRITE32:
+		writel(*val, hdev->pcie_bar[region->bar_id] +
+			addr - region->region_base + region->offset_in_bar);
+		break;
+	case DEBUGFS_READ64:
+		*val = readq(hdev->pcie_bar[region->bar_id] +
+			addr - region->region_base + region->offset_in_bar);
+		break;
+	case DEBUGFS_WRITE64:
+		writeq(*val, hdev->pcie_bar[region->bar_id] +
+			addr - region->region_base + region->offset_in_bar);
+		break;
+	}
+
+	if (region_type == PCI_REGION_DRAM) {
+		rc = hl_set_dram_bar(hdev, old_base);
+		if (rc == U64_MAX)
+			return -EIO;
+	}
+
+	return 0;
+}
+
+int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct scatterlist *sg;
+	int rc, i;
+
+	rc = dma_map_sgtable(&hdev->pdev->dev, sgt, dir, 0);
+	if (rc)
+		return rc;
+
+	/* Shift to the device's base physical address of host memory if necessary */
+	if (prop->device_dma_offset_for_host_access)
+		for_each_sgtable_dma_sg(sgt, sg, i)
+			sg->dma_address += prop->device_dma_offset_for_host_access;
+
+	return 0;
+}
+
+void hl_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct scatterlist *sg;
+	int i;
+
+	/* Cancel the device's base physical address of host memory if necessary */
+	if (prop->device_dma_offset_for_host_access)
+		for_each_sgtable_dma_sg(sgt, sg, i)
+			sg->dma_address -= prop->device_dma_offset_for_host_access;
+
+	dma_unmap_sgtable(&hdev->pdev->dev, sgt, dir, 0);
+}
+
+/*
+ * hl_access_cfg_region - access the config region
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @addr: the address to access
+ * @val: the value to write from or read to
+ * @acc_type: the type of access (read/write 64/32)
+ */
+int hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val,
+	enum debugfs_access_type acc_type)
+{
+	struct pci_mem_region *cfg_region = &hdev->pci_mem_region[PCI_REGION_CFG];
+	u32 val_h, val_l;
+
+	if (!IS_ALIGNED(addr, sizeof(u32))) {
+		dev_err(hdev->dev, "address %#llx not a multiple of %zu\n", addr, sizeof(u32));
+		return -EINVAL;
+	}
+
+	switch (acc_type) {
+	case DEBUGFS_READ32:
+		*val = RREG32(addr - cfg_region->region_base);
+		break;
+	case DEBUGFS_WRITE32:
+		WREG32(addr - cfg_region->region_base, *val);
+		break;
+	case DEBUGFS_READ64:
+		val_l = RREG32(addr - cfg_region->region_base);
+		val_h = RREG32(addr + sizeof(u32) - cfg_region->region_base);
+
+		*val = (((u64) val_h) << 32) | val_l;
+		break;
+	case DEBUGFS_WRITE64:
+		WREG32(addr - cfg_region->region_base, lower_32_bits(*val));
+		WREG32(addr + sizeof(u32) - cfg_region->region_base, upper_32_bits(*val));
+		break;
+	default:
+		dev_err(hdev->dev, "access type %d is not supported\n", acc_type);
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/*
+ * hl_access_dev_mem - access device memory
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @region: the memory region the address belongs to
+ * @region_type: the type of the region the address belongs to
+ * @addr: the address to access
+ * @val: the value to write from or read to
+ * @acc_type: the type of access (r/w, 32/64)
+ */
+int hl_access_dev_mem(struct hl_device *hdev, struct pci_mem_region *region,
+		enum pci_region region_type, u64 addr, u64 *val, enum debugfs_access_type acc_type)
+{
+	switch (region_type) {
+	case PCI_REGION_CFG:
+		return hl_access_cfg_region(hdev, addr, val, acc_type);
+	case PCI_REGION_SRAM:
+	case PCI_REGION_DRAM:
+		return hl_access_sram_dram_region(hdev, addr, val, acc_type,
+			region_type);
+	default:
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
 enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
 	enum hl_device_status status;
@@ -107,6 +283,14 @@ static void hpriv_release(struct kref *ref)
 	hdev->is_compute_ctx_active = false;
 	mutex_unlock(&hdev->fpriv_list_lock);
 
+	hdev->compute_ctx_in_release = 0;
+
+	/* release the eventfd */
+	if (hpriv->notifier_event.eventfd)
+		eventfd_ctx_put(hpriv->notifier_event.eventfd);
+
+	mutex_destroy(&hpriv->notifier_event.lock);
+
 	kfree(hpriv);
 }
 
@@ -146,10 +330,11 @@ static int hl_device_release(struct inode *inode, struct file *filp)
 	 */
 	hl_release_pending_user_interrupts(hpriv->hdev);
 
-	hl_cb_mgr_fini(hdev, &hpriv->cb_mgr);
-	hl_ts_mgr_fini(hpriv->hdev, &hpriv->ts_mem_mgr);
+	hl_mem_mgr_fini(&hpriv->mem_mgr);
 	hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
 
+	hdev->compute_ctx_in_release = 1;
+
 	if (!hl_hpriv_put(hpriv))
 		dev_notice(hdev->dev,
 			"User process closed FD but device still in use\n");
@@ -176,6 +361,11 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
 	list_del(&hpriv->dev_node);
 	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
 out:
+	/* release the eventfd */
+	if (hpriv->notifier_event.eventfd)
+		eventfd_ctx_put(hpriv->notifier_event.eventfd);
+
+	mutex_destroy(&hpriv->notifier_event.lock);
 	put_pid(hpriv->taskpid);
 
 	kfree(hpriv);
@@ -204,17 +394,15 @@ static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
 	}
 
 	vm_pgoff = vma->vm_pgoff;
-	vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
 
 	switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
-	case HL_MMAP_TYPE_CB:
-		return hl_cb_mmap(hpriv, vma);
-
 	case HL_MMAP_TYPE_BLOCK:
+		vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
 		return hl_hw_block_mmap(hpriv, vma);
 
+	case HL_MMAP_TYPE_CB:
 	case HL_MMAP_TYPE_TS_BUFF:
-		return hl_ts_mmap(hpriv, vma);
+		return hl_mem_mgr_mmap(&hpriv->mem_mgr, vma, NULL);
 	}
 
 	return -EINVAL;
@@ -424,18 +612,25 @@ static int device_early_init(struct hl_device *hdev)
 		goto free_eq_wq;
 	}
 
+	hdev->pf_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
+	if (!hdev->pf_wq) {
+		dev_err(hdev->dev, "Failed to allocate MMU prefetch workqueue\n");
+		rc = -ENOMEM;
+		goto free_ts_free_wq;
+	}
+
 	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
 					GFP_KERNEL);
 	if (!hdev->hl_chip_info) {
 		rc = -ENOMEM;
-		goto free_ts_free_wq;
+		goto free_pf_wq;
 	}
 
 	rc = hl_mmu_if_set_funcs(hdev);
 	if (rc)
 		goto free_chip_info;
 
-	hl_cb_mgr_init(&hdev->kernel_cb_mgr);
+	hl_mem_mgr_init(hdev->dev, &hdev->kernel_mem_mgr);
 
 	hdev->device_reset_work.wq =
 			create_singlethread_workqueue("hl_device_reset");
@@ -464,9 +659,11 @@ static int device_early_init(struct hl_device *hdev)
 	return 0;
 
 free_cb_mgr:
-	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
+	hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
 free_chip_info:
 	kfree(hdev->hl_chip_info);
+free_pf_wq:
+	destroy_workqueue(hdev->pf_wq);
 free_ts_free_wq:
 	destroy_workqueue(hdev->ts_free_obj_wq);
 free_eq_wq:
@@ -503,10 +700,11 @@ static void device_early_fini(struct hl_device *hdev)
 
 	mutex_destroy(&hdev->clk_throttling.lock);
 
-	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
+	hl_mem_mgr_fini(&hdev->kernel_mem_mgr);
 
 	kfree(hdev->hl_chip_info);
 
+	destroy_workqueue(hdev->pf_wq);
 	destroy_workqueue(hdev->ts_free_obj_wq);
 	destroy_workqueue(hdev->eq_wq);
 	destroy_workqueue(hdev->device_reset_work.wq);
@@ -703,6 +901,9 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_r
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev, skip_wq_flush);
 
+	/* flush the MMU prefetch workqueue */
+	flush_workqueue(hdev->pf_wq);
+
 	/* Release all pending user interrupts, each pending user interrupt
 	 * holds a reference to user context
 	 */
@@ -847,10 +1048,13 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout, bool
 
 			put_task_struct(task);
 		} else {
-			dev_warn(hdev->dev,
-				"Can't get task struct for PID so giving up on killing process\n");
-			mutex_unlock(fd_lock);
-			return -ETIME;
+			/*
+			 * If we got here, it means that process was killed from outside the driver
+			 * right after it started looping on fd_list and before get_pid_task, thus
+			 * we don't need to kill it.
+			 */
+			dev_dbg(hdev->dev,
+				"Can't get task struct for user process, assuming process was killed from outside the driver\n");
 		}
 	}
 
@@ -1062,9 +1266,9 @@ do_reset:
 		if (hard_reset)
 			dev_info(hdev->dev, "Going to reset device\n");
 		else if (reset_upon_device_release)
-			dev_info(hdev->dev, "Going to reset device after release by user\n");
+			dev_dbg(hdev->dev, "Going to reset device after release by user\n");
 		else
-			dev_info(hdev->dev, "Going to reset engines of inference device\n");
+			dev_dbg(hdev->dev, "Going to reset engines of inference device\n");
 	}
 
 again:
@@ -1270,7 +1474,10 @@ kill_processes:
 
 	hdev->reset_info.needs_reset = false;
 
-	dev_notice(hdev->dev, "Successfully finished resetting the device\n");
+	if (hard_reset)
+		dev_info(hdev->dev, "Successfully finished resetting the device\n");
+	else
+		dev_dbg(hdev->dev, "Successfully finished resetting the device\n");
 
 	if (hard_reset) {
 		hdev->reset_info.hard_reset_cnt++;
@@ -1323,6 +1530,43 @@ out_err:
 	return rc;
 }
 
+static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event)
+{
+	mutex_lock(&notifier_event->lock);
+	notifier_event->events_mask |= event;
+	if (notifier_event->eventfd)
+		eventfd_signal(notifier_event->eventfd, 1);
+
+	mutex_unlock(&notifier_event->lock);
+}
+
+/*
+ * hl_notifier_event_send_all - notify all user processes via eventfd
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @event: the occurred event
+ * Returns 0 for success or an error on failure.
+ */
+void hl_notifier_event_send_all(struct hl_device *hdev, u64 event)
+{
+	struct hl_fpriv	*hpriv;
+
+	mutex_lock(&hdev->fpriv_list_lock);
+
+	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
+		hl_notifier_event_send(&hpriv->notifier_event, event);
+
+	mutex_unlock(&hdev->fpriv_list_lock);
+
+	/* control device */
+	mutex_lock(&hdev->fpriv_ctrl_list_lock);
+
+	list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node)
+		hl_notifier_event_send(&hpriv->notifier_event, event);
+
+	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
+}
+
 /*
  * hl_device_init - main initialization function for habanalabs device
  *
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 3262126cc7ca..828a36af5b14 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -18,8 +18,9 @@
 static char *extract_fw_ver_from_str(const char *fw_str)
 {
 	char *str, *fw_ver, *whitespace;
+	u32 ver_offset;
 
-	fw_ver = kmalloc(16, GFP_KERNEL);
+	fw_ver = kmalloc(VERSION_MAX_LEN, GFP_KERNEL);
 	if (!fw_ver)
 		return NULL;
 
@@ -29,9 +30,10 @@ static char *extract_fw_ver_from_str(const char *fw_str)
 
 	/* Skip the fw- part */
 	str += 3;
+	ver_offset = str - fw_str;
 
 	/* Copy until the next whitespace */
-	whitespace =  strnstr(str, " ", 15);
+	whitespace =  strnstr(str, " ", VERSION_MAX_LEN - ver_offset);
 	if (!whitespace)
 		goto free_fw_ver;
 
@@ -819,6 +821,54 @@ out:
 	return rc;
 }
 
+int hl_fw_get_monitor_dump(struct hl_device *hdev, void *data)
+{
+	struct cpucp_monitor_dump *mon_dump_cpu_addr;
+	dma_addr_t mon_dump_dma_addr;
+	struct cpucp_packet pkt = {};
+	size_t data_size;
+	__le32 *src_ptr;
+	u32 *dst_ptr;
+	u64 result;
+	int i, rc;
+
+	data_size = sizeof(struct cpucp_monitor_dump);
+	mon_dump_cpu_addr = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, data_size,
+										&mon_dump_dma_addr);
+	if (!mon_dump_cpu_addr) {
+		dev_err(hdev->dev,
+			"Failed to allocate DMA memory for CPU-CP monitor-dump packet\n");
+		return -ENOMEM;
+	}
+
+	memset(mon_dump_cpu_addr, 0, data_size);
+
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_MONITOR_DUMP_GET << CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.addr = cpu_to_le64(mon_dump_dma_addr);
+	pkt.data_max_size = cpu_to_le32(data_size);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+							HL_CPUCP_MON_DUMP_TIMEOUT_USEC, &result);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to handle CPU-CP monitor-dump packet, error %d\n", rc);
+		goto out;
+	}
+
+	/* result contains the actual size */
+	src_ptr = (__le32 *) mon_dump_cpu_addr;
+	dst_ptr = data;
+	for (i = 0; i < (data_size / sizeof(u32)); i++) {
+		*dst_ptr = le32_to_cpu(*src_ptr);
+		src_ptr++;
+		dst_ptr++;
+	}
+
+out:
+	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev, data_size, mon_dump_cpu_addr);
+
+	return rc;
+}
+
 int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
 		struct hl_info_pci_counters *counters)
 {
@@ -1539,7 +1589,7 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev,
 		le32_to_cpu(dyn_regs->cpu_cmd_status_to_host),
 		status,
 		FIELD_GET(COMMS_STATUS_STATUS_MASK, status) == expected_status,
-		hdev->fw_poll_interval_usec,
+		hdev->fw_comms_poll_interval_usec,
 		timeout);
 
 	if (rc) {
@@ -1909,7 +1959,7 @@ static int hl_fw_dynamic_request_descriptor(struct hl_device *hdev,
  * @fwc: the firmware component
  * @fw_version: fw component's version string
  */
-static void hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
+static int hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
 					enum hl_fw_component fwc,
 					const char *fw_version)
 {
@@ -1933,23 +1983,33 @@ static void hl_fw_dynamic_read_device_fw_version(struct hl_device *hdev,
 						VERSION_MAX_LEN);
 		if (preboot_ver && preboot_ver != prop->preboot_ver) {
 			strscpy(btl_ver, prop->preboot_ver,
-				min((int) (preboot_ver - prop->preboot_ver),
-									31));
+				min((int) (preboot_ver - prop->preboot_ver), 31));
 			dev_info(hdev->dev, "%s\n", btl_ver);
 		}
 
 		preboot_ver = extract_fw_ver_from_str(prop->preboot_ver);
 		if (preboot_ver) {
-			dev_info(hdev->dev, "preboot version %s\n",
-								preboot_ver);
+			char major[8];
+			int rc;
+
+			dev_info(hdev->dev, "preboot version %s\n", preboot_ver);
+			sprintf(major, "%.2s", preboot_ver);
 			kfree(preboot_ver);
+
+			rc = kstrtou32(major, 10, &hdev->fw_major_version);
+			if (rc) {
+				dev_err(hdev->dev, "Error %d parsing preboot major version\n", rc);
+				return rc;
+			}
 		}
 
 		break;
 	default:
 		dev_warn(hdev->dev, "Undefined FW component: %d\n", fwc);
-		return;
+		return -EINVAL;
 	}
+
+	return 0;
 }
 
 /**
@@ -2121,9 +2181,10 @@ static int hl_fw_dynamic_load_image(struct hl_device *hdev,
 		goto release_fw;
 
 	/* read preboot version */
-	hl_fw_dynamic_read_device_fw_version(hdev, cur_fwc,
+	rc = hl_fw_dynamic_read_device_fw_version(hdev, cur_fwc,
 				fw_loader->dynamic_loader.comm_desc.cur_fw_ver);
-
+	if (rc)
+		goto release_fw;
 
 	/* update state according to boot stage */
 	if (cur_fwc == FW_COMP_BOOT_FIT) {
@@ -2390,9 +2451,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 			goto protocol_err;
 
 		/* read preboot version */
-		hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT,
+		return hl_fw_dynamic_read_device_fw_version(hdev, FW_COMP_PREBOOT,
 				fw_loader->dynamic_loader.comm_desc.cur_fw_ver);
-		return 0;
 	}
 
 	/* load boot fit to FW */
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 1edaf6ab67bd..b0b0f3f89865 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -21,6 +21,7 @@
 #include <linux/hashtable.h>
 #include <linux/debugfs.h>
 #include <linux/rwsem.h>
+#include <linux/eventfd.h>
 #include <linux/bitfield.h>
 #include <linux/genalloc.h>
 #include <linux/sched/signal.h>
@@ -61,8 +62,10 @@
 
 #define HL_CPUCP_INFO_TIMEOUT_USEC	10000000 /* 10s */
 #define HL_CPUCP_EEPROM_TIMEOUT_USEC	10000000 /* 10s */
+#define HL_CPUCP_MON_DUMP_TIMEOUT_USEC	10000000 /* 10s */
 
 #define HL_FW_STATUS_POLL_INTERVAL_USEC		10000 /* 10ms */
+#define HL_FW_COMMS_STATUS_PLDM_POLL_INTERVAL_USEC	1000000 /* 1s */
 
 #define HL_PCI_ELBI_TIMEOUT_MSEC	10 /* 10ms */
 
@@ -394,18 +397,8 @@ enum hl_device_hw_state {
  * struct hl_mmu_properties - ASIC specific MMU address translation properties.
  * @start_addr: virtual start address of the memory region.
  * @end_addr: virtual end address of the memory region.
- * @hop0_shift: shift of hop 0 mask.
- * @hop1_shift: shift of hop 1 mask.
- * @hop2_shift: shift of hop 2 mask.
- * @hop3_shift: shift of hop 3 mask.
- * @hop4_shift: shift of hop 4 mask.
- * @hop5_shift: shift of hop 5 mask.
- * @hop0_mask: mask to get the PTE address in hop 0.
- * @hop1_mask: mask to get the PTE address in hop 1.
- * @hop2_mask: mask to get the PTE address in hop 2.
- * @hop3_mask: mask to get the PTE address in hop 3.
- * @hop4_mask: mask to get the PTE address in hop 4.
- * @hop5_mask: mask to get the PTE address in hop 5.
+ * @hop_shifts: array holds HOPs shifts.
+ * @hop_masks: array holds HOPs masks.
  * @last_mask: mask to get the bit indicating this is the last hop.
  * @pgt_size: size for page tables.
  * @page_size: default page size used to allocate memory.
@@ -418,18 +411,8 @@ enum hl_device_hw_state {
 struct hl_mmu_properties {
 	u64	start_addr;
 	u64	end_addr;
-	u64	hop0_shift;
-	u64	hop1_shift;
-	u64	hop2_shift;
-	u64	hop3_shift;
-	u64	hop4_shift;
-	u64	hop5_shift;
-	u64	hop0_mask;
-	u64	hop1_mask;
-	u64	hop2_mask;
-	u64	hop3_mask;
-	u64	hop4_mask;
-	u64	hop5_mask;
+	u64	hop_shifts[MMU_HOP_MAX];
+	u64	hop_masks[MMU_HOP_MAX];
 	u64	last_mask;
 	u64	pgt_size;
 	u32	page_size;
@@ -486,8 +469,10 @@ struct hl_hints_range {
  *                  the device's MMU.
  * @dram_hints_align_mask: dram va hint addresses alignment mask which is used
  *                  for hints validity check.
- * device_dma_offset_for_host_access: the offset to add to host DMA addresses
- *                                    to enable the device to access them.
+ * @device_dma_offset_for_host_access: the offset to add to host DMA addresses
+ *                                     to enable the device to access them.
+ * @host_base_address: host physical start address for host DMA from device
+ * @host_end_address: host physical end address for host DMA from device
  * @max_freq_value: current max clk frequency.
  * @clk_pll_index: clock PLL index that specify which PLL determines the clock
  *                 we display to the user
@@ -528,6 +513,10 @@ struct hl_hints_range {
  * @fw_app_cpu_boot_dev_sts1: bitmap representation of application security
  *                            status reported by FW, bit description can be
  *                            found in CPU_BOOT_DEV_STS1
+ * @device_mem_alloc_default_page_size: may be different than dram_page_size only for ASICs for
+ *                                      which the property supports_user_set_page_size is true
+ *                                      (i.e. the DRAM supports multiple page sizes), otherwise
+ *                                      it will shall  be equal to dram_page_size.
  * @collective_first_sob: first sync object available for collective use
  * @collective_first_mon: first monitor available for collective use
  * @sync_stream_first_sob: first sync object available for sync stream use
@@ -568,6 +557,7 @@ struct hl_hints_range {
  * @configurable_stop_on_err: is stop-on-error option configurable via debugfs.
  * @set_max_power_on_device_init: true if need to set max power in F/W on device init.
  * @supports_user_set_page_size: true if user can set the allocation page size.
+ * @dma_mask: the dma mask to be set for this device
  */
 struct asic_fixed_properties {
 	struct hw_queue_properties	*hw_queues_props;
@@ -599,6 +589,8 @@ struct asic_fixed_properties {
 	u64				cb_va_end_addr;
 	u64				dram_hints_align_mask;
 	u64				device_dma_offset_for_host_access;
+	u64				host_base_address;
+	u64				host_end_address;
 	u64				max_freq_value;
 	u32				clk_pll_index;
 	u32				mmu_pgt_size;
@@ -626,6 +618,7 @@ struct asic_fixed_properties {
 	u32				fw_bootfit_cpu_boot_dev_sts1;
 	u32				fw_app_cpu_boot_dev_sts0;
 	u32				fw_app_cpu_boot_dev_sts1;
+	u32				device_mem_alloc_default_page_size;
 	u16				collective_first_sob;
 	u16				collective_first_mon;
 	u16				sync_stream_first_sob;
@@ -654,6 +647,7 @@ struct asic_fixed_properties {
 	u8				configurable_stop_on_err;
 	u8				set_max_power_on_device_init;
 	u8				supports_user_set_page_size;
+	u8				dma_mask;
 };
 
 /**
@@ -711,85 +705,102 @@ struct hl_cs_compl {
  */
 
 /**
- * struct hl_cb_mgr - describes a Command Buffer Manager.
- * @cb_lock: protects cb_handles.
- * @cb_handles: an idr to hold all command buffer handles.
- */
-struct hl_cb_mgr {
-	spinlock_t		cb_lock;
-	struct idr		cb_handles; /* protected by cb_lock */
-};
-
-/**
- * struct hl_ts_mgr - describes the timestamp registration memory manager.
- * @ts_lock: protects ts_handles.
- * @ts_handles: an idr to hold all ts bufferes handles.
- */
-struct hl_ts_mgr {
-	spinlock_t		ts_lock;
-	struct idr		ts_handles;
-};
-
-/**
  * struct hl_ts_buff - describes a timestamp buffer.
- * @refcount: reference counter for usage of the buffer.
- * @hdev: pointer to device this buffer belongs to.
- * @mmap: true if the buff is currently mapped to user.
  * @kernel_buff_address: Holds the internal buffer's kernel virtual address.
  * @user_buff_address: Holds the user buffer's kernel virtual address.
- * @id: the buffer ID.
- * @mmap_size: Holds the buffer size that was mmaped.
  * @kernel_buff_size: Holds the internal kernel buffer size.
- * @user_buff_size: Holds the user buffer size.
  */
 struct hl_ts_buff {
-	struct kref		refcount;
-	struct hl_device	*hdev;
-	atomic_t		mmap;
 	void			*kernel_buff_address;
 	void			*user_buff_address;
-	u32			id;
-	u32			mmap_size;
 	u32			kernel_buff_size;
-	u32			user_buff_size;
+};
+
+struct hl_mmap_mem_buf;
+
+/**
+ * struct hl_mem_mgr - describes unified memory manager for mappable memory chunks.
+ * @dev: back pointer to the owning device
+ * @lock: protects handles
+ * @handles: an idr holding all active handles to the memory buffers in the system.
+ */
+struct hl_mem_mgr {
+	struct device *dev;
+	spinlock_t lock;
+	struct idr handles;
+};
+
+/**
+ * struct hl_mmap_mem_buf_behavior - describes unified memory manager buffer behavior
+ * @topic: string identifier used for logging
+ * @mem_id: memory type identifier, embedded in the handle and used to identify
+ *          the memory type by handle.
+ * @alloc: callback executed on buffer allocation, shall allocate the memory,
+ *         set it under buffer private, and set mappable size.
+ * @mmap: callback executed on mmap, must map the buffer to vma
+ * @release: callback executed on release, must free the resources used by the buffer
+ */
+struct hl_mmap_mem_buf_behavior {
+	const char *topic;
+	u64 mem_id;
+
+	int (*alloc)(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args);
+	int (*mmap)(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, void *args);
+	void (*release)(struct hl_mmap_mem_buf *buf);
+};
+
+/**
+ * struct hl_mmap_mem_buf - describes a single unified memory buffer
+ * @behavior: buffer behavior
+ * @mmg: back pointer to the unified memory manager
+ * @refcount: reference counter for buffer users
+ * @private: pointer to buffer behavior private data
+ * @mmap: atomic boolean indicating whether or not the buffer is mapped right now
+ * @real_mapped_size: the actual size of buffer mapped, after part of it may be released,
+ *                   may change at runtime.
+ * @mappable_size: the original mappable size of the buffer, does not change after
+ *                 the allocation.
+ * @handle: the buffer id in mmg handles store
+ */
+struct hl_mmap_mem_buf {
+	struct hl_mmap_mem_buf_behavior *behavior;
+	struct hl_mem_mgr *mmg;
+	struct kref refcount;
+	void *private;
+	atomic_t mmap;
+	u64 real_mapped_size;
+	u64 mappable_size;
+	u64 handle;
 };
 
 /**
  * struct hl_cb - describes a Command Buffer.
- * @refcount: reference counter for usage of the CB.
  * @hdev: pointer to device this CB belongs to.
  * @ctx: pointer to the CB owner's context.
- * @lock: spinlock to protect mmap flows.
+ * @buf: back pointer to the parent mappable memory buffer
  * @debugfs_list: node in debugfs list of command buffers.
  * @pool_list: node in pool list of command buffers.
  * @va_block_list: list of virtual addresses blocks of the CB if it is mapped to
  *                 the device's MMU.
- * @id: the CB's ID.
  * @kernel_address: Holds the CB's kernel virtual address.
  * @bus_address: Holds the CB's DMA address.
- * @mmap_size: Holds the CB's size that was mmaped.
  * @size: holds the CB's size.
  * @cs_cnt: holds number of CS that this CB participates in.
- * @mmap: true if the CB is currently mmaped to user.
  * @is_pool: true if CB was acquired from the pool, false otherwise.
  * @is_internal: internaly allocated
  * @is_mmu_mapped: true if the CB is mapped to the device's MMU.
  */
 struct hl_cb {
-	struct kref		refcount;
 	struct hl_device	*hdev;
 	struct hl_ctx		*ctx;
-	spinlock_t		lock;
+	struct hl_mmap_mem_buf	*buf;
 	struct list_head	debugfs_list;
 	struct list_head	pool_list;
 	struct list_head	va_block_list;
-	u64			id;
 	void			*kernel_address;
 	dma_addr_t		bus_address;
-	u32			mmap_size;
 	u32			size;
 	atomic_t		cs_cnt;
-	u8			mmap;
 	u8			is_pool;
 	u8			is_internal;
 	u8			is_mmu_mapped;
@@ -935,12 +946,12 @@ struct hl_user_interrupt {
  * struct timestamp_reg_free_node - holds the timestamp registration free objects node
  * @free_objects_node: node in the list free_obj_jobs
  * @cq_cb: pointer to cq command buffer to be freed
- * @ts_buff: pointer to timestamp buffer to be freed
+ * @buf: pointer to timestamp buffer to be freed
  */
 struct timestamp_reg_free_node {
 	struct list_head	free_objects_node;
 	struct hl_cb		*cq_cb;
-	struct hl_ts_buff	*ts_buff;
+	struct hl_mmap_mem_buf	*buf;
 };
 
 /* struct timestamp_reg_work_obj - holds the timestamp registration free objects job
@@ -957,8 +968,8 @@ struct timestamp_reg_work_obj {
 };
 
 /* struct timestamp_reg_info - holds the timestamp registration related data.
- * @ts_buff: pointer to the timestamp buffer which include both user/kernel buffers.
- *           relevant only when doing timestamps records registration.
+ * @buf: pointer to the timestamp buffer which include both user/kernel buffers.
+ *       relevant only when doing timestamps records registration.
  * @cq_cb: pointer to CQ counter CB.
  * @timestamp_kernel_addr: timestamp handle address, where to set timestamp
  *                         relevant only when doing timestamps records
@@ -969,7 +980,7 @@ struct timestamp_reg_work_obj {
  *          allocating records dynamically.
  */
 struct timestamp_reg_info {
-	struct hl_ts_buff	*ts_buff;
+	struct hl_mmap_mem_buf	*buf;
 	struct hl_cb		*cq_cb;
 	u64			*timestamp_kernel_addr;
 	u8			in_use;
@@ -1068,6 +1079,15 @@ enum div_select_defs {
 	DIV_SEL_DIVIDED_PLL = 3,
 };
 
+enum debugfs_access_type {
+	DEBUGFS_READ8,
+	DEBUGFS_WRITE8,
+	DEBUGFS_READ32,
+	DEBUGFS_WRITE32,
+	DEBUGFS_READ64,
+	DEBUGFS_WRITE64,
+};
+
 enum pci_region {
 	PCI_REGION_CFG,
 	PCI_REGION_SRAM,
@@ -1229,6 +1249,7 @@ struct fw_load_mgr {
  *                           its implementation is not trivial when the driver
  *                           is loaded in simulation mode (not upstreamed).
  * @scrub_device_mem: Scrub device memory given an address and size
+ * @scrub_device_dram: Scrub the dram memory of the device.
  * @get_int_queue_base: get the internal queue base address.
  * @test_queues: run simple test on all queues for sanity check.
  * @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
@@ -1236,18 +1257,14 @@ struct fw_load_mgr {
  * @asic_dma_pool_free: free small DMA allocation from pool.
  * @cpu_accessible_dma_pool_alloc: allocate CPU PQ packet from DMA pool.
  * @cpu_accessible_dma_pool_free: free CPU PQ packet from DMA pool.
- * @hl_dma_unmap_sg: DMA unmap scatter-gather list.
+ * @hl_dma_unmap_sgtable: DMA unmap scatter-gather table.
  * @cs_parser: parse Command Submission.
- * @asic_dma_map_sg: DMA map scatter-gather list.
+ * @asic_dma_map_sgtable: DMA map scatter-gather table.
  * @get_dma_desc_list_size: get number of LIN_DMA packets required for CB.
  * @add_end_of_cb_packets: Add packets to the end of CB, if device requires it.
  * @update_eq_ci: update event queue CI.
  * @context_switch: called upon ASID context switch.
  * @restore_phase_topology: clear all SOBs amd MONs.
- * @debugfs_read32: debug interface for reading u32 from DRAM/SRAM/Host memory.
- * @debugfs_write32: debug interface for writing u32 to DRAM/SRAM/Host memory.
- * @debugfs_read64: debug interface for reading u64 from DRAM/SRAM/Host memory.
- * @debugfs_write64: debug interface for writing u64 to DRAM/SRAM/Host memory.
  * @debugfs_read_dma: debug interface for reading up to 2MB from the device's
  *                    internal memory via DMA engine.
  * @add_device_attr: add ASIC specific device attributes.
@@ -1257,8 +1274,8 @@ struct fw_load_mgr {
  * @write_pte: write MMU page table entry to DRAM.
  * @mmu_invalidate_cache: flush MMU STLB host/DRAM cache, either with soft
  *                        (L1 only) or hard (L0 & L1) flush.
- * @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with
- *                              ASID-VA-size mask.
+ * @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with ASID-VA-size mask.
+ * @mmu_prefetch_cache_range: pre-fetch specific MMU STLB cache lines with ASID-VA-size mask.
  * @send_heartbeat: send is-alive packet to CPU-CP and verify response.
  * @debug_coresight: perform certain actions on Coresight for debugging.
  * @is_device_idle: return true if device is idle, false otherwise.
@@ -1267,6 +1284,7 @@ struct fw_load_mgr {
  * @hw_queues_unlock: release H/W queues lock.
  * @get_pci_id: retrieve PCI ID.
  * @get_eeprom_data: retrieve EEPROM data from F/W.
+ * @get_monitor_dump: retrieve monitor registers dump from F/W.
  * @send_cpu_message: send message to F/W. If the message is timedout, the
  *                    driver will eventually reset the device. The timeout can
  *                    be determined by the calling function or it can be 0 and
@@ -1289,8 +1307,6 @@ struct fw_load_mgr {
  * @gen_wait_cb: Generate a wait CB.
  * @reset_sob: Reset a SOB.
  * @reset_sob_group: Reset SOB group
- * @set_dma_mask_from_fw: set the DMA mask in the driver according to the
- *                        firmware configuration
  * @get_device_time: Get the device time.
  * @collective_wait_init_cs: Generate collective master/slave packets
  *                           and place them in the relevant cs jobs
@@ -1319,6 +1335,9 @@ struct fw_load_mgr {
  * @get_stream_master_qid_arr: get pointer to stream masters QID array
  * @is_valid_dram_page_size: return true if page size is supported in device
  *                           memory allocation, otherwise false.
+ * @get_valid_dram_page_orders: get valid device memory allocation page orders
+ * @access_dev_mem: access device memory
+ * @set_dram_bar_base: set the base of the DRAM BAR
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -1342,6 +1361,7 @@ struct hl_asic_funcs {
 	void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size,
 					void *cpu_addr, dma_addr_t dma_handle);
 	int (*scrub_device_mem)(struct hl_device *hdev, u64 addr, u64 size);
+	int (*scrub_device_dram)(struct hl_device *hdev, u64 val);
 	void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
 				dma_addr_t *dma_handle, u16 *queue_len);
 	int (*test_queues)(struct hl_device *hdev);
@@ -1353,12 +1373,11 @@ struct hl_asic_funcs {
 				size_t size, dma_addr_t *dma_handle);
 	void (*cpu_accessible_dma_pool_free)(struct hl_device *hdev,
 				size_t size, void *vaddr);
-	void (*hl_dma_unmap_sg)(struct hl_device *hdev,
-				struct scatterlist *sgl, int nents,
+	void (*hl_dma_unmap_sgtable)(struct hl_device *hdev,
+				struct sg_table *sgt,
 				enum dma_data_direction dir);
 	int (*cs_parser)(struct hl_device *hdev, struct hl_cs_parser *parser);
-	int (*asic_dma_map_sg)(struct hl_device *hdev,
-				struct scatterlist *sgl, int nents,
+	int (*asic_dma_map_sgtable)(struct hl_device *hdev, struct sg_table *sgt,
 				enum dma_data_direction dir);
 	u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
 					struct sg_table *sgt);
@@ -1369,14 +1388,6 @@ struct hl_asic_funcs {
 	void (*update_eq_ci)(struct hl_device *hdev, u32 val);
 	int (*context_switch)(struct hl_device *hdev, u32 asid);
 	void (*restore_phase_topology)(struct hl_device *hdev);
-	int (*debugfs_read32)(struct hl_device *hdev, u64 addr,
-				bool user_address, u32 *val);
-	int (*debugfs_write32)(struct hl_device *hdev, u64 addr,
-				bool user_address, u32 val);
-	int (*debugfs_read64)(struct hl_device *hdev, u64 addr,
-				bool user_address, u64 *val);
-	int (*debugfs_write64)(struct hl_device *hdev, u64 addr,
-				bool user_address, u64 val);
 	int (*debugfs_read_dma)(struct hl_device *hdev, u64 addr, u32 size,
 				void *blob_addr);
 	void (*add_device_attr)(struct hl_device *hdev, struct attribute_group *dev_clk_attr_grp,
@@ -1391,6 +1402,7 @@ struct hl_asic_funcs {
 					u32 flags);
 	int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
 				u32 flags, u32 asid, u64 va, u64 size);
+	int (*mmu_prefetch_cache_range)(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size);
 	int (*send_heartbeat)(struct hl_device *hdev);
 	int (*debug_coresight)(struct hl_device *hdev, struct hl_ctx *ctx, void *data);
 	bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr,
@@ -1399,8 +1411,8 @@ struct hl_asic_funcs {
 	void (*hw_queues_lock)(struct hl_device *hdev);
 	void (*hw_queues_unlock)(struct hl_device *hdev);
 	u32 (*get_pci_id)(struct hl_device *hdev);
-	int (*get_eeprom_data)(struct hl_device *hdev, void *data,
-				size_t max_size);
+	int (*get_eeprom_data)(struct hl_device *hdev, void *data, size_t max_size);
+	int (*get_monitor_dump)(struct hl_device *hdev, void *data);
 	int (*send_cpu_message)(struct hl_device *hdev, u32 *msg,
 				u16 len, u32 timeout, u64 *result);
 	int (*pci_bars_map)(struct hl_device *hdev);
@@ -1421,7 +1433,6 @@ struct hl_asic_funcs {
 			struct hl_gen_wait_properties *prop);
 	void (*reset_sob)(struct hl_device *hdev, void *data);
 	void (*reset_sob_group)(struct hl_device *hdev, u16 sob_group);
-	void (*set_dma_mask_from_fw)(struct hl_device *hdev);
 	u64 (*get_device_time)(struct hl_device *hdev);
 	int (*collective_wait_init_cs)(struct hl_cs *cs);
 	int (*collective_wait_create_jobs)(struct hl_device *hdev,
@@ -1445,6 +1456,12 @@ struct hl_asic_funcs {
 	void (*set_pci_memory_regions)(struct hl_device *hdev);
 	u32* (*get_stream_master_qid_arr)(void);
 	bool (*is_valid_dram_page_size)(u32 page_size);
+	int (*mmu_get_real_page_size)(struct hl_device *hdev, struct hl_mmu_properties *mmu_prop,
+					u32 page_size, u32 *real_page_size, bool is_dram_addr);
+	void (*get_valid_dram_page_orders)(struct hl_info_dev_memalloc_page_sizes *info);
+	int (*access_dev_mem)(struct hl_device *hdev, struct pci_mem_region *region,
+		enum pci_region region_type, u64 addr, u64 *val, enum debugfs_access_type acc_type);
+	u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
 };
 
 
@@ -1915,6 +1932,18 @@ struct hl_debug_params {
 	bool enable;
 };
 
+/**
+ * struct hl_notifier_event - holds the notifier data structure
+ * @eventfd: the event file descriptor to raise the notifications
+ * @lock: mutex lock to protect the notifier data flows
+ * @events_mask: indicates the bitmap events
+ */
+struct hl_notifier_event {
+	struct eventfd_ctx	*eventfd;
+	struct mutex		lock;
+	u64			events_mask;
+};
+
 /*
  * FILE PRIVATE STRUCTURE
  */
@@ -1926,25 +1955,25 @@ struct hl_debug_params {
  * @taskpid: current process ID.
  * @ctx: current executing context. TODO: remove for multiple ctx per process
  * @ctx_mgr: context manager to handle multiple context for this FD.
- * @cb_mgr: command buffer manager to handle multiple buffers for this FD.
- * @ts_mem_mgr: timestamp registration manager for alloc/free/map timestamp buffers.
+ * @mem_mgr: manager descriptor for memory exportable via mmap
+ * @notifier_event: notifier eventfd towards user process
  * @debugfs_list: list of relevant ASIC debugfs.
  * @dev_node: node in the device list of file private data
  * @refcount: number of related contexts.
  * @restore_phase_mutex: lock for context switch and restore phase.
  */
 struct hl_fpriv {
-	struct hl_device	*hdev;
-	struct file		*filp;
-	struct pid		*taskpid;
-	struct hl_ctx		*ctx;
-	struct hl_ctx_mgr	ctx_mgr;
-	struct hl_cb_mgr	cb_mgr;
-	struct hl_ts_mgr	ts_mem_mgr;
-	struct list_head	debugfs_list;
-	struct list_head	dev_node;
-	struct kref		refcount;
-	struct mutex		restore_phase_mutex;
+	struct hl_device		*hdev;
+	struct file			*filp;
+	struct pid			*taskpid;
+	struct hl_ctx			*ctx;
+	struct hl_ctx_mgr		ctx_mgr;
+	struct hl_mem_mgr		mem_mgr;
+	struct hl_notifier_event	notifier_event;
+	struct list_head		debugfs_list;
+	struct list_head		dev_node;
+	struct kref			refcount;
+	struct mutex			restore_phase_mutex;
 };
 
 
@@ -1992,12 +2021,14 @@ struct hl_debugfs_entry {
  * @userptr_spinlock: protects userptr_list.
  * @ctx_mem_hash_list: list of available contexts with MMU mappings.
  * @ctx_mem_hash_spinlock: protects cb_list.
- * @blob_desc: descriptor of blob
+ * @data_dma_blob_desc: data DMA descriptor of blob.
+ * @mon_dump_blob_desc: monitor dump descriptor of blob.
  * @state_dump: data of the system states in case of a bad cs.
  * @state_dump_sem: protects state_dump.
  * @addr: next address to read/write from/to in read/write32.
  * @mmu_addr: next virtual address to translate to physical address in mmu_show.
  * @userptr_lookup: the target user ptr to look up for on demand.
+ * @memory_scrub_val: the value to which the dram will be scrubbed to using cb scrub_device_dram
  * @mmu_asid: ASID to use while translating in mmu_show.
  * @state_dump_head: index of the latest state dump
  * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read.
@@ -2021,12 +2052,14 @@ struct hl_dbg_device_entry {
 	spinlock_t			userptr_spinlock;
 	struct list_head		ctx_mem_hash_list;
 	spinlock_t			ctx_mem_hash_spinlock;
-	struct debugfs_blob_wrapper	blob_desc;
+	struct debugfs_blob_wrapper	data_dma_blob_desc;
+	struct debugfs_blob_wrapper	mon_dump_blob_desc;
 	char				*state_dump[HL_STATE_DUMP_HIST_LEN];
 	struct rw_semaphore		state_dump_sem;
 	u64				addr;
 	u64				mmu_addr;
 	u64				userptr_lookup;
+	u64				memory_scrub_val;
 	u32				mmu_asid;
 	u32				state_dump_head;
 	u8				i2c_bus;
@@ -2442,6 +2475,24 @@ struct hl_mmu_funcs {
 };
 
 /**
+ * struct hl_prefetch_work - prefetch work structure handler
+ * @pf_work: actual work struct.
+ * @ctx: compute context.
+ * @va: virtual address to pre-fetch.
+ * @size: pre-fetch size.
+ * @flags: operation flags.
+ * @asid: ASID for maintenance operation.
+ */
+struct hl_prefetch_work {
+	struct work_struct	pf_work;
+	struct hl_ctx		*ctx;
+	u64			va;
+	u64			size;
+	u32			flags;
+	u32			asid;
+};
+
+/*
  * number of user contexts allowed to call wait_for_multi_cs ioctl in
  * parallel
  */
@@ -2517,37 +2568,50 @@ struct hl_clk_throttle {
 };
 
 /**
- * struct last_error_session_info - info about last session in which CS timeout or
- *                                    razwi error occurred.
- * @open_dev_timestamp: device open timestamp.
- * @cs_timeout_timestamp: CS timeout timestamp.
- * @razwi_timestamp: razwi timestamp.
- * @cs_write_disable: if set writing to CS parameters in the structure is disabled so the
- *                    first (root cause) CS timeout will not be overwritten.
- * @razwi_write_disable: if set writing to razwi parameters in the structure is disabled so the
- *                       first (root cause) razwi will not be overwritten.
- * @cs_timeout_seq: CS timeout sequence number.
- * @razwi_addr: address that caused razwi.
- * @razwi_engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does
- *                     not have engine id it will be set to U16_MAX.
- * @razwi_engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
- *                     engines which one them caused the razwi. In that case, it will contain the
- *                     second possible engine id, otherwise it will be set to U16_MAX.
- * @razwi_non_engine_initiator: in case the initiator of the razwi does not have engine id.
- * @razwi_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
+ * struct cs_timeout_info - info of last CS timeout occurred.
+ * @timestamp: CS timeout timestamp.
+ * @write_disable: if set writing to CS parameters in the structure is disabled so,
+ *                 the first (root cause) CS timeout will not be overwritten.
+ * @seq: CS timeout sequence number.
+ */
+struct cs_timeout_info {
+	ktime_t		timestamp;
+	atomic_t	write_disable;
+	u64		seq;
+};
+
+/**
+ * struct razwi_info - info about last razwi error occurred.
+ * @timestamp: razwi timestamp.
+ * @write_disable: if set writing to razwi parameters in the structure is disabled so the
+ *                 first (root cause) razwi will not be overwritten.
+ * @addr: address that caused razwi.
+ * @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does
+ *               not have engine id it will be set to U16_MAX.
+ * @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
+ *               engines which one them caused the razwi. In that case, it will contain the
+ *               second possible engine id, otherwise it will be set to U16_MAX.
+ * @non_engine_initiator: in case the initiator of the razwi does not have engine id.
+ * @type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
+ */
+struct razwi_info {
+	ktime_t		timestamp;
+	atomic_t	write_disable;
+	u64		addr;
+	u16		engine_id_1;
+	u16		engine_id_2;
+	u8		non_engine_initiator;
+	u8		type;
+};
+
+/**
+ * struct last_error_session_info - info about last session errors occurred.
+ * @cs_timeout: CS timeout error last information.
+ * @razwi: razwi last information.
  */
 struct last_error_session_info {
-	ktime_t		open_dev_timestamp;
-	ktime_t		cs_timeout_timestamp;
-	ktime_t		razwi_timestamp;
-	atomic_t	cs_write_disable;
-	atomic_t	razwi_write_disable;
-	u64		cs_timeout_seq;
-	u64		razwi_addr;
-	u16		razwi_engine_id_1;
-	u16		razwi_engine_id_2;
-	u8		razwi_non_engine_initiator;
-	u8		razwi_type;
+	struct	cs_timeout_info	cs_timeout;
+	struct	razwi_info	razwi;
 };
 
 /**
@@ -2614,11 +2678,12 @@ struct hl_reset_info {
  *         context.
  * @eq_wq: work queue of event queue for executing work in process context.
  * @ts_free_obj_wq: work queue for timestamp registration objects release.
+ * @pf_wq: work queue for MMU pre-fetch operations.
  * @kernel_ctx: Kernel driver context structure.
  * @kernel_queues: array of hl_hw_queue.
  * @cs_mirror_list: CS mirror list for TDR.
  * @cs_mirror_lock: protects cs_mirror_list.
- * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CBs.
+ * @kernel_mem_mgr: memory manager for memory buffers with lifespan of driver.
  * @event_queue: event queue for IRQ from CPU-CP.
  * @dma_pool: DMA pool for small allocations.
  * @cpu_accessible_dma_mem: Host <-> CPU-CP shared memory CPU address.
@@ -2656,9 +2721,10 @@ struct hl_reset_info {
  * @state_dump_specs: constants and dictionaries needed to dump system state.
  * @multi_cs_completion: array of multi-CS completion.
  * @clk_throttling: holds information about current/previous clock throttling events
- * @reset_info: holds current device reset information.
  * @last_error: holds information about last session in which CS timeout or razwi error occurred.
+ * @reset_info: holds current device reset information.
  * @stream_master_qid_arr: pointer to array with QIDs of master streams.
+ * @fw_major_version: major version of current loaded preboot
  * @dram_used_mem: current DRAM memory consumption.
  * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -2678,6 +2744,9 @@ struct hl_reset_info {
  *                                  session.
  * @open_counter: number of successful device open operations.
  * @fw_poll_interval_usec: FW status poll interval in usec.
+ *                         used for CPU boot status
+ * @fw_comms_poll_interval_usec: FW comms/protocol poll interval in usec.
+ *                                  used for COMMs protocols cmds(COMMS_STS_*)
  * @card_type: Various ASICs have several card types. This indicates the card
  *             type of the current device.
  * @major: habanalabs kernel driver major.
@@ -2686,6 +2755,7 @@ struct hl_reset_info {
  * @id_control: minor of the control device
  * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
  *                    addresses.
+ * @is_in_dram_scrub: true if dram scrub operation is on going.
  * @disabled: is device disabled.
  * @late_init_done: is late init stage was done during initialization.
  * @hwmon_initialized: is H/W monitor sensors was initialized.
@@ -2699,7 +2769,6 @@ struct hl_reset_info {
  *                   huge pages.
  * @init_done: is the initialization of the device done.
  * @device_cpu_disabled: is the device CPU disabled (due to timeouts)
- * @dma_mask: the dma mask that was set for this device
  * @in_debug: whether the device is in a state where the profiling/tracing infrastructure
  *            can be used. This indication is needed because in some ASICs we need to do
  *            specific operations to enable that infrastructure.
@@ -2721,6 +2790,8 @@ struct hl_reset_info {
  *                        cases where Linux was not loaded to device CPU
  * @supports_wait_for_multi_cs: true if wait for multi CS is supported
  * @is_compute_ctx_active: Whether there is an active compute context executing.
+ * @compute_ctx_in_release: true if the current compute context is being released.
+ * @supports_mmu_prefetch: true if prefetch is supported, otherwise false.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -2742,11 +2813,12 @@ struct hl_device {
 	struct workqueue_struct		**cq_wq;
 	struct workqueue_struct		*eq_wq;
 	struct workqueue_struct		*ts_free_obj_wq;
+	struct workqueue_struct		*pf_wq;
 	struct hl_ctx			*kernel_ctx;
 	struct hl_hw_queue		*kernel_queues;
 	struct list_head		cs_mirror_list;
 	spinlock_t			cs_mirror_lock;
-	struct hl_cb_mgr		kernel_cb_mgr;
+	struct hl_mem_mgr		kernel_mem_mgr;
 	struct hl_eq			event_queue;
 	struct dma_pool			*dma_pool;
 	void				*cpu_accessible_dma_mem;
@@ -2797,6 +2869,7 @@ struct hl_device {
 	struct hl_reset_info		reset_info;
 
 	u32				*stream_master_qid_arr;
+	u32				fw_major_version;
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
 	u64				max_power;
@@ -2807,12 +2880,15 @@ struct hl_device {
 	u64				open_counter;
 	u64				fw_poll_interval_usec;
 	ktime_t				last_successful_open_ktime;
+	u64				fw_comms_poll_interval_usec;
+
 	enum cpucp_card_types		card_type;
 	u32				major;
 	u32				high_pll;
 	u16				id;
 	u16				id_control;
 	u16				cpu_pci_msb_addr;
+	u8				is_in_dram_scrub;
 	u8				disabled;
 	u8				late_init_done;
 	u8				hwmon_initialized;
@@ -2823,7 +2899,6 @@ struct hl_device {
 	u8				pmmu_huge_range;
 	u8				init_done;
 	u8				device_cpu_disabled;
-	u8				dma_mask;
 	u8				in_debug;
 	u8				cdev_sysfs_created;
 	u8				stop_on_err;
@@ -2839,6 +2914,8 @@ struct hl_device {
 	u8				supports_wait_for_multi_cs;
 	u8				stream_master_qid_arr_size;
 	u8				is_compute_ctx_active;
+	u8				compute_ctx_in_release;
+	u8				supports_mmu_prefetch;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
@@ -2971,6 +3048,14 @@ static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
 	return ((address <= range_end_address) && (range_start_address <= end_address));
 }
 
+uint64_t hl_set_dram_bar_default(struct hl_device *hdev, u64 addr);
+int hl_dma_map_sgtable(struct hl_device *hdev, struct sg_table *sgt, enum dma_data_direction dir);
+void hl_dma_unmap_sgtable(struct hl_device *hdev, struct sg_table *sgt,
+				enum dma_data_direction dir);
+int hl_access_cfg_region(struct hl_device *hdev, u64 addr, u64 *val,
+	enum debugfs_access_type acc_type);
+int hl_access_dev_mem(struct hl_device *hdev, struct pci_mem_region *region,
+		enum pci_region region_type, u64 addr, u64 *val, enum debugfs_access_type acc_type);
 int hl_device_open(struct inode *inode, struct file *filp);
 int hl_device_open_ctrl(struct inode *inode, struct file *filp);
 bool hl_device_operational(struct hl_device *hdev,
@@ -3013,7 +3098,7 @@ int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv);
 void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx);
 int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
 void hl_ctx_do_release(struct kref *ref);
-void hl_ctx_get(struct hl_device *hdev,	struct hl_ctx *ctx);
+void hl_ctx_get(struct hl_ctx *ctx);
 int hl_ctx_put(struct hl_ctx *ctx);
 struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev);
 struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
@@ -3034,23 +3119,21 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization);
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
 		struct cpucp_sensor *sensors_arr);
 
+void hl_notifier_event_send_all(struct hl_device *hdev, u64 event);
+
 int hl_sysfs_init(struct hl_device *hdev);
 void hl_sysfs_fini(struct hl_device *hdev);
 
 int hl_hwmon_init(struct hl_device *hdev);
 void hl_hwmon_fini(struct hl_device *hdev);
 
-int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
+int hl_cb_create(struct hl_device *hdev, struct hl_mem_mgr *mmg,
 			struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
 			bool map_cb, u64 *handle);
-int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
-int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
+int hl_cb_destroy(struct hl_mem_mgr *mmg, u64 cb_handle);
 int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
-struct hl_cb *hl_cb_get(struct hl_device *hdev,	struct hl_cb_mgr *mgr,
-			u32 handle);
+struct hl_cb *hl_cb_get(struct hl_mem_mgr *mmg, u64 handle);
 void hl_cb_put(struct hl_cb *cb);
-void hl_cb_mgr_init(struct hl_cb_mgr *mgr);
-void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr);
 struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
 					bool internal_cb);
 int hl_cb_pool_init(struct hl_device *hdev);
@@ -3104,6 +3187,8 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx);
 void hl_mmu_ctx_fini(struct hl_ctx *ctx);
 int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 		u32 page_size, bool flush_pte);
+int hl_mmu_get_real_page_size(struct hl_device *hdev, struct hl_mmu_properties *mmu_prop,
+				u32 page_size, u32 *real_page_size, bool is_dram_addr);
 int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 		bool flush_pte);
 int hl_mmu_map_contiguous(struct hl_ctx *ctx, u64 virt_addr,
@@ -3112,6 +3197,7 @@ int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size);
 int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags);
 int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
 					u32 flags, u32 asid, u64 va, u64 size);
+int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size);
 u64 hl_mmu_get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte);
 u64 hl_mmu_get_hop_pte_phys_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop,
 					u8 hop_idx, u64 hop_addr, u64 virt_addr);
@@ -3149,6 +3235,7 @@ int hl_fw_cpucp_handshake(struct hl_device *hdev,
 				u32 sts_boot_dev_sts1_reg, u32 boot_err0_reg,
 				u32 boot_err1_reg);
 int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
+int hl_fw_get_monitor_dump(struct hl_device *hdev, void *data);
 int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
 		struct hl_info_pci_counters *counters);
 int hl_fw_cpucp_total_energy_get(struct hl_device *hdev,
@@ -3224,11 +3311,19 @@ __printf(4, 5) int hl_snprintf_resize(char **buf, size_t *size, size_t *offset,
 					const char *format, ...);
 char *hl_format_as_binary(char *buf, size_t buf_len, u32 n);
 const char *hl_sync_engine_to_string(enum hl_sync_engine_type engine_type);
-void hl_ts_mgr_init(struct hl_ts_mgr *mgr);
-void hl_ts_mgr_fini(struct hl_device *hdev, struct hl_ts_mgr *mgr);
-int hl_ts_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
-struct hl_ts_buff *hl_ts_get(struct hl_device *hdev, struct hl_ts_mgr *mgr, u32 handle);
-void hl_ts_put(struct hl_ts_buff *buff);
+
+void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg);
+void hl_mem_mgr_fini(struct hl_mem_mgr *mmg);
+int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma,
+		    void *args);
+struct hl_mmap_mem_buf *hl_mmap_mem_buf_get(struct hl_mem_mgr *mmg,
+						   u64 handle);
+int hl_mmap_mem_buf_put_handle(struct hl_mem_mgr *mmg, u64 handle);
+int hl_mmap_mem_buf_put(struct hl_mmap_mem_buf *buf);
+struct hl_mmap_mem_buf *
+hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg,
+		      struct hl_mmap_mem_buf_behavior *behavior, gfp_t gfp,
+		      void *args);
 
 #ifdef CONFIG_DEBUG_FS
 
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index ca404ed9d9a7..37edb69a7255 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -134,13 +134,14 @@ int hl_device_open(struct inode *inode, struct file *filp)
 	hpriv->hdev = hdev;
 	filp->private_data = hpriv;
 	hpriv->filp = filp;
+
+	mutex_init(&hpriv->notifier_event.lock);
 	mutex_init(&hpriv->restore_phase_mutex);
 	kref_init(&hpriv->refcount);
 	nonseekable_open(inode, filp);
 
-	hl_cb_mgr_init(&hpriv->cb_mgr);
 	hl_ctx_mgr_init(&hpriv->ctx_mgr);
-	hl_ts_mgr_init(&hpriv->ts_mem_mgr);
+	hl_mem_mgr_init(hpriv->hdev->dev, &hpriv->mem_mgr);
 
 	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
 
@@ -150,7 +151,28 @@ int hl_device_open(struct inode *inode, struct file *filp)
 		dev_err_ratelimited(hdev->dev,
 			"Can't open %s because it is %s\n",
 			dev_name(hdev->dev), hdev->status[status]);
-		rc = -EPERM;
+
+		if (status == HL_DEVICE_STATUS_IN_RESET)
+			rc = -EAGAIN;
+		else
+			rc = -EPERM;
+
+		goto out_err;
+	}
+
+	if (hdev->is_in_dram_scrub) {
+		dev_dbg_ratelimited(hdev->dev,
+			"Can't open %s during dram scrub\n",
+			dev_name(hdev->dev));
+		rc = -EAGAIN;
+		goto out_err;
+	}
+
+	if (hdev->compute_ctx_in_release) {
+		dev_dbg_ratelimited(hdev->dev,
+			"Can't open %s because another user is still releasing it\n",
+			dev_name(hdev->dev));
+		rc = -EAGAIN;
 		goto out_err;
 	}
 
@@ -173,8 +195,8 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 	hl_debugfs_add_file(hpriv);
 
-	atomic_set(&hdev->last_error.cs_write_disable, 0);
-	atomic_set(&hdev->last_error.razwi_write_disable, 0);
+	atomic_set(&hdev->last_error.cs_timeout.write_disable, 0);
+	atomic_set(&hdev->last_error.razwi.write_disable, 0);
 
 	hdev->open_counter++;
 	hdev->last_successful_open_jif = jiffies;
@@ -184,11 +206,11 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 out_err:
 	mutex_unlock(&hdev->fpriv_list_lock);
-	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
-	hl_ts_mgr_fini(hpriv->hdev, &hpriv->ts_mem_mgr);
+	hl_mem_mgr_fini(&hpriv->mem_mgr);
 	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
 	filp->private_data = NULL;
 	mutex_destroy(&hpriv->restore_phase_mutex);
+	mutex_destroy(&hpriv->notifier_event.lock);
 	put_pid(hpriv->taskpid);
 
 	kfree(hpriv);
@@ -222,9 +244,11 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
 	hpriv->hdev = hdev;
 	filp->private_data = hpriv;
 	hpriv->filp = filp;
+
+	mutex_init(&hpriv->notifier_event.lock);
 	nonseekable_open(inode, filp);
 
-	hpriv->taskpid = find_get_pid(current->pid);
+	hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);
 
 	mutex_lock(&hdev->fpriv_ctrl_list_lock);
 
@@ -288,6 +312,7 @@ static int fixup_device_params(struct hl_device *hdev)
 	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
 
 	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
+	hdev->fw_comms_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
 
 	hdev->stop_on_err = true;
 	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
@@ -296,9 +321,6 @@ static int fixup_device_params(struct hl_device *hdev)
 	/* Enable only after the initialization of the device */
 	hdev->disabled = true;
 
-	/* Set default DMA mask to 32 bits */
-	hdev->dma_mask = 32;
-
 	return 0;
 }
 
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index c13a3c2a7013..c7864d6bb0a1 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -76,6 +76,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 	if (hw_ip.dram_size > PAGE_SIZE)
 		hw_ip.dram_enabled = 1;
 	hw_ip.dram_page_size = prop->dram_page_size;
+	hw_ip.device_mem_alloc_default_page_size = prop->device_mem_alloc_default_page_size;
 	hw_ip.num_of_events = prop->num_of_events;
 
 	memcpy(hw_ip.cpucp_version, prop->cpucp_info.cpucp_version,
@@ -115,6 +116,23 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate,
 	return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0;
 }
 
+static int events_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	u32 max_size = args->return_size;
+	u64 events_mask;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((max_size < sizeof(u64)) || (!out))
+		return -EINVAL;
+
+	mutex_lock(&hpriv->notifier_event.lock);
+	events_mask = hpriv->notifier_event.events_mask;
+	hpriv->notifier_event.events_mask = 0;
+	mutex_unlock(&hpriv->notifier_event.lock);
+
+	return copy_to_user(out, &events_mask, sizeof(u64)) ? -EFAULT : 0;
+}
+
 static int dram_usage_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
 	struct hl_device *hdev = hpriv->hdev;
@@ -497,6 +515,8 @@ static int open_stats_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	open_stats_info.last_open_period_ms = jiffies64_to_msecs(
 		hdev->last_open_session_duration_jif);
 	open_stats_info.open_counter = hdev->open_counter;
+	open_stats_info.is_compute_ctx_active = hdev->is_compute_ctx_active;
+	open_stats_info.compute_ctx_in_release = hdev->compute_ctx_in_release;
 
 	return copy_to_user(out, &open_stats_info,
 		min((size_t) max_size, sizeof(open_stats_info))) ? -EFAULT : 0;
@@ -549,7 +569,7 @@ static int last_err_open_dev_info(struct hl_fpriv *hpriv, struct hl_info_args *a
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	info.timestamp = ktime_to_ns(hdev->last_error.open_dev_timestamp);
+	info.timestamp = ktime_to_ns(hdev->last_successful_open_ktime);
 
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
 }
@@ -564,8 +584,8 @@ static int cs_timeout_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	info.seq = hdev->last_error.cs_timeout_seq;
-	info.timestamp = ktime_to_ns(hdev->last_error.cs_timeout_timestamp);
+	info.seq = hdev->last_error.cs_timeout.seq;
+	info.timestamp = ktime_to_ns(hdev->last_error.cs_timeout.timestamp);
 
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
 }
@@ -580,16 +600,74 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	info.timestamp = ktime_to_ns(hdev->last_error.razwi_timestamp);
-	info.addr = hdev->last_error.razwi_addr;
-	info.engine_id_1 = hdev->last_error.razwi_engine_id_1;
-	info.engine_id_2 = hdev->last_error.razwi_engine_id_2;
-	info.no_engine_id = hdev->last_error.razwi_non_engine_initiator;
-	info.error_type = hdev->last_error.razwi_type;
+	info.timestamp = ktime_to_ns(hdev->last_error.razwi.timestamp);
+	info.addr = hdev->last_error.razwi.addr;
+	info.engine_id_1 = hdev->last_error.razwi.engine_id_1;
+	info.engine_id_2 = hdev->last_error.razwi.engine_id_2;
+	info.no_engine_id = hdev->last_error.razwi.non_engine_initiator;
+	info.error_type = hdev->last_error.razwi.type;
+
+	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
+static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	struct hl_info_dev_memalloc_page_sizes info = {0};
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	/*
+	 * Future ASICs that will support multiple DRAM page sizes will support only "powers of 2"
+	 * pages (unlike some of the ASICs before supporting multiple page sizes).
+	 * For this reason for all ASICs that not support multiple page size the function will
+	 * return an empty bitmask indicating that multiple page sizes is not supported.
+	 */
+	hdev->asic_funcs->get_valid_dram_page_orders(&info);
 
 	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
 }
 
+static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	int rc;
+
+	/* check if there is already a registered on that process */
+	mutex_lock(&hpriv->notifier_event.lock);
+	if (hpriv->notifier_event.eventfd) {
+		mutex_unlock(&hpriv->notifier_event.lock);
+		return -EINVAL;
+	}
+
+	hpriv->notifier_event.eventfd = eventfd_ctx_fdget(args->eventfd);
+	if (IS_ERR(hpriv->notifier_event.eventfd)) {
+		rc = PTR_ERR(hpriv->notifier_event.eventfd);
+		hpriv->notifier_event.eventfd = NULL;
+		mutex_unlock(&hpriv->notifier_event.lock);
+		return rc;
+	}
+
+	mutex_unlock(&hpriv->notifier_event.lock);
+	return 0;
+}
+
+static int eventfd_unregister(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	mutex_lock(&hpriv->notifier_event.lock);
+	if (!hpriv->notifier_event.eventfd) {
+		mutex_unlock(&hpriv->notifier_event.lock);
+		return -EINVAL;
+	}
+
+	eventfd_ctx_put(hpriv->notifier_event.eventfd);
+	hpriv->notifier_event.eventfd = NULL;
+	mutex_unlock(&hpriv->notifier_event.lock);
+	return 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
@@ -640,6 +718,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_RAZWI_EVENT:
 		return razwi_info(hpriv, args);
 
+	case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES:
+		return dev_mem_alloc_page_sizes_info(hpriv, args);
+
+	case HL_INFO_GET_EVENTS:
+		return events_info(hpriv, args);
+
 	default:
 		break;
 	}
@@ -690,6 +774,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_DRAM_PENDING_ROWS:
 		return dram_pending_rows_info(hpriv, args);
 
+	case HL_INFO_REGISTER_EVENTFD:
+		return eventfd_register(hpriv, args);
+
+	case HL_INFO_UNREGISTER_EVENTFD:
+		return eventfd_unregister(hpriv, args);
+
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -EINVAL;
diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c
index e2bc128f2291..8500e15ef743 100644
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -152,11 +152,11 @@ static void hl_ts_free_objects(struct work_struct *work)
 	struct hl_device *hdev = job->hdev;
 
 	list_for_each_entry_safe(free_obj, temp_free_obj, free_list_head, free_objects_node) {
-		dev_dbg(hdev->dev, "About to put refcount to ts_buff (%p) cq_cb(%p)\n",
-					free_obj->ts_buff,
+		dev_dbg(hdev->dev, "About to put refcount to buf (%p) cq_cb(%p)\n",
+					free_obj->buf,
 					free_obj->cq_cb);
 
-		hl_ts_put(free_obj->ts_buff);
+		hl_mmap_mem_buf_put(free_obj->buf);
 		hl_cb_put(free_obj->cq_cb);
 		kfree(free_obj);
 	}
@@ -210,7 +210,7 @@ static int handle_registration_node(struct hl_device *hdev, struct hl_user_pendi
 	/* Putting the refcount for ts_buff and cq_cb objects will be handled
 	 * in workqueue context, just add job to free_list.
 	 */
-	free_node->ts_buff = pend->ts_reg_info.ts_buff;
+	free_node->buf = pend->ts_reg_info.buf;
 	free_node->cq_cb = pend->ts_reg_info.cq_cb;
 	list_add(&free_node->free_objects_node, *free_list);
 
@@ -244,7 +244,7 @@ static void handle_user_cq(struct hl_device *hdev,
 	list_for_each_entry_safe(pend, temp_pend, &user_cq->wait_list_head, wait_list_node) {
 		if ((pend->cq_kernel_addr && *(pend->cq_kernel_addr) >= pend->cq_target_value) ||
 				!pend->cq_kernel_addr) {
-			if (pend->ts_reg_info.ts_buff) {
+			if (pend->ts_reg_info.buf) {
 				if (!reg_node_handle_fail) {
 					rc = handle_registration_node(hdev, pend,
 									&ts_reg_free_list_head);
@@ -282,10 +282,6 @@ irqreturn_t hl_irq_handler_user_cq(int irq, void *arg)
 	struct hl_user_interrupt *user_cq = arg;
 	struct hl_device *hdev = user_cq->hdev;
 
-	dev_dbg(hdev->dev,
-		"got user completion interrupt id %u",
-		user_cq->interrupt_id);
-
 	/* Handle user cq interrupts registered on all interrupts */
 	handle_user_cq(hdev, &hdev->common_user_interrupt);
 
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index a13506dd8119..663dd7e589d4 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -41,7 +41,7 @@ static int set_alloc_page_size(struct hl_device *hdev, struct hl_mem_in *args, u
 			return -EINVAL;
 		}
 	} else {
-		psize = hdev->asic_prop.dram_page_size;
+		psize = prop->device_mem_alloc_default_page_size;
 	}
 
 	*page_size = psize;
@@ -117,7 +117,7 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
 			paddr = gen_pool_alloc(vm->dram_pg_pool, total_size);
 		if (!paddr) {
 			dev_err(hdev->dev,
-				"failed to allocate %llu contiguous pages with total size of %llu\n",
+				"Cannot allocate %llu contiguous pages with total size of %llu\n",
 				num_pgs, total_size);
 			return -ENOMEM;
 		}
@@ -156,9 +156,10 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
 			else
 				phys_pg_pack->pages[i] = gen_pool_alloc(vm->dram_pg_pool,
 									page_size);
+
 			if (!phys_pg_pack->pages[i]) {
 				dev_err(hdev->dev,
-					"Failed to allocate device memory (out of memory)\n");
+					"Cannot allocate device memory (out of memory)\n");
 				rc = -ENOMEM;
 				goto page_err;
 			}
@@ -237,19 +238,18 @@ static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
 		goto pin_err;
 	}
 
-	rc = hdev->asic_funcs->asic_dma_map_sg(hdev, userptr->sgt->sgl,
-					userptr->sgt->nents, DMA_BIDIRECTIONAL);
-	if (rc) {
-		dev_err(hdev->dev, "failed to map sgt with DMA region\n");
-		goto dma_map_err;
-	}
-
 	userptr->dma_mapped = true;
 	userptr->dir = DMA_BIDIRECTIONAL;
 	userptr->vm_type = VM_TYPE_USERPTR;
 
 	*p_userptr = userptr;
 
+	rc = hdev->asic_funcs->asic_dma_map_sgtable(hdev, userptr->sgt, DMA_BIDIRECTIONAL);
+	if (rc) {
+		dev_err(hdev->dev, "failed to map sgt with DMA region\n");
+		goto dma_map_err;
+	}
+
 	return 0;
 
 dma_map_err:
@@ -900,7 +900,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 	 * consecutive block.
 	 */
 	total_npages = 0;
-	for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
+	for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
 		npages = hl_get_sg_info(sg, &dma_addr);
 
 		total_npages += npages;
@@ -929,7 +929,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 	phys_pg_pack->total_size = total_npages * page_size;
 
 	j = 0;
-	for_each_sg(userptr->sgt->sgl, sg, userptr->sgt->nents, i) {
+	for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
 		npages = hl_get_sg_info(sg, &dma_addr);
 
 		/* align down to physical page size and save the offset */
@@ -1102,21 +1102,24 @@ static int get_paddr_from_handle(struct hl_ctx *ctx, struct hl_mem_in *args,
  *   map a device virtual block to this pages and return the start address of
  *   this block.
  */
-static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
-		u64 *device_addr)
+static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device_addr)
 {
-	struct hl_device *hdev = ctx->hdev;
-	struct hl_vm *vm = &hdev->vm;
 	struct hl_vm_phys_pg_pack *phys_pg_pack;
+	enum hl_va_range_type va_range_type = 0;
+	struct hl_device *hdev = ctx->hdev;
 	struct hl_userptr *userptr = NULL;
+	u32 handle = 0, va_block_align;
 	struct hl_vm_hash_node *hnode;
+	struct hl_vm *vm = &hdev->vm;
 	struct hl_va_range *va_range;
-	enum vm_type *vm_type;
+	bool is_userptr, do_prefetch;
 	u64 ret_vaddr, hint_addr;
-	u32 handle = 0, va_block_align;
+	enum vm_type *vm_type;
 	int rc;
-	bool is_userptr = args->flags & HL_MEM_USERPTR;
-	enum hl_va_range_type va_range_type = 0;
+
+	/* set map flags */
+	is_userptr = args->flags & HL_MEM_USERPTR;
+	do_prefetch = hdev->supports_mmu_prefetch && (args->flags & HL_MEM_PREFETCH);
 
 	/* Assume failure */
 	*device_addr = 0;
@@ -1241,19 +1244,27 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 
 	rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
 	if (rc) {
-		mutex_unlock(&ctx->mmu_lock);
-		dev_err(hdev->dev, "mapping page pack failed for handle %u\n",
-				handle);
+		dev_err(hdev->dev, "mapping page pack failed for handle %u\n", handle);
 		goto map_err;
 	}
 
 	rc = hl_mmu_invalidate_cache_range(hdev, false, *vm_type | MMU_OP_SKIP_LOW_CACHE_INV,
 				ctx->asid, ret_vaddr, phys_pg_pack->total_size);
+	if (rc)
+		goto map_err;
 
 	mutex_unlock(&ctx->mmu_lock);
 
-	if (rc)
-		goto map_err;
+	/*
+	 * prefetch is done upon user's request. it is performed in WQ as and so can
+	 * be outside the MMU lock. the operation itself is already protected by the mmu lock
+	 */
+	if (do_prefetch) {
+		rc = hl_mmu_prefetch_cache_range(ctx, *vm_type, ctx->asid, ret_vaddr,
+							phys_pg_pack->total_size);
+		if (rc)
+			goto map_err;
+	}
 
 	ret_vaddr += phys_pg_pack->offset;
 
@@ -1272,6 +1283,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	return rc;
 
 map_err:
+	mutex_unlock(&ctx->mmu_lock);
+
 	if (add_va_block(hdev, va_range, ret_vaddr,
 				ret_vaddr + phys_pg_pack->total_size - 1))
 		dev_warn(hdev->dev,
@@ -1509,7 +1522,7 @@ int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 	vma->vm_ops = &hw_block_vm_ops;
 	vma->vm_private_data = lnode;
 
-	hl_ctx_get(hdev, ctx);
+	hl_ctx_get(ctx);
 
 	rc = hdev->asic_funcs->hw_block_mmap(hdev, vma, block_id, block_size);
 	if (rc) {
@@ -1819,7 +1832,7 @@ static int export_dmabuf_common(struct hl_ctx *ctx,
 	}
 
 	hl_dmabuf->ctx = ctx;
-	hl_ctx_get(hdev, hl_dmabuf->ctx);
+	hl_ctx_get(hl_dmabuf->ctx);
 
 	*dmabuf_fd = fd;
 
@@ -2076,164 +2089,34 @@ out:
 	return rc;
 }
 
-static void ts_buff_release(struct kref *ref)
-{
-	struct hl_ts_buff *buff;
-
-	buff = container_of(ref, struct hl_ts_buff, refcount);
-
-	vfree(buff->kernel_buff_address);
-	vfree(buff->user_buff_address);
-	kfree(buff);
-}
-
-struct hl_ts_buff *hl_ts_get(struct hl_device *hdev, struct hl_ts_mgr *mgr,
-					u32 handle)
-{
-	struct hl_ts_buff *buff;
-
-	spin_lock(&mgr->ts_lock);
-	buff = idr_find(&mgr->ts_handles, handle);
-	if (!buff) {
-		spin_unlock(&mgr->ts_lock);
-		dev_warn(hdev->dev,
-			"TS buff get failed, no match to handle 0x%x\n", handle);
-		return NULL;
-	}
-	kref_get(&buff->refcount);
-	spin_unlock(&mgr->ts_lock);
-
-	return buff;
-}
-
-void hl_ts_put(struct hl_ts_buff *buff)
+static void ts_buff_release(struct hl_mmap_mem_buf *buf)
 {
-	kref_put(&buff->refcount, ts_buff_release);
-}
-
-static void buff_vm_close(struct vm_area_struct *vma)
-{
-	struct hl_ts_buff *buff = (struct hl_ts_buff *) vma->vm_private_data;
-	long new_mmap_size;
-
-	new_mmap_size = buff->mmap_size - (vma->vm_end - vma->vm_start);
+	struct hl_ts_buff *ts_buff = buf->private;
 
-	if (new_mmap_size > 0) {
-		buff->mmap_size = new_mmap_size;
-		return;
-	}
-
-	atomic_set(&buff->mmap, 0);
-	hl_ts_put(buff);
-	vma->vm_private_data = NULL;
+	vfree(ts_buff->kernel_buff_address);
+	vfree(ts_buff->user_buff_address);
+	kfree(ts_buff);
 }
 
-static const struct vm_operations_struct ts_buff_vm_ops = {
-	.close = buff_vm_close
-};
-
-int hl_ts_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
+static int hl_ts_mmap(struct hl_mmap_mem_buf *buf, struct vm_area_struct *vma, void *args)
 {
-	struct hl_device *hdev = hpriv->hdev;
-	struct hl_ts_buff *buff;
-	u32 handle, user_buff_size;
-	int rc;
-
-	/* We use the page offset to hold the idr and thus we need to clear
-	 * it before doing the mmap itself
-	 */
-	handle = vma->vm_pgoff;
-	vma->vm_pgoff = 0;
-
-	buff = hl_ts_get(hdev, &hpriv->ts_mem_mgr, handle);
-	if (!buff) {
-		dev_err(hdev->dev,
-			"TS buff mmap failed, no match to handle 0x%x\n", handle);
-		return -EINVAL;
-	}
-
-	/* Validation check */
-	user_buff_size = vma->vm_end - vma->vm_start;
-	if (user_buff_size != ALIGN(buff->user_buff_size, PAGE_SIZE)) {
-		dev_err(hdev->dev,
-			"TS buff mmap failed, mmap size 0x%x != 0x%x buff size\n",
-			user_buff_size, ALIGN(buff->user_buff_size, PAGE_SIZE));
-		rc = -EINVAL;
-		goto put_buff;
-	}
-
-#ifdef _HAS_TYPE_ARG_IN_ACCESS_OK
-	if (!access_ok(VERIFY_WRITE,
-		(void __user *) (uintptr_t) vma->vm_start, user_buff_size)) {
-#else
-	if (!access_ok((void __user *) (uintptr_t) vma->vm_start,
-						user_buff_size)) {
-#endif
-		dev_err(hdev->dev,
-			"user pointer is invalid - 0x%lx\n",
-			vma->vm_start);
-
-		rc = -EINVAL;
-		goto put_buff;
-	}
+	struct hl_ts_buff *ts_buff = buf->private;
 
-	if (atomic_cmpxchg(&buff->mmap, 0, 1)) {
-		dev_err(hdev->dev, "TS buff memory mmap failed, already mmaped to user\n");
-		rc = -EINVAL;
-		goto put_buff;
-	}
-
-	vma->vm_ops = &ts_buff_vm_ops;
-	vma->vm_private_data = buff;
 	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY | VM_NORESERVE;
-	rc = remap_vmalloc_range(vma, buff->user_buff_address, 0);
-	if (rc) {
-		atomic_set(&buff->mmap, 0);
-		goto put_buff;
-	}
-
-	buff->mmap_size = buff->user_buff_size;
-	vma->vm_pgoff = handle;
-
-	return 0;
-
-put_buff:
-	hl_ts_put(buff);
-	return rc;
-}
-
-void hl_ts_mgr_init(struct hl_ts_mgr *mgr)
-{
-	spin_lock_init(&mgr->ts_lock);
-	idr_init(&mgr->ts_handles);
+	return remap_vmalloc_range(vma, ts_buff->user_buff_address, 0);
 }
 
-void hl_ts_mgr_fini(struct hl_device *hdev, struct hl_ts_mgr *mgr)
-{
-	struct hl_ts_buff *buff;
-	struct idr *idp;
-	u32 id;
-
-	idp = &mgr->ts_handles;
-
-	idr_for_each_entry(idp, buff, id) {
-		if (kref_put(&buff->refcount, ts_buff_release) != 1)
-			dev_err(hdev->dev, "TS buff handle %d for CTX is still alive\n",
-							id);
-	}
-
-	idr_destroy(&mgr->ts_handles);
-}
-
-static struct hl_ts_buff *hl_ts_alloc_buff(struct hl_device *hdev, u32 num_elements)
+static int hl_ts_alloc_buf(struct hl_mmap_mem_buf *buf, gfp_t gfp, void *args)
 {
 	struct hl_ts_buff *ts_buff = NULL;
-	u32 size;
+	u32 size, num_elements;
 	void *p;
 
+	num_elements = *(u32 *)args;
+
 	ts_buff = kzalloc(sizeof(*ts_buff), GFP_KERNEL);
 	if (!ts_buff)
-		return NULL;
+		return -ENOMEM;
 
 	/* Allocate the user buffer */
 	size = num_elements * sizeof(u64);
@@ -2242,7 +2125,7 @@ static struct hl_ts_buff *hl_ts_alloc_buff(struct hl_device *hdev, u32 num_eleme
 		goto free_mem;
 
 	ts_buff->user_buff_address = p;
-	ts_buff->user_buff_size = size;
+	buf->mappable_size = size;
 
 	/* Allocate the internal kernel buffer */
 	size = num_elements * sizeof(struct hl_user_pending_interrupt);
@@ -2253,15 +2136,25 @@ static struct hl_ts_buff *hl_ts_alloc_buff(struct hl_device *hdev, u32 num_eleme
 	ts_buff->kernel_buff_address = p;
 	ts_buff->kernel_buff_size = size;
 
-	return ts_buff;
+	buf->private = ts_buff;
+
+	return 0;
 
 free_user_buff:
 	vfree(ts_buff->user_buff_address);
 free_mem:
 	kfree(ts_buff);
-	return NULL;
+	return -ENOMEM;
 }
 
+static struct hl_mmap_mem_buf_behavior hl_ts_behavior = {
+	.topic = "TS",
+	.mem_id = HL_MMAP_TYPE_TS_BUFF,
+	.mmap = hl_ts_mmap,
+	.alloc = hl_ts_alloc_buf,
+	.release = ts_buff_release,
+};
+
 /**
  * allocate_timestamps_buffers() - allocate timestamps buffers
  * This function will allocate ts buffer that will later on be mapped to the user
@@ -2278,54 +2171,22 @@ free_mem:
  */
 static int allocate_timestamps_buffers(struct hl_fpriv *hpriv, struct hl_mem_in *args, u64 *handle)
 {
-	struct hl_ts_mgr *ts_mgr = &hpriv->ts_mem_mgr;
-	struct hl_device *hdev = hpriv->hdev;
-	struct hl_ts_buff *ts_buff;
-	int rc = 0;
+	struct hl_mem_mgr *mmg = &hpriv->mem_mgr;
+	struct hl_mmap_mem_buf *buf;
 
 	if (args->num_of_elements > TS_MAX_ELEMENTS_NUM) {
-		dev_err(hdev->dev, "Num of elements exceeds Max allowed number (0x%x > 0x%x)\n",
+		dev_err(mmg->dev, "Num of elements exceeds Max allowed number (0x%x > 0x%x)\n",
 				args->num_of_elements, TS_MAX_ELEMENTS_NUM);
 		return -EINVAL;
 	}
 
-	/* Allocate ts buffer object
-	 * This object will contain two buffers one that will be mapped to the user
-	 * and another internal buffer for the driver use only, which won't be mapped
-	 * to the user.
-	 */
-	ts_buff = hl_ts_alloc_buff(hdev, args->num_of_elements);
-	if (!ts_buff) {
-		rc = -ENOMEM;
-		goto out_err;
-	}
-
-	spin_lock(&ts_mgr->ts_lock);
-	rc = idr_alloc(&ts_mgr->ts_handles, ts_buff, 1, 0, GFP_ATOMIC);
-	spin_unlock(&ts_mgr->ts_lock);
-	if (rc < 0) {
-		dev_err(hdev->dev, "Failed to allocate IDR for a new ts buffer\n");
-		goto release_ts_buff;
-	}
-
-	ts_buff->id = rc;
-	ts_buff->hdev = hdev;
-
-	kref_init(&ts_buff->refcount);
-
-	/* idr is 32-bit so we can safely OR it with a mask that is above 32 bit */
-	*handle = (u64) ts_buff->id | HL_MMAP_TYPE_TS_BUFF;
-	*handle <<= PAGE_SHIFT;
+	buf = hl_mmap_mem_buf_alloc(mmg, &hl_ts_behavior, GFP_KERNEL, &args->num_of_elements);
+	if (!buf)
+		return -ENOMEM;
 
-	dev_dbg(hdev->dev, "Created ts buff object handle(%u)\n", ts_buff->id);
+	*handle = buf->handle;
 
 	return 0;
-
-release_ts_buff:
-	kref_put(&ts_buff->refcount, ts_buff_release);
-out_err:
-	*handle = 0;
-	return rc;
 }
 
 int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
@@ -2587,9 +2448,7 @@ void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr)
 	hl_debugfs_remove_userptr(hdev, userptr);
 
 	if (userptr->dma_mapped)
-		hdev->asic_funcs->hl_dma_unmap_sg(hdev, userptr->sgt->sgl,
-							userptr->sgt->nents,
-							userptr->dir);
+		hdev->asic_funcs->hl_dma_unmap_sgtable(hdev, userptr->sgt, userptr->dir);
 
 	unpin_user_pages_dirty_lock(userptr->pages, userptr->npages, true);
 	kvfree(userptr->pages);
diff --git a/drivers/misc/habanalabs/common/memory_mgr.c b/drivers/misc/habanalabs/common/memory_mgr.c
new file mode 100644
index 000000000000..ea5f2bd31b0a
--- /dev/null
+++ b/drivers/misc/habanalabs/common/memory_mgr.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2022 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "habanalabs.h"
+
+/**
+ * hl_mmap_mem_buf_get - increase the buffer refcount and return a pointer to
+ *                        the buffer descriptor.
+ *
+ * @mmg: parent unifed memory manager
+ * @handle: requested buffer handle
+ *
+ * Find the buffer in the store and return a pointer to its descriptor.
+ * Increase buffer refcount. If not found - return NULL.
+ */
+struct hl_mmap_mem_buf *hl_mmap_mem_buf_get(struct hl_mem_mgr *mmg, u64 handle)
+{
+	struct hl_mmap_mem_buf *buf;
+
+	spin_lock(&mmg->lock);
+	buf = idr_find(&mmg->handles, lower_32_bits(handle >> PAGE_SHIFT));
+	if (!buf) {
+		spin_unlock(&mmg->lock);
+		dev_warn(mmg->dev,
+			 "Buff get failed, no match to handle %#llx\n", handle);
+		return NULL;
+	}
+	kref_get(&buf->refcount);
+	spin_unlock(&mmg->lock);
+	return buf;
+}
+
+/**
+ * hl_mmap_mem_buf_destroy - destroy the unused buffer
+ *
+ * @buf: memory manager buffer descriptor
+ *
+ * Internal function, used as a final step of buffer release. Shall be invoked
+ * only when the buffer is no longer in use (removed from idr). Will call the
+ * release callback (if applicable), and free the memory.
+ */
+static void hl_mmap_mem_buf_destroy(struct hl_mmap_mem_buf *buf)
+{
+	if (buf->behavior->release)
+		buf->behavior->release(buf);
+
+	kfree(buf);
+}
+
+/**
+ * hl_mmap_mem_buf_release - release buffer
+ *
+ * @kref: kref that reached 0.
+ *
+ * Internal function, used as a kref release callback, when the last user of
+ * the buffer is released. Shall be called from an interrupt context.
+ */
+static void hl_mmap_mem_buf_release(struct kref *kref)
+{
+	struct hl_mmap_mem_buf *buf =
+		container_of(kref, struct hl_mmap_mem_buf, refcount);
+
+	spin_lock(&buf->mmg->lock);
+	idr_remove(&buf->mmg->handles, lower_32_bits(buf->handle >> PAGE_SHIFT));
+	spin_unlock(&buf->mmg->lock);
+
+	hl_mmap_mem_buf_destroy(buf);
+}
+
+/**
+ * hl_mmap_mem_buf_remove_idr_locked - remove handle from idr
+ *
+ * @kref: kref that reached 0.
+ *
+ * Internal function, used for kref put by handle. Assumes mmg lock is taken.
+ * Will remove the buffer from idr, without destroying it.
+ */
+static void hl_mmap_mem_buf_remove_idr_locked(struct kref *kref)
+{
+	struct hl_mmap_mem_buf *buf =
+		container_of(kref, struct hl_mmap_mem_buf, refcount);
+
+	idr_remove(&buf->mmg->handles, lower_32_bits(buf->handle >> PAGE_SHIFT));
+}
+
+/**
+ * hl_mmap_mem_buf_put - decrease the reference to the buffer
+ *
+ * @buf: memory manager buffer descriptor
+ *
+ * Decrease the reference to the buffer, and release it if it was the last one.
+ * Shall be called from an interrupt context.
+ */
+int hl_mmap_mem_buf_put(struct hl_mmap_mem_buf *buf)
+{
+	return kref_put(&buf->refcount, hl_mmap_mem_buf_release);
+}
+
+/**
+ * hl_mmap_mem_buf_put_handle - decrease the reference to the buffer with the
+ *                              given handle.
+ *
+ * @mmg: parent unifed memory manager
+ * @handle: requested buffer handle
+ *
+ * Decrease the reference to the buffer, and release it if it was the last one.
+ * Shall not be called from an interrupt context. Return -EINVAL if handle was
+ * not found, else return the put outcome (0 or 1).
+ */
+int hl_mmap_mem_buf_put_handle(struct hl_mem_mgr *mmg, u64 handle)
+{
+	struct hl_mmap_mem_buf *buf;
+
+	spin_lock(&mmg->lock);
+	buf = idr_find(&mmg->handles, lower_32_bits(handle >> PAGE_SHIFT));
+	if (!buf) {
+		spin_unlock(&mmg->lock);
+		dev_dbg(mmg->dev,
+			 "Buff put failed, no match to handle %#llx\n", handle);
+		return -EINVAL;
+	}
+
+	if (kref_put(&buf->refcount, hl_mmap_mem_buf_remove_idr_locked)) {
+		spin_unlock(&mmg->lock);
+		hl_mmap_mem_buf_destroy(buf);
+		return 1;
+	}
+
+	spin_unlock(&mmg->lock);
+	return 0;
+}
+
+/**
+ * @hl_mmap_mem_buf_alloc - allocate a new mappable buffer
+ *
+ * @mmg: parent unifed memory manager
+ * @behavior: behavior object describing this buffer polymorphic behavior
+ * @gfp: gfp flags to use for the memory allocations
+ * @args: additional args passed to behavior->alloc
+ *
+ * Allocate and register a new memory buffer inside the give memory manager.
+ * Return the pointer to the new buffer on success or NULL on failure.
+ */
+struct hl_mmap_mem_buf *
+hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg,
+		      struct hl_mmap_mem_buf_behavior *behavior, gfp_t gfp,
+		      void *args)
+{
+	struct hl_mmap_mem_buf *buf;
+	int rc;
+
+	buf = kzalloc(sizeof(*buf), gfp);
+	if (!buf)
+		return NULL;
+
+	spin_lock(&mmg->lock);
+	rc = idr_alloc(&mmg->handles, buf, 1, 0, GFP_ATOMIC);
+	spin_unlock(&mmg->lock);
+	if (rc < 0) {
+		dev_err(mmg->dev,
+			"%s: Failed to allocate IDR for a new buffer, rc=%d\n",
+			behavior->topic, rc);
+		goto free_buf;
+	}
+
+	buf->mmg = mmg;
+	buf->behavior = behavior;
+	buf->handle = (((u64)rc | buf->behavior->mem_id) << PAGE_SHIFT);
+	kref_init(&buf->refcount);
+
+	rc = buf->behavior->alloc(buf, gfp, args);
+	if (rc) {
+		dev_err(mmg->dev, "%s: Failure in buffer alloc callback %d\n",
+			behavior->topic, rc);
+		goto remove_idr;
+	}
+
+	return buf;
+
+remove_idr:
+	spin_lock(&mmg->lock);
+	idr_remove(&mmg->handles, lower_32_bits(buf->handle >> PAGE_SHIFT));
+	spin_unlock(&mmg->lock);
+free_buf:
+	kfree(buf);
+	return NULL;
+}
+
+/**
+ * hl_mmap_mem_buf_vm_close - handle mmap close
+ *
+ * @vma: the vma object for which mmap was closed.
+ *
+ * Put the memory buffer if it is no longer mapped.
+ */
+static void hl_mmap_mem_buf_vm_close(struct vm_area_struct *vma)
+{
+	struct hl_mmap_mem_buf *buf =
+		(struct hl_mmap_mem_buf *)vma->vm_private_data;
+	long new_mmap_size;
+
+	new_mmap_size = buf->real_mapped_size - (vma->vm_end - vma->vm_start);
+
+	if (new_mmap_size > 0) {
+		buf->real_mapped_size = new_mmap_size;
+		return;
+	}
+
+	atomic_set(&buf->mmap, 0);
+	hl_mmap_mem_buf_put(buf);
+	vma->vm_private_data = NULL;
+}
+
+static const struct vm_operations_struct hl_mmap_mem_buf_vm_ops = {
+	.close = hl_mmap_mem_buf_vm_close
+};
+
+/**
+ * hl_mem_mgr_mmap - map the given buffer to the user
+ *
+ * @mmg: unifed memory manager
+ * @vma: the vma object for which mmap was closed.
+ * @args: additional args passed to behavior->mmap
+ *
+ * Map the buffer specified by the vma->vm_pgoff to the given vma.
+ */
+int hl_mem_mgr_mmap(struct hl_mem_mgr *mmg, struct vm_area_struct *vma,
+		    void *args)
+{
+	struct hl_mmap_mem_buf *buf;
+	u64 user_mem_size;
+	u64 handle;
+	int rc;
+
+	/* We use the page offset to hold the idr and thus we need to clear
+	 * it before doing the mmap itself
+	 */
+	handle = vma->vm_pgoff << PAGE_SHIFT;
+	vma->vm_pgoff = 0;
+
+	/* Reference was taken here */
+	buf = hl_mmap_mem_buf_get(mmg, handle);
+	if (!buf) {
+		dev_err(mmg->dev,
+			"Memory mmap failed, no match to handle %#llx\n", handle);
+		return -EINVAL;
+	}
+
+	/* Validation check */
+	user_mem_size = vma->vm_end - vma->vm_start;
+	if (user_mem_size != ALIGN(buf->mappable_size, PAGE_SIZE)) {
+		dev_err(mmg->dev,
+			"%s: Memory mmap failed, mmap VM size 0x%llx != 0x%llx allocated physical mem size\n",
+			buf->behavior->topic, user_mem_size, buf->mappable_size);
+		rc = -EINVAL;
+		goto put_mem;
+	}
+
+#ifdef _HAS_TYPE_ARG_IN_ACCESS_OK
+	if (!access_ok(VERIFY_WRITE, (void __user *)(uintptr_t)vma->vm_start,
+		       user_mem_size)) {
+#else
+	if (!access_ok((void __user *)(uintptr_t)vma->vm_start,
+		       user_mem_size)) {
+#endif
+		dev_err(mmg->dev, "%s: User pointer is invalid - 0x%lx\n",
+			buf->behavior->topic, vma->vm_start);
+
+		rc = -EINVAL;
+		goto put_mem;
+	}
+
+	if (atomic_cmpxchg(&buf->mmap, 0, 1)) {
+		dev_err(mmg->dev,
+			"%s, Memory mmap failed, already mmaped to user\n",
+			buf->behavior->topic);
+		rc = -EINVAL;
+		goto put_mem;
+	}
+
+	vma->vm_ops = &hl_mmap_mem_buf_vm_ops;
+
+	/* Note: We're transferring the memory reference to vma->vm_private_data here. */
+
+	vma->vm_private_data = buf;
+
+	rc = buf->behavior->mmap(buf, vma, args);
+	if (rc) {
+		atomic_set(&buf->mmap, 0);
+		goto put_mem;
+	}
+
+	buf->real_mapped_size = buf->mappable_size;
+	vma->vm_pgoff = handle >> PAGE_SHIFT;
+
+	return 0;
+
+put_mem:
+	hl_mmap_mem_buf_put(buf);
+	return rc;
+}
+
+/**
+ * hl_mem_mgr_init - initialize unified memory manager
+ *
+ * @dev: owner device pointer
+ * @mmg: structure to initialize
+ *
+ * Initialize an instance of unified memory manager
+ */
+void hl_mem_mgr_init(struct device *dev, struct hl_mem_mgr *mmg)
+{
+	mmg->dev = dev;
+	spin_lock_init(&mmg->lock);
+	idr_init(&mmg->handles);
+}
+
+/**
+ * hl_mem_mgr_fini - release unified memory manager
+ *
+ * @mmg: parent unifed memory manager
+ *
+ * Release the unified memory manager. Shall be called from an interrupt context.
+ */
+void hl_mem_mgr_fini(struct hl_mem_mgr *mmg)
+{
+	struct hl_mmap_mem_buf *buf;
+	struct idr *idp;
+	const char *topic;
+	u32 id;
+
+	idp = &mmg->handles;
+
+	idr_for_each_entry(idp, buf, id) {
+		topic = buf->behavior->topic;
+		if (hl_mmap_mem_buf_put(buf) != 1)
+			dev_err(mmg->dev,
+				"%s: Buff handle %u for CTX is still alive\n",
+				topic, id);
+	}
+
+	/* TODO: can it happen that some buffer is still in use at this point? */
+
+	idr_destroy(&mmg->handles);
+}
diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c
index 810b73421ce1..f3734718d94f 100644
--- a/drivers/misc/habanalabs/common/mmu/mmu.c
+++ b/drivers/misc/habanalabs/common/mmu/mmu.c
@@ -9,6 +9,20 @@
 
 #include "../habanalabs.h"
 
+/**
+ * hl_mmu_get_funcs() - get MMU functions structure
+ * @hdev: habanalabs device structure.
+ * @pgt_residency: page table residency.
+ * @is_dram_addr: true if we need HMMU functions
+ *
+ * @return appropriate MMU functions structure
+ */
+static struct hl_mmu_funcs *hl_mmu_get_funcs(struct hl_device *hdev, int pgt_residency,
+									bool is_dram_addr)
+{
+	return &hdev->mmu_func[pgt_residency];
+}
+
 bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
@@ -122,6 +136,53 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx)
 }
 
 /*
+ * hl_mmu_get_real_page_size - get real page size to use in map/unmap operation
+ *
+ * @hdev: pointer to device data.
+ * @mmu_prop: MMU properties.
+ * @page_size: page size
+ * @real_page_size: set here the actual page size to use for the operation
+ * @is_dram_addr: true if DRAM address, otherwise false.
+ *
+ * @return 0 on success, otherwise non 0 error code
+ *
+ * note that this is general implementation that can fit most MMU arch. but as this is used as an
+ * MMU function:
+ * 1. it shall not be called directly- only from mmu_func structure instance
+ * 2. each MMU may modify the implementation internally
+ */
+int hl_mmu_get_real_page_size(struct hl_device *hdev, struct hl_mmu_properties *mmu_prop,
+				u32 page_size, u32 *real_page_size, bool is_dram_addr)
+{
+	/*
+	 * The H/W handles mapping of specific page sizes. Hence if the page
+	 * size is bigger, we break it to sub-pages and map them separately.
+	 */
+	if ((page_size % mmu_prop->page_size) == 0) {
+		*real_page_size = mmu_prop->page_size;
+		return 0;
+	}
+
+	dev_err(hdev->dev, "page size of %u is not %uKB aligned, can't map\n",
+						page_size, mmu_prop->page_size >> 10);
+
+	return -EFAULT;
+}
+
+static struct hl_mmu_properties *hl_mmu_get_prop(struct hl_device *hdev, u32 page_size,
+							bool is_dram_addr)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+
+	if (is_dram_addr)
+		return &prop->dmmu;
+	else if ((page_size % prop->pmmu_huge.page_size) == 0)
+		return &prop->pmmu_huge;
+
+	return &prop->pmmu;
+}
+
+/*
  * hl_mmu_unmap_page - unmaps a virtual addr
  *
  * @ctx: pointer to the context structure
@@ -142,60 +203,35 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx)
  * For optimization reasons PCI flush may be requested once after unmapping of
  * large area.
  */
-int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
-		bool flush_pte)
+int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size, bool flush_pte)
 {
 	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct hl_mmu_properties *mmu_prop;
-	u64 real_virt_addr;
+	struct hl_mmu_funcs *mmu_funcs;
+	int i, pgt_residency, rc = 0;
 	u32 real_page_size, npages;
-	int i, rc = 0, pgt_residency;
+	u64 real_virt_addr;
 	bool is_dram_addr;
 
 	if (!hdev->mmu_enable)
 		return 0;
 
 	is_dram_addr = hl_is_dram_va(hdev, virt_addr);
-
-	if (is_dram_addr)
-		mmu_prop = &prop->dmmu;
-	else if ((page_size % prop->pmmu_huge.page_size) == 0)
-		mmu_prop = &prop->pmmu_huge;
-	else
-		mmu_prop = &prop->pmmu;
+	mmu_prop = hl_mmu_get_prop(hdev, page_size, is_dram_addr);
 
 	pgt_residency = mmu_prop->host_resident ? MMU_HR_PGT : MMU_DR_PGT;
-	/*
-	 * The H/W handles mapping of specific page sizes. Hence if the page
-	 * size is bigger, we break it to sub-pages and unmap them separately.
-	 */
-	if ((page_size % mmu_prop->page_size) == 0) {
-		real_page_size = mmu_prop->page_size;
-	} else {
-		/*
-		 * MMU page size may differ from DRAM page size.
-		 * In such case work with the DRAM page size and let the MMU
-		 * scrambling routine to handle this mismatch when
-		 * calculating the address to remove from the MMU page table
-		 */
-		if (is_dram_addr && ((page_size % prop->dram_page_size) == 0)) {
-			real_page_size = prop->dram_page_size;
-		} else {
-			dev_err(hdev->dev,
-				"page size of %u is not %uKB aligned, can't unmap\n",
-				page_size, mmu_prop->page_size >> 10);
+	mmu_funcs = hl_mmu_get_funcs(hdev, pgt_residency, is_dram_addr);
 
-			return -EFAULT;
-		}
-	}
+	rc = hdev->asic_funcs->mmu_get_real_page_size(hdev, mmu_prop, page_size, &real_page_size,
+							is_dram_addr);
+	if (rc)
+		return rc;
 
 	npages = page_size / real_page_size;
 	real_virt_addr = virt_addr;
 
 	for (i = 0 ; i < npages ; i++) {
-		rc = hdev->mmu_func[pgt_residency].unmap(ctx,
-						real_virt_addr, is_dram_addr);
+		rc = mmu_funcs->unmap(ctx, real_virt_addr, is_dram_addr);
 		if (rc)
 			break;
 
@@ -203,7 +239,7 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 	}
 
 	if (flush_pte)
-		hdev->mmu_func[pgt_residency].flush(ctx);
+		mmu_funcs->flush(ctx);
 
 	return rc;
 }
@@ -230,15 +266,15 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
  * For optimization reasons PCI flush may be requested once after mapping of
  * large area.
  */
-int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
-		u32 page_size, bool flush_pte)
+int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
+			bool flush_pte)
 {
+	int i, rc, pgt_residency, mapped_cnt = 0;
 	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct hl_mmu_properties *mmu_prop;
 	u64 real_virt_addr, real_phys_addr;
+	struct hl_mmu_funcs *mmu_funcs;
 	u32 real_page_size, npages;
-	int i, rc, pgt_residency, mapped_cnt = 0;
 	bool is_dram_addr;
 
 
@@ -246,40 +282,15 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 		return 0;
 
 	is_dram_addr = hl_is_dram_va(hdev, virt_addr);
-
-	if (is_dram_addr)
-		mmu_prop = &prop->dmmu;
-	else if ((page_size % prop->pmmu_huge.page_size) == 0)
-		mmu_prop = &prop->pmmu_huge;
-	else
-		mmu_prop = &prop->pmmu;
+	mmu_prop = hl_mmu_get_prop(hdev, page_size, is_dram_addr);
 
 	pgt_residency = mmu_prop->host_resident ? MMU_HR_PGT : MMU_DR_PGT;
+	mmu_funcs = hl_mmu_get_funcs(hdev, pgt_residency, is_dram_addr);
 
-	/*
-	 * The H/W handles mapping of specific page sizes. Hence if the page
-	 * size is bigger, we break it to sub-pages and map them separately.
-	 */
-	if ((page_size % mmu_prop->page_size) == 0) {
-		real_page_size = mmu_prop->page_size;
-	} else if (is_dram_addr && ((page_size % prop->dram_page_size) == 0) &&
-			(prop->dram_page_size < mmu_prop->page_size)) {
-		/*
-		 * MMU page size may differ from DRAM page size.
-		 * In such case work with the DRAM page size and let the MMU
-		 * scrambling routine handle this mismatch when calculating
-		 * the address to place in the MMU page table. (in that case
-		 * also make sure that the dram_page_size smaller than the
-		 * mmu page size)
-		 */
-		real_page_size = prop->dram_page_size;
-	} else {
-		dev_err(hdev->dev,
-			"page size of %u is not %uKB aligned, can't map\n",
-			page_size, mmu_prop->page_size >> 10);
-
-		return -EFAULT;
-	}
+	rc = hdev->asic_funcs->mmu_get_real_page_size(hdev, mmu_prop, page_size, &real_page_size,
+							is_dram_addr);
+	if (rc)
+		return rc;
 
 	/*
 	 * Verify that the phys and virt addresses are aligned with the
@@ -302,9 +313,8 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 	real_phys_addr = phys_addr;
 
 	for (i = 0 ; i < npages ; i++) {
-		rc = hdev->mmu_func[pgt_residency].map(ctx,
-						real_virt_addr, real_phys_addr,
-						real_page_size, is_dram_addr);
+		rc = mmu_funcs->map(ctx, real_virt_addr, real_phys_addr, real_page_size,
+										is_dram_addr);
 		if (rc)
 			goto err;
 
@@ -314,22 +324,21 @@ int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 	}
 
 	if (flush_pte)
-		hdev->mmu_func[pgt_residency].flush(ctx);
+		mmu_funcs->flush(ctx);
 
 	return 0;
 
 err:
 	real_virt_addr = virt_addr;
 	for (i = 0 ; i < mapped_cnt ; i++) {
-		if (hdev->mmu_func[pgt_residency].unmap(ctx,
-						real_virt_addr, is_dram_addr))
+		if (mmu_funcs->unmap(ctx, real_virt_addr, is_dram_addr))
 			dev_warn_ratelimited(hdev->dev,
 				"failed to unmap va: 0x%llx\n", real_virt_addr);
 
 		real_virt_addr += real_page_size;
 	}
 
-	hdev->mmu_func[pgt_residency].flush(ctx);
+	mmu_funcs->flush(ctx);
 
 	return rc;
 }
@@ -480,11 +489,9 @@ static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr,
 						struct hl_mmu_hop_info *hops,
 						u64 *phys_addr)
 {
-	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
 	u64 offset_mask, addr_mask, hop_shift, tmp_phys_addr;
-	u32 hop0_shift_off;
-	void *p;
+	struct hl_mmu_properties *mmu_prop;
 
 	/* last hop holds the phys address and flags */
 	if (hops->unscrambled_paddr)
@@ -493,11 +500,11 @@ static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr,
 		tmp_phys_addr = hops->hop_info[hops->used_hops - 1].hop_pte_val;
 
 	if (hops->range_type == HL_VA_RANGE_TYPE_HOST_HUGE)
-		p = &prop->pmmu_huge;
+		mmu_prop = &prop->pmmu_huge;
 	else if (hops->range_type == HL_VA_RANGE_TYPE_HOST)
-		p = &prop->pmmu;
+		mmu_prop = &prop->pmmu;
 	else /* HL_VA_RANGE_TYPE_DRAM */
-		p = &prop->dmmu;
+		mmu_prop = &prop->dmmu;
 
 	if ((hops->range_type == HL_VA_RANGE_TYPE_DRAM) &&
 			!is_power_of_2(prop->dram_page_size)) {
@@ -508,7 +515,7 @@ static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr,
 		/*
 		 * Bit arithmetics cannot be used for non power of two page
 		 * sizes. In addition, since bit arithmetics is not used,
-		 * we cannot ignore dram base. All that shall be considerd.
+		 * we cannot ignore dram base. All that shall be considered.
 		 */
 
 		dram_page_size = prop->dram_page_size;
@@ -526,10 +533,7 @@ static void hl_mmu_pa_page_with_offset(struct hl_ctx *ctx, u64 virt_addr,
 		 * structure in order to determine the right masks
 		 * for the page offset.
 		 */
-		hop0_shift_off = offsetof(struct hl_mmu_properties, hop0_shift);
-		p = (char *)p + hop0_shift_off;
-		p = (char *)p + ((hops->used_hops - 1) * sizeof(u64));
-		hop_shift = *(u64 *)p;
+		hop_shift = mmu_prop->hop_shifts[hops->used_hops - 1];
 		offset_mask = (1ull << hop_shift) - 1;
 		addr_mask = ~(offset_mask);
 		*phys_addr = (tmp_phys_addr & addr_mask) |
@@ -557,40 +561,39 @@ int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
 			struct hl_mmu_hop_info *hops)
 {
 	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct asic_fixed_properties *prop;
 	struct hl_mmu_properties *mmu_prop;
-	int rc;
+	struct hl_mmu_funcs *mmu_funcs;
+	int pgt_residency, rc;
 	bool is_dram_addr;
 
 	if (!hdev->mmu_enable)
 		return -EOPNOTSUPP;
 
+	prop = &hdev->asic_prop;
 	hops->scrambled_vaddr = virt_addr;      /* assume no scrambling */
 
 	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
-						prop->dmmu.start_addr,
-						prop->dmmu.end_addr);
+								prop->dmmu.start_addr,
+								prop->dmmu.end_addr);
 
-	/* host-residency is the same in PMMU and HPMMU, use one of them */
+	/* host-residency is the same in PMMU and PMMU huge, no need to distinguish here */
 	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+	pgt_residency = mmu_prop->host_resident ? MMU_HR_PGT : MMU_DR_PGT;
+	mmu_funcs = hl_mmu_get_funcs(hdev, pgt_residency, is_dram_addr);
 
 	mutex_lock(&ctx->mmu_lock);
-
-	if (mmu_prop->host_resident)
-		rc = hdev->mmu_func[MMU_HR_PGT].get_tlb_info(ctx,
-							virt_addr, hops);
-	else
-		rc = hdev->mmu_func[MMU_DR_PGT].get_tlb_info(ctx,
-							virt_addr, hops);
-
+	rc = mmu_funcs->get_tlb_info(ctx, virt_addr, hops);
 	mutex_unlock(&ctx->mmu_lock);
 
+	if (rc)
+		return rc;
+
 	/* add page offset to physical address */
 	if (hops->unscrambled_paddr)
-		hl_mmu_pa_page_with_offset(ctx, virt_addr, hops,
-					&hops->unscrambled_paddr);
+		hl_mmu_pa_page_with_offset(ctx, virt_addr, hops, &hops->unscrambled_paddr);
 
-	return rc;
+	return 0;
 }
 
 int hl_mmu_if_set_funcs(struct hl_device *hdev)
@@ -662,6 +665,55 @@ int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
 	return rc;
 }
 
+static void hl_mmu_prefetch_work_function(struct work_struct *work)
+{
+	struct hl_prefetch_work *pfw = container_of(work, struct hl_prefetch_work, pf_work);
+	struct hl_ctx *ctx = pfw->ctx;
+
+	if (!hl_device_operational(ctx->hdev, NULL))
+		goto put_ctx;
+
+	mutex_lock(&ctx->mmu_lock);
+
+	ctx->hdev->asic_funcs->mmu_prefetch_cache_range(ctx, pfw->flags, pfw->asid,
+								pfw->va, pfw->size);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+put_ctx:
+	/*
+	 * context was taken in the common mmu prefetch function- see comment there about
+	 * context handling.
+	 */
+	hl_ctx_put(ctx);
+	kfree(pfw);
+}
+
+int hl_mmu_prefetch_cache_range(struct hl_ctx *ctx, u32 flags, u32 asid, u64 va, u64 size)
+{
+	struct hl_prefetch_work *handle_pf_work;
+
+	handle_pf_work = kmalloc(sizeof(*handle_pf_work), GFP_KERNEL);
+	if (!handle_pf_work)
+		return -ENOMEM;
+
+	INIT_WORK(&handle_pf_work->pf_work, hl_mmu_prefetch_work_function);
+	handle_pf_work->ctx = ctx;
+	handle_pf_work->va = va;
+	handle_pf_work->size = size;
+	handle_pf_work->flags = flags;
+	handle_pf_work->asid = asid;
+
+	/*
+	 * as actual prefetch is done in a WQ we must get the context (and put it
+	 * at the end of the work function)
+	 */
+	hl_ctx_get(ctx);
+	queue_work(ctx->hdev->pf_wq, &handle_pf_work->pf_work);
+
+	return 0;
+}
+
 u64 hl_mmu_get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
 {
 	return (curr_pte & PAGE_PRESENT_MASK) ? (curr_pte & HOP_PHYS_ADDR_MASK) : ULLONG_MAX;
@@ -670,6 +722,7 @@ u64 hl_mmu_get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
 /**
  * hl_mmu_get_hop_pte_phys_addr() - extract PTE address from HOP
  * @ctx: pointer to the context structure to initialize.
+ * @mmu_prop: MMU properties.
  * @hop_idx: HOP index.
  * @hop_addr: HOP address.
  * @virt_addr: virtual address fro the translation.
@@ -686,33 +739,8 @@ u64 hl_mmu_get_hop_pte_phys_addr(struct hl_ctx *ctx, struct hl_mmu_properties *m
 		return U64_MAX;
 	}
 
-	/* currently max number of HOPs is 6 */
-	switch (hop_idx) {
-	case 0:
-		mask = mmu_prop->hop0_mask;
-		shift = mmu_prop->hop0_shift;
-		break;
-	case 1:
-		mask = mmu_prop->hop1_mask;
-		shift = mmu_prop->hop1_shift;
-		break;
-	case 2:
-		mask = mmu_prop->hop2_mask;
-		shift = mmu_prop->hop2_shift;
-		break;
-	case 3:
-		mask = mmu_prop->hop3_mask;
-		shift = mmu_prop->hop3_shift;
-		break;
-	case 4:
-		mask = mmu_prop->hop4_mask;
-		shift = mmu_prop->hop4_shift;
-		break;
-	default:
-		mask = mmu_prop->hop5_mask;
-		shift = mmu_prop->hop5_shift;
-		break;
-	}
+	shift = mmu_prop->hop_shifts[hop_idx];
+	mask = mmu_prop->hop_masks[hop_idx];
 
 	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size * ((virt_addr & mask) >> shift);
 }
diff --git a/drivers/misc/habanalabs/common/mmu/mmu_v1.c b/drivers/misc/habanalabs/common/mmu/mmu_v1.c
index d03786d0c407..e2d91a69acc2 100644
--- a/drivers/misc/habanalabs/common/mmu/mmu_v1.c
+++ b/drivers/misc/habanalabs/common/mmu/mmu_v1.c
@@ -10,6 +10,8 @@
 
 #include <linux/slab.h>
 
+#define MMU_V1_MAX_HOPS	(MMU_HOP4 + 1)
+
 static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
 
 static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
@@ -170,51 +172,15 @@ static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
 	return num_of_ptes_left;
 }
 
-static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-					u64 virt_addr, u64 mask, u64 shift)
-{
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & mask) >> shift);
-}
-
-static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop0_mask,
-					mmu_prop->hop0_shift);
-}
-
-static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop1_mask,
-					mmu_prop->hop1_shift);
-}
-
-static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
+static inline u64 get_hop_pte_addr(struct hl_ctx *ctx, struct hl_mmu_properties *mmu_prop,
+					u64 *hop_addr_arr, u64 virt_addr, enum mmu_hop_num hop_idx)
 {
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop2_mask,
-					mmu_prop->hop2_shift);
-}
+	u64 mask, shift;
 
-static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop3_mask,
-					mmu_prop->hop3_shift);
-}
-
-static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop4_mask,
-					mmu_prop->hop4_shift);
+	mask = mmu_prop->hop_masks[hop_idx];
+	shift = mmu_prop->hop_shifts[hop_idx];
+	return hop_addr_arr[hop_idx] +
+			ctx->hdev->asic_prop.mmu_pte_size * ((virt_addr & mask) >> shift);
 }
 
 static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
@@ -516,74 +482,50 @@ static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx)
 	}
 }
 
-static int _hl_mmu_v1_unmap(struct hl_ctx *ctx,
+static int hl_mmu_v1_unmap(struct hl_ctx *ctx,
 				u64 virt_addr, bool is_dram_addr)
 {
+	u64 hop_addr[MMU_V1_MAX_HOPS] = {0}, hop_pte_addr[MMU_V1_MAX_HOPS] = {0}, curr_pte = 0;
 	struct hl_device *hdev = ctx->hdev;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct hl_mmu_properties *mmu_prop;
-	u64 hop0_addr = 0, hop0_pte_addr = 0,
-		hop1_addr = 0, hop1_pte_addr = 0,
-		hop2_addr = 0, hop2_pte_addr = 0,
-		hop3_addr = 0, hop3_pte_addr = 0,
-		hop4_addr = 0, hop4_pte_addr = 0,
-		curr_pte;
 	bool is_huge, clear_hop3 = true;
+	int hop_idx;
 
 	/* shifts and masks are the same in PMMU and HPMMU, use one of them */
 	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
 
-	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
-
-	curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
-
-	hop1_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte);
-
-	if (hop1_addr == ULLONG_MAX)
-		goto not_mapped;
-
-	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
-
-	curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
-
-	hop2_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte);
-
-	if (hop2_addr == ULLONG_MAX)
-		goto not_mapped;
-
-	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
-
-	curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
-
-	hop3_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte);
-
-	if (hop3_addr == ULLONG_MAX)
-		goto not_mapped;
+	for (hop_idx = MMU_HOP0; hop_idx < MMU_HOP4; hop_idx++) {
+		if (hop_idx == MMU_HOP0) {
+			hop_addr[hop_idx] = get_hop0_addr(ctx);
+		} else {
+			hop_addr[hop_idx] = hl_mmu_get_next_hop_addr(ctx, curr_pte);
+			if (hop_addr[hop_idx] == ULLONG_MAX)
+				goto not_mapped;
+		}
 
-	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
+		hop_pte_addr[hop_idx] =
+				get_hop_pte_addr(ctx, mmu_prop, hop_addr, virt_addr, hop_idx);
 
-	curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
+		curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[hop_idx];
+	}
 
 	is_huge = curr_pte & mmu_prop->last_mask;
 
 	if (is_dram_addr && !is_huge) {
-		dev_err(hdev->dev,
-				"DRAM unmapping should use huge pages only\n");
+		dev_err(hdev->dev, "DRAM unmapping should use huge pages only\n");
 		return -EFAULT;
 	}
 
 	if (!is_huge) {
-		hop4_addr = hl_mmu_get_next_hop_addr(ctx, curr_pte);
-
-		if (hop4_addr == ULLONG_MAX)
+		hop_idx = MMU_HOP4;
+		hop_addr[hop_idx] = hl_mmu_get_next_hop_addr(ctx, curr_pte);
+		if (hop_addr[hop_idx] == ULLONG_MAX)
 			goto not_mapped;
 
-		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
-							virt_addr);
-
-		curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
-
+		hop_pte_addr[hop_idx] =
+				get_hop_pte_addr(ctx, mmu_prop, hop_addr, virt_addr, hop_idx);
+		curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[hop_idx];
 		clear_hop3 = false;
 	}
 
@@ -605,39 +547,33 @@ static int _hl_mmu_v1_unmap(struct hl_ctx *ctx,
 			goto not_mapped;
 		}
 
-		write_final_pte(ctx, hop3_pte_addr, default_pte);
-		put_pte(ctx, hop3_addr);
+		hop_idx = MMU_HOP3;
+		write_final_pte(ctx, hop_pte_addr[hop_idx], default_pte);
+		put_pte(ctx, hop_addr[hop_idx]);
 	} else {
 		if (!(curr_pte & PAGE_PRESENT_MASK))
 			goto not_mapped;
 
-		if (hop4_addr)
-			clear_pte(ctx, hop4_pte_addr);
+		if (hop_addr[MMU_HOP4])
+			clear_pte(ctx, hop_pte_addr[MMU_HOP4]);
 		else
-			clear_pte(ctx, hop3_pte_addr);
+			clear_pte(ctx, hop_pte_addr[MMU_HOP3]);
 
-		if (hop4_addr && !put_pte(ctx, hop4_addr))
+		if (hop_addr[MMU_HOP4] && !put_pte(ctx, hop_addr[MMU_HOP4]))
 			clear_hop3 = true;
 
 		if (!clear_hop3)
 			goto mapped;
 
-		clear_pte(ctx, hop3_pte_addr);
+		for (hop_idx = MMU_HOP3; hop_idx >= 0; hop_idx--) {
+			clear_pte(ctx, hop_pte_addr[hop_idx]);
 
-		if (put_pte(ctx, hop3_addr))
-			goto mapped;
+			if (hop_idx == MMU_HOP0)
+				break;
 
-		clear_pte(ctx, hop2_pte_addr);
-
-		if (put_pte(ctx, hop2_addr))
-			goto mapped;
-
-		clear_pte(ctx, hop1_pte_addr);
-
-		if (put_pte(ctx, hop1_addr))
-			goto mapped;
-
-		clear_pte(ctx, hop0_pte_addr);
+			if (put_pte(ctx, hop_addr[hop_idx]))
+				goto mapped;
+		}
 	}
 
 mapped:
@@ -650,21 +586,15 @@ not_mapped:
 	return -EINVAL;
 }
 
-static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
+static int hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 			u32 page_size, bool is_dram_addr)
 {
+	u64 hop_addr[MMU_V1_MAX_HOPS] = {0}, hop_pte_addr[MMU_V1_MAX_HOPS] = {0}, curr_pte = 0;
 	struct hl_device *hdev = ctx->hdev;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct hl_mmu_properties *mmu_prop;
-	u64 hop0_addr = 0, hop0_pte_addr = 0,
-		hop1_addr = 0, hop1_pte_addr = 0,
-		hop2_addr = 0, hop2_pte_addr = 0,
-		hop3_addr = 0, hop3_pte_addr = 0,
-		hop4_addr = 0, hop4_pte_addr = 0,
-		curr_pte = 0;
-	bool hop1_new = false, hop2_new = false, hop3_new = false,
-		hop4_new = false, is_huge;
-	int rc = -ENOMEM;
+	bool is_huge, hop_new[MMU_V1_MAX_HOPS] = {false};
+	int num_hops, hop_idx, prev_hop, rc = -ENOMEM;
 
 	/*
 	 * This mapping function can map a page or a huge page. For huge page
@@ -684,39 +614,21 @@ static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 		is_huge = false;
 	}
 
-	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
-	curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
-
-	hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
-	if (hop1_addr == ULLONG_MAX)
-		goto err;
-
-	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
-	curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
-
-	hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
-	if (hop2_addr == ULLONG_MAX)
-		goto err;
-
-	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
-	curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
+	num_hops = is_huge ? (MMU_V1_MAX_HOPS - 1) : MMU_V1_MAX_HOPS;
 
-	hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
-	if (hop3_addr == ULLONG_MAX)
-		goto err;
-
-	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
-	curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
-
-	if (!is_huge) {
-		hop4_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop4_new);
-		if (hop4_addr == ULLONG_MAX)
-			goto err;
+	for (hop_idx = MMU_HOP0; hop_idx < num_hops; hop_idx++) {
+		if (hop_idx == MMU_HOP0) {
+			hop_addr[hop_idx] = get_hop0_addr(ctx);
+		} else {
+			hop_addr[hop_idx] =
+					get_alloc_next_hop_addr(ctx, curr_pte, &hop_new[hop_idx]);
+			if (hop_addr[hop_idx] == ULLONG_MAX)
+				goto err;
+		}
 
-		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
-							virt_addr);
-		curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
+		hop_pte_addr[hop_idx] =
+				get_hop_pte_addr(ctx, mmu_prop, hop_addr, virt_addr, hop_idx);
+		curr_pte = *(u64 *) (uintptr_t) hop_pte_addr[hop_idx];
 	}
 
 	if (hdev->dram_default_page_mapping && is_dram_addr) {
@@ -732,30 +644,22 @@ static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 			goto err;
 		}
 
-		if (hop1_new || hop2_new || hop3_new || hop4_new) {
-			dev_err(hdev->dev,
-				"DRAM mapping should not allocate more hops\n");
-			rc = -EFAULT;
-			goto err;
+		for (hop_idx = MMU_HOP1; hop_idx < num_hops; hop_idx++) {
+			if (hop_new[hop_idx]) {
+				dev_err(hdev->dev, "DRAM mapping should not allocate more hops\n");
+				rc = -EFAULT;
+				goto err;
+			}
 		}
 	} else if (curr_pte & PAGE_PRESENT_MASK) {
 		dev_err(hdev->dev,
 			"mapping already exists for virt_addr 0x%llx\n",
 				virt_addr);
 
-		dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
-			*(u64 *) (uintptr_t) hop0_pte_addr, hop0_pte_addr);
-		dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
-			*(u64 *) (uintptr_t) hop1_pte_addr, hop1_pte_addr);
-		dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
-			*(u64 *) (uintptr_t) hop2_pte_addr, hop2_pte_addr);
-		dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
-			*(u64 *) (uintptr_t) hop3_pte_addr, hop3_pte_addr);
-
-		if (!is_huge)
-			dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
-				*(u64 *) (uintptr_t) hop4_pte_addr,
-				hop4_pte_addr);
+		for (hop_idx = MMU_HOP0; hop_idx < num_hops; hop_idx++)
+			dev_dbg(hdev->dev, "hop%d pte: 0x%llx (0x%llx)\n", hop_idx,
+					*(u64 *) (uintptr_t) hop_pte_addr[hop_idx],
+					hop_pte_addr[hop_idx]);
 
 		rc = -EINVAL;
 		goto err;
@@ -764,53 +668,28 @@ static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 	curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask
 			| PAGE_PRESENT_MASK;
 
-	if (is_huge)
-		write_final_pte(ctx, hop3_pte_addr, curr_pte);
-	else
-		write_final_pte(ctx, hop4_pte_addr, curr_pte);
+	write_final_pte(ctx, hop_pte_addr[num_hops - 1], curr_pte);
 
-	if (hop1_new) {
-		curr_pte =
-			(hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-		write_pte(ctx, hop0_pte_addr, curr_pte);
-	}
-	if (hop2_new) {
-		curr_pte =
-			(hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-		write_pte(ctx, hop1_pte_addr, curr_pte);
-		get_pte(ctx, hop1_addr);
-	}
-	if (hop3_new) {
-		curr_pte =
-			(hop3_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-		write_pte(ctx, hop2_pte_addr, curr_pte);
-		get_pte(ctx, hop2_addr);
-	}
+	for (hop_idx = MMU_HOP1; hop_idx < num_hops; hop_idx++) {
+		prev_hop = hop_idx - 1;
 
-	if (!is_huge) {
-		if (hop4_new) {
-			curr_pte = (hop4_addr & HOP_PHYS_ADDR_MASK) |
-					PAGE_PRESENT_MASK;
-			write_pte(ctx, hop3_pte_addr, curr_pte);
-			get_pte(ctx, hop3_addr);
+		if (hop_new[hop_idx]) {
+			curr_pte = (hop_addr[hop_idx] & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+			write_pte(ctx, hop_pte_addr[prev_hop], curr_pte);
+			if (hop_idx != MMU_HOP1)
+				get_pte(ctx, hop_addr[prev_hop]);
 		}
-
-		get_pte(ctx, hop4_addr);
-	} else {
-		get_pte(ctx, hop3_addr);
 	}
 
+	get_pte(ctx, hop_addr[num_hops - 1]);
+
 	return 0;
 
 err:
-	if (hop4_new)
-		free_hop(ctx, hop4_addr);
-	if (hop3_new)
-		free_hop(ctx, hop3_addr);
-	if (hop2_new)
-		free_hop(ctx, hop2_addr);
-	if (hop1_new)
-		free_hop(ctx, hop1_addr);
+	for (hop_idx = num_hops; hop_idx > MMU_HOP0; hop_idx--) {
+		if (hop_new[hop_idx])
+			free_hop(ctx, hop_addr[hop_idx]);
+	}
 
 	return rc;
 }
@@ -928,8 +807,8 @@ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu)
 	mmu->fini = hl_mmu_v1_fini;
 	mmu->ctx_init = hl_mmu_v1_ctx_init;
 	mmu->ctx_fini = hl_mmu_v1_ctx_fini;
-	mmu->map = _hl_mmu_v1_map;
-	mmu->unmap = _hl_mmu_v1_unmap;
+	mmu->map = hl_mmu_v1_map;
+	mmu->unmap = hl_mmu_v1_unmap;
 	mmu->flush = flush;
 	mmu->swap_out = hl_mmu_v1_swap_out;
 	mmu->swap_in = hl_mmu_v1_swap_in;
diff --git a/drivers/misc/habanalabs/common/pci/pci.c b/drivers/misc/habanalabs/common/pci/pci.c
index bb9ce22bafc4..610acd4a8057 100644
--- a/drivers/misc/habanalabs/common/pci/pci.c
+++ b/drivers/misc/habanalabs/common/pci/pci.c
@@ -392,6 +392,7 @@ enum pci_region hl_get_pci_memory_region(struct hl_device *hdev, u64 addr)
  */
 int hl_pci_init(struct hl_device *hdev)
 {
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct pci_dev *pdev = hdev->pdev;
 	int rc;
 
@@ -419,17 +420,14 @@ int hl_pci_init(struct hl_device *hdev)
 	}
 
 	/* Driver must sleep in order for FW to finish the iATU configuration */
-	if (hdev->asic_prop.iatu_done_by_fw) {
+	if (hdev->asic_prop.iatu_done_by_fw)
 		usleep_range(2000, 3000);
-		hdev->asic_funcs->set_dma_mask_from_fw(hdev);
-	}
 
-	rc = dma_set_mask_and_coherent(&pdev->dev,
-					DMA_BIT_MASK(hdev->dma_mask));
+	rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(prop->dma_mask));
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to set dma mask to %d bits, error %d\n",
-			hdev->dma_mask, rc);
+			prop->dma_mask, rc);
 		goto unmap_pci_bars;
 	}