habanalabs: improve hard reset procedure

We want to handle the scenario in which the driver was not able to kill all user processes due to many memory mappings. In that case, we need to retry after some period, releasing the cores in the meantime. The devices will be unusable and in "in-reset" status during that time.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
author: Ofir Bitton <obitton@habana.ai> 2020-10-08 10:27:42 +0300
committer: Oded Gabbay <ogabbay@kernel.org> 2020-11-30 10:47:34 +0200
commit: adb51298fdc41b89f47c520171d86e4859cc7731 (patch)
tree: 376f52afa6005ee756d27319ea07dad2c1f909a0 /drivers/misc/habanalabs/common/device.c
parent: 804a72276cc4a1e27822d611907350b74f7fe561 (diff)
1 files changed, 92 insertions, 36 deletions
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index ce0a1270e5ff..c9011541c647 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -13,8 +13,6 @@
 #include <linux/hwmon.h>
 #include <uapi/misc/habanalabs.h>
 
-#define HL_PLDM_PENDING_RESET_PER_SEC	(HL_PENDING_RESET_PER_SEC * 10)
-
 enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
 	enum hl_device_status status;
@@ -256,6 +254,26 @@ static void device_cdev_sysfs_del(struct hl_device *hdev)
 	cdev_device_del(&hdev->cdev, hdev->dev);
 }
 
+static void device_hard_reset_pending(struct work_struct *work)
+{
+	struct hl_device_reset_work *device_reset_work =
+		container_of(work, struct hl_device_reset_work,
+				reset_work.work);
+	struct hl_device *hdev = device_reset_work->hdev;
+	int rc;
+
+	rc = hl_device_reset(hdev, true, true);
+	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
+		dev_info(hdev->dev,
+			"Could not reset device. will try again in %u seconds",
+			HL_PENDING_RESET_PER_SEC);
+
+		queue_delayed_work(device_reset_work->wq,
+			&device_reset_work->reset_work,
+			msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000));
+	}
+}
+
 /*
  * device_early_init - do some early initialization for the habanalabs device
  *
@@ -340,6 +358,19 @@ static int device_early_init(struct hl_device *hdev)
 
 	hl_cb_mgr_init(&hdev->kernel_cb_mgr);
 
+	hdev->device_reset_work.wq =
+			create_singlethread_workqueue("hl_device_reset");
+	if (!hdev->device_reset_work.wq) {
+		rc = -ENOMEM;
+		dev_err(hdev->dev, "Failed to create device reset WQ\n");
+		goto free_cb_mgr;
+	}
+
+	INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work,
+			device_hard_reset_pending);
+	hdev->device_reset_work.hdev = hdev;
+	hdev->device_fini_pending = 0;
+
 	mutex_init(&hdev->send_cpu_message_lock);
 	mutex_init(&hdev->debug_lock);
 	mutex_init(&hdev->mmu_cache_lock);
@@ -351,6 +382,8 @@ static int device_early_init(struct hl_device *hdev)
 
 	return 0;
 
+free_cb_mgr:
+	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
 free_idle_busy_ts_arr:
 	kfree(hdev->idle_busy_ts_arr);
 free_chip_info:
@@ -393,6 +426,7 @@ static void device_early_fini(struct hl_device *hdev)
 	kfree(hdev->hl_chip_info);
 
 	destroy_workqueue(hdev->eq_wq);
+	destroy_workqueue(hdev->device_reset_work.wq);
 
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 		destroy_workqueue(hdev->cq_wq[i]);
@@ -771,16 +805,12 @@ disable_device:
 	return rc;
 }
 
-static int device_kill_open_processes(struct hl_device *hdev)
+static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
 {
-	u16 pending_total, pending_cnt;
 	struct hl_fpriv	*hpriv;
 	struct task_struct *task = NULL;
+	u32 pending_cnt;
 
-	if (hdev->pldm)
-		pending_total = HL_PLDM_PENDING_RESET_PER_SEC;
-	else
-		pending_total = HL_PENDING_RESET_PER_SEC;
 
 	/* Giving time for user to close FD, and for processes that are inside
 	 * hl_device_open to finish
@@ -788,6 +818,19 @@ static int device_kill_open_processes(struct hl_device *hdev)
 	if (!list_empty(&hdev->fpriv_list))
 		ssleep(1);
 
+	if (timeout) {
+		pending_cnt = timeout;
+	} else {
+		if (hdev->process_kill_trial_cnt) {
+			/* Processes have been already killed */
+			pending_cnt = 1;
+			goto wait_for_processes;
+		} else {
+			/* Wait a small period after process kill */
+			pending_cnt = HL_PENDING_RESET_PER_SEC;
+		}
+	}
+
 	mutex_lock(&hdev->fpriv_list_lock);
 
 	/* This section must be protected because we are dereferencing
@@ -816,29 +859,27 @@ static int device_kill_open_processes(struct hl_device *hdev)
 	 * continuing with the reset.
 	 */
 
-	pending_cnt = pending_total;
-
+wait_for_processes:
 	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
-		dev_info(hdev->dev,
-			"Waiting for all user contexts to get closed before hard reset\n");
+		dev_dbg(hdev->dev,
+			"Waiting for all unmap operations to finish before hard reset\n");
 
 		pending_cnt--;
 
 		ssleep(1);
 	}
 
-	return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
-}
+	/* All processes exited successfully */
+	if (list_empty(&hdev->fpriv_list))
+		return 0;
 
-static void device_hard_reset_pending(struct work_struct *work)
-{
-	struct hl_device_reset_work *device_reset_work =
-		container_of(work, struct hl_device_reset_work, reset_work);
-	struct hl_device *hdev = device_reset_work->hdev;
+	/* Give up waiting for processes to exit */
+	if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS)
+		return -ETIME;
 
-	hl_device_reset(hdev, true, true);
+	hdev->process_kill_trial_cnt++;
 
-	kfree(device_reset_work);
+	return -EBUSY;
 }
 
 /*
@@ -875,6 +916,10 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 		hard_reset = true;
 	}
 
+	/* Re-entry of reset thread */
+	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
+		goto kill_processes;
+
 	/*
 	 * Prevent concurrency in this function - only one reset should be
 	 * done at any given time. Only need to perform this if we didn't
@@ -920,26 +965,17 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 
 again:
 	if ((hard_reset) && (!from_hard_reset_thread)) {
-		struct hl_device_reset_work *device_reset_work;
-
 		hdev->hard_reset_pending = true;
 
-		device_reset_work = kzalloc(sizeof(*device_reset_work),
-						GFP_ATOMIC);
-		if (!device_reset_work) {
-			rc = -ENOMEM;
-			goto out_err;
-		}
+		hdev->process_kill_trial_cnt = 0;
 
 		/*
 		 * Because the reset function can't run from interrupt or
 		 * from heartbeat work, we need to call the reset function
 		 * from a dedicated work
 		 */
-		INIT_WORK(&device_reset_work->reset_work,
-				device_hard_reset_pending);
-		device_reset_work->hdev = hdev;
-		schedule_work(&device_reset_work->reset_work);
+		queue_delayed_work(hdev->device_reset_work.wq,
+			&hdev->device_reset_work.reset_work, 0);
 
 		return 0;
 	}
@@ -965,12 +1001,25 @@ again:
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
 
+kill_processes:
 	if (hard_reset) {
 		/* Kill processes here after CS rollback. This is because the
 		 * process can't really exit until all its CSs are done, which
 		 * is what we do in cs rollback
 		 */
-		rc = device_kill_open_processes(hdev);
+		rc = device_kill_open_processes(hdev, 0);
+
+		if (rc == -EBUSY) {
+			if (hdev->device_fini_pending) {
+				dev_crit(hdev->dev,
+					"Failed to kill all open processes, stopping hard reset\n");
+				goto out_err;
+			}
+
+			/* signal reset thread to reschedule */
+			return rc;
+		}
+
 		if (rc) {
 			dev_crit(hdev->dev,
 				"Failed to kill all open processes, stopping hard reset\n");
@@ -1408,11 +1457,14 @@ out_disabled:
  */
 void hl_device_fini(struct hl_device *hdev)
 {
-	int i, rc;
 	ktime_t timeout;
+	int i, rc;
 
 	dev_info(hdev->dev, "Removing device\n");
 
+	hdev->device_fini_pending = 1;
+	flush_delayed_work(&hdev->device_reset_work.reset_work);
+
 	/*
 	 * This function is competing with the reset function, so try to
 	 * take the reset atomic and if we are already in middle of reset,
@@ -1468,7 +1520,11 @@ void hl_device_fini(struct hl_device *hdev)
 	 * can't really exit until all its CSs are done, which is what we
 	 * do in cs rollback
 	 */
-	rc = device_kill_open_processes(hdev);
+	dev_info(hdev->dev,
+		"Waiting for all processes to exit (timeout of %u seconds)",
+		HL_PENDING_RESET_LONG_SEC);
+
+	rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC);
 	if (rc)
 		dev_crit(hdev->dev, "Failed to kill all open processes\n");
author	Ofir Bitton <obitton@habana.ai>	2020-10-08 10:27:42 +0300
committer	Oded Gabbay <ogabbay@kernel.org>	2020-11-30 10:47:34 +0200
commit	adb51298fdc41b89f47c520171d86e4859cc7731 (patch)
tree	376f52afa6005ee756d27319ea07dad2c1f909a0 /drivers/misc/habanalabs/common/device.c
parent	804a72276cc4a1e27822d611907350b74f7fe561 (diff)