Diffstat (limited to 'drivers/misc/habanalabs/device.c')
-rw-r--r--  drivers/misc/habanalabs/device.c | 93
1 file changed, 69 insertions(+), 24 deletions(-)
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 77d51be66c7e..91a9e47a3482 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -5,11 +5,14 @@
  * All Rights Reserved.
  */
 
+#define pr_fmt(fmt)	"habanalabs: " fmt
+
 #include "habanalabs.h"
 
 #include <linux/pci.h>
 #include <linux/sched/signal.h>
 #include <linux/hwmon.h>
+#include <uapi/misc/habanalabs.h>
 
 #define HL_PLDM_PENDING_RESET_PER_SEC	(HL_PENDING_RESET_PER_SEC * 10)
@@ -21,6 +24,20 @@ bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
 	return false;
 }
 
+enum hl_device_status hl_device_status(struct hl_device *hdev)
+{
+	enum hl_device_status status;
+
+	if (hdev->disabled)
+		status = HL_DEVICE_STATUS_MALFUNCTION;
+	else if (atomic_read(&hdev->in_reset))
+		status = HL_DEVICE_STATUS_IN_RESET;
+	else
+		status = HL_DEVICE_STATUS_OPERATIONAL;
+
+	return status;
+};
+
 static void hpriv_release(struct kref *ref)
 {
 	struct hl_fpriv *hpriv;
@@ -498,11 +515,8 @@ disable_device:
 	return rc;
 }
 
-static void hl_device_hard_reset_pending(struct work_struct *work)
+static void device_kill_open_processes(struct hl_device *hdev)
 {
-	struct hl_device_reset_work *device_reset_work =
-		container_of(work, struct hl_device_reset_work, reset_work);
-	struct hl_device *hdev = device_reset_work->hdev;
 	u16 pending_total, pending_cnt;
 	struct task_struct *task = NULL;
@@ -537,6 +551,12 @@ static void hl_device_hard_reset_pending(struct work_struct *work)
 		}
 	}
 
+	/* We killed the open users, but because the driver cleans up after the
+	 * user contexts are closed (e.g. mmu mappings), we need to wait again
+	 * to make sure the cleaning phase is finished before continuing with
+	 * the reset
+	 */
+
 	pending_cnt = pending_total;
 
 	while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {
@@ -552,6 +572,16 @@
 	mutex_unlock(&hdev->fd_open_cnt_lock);
+}
+
+static void device_hard_reset_pending(struct work_struct *work)
+{
+	struct hl_device_reset_work *device_reset_work =
+		container_of(work, struct hl_device_reset_work, reset_work);
+	struct hl_device *hdev = device_reset_work->hdev;
+
+	device_kill_open_processes(hdev);
+
 	hl_device_reset(hdev, true, true);
 
 	kfree(device_reset_work);
@@ -613,6 +643,8 @@ again:
 	if ((hard_reset) && (!from_hard_reset_thread)) {
 		struct hl_device_reset_work *device_reset_work;
 
+		hdev->hard_reset_pending = true;
+
 		if (!hdev->pdev) {
 			dev_err(hdev->dev,
 				"Reset action is NOT supported in simulator\n");
@@ -620,8 +652,6 @@ again:
 			goto out_err;
 		}
 
-		hdev->hard_reset_pending = true;
-
 		device_reset_work = kzalloc(sizeof(*device_reset_work),
 						GFP_ATOMIC);
 		if (!device_reset_work) {
@@ -635,7 +665,7 @@ again:
		 * from a dedicated work
		 */
 		INIT_WORK(&device_reset_work->reset_work,
-				hl_device_hard_reset_pending);
+				device_hard_reset_pending);
 		device_reset_work->hdev = hdev;
 		schedule_work(&device_reset_work->reset_work);
@@ -663,17 +693,9 @@ again:
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
 
-	if (hard_reset) {
-		/* Release kernel context */
-		if (hl_ctx_put(hdev->kernel_ctx) != 1) {
-			dev_err(hdev->dev,
-				"kernel ctx is alive during hard reset\n");
-			rc = -EBUSY;
-			goto out_err;
-		}
-
+	/* Release kernel context */
+	if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
 		hdev->kernel_ctx = NULL;
-	}
 
 	/* Reset the H/W. It will be in idle state after this returns */
 	hdev->asic_funcs->hw_fini(hdev, hard_reset);
@@ -688,16 +710,24 @@ again:
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 		hl_cq_reset(hdev, &hdev->completion_queue[i]);
 
-	/* Make sure the setup phase for the user context will run again */
+	/* Make sure the context switch phase will run again */
 	if (hdev->user_ctx) {
-		atomic_set(&hdev->user_ctx->thread_restore_token, 1);
-		hdev->user_ctx->thread_restore_wait_token = 0;
+		atomic_set(&hdev->user_ctx->thread_ctx_switch_token, 1);
+		hdev->user_ctx->thread_ctx_switch_wait_token = 0;
 	}
 
 	/* Finished tear-down, starting to re-initialize */
 	if (hard_reset) {
 		hdev->device_cpu_disabled = false;
+		hdev->hard_reset_pending = false;
+
+		if (hdev->kernel_ctx) {
+			dev_crit(hdev->dev,
+				"kernel ctx was alive during hard reset, something is terribly wrong\n");
+			rc = -EBUSY;
+			goto out_err;
+		}
 
 		/* Allocate the kernel context */
 		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
@@ -752,8 +782,6 @@ again:
 		}
 
 		hl_set_max_power(hdev, hdev->max_power);
-
-		hdev->hard_reset_pending = false;
 	} else {
 		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
 		if (rc) {
@@ -1030,11 +1058,22 @@ void hl_device_fini(struct hl_device *hdev)
 			WARN(1, "Failed to remove device because reset function did not finish\n");
 			return;
 		}
-	};
+	}
 
 	/* Mark device as disabled */
 	hdev->disabled = true;
 
+	/*
+	 * Flush anyone that is inside the critical section of enqueue
+	 * jobs to the H/W
+	 */
+	hdev->asic_funcs->hw_queues_lock(hdev);
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	hdev->hard_reset_pending = true;
+
+	device_kill_open_processes(hdev);
+
 	hl_hwmon_fini(hdev);
 
 	device_late_fini(hdev);
@@ -1108,7 +1147,13 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr,
	 * either by the direct access of the device or by another core
	 */
 	u32 *paddr = (u32 *) (uintptr_t) addr;
-	ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);
+	ktime_t timeout;
+
+	/* timeout should be longer when working with simulator */
+	if (!hdev->pdev)
+		timeout_us *= 10;
+
+	timeout = ktime_add_us(ktime_get(), timeout_us);
 
 	might_sleep();
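
The last hunk makes hl_poll_timeout_memory() multiply the caller's timeout by 10 when hdev->pdev is NULL, i.e. when the driver runs against the simulator instead of a real PCI device. The sketch below is a minimal userspace re-creation of that poll-until-value-or-timeout pattern, not driver code: the function name poll_timeout_memory, the clock_gettime()/usleep() plumbing and the 1 ms backoff are assumptions made so the example is self-contained (the driver itself uses ktime_add_us() and might_sleep(), as shown in the diff).

/*
 * Illustrative sketch only: poll a memory location until it holds the
 * expected value or a timeout expires, with a 10x longer timeout when
 * running against a (slower) simulated device, mirroring the pattern in
 * hl_poll_timeout_memory() above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

static uint64_t now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000ull + ts.tv_nsec / 1000;
}

/* Returns 0 when *addr reached 'expected', -1 on timeout. */
static int poll_timeout_memory(volatile uint32_t *addr, uint32_t expected,
			       uint64_t timeout_us, bool simulator)
{
	uint64_t deadline;

	/* Timeout should be longer when working with a simulator */
	if (simulator)
		timeout_us *= 10;

	deadline = now_us() + timeout_us;

	for (;;) {
		if (*addr == expected)
			return 0;
		if (now_us() >= deadline)
			return -1;	/* timed out */
		usleep(1000);		/* back off between samples */
	}
}

Multiplying the timeout rather than the sampling interval keeps the polling rate unchanged while simply giving the slower simulated device more wall-clock time to respond.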