Diffstat (limited to 'drivers/misc/habanalabs/device.c')
-rw-r--r--  drivers/misc/habanalabs/device.c | 93
1 file changed, 69 insertions(+), 24 deletions(-)
diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 77d51be66c7e..91a9e47a3482 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -5,11 +5,14 @@
  * All Rights Reserved.
  */
 
+#define pr_fmt(fmt)	"habanalabs: " fmt
+
 #include "habanalabs.h"
 
 #include <linux/pci.h>
 #include <linux/sched/signal.h>
 #include <linux/hwmon.h>
+#include <uapi/misc/habanalabs.h>
 
 #define HL_PLDM_PENDING_RESET_PER_SEC	(HL_PENDING_RESET_PER_SEC * 10)
@@ -21,6 +24,20 @@ bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
 	return false;
 }
 
+enum hl_device_status hl_device_status(struct hl_device *hdev)
+{
+	enum hl_device_status status;
+
+	if (hdev->disabled)
+		status = HL_DEVICE_STATUS_MALFUNCTION;
+	else if (atomic_read(&hdev->in_reset))
+		status = HL_DEVICE_STATUS_IN_RESET;
+	else
+		status = HL_DEVICE_STATUS_OPERATIONAL;
+
+	return status;
+};
+
 static void hpriv_release(struct kref *ref)
 {
 	struct hl_fpriv *hpriv;
@@ -498,11 +515,8 @@ disable_device:
 	return rc;
 }
 
-static void hl_device_hard_reset_pending(struct work_struct *work)
+static void device_kill_open_processes(struct hl_device *hdev)
 {
-	struct hl_device_reset_work *device_reset_work =
-		container_of(work, struct hl_device_reset_work, reset_work);
-	struct hl_device *hdev = device_reset_work->hdev;
 	u16 pending_total, pending_cnt;
 	struct task_struct *task = NULL;
@@ -537,6 +551,12 @@ static void hl_device_hard_reset_pending(struct work_struct *work)
 		}
 	}
 
+	/* We killed the open users, but because the driver cleans up after the
+	 * user contexts are closed (e.g. mmu mappings), we need to wait again
+	 * to make sure the cleaning phase is finished before continuing with
+	 * the reset
+	 */
+
 	pending_cnt = pending_total;
 
 	while ((atomic_read(&hdev->fd_open_cnt)) && (pending_cnt)) {
@@ -552,6 +572,16 @@
 	mutex_unlock(&hdev->fd_open_cnt_lock);
+}
+
+static void device_hard_reset_pending(struct work_struct *work)
+{
+	struct hl_device_reset_work *device_reset_work =
+		container_of(work, struct hl_device_reset_work, reset_work);
+	struct hl_device *hdev = device_reset_work->hdev;
+
+	device_kill_open_processes(hdev);
+
 	hl_device_reset(hdev, true, true);
 
 	kfree(device_reset_work);
@@ -613,6 +643,8 @@ again:
 	if ((hard_reset) && (!from_hard_reset_thread)) {
 		struct hl_device_reset_work *device_reset_work;
 
+		hdev->hard_reset_pending = true;
+
 		if (!hdev->pdev) {
 			dev_err(hdev->dev,
 				"Reset action is NOT supported in simulator\n");
@@ -620,8 +652,6 @@ again:
 			goto out_err;
 		}
 
-		hdev->hard_reset_pending = true;
-
 		device_reset_work = kzalloc(sizeof(*device_reset_work),
 						GFP_ATOMIC);
 		if (!device_reset_work) {
@@ -635,7 +665,7 @@ again:
		 * from a dedicated work
		 */
 		INIT_WORK(&device_reset_work->reset_work,
-				hl_device_hard_reset_pending);
+				device_hard_reset_pending);
 		device_reset_work->hdev = hdev;
 		schedule_work(&device_reset_work->reset_work);
@@ -663,17 +693,9 @@ again:
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
 
-	if (hard_reset) {
-		/* Release kernel context */
-		if (hl_ctx_put(hdev->kernel_ctx) != 1) {
-			dev_err(hdev->dev,
-				"kernel ctx is alive during hard reset\n");
-			rc = -EBUSY;
-			goto out_err;
-		}
-
+	/* Release kernel context */
+	if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
 		hdev->kernel_ctx = NULL;
-	}
 
 	/* Reset the H/W. It will be in idle state after this returns */
 	hdev->asic_funcs->hw_fini(hdev, hard_reset);
@@ -688,16 +710,24 @@ again:
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 		hl_cq_reset(hdev, &hdev->completion_queue[i]);
 
-	/* Make sure the setup phase for the user context will run again */
+	/* Make sure the context switch phase will run again */
 	if (hdev->user_ctx) {
-		atomic_set(&hdev->user_ctx->thread_restore_token, 1);
-		hdev->user_ctx->thread_restore_wait_token = 0;
+		atomic_set(&hdev->user_ctx->thread_ctx_switch_token, 1);
+		hdev->user_ctx->thread_ctx_switch_wait_token = 0;
 	}
 
 	/* Finished tear-down, starting to re-initialize */
 	if (hard_reset) {
 		hdev->device_cpu_disabled = false;
+		hdev->hard_reset_pending = false;
+
+		if (hdev->kernel_ctx) {
+			dev_crit(hdev->dev,
+				"kernel ctx was alive during hard reset, something is terribly wrong\n");
+			rc = -EBUSY;
+			goto out_err;
+		}
 
 		/* Allocate the kernel context */
 		hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
@@ -752,8 +782,6 @@ again:
 		}
 
 		hl_set_max_power(hdev, hdev->max_power);
-
-		hdev->hard_reset_pending = false;
 	} else {
 		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
 		if (rc) {
@@ -1030,11 +1058,22 @@ void hl_device_fini(struct hl_device *hdev)
 			WARN(1, "Failed to remove device because reset function did not finish\n");
 			return;
 		}
-	};
+	}
 
 	/* Mark device as disabled */
 	hdev->disabled = true;
 
+	/*
+	 * Flush anyone that is inside the critical section of enqueue
+	 * jobs to the H/W
+	 */
+	hdev->asic_funcs->hw_queues_lock(hdev);
+	hdev->asic_funcs->hw_queues_unlock(hdev);
+
+	hdev->hard_reset_pending = true;
+
+	device_kill_open_processes(hdev);
+
 	hl_hwmon_fini(hdev);
 
 	device_late_fini(hdev);
@@ -1108,7 +1147,13 @@ int hl_poll_timeout_memory(struct hl_device *hdev, u64 addr,
	 * either by the direct access of the device or by another core
	 */
 	u32 *paddr = (u32 *) (uintptr_t) addr;
-	ktime_t timeout = ktime_add_us(ktime_get(), timeout_us);
+	ktime_t timeout;
+
+	/* timeout should be longer when working with simulator */
+	if (!hdev->pdev)
+		timeout_us *= 10;
+
+	timeout = ktime_add_us(ktime_get(), timeout_us);
 
 	might_sleep();
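
The last hunk makes hl_poll_timeout_memory() multiply the caller's timeout by 10 when hdev->pdev is NULL, i.e. when the driver runs against the simulator instead of a real PCI device. The sketch below is a minimal userspace re-creation of that poll-until-value-or-timeout pattern, not driver code: the function name poll_timeout_memory, the clock_gettime()/usleep() plumbing and the 1 ms backoff are assumptions made so the example is self-contained (the driver itself uses ktime_add_us() and might_sleep(), as shown in the diff).

/*
 * Illustrative sketch only: poll a memory location until it holds the
 * expected value or a timeout expires, with a 10x longer timeout when
 * running against a (slower) simulated device, mirroring the pattern in
 * hl_poll_timeout_memory() above.
 */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

static uint64_t now_us(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000ull + ts.tv_nsec / 1000;
}

/* Returns 0 when *addr reached 'expected', -1 on timeout. */
static int poll_timeout_memory(volatile uint32_t *addr, uint32_t expected,
			       uint64_t timeout_us, bool simulator)
{
	uint64_t deadline;

	/* Timeout should be longer when working with a simulator */
	if (simulator)
		timeout_us *= 10;

	deadline = now_us() + timeout_us;

	for (;;) {
		if (*addr == expected)
			return 0;
		if (now_us() >= deadline)
			return -1;	/* timed out */
		usleep(1000);		/* back off between samples */
	}
}

Multiplying the timeout rather than the sampling interval keeps the polling rate unchanged while simply giving the slower simulated device more wall-clock time to respond.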