diff options
author | Tomer Tayar <ttayar@habana.ai> | 2022-11-14 13:26:21 +0200 |
---|---|---|
committer | Oded Gabbay <ogabbay@kernel.org> | 2022-11-23 16:45:23 +0200 |
commit | 408c46bd6eb7a4e2fb9fd686218e4a13b9de844c (patch) | |
tree | a91cd60f8b1c19bf95e3edcbb17301a18f7bca40 /drivers/misc | |
parent | 0abcae8b48850e0f488d0eb7232323d93bdc4b13 (diff) |
habanalabs: print context refcount value if hard reset fails
Failing to kill a user process during a hard reset can be due to a
reference to the user context which isn't released.
To make it easier to understand if this is the reason for the failure and
not something else, add a print of the context refcount value.
Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc')
-rw-r--r-- | drivers/misc/habanalabs/common/device.c | 18 |
1 files changed, 15 insertions, 3 deletions
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index f5864893237c..926f230def56 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -696,10 +696,22 @@ static void device_hard_reset_pending(struct work_struct *work) flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR; rc = hl_device_reset(hdev, flags); + if ((rc == -EBUSY) && !hdev->device_fini_pending) { - dev_info(hdev->dev, - "Could not reset device. will try again in %u seconds", - HL_PENDING_RESET_PER_SEC); + struct hl_ctx *ctx = hl_get_compute_ctx(hdev); + + if (ctx) { + /* The read refcount value should subtracted by one, because the read is + * protected with hl_get_compute_ctx(). + */ + dev_info(hdev->dev, + "Could not reset device (compute_ctx refcount %u). will try again in %u seconds", + kref_read(&ctx->refcount) - 1, HL_PENDING_RESET_PER_SEC); + hl_ctx_put(ctx); + } else { + dev_info(hdev->dev, "Could not reset device. will try again in %u seconds", + HL_PENDING_RESET_PER_SEC); + } queue_delayed_work(hdev->reset_wq, &device_reset_work->reset_work, msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000)); |