habanalabs: fail reset if device is not idle

After any reset (soft or hard), the device (i.e. its engines/QMANs) should
be idle. If it is not idle, fail the reset. If the failed reset was a
soft-reset, the driver will automatically try a hard-reset. If it was a
hard-reset, the driver will make the device non-operational.

Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Oded Gabbay 2021-02-16 22:46:17 +02:00
parent 35862d1c99
commit 23c3efd1fb

View File

@ -71,21 +71,8 @@ static void hpriv_release(struct kref *ref)
kfree(hpriv);
if (hdev->reset_upon_device_release) {
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
/* We try soft reset first */
if (hdev->reset_upon_device_release)
hl_device_reset(hdev, false, false);
/* If device is not idle perform hard reset */
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
dev_info(hdev->dev,
"device is not idle (mask %#llx %#llx) after soft reset, performing hard reset",
idle_mask[0], idle_mask[1]);
hl_device_reset(hdev, true, false);
}
}
}
void hl_hpriv_get(struct hl_fpriv *hpriv)
@ -948,6 +935,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
int hl_device_reset(struct hl_device *hdev, bool hard_reset,
bool from_hard_reset_thread)
{
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
int i, rc;
if (!hdev->init_done) {
@ -1167,6 +1155,16 @@ kill_processes:
goto out_err;
}
/* If device is not idle fail the reset process */
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
dev_err(hdev->dev,
"device is not idle (mask %#llx %#llx) after reset\n",
idle_mask[0], idle_mask[1]);
rc = -EIO;
goto out_err;
}
/* Check that the communication with the device is working */
rc = hdev->asic_funcs->test_queues(hdev);
if (rc) {