From 136cfcb8dce9bc7a17a8d32e497f4dfe80dd357d Mon Sep 17 00:00:00 2001
From: Mark O'Donovan
Date: Fri, 24 Nov 2023 20:56:59 +0000
Subject: [PATCH 1/4] nvme: fine-tune sending of first keep-alive

Keep-alive commands are sent half-way through the kato period. This
normally works well, but it fails when the keep-alive mechanism is
started more than half-way through the kato, which can happen on larger
setups or due to host delays.

With this change, the initial keep-alive command is timed from the
controller initialisation time rather than from the keep-alive
mechanism activation time.

Signed-off-by: Mark O'Donovan
Reviewed-by: Sagi Grimberg
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 46a4c9c5ea96..8bf24c1cd8bb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1192,8 +1192,16 @@ static unsigned long nvme_keep_alive_work_period(struct nvme_ctrl *ctrl)
 
 static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
 {
-	queue_delayed_work(nvme_wq, &ctrl->ka_work,
-			   nvme_keep_alive_work_period(ctrl));
+	unsigned long now = jiffies;
+	unsigned long delay = nvme_keep_alive_work_period(ctrl);
+	unsigned long ka_next_check_tm = ctrl->ka_last_check_time + delay;
+
+	if (time_after(now, ka_next_check_tm))
+		delay = 0;
+	else
+		delay = ka_next_check_tm - now;
+
+	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
 }
 
 static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
@@ -4471,6 +4479,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+	ctrl->ka_last_check_time = jiffies;
 
 	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
 		PAGE_SIZE);

From e3139cef8257fcab1725441e2fd5fd0ccb5481b1 Mon Sep 17 00:00:00 2001
From: Maurizio Lombardi
Date: Thu, 23 Nov 2023 15:07:41 +0100
Subject: [PATCH 2/4] nvme-core: fix a memory leak in nvme_ns_info_from_identify()

In case of error, free the nvme_id_ns structure that was allocated by
nvme_identify_ns().

Signed-off-by: Maurizio Lombardi
Reviewed-by: Sagi Grimberg
Reviewed-by: Kanchan Joshi
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8bf24c1cd8bb..af94d067ac14 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1487,7 +1487,8 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
 	if (id->ncap == 0) {
 		/* namespace not allocated or attached */
 		info->is_removed = true;
-		return -ENODEV;
+		ret = -ENODEV;
+		goto error;
 	}
 
 	info->anagrpid = id->anagrpid;
@@ -1505,8 +1506,10 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
 		    !memchr_inv(ids->nguid, 0, sizeof(ids->nguid)))
 			memcpy(ids->nguid, id->nguid, sizeof(ids->nguid));
 	}
+
+error:
 	kfree(id);
-	return 0;
+	return ret;
 }
 
 static int nvme_ns_info_from_id_cs_indep(struct nvme_ctrl *ctrl,

From d8b90d600aff181936457f032d116dbd8534db06 Mon Sep 17 00:00:00 2001
From: "Ewan D. Milne"
Date: Mon, 27 Nov 2023 15:56:57 -0500
Subject: [PATCH 3/4] nvme: check for valid nvme_identify_ns() before using it

When scanning namespaces, it is possible to get valid data from the
first call to nvme_identify_ns() in nvme_alloc_ns(), but not from the
second call in nvme_update_ns_info_block().
In particular, if the NSID becomes inactive between the two commands, a
storage device may return a buffer filled with zero as per section
4.1.5.1 of the NVMe specification. In this case, we can get a kernel
crash due to a divide-by-zero in blk_stack_limits() because
ns->lba_shift will be set to zero.

PID: 326      TASK: ffff95fec3cd8000  CPU: 29   COMMAND: "kworker/u98:10"
 #0 [ffffad8f8702f9e0] machine_kexec at ffffffff91c76ec7
 #1 [ffffad8f8702fa38] __crash_kexec at ffffffff91dea4fa
 #2 [ffffad8f8702faf8] crash_kexec at ffffffff91deb788
 #3 [ffffad8f8702fb00] oops_end at ffffffff91c2e4bb
 #4 [ffffad8f8702fb20] do_trap at ffffffff91c2a4ce
 #5 [ffffad8f8702fb70] do_error_trap at ffffffff91c2a595
 #6 [ffffad8f8702fbb0] exc_divide_error at ffffffff928506e6
 #7 [ffffad8f8702fbd0] asm_exc_divide_error at ffffffff92a00926
    [exception RIP: blk_stack_limits+434]
    RIP: ffffffff92191872  RSP: ffffad8f8702fc80  RFLAGS: 00010246
    RAX: 0000000000000000  RBX: ffff95efa0c91800  RCX: 0000000000000001
    RDX: 0000000000000000  RSI: 0000000000000001  RDI: 0000000000000001
    RBP: 00000000ffffffff   R8: ffff95fec7df35a8   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000001  R12: 0000000000000000
    R13: 0000000000000000  R14: 0000000000000000  R15: ffff95fed33c09a8
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
 #8 [ffffad8f8702fce0] nvme_update_ns_info_block at ffffffffc06d3533 [nvme_core]
 #9 [ffffad8f8702fd18] nvme_scan_ns at ffffffffc06d6fa7 [nvme_core]

This happened when the check for valid data was moved out of
nvme_identify_ns() into one of the callers. Fix this by checking in
both callers.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=218186
Fixes: 0dd6fff2aad4 ("nvme: bring back auto-removal of deleted namespaces during sequential scan")
Cc: stable@vger.kernel.org
Signed-off-by: Ewan D. Milne
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index af94d067ac14..a55c2a774b9c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2040,6 +2040,13 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 	if (ret)
 		return ret;
 
+	if (id->ncap == 0) {
+		/* namespace not allocated or attached */
+		info->is_removed = true;
+		ret = -ENODEV;
+		goto error;
+	}
+
 	blk_mq_freeze_queue(ns->disk->queue);
 	lbaf = nvme_lbaf_index(id->flbas);
 	ns->lba_shift = id->lbaf[lbaf].ds;
@@ -2101,6 +2108,8 @@ out:
 		set_bit(NVME_NS_READY, &ns->flags);
 		ret = 0;
 	}
+
+error:
 	kfree(id);
 	return ret;
 }

From 74fbc88e161424b3b96a22b23a8e3e1edab9d05c Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Tue, 28 Nov 2023 09:36:04 -0800
Subject: [PATCH 4/4] nvme-core: check for too small lba shift

The block layer doesn't support logical block sizes smaller than 512
bytes. The nvme spec doesn't support sizes that small either, but the
driver isn't checking that the device responded with usable data.
Failing to catch this will result in a kernel bug, either a division
by zero when stacking limits or a zero-length bio.

Reviewed-by: Jens Axboe
Signed-off-by: Keith Busch
---
 drivers/nvme/host/core.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a55c2a774b9c..1be1ce522896 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1901,9 +1901,10 @@ static void nvme_update_disk_info(struct gendisk *disk,
 
 	/*
 	 * The block layer can't support LBA sizes larger than the page size
-	 * yet, so catch this early and don't allow block I/O.
+	 * or smaller than a sector size yet, so catch this early and don't
+	 * allow block I/O.
 	 */
-	if (ns->lba_shift > PAGE_SHIFT) {
+	if (ns->lba_shift > PAGE_SHIFT || ns->lba_shift < SECTOR_SHIFT) {
 		capacity = 0;
 		bs = (1 << 9);
 	}
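
A note on the scheduling logic in patch 1: the delay computation depends
on the kernel's wraparound-safe jiffies comparison. Below is a minimal
userspace sketch of that logic, assuming jiffies-style unsigned tick
counters. time_after_tick() is a stand-in for the kernel's time_after()
macro, and keep_alive_delay() is a hypothetical model of the decision
made in nvme_queue_keep_alive_work(); neither is part of the driver.

#include <stdio.h>

typedef unsigned long jiffy_t;

/* Wraparound-safe "a is after b", modelled on the kernel's time_after(). */
static int time_after_tick(jiffy_t a, jiffy_t b)
{
	return (long)(b - a) < 0;
}

/*
 * Send immediately if more than half the kato has already elapsed since
 * the last check; otherwise wait out the remainder.
 */
static jiffy_t keep_alive_delay(jiffy_t now, jiffy_t last_check,
				jiffy_t half_kato)
{
	jiffy_t next_check = last_check + half_kato;

	if (time_after_tick(now, next_check))
		return 0;
	return next_check - now;
}

int main(void)
{
	/* Started late: 700 ticks since init, half_kato = 500 -> send now. */
	printf("late start:  delay = %lu\n", keep_alive_delay(700, 0, 500));

	/* Started early: 100 ticks since init -> wait the remaining 400. */
	printf("early start: delay = %lu\n", keep_alive_delay(100, 0, 500));

	/* Counter wrapped between the last check and now: 30 ticks have
	 * elapsed (20 before the wrap, 10 after), so 470 remain. */
	printf("wraparound:  delay = %lu\n",
	       keep_alive_delay(10, (jiffy_t)-20, 500));
	return 0;
}

The (long)(b - a) < 0 form is what keeps the comparison valid across
counter wraparound; a plain now > next_check test would report "late"
and fire a spurious immediate keep-alive whenever next_check has wrapped
past zero while now has not yet wrapped.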