From 4a09a845c1773fe3cd96c2afe737b58a88e3d30b Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Fri, 14 Jan 2022 16:58:54 +0100 Subject: [PATCH 01/51] block/rnbd-clt: fix CHECK:BRACES warning This patch fix the "CHECK:BRACES: braces {} should be used on all arms of this statement" warning from checkpatch Signed-off-by: Gioh Kim Signed-off-by: Md Haris Iqbal Link: https://lore.kernel.org/r/20220114155855.984144-2-haris.iqbal@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index c08971de369f..f0370a345c3e 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1262,9 +1262,9 @@ find_and_get_or_create_sess(const char *sessname, struct rtrs_clt_ops rtrs_ops; sess = find_or_create_sess(sessname, &first); - if (sess == ERR_PTR(-ENOMEM)) + if (sess == ERR_PTR(-ENOMEM)) { return ERR_PTR(-ENOMEM); - else if ((nr_poll_queues && !first) || (!nr_poll_queues && sess->nr_poll_queues)) { + } else if ((nr_poll_queues && !first) || (!nr_poll_queues && sess->nr_poll_queues)) { /* * A device MUST have its own session to use the polling-mode. * It must fail to map new device with the same session. From 030ce8ba97d2e85e5310b1ae8236cd640bd384d0 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Fri, 14 Jan 2022 16:58:55 +0100 Subject: [PATCH 02/51] block/rnbd: client device does not care queue/rotational On client side, the device is a network device. There is no reason to set rotational even-if the target device on server is rotational. Signed-off-by: Gioh Kim Signed-off-by: Jack Wang Signed-off-by: Md Haris Iqbal Link: https://lore.kernel.org/r/20220114155855.984144-3-haris.iqbal@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 11 ++++++----- drivers/block/rnbd/rnbd-clt.h | 1 - drivers/block/rnbd/rnbd-proto.h | 4 ++-- drivers/block/rnbd/rnbd-srv.c | 1 - 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index f0370a345c3e..9a880d559ab8 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -87,7 +87,6 @@ static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, dev->discard_granularity = le32_to_cpu(rsp->discard_granularity); dev->discard_alignment = le32_to_cpu(rsp->discard_alignment); dev->secure_discard = le16_to_cpu(rsp->secure_discard); - dev->rotational = rsp->rotational; dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK); dev->fua = !!(rsp->cache_policy & RNBD_FUA); @@ -1410,8 +1409,10 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) dev->read_only = false; } - if (!dev->rotational) - blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); + /* + * Network device does not need rotational + */ + blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); err = add_disk(dev->gd); if (err) blk_cleanup_disk(dev->gd); @@ -1610,13 +1611,13 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, } rnbd_clt_info(dev, - "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d, wc: %d, fua: %d)\n", + "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n", dev->gd->disk_name, dev->nsectors, dev->logical_block_size, dev->physical_block_size, dev->max_write_same_sectors, dev->max_discard_sectors, dev->discard_granularity, dev->discard_alignment, dev->secure_discard, dev->max_segments, - dev->max_hw_sectors, dev->rotational, dev->wc, dev->fua); + dev->max_hw_sectors, dev->wc, dev->fua); mutex_unlock(&dev->lock); rnbd_clt_put_sess(sess); diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index 0c2cae7f39b9..62bf7c3fa63c 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -118,7 +118,6 @@ struct rnbd_clt_dev { enum rnbd_access_mode access_mode; u32 nr_poll_queues; bool read_only; - bool rotational; bool wc; bool fua; u32 max_hw_sectors; diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index de5d5a8df81d..c4a68b3a1cbe 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -128,7 +128,7 @@ enum rnbd_cache_policy { * @logical_block_size: logical block size device supports in bytes * @max_segments: max segments hardware support in one transfer * @secure_discard: supports secure discard - * @rotation: is a rotational disc? + * @obsolete_rotational: obsolete, not in used. * @cache_policy: support write-back caching or FUA? */ struct rnbd_msg_open_rsp { @@ -144,7 +144,7 @@ struct rnbd_msg_open_rsp { __le16 logical_block_size; __le16 max_segments; __le16 secure_discard; - u8 rotational; + u8 obsolete_rotational; u8 cache_policy; u8 reserved[10]; }; diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index 132e950685d5..6499efae5c43 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -558,7 +558,6 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev)); rsp->secure_discard = cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); - rsp->rotational = !blk_queue_nonrot(q); rsp->cache_policy = 0; if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) rsp->cache_policy |= RNBD_WRITEBACK; From 24afc15dbe218f860994f627b4ba1fb09225a298 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 7 Feb 2022 21:48:19 +0100 Subject: [PATCH 03/51] block/rnbd: Remove a useless mutex According to lib/idr.c, The IDA handles its own locking. It is safe to call any of the IDA functions without synchronisation in your code. so the 'ida_lock' mutex can just be removed. It is here only to protect some ida_simple_get()/ida_simple_remove() calls. While at it, switch to ida_alloc_XXX()/ida_free() instead to ida_simple_get()/ida_simple_remove(). The latter is deprecated and more verbose. Signed-off-by: Christophe JAILLET Acked-by: Jack Wang Link: https://lore.kernel.org/r/7f9eccd8b1fce1bac45ac9b01a78cf72f54c0a61.1644266862.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 9a880d559ab8..1f63f308eb39 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -23,7 +23,6 @@ MODULE_LICENSE("GPL"); static int rnbd_client_major; static DEFINE_IDA(index_ida); -static DEFINE_MUTEX(ida_lock); static DEFINE_MUTEX(sess_lock); static LIST_HEAD(sess_list); @@ -55,9 +54,7 @@ static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) if (!refcount_dec_and_test(&dev->refcount)) return; - mutex_lock(&ida_lock); - ida_simple_remove(&index_ida, dev->clt_device_id); - mutex_unlock(&ida_lock); + ida_free(&index_ida, dev->clt_device_id); kfree(dev->hw_queues); kfree(dev->pathname); rnbd_clt_put_sess(dev->sess); @@ -1460,10 +1457,8 @@ static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, goto out_alloc; } - mutex_lock(&ida_lock); - ret = ida_simple_get(&index_ida, 0, 1 << (MINORBITS - RNBD_PART_BITS), - GFP_KERNEL); - mutex_unlock(&ida_lock); + ret = ida_alloc_max(&index_ida, 1 << (MINORBITS - RNBD_PART_BITS), + GFP_KERNEL); if (ret < 0) { pr_err("Failed to initialize device '%s' from session %s, allocating idr failed, err: %d\n", pathname, sess->sessname, ret); From d9a74051a73c4fbd065ae806b8da151cbded84f1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 8 Feb 2022 11:46:56 +0000 Subject: [PATCH 04/51] loop: clean up grammar in warning message The phrase "has still" should be "still has" to clean up the grammar. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20220208114656.61629-1-colin.i.king@gmail.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 01cbbfc4e9e2..bdea448d2419 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1262,7 +1262,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (size_changed && lo->lo_device->bd_inode->i_mapping->nrpages) { /* If any pages were dirtied after invalidate_bdev(), try again */ err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", + pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", __func__, lo->lo_number, lo->lo_file_name, lo->lo_device->bd_inode->i_mapping->nrpages); goto out_unfreeze; @@ -1482,7 +1482,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) /* invalidate_bdev should have truncated all the pages */ if (lo->lo_device->bd_inode->i_mapping->nrpages) { err = -EAGAIN; - pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", + pr_warn("%s: loop%d (%s) still has dirty pages (nrpages=%lu)\n", __func__, lo->lo_number, lo->lo_file_name, lo->lo_device->bd_inode->i_mapping->nrpages); goto out_unfreeze; From a75110c3b36959d04d3a586ba43510ddf9b410b5 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 03:59:51 -0800 Subject: [PATCH 05/51] null_blk: fix return value from null_add_dev() The function nullb_device_power_store() returns -ENOMEM when null_add_dev() fails. null_add_dev() can fail with return value other than -ENOMEM such as -EINVAL when Zoned Block Device option is used, see : nullb_device_power_store() null_add_dev() null_init_zoned_dev() return -EINVAL; When trying to load the module having -ENOMEM value returned on the command line creates confusion when pleanty of memory is free on the machine. Instead of hardcoding -ENOMEM return the value of null_add_dev() function. Signed-off-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220215115951.15945-1-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 13004beb48ca..90b6bd2a114b 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -431,9 +431,10 @@ static ssize_t nullb_device_power_store(struct config_item *item, if (!dev->power && newp) { if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) return count; - if (null_add_dev(dev)) { + ret = null_add_dev(dev); + if (ret) { clear_bit(NULLB_DEV_FL_UP, &dev->flags); - return -ENOMEM; + return ret; } set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); From b27824d31f09ea7b4a6ba2c1b18bd328df3e8bed Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 13:33:07 -0800 Subject: [PATCH 06/51] loop: use sysfs_emit() in the sysfs xxx show() sprintf does not know the PAGE_SIZE maximum of the temporary buffer used for outputting sysfs content and it's possible to overrun the PAGE_SIZE buffer length. Use a generic sysfs_emit function that knows the size of the temporary buffer and ensures that no overrun is done for offset attribute in loop_attr_[offset|sizelimit|autoclear|partscan|dio]_show() callbacks. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20220215213310.7264-2-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bdea448d2419..a55e5eda1d17 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -680,33 +680,33 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); } static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) { - return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); + return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); } static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) { int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); - return sprintf(buf, "%s\n", autoclear ? "1" : "0"); + return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); } static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) { int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); - return sprintf(buf, "%s\n", partscan ? "1" : "0"); + return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); } static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) { int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); - return sprintf(buf, "%s\n", dio ? "1" : "0"); + return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } LOOP_ATTR_RO(backing_file); From 0aab29b85478b994422f1551ce36e5640b09db2b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 13:33:08 -0800 Subject: [PATCH 07/51] loop: remove extra variable in lo_fallocate() The local variable q is used to pass it to the blk_queue_discard(). We can get away with using lo->lo_queue instead of storing in a local variable which is not used anywhere else. No functional change in this patch. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20220215213310.7264-3-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a55e5eda1d17..77c61eaaa6e4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -308,12 +308,11 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, * a.k.a. discard/zerorange. */ struct file *file = lo->lo_backing_file; - struct request_queue *q = lo->lo_queue; int ret; mode |= FALLOC_FL_KEEP_SIZE; - if (!blk_queue_discard(q)) { + if (!blk_queue_discard(lo->lo_queue)) { ret = -EOPNOTSUPP; goto out; } From 9c64e38cc639537ad9b542635af1733e8cb5e19b Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 13:33:09 -0800 Subject: [PATCH 08/51] loop: remove extra variable in lo_req_flush The local variable file is used to pass it to the vfs_fsync(). We can get away with using lo->lo_backing_file instead of storing in a local variable which is not used anywhere else. No functional change in this patch. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20220215213310.7264-4-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 77c61eaaa6e4..18b30a56bfc4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -326,8 +326,7 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, static int lo_req_flush(struct loop_device *lo, struct request *rq) { - struct file *file = lo->lo_backing_file; - int ret = vfs_fsync(file, 0); + int ret = vfs_fsync(lo->lo_backing_file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; From ef44c50837ab7818920bd9994b36d6e573312abc Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 15 Feb 2022 13:33:10 -0800 Subject: [PATCH 09/51] loop: allow user to set the queue depth Instead of hardcoding queue depth allow user to set the hw queue depth using module parameter. Set default value to 128 to retain the existing behavior. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20220215213310.7264-5-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 18b30a56bfc4..8fb89d0624fc 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -85,6 +85,7 @@ #include #define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ) +#define LOOP_DEFAULT_HW_Q_DEPTH (128) static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_ctl_mutex); @@ -1785,6 +1786,24 @@ module_param(max_loop, int, 0444); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); + +static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH; + +static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p) +{ + int ret = kstrtoint(s, 10, &hw_queue_depth); + + return (ret || (hw_queue_depth < 1)) ? -EINVAL : 0; +} + +static const struct kernel_param_ops loop_hw_qdepth_param_ops = { + .set = loop_set_hw_queue_depth, + .get = param_get_int, +}; + +device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444); +MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 128"); + MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1979,7 +1998,7 @@ static int loop_add(int i) lo->tag_set.ops = &loop_mq_ops; lo->tag_set.nr_hw_queues = 1; - lo->tag_set.queue_depth = 128; + lo->tag_set.queue_depth = hw_queue_depth; lo->tag_set.numa_node = NUMA_NO_NODE; lo->tag_set.cmd_size = sizeof(struct loop_cmd); lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING | From 3d3472f3ed419fe1a9d3f881a4905fa8e03d750c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 16 Feb 2022 09:29:45 -0800 Subject: [PATCH 10/51] null_blk: remove hardcoded alloc_cmd() parameter Only caller of alloc_cmd() is null_submit_bio() unconditionally sets second parameter to true and that is statically hard-coded in null_blk. There is no point in having statically hardcoded function parameter. Remove the unnecessary parameter can_wait and adjust the code so it can retain existing behavior of waiting when we don't get valid nullb_cmd from __alloc_cmd() in alloc_cmd(). The restructured code avoids multiple return statements, multiple calls to __alloc_cmd() and resulting a fast path call to prepare_to_wait() due to removal of first alloc_cmd() call. Follow the pattern that we have in bio_alloc() to set the structure members in the structure allocation function in alloc_cmd() and pass bio to initialize newly allocated cmd->bio member. Follow the pattern in copy_to_nullb() to use result of one function call (null_cache_active()) to be used as a parameter to another function call (null_insert_page()), use result of alloc_cmd() as a first parameter to the null_handle_cmd() in null_submit_bio() function. This allow us to remove the local variable cmd on stack in null_submit_bio() that is in fast path. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Link: https://lore.kernel.org/r/20220216172945.31124-2-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 90b6bd2a114b..29e183719e77 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -720,26 +720,25 @@ static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) return NULL; } -static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait) +static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio) { struct nullb_cmd *cmd; DEFINE_WAIT(wait); - cmd = __alloc_cmd(nq); - if (cmd || !can_wait) - return cmd; - do { - prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); + /* + * This avoids multiple return statements, multiple calls to + * __alloc_cmd() and a fast path call to prepare_to_wait(). + */ cmd = __alloc_cmd(nq); - if (cmd) - break; - + if (cmd) { + cmd->bio = bio; + return cmd; + } + prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); io_schedule(); + finish_wait(&nq->wait, &wait); } while (1); - - finish_wait(&nq->wait, &wait); - return cmd; } static void end_cmd(struct nullb_cmd *cmd) @@ -1477,12 +1476,8 @@ static void null_submit_bio(struct bio *bio) sector_t nr_sectors = bio_sectors(bio); struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; struct nullb_queue *nq = nullb_to_queue(nullb); - struct nullb_cmd *cmd; - cmd = alloc_cmd(nq, 1); - cmd->bio = bio; - - null_handle_cmd(cmd, sector, nr_sectors, bio_op(bio)); + null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio)); } static bool should_timeout_request(struct request *rq) From c90b6b50b42dfdfeb27b02e670e812b69ff364f5 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 22 Feb 2022 07:28:51 -0800 Subject: [PATCH 11/51] null_blk: remove hardcoded null_alloc_page() param Only caller of null_alloc_page() is null_insert_page() unconditionally sets only parameter to GFP_NOIO and that is statically hard-coded in null_blk. There is no point in having statically hardcoded function parameter. Remove the unnecessary parameter gfp_flags and adjust the code, so it can retain existing behavior null_alloc_page() with GFP_NOIO. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220222152852.26043-2-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 29e183719e77..80f9a6ba376d 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -777,15 +777,15 @@ static void null_complete_rq(struct request *rq) end_cmd(blk_mq_rq_to_pdu(rq)); } -static struct nullb_page *null_alloc_page(gfp_t gfp_flags) +static struct nullb_page *null_alloc_page(void) { struct nullb_page *t_page; - t_page = kmalloc(sizeof(struct nullb_page), gfp_flags); + t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO); if (!t_page) goto out; - t_page->page = alloc_pages(gfp_flags, 0); + t_page->page = alloc_pages(GFP_NOIO, 0); if (!t_page->page) goto out_freepage; @@ -932,7 +932,7 @@ static struct nullb_page *null_insert_page(struct nullb *nullb, spin_unlock_irq(&nullb->lock); - t_page = null_alloc_page(GFP_NOIO); + t_page = null_alloc_page(); if (!t_page) goto out_lock; From df00b1d26c3c3ff9dae4b572a6ad878ab65334e1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 22 Feb 2022 07:28:52 -0800 Subject: [PATCH 12/51] null_blk: null_alloc_page() cleanup Remove goto labels and use direct returns as error unwinding code only needs to free t_page variable if we alloc_pages() call fails as having two labels for one kfree() can be avoided easily. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220222152852.26043-3-kch@nvidia.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 80f9a6ba376d..05b1120e6623 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -783,18 +783,16 @@ static struct nullb_page *null_alloc_page(void) t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO); if (!t_page) - goto out; + return NULL; t_page->page = alloc_pages(GFP_NOIO, 0); - if (!t_page->page) - goto out_freepage; + if (!t_page->page) { + kfree(t_page); + return NULL; + } memset(t_page->bitmap, 0, sizeof(t_page->bitmap)); return t_page; -out_freepage: - kfree(t_page); -out: - return NULL; } static void null_free_page(struct nullb_page *t_page) From e65b831a1e191caff3fc0d06bc7019cdaf8f868e Mon Sep 17 00:00:00 2001 From: Qinghua Jin Date: Fri, 7 Jan 2022 10:22:58 +0800 Subject: [PATCH 13/51] nvme-fc: fix a typo subsytem -> subsystem Signed-off-by: Qinghua Jin Signed-off-by: Christoph Hellwig --- include/linux/nvme-fc-driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index cb909edb76c4..5358a5facdee 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -721,7 +721,7 @@ enum { * * Fields with static values for the port. Initialized by the * port_info struct supplied to the registration call. - * @port_num: NVME-FC transport subsytem port number + * @port_num: NVME-FC transport subsystem port number * @node_name: FC WWNN for the port * @port_name: FC WWPN for the port * @private: pointer to memory allocated alongside the local port From 50ab19d89feaf4ebeca6872b46da4b503eee20c1 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 18 Jan 2022 23:49:54 -0800 Subject: [PATCH 14/51] nvme-core: remove unnecessary semicolon It is not a good practice to have a semicolon at the end of the function definition. Remove it from nvme_pr_type(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5e0bfda04bd7..a1d793e79982 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1979,7 +1979,7 @@ static char nvme_pr_type(enum pr_type type) default: return 0; } -}; +} static int nvme_send_ns_head_pr_command(struct block_device *bdev, struct nvme_command *c, u8 data[16]) From ba3266434d6615ff8015b01846d0e7756c9ad936 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Fri, 21 Jan 2022 21:05:39 -0800 Subject: [PATCH 15/51] nvme-core: remove unnecessary function parameter In function nvme_execute_rq() we don't use gendisk parameter at all. Remove the unsed parameter and adjust the calls. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a1d793e79982..dd18861f77c0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1049,8 +1049,7 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); * >0: nvme controller's cqe status response * <0: kernel error in lieu of controller response */ -static int nvme_execute_rq(struct gendisk *disk, struct request *rq, - bool at_head) +static int nvme_execute_rq(struct request *rq, bool at_head) { blk_status_t status; @@ -1090,7 +1089,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, goto out; } - ret = nvme_execute_rq(NULL, req, at_head); + ret = nvme_execute_rq(req, at_head); if (result && ret >= 0) *result = nvme_req(req)->result; out: @@ -1206,12 +1205,11 @@ int nvme_execute_passthru_rq(struct request *rq) struct nvme_command *cmd = nvme_req(rq)->cmd; struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl; struct nvme_ns *ns = rq->q->queuedata; - struct gendisk *disk = ns ? ns->disk : NULL; u32 effects; int ret; effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); - ret = nvme_execute_rq(disk, rq, false); + ret = nvme_execute_rq(rq, false); if (effects) /* nothing to be done for zero cmd effects */ nvme_passthru_end(ctrl, effects, cmd, ret); From 572c97355bdc0e7cdd5979304b8554712e26ceb8 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 11 Jan 2022 22:20:59 -0800 Subject: [PATCH 16/51] nvme-fabrics: use unsigned int type Loop variable i will never have a negative value, so use unsigned int type instaed of int. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index f79a66d4e22c..9f81a0562e3e 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -873,7 +873,7 @@ static int nvmf_check_required_opts(struct nvmf_ctrl_options *opts, unsigned int required_opts) { if ((opts->mask & required_opts) != required_opts) { - int i; + unsigned int i; for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { if ((opt_tokens[i].token & required_opts) && From 0801a4b630ab0949ddf0fc3f26cb17976e2d3afb Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 11 Jan 2022 22:21:00 -0800 Subject: [PATCH 17/51] nvme-fabrics: use unsigned int type Loop variable i will never have a negative value, so use unsigned int type instaed of int. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 9f81a0562e3e..131e78e62d00 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -923,7 +923,7 @@ static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts, unsigned int allowed_opts) { if (opts->mask & ~allowed_opts) { - int i; + unsigned int i; for (i = 0; i < ARRAY_SIZE(opt_tokens); i++) { if ((opt_tokens[i].token & opts->mask) && From 72b3eab456ba57bf933b25e1b2adea3e3eafd153 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 11 Jan 2022 22:20:57 -0800 Subject: [PATCH 18/51] nvme-fabrics: use consistent zeroout pattern Remove zeroout memeset call & zeroout local variable cmd at the time of declaration in nvmf_ref_read32() similar to what we have done in nvmf_reg_read64(), nvmf_reg_write32(), nvmf_connect_admin_queue(), and nvmf_connect_io_queue(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 131e78e62d00..f4aa40124720 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -144,11 +144,10 @@ EXPORT_SYMBOL_GPL(nvmf_get_address); */ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { - struct nvme_command cmd; + struct nvme_command cmd = { }; union nvme_result res; int ret; - memset(&cmd, 0, sizeof(cmd)); cmd.prop_get.opcode = nvme_fabrics_command; cmd.prop_get.fctype = nvme_fabrics_type_property_get; cmd.prop_get.offset = cpu_to_le32(off); From 581f19dd72b9d148ce667b6dcb20d3ef16b9a1cf Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 11 Jan 2022 22:20:58 -0800 Subject: [PATCH 19/51] nvme-fabrics: remove unnecessary braces for case Braces are not required for enum value NVME_SC_CONNECT_INVALID_PARAM when used on the switch-case statement, remove the braces. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index f4aa40124720..ee79a6d639b4 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -271,7 +271,7 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, int err_sctype = errval & ~NVME_SC_DNR; switch (err_sctype) { - case (NVME_SC_CONNECT_INVALID_PARAM): + case NVME_SC_CONNECT_INVALID_PARAM: if (offset >> 16) { char *inv_data = "Connect Invalid Data Parameter"; From 2caecd62ea5160803b25d96cb1a14ce755c2c259 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 2 Feb 2022 01:04:44 -0800 Subject: [PATCH 20/51] nvmet: use i_size_read() to set size for file-ns Instead of calling vfs_getattr() use i_size_read() to read the size of file so we can read the size of not only file type but also block type with one call. This is needed to implement buffered_io support for the NVMeOF block device backend. We also change return type of function nvmet_file_ns_revalidate() from int to void, since this function does not return any meaning value. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-file.c | 17 ++++------------- drivers/nvme/target/nvmet.h | 2 +- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 6be6e59d273b..6485dc8eb974 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -14,16 +14,9 @@ #define NVMET_MAX_MPOOL_BVEC 16 #define NVMET_MIN_MPOOL_OBJ 16 -int nvmet_file_ns_revalidate(struct nvmet_ns *ns) +void nvmet_file_ns_revalidate(struct nvmet_ns *ns) { - struct kstat stat; - int ret; - - ret = vfs_getattr(&ns->file->f_path, &stat, STATX_SIZE, - AT_STATX_FORCE_SYNC); - if (!ret) - ns->size = stat.size; - return ret; + ns->size = i_size_read(ns->file->f_mapping->host); } void nvmet_file_ns_disable(struct nvmet_ns *ns) @@ -43,7 +36,7 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns) int nvmet_file_ns_enable(struct nvmet_ns *ns) { int flags = O_RDWR | O_LARGEFILE; - int ret; + int ret = 0; if (!ns->buffered_io) flags |= O_DIRECT; @@ -57,9 +50,7 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns) return ret; } - ret = nvmet_file_ns_revalidate(ns); - if (ret) - goto err; + nvmet_file_ns_revalidate(ns); /* * i_blkbits can be greater than the universally accepted upper bound, diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index af193423c10b..69637bf8f8e1 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -541,7 +541,7 @@ u16 nvmet_bdev_flush(struct nvmet_req *req); u16 nvmet_file_flush(struct nvmet_req *req); void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid); void nvmet_bdev_ns_revalidate(struct nvmet_ns *ns); -int nvmet_file_ns_revalidate(struct nvmet_ns *ns); +void nvmet_file_ns_revalidate(struct nvmet_ns *ns); void nvmet_ns_revalidate(struct nvmet_ns *ns); u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts); From 6f6d604b4ef896793901f213db3b45eefa7288fa Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 2 Feb 2022 01:04:45 -0800 Subject: [PATCH 21/51] nvmet: allow bdev in buffered_io mode Allow block device to be configured in the buffered I/O mode by using the file backend. In this way now we can use cache for the block device namespace which shows significant performance improvement. We update the block device ns enable function and return early when buffered_io flag is set. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-bdev.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 95c2bbb0b2f5..e9194804ddee 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -76,6 +76,14 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns) { int ret; + /* + * When buffered_io namespace attribute is enabled that means user want + * this block device to be used as a file, so block device can take + * an advantage of cache. + */ + if (ns->buffered_io) + return -ENOTBLK; + ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE, NULL); if (IS_ERR(ns->bdev)) { From 8b850475c08caa9545c460d7dc823d0a8c7eafd3 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:27 +0200 Subject: [PATCH 22/51] nvme: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Also, use ida_alloc_min with the ns_ida as namespace enumeration starts with 1. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dd18861f77c0..da0e19148177 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -561,7 +561,7 @@ static void nvme_free_ns_head(struct kref *ref) container_of(ref, struct nvme_ns_head, ref); nvme_mpath_remove_disk(head); - ida_simple_remove(&head->subsys->ns_ida, head->instance); + ida_free(&head->subsys->ns_ida, head->instance); cleanup_srcu_struct(&head->srcu); nvme_put_subsystem(head->subsys); kfree(head); @@ -2565,7 +2565,7 @@ static void nvme_release_subsystem(struct device *dev) container_of(dev, struct nvme_subsystem, dev); if (subsys->instance >= 0) - ida_simple_remove(&nvme_instance_ida, subsys->instance); + ida_free(&nvme_instance_ida, subsys->instance); kfree(subsys); } @@ -3616,7 +3616,7 @@ static int __nvme_check_ids(struct nvme_subsystem *subsys, static void nvme_cdev_rel(struct device *dev) { - ida_simple_remove(&nvme_ns_chr_minor_ida, MINOR(dev->devt)); + ida_free(&nvme_ns_chr_minor_ida, MINOR(dev->devt)); } void nvme_cdev_del(struct cdev *cdev, struct device *cdev_device) @@ -3630,7 +3630,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, { int minor, ret; - minor = ida_simple_get(&nvme_ns_chr_minor_ida, 0, 0, GFP_KERNEL); + minor = ida_alloc(&nvme_ns_chr_minor_ida, GFP_KERNEL); if (minor < 0) return minor; cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); @@ -3693,7 +3693,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head = kzalloc(size, GFP_KERNEL); if (!head) goto out; - ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); + ret = ida_alloc_min(&ctrl->subsys->ns_ida, 1, GFP_KERNEL); if (ret < 0) goto out_free_head; head->instance = ret; @@ -3732,7 +3732,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, out_cleanup_srcu: cleanup_srcu_struct(&head->srcu); out_ida_remove: - ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); + ida_free(&ctrl->subsys->ns_ida, head->instance); out_free_head: kfree(head); out: @@ -4429,7 +4429,7 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_subsystem *subsys = ctrl->subsys; if (!subsys || ctrl->instance != subsys->instance) - ida_simple_remove(&nvme_instance_ida, ctrl->instance); + ida_free(&nvme_instance_ida, ctrl->instance); nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); @@ -4488,7 +4488,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, goto out; } - ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); + ret = ida_alloc(&nvme_instance_ida, GFP_KERNEL); if (ret < 0) goto out; ctrl->instance = ret; @@ -4529,7 +4529,7 @@ out_free_name: nvme_put_ctrl(ctrl); kfree_const(ctrl->device->kobj.name); out_release_instance: - ida_simple_remove(&nvme_instance_ida, ctrl->instance); + ida_free(&nvme_instance_ida, ctrl->instance); out: if (ctrl->discard_page) __free_page(ctrl->discard_page); From 3dd83f4013f0e85d37c059c91015a2721209fe86 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:28 +0200 Subject: [PATCH 23/51] nvme-fc: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 71b3108c22f0..96a5d7fc36f7 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -259,7 +259,7 @@ nvme_fc_free_lport(struct kref *ref) complete(&nvme_fc_unload_proceed); spin_unlock_irqrestore(&nvme_fc_lock, flags); - ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); + ida_free(&nvme_fc_local_port_cnt, lport->localport.port_num); ida_destroy(&lport->endp_cnt); put_device(lport->dev); @@ -399,7 +399,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, goto out_reghost_failed; } - idx = ida_simple_get(&nvme_fc_local_port_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&nvme_fc_local_port_cnt, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; goto out_fail_kfree; @@ -439,7 +439,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, return 0; out_ida_put: - ida_simple_remove(&nvme_fc_local_port_cnt, idx); + ida_free(&nvme_fc_local_port_cnt, idx); out_fail_kfree: kfree(newrec); out_reghost_failed: @@ -535,7 +535,7 @@ nvme_fc_free_rport(struct kref *ref) spin_unlock_irqrestore(&nvme_fc_lock, flags); WARN_ON(!list_empty(&rport->disc_list)); - ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); + ida_free(&lport->endp_cnt, rport->remoteport.port_num); kfree(rport); @@ -713,7 +713,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, goto out_lport_put; } - idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&lport->endp_cnt, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; goto out_kfree_rport; @@ -2393,7 +2393,7 @@ nvme_fc_ctrl_free(struct kref *ref) put_device(ctrl->dev); nvme_fc_rport_put(ctrl->rport); - ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); + ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum); if (ctrl->ctrl.opts) nvmf_free_options(ctrl->ctrl.opts); kfree(ctrl); @@ -3472,7 +3472,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_fail; } - idx = ida_simple_get(&nvme_fc_ctrl_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&nvme_fc_ctrl_cnt, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; goto out_free_ctrl; @@ -3635,7 +3635,7 @@ out_free_queues: kfree(ctrl->queues); out_free_ida: put_device(ctrl->dev); - ida_simple_remove(&nvme_fc_ctrl_cnt, ctrl->cnum); + ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum); out_free_ctrl: kfree(ctrl); out_fail: From 22027a9811349de28f81e13e20e83299099acd3a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:29 +0200 Subject: [PATCH 24/51] nvmet: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 5119c687de68..724a6d373340 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1400,7 +1400,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, if (subsys->cntlid_min > subsys->cntlid_max) goto out_free_sqs; - ret = ida_simple_get(&cntlid_ida, + ret = ida_alloc_range(&cntlid_ida, subsys->cntlid_min, subsys->cntlid_max, GFP_KERNEL); if (ret < 0) { @@ -1459,7 +1459,7 @@ static void nvmet_ctrl_free(struct kref *ref) flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fatal_err_work); - ida_simple_remove(&cntlid_ida, ctrl->cntlid); + ida_free(&cntlid_ida, ctrl->cntlid); nvmet_async_events_free(ctrl); kfree(ctrl->sqs); From 6dd0f465d57c4522294d0fd42fde7603264d0871 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:30 +0200 Subject: [PATCH 25/51] nvmet-fc: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 22b5108168a6..cb013516784c 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1115,7 +1115,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) if (!assoc) return NULL; - idx = ida_simple_get(&tgtport->assoc_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&tgtport->assoc_cnt, GFP_KERNEL); if (idx < 0) goto out_free_assoc; @@ -1157,7 +1157,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle) out_put: nvmet_fc_tgtport_put(tgtport); out_ida: - ida_simple_remove(&tgtport->assoc_cnt, idx); + ida_free(&tgtport->assoc_cnt, idx); out_free_assoc: kfree(assoc); return NULL; @@ -1183,7 +1183,7 @@ nvmet_fc_target_assoc_free(struct kref *ref) /* if pending Rcv Disconnect Association LS, send rsp now */ if (oldls) nvmet_fc_xmt_ls_rsp(tgtport, oldls); - ida_simple_remove(&tgtport->assoc_cnt, assoc->a_id); + ida_free(&tgtport->assoc_cnt, assoc->a_id); dev_info(tgtport->dev, "{%d:%d} Association freed\n", tgtport->fc_target_port.port_num, assoc->a_id); @@ -1383,7 +1383,7 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, goto out_regtgt_failed; } - idx = ida_simple_get(&nvmet_fc_tgtport_cnt, 0, 0, GFP_KERNEL); + idx = ida_alloc(&nvmet_fc_tgtport_cnt, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; goto out_fail_kfree; @@ -1433,7 +1433,7 @@ nvmet_fc_register_targetport(struct nvmet_fc_port_info *pinfo, out_free_newrec: put_device(dev); out_ida_put: - ida_simple_remove(&nvmet_fc_tgtport_cnt, idx); + ida_free(&nvmet_fc_tgtport_cnt, idx); out_fail_kfree: kfree(newrec); out_regtgt_failed: @@ -1460,7 +1460,7 @@ nvmet_fc_free_tgtport(struct kref *ref) /* let the LLDD know we've finished tearing it down */ tgtport->ops->targetport_delete(&tgtport->fc_target_port); - ida_simple_remove(&nvmet_fc_tgtport_cnt, + ida_free(&nvmet_fc_tgtport_cnt, tgtport->fc_target_port.port_num); ida_destroy(&tgtport->assoc_cnt); From 7c2566394f8ceba4b99a373c1bc2411750301587 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:31 +0200 Subject: [PATCH 26/51] nvmet-rdma: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 1deb4043e242..00656754e331 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1356,7 +1356,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) !queue->host_qid); } nvmet_rdma_free_rsps(queue); - ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + ida_free(&nvmet_rdma_queue_ida, queue->idx); kfree(queue); } @@ -1459,7 +1459,7 @@ nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, spin_lock_init(&queue->rsps_lock); INIT_LIST_HEAD(&queue->queue_list); - queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + queue->idx = ida_alloc(&nvmet_rdma_queue_ida, GFP_KERNEL); if (queue->idx < 0) { ret = NVME_RDMA_CM_NO_RSC; goto out_destroy_sq; @@ -1510,7 +1510,7 @@ out_free_cmds: out_free_responses: nvmet_rdma_free_rsps(queue); out_ida_remove: - ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + ida_free(&nvmet_rdma_queue_ida, queue->idx); out_destroy_sq: nvmet_sq_destroy(&queue->nvme_sq); out_free_queue: From 44f331a630bdc7c61de9c6760c4eec0133ee9f04 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 14 Feb 2022 11:07:32 +0200 Subject: [PATCH 27/51] nvmet-tcp: replace ida_simple[get|remove] with the simler ida_[alloc|free] ida_simple_[get|remove] are wrappers anyways. Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 7c1c43ce466b..83ca577f72be 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1473,7 +1473,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) nvmet_tcp_free_cmds(queue); if (queue->hdr_digest || queue->data_digest) nvmet_tcp_free_crypto(queue); - ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + ida_free(&nvmet_tcp_queue_ida, queue->idx); page = virt_to_head_page(queue->pf_cache.va); __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); @@ -1613,7 +1613,7 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port, init_llist_head(&queue->resp_list); INIT_LIST_HEAD(&queue->resp_send_list); - queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL); + queue->idx = ida_alloc(&nvmet_tcp_queue_ida, GFP_KERNEL); if (queue->idx < 0) { ret = queue->idx; goto out_free_queue; @@ -1646,7 +1646,7 @@ out_destroy_sq: out_free_connect: nvmet_tcp_free_cmd(&queue->connect); out_ida_remove: - ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx); + ida_free(&nvmet_tcp_queue_ida, queue->idx); out_free_queue: kfree(queue); return ret; From 4686af885a9168f9ec70c4063616640911c48b03 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 9 Feb 2022 10:54:49 +0200 Subject: [PATCH 28/51] nvme-rdma: add helpers for mapping/unmapping request Introduce nvme_rdma_dma_map_req/nvme_rdma_dma_unmap_req helper functions to improve code readability and ease on the error flow. Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 111 +++++++++++++++++++++++---------------- 1 file changed, 65 insertions(+), 46 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 850f84d204d0..14ec2c85db06 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1282,6 +1282,22 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, return ib_post_send(queue->qp, &wr, NULL); } +static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + + if (blk_integrity_rq(rq)) { + ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, rq_dma_dir(rq)); + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); + } + + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); +} + static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, struct request *rq) { @@ -1293,13 +1309,6 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, if (!blk_rq_nr_phys_segments(rq)) return; - if (blk_integrity_rq(rq)) { - ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, - req->metadata_sgl->nents, rq_dma_dir(rq)); - sg_free_table_chained(&req->metadata_sgl->sg_table, - NVME_INLINE_METADATA_SG_CNT); - } - if (req->use_sig_mr) pool = &queue->qp->sig_mrs; @@ -1308,9 +1317,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, req->mr = NULL; } - ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, - rq_dma_dir(rq)); - sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); + nvme_rdma_dma_unmap_req(ibdev, rq); } static int nvme_rdma_set_sg_null(struct nvme_command *c) @@ -1521,22 +1528,11 @@ mr_put: return -EINVAL; } -static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, - struct request *rq, struct nvme_command *c) +static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, + int *count, int *pi_count) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_rdma_device *dev = queue->device; - struct ib_device *ibdev = dev->dev; - int pi_count = 0; - int count, ret; - - req->num_sge = 1; - refcount_set(&req->ref, 2); /* send and recv completions */ - - c->common.flags |= NVME_CMD_SGL_METABUF; - - if (!blk_rq_nr_phys_segments(rq)) - return nvme_rdma_set_sg_null(c); + int ret; req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1); ret = sg_alloc_table_chained(&req->data_sgl.sg_table, @@ -1548,9 +1544,9 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->data_sgl.nents = blk_rq_map_sg(rq->q, rq, req->data_sgl.sg_table.sgl); - count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, - req->data_sgl.nents, rq_dma_dir(rq)); - if (unlikely(count <= 0)) { + *count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl, + req->data_sgl.nents, rq_dma_dir(rq)); + if (unlikely(*count <= 0)) { ret = -EIO; goto out_free_table; } @@ -1569,16 +1565,50 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q, rq->bio, req->metadata_sgl->sg_table.sgl); - pi_count = ib_dma_map_sg(ibdev, - req->metadata_sgl->sg_table.sgl, - req->metadata_sgl->nents, - rq_dma_dir(rq)); - if (unlikely(pi_count <= 0)) { + *pi_count = ib_dma_map_sg(ibdev, + req->metadata_sgl->sg_table.sgl, + req->metadata_sgl->nents, + rq_dma_dir(rq)); + if (unlikely(*pi_count <= 0)) { ret = -EIO; goto out_free_pi_table; } } + return 0; + +out_free_pi_table: + sg_free_table_chained(&req->metadata_sgl->sg_table, + NVME_INLINE_METADATA_SG_CNT); +out_unmap_sg: + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + rq_dma_dir(rq)); +out_free_table: + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); + return ret; +} + +static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, + struct request *rq, struct nvme_command *c) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; + int pi_count = 0; + int count, ret; + + req->num_sge = 1; + refcount_set(&req->ref, 2); /* send and recv completions */ + + c->common.flags |= NVME_CMD_SGL_METABUF; + + if (!blk_rq_nr_phys_segments(rq)) + return nvme_rdma_set_sg_null(c); + + ret = nvme_rdma_dma_map_req(ibdev, rq, &count, &pi_count); + if (unlikely(ret)) + return ret; + if (req->use_sig_mr) { ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count); goto out; @@ -1602,23 +1632,12 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, ret = nvme_rdma_map_sg_fr(queue, req, c, count); out: if (unlikely(ret)) - goto out_unmap_pi_sg; + goto out_dma_unmap_req; return 0; -out_unmap_pi_sg: - if (blk_integrity_rq(rq)) - ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl, - req->metadata_sgl->nents, rq_dma_dir(rq)); -out_free_pi_table: - if (blk_integrity_rq(rq)) - sg_free_table_chained(&req->metadata_sgl->sg_table, - NVME_INLINE_METADATA_SG_CNT); -out_unmap_sg: - ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, - rq_dma_dir(rq)); -out_free_table: - sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); +out_dma_unmap_req: + nvme_rdma_dma_unmap_req(ibdev, rq); return ret; } From 72e8b5cd7dd387ba3eee81b0a59746ad8ccdcb5f Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 10 Feb 2022 11:12:36 -0800 Subject: [PATCH 29/51] nvme: add a helper to initialize connect_q Add and use helper to remove duplicate code for fabrics connect_q initialization and error handling for all the transports. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 6 ++---- drivers/nvme/host/nvme.h | 8 ++++++++ drivers/nvme/host/rdma.c | 6 ++---- drivers/nvme/host/tcp.c | 6 ++---- drivers/nvme/target/loop.c | 6 ++---- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 96a5d7fc36f7..080f85f4105f 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2916,11 +2916,9 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.tagset = &ctrl->tag_set; - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); - if (IS_ERR(ctrl->ctrl.connect_q)) { - ret = PTR_ERR(ctrl->ctrl.connect_q); + ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); + if (ret) goto out_free_tag_set; - } ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); if (ret) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a162f6c6da6e..f8658f984d64 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -894,6 +894,14 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) } #endif +static inline int nvme_ctrl_init_connect_q(struct nvme_ctrl *ctrl) +{ + ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ctrl->connect_q)) + return PTR_ERR(ctrl->connect_q); + return 0; +} + static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) { return dev_to_disk(dev)->private_data; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 14ec2c85db06..c49b9c3c46f2 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -978,11 +978,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_free_io_queues; } - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); - if (IS_ERR(ctrl->ctrl.connect_q)) { - ret = PTR_ERR(ctrl->ctrl.connect_q); + ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); + if (ret) goto out_free_tag_set; - } } ret = nvme_rdma_start_io_queues(ctrl); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4ceb28675fdf..6cbcc8b4daaf 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1825,11 +1825,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) goto out_free_io_queues; } - ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ctrl->connect_q)) { - ret = PTR_ERR(ctrl->connect_q); + ret = nvme_ctrl_init_connect_q(ctrl); + if (ret) goto out_free_tag_set; - } } ret = nvme_tcp_start_io_queues(ctrl); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index eb1094254c82..23f9d6f88804 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -543,11 +543,9 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) if (ret) goto out_destroy_queues; - ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); - if (IS_ERR(ctrl->ctrl.connect_q)) { - ret = PTR_ERR(ctrl->ctrl.connect_q); + ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl)); + if (ret) goto out_free_tagset; - } ret = nvme_loop_connect_io_queues(ctrl); if (ret) From bd83fe6f2cd2133beaac7c423fd36c3515048fc8 Mon Sep 17 00:00:00 2001 From: Alan Adamson Date: Thu, 3 Feb 2022 00:11:53 -0800 Subject: [PATCH 30/51] nvme: add verbose error logging Improves logging of NVMe errors. If NVME_VERBOSE_ERRORS is configured, a verbose description of the error is logged, otherwise only status codes/bits is logged. Signed-off-by: Chaitanya Kulkarni [kch]: fix several nits, cosmetics, and trim down code. Signed-off-by: Martin K. Petersen Signed-off-by: Alan Adamson Reviewed-by: Himanshu Madhani Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/Kconfig | 8 ++ drivers/nvme/host/Makefile | 2 +- drivers/nvme/host/constants.c | 185 ++++++++++++++++++++++++++++++++++ drivers/nvme/host/core.c | 33 ++++++ drivers/nvme/host/nvme.h | 19 ++++ include/linux/nvme.h | 1 + 6 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 drivers/nvme/host/constants.c diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index dc0450ca23a3..d6d056963c06 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -24,6 +24,14 @@ config NVME_MULTIPATH /dev/nvmeXnY device will show up for each NVMe namespace, even if it is accessible through multiple controllers. +config NVME_VERBOSE_ERRORS + bool "NVMe verbose error reporting" + depends on NVME_CORE + help + This option enables verbose reporting for NVMe errors. The + error translation table will grow the kernel image size by + about 4 KB. + config NVME_HWMON bool "NVMe hardware monitoring" depends on (NVME_CORE=y && HWMON=y) || (NVME_CORE=m && HWMON) diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index dfaacd472e5d..476c5c988496 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o obj-$(CONFIG_NVME_TCP) += nvme-tcp.o -nvme-core-y := core.o ioctl.o +nvme-core-y := core.o ioctl.o constants.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c new file mode 100644 index 000000000000..7d49eb34b348 --- /dev/null +++ b/drivers/nvme/host/constants.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVM Express device driver verbose errors + * Copyright (c) 2022, Oracle and/or its affiliates + */ + +#include +#include "nvme.h" + +#ifdef CONFIG_NVME_VERBOSE_ERRORS +static const char * const nvme_ops[] = { + [nvme_cmd_flush] = "Flush", + [nvme_cmd_write] = "Write", + [nvme_cmd_read] = "Read", + [nvme_cmd_write_uncor] = "Write Uncorrectable", + [nvme_cmd_compare] = "Compare", + [nvme_cmd_write_zeroes] = "Write Zeros", + [nvme_cmd_dsm] = "Dataset Management", + [nvme_cmd_verify] = "Verify", + [nvme_cmd_resv_register] = "Reservation Register", + [nvme_cmd_resv_report] = "Reservation Report", + [nvme_cmd_resv_acquire] = "Reservation Acquire", + [nvme_cmd_resv_release] = "Reservation Release", + [nvme_cmd_zone_mgmt_send] = "Zone Management Send", + [nvme_cmd_zone_mgmt_recv] = "Zone Management Receive", + [nvme_cmd_zone_append] = "Zone Management Append", +}; + +static const char * const nvme_admin_ops[] = { + [nvme_admin_delete_sq] = "Delete SQ", + [nvme_admin_create_sq] = "Create SQ", + [nvme_admin_get_log_page] = "Get Log Page", + [nvme_admin_delete_cq] = "Delete CQ", + [nvme_admin_create_cq] = "Create CQ", + [nvme_admin_identify] = "Identify", + [nvme_admin_abort_cmd] = "Abort Command", + [nvme_admin_set_features] = "Set Features", + [nvme_admin_get_features] = "Get Features", + [nvme_admin_async_event] = "Async Event", + [nvme_admin_ns_mgmt] = "Namespace Management", + [nvme_admin_activate_fw] = "Activate Firmware", + [nvme_admin_download_fw] = "Download Firmware", + [nvme_admin_dev_self_test] = "Device Self Test", + [nvme_admin_ns_attach] = "Namespace Attach", + [nvme_admin_keep_alive] = "Keep Alive", + [nvme_admin_directive_send] = "Directive Send", + [nvme_admin_directive_recv] = "Directive Receive", + [nvme_admin_virtual_mgmt] = "Virtual Management", + [nvme_admin_nvme_mi_send] = "NVMe Send MI", + [nvme_admin_nvme_mi_recv] = "NVMe Receive MI", + [nvme_admin_dbbuf] = "Doorbell Buffer Config", + [nvme_admin_format_nvm] = "Format NVM", + [nvme_admin_security_send] = "Security Send", + [nvme_admin_security_recv] = "Security Receive", + [nvme_admin_sanitize_nvm] = "Sanitize NVM", + [nvme_admin_get_lba_status] = "Get LBA Status", +}; + +static const char * const nvme_statuses[] = { + [NVME_SC_SUCCESS] = "Success", + [NVME_SC_INVALID_OPCODE] = "Invalid Command Opcode", + [NVME_SC_INVALID_FIELD] = "Invalid Field in Command", + [NVME_SC_CMDID_CONFLICT] = "Command ID Conflict", + [NVME_SC_DATA_XFER_ERROR] = "Data Transfer Error", + [NVME_SC_POWER_LOSS] = "Commands Aborted due to Power Loss Notification", + [NVME_SC_INTERNAL] = "Internal Error", + [NVME_SC_ABORT_REQ] = "Command Abort Requested", + [NVME_SC_ABORT_QUEUE] = "Command Aborted due to SQ Deletion", + [NVME_SC_FUSED_FAIL] = "Command Aborted due to Failed Fused Command", + [NVME_SC_FUSED_MISSING] = "Command Aborted due to Missing Fused Command", + [NVME_SC_INVALID_NS] = "Invalid Namespace or Format", + [NVME_SC_CMD_SEQ_ERROR] = "Command Sequence Error", + [NVME_SC_SGL_INVALID_LAST] = "Invalid SGL Segment Descriptor", + [NVME_SC_SGL_INVALID_COUNT] = "Invalid Number of SGL Descriptors", + [NVME_SC_SGL_INVALID_DATA] = "Data SGL Length Invalid", + [NVME_SC_SGL_INVALID_METADATA] = "Metadata SGL Length Invalid", + [NVME_SC_SGL_INVALID_TYPE] = "SGL Descriptor Type Invalid", + [NVME_SC_CMB_INVALID_USE] = "Invalid Use of Controller Memory Buffer", + [NVME_SC_PRP_INVALID_OFFSET] = "PRP Offset Invalid", + [NVME_SC_ATOMIC_WU_EXCEEDED] = "Atomic Write Unit Exceeded", + [NVME_SC_OP_DENIED] = "Operation Denied", + [NVME_SC_SGL_INVALID_OFFSET] = "SGL Offset Invalid", + [NVME_SC_RESERVED] = "Reserved", + [NVME_SC_HOST_ID_INCONSIST] = "Host Identifier Inconsistent Format", + [NVME_SC_KA_TIMEOUT_EXPIRED] = "Keep Alive Timeout Expired", + [NVME_SC_KA_TIMEOUT_INVALID] = "Keep Alive Timeout Invalid", + [NVME_SC_ABORTED_PREEMPT_ABORT] = "Command Aborted due to Preempt and Abort", + [NVME_SC_SANITIZE_FAILED] = "Sanitize Failed", + [NVME_SC_SANITIZE_IN_PROGRESS] = "Sanitize In Progress", + [NVME_SC_SGL_INVALID_GRANULARITY] = "SGL Data Block Granularity Invalid", + [NVME_SC_CMD_NOT_SUP_CMB_QUEUE] = "Command Not Supported for Queue in CMB", + [NVME_SC_NS_WRITE_PROTECTED] = "Namespace is Write Protected", + [NVME_SC_CMD_INTERRUPTED] = "Command Interrupted", + [NVME_SC_TRANSIENT_TR_ERR] = "Transient Transport Error", + [NVME_SC_INVALID_IO_CMD_SET] = "Invalid IO Command Set", + [NVME_SC_LBA_RANGE] = "LBA Out of Range", + [NVME_SC_CAP_EXCEEDED] = "Capacity Exceeded", + [NVME_SC_NS_NOT_READY] = "Namespace Not Ready", + [NVME_SC_RESERVATION_CONFLICT] = "Reservation Conflict", + [NVME_SC_FORMAT_IN_PROGRESS] = "Format In Progress", + [NVME_SC_CQ_INVALID] = "Completion Queue Invalid", + [NVME_SC_QID_INVALID] = "Invalid Queue Identifier", + [NVME_SC_QUEUE_SIZE] = "Invalid Queue Size", + [NVME_SC_ABORT_LIMIT] = "Abort Command Limit Exceeded", + [NVME_SC_ABORT_MISSING] = "Reserved", /* XXX */ + [NVME_SC_ASYNC_LIMIT] = "Asynchronous Event Request Limit Exceeded", + [NVME_SC_FIRMWARE_SLOT] = "Invalid Firmware Slot", + [NVME_SC_FIRMWARE_IMAGE] = "Invalid Firmware Image", + [NVME_SC_INVALID_VECTOR] = "Invalid Interrupt Vector", + [NVME_SC_INVALID_LOG_PAGE] = "Invalid Log Page", + [NVME_SC_INVALID_FORMAT] = "Invalid Format", + [NVME_SC_FW_NEEDS_CONV_RESET] = "Firmware Activation Requires Conventional Reset", + [NVME_SC_INVALID_QUEUE] = "Invalid Queue Deletion", + [NVME_SC_FEATURE_NOT_SAVEABLE] = "Feature Identifier Not Saveable", + [NVME_SC_FEATURE_NOT_CHANGEABLE] = "Feature Not Changeable", + [NVME_SC_FEATURE_NOT_PER_NS] = "Feature Not Namespace Specific", + [NVME_SC_FW_NEEDS_SUBSYS_RESET] = "Firmware Activation Requires NVM Subsystem Reset", + [NVME_SC_FW_NEEDS_RESET] = "Firmware Activation Requires Reset", + [NVME_SC_FW_NEEDS_MAX_TIME] = "Firmware Activation Requires Maximum Time Violation", + [NVME_SC_FW_ACTIVATE_PROHIBITED] = "Firmware Activation Prohibited", + [NVME_SC_OVERLAPPING_RANGE] = "Overlapping Range", + [NVME_SC_NS_INSUFFICIENT_CAP] = "Namespace Insufficient Capacity", + [NVME_SC_NS_ID_UNAVAILABLE] = "Namespace Identifier Unavailable", + [NVME_SC_NS_ALREADY_ATTACHED] = "Namespace Already Attached", + [NVME_SC_NS_IS_PRIVATE] = "Namespace Is Private", + [NVME_SC_NS_NOT_ATTACHED] = "Namespace Not Attached", + [NVME_SC_THIN_PROV_NOT_SUPP] = "Thin Provisioning Not Supported", + [NVME_SC_CTRL_LIST_INVALID] = "Controller List Invalid", + [NVME_SC_SELT_TEST_IN_PROGRESS] = "Device Self-test In Progress", + [NVME_SC_BP_WRITE_PROHIBITED] = "Boot Partition Write Prohibited", + [NVME_SC_CTRL_ID_INVALID] = "Invalid Controller Identifier", + [NVME_SC_SEC_CTRL_STATE_INVALID] = "Invalid Secondary Controller State", + [NVME_SC_CTRL_RES_NUM_INVALID] = "Invalid Number of Controller Resources", + [NVME_SC_RES_ID_INVALID] = "Invalid Resource Identifier", + [NVME_SC_PMR_SAN_PROHIBITED] = "Sanitize Prohibited", + [NVME_SC_ANA_GROUP_ID_INVALID] = "ANA Group Identifier Invalid", + [NVME_SC_ANA_ATTACH_FAILED] = "ANA Attach Failed", + [NVME_SC_BAD_ATTRIBUTES] = "Conflicting Attributes", + [NVME_SC_INVALID_PI] = "Invalid Protection Information", + [NVME_SC_READ_ONLY] = "Attempted Write to Read Only Range", + [NVME_SC_ONCS_NOT_SUPPORTED] = "ONCS Not Supported", + [NVME_SC_ZONE_BOUNDARY_ERROR] = "Zoned Boundary Error", + [NVME_SC_ZONE_FULL] = "Zone Is Full", + [NVME_SC_ZONE_READ_ONLY] = "Zone Is Read Only", + [NVME_SC_ZONE_OFFLINE] = "Zone Is Offline", + [NVME_SC_ZONE_INVALID_WRITE] = "Zone Invalid Write", + [NVME_SC_ZONE_TOO_MANY_ACTIVE] = "Too Many Active Zones", + [NVME_SC_ZONE_TOO_MANY_OPEN] = "Too Many Open Zones", + [NVME_SC_ZONE_INVALID_TRANSITION] = "Invalid Zone State Transition", + [NVME_SC_WRITE_FAULT] = "Write Fault", + [NVME_SC_READ_ERROR] = "Unrecovered Read Error", + [NVME_SC_GUARD_CHECK] = "End-to-end Guard Check Error", + [NVME_SC_APPTAG_CHECK] = "End-to-end Application Tag Check Error", + [NVME_SC_REFTAG_CHECK] = "End-to-end Reference Tag Check Error", + [NVME_SC_COMPARE_FAILED] = "Compare Failure", + [NVME_SC_ACCESS_DENIED] = "Access Denied", + [NVME_SC_UNWRITTEN_BLOCK] = "Deallocated or Unwritten Logical Block", + [NVME_SC_ANA_PERSISTENT_LOSS] = "Asymmetric Access Persistent Loss", + [NVME_SC_ANA_INACCESSIBLE] = "Asymmetric Access Inaccessible", + [NVME_SC_ANA_TRANSITION] = "Asymmetric Access Transition", + [NVME_SC_HOST_PATH_ERROR] = "Host Pathing Error", +}; + +const unsigned char *nvme_get_error_status_str(u16 status) +{ + status &= 0x7ff; + if (status < ARRAY_SIZE(nvme_statuses) && nvme_statuses[status]) + return nvme_statuses[status & 0x7ff]; + return "Unknown"; +} + +const unsigned char *nvme_get_opcode_str(u8 opcode) +{ + if (opcode < ARRAY_SIZE(nvme_ops) && nvme_ops[opcode]) + return nvme_ops[opcode]; + return "Unknown"; +} + +const unsigned char *nvme_get_admin_opcode_str(u8 opcode) +{ + if (opcode < ARRAY_SIZE(nvme_admin_ops) && nvme_admin_ops[opcode]) + return nvme_admin_ops[opcode]; + return "Unknown"; +} +#endif /* CONFIG_NVME_VERBOSE_ERRORS */ diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index da0e19148177..5e55732f3ea1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -299,6 +299,37 @@ static void nvme_retry_req(struct request *req) blk_mq_delay_kick_requeue_list(req->q, delay); } +static void nvme_log_error(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + struct nvme_request *nr = nvme_req(req); + + if (ns) { + pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n", + ns->disk ? ns->disk->disk_name : "?", + nvme_get_opcode_str(nr->cmd->common.opcode), + nr->cmd->common.opcode, + (unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)), + (unsigned long long)blk_rq_bytes(req) >> ns->lba_shift, + nvme_get_error_status_str(nr->status), + nr->status >> 8 & 7, /* Status Code Type */ + nr->status & 0xff, /* Status Code */ + nr->status & NVME_SC_MORE ? "MORE " : "", + nr->status & NVME_SC_DNR ? "DNR " : ""); + return; + } + + pr_err_ratelimited("%s: %s(0x%x), %s (sct 0x%x / sc 0x%x) %s%s\n", + dev_name(nr->ctrl->device), + nvme_get_admin_opcode_str(nr->cmd->common.opcode), + nr->cmd->common.opcode, + nvme_get_error_status_str(nr->status), + nr->status >> 8 & 7, /* Status Code Type */ + nr->status & 0xff, /* Status Code */ + nr->status & NVME_SC_MORE ? "MORE " : "", + nr->status & NVME_SC_DNR ? "DNR " : ""); +} + enum nvme_disposition { COMPLETE, RETRY, @@ -339,6 +370,8 @@ static inline void nvme_end_req(struct request *req) { blk_status_t status = nvme_error_status(nvme_req(req)->status); + if (unlikely(nvme_req(req)->status != NVME_SC_SUCCESS)) + nvme_log_error(req); nvme_end_req_zoned(req); nvme_trace_bio_complete(req); blk_mq_end_request(req, status); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f8658f984d64..08893c04ca89 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -938,4 +938,23 @@ static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; } +#ifdef CONFIG_NVME_VERBOSE_ERRORS +const unsigned char *nvme_get_error_status_str(u16 status); +const unsigned char *nvme_get_opcode_str(u8 opcode); +const unsigned char *nvme_get_admin_opcode_str(u8 opcode); +#else /* CONFIG_NVME_VERBOSE_ERRORS */ +static inline const unsigned char *nvme_get_error_status_str(u16 status) +{ + return "I/O Error"; +} +static inline const unsigned char *nvme_get_opcode_str(u8 opcode) +{ + return "I/O Cmd"; +} +static inline const unsigned char *nvme_get_admin_opcode_str(u8 opcode) +{ + return "Admin Cmd"; +} +#endif /* CONFIG_NVME_VERBOSE_ERRORS */ + #endif /* _NVME_H */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 855dd9b3e84b..1f946e5bf7c1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1636,6 +1636,7 @@ enum { NVME_SC_HOST_ABORTED_CMD = 0x371, NVME_SC_CRD = 0x1800, + NVME_SC_MORE = 0x2000, NVME_SC_DNR = 0x4000, }; From 89377bc1975c2993bde4a498a3a4e5817ac0ae2c Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 10 Feb 2022 11:07:55 +0530 Subject: [PATCH 31/51] nvme: add vectored-io support for user-passthrough Add a new NVME_IOCTL_IO64_CMD_VEC ioctl that works like the existing NVME_IOCTL_IO64_CMD ioctl except that it takes and array of iovecs and thus supports vectored I/O. - cmd.addr is base address of user iovec array - cmd.vec_cnt is count of iovec array elements This patch does not include vectored-variant for admin-commands as most of them are light on buffers and likely to have low invocation frequency. Signed-off-by: Kanchan Joshi Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/ioctl.c | 35 ++++++++++++++++++++++++--------- include/uapi/linux/nvme_ioctl.h | 6 +++++- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 22314962842d..aaf3dfad2657 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -56,7 +56,7 @@ out: static int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, void __user *ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - u32 meta_seed, u64 *result, unsigned timeout) + u32 meta_seed, u64 *result, unsigned timeout, bool vec) { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; @@ -75,8 +75,22 @@ static int nvme_submit_user_cmd(struct request_queue *q, nvme_req(req)->flags |= NVME_REQ_USERCMD; if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + if (!vec) + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, GFP_KERNEL); + else { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov = fast_iov; + struct iov_iter iter; + + ret = import_iovec(rq_data_dir(req), ubuffer, bufflen, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + goto out; + ret = blk_rq_map_user_iov(q, req, NULL, &iter, + GFP_KERNEL); + kfree(iov); + } if (ret) goto out; bio = req->bio; @@ -170,7 +184,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return nvme_submit_user_cmd(ns->queue, &c, nvme_to_user_ptr(io.addr), length, - metadata, meta_len, lower_32_bits(io.slba), NULL, 0); + metadata, meta_len, lower_32_bits(io.slba), NULL, 0, + false); } static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl, @@ -224,7 +239,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &result, timeout); + 0, &result, timeout, false); if (status >= 0) { if (put_user(result, &ucmd->result)) @@ -235,7 +250,7 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, } static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, - struct nvme_passthru_cmd64 __user *ucmd) + struct nvme_passthru_cmd64 __user *ucmd, bool vec) { struct nvme_passthru_cmd64 cmd; struct nvme_command c; @@ -270,7 +285,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns, status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, nvme_to_user_ptr(cmd.addr), cmd.data_len, nvme_to_user_ptr(cmd.metadata), cmd.metadata_len, - 0, &cmd.result, timeout); + 0, &cmd.result, timeout, vec); if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) @@ -296,7 +311,7 @@ static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd, case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); + return nvme_user_cmd64(ctrl, NULL, argp, false); default: return sed_ioctl(ctrl->opal_dev, cmd, argp); } @@ -340,7 +355,9 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd, case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, argp); case NVME_IOCTL_IO64_CMD: - return nvme_user_cmd64(ns->ctrl, ns, argp); + return nvme_user_cmd64(ns->ctrl, ns, argp, false); + case NVME_IOCTL_IO64_CMD_VEC: + return nvme_user_cmd64(ns->ctrl, ns, argp, true); default: return -ENOTTY; } @@ -480,7 +497,7 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd, case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ctrl, NULL, argp); case NVME_IOCTL_ADMIN64_CMD: - return nvme_user_cmd64(ctrl, NULL, argp); + return nvme_user_cmd64(ctrl, NULL, argp, false); case NVME_IOCTL_IO_CMD: return nvme_dev_user_cmd(ctrl, argp); case NVME_IOCTL_RESET: diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index d99b5a772698..b2e43185e3b5 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -55,7 +55,10 @@ struct nvme_passthru_cmd64 { __u64 metadata; __u64 addr; __u32 metadata_len; - __u32 data_len; + union { + __u32 data_len; /* for non-vectored io */ + __u32 vec_cnt; /* for vectored io */ + }; __u32 cdw10; __u32 cdw11; __u32 cdw12; @@ -78,5 +81,6 @@ struct nvme_passthru_cmd64 { #define NVME_IOCTL_RESCAN _IO('N', 0x46) #define NVME_IOCTL_ADMIN64_CMD _IOWR('N', 0x47, struct nvme_passthru_cmd64) #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) +#define NVME_IOCTL_IO64_CMD_VEC _IOWR('N', 0x49, struct nvme_passthru_cmd64) #endif /* _UAPI_LINUX_NVME_IOCTL_H */ From 20d64911e7580f7e29c0086d67860c18307377d7 Mon Sep 17 00:00:00 2001 From: Martin Belanger Date: Tue, 8 Feb 2022 14:33:45 -0500 Subject: [PATCH 32/51] nvme: send uevent on connection up When connectivity with a controller is lost, the driver will keep trying to reconnect once every 10 sec. When connection is restored, user-space apps need to be informed so that they can take proper action. For example, TP8010 introduces the DIM PDU, which is used to register with a discovery controller (DC). The DIM PDU is sent from user-space. The DIM PDU must be sent every time a connection is established with a DC. Therefore, the kernel must tell user-space apps when connection is restored so that registration can happen. The uevent sent is a "change" uevent with environmental data set to: "NVME_EVENT=connected". Signed-off-by: Martin Belanger Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: John Meneghini Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5e55732f3ea1..b3e70bb2e6dc 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4262,6 +4262,13 @@ static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) return ret; } +static void nvme_change_uevent(struct nvme_ctrl *ctrl, char *envdata) +{ + char *envp[2] = { envdata, NULL }; + + kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); +} + static void nvme_aen_uevent(struct nvme_ctrl *ctrl) { char *envp[2] = { NULL, NULL }; @@ -4429,6 +4436,8 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) nvme_queue_scan(ctrl); nvme_start_queues(ctrl); } + + nvme_change_uevent(ctrl, "NVME_EVENT=connected"); } EXPORT_SYMBOL_GPL(nvme_start_ctrl); From 86c2457a8e8112f16af8fd10a3e1dd7a302c3c3e Mon Sep 17 00:00:00 2001 From: Martin Belanger Date: Tue, 8 Feb 2022 14:33:46 -0500 Subject: [PATCH 33/51] nvme: expose cntrltype and dctype through sysfs TP8010 introduces the Discovery Controller Type attribute (dctype). The dctype is returned in the response to the Identify command. This patch exposes the dctype through the sysfs. Since the dctype depends on the Controller Type (cntrltype), another attribute of the Identify response, the patch also exposes the cntrltype as well. The dctype will only be displayed for discovery controllers. A note about the naming of this attribute: Although TP8010 calls this attribute the Discovery Controller Type, note that the dctype is now part of the response to the Identify command for all controller types. I/O, Discovery, and Admin controllers all share the same Identify response PDU structure. Non-discovery controllers as well as pre-TP8010 discovery controllers will continue to set this field to 0 (which has always been the default for reserved bytes). Per TP8010, the value 0 now means "Discovery controller type is not reported" instead of "Reserved". One could argue that this definition is correct even for non-discovery controllers, and by extension, exposing it in the sysfs for non-discovery controllers is appropriate. Signed-off-by: Martin Belanger Reviewed-by: Chaitanya Kulkarni Reviewed-by: John Meneghini Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 3 +++ include/linux/nvme.h | 10 +++++++++- 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b3e70bb2e6dc..52a49ae5b673 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3023,6 +3023,9 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->max_namespaces = le32_to_cpu(id->mnan); ctrl->ctratt = le32_to_cpu(id->ctratt); + ctrl->cntrltype = id->cntrltype; + ctrl->dctype = id->dctype; + if (id->rtd3e) { /* us -> s */ u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; @@ -3556,6 +3559,40 @@ static ssize_t nvme_ctrl_fast_io_fail_tmo_store(struct device *dev, static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, nvme_ctrl_fast_io_fail_tmo_show, nvme_ctrl_fast_io_fail_tmo_store); +static ssize_t cntrltype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + static const char * const type[] = { + [NVME_CTRL_IO] = "io\n", + [NVME_CTRL_DISC] = "discovery\n", + [NVME_CTRL_ADMIN] = "admin\n", + }; + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->cntrltype > NVME_CTRL_ADMIN || !type[ctrl->cntrltype]) + return sysfs_emit(buf, "reserved\n"); + + return sysfs_emit(buf, type[ctrl->cntrltype]); +} +static DEVICE_ATTR_RO(cntrltype); + +static ssize_t dctype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + static const char * const type[] = { + [NVME_DCTYPE_NOT_REPORTED] = "none\n", + [NVME_DCTYPE_DDC] = "ddc\n", + [NVME_DCTYPE_CDC] = "cdc\n", + }; + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + + if (ctrl->dctype > NVME_DCTYPE_CDC || !type[ctrl->dctype]) + return sysfs_emit(buf, "reserved\n"); + + return sysfs_emit(buf, type[ctrl->dctype]); +} +static DEVICE_ATTR_RO(dctype); + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3577,6 +3614,8 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_reconnect_delay.attr, &dev_attr_fast_io_fail_tmo.attr, &dev_attr_kato.attr, + &dev_attr_cntrltype.attr, + &dev_attr_dctype.attr, NULL }; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 08893c04ca89..4ce48203e63f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -349,6 +349,9 @@ struct nvme_ctrl { unsigned long discard_page_busy; struct nvme_fault_inject fault_inject; + + enum nvme_ctrl_type cntrltype; + enum nvme_dctype dctype; }; enum nvme_iopolicy { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 1f946e5bf7c1..9dbc3ef4daf7 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -43,6 +43,12 @@ enum nvme_ctrl_type { NVME_CTRL_ADMIN = 3, /* Administrative controller */ }; +enum nvme_dctype { + NVME_DCTYPE_NOT_REPORTED = 0, + NVME_DCTYPE_DDC = 1, /* Direct Discovery Controller */ + NVME_DCTYPE_CDC = 2, /* Central Discovery Controller */ +}; + /* Address Family codes for Discovery Log Page entry ADRFAM field */ enum { NVMF_ADDR_FAMILY_PCI = 0, /* PCIe */ @@ -320,7 +326,9 @@ struct nvme_id_ctrl { __le16 icdoff; __u8 ctrattr; __u8 msdbd; - __u8 rsvd1804[244]; + __u8 rsvd1804[2]; + __u8 dctype; + __u8 rsvd1807[241]; struct nvme_id_power_state psd[32]; __u8 vs[1024]; }; From 1c3adf0de1db86cf354dcb1a2dd1184e5e63a50a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 15 Feb 2022 07:03:08 -0800 Subject: [PATCH 34/51] nvme: explicitly set non-error for directives Stream directives is an optional feature. It is not an error if a controller doesn't support as many as the kernel can optionally use. Explicitly set the non-error return value on this condition with a comment explaining why. Note, the return value was already 0 in this condition, so the setting is redundant. This patch should just silence bots that falsely believe the condition contains an error omission. Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 52a49ae5b673..1ce579d583c0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -809,6 +809,8 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { dev_info(ctrl->device, "too few streams (%u) available\n", ctrl->nssa); + /* this condition is not an error: streams are optional */ + ret = 0; goto out_disable_stream; } From 0a9f850061d9126b9a4aaf56ae1810138cac0f51 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 15 Feb 2022 07:37:19 -0800 Subject: [PATCH 35/51] nvme: remove nssa from struct nvme_ctrl The reported number of streams is not used outside the function that gets it, so no need to stash it in the controller structure. Use a local variable instead. Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 9 +++++---- drivers/nvme/host/nvme.h | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1ce579d583c0..9cffc4770e73 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -790,6 +790,7 @@ static int nvme_get_stream_params(struct nvme_ctrl *ctrl, static int nvme_configure_directives(struct nvme_ctrl *ctrl) { struct streams_directive_params s; + u16 nssa; int ret; if (!(ctrl->oacs & NVME_CTRL_OACS_DIRECTIVES)) @@ -805,16 +806,16 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) if (ret) goto out_disable_stream; - ctrl->nssa = le16_to_cpu(s.nssa); - if (ctrl->nssa < BLK_MAX_WRITE_HINTS - 1) { + nssa = le16_to_cpu(s.nssa); + if (nssa < BLK_MAX_WRITE_HINTS - 1) { dev_info(ctrl->device, "too few streams (%u) available\n", - ctrl->nssa); + nssa); /* this condition is not an error: streams are optional */ ret = 0; goto out_disable_stream; } - ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + ctrl->nr_streams = min_t(u16, nssa, BLK_MAX_WRITE_HINTS - 1); dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); return 0; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4ce48203e63f..587d92df118b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -280,7 +280,6 @@ struct nvme_ctrl { u16 crdt[3]; u16 oncs; u16 oacs; - u16 nssa; u16 nr_streams; u16 sqsize; u32 max_namespaces; From fd8099e7918cd2df39ef306dd1d1af7178a15b81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Feb 2022 10:57:15 +0100 Subject: [PATCH 36/51] nvme: cleanup __nvme_check_ids Pass the actual nvme_ns_ids used for the comparison instead of the ns_head that isn't needed and use a more descriptive function name. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9cffc4770e73..076a03b801b7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3673,16 +3673,15 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, return NULL; } -static int __nvme_check_ids(struct nvme_subsystem *subsys, - struct nvme_ns_head *new) +static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys, + struct nvme_ns_ids *ids) { struct nvme_ns_head *h; lockdep_assert_held(&subsys->lock); list_for_each_entry(h, &subsys->nsheads, entry) { - if (nvme_ns_ids_valid(&new->ids) && - nvme_ns_ids_equal(&new->ids, &h->ids)) + if (nvme_ns_ids_valid(ids) && nvme_ns_ids_equal(ids, &h->ids)) return -EINVAL; } @@ -3781,7 +3780,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->ids = *ids; kref_init(&head->ref); - ret = __nvme_check_ids(ctrl->subsys, head); + ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids); if (ret) { dev_err(ctrl->device, "duplicate IDs for nsid %d\n", nsid); From e2724cb9f0c406b8fb66efd3aa9e8b3edfd8d5c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Feb 2022 11:32:58 +0100 Subject: [PATCH 37/51] nvme: fix the check for duplicate unique identifiers nvme_subsys_check_duplicate_ids should needs to return an error if any of the identifiers matches, not just if all of them match. But it does not need to and should not look at the CSI value for this sanity check. Rewrite the logic to be separate from nvme_ns_ids_equal and optimize it by reducing duplicate checks for non-present identifiers. Fixes: ed754e5deeb1 ("nvme: track shared namespaces") Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 076a03b801b7..3e6ac985b24f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1716,13 +1716,6 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } -static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) -{ - return !uuid_is_null(&ids->uuid) || - memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || - memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); -} - static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && @@ -3676,12 +3669,21 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_subsystem *subsys, static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys, struct nvme_ns_ids *ids) { + bool has_uuid = !uuid_is_null(&ids->uuid); + bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid)); + bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); struct nvme_ns_head *h; lockdep_assert_held(&subsys->lock); list_for_each_entry(h, &subsys->nsheads, entry) { - if (nvme_ns_ids_valid(ids) && nvme_ns_ids_equal(ids, &h->ids)) + if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid)) + return -EINVAL; + if (has_nguid && + memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0) + return -EINVAL; + if (has_eui64 && + memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0) return -EINVAL; } From e2d77d2e11c4f1e70a1a24cc8fe63ff3dc9b53ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Feb 2022 17:46:50 +0100 Subject: [PATCH 38/51] nvme: check for duplicate identifiers earlier Lift the check for duplicate identifiers into nvme_init_ns_head, which avoids pointless error unwinding in case they don't match, and also matches where we check identifier validity for the multipath case. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3e6ac985b24f..cece987ba169 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3782,13 +3782,6 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->ids = *ids; kref_init(&head->ref); - ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &head->ids); - if (ret) { - dev_err(ctrl->device, - "duplicate IDs for nsid %d\n", nsid); - goto out_cleanup_srcu; - } - if (head->ids.csi) { ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); if (ret) @@ -3827,6 +3820,12 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, mutex_lock(&ctrl->subsys->lock); head = nvme_find_ns_head(ctrl->subsys, nsid); if (!head) { + ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids); + if (ret) { + dev_err(ctrl->device, + "duplicate IDs for nsid %d\n", nsid); + goto out_unlock; + } head = nvme_alloc_ns_head(ctrl, nsid, ids); if (IS_ERR(head)) { ret = PTR_ERR(head); From 2079f41ec6ffaad9aa51ca550105b2228467aec7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Feb 2022 17:48:32 +0100 Subject: [PATCH 39/51] nvme: check that EUI/GUID/UUID are globally unique Add a check to verify that the unique identifiers are unique globally in addition to the existing check that verifies that they are unique inside a single subsystem. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni --- drivers/nvme/host/core.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cece987ba169..f8084ded69e5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3810,12 +3810,45 @@ out: return ERR_PTR(ret); } +static int nvme_global_check_duplicate_ids(struct nvme_subsystem *this, + struct nvme_ns_ids *ids) +{ + struct nvme_subsystem *s; + int ret = 0; + + /* + * Note that this check is racy as we try to avoid holding the global + * lock over the whole ns_head creation. But it is only intended as + * a sanity check anyway. + */ + mutex_lock(&nvme_subsystems_lock); + list_for_each_entry(s, &nvme_subsystems, entry) { + if (s == this) + continue; + mutex_lock(&s->lock); + ret = nvme_subsys_check_duplicate_ids(s, ids); + mutex_unlock(&s->lock); + if (ret) + break; + } + mutex_unlock(&nvme_subsystems_lock); + + return ret; +} + static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, struct nvme_ns_ids *ids, bool is_shared) { struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_ns_head *head = NULL; - int ret = 0; + int ret; + + ret = nvme_global_check_duplicate_ids(ctrl->subsys, ids); + if (ret) { + dev_err(ctrl->device, + "globally duplicate IDs for nsid %d\n", nsid); + return ret; + } mutex_lock(&ctrl->subsys->lock); head = nvme_find_ns_head(ctrl->subsys, nsid); @@ -3823,7 +3856,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, ids); if (ret) { dev_err(ctrl->device, - "duplicate IDs for nsid %d\n", nsid); + "duplicate IDs in subsystem for nsid %d\n", + nsid); goto out_unlock; } head = nvme_alloc_ns_head(ctrl, nsid, ids); From 143a70b8b4300faa92ad82468f65dccd440e7957 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:18:56 +0300 Subject: [PATCH 40/51] iss-simdisk: use bvec_kmap_local in simdisk_submit_bio Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Acked-by: Max Filippov Link: https://lore.kernel.org/r/20220303111905.321089-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/xtensa/platforms/iss/simdisk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 8eb6ad1a3a1d..0f0e0724397f 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -108,13 +108,13 @@ static void simdisk_submit_bio(struct bio *bio) sector_t sector = bio->bi_iter.bi_sector; bio_for_each_segment(bvec, bio, iter) { - char *buffer = kmap_atomic(bvec.bv_page) + bvec.bv_offset; + char *buffer = bvec_kmap_local(&bvec); unsigned len = bvec.bv_len >> SECTOR_SHIFT; simdisk_transfer(dev, sector, len, buffer, bio_data_dir(bio) == WRITE); sector += len; - kunmap_atomic(buffer); + kunmap_local(buffer); } bio_endio(bio); From b7ab4611b6c793100197abc93e069d6f9aab7960 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:18:57 +0300 Subject: [PATCH 41/51] aoe: use bvec_kmap_local in bvcpy Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220303111905.321089-3-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/aoe/aoecmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index cc11f89a0928..384073ef2323 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1018,9 +1018,9 @@ bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt) iter.bi_size = cnt; __bio_for_each_segment(bv, bio, iter, iter) { - char *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + char *p = bvec_kmap_local(&bv); skb_copy_bits(skb, soff, p, bv.bv_len); - kunmap_atomic(p); + kunmap_local(p); soff += bv.bv_len; } } From b3bd0a8a74ab970cc1cf0849e66bd0906741105b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:18:58 +0300 Subject: [PATCH 42/51] zram: use memcpy_to_bvec in zram_bvec_read Use the proper helper instead of open coding the copy. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-4-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a3a5e1e71326..14becdf2815d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1331,12 +1331,10 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, goto out; if (is_partial_io(bvec)) { - void *dst = kmap_atomic(bvec->bv_page); void *src = kmap_atomic(page); - memcpy(dst + bvec->bv_offset, src + offset, bvec->bv_len); + memcpy_to_bvec(bvec, src + offset); kunmap_atomic(src); - kunmap_atomic(dst); } out: if (is_partial_io(bvec)) From bd3d3203eb84d08a6daef805efe9316b79d3bf3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:18:59 +0300 Subject: [PATCH 43/51] zram: use memcpy_from_bvec in zram_bvec_write Use memcpy_from_bvec instead of open coding the logic. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 14becdf2815d..e9474b02012d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1465,7 +1465,6 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, { int ret; struct page *page = NULL; - void *src; struct bio_vec vec; vec = *bvec; @@ -1483,11 +1482,9 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, if (ret) goto out; - src = kmap_atomic(bvec->bv_page); dst = kmap_atomic(page); - memcpy(dst + offset, src + bvec->bv_offset, bvec->bv_len); + memcpy_from_bvec(dst + offset, bvec); kunmap_atomic(dst); - kunmap_atomic(src); vec.bv_page = page; vec.bv_len = PAGE_SIZE; From 20072ec828640b7d23a0cfdbccf0dea48e77ba3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:00 +0300 Subject: [PATCH 44/51] nvdimm-blk: use bvec_kmap_local in nd_blk_rw_integrity Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvdimm/blk.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index c1db43524d75..0a3873833594 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -88,10 +88,9 @@ static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk, */ cur_len = min(len, bv.bv_len); - iobuf = kmap_atomic(bv.bv_page); - err = ndbr->do_io(ndbr, dev_offset, iobuf + bv.bv_offset, - cur_len, rw); - kunmap_atomic(iobuf); + iobuf = bvec_kmap_local(&bv); + err = ndbr->do_io(ndbr, dev_offset, iobuf, cur_len, rw); + kunmap_local(iobuf); if (err) return err; From 3205190655ea56ea5e00815eeff4dab2bde0af80 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:01 +0300 Subject: [PATCH 45/51] nvdimm-btt: use bvec_kmap_local in btt_rw_integrity Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvdimm/btt.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index cbd994f7f1fe..9613e54c7a67 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1163,17 +1163,15 @@ static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip, */ cur_len = min(len, bv.bv_len); - mem = kmap_atomic(bv.bv_page); + mem = bvec_kmap_local(&bv); if (rw) - ret = arena_write_bytes(arena, meta_nsoff, - mem + bv.bv_offset, cur_len, + ret = arena_write_bytes(arena, meta_nsoff, mem, cur_len, NVDIMM_IO_ATOMIC); else - ret = arena_read_bytes(arena, meta_nsoff, - mem + bv.bv_offset, cur_len, + ret = arena_read_bytes(arena, meta_nsoff, mem, cur_len, NVDIMM_IO_ATOMIC); - kunmap_atomic(mem); + kunmap_local(mem); if (ret) return ret; From 07fee7aba5472d0e65345146a68b4bd1a8b656c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:02 +0300 Subject: [PATCH 46/51] bcache: use bvec_kmap_local in bio_csum Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 6869e010475a..fdd0194f84dd 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -44,10 +44,10 @@ static void bio_csum(struct bio *bio, struct bkey *k) uint64_t csum = 0; bio_for_each_segment(bv, bio, iter) { - void *d = kmap(bv.bv_page) + bv.bv_offset; + void *d = bvec_kmap_local(&bv); csum = crc64_be(csum, d, bv.bv_len); - kunmap(bv.bv_page); + kunmap_local(d); } k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1); From 472278508dce25316e806e45778658c3e4b353b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:03 +0300 Subject: [PATCH 47/51] drbd: use bvec_kmap_local in drbd_csum_bio Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_worker.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index a5e04b38006b..1b48c8172a07 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -326,9 +326,9 @@ void drbd_csum_bio(struct crypto_shash *tfm, struct bio *bio, void *digest) bio_for_each_segment(bvec, bio, iter) { u8 *src; - src = kmap_atomic(bvec.bv_page); - crypto_shash_update(desc, src + bvec.bv_offset, bvec.bv_len); - kunmap_atomic(src); + src = bvec_kmap_local(&bvec); + crypto_shash_update(desc, src, bvec.bv_len); + kunmap_local(src); /* REQ_OP_WRITE_SAME has only one segment, * checksum the payload only once. */ From 3eddaa60b8411c135d1c71090dea9b59ff3f2e26 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:04 +0300 Subject: [PATCH 48/51] drbd: use bvec_kmap_local in recv_dless_read Using local kmaps slightly reduces the chances to stray writes, and the bvec interface cleans up the code a little bit. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 04e3ec12d8b4..fa00cf2ea952 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2017,10 +2017,10 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector); bio_for_each_segment(bvec, bio, iter) { - void *mapped = kmap(bvec.bv_page) + bvec.bv_offset; + void *mapped = bvec_kmap_local(&bvec); expect = min_t(int, data_size, bvec.bv_len); err = drbd_recv_all_warn(peer_device->connection, mapped, expect); - kunmap(bvec.bv_page); + kunmap_local(mapped); if (err) return err; data_size -= expect; From 13d4ef0f66b7ee9415e101e213acaf94a0cb28ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Mar 2022 14:19:05 +0300 Subject: [PATCH 49/51] floppy: use memcpy_{to,from}_bvec Use the helpers instead of open coding them. Signed-off-by: Christoph Hellwig Reviewed-by: Ira Weiny Link: https://lore.kernel.org/r/20220303111905.321089-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 19c2d0327e15..8c647532e3ce 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2485,11 +2485,9 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) } if (CT(raw_cmd->cmd[COMMAND]) == FD_READ) - memcpy_to_page(bv.bv_page, bv.bv_offset, dma_buffer, - size); + memcpy_to_bvec(&bv, dma_buffer); else - memcpy_from_page(dma_buffer, bv.bv_page, bv.bv_offset, - size); + memcpy_from_bvec(dma_buffer, &bv); remaining -= size; dma_buffer += size; From 7b1002f7cfe581930f63787a0b3de0144e61ed55 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Fri, 7 Jan 2022 16:21:13 +0800 Subject: [PATCH 50/51] bcache: fixup bcache_dev_sectors_dirty_add() multithreaded CPU false sharing When attaching a cached device (a.k.a backing device) to a cache device, bch_sectors_dirty_init() is called to count dirty sectors and stripes (see what bcache_dev_sectors_dirty_add() does) on the cache device. When bcache_dev_sectors_dirty_add() is called, set_bit(stripe, d->full_dirty_stripes) or clear_bit(stripe, d->full_dirty_stripes) operation will always be performed. In full_dirty_stripes, each 1bit represents stripe_size (8192) sectors (512B), so 1bit=4MB (8192*512), and each CPU cache line=64B=512bit=2048MB. When 20 threads process a cached disk with 100G dirty data, a single thread processes about 23M at a time, and 20 threads total 460M. These full_dirty_stripes bits corresponding to the 460M data is likely to fall in the same CPU cache line. When one of these threads performs a set_bit or clear_bit operation, the same CPU cache line of other threads will become invalid and must read the full_dirty_stripes from the main memory again. Compared with single thread, the time of a bcache_dev_sectors_dirty_add() call is increased by about 50 times in our test (100G dirty data, 20 threads, bcache_dev_sectors_dirty_add() is called more than 20 million times). This patch tries to test_bit before set_bit or clear_bit operation. Therefore, a lot of force set and clear operations will be avoided, and most of bcache_dev_sectors_dirty_add() calls will only read CPU cache line. Signed-off-by: Mingzhe Zou Signed-off-by: Coly Li --- drivers/md/bcache/writeback.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d42301e6309d..176461f89f46 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -585,10 +585,13 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, sectors_dirty = atomic_add_return(s, d->stripe_sectors_dirty + stripe); - if (sectors_dirty == d->stripe_size) - set_bit(stripe, d->full_dirty_stripes); - else - clear_bit(stripe, d->full_dirty_stripes); + if (sectors_dirty == d->stripe_size) { + if (!test_bit(stripe, d->full_dirty_stripes)) + set_bit(stripe, d->full_dirty_stripes); + } else { + if (test_bit(stripe, d->full_dirty_stripes)) + clear_bit(stripe, d->full_dirty_stripes); + } nr_sectors -= s; stripe_offset = 0; From 887554ab96588de2917b6c8c73e552da082e5368 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Fri, 11 Feb 2022 14:39:15 +0800 Subject: [PATCH 51/51] bcache: fixup multiple threads crash When multiple threads to check btree nodes in parallel, the main thread wait for all threads to stop or CACHE_SET_IO_DISABLE flag: wait_event_interruptible(check_state->wait, atomic_read(&check_state->started) == 0 || test_bit(CACHE_SET_IO_DISABLE, &c->flags)); However, the bch_btree_node_read and bch_btree_node_read_done maybe call bch_cache_set_error, then the CACHE_SET_IO_DISABLE will be set. If the flag already set, the main thread return error. At the same time, maybe some threads still running and read NULL pointer, the kernel will crash. This patch change the event wait condition, the main thread must wait for all threads to stop. Fixes: 8e7102273f597 ("bcache: make bch_btree_check() to be multithreaded") Signed-off-by: Mingzhe Zou Cc: stable@vger.kernel.org # v5.7+ Signed-off-by: Coly Li --- drivers/md/bcache/btree.c | 6 ++++-- drivers/md/bcache/writeback.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 88c573eeb598..ad9f16689419 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2060,9 +2060,11 @@ int bch_btree_check(struct cache_set *c) } } + /* + * Must wait for all threads to stop. + */ wait_event_interruptible(check_state->wait, - atomic_read(&check_state->started) == 0 || - test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + atomic_read(&check_state->started) == 0); for (i = 0; i < check_state->total_threads; i++) { if (check_state->infos[i].result) { diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 176461f89f46..9ee0005874cd 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -1001,9 +1001,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) } } + /* + * Must wait for all threads to stop. + */ wait_event_interruptible(state->wait, - atomic_read(&state->started) == 0 || - test_bit(CACHE_SET_IO_DISABLE, &c->flags)); + atomic_read(&state->started) == 0); out: kfree(state);