From 12e0613715e1cf305fffafaf0e89d810d9a85cc0 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 13 Mar 2021 11:01:44 +0800 Subject: [PATCH 001/129] block_dump: remove block_dump feature in mark_inode_dirty() block_dump is an old debugging interface, one of it's functions is used to print the information about who write which file on disk. If we enable block_dump through /proc/sys/vm/block_dump and turn on debug log level, we can gather information about write process name, target file name and disk from kernel message. This feature is realized in block_dump___mark_inode_dirty(), it print above information into kernel message directly when marking inode dirty, so it is noisy and can easily trigger log storm. At the same time, get the dentry refcount is also not safe, we found it will lead to deadlock on ext4 file system with data=journal mode. After tracepoints has been introduced into the kernel, we got a tracepoint in __mark_inode_dirty(), which is a better replacement of block_dump___mark_inode_dirty(). The only downside is that it only trace the inode number and not a file name, but it probably doesn't matter because the original printed file name in block_dump is not accurate in some cases, and we can still find it through the inode number and device id. So this patch delete the dirting inode part of block_dump feature. Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210313030146.2882027-2-yi.zhang@huawei.com Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e91980f49388..7c46d1588a19 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2205,28 +2205,6 @@ int dirtytime_interval_handler(struct ctl_table *table, int write, return ret; } -static noinline void block_dump___mark_inode_dirty(struct inode *inode) -{ - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { - struct dentry *dentry; - const char *name = "?"; - - dentry = d_find_alias(inode); - if (dentry) { - spin_lock(&dentry->d_lock); - name = (const char *) dentry->d_name.name; - } - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - if (dentry) { - spin_unlock(&dentry->d_lock); - dput(dentry); - } - } -} - /** * __mark_inode_dirty - internal function to mark an inode dirty * @@ -2296,9 +2274,6 @@ void __mark_inode_dirty(struct inode *inode, int flags) (dirtytime && (inode->i_state & I_DIRTY_INODE))) return; - if (unlikely(block_dump)) - block_dump___mark_inode_dirty(inode); - spin_lock(&inode->i_lock); if (dirtytime && (inode->i_state & I_DIRTY_INODE)) goto out_unlock_inode; From 3af3d772f7216cf23081bb4176e86f1219d32ebc Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 13 Mar 2021 11:01:45 +0800 Subject: [PATCH 002/129] block_dump: remove block_dump feature We have already delete block_dump feature in mark_inode_dirty() because it can be replaced by tracepoints, now we also remove the part in submit_bio() for the same reason. The part of block dump feature in submit_bio() dump the write process, write region and sectors on the target disk into kernel message. it can be replaced by block_bio_queue tracepoint in submit_bio_checks(), so we do not need block_dump anymore, remove the whole block_dump feature. 
Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210313030146.2882027-3-yi.zhang@huawei.com Signed-off-by: Jens Axboe --- block/blk-core.c | 9 --------- include/linux/writeback.h | 1 - kernel/sysctl.c | 8 -------- mm/page-writeback.c | 5 ----- 4 files changed, 23 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9bcdae93f6d4..689aac2625d2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1086,15 +1086,6 @@ blk_qc_t submit_bio(struct bio *bio) task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } - - if (unlikely(block_dump)) { - char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", - current->comm, task_pid_nr(current), - op_is_write(bio_op(bio)) ? "WRITE" : "READ", - (unsigned long long)bio->bi_iter.bi_sector, - bio_devname(bio, b), count); - } } /* diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 8e5c5bb16e2d..9ef50176f3a1 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -360,7 +360,6 @@ extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern unsigned int dirtytime_expire_interval; extern int vm_highmem_is_dirtyable; -extern int block_dump; extern int laptop_mode; int dirty_background_ratio_handler(struct ctl_table *table, int write, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 14edf84cc571..08e52b1090e9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2931,14 +2931,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "block_dump", - .data = &block_dump, - .maxlen = sizeof(block_dump), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, { .procname = "vfs_cache_pressure", .data = &sysctl_vfs_cache_pressure, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0062d5c57d41..fe72d5f65688 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -108,11 +108,6 @@ EXPORT_SYMBOL_GPL(dirty_writeback_interval); */ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ -/* - * Flag that makes the machine dump writes/reads and block dirtyings. - */ -int block_dump; - /* * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: * a full sync is triggered after this time elapses without any disk activity. From 51fd43e2801054b1321b1d81b91dc37efdff5127 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Sat, 13 Mar 2021 11:01:46 +0800 Subject: [PATCH 003/129] block_dump: remove comments in docs Now block_dump feature is gone, remove all comments in docs. Signed-off-by: zhangyi (F) Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210313030146.2882027-4-yi.zhang@huawei.com Signed-off-by: Jens Axboe --- Documentation/admin-guide/laptops/laptop-mode.rst | 11 ----------- Documentation/admin-guide/sysctl/vm.rst | 8 -------- 2 files changed, 19 deletions(-) diff --git a/Documentation/admin-guide/laptops/laptop-mode.rst b/Documentation/admin-guide/laptops/laptop-mode.rst index c984c4262f2e..b61cc601d298 100644 --- a/Documentation/admin-guide/laptops/laptop-mode.rst +++ b/Documentation/admin-guide/laptops/laptop-mode.rst @@ -101,17 +101,6 @@ this results in concentration of disk activity in a small time interval which occurs only once every 10 minutes, or whenever the disk is forced to spin up by a cache miss. The disk can then be spun down in the periods of inactivity. 
-If you want to find out which process caused the disk to spin up, you can -gather information by setting the flag /proc/sys/vm/block_dump. When this flag -is set, Linux reports all disk read and write operations that take place, and -all block dirtyings done to files. This makes it possible to debug why a disk -needs to spin up, and to increase battery life even more. The output of -block_dump is written to the kernel output, and it can be retrieved using -"dmesg". When you use block_dump and your kernel logging level also includes -kernel debugging messages, you probably want to turn off klogd, otherwise -the output of block_dump will be logged, causing disk activity that is not -normally there. - Configuration ------------- diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 586cd4b86428..3ca6679f16ea 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -25,7 +25,6 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: - admin_reserve_kbytes -- block_dump - compact_memory - compaction_proactiveness - compact_unevictable_allowed @@ -106,13 +105,6 @@ On x86_64 this is about 128MB. Changing this takes effect whenever an application requests memory. -block_dump -========== - -block_dump enables block I/O debugging when set to a nonzero value. More -information on block I/O debugging is in Documentation/admin-guide/laptops/laptop-mode.rst. - - compact_memory ============== From 65a8db393a8e49ee98432cf3e641d2bd2fa88606 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Tue, 11 May 2021 19:34:40 +0800 Subject: [PATCH 004/129] aoe: remove unnecessary mutex_init() The mutex ktio_spawn_lock is initialized statically. It is unnecessary to initialize by mutex_init(). Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Link: https://lore.kernel.org/r/20210511113440.3772053-1-yangyingliang@huawei.com Signed-off-by: Jens Axboe --- drivers/block/aoe/aoecmd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index ecd77897a761..588889bea7c3 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1701,8 +1701,6 @@ aoecmd_init(void) goto ktiowq_fail; } - mutex_init(&ktio_spawn_lock); - for (i = 0; i < ncpus; i++) { INIT_LIST_HEAD(&iocq[i].head); spin_lock_init(&iocq[i].lock); From b5f3352e0868611b555e1dcb2e1ffb8e346c519c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 May 2021 14:58:04 -0400 Subject: [PATCH 005/129] blkcg: drop CLONE_IO check in blkcg_can_attach() blkcg has always rejected to attach if any of the member tasks has shared io_context. The rationale was that io_contexts can be shared across different cgroups making it impossible to define what the appropriate control behavior should be. However, this check causes more problems than it solves: * The check prevents controller enable and migrations but not CLONE_IO itself, which can lead to surprises as the outcome changes depending on the order of operations. * Sharing within a cgroup is fine but the check can't distinguish that. This leads to unnecessary conflicts with the recent CLONE_IO usage in io_uring. io_context sharing doesn't make any difference for rq_qos based controllers and the way it's used is safe as long as tasks aren't migrated dynamically which is the vast majority of use cases. While we can try to make the check more precise to avoid false positives, the added complexity doesn't seem worthwhile. 
Let's just drop blkcg_can_attach(). Signed-off-by: Tejun Heo Link: https://lore.kernel.org/r/YJrTvHbrRDbJjw+S@slm.duckdns.org Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 582d2f18717e..d169e2055158 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1217,32 +1217,6 @@ void blkcg_exit_queue(struct request_queue *q) blk_throtl_exit(q); } -/* - * We cannot support shared io contexts, as we have no mean to support - * two tasks with the same ioc in two different groups without major rework - * of the main cic data structures. For now we allow a task to change - * its cgroup only if it's the only owner of its ioc. - */ -static int blkcg_can_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *dst_css; - struct io_context *ioc; - int ret = 0; - - /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, dst_css, tset) { - task_lock(task); - ioc = task->io_context; - if (ioc && atomic_read(&ioc->nr_tasks) > 1) - ret = -EINVAL; - task_unlock(task); - if (ret) - break; - } - return ret; -} - static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; @@ -1275,7 +1249,6 @@ struct cgroup_subsys io_cgrp_subsys = { .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, - .can_attach = blkcg_can_attach, .css_rstat_flush = blkcg_rstat_flush, .bind = blkcg_bind, .dfl_cftypes = blkcg_files, From 8c390ff910c5500fc16cca6f90ac2a60c7c84979 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 11 May 2021 15:53:19 +0000 Subject: [PATCH 006/129] block: remove unneeded parenthesis from blk-sysfs Align to common code conventions. 
Signed-off-by: Max Gurtovoy Link: https://lore.kernel.org/r/20210511155319.1885277-1-mgurtovoy@nvidia.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e03bedf180ab..f89e2fc3963b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -91,7 +91,7 @@ static ssize_t queue_ra_show(struct request_queue *q, char *page) unsigned long ra_kb = q->backing_dev_info->ra_pages << (PAGE_SHIFT - 10); - return queue_var_show(ra_kb, (page)); + return queue_var_show(ra_kb, page); } static ssize_t @@ -112,28 +112,28 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) { int max_sectors_kb = queue_max_sectors(q) >> 1; - return queue_var_show(max_sectors_kb, (page)); + return queue_var_show(max_sectors_kb, page); } static ssize_t queue_max_segments_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_segments(q), (page)); + return queue_var_show(queue_max_segments(q), page); } static ssize_t queue_max_discard_segments_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_discard_segments(q), (page)); + return queue_var_show(queue_max_discard_segments(q), page); } static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.max_integrity_segments, (page)); + return queue_var_show(q->limits.max_integrity_segments, page); } static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_segment_size(q), (page)); + return queue_var_show(queue_max_segment_size(q), page); } static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) @@ -261,12 +261,12 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) { int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; - return queue_var_show(max_hw_sectors_kb, (page)); + return queue_var_show(max_hw_sectors_kb, page); } static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.virt_boundary_mask, (page)); + return queue_var_show(q->limits.virt_boundary_mask, page); } #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ From 84da7acc3ba53af26f15c4b0ada446127b7a7836 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:33 +0800 Subject: [PATCH 007/129] block: avoid double io accounting for flush request For flush request, rq->end_io() may be called two times, one is from timeout handling(blk_mq_check_expired()), another is from normal completion(__blk_mq_end_request()). Move blk_account_io_flush() after flush_rq->ref drops to zero, so io accounting can be done just once for flush request. 
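As an illustration (a simplified sketch of the resulting flush_end_io(), not code added by this patch), the accounting call now sits behind the final reference drop, so only one of the two possible ->end_io() invocations ever reaches it:

static void flush_end_io(struct request *flush_rq, blk_status_t error)
{
	/* ... */
	spin_lock_irqsave(&fq->mq_flush_lock, flags);
	if (!refcount_dec_and_test(&flush_rq->ref)) {
		/* the other ->end_io() invocation still holds a reference */
		fq->rq_status = error;
		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
		return;
	}
	/* only the final reference drop gets here, so the flush is counted once */
	blk_account_io_flush(flush_rq);
	/* ... */
}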
Fixes: b68663186577 ("block: add iostat counters for flush requests") Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Tested-by: John Garry Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-flush.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index 7942ca6ed321..1002f6c58181 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -219,8 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); - blk_account_io_flush(flush_rq); - /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); @@ -230,6 +228,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) return; } + blk_account_io_flush(flush_rq); /* * Flush request has to be marked as IDLE when it is really ended * because its .end_io() is called from timeout code path too for From 2e315dc07df009c3e29d6926871f62a30cfae394 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:34 +0800 Subject: [PATCH 008/129] blk-mq: grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter Grab rq->refcount before calling ->fn in blk_mq_tagset_busy_iter(), and this way will prevent the request from being re-used when ->fn is running. The approach is same as what we do during handling timeout. Fix request use-after-free(UAF) related with completion race or queue releasing: - If one rq is referred before rq->q is frozen, then queue won't be frozen before the request is released during iteration. - If one rq is referred after rq->q is frozen, refcount_inc_not_zero() will return false, and we won't iterate over this request. However, still one request UAF not covered: refcount_inc_not_zero() may read one freed request, and it will be handled in next patch. Tested-by: John Garry Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 44 +++++++++++++++++++++++++++++++++----------- block/blk-mq.c | 14 +++++++++----- block/blk-mq.h | 1 + 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2a37731e8244..544edf2c56a5 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -199,6 +199,16 @@ struct bt_iter_data { bool reserved; }; +static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, + unsigned int bitnr) +{ + struct request *rq = tags->rqs[bitnr]; + + if (!rq || !refcount_inc_not_zero(&rq->ref)) + return NULL; + return rq; +} + static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; @@ -206,18 +216,22 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) struct blk_mq_tags *tags = hctx->tags; bool reserved = iter_data->reserved; struct request *rq; + bool ret = true; if (!reserved) bitnr += tags->nr_reserved_tags; - rq = tags->rqs[bitnr]; - /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. 
*/ - if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx) - return iter_data->fn(hctx, rq, iter_data->data, reserved); - return true; + rq = blk_mq_find_and_get_req(tags, bitnr); + if (!rq) + return true; + + if (rq->q == hctx->queue && rq->mq_hctx == hctx) + ret = iter_data->fn(hctx, rq, iter_data->data, reserved); + blk_mq_put_rq_ref(rq); + return ret; } /** @@ -264,6 +278,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) struct blk_mq_tags *tags = iter_data->tags; bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED; struct request *rq; + bool ret = true; + bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); if (!reserved) bitnr += tags->nr_reserved_tags; @@ -272,16 +288,19 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ - if (iter_data->flags & BT_TAG_ITER_STATIC_RQS) + if (iter_static_rqs) rq = tags->static_rqs[bitnr]; else - rq = tags->rqs[bitnr]; + rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; - if ((iter_data->flags & BT_TAG_ITER_STARTED) && - !blk_mq_request_started(rq)) - return true; - return iter_data->fn(rq, iter_data->data, reserved); + + if (!(iter_data->flags & BT_TAG_ITER_STARTED) || + blk_mq_request_started(rq)) + ret = iter_data->fn(rq, iter_data->data, reserved); + if (!iter_static_rqs) + blk_mq_put_rq_ref(rq); + return ret; } /** @@ -348,6 +367,9 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. + * + * We grab one request reference before calling @fn and release it after + * @fn returns. 
*/ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) diff --git a/block/blk-mq.c b/block/blk-mq.c index c86c01bfecdb..debfa5cd8025 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -909,6 +909,14 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; } +void blk_mq_put_rq_ref(struct request *rq) +{ + if (is_flush_rq(rq, rq->mq_hctx)) + rq->end_io(rq, 0); + else if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); +} + static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -942,11 +950,7 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, if (blk_mq_req_expired(rq, next)) blk_mq_rq_timed_out(rq, reserved); - if (is_flush_rq(rq, hctx)) - rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) - __blk_mq_free_request(rq); - + blk_mq_put_rq_ref(rq); return true; } diff --git a/block/blk-mq.h b/block/blk-mq.h index 9ce64bc4a6c8..556368d2c5b6 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -47,6 +47,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); +void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map From bd63141d585bef14f4caf111f6d0e27fe2300ec6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:35 +0800 Subject: [PATCH 009/129] blk-mq: clear stale request in tags->rq[] before freeing one request pool refcount_inc_not_zero() in bt_tags_iter() still may read one freed request. Fix the issue by the following approach: 1) hold a per-tags spinlock when reading ->rqs[tag] and calling refcount_inc_not_zero in bt_tags_iter() 2) clearing stale request referred via ->rqs[tag] before freeing request pool, the per-tags spinlock is held for clearing stale ->rq[tag] So after we cleared stale requests, bt_tags_iter() won't observe freed request any more, also the clearing will wait for pending request reference. The idea of clearing ->rqs[] is borrowed from John Garry's previous patch and one recent David's patch. 
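In outline (an illustrative composite of the hunks below, not new code), the lookup side and the freeing side now pair up as follows:

	/* iteration side: look up and reference the request under the per-tags lock */
	spin_lock_irqsave(&tags->lock, flags);
	rq = tags->rqs[bitnr];
	if (!rq || !refcount_inc_not_zero(&rq->ref))
		rq = NULL;
	spin_unlock_irqrestore(&tags->lock, flags);

	/* freeing side: clear stale ->rqs[] entries, then drain concurrent readers */
	cmpxchg(&drv_tags->rqs[i], rq, NULL);
	spin_lock_irqsave(&drv_tags->lock, flags);	/* empty critical section: any reader */
	spin_unlock_irqrestore(&drv_tags->lock, flags);	/* that saw the old rq has finished   */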
Tested-by: John Garry Reviewed-by: David Jeffery Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 9 +++++++-- block/blk-mq-tag.h | 6 ++++++ block/blk-mq.c | 46 +++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 544edf2c56a5..1671dae43030 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -202,10 +202,14 @@ struct bt_iter_data { static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { - struct request *rq = tags->rqs[bitnr]; + struct request *rq; + unsigned long flags; + spin_lock_irqsave(&tags->lock, flags); + rq = tags->rqs[bitnr]; if (!rq || !refcount_inc_not_zero(&rq->ref)) - return NULL; + rq = NULL; + spin_unlock_irqrestore(&tags->lock, flags); return rq; } @@ -538,6 +542,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; + spin_lock_init(&tags->lock); if (blk_mq_is_sbitmap_shared(flags)) return tags; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 7d3e6b333a4a..f887988e5ef6 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -20,6 +20,12 @@ struct blk_mq_tags { struct request **rqs; struct request **static_rqs; struct list_head page_list; + + /* + * used to clear request reference in rqs[] before freeing one + * request pool + */ + spinlock_t lock; }; extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, diff --git a/block/blk-mq.c b/block/blk-mq.c index debfa5cd8025..dd371f321d35 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2307,6 +2307,45 @@ queue_exit: return BLK_QC_T_NONE; } +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +/* called before freeing request pool in @tags */ +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) +{ + struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; + struct page *page; + unsigned long flags; + + list_for_each_entry(page, &tags->page_list, lru) { + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + order_to_size(page->private); + int i; + + for (i = 0; i < set->queue_depth; i++) { + struct request *rq = drv_tags->rqs[i]; + unsigned long rq_addr = (unsigned long)rq; + + if (rq_addr >= start && rq_addr < end) { + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + cmpxchg(&drv_tags->rqs[i], rq, NULL); + } + } + } + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. 
+ */ + spin_lock_irqsave(&drv_tags->lock, flags); + spin_unlock_irqrestore(&drv_tags->lock, flags); +} + void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { @@ -2325,6 +2364,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } + blk_mq_clear_rq_mapping(set, tags, hctx_idx); + while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); @@ -2384,11 +2425,6 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, return tags; } -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; -} - static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) { From 364b61818f65045479e42e76ed8dd6f051778280 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 11 May 2021 23:22:36 +0800 Subject: [PATCH 010/129] blk-mq: clearing flush request reference in tags->rqs[] Before we free request queue, clearing flush request reference in tags->rqs[], so that potential UAF can be avoided. Based on one patch written by David Jeffery. Tested-by: John Garry Reviewed-by: Bart Van Assche Reviewed-by: David Jeffery Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210511152236.763464-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index dd371f321d35..fbb165393790 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2643,16 +2643,49 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) &hctx->cpuhp_dead); } +/* + * Before freeing hw queue, clearing the flush request reference in + * tags->rqs[] for avoiding potential UAF. + */ +static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, + unsigned int queue_depth, struct request *flush_rq) +{ + int i; + unsigned long flags; + + /* The hw queue may not be mapped yet */ + if (!tags) + return; + + WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + + for (i = 0; i < queue_depth; i++) + cmpxchg(&tags->rqs[i], flush_rq, NULL); + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. + */ + spin_lock_irqsave(&tags->lock, flags); + spin_unlock_irqrestore(&tags->lock, flags); +} + /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + struct request *flush_rq = hctx->fq->flush_rq; + if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); + blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], + set->queue_depth, flush_rq); if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); + set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); From 56b68085e536eff2676108f2f8356889a7dbbf55 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 13 May 2021 20:00:57 +0800 Subject: [PATCH 011/129] blk-mq: Some tag allocation code refactoring The tag allocation code to alloc the sbitmap pairs is common for regular bitmaps tags and shared sbitmap, so refactor into a common function. Also remove superfluous "flags" argument from blk_mq_init_shared_sbitmap(). 
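With the helper in place, the two call sites reduce to the following (condensed from the hunks below; error handling elided):

	/* per-hctx tags, in blk_mq_init_bitmap_tags() */
	ret = blk_mq_init_bitmaps(&tags->__bitmap_tags, &tags->__breserved_tags,
				  tags->nr_tags, tags->nr_reserved_tags,
				  node, alloc_policy);

	/* tagset-wide shared sbitmap, in blk_mq_init_shared_sbitmap() */
	ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
				  set->queue_depth, set->reserved_tags,
				  set->numa_node, alloc_policy);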
Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1620907258-30910-2-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 56 ++++++++++++++++++++++++++++------------------ block/blk-mq-tag.h | 9 +++++--- block/blk-mq.c | 2 +- 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 1671dae43030..f597d40de10b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -471,39 +471,54 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, unsigned int reserved, + int node, int alloc_policy) +{ + unsigned int depth = queue_depth - reserved; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; + + if (bt_alloc(bitmap_tags, depth, round_robin, node)) + return -ENOMEM; + if (bt_alloc(breserved_tags, reserved, round_robin, node)) + goto free_bitmap_tags; + + return 0; + +free_bitmap_tags: + sbitmap_queue_free(bitmap_tags); + return -ENOMEM; +} + static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, int node, int alloc_policy) { - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; + int ret; - if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags, - round_robin, node)) - goto free_bitmap_tags; + ret = blk_mq_init_bitmaps(&tags->__bitmap_tags, + &tags->__breserved_tags, + tags->nr_tags, tags->nr_reserved_tags, + node, alloc_policy); + if (ret) + return ret; tags->bitmap_tags = &tags->__bitmap_tags; tags->breserved_tags = &tags->__breserved_tags; return 0; -free_bitmap_tags: - sbitmap_queue_free(&tags->__bitmap_tags); - return -ENOMEM; } -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) +int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set) { - unsigned int depth = set->queue_depth - set->reserved_tags; int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - int i, node = set->numa_node; + int i, ret; - if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(&set->__breserved_tags, set->reserved_tags, - round_robin, node)) - goto free_bitmap_tags; + ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags, + set->queue_depth, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret; for (i = 0; i < set->nr_hw_queues; i++) { struct blk_mq_tags *tags = set->tags[i]; @@ -513,9 +528,6 @@ int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) } return 0; -free_bitmap_tags: - sbitmap_queue_free(&set->__bitmap_tags); - return -ENOMEM; } void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index f887988e5ef6..8ed55af08427 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -32,11 +32,14 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, unsigned int flags); extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); +extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, + unsigned int reserved, + int node, int alloc_policy); -extern int blk_mq_init_shared_sbitmap(struct 
blk_mq_tag_set *set, - unsigned int flags); +extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set); extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); - extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); diff --git a/block/blk-mq.c b/block/blk-mq.c index fbb165393790..001e196bdebd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3564,7 +3564,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (blk_mq_is_sbitmap_shared(set->flags)) { atomic_set(&set->active_queues_shared_sbitmap, 0); - if (blk_mq_init_shared_sbitmap(set, set->flags)) { + if (blk_mq_init_shared_sbitmap(set)) { ret = -ENOMEM; goto out_free_mq_rq_maps; } From d97e594c51660bea510a387731637b894651e4b5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 13 May 2021 20:00:58 +0800 Subject: [PATCH 012/129] blk-mq: Use request queue-wide tags for tagset-wide sbitmap The tags used for an IO scheduler are currently per hctx. As such, when q->nr_hw_queues grows, so does the request queue total IO scheduler tag depth. This may cause problems for SCSI MQ HBAs whose total driver depth is fixed. Ming and Yanhui report higher CPU usage and lower throughput in scenarios where the fixed total driver tag depth is appreciably lower than the total scheduler tag depth: https://lore.kernel.org/linux-block/440dfcfc-1a2c-bd98-1161-cec4d78c6dfc@huawei.com/T/#mc0d6d4f95275a2743d1c8c3e4dc9ff6c9aa3a76b In that scenario, since the scheduler tag is got first, much contention is introduced since a driver tag may not be available after we have got the sched tag. Improve this scenario by introducing request queue-wide tags for when a tagset-wide sbitmap is used. The static sched requests are still allocated per hctx, as requests are initialised per hctx, as in blk_mq_init_request(..., hctx_idx, ...) -> set->ops->init_request(.., hctx_idx, ...). For simplicity of resizing the request queue sbitmap when updating the request queue depth, just init at the max possible size, so we don't need to deal with the possibly with swapping out a new sbitmap for old if we need to grow. 
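Schematically (a condensed view of the blk_mq_init_sched_shared_sbitmap() added below, not extra code), every hctx's scheduler tags end up aliasing one request-queue-wide sbitmap pair, allocated once at the maximum size and then resized to the active depth:

	queue_for_each_hw_ctx(queue, hctx, i) {
		hctx->sched_tags->bitmap_tags = &queue->sched_bitmap_tags;
		hctx->sched_tags->breserved_tags = &queue->sched_breserved_tags;
	}
	sbitmap_queue_resize(&queue->sched_bitmap_tags,
			     queue->nr_requests - set->reserved_tags);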
Signed-off-by: John Garry Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/1620907258-30910-3-git-send-email-john.garry@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 67 ++++++++++++++++++++++++++++++++++-------- block/blk-mq-sched.h | 2 ++ block/blk-mq-tag.c | 11 ++++--- block/blk-mq.c | 13 ++++++-- include/linux/blkdev.h | 4 +++ 5 files changed, 76 insertions(+), 21 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 996a4b2f73aa..045b6878b8c5 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -509,11 +509,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - if (hctx->sched_tags) { blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - blk_mq_free_rq_map(hctx->sched_tags, flags); + blk_mq_free_rq_map(hctx->sched_tags, set->flags); hctx->sched_tags = NULL; } } @@ -523,12 +521,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, unsigned int hctx_idx) { struct blk_mq_tag_set *set = q->tag_set; - /* Clear HCTX_SHARED so tags are init'ed */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; int ret; hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags, flags); + set->reserved_tags, set->flags); if (!hctx->sched_tags) return -ENOMEM; @@ -546,16 +542,50 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - /* Clear HCTX_SHARED so tags are freed */ - unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags, flags); + blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); hctx->sched_tags = NULL; } } } +static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) +{ + struct blk_mq_tag_set *set = queue->tag_set; + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); + struct blk_mq_hw_ctx *hctx; + int ret, i; + + /* + * Set initial depth at max so that we don't need to reallocate for + * updating nr_requests. 
+ */ + ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, + &queue->sched_breserved_tags, + MAX_SCHED_RQ, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret; + + queue_for_each_hw_ctx(queue, hctx, i) { + hctx->sched_tags->bitmap_tags = + &queue->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &queue->sched_breserved_tags; + } + + sbitmap_queue_resize(&queue->sched_bitmap_tags, + queue->nr_requests - set->reserved_tags); + + return 0; +} + +static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) +{ + sbitmap_queue_free(&queue->sched_bitmap_tags); + sbitmap_queue_free(&queue->sched_breserved_tags); +} + int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; @@ -580,12 +610,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_sched_alloc_tags(q, hctx, i); if (ret) - goto err; + goto err_free_tags; + } + + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { + ret = blk_mq_init_sched_shared_sbitmap(q); + if (ret) + goto err_free_tags; } ret = e->ops.init_sched(q, e); if (ret) - goto err; + goto err_free_sbitmap; blk_mq_debugfs_register_sched(q); @@ -605,7 +641,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; -err: +err_free_sbitmap: + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_exit_sched_shared_sbitmap(q); +err_free_tags: blk_mq_sched_free_requests(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL; @@ -643,5 +682,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_exit_sched_shared_sbitmap(q); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 5b18ab915c65..aff037cfd8e7 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -5,6 +5,8 @@ #include "blk-mq.h" #include "blk-mq-tag.h" +#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) + void blk_mq_sched_assign_ioc(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index f597d40de10b..86f87346232a 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -13,6 +13,7 @@ #include #include "blk.h" #include "blk-mq.h" +#include "blk-mq-sched.h" #include "blk-mq-tag.h" /* @@ -590,8 +591,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; - /* Only sched tags can grow, so clear HCTX_SHARED flag */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; struct blk_mq_tags *new; bool ret; @@ -602,21 +601,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, * We need some sort of upper limit, set it high enough that * no valid use cases should require more. 
*/ - if (tdepth > 16 * BLKDEV_MAX_RQ) + if (tdepth > MAX_SCHED_RQ) return -EINVAL; new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags, flags); + tags->nr_reserved_tags, set->flags); if (!new) return -ENOMEM; ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); if (ret) { - blk_mq_free_rq_map(new, flags); + blk_mq_free_rq_map(new, set->flags); return -ENOMEM; } blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr, flags); + blk_mq_free_rq_map(*tagsptr, set->flags); *tagsptr = new; } else { /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 001e196bdebd..f11d4018ce2e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3640,15 +3640,24 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); + if (blk_mq_is_sbitmap_shared(set->flags)) { + hctx->sched_tags->bitmap_tags = + &q->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &q->sched_breserved_tags; + } } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } - - if (!ret) + if (!ret) { q->nr_requests = nr; + if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) + sbitmap_queue_resize(&q->sched_bitmap_tags, + nr - set->reserved_tags); + } blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f69c75bd6d27..2c28577b50f4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -25,6 +25,7 @@ #include #include #include +#include struct module; struct scsi_ioctl_command; @@ -493,6 +494,9 @@ struct request_queue { atomic_t nr_active_requests_shared_sbitmap; + struct sbitmap_queue sched_bitmap_tags; + struct sbitmap_queue sched_breserved_tags; + struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); From 7c3f828b522b07adb341b08fde1660685c5ba3eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:51 +0200 Subject: [PATCH 013/129] block: refactor device number setup in __device_add_disk Untangle the mess around blk_alloc_devt by moving the check for the used allocation scheme into the callers. 
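Concretely, the allocation-scheme check ends up inline in the callers, roughly as follows (condensed from the __device_add_disk() and add_partition() hunks below):

	/* whole device, in __device_add_disk() */
	if (disk->major) {
		/* driver supplied an explicit major and minor range */
	} else {
		ret = blk_alloc_ext_minor();
		if (ret < 0)
			return;
		disk->major = BLOCK_EXT_MAJOR;
		disk->first_minor = MINOR(ret);
	}

	/* partition, in add_partition() */
	if (bdev->bd_partno < disk->minors) {
		devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno);
	} else {
		err = blk_alloc_ext_minor();
		if (err < 0)
			goto out_put;
		devt = MKDEV(BLOCK_EXT_MAJOR, err);
	}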
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 4 +- block/genhd.c | 96 ++++++++++++++++------------------------- block/partitions/core.c | 15 +++++-- 3 files changed, 49 insertions(+), 66 deletions(-) diff --git a/block/blk.h b/block/blk.h index 8b3591aee0a5..cba3a94aabfa 100644 --- a/block/blk.h +++ b/block/blk.h @@ -343,8 +343,8 @@ static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} static inline void blk_queue_clear_zone_settings(struct request_queue *q) {} #endif -int blk_alloc_devt(struct block_device *part, dev_t *devt); -void blk_free_devt(dev_t devt); +int blk_alloc_ext_minor(void); +void blk_free_ext_minor(unsigned int minor); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 diff --git a/block/genhd.c b/block/genhd.c index 9f8cb7beaad1..3daab80201df 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -333,52 +333,22 @@ static int blk_mangle_minor(int minor) return minor; } -/** - * blk_alloc_devt - allocate a dev_t for a block device - * @bdev: block device to allocate dev_t for - * @devt: out parameter for resulting dev_t - * - * Allocate a dev_t for block device. - * - * RETURNS: - * 0 on success, allocated dev_t is returned in *@devt. -errno on - * failure. - * - * CONTEXT: - * Might sleep. - */ -int blk_alloc_devt(struct block_device *bdev, dev_t *devt) +int blk_alloc_ext_minor(void) { - struct gendisk *disk = bdev->bd_disk; int idx; - /* in consecutive minor range? */ - if (bdev->bd_partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); - return 0; - } - idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); - if (idx < 0) - return idx == -ENOSPC ? -EBUSY : idx; - - *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); - return 0; + if (idx < 0) { + if (idx == -ENOSPC) + return -EBUSY; + return idx; + } + return blk_mangle_minor(idx); } -/** - * blk_free_devt - free a dev_t - * @devt: dev_t to free - * - * Free @devt which was allocated using blk_alloc_devt(). - * - * CONTEXT: - * Might sleep. - */ -void blk_free_devt(dev_t devt) +void blk_free_ext_minor(unsigned int minor) { - if (MAJOR(devt) == BLOCK_EXT_MAJOR) - ida_free(&ext_devt_ida, blk_mangle_minor(MINOR(devt))); + ida_free(&ext_devt_ida, blk_mangle_minor(minor)); } static char *bdevt_str(dev_t devt, char *buf) @@ -499,8 +469,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, bool register_queue) { - dev_t devt; - int retval; + int ret; /* * The disk queue should now be all set with enough information about @@ -511,24 +480,30 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, if (register_queue) elevator_init_mq(disk->queue); - /* minors == 0 indicates to use ext devt from part0 and should - * be accompanied with EXT_DEVT flag. Make sure all - * parameters make sense. + /* + * If the driver provides an explicit major number it also must provide + * the number of minors numbers supported, and those will be used to + * setup the gendisk. + * Otherwise just allocate the device numbers for both the whole device + * and all partitions from the extended dev_t space. 
*/ - WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && - !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); + if (disk->major) { + WARN_ON(!disk->minors); + } else { + WARN_ON(disk->minors); + WARN_ON(!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); + + ret = blk_alloc_ext_minor(); + if (ret < 0) { + WARN_ON(1); + return; + } + disk->major = BLOCK_EXT_MAJOR; + disk->first_minor = MINOR(ret); + } disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(disk->part0, &devt); - if (retval) { - WARN_ON(1); - return; - } - disk->major = MAJOR(devt); - disk->first_minor = MINOR(devt); - disk_alloc_events(disk); if (disk->flags & GENHD_FL_HIDDEN) { @@ -541,14 +516,14 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, } else { struct backing_dev_info *bdi = disk->queue->backing_dev_info; struct device *dev = disk_to_dev(disk); - int ret; /* Register BDI before referencing it from bdev */ - dev->devt = devt; - ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); + dev->devt = MKDEV(disk->major, disk->first_minor); + ret = bdi_register(bdi, "%u:%u", + disk->major, disk->first_minor); WARN_ON(ret); bdi_set_owner(bdi, dev); - bdev_add(disk->part0, devt); + bdev_add(disk->part0, dev->devt); } register_disk(parent, disk, groups); if (register_queue) @@ -1120,7 +1095,8 @@ static void disk_release(struct device *dev) might_sleep(); - blk_free_devt(dev->devt); + if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(dev->devt)); disk_release_events(disk); kfree(disk->random); xa_destroy(&disk->part_tbl); diff --git a/block/partitions/core.c b/block/partitions/core.c index dc60ecf46fe6..504297bdc8bf 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -260,7 +260,8 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - blk_free_devt(dev->devt); + if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(dev->devt)); bdput(dev_to_bdev(dev)); } @@ -379,9 +380,15 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, pdev->type = &part_type; pdev->parent = ddev; - err = blk_alloc_devt(bdev, &devt); - if (err) - goto out_put; + /* in consecutive minor range? */ + if (bdev->bd_partno < disk->minors) { + devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); + } else { + err = blk_alloc_ext_minor(); + if (err < 0) + goto out_put; + devt = MKDEV(BLOCK_EXT_MAJOR, err); + } pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ From 2e3c73fa0c419f62fd588731be30fb0d1bca9ad6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:52 +0200 Subject: [PATCH 014/129] block: move the DISK_MAX_PARTS sanity check into __device_add_disk Keep this together with the first place that actually looks at ->minors and prepare for not passing a minors argument to alloc_disk. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-3-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 3daab80201df..8c1816d2929e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -489,6 +489,12 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, */ if (disk->major) { WARN_ON(!disk->minors); + + if (disk->minors > DISK_MAX_PARTS) { + pr_err("block: can't allocate more than %d partitions\n", + DISK_MAX_PARTS); + disk->minors = DISK_MAX_PARTS; + } } else { WARN_ON(disk->minors); WARN_ON(!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); @@ -1255,13 +1261,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) { struct gendisk *disk; - if (minors > DISK_MAX_PARTS) { - printk(KERN_ERR - "block: can't allocate more than %d partitions\n", - DISK_MAX_PARTS); - minors = DISK_MAX_PARTS; - } - disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); if (!disk) return NULL; From 0d1feb72ffd8578f6f167ca15b2096c276c1f6df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:53 +0200 Subject: [PATCH 015/129] block: automatically enable GENHD_FL_EXT_DEVT Automatically set the GENHD_FL_EXT_DEVT flag for all disks allocated without an explicit number of minors. This is what all new block drivers should do, so make sure it is the default without boilerplate code. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-4-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- block/partitions/core.c | 4 ---- drivers/block/n64cart.c | 2 +- drivers/lightnvm/core.c | 1 - drivers/memstick/core/ms_block.c | 1 - drivers/nvdimm/blk.c | 1 - drivers/nvdimm/btt.c | 1 - drivers/nvdimm/pmem.c | 1 - drivers/nvme/host/core.c | 1 - drivers/nvme/host/multipath.c | 1 - 10 files changed, 2 insertions(+), 13 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 8c1816d2929e..9fa734cb9cbd 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -497,7 +497,6 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, } } else { WARN_ON(disk->minors); - WARN_ON(!(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); ret = blk_alloc_ext_minor(); if (ret < 0) { @@ -506,6 +505,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, } disk->major = BLOCK_EXT_MAJOR; disk->first_minor = MINOR(ret); + disk->flags |= GENHD_FL_EXT_DEVT; } disk->flags |= GENHD_FL_UP; diff --git a/block/partitions/core.c b/block/partitions/core.c index 504297bdc8bf..ada3e1e66989 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -326,10 +326,6 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, const char *dname; int err; - /* - * disk_max_parts() won't be zero, either GENHD_FL_EXT_DEVT is set - * or 'minors' is passed to alloc_disk(). 
- */ if (partno >= disk_max_parts(disk)) return ERR_PTR(-EINVAL); diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 47bdf324e962..3dae4b631dea 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -141,7 +141,7 @@ static int __init n64cart_probe(struct platform_device *pdev) return -ENOMEM; disk->first_minor = 0; - disk->flags = GENHD_FL_NO_PART_SCAN | GENHD_FL_EXT_DEVT; + disk->flags = GENHD_FL_NO_PART_SCAN; disk->fops = &n64cart_fops; disk->private_data = &pdev->dev; strcpy(disk->disk_name, "n64cart"); diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 40a948c08a0b..e7dc539fc0ac 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -383,7 +383,6 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) } strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); - tdisk->flags = GENHD_FL_EXT_DEVT; tdisk->major = 0; tdisk->first_minor = 0; tdisk->fops = tt->bops; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 8004dd64d09a..0bacf4268f83 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2136,7 +2136,6 @@ static int msb_init_disk(struct memstick_dev *card) msb->disk->fops = &msb_bdops; msb->disk->private_data = msb; msb->disk->queue = msb->queue; - msb->disk->flags |= GENHD_FL_EXT_DEVT; capacity = msb->pages_in_block * msb->logical_block_count; capacity *= (msb->page_size / 512); diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 7b9556291eb1..7ba446d224fb 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -267,7 +267,6 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) disk->first_minor = 0; disk->fops = &nd_blk_fops; disk->queue = q; - disk->flags = GENHD_FL_EXT_DEVT; disk->private_data = nsblk; nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 18a267d5073f..1741a7b0b30f 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1537,7 +1537,6 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->fops = &btt_fops; btt->btt_disk->private_data = btt; btt->btt_disk->queue = btt->btt_queue; - btt->btt_disk->flags = GENHD_FL_EXT_DEVT; blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index ed10a8b66068..968b8483c763 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -477,7 +477,6 @@ static int pmem_attach_disk(struct device *dev, disk->fops = &pmem_fops; disk->queue = q; - disk->flags = GENHD_FL_EXT_DEVT; disk->private_data = pmem; nvdimm_namespace_disk_name(ndns, disk->disk_name); set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 762125f2905f..24bcae88587a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3699,7 +3699,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, disk->fops = &nvme_bdev_ops; disk->private_data = ns; disk->queue = ns->queue; - disk->flags = GENHD_FL_EXT_DEVT; /* * Without the multipath code enabled, multiple controller per * subsystems are visible as devices and thus we cannot use the diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f81871c7128a..a5d02f236cca 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -462,7 +462,6 @@ int 
nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) head->disk->fops = &nvme_ns_head_ops; head->disk->private_data = head; head->disk->queue = q; - head->disk->flags = GENHD_FL_EXT_DEVT; sprintf(head->disk->disk_name, "nvme%dn%d", ctrl->subsys->instance, head->instance); return 0; From 958229a7c55f219b1cff99f939dabbc1b6ba7161 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:54 +0200 Subject: [PATCH 016/129] block: add a flag to make put_disk on partially initalized disks safer Add a flag to indicate that __device_add_disk did grab a queue reference so that disk_release only drops it if we actually had it. This sort out one of the major pitfals with partially initialized gendisk that a lot of drivers did get wrong or still do. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Luis Chamberlain Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-5-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 7 +++++-- include/linux/genhd.h | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 9fa734cb9cbd..c826db33a73e 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -539,7 +539,10 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, * Take an extra ref on queue which will be put on disk_release() * so that it sticks around as long as @disk is there. */ - WARN_ON_ONCE(!blk_get_queue(disk->queue)); + if (blk_get_queue(disk->queue)) + set_bit(GD_QUEUE_REF, &disk->state); + else + WARN_ON_ONCE(1); disk_add_events(disk); blk_integrity_add(disk); @@ -1107,7 +1110,7 @@ static void disk_release(struct device *dev) kfree(disk->random); xa_destroy(&disk->part_tbl); bdput(disk->part0); - if (disk->queue) + if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue) blk_put_queue(disk->queue); kfree(disk); } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6fc26f7bdf71..4d3ee8b6b297 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -153,6 +153,7 @@ struct gendisk { unsigned long state; #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 +#define GD_QUEUE_REF 2 struct kobject *slave_dir; struct timer_rand_state *random; From f525464a8000f092c20b00eead3eaa9d849c599e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:55 +0200 Subject: [PATCH 017/129] block: add blk_alloc_disk and blk_cleanup_disk APIs Add two new APIs to allocate and free a gendisk including the request_queue for use with BIO based drivers. This is to avoid boilerplate code in drivers. 
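For a BIO-based driver the result looks roughly like the sketch below (illustrative only: "mydrv", mydrv_fops and mydrv_nr_sectors are made-up names, not part of this patch; the brd conversion in the following patch is the real example):

static struct gendisk *mydrv_disk;

static int mydrv_probe(void)
{
	mydrv_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (!mydrv_disk)
		return -ENOMEM;

	/* the disk already carries its request_queue; just fill in the rest */
	mydrv_disk->fops = &mydrv_fops;		/* ->submit_bio based */
	strlcpy(mydrv_disk->disk_name, "mydrv0", DISK_NAME_LEN);
	set_capacity(mydrv_disk, mydrv_nr_sectors);
	add_disk(mydrv_disk);
	return 0;
}

static void mydrv_remove(void)
{
	del_gendisk(mydrv_disk);
	blk_cleanup_disk(mydrv_disk);	/* tears down both the queue and the gendisk */
}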
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-6-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 35 +++++++++++++++++++++++++++++++++++ include/linux/genhd.h | 22 ++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index c826db33a73e..efe0db4d62f0 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1293,6 +1293,25 @@ out_free_disk: } EXPORT_SYMBOL(__alloc_disk_node); +struct gendisk *__blk_alloc_disk(int node) +{ + struct request_queue *q; + struct gendisk *disk; + + q = blk_alloc_queue(node); + if (!q) + return NULL; + + disk = __alloc_disk_node(0, node); + if (!disk) { + blk_cleanup_queue(q); + return NULL; + } + disk->queue = q; + return disk; +} +EXPORT_SYMBOL(__blk_alloc_disk); + /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for @@ -1310,6 +1329,22 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); +/** + * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk + * @disk: gendisk to shutdown + * + * Mark the queue hanging off @disk DYING, drain all pending requests, then mark + * the queue DEAD, destroy and put it and the gendisk structure. + * + * Context: can sleep + */ +void blk_cleanup_disk(struct gendisk *disk) +{ + blk_cleanup_queue(disk->queue); + put_disk(disk); +} +EXPORT_SYMBOL(blk_cleanup_disk); + static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 4d3ee8b6b297..782f0171d104 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -278,6 +278,28 @@ extern void put_disk(struct gendisk *disk); #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) +/** + * blk_alloc_disk - allocate a gendisk structure + * @node_id: numa node to allocate on + * + * Allocate and pre-initialize a gendisk structure for use with BIO based + * drivers. + * + * Context: can sleep + */ +#define blk_alloc_disk(node_id) \ +({ \ + struct gendisk *__disk = __blk_alloc_disk(node_id); \ + static struct lock_class_key __key; \ + \ + if (__disk) \ + lockdep_init_map(&__disk->lockdep_map, \ + "(bio completion)", &__key, 0); \ + __disk; \ +}) +struct gendisk *__blk_alloc_disk(int node); +void blk_cleanup_disk(struct gendisk *disk); + int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)); #define register_blkdev(major, name) \ From 7f9b348cb5e94259acdcbafbcaed55d3bb515304 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:56 +0200 Subject: [PATCH 018/129] brd: convert to blk_alloc_disk/blk_cleanup_disk Convert the brd driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. This also allows to remove the request_queue pointer in struct request_queue, and to simplify the initialization as blk_cleanup_disk can be called on any disk returned from blk_alloc_disk. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/brd.c | 94 ++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 61 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 7562cf30b14e..95694113e38e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -38,9 +38,7 @@ * device). 
*/ struct brd_device { - int brd_number; - - struct request_queue *brd_queue; + int brd_number; struct gendisk *brd_disk; struct list_head brd_list; @@ -372,7 +370,7 @@ static LIST_HEAD(brd_devices); static DEFINE_MUTEX(brd_devices_mutex); static struct dentry *brd_debugfs_dir; -static struct brd_device *brd_alloc(int i) +static int brd_alloc(int i) { struct brd_device *brd; struct gendisk *disk; @@ -380,64 +378,55 @@ static struct brd_device *brd_alloc(int i) brd = kzalloc(sizeof(*brd), GFP_KERNEL); if (!brd) - goto out; + return -ENOMEM; brd->brd_number = i; spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); - brd->brd_queue = blk_alloc_queue(NUMA_NO_NODE); - if (!brd->brd_queue) - goto out_free_dev; - snprintf(buf, DISK_NAME_LEN, "ram%d", i); if (!IS_ERR_OR_NULL(brd_debugfs_dir)) debugfs_create_u64(buf, 0444, brd_debugfs_dir, &brd->brd_nr_pages); - /* This is so fdisk will align partitions on 4k, because of - * direct_access API needing 4k alignment, returning a PFN - * (This is only a problem on very small devices <= 4M, - * otherwise fdisk will align on 1M. Regardless this call - * is harmless) - */ - blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE); - disk = brd->brd_disk = alloc_disk(max_part); + disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) - goto out_free_queue; + goto out_free_dev; + disk->major = RAMDISK_MAJOR; disk->first_minor = i * max_part; + disk->minors = max_part; disk->fops = &brd_fops; disk->private_data = brd; disk->flags = GENHD_FL_EXT_DEVT; strlcpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); + + /* + * This is so fdisk will align partitions on 4k, because of + * direct_access API needing 4k alignment, returning a PFN + * (This is only a problem on very small devices <= 4M, + * otherwise fdisk will align on 1M. 
Regardless this call + * is harmless) + */ + blk_queue_physical_block_size(disk->queue, PAGE_SIZE); /* Tell the block layer that this is not a rotational device */ - blk_queue_flag_set(QUEUE_FLAG_NONROT, brd->brd_queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, brd->brd_queue); + blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue); + add_disk(disk); + list_add_tail(&brd->brd_list, &brd_devices); - return brd; + return 0; -out_free_queue: - blk_cleanup_queue(brd->brd_queue); out_free_dev: kfree(brd); -out: - return NULL; -} - -static void brd_free(struct brd_device *brd) -{ - put_disk(brd->brd_disk); - blk_cleanup_queue(brd->brd_queue); - brd_free_pages(brd); - kfree(brd); + return -ENOMEM; } static void brd_probe(dev_t dev) { - struct brd_device *brd; int i = MINOR(dev) / max_part; + struct brd_device *brd; mutex_lock(&brd_devices_mutex); list_for_each_entry(brd, &brd_devices, brd_list) { @@ -445,13 +434,7 @@ static void brd_probe(dev_t dev) goto out_unlock; } - brd = brd_alloc(i); - if (brd) { - brd->brd_disk->queue = brd->brd_queue; - add_disk(brd->brd_disk); - list_add_tail(&brd->brd_list, &brd_devices); - } - + brd_alloc(i); out_unlock: mutex_unlock(&brd_devices_mutex); } @@ -460,7 +443,9 @@ static void brd_del_one(struct brd_device *brd) { list_del(&brd->brd_list); del_gendisk(brd->brd_disk); - brd_free(brd); + blk_cleanup_disk(brd->brd_disk); + brd_free_pages(brd); + kfree(brd); } static inline void brd_check_and_reset_par(void) @@ -485,7 +470,7 @@ static inline void brd_check_and_reset_par(void) static int __init brd_init(void) { struct brd_device *brd, *next; - int i; + int err, i; /* * brd module now has a feature to instantiate underlying device @@ -511,22 +496,11 @@ static int __init brd_init(void) mutex_lock(&brd_devices_mutex); for (i = 0; i < rd_nr; i++) { - brd = brd_alloc(i); - if (!brd) + err = brd_alloc(i); + if (err) goto out_free; - list_add_tail(&brd->brd_list, &brd_devices); } - /* point of no return */ - - list_for_each_entry(brd, &brd_devices, brd_list) { - /* - * associate with queue just before adding disk for - * avoiding to mess up failure path - */ - brd->brd_disk->queue = brd->brd_queue; - add_disk(brd->brd_disk); - } mutex_unlock(&brd_devices_mutex); pr_info("brd: module loaded\n"); @@ -535,15 +509,13 @@ static int __init brd_init(void) out_free: debugfs_remove_recursive(brd_debugfs_dir); - list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { - list_del(&brd->brd_list); - brd_free(brd); - } + list_for_each_entry_safe(brd, next, &brd_devices, brd_list) + brd_del_one(brd); mutex_unlock(&brd_devices_mutex); unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); pr_info("brd: module NOT loaded !!!\n"); - return -ENOMEM; + return err; } static void __exit brd_exit(void) From b647ad024841d02d67e78716f51f355d8d3e9656 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:57 +0200 Subject: [PATCH 019/129] drbd: convert to blk_alloc_disk/blk_cleanup_disk Convert the drbd driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index de463773b530..55234a558e98 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2231,8 +2231,7 @@ void drbd_destroy_device(struct kref *kref) if (device->bitmap) /* should no longer be there. */ drbd_bm_cleanup(device); __free_page(device->md_io.page); - put_disk(device->vdisk); - blk_cleanup_queue(device->rq_queue); + blk_cleanup_disk(device->vdisk); kfree(device->rs_plan_s); /* not for_each_connection(connection, resource): @@ -2701,7 +2700,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig struct drbd_device *device; struct drbd_peer_device *peer_device, *tmp_peer_device; struct gendisk *disk; - struct request_queue *q; int id; int vnr = adm_ctx->volume; enum drbd_ret_code err = ERR_NOMEM; @@ -2723,29 +2721,26 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - q = blk_alloc_queue(NUMA_NO_NODE); - if (!q) - goto out_no_q; - device->rq_queue = q; - - disk = alloc_disk(1); + disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) goto out_no_disk; + device->vdisk = disk; + device->rq_queue = disk->queue; set_disk_ro(disk, true); - disk->queue = q; disk->major = DRBD_MAJOR; disk->first_minor = minor; + disk->minors = 1; disk->fops = &drbd_ops; sprintf(disk->disk_name, "drbd%d", minor); disk->private_data = device; - blk_queue_write_cache(q, true, true); + blk_queue_write_cache(disk->queue, true, true); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ - blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); + blk_queue_max_hw_sectors(disk->queue, DRBD_MAX_BIO_SIZE_SAFE >> 8); device->md_io.page = alloc_page(GFP_KERNEL); if (!device->md_io.page) @@ -2834,10 +2829,8 @@ out_no_minor_idr: out_no_bitmap: __free_page(device->md_io.page); out_no_io_page: - put_disk(disk); + blk_cleanup_disk(disk); out_no_disk: - blk_cleanup_queue(q); -out_no_q: kref_put(&resource->kref, drbd_destroy_resource); kfree(device); return err; From 444134845277ad37c8ca7d1321d3dd57b96b5ae0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:58 +0200 Subject: [PATCH 020/129] pktcdvd: convert to blk_alloc_disk/blk_cleanup_disk Convert the pktcdvd driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/pktcdvd.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index bd3556585122..f69b5c69c2a6 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2711,19 +2711,17 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) pd->write_congestion_off = write_congestion_off; ret = -ENOMEM; - disk = alloc_disk(1); + disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) goto out_mem; pd->disk = disk; disk->major = pktdev_major; disk->first_minor = idx; + disk->minors = 1; disk->fops = &pktcdvd_ops; disk->flags = GENHD_FL_REMOVABLE; strcpy(disk->disk_name, pd->name); disk->private_data = pd; - disk->queue = blk_alloc_queue(NUMA_NO_NODE); - if (!disk->queue) - goto out_mem2; pd->pkt_dev = MKDEV(pktdev_major, idx); ret = pkt_new_dev(pd, dev); @@ -2746,7 +2744,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) return 0; out_mem2: - put_disk(disk); + blk_cleanup_disk(disk); out_mem: mempool_exit(&pd->rb_pool); kfree(pd); @@ -2796,8 +2794,7 @@ static int pkt_remove_dev(dev_t pkt_dev) pkt_dbg(1, pd, "writer unmapped\n"); del_gendisk(pd->disk); - blk_cleanup_queue(pd->disk->queue); - put_disk(pd->disk); + blk_cleanup_disk(pd->disk); mempool_exit(&pd->rb_pool); kfree(pd); From 0be7966e7e6e8c57c3f63b16ddeed73e68313a89 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:50:59 +0200 Subject: [PATCH 021/129] rsxx: convert to blk_alloc_disk/blk_cleanup_disk Convert the rsxx driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rsxx/dev.c | 39 +++++++++++++--------------------- drivers/block/rsxx/rsxx_priv.h | 1 - 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 9a28322a8cd8..1cc40b0ea761 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -236,47 +236,40 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) return -ENOMEM; } - card->queue = blk_alloc_queue(NUMA_NO_NODE); - if (!card->queue) { - dev_err(CARD_TO_DEV(card), "Failed queue alloc\n"); - unregister_blkdev(card->major, DRIVER_NAME); - return -ENOMEM; - } - - card->gendisk = alloc_disk(blkdev_minors); + card->gendisk = blk_alloc_disk(blkdev_minors); if (!card->gendisk) { dev_err(CARD_TO_DEV(card), "Failed disk alloc\n"); - blk_cleanup_queue(card->queue); unregister_blkdev(card->major, DRIVER_NAME); return -ENOMEM; } if (card->config_valid) { blk_size = card->config.data.block_size; - blk_queue_dma_alignment(card->queue, blk_size - 1); - blk_queue_logical_block_size(card->queue, blk_size); + blk_queue_dma_alignment(card->gendisk->queue, blk_size - 1); + blk_queue_logical_block_size(card->gendisk->queue, blk_size); } - blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors); - blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE); + blk_queue_max_hw_sectors(card->gendisk->queue, blkdev_max_hw_sectors); + blk_queue_physical_block_size(card->gendisk->queue, RSXX_HW_BLK_SIZE); - blk_queue_flag_set(QUEUE_FLAG_NONROT, card->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, card->queue); + blk_queue_flag_set(QUEUE_FLAG_NONROT, card->gendisk->queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, card->gendisk->queue); if (rsxx_discard_supported(card)) { - blk_queue_flag_set(QUEUE_FLAG_DISCARD, card->queue); - blk_queue_max_discard_sectors(card->queue, + blk_queue_flag_set(QUEUE_FLAG_DISCARD, card->gendisk->queue); + blk_queue_max_discard_sectors(card->gendisk->queue, RSXX_HW_BLK_SIZE >> 9); - card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE; - card->queue->limits.discard_alignment = RSXX_HW_BLK_SIZE; + card->gendisk->queue->limits.discard_granularity = + RSXX_HW_BLK_SIZE; + card->gendisk->queue->limits.discard_alignment = + RSXX_HW_BLK_SIZE; } snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), "rsxx%d", card->disk_id); card->gendisk->major = card->major; - card->gendisk->first_minor = 0; + card->gendisk->minors = blkdev_minors; card->gendisk->fops = &rsxx_fops; card->gendisk->private_data = card; - card->gendisk->queue = card->queue; return 0; } @@ -286,10 +279,8 @@ void rsxx_destroy_dev(struct rsxx_cardinfo *card) if (!enable_blkdev) return; - put_disk(card->gendisk); + blk_cleanup_disk(card->gendisk); card->gendisk = NULL; - - blk_cleanup_queue(card->queue); unregister_blkdev(card->major, DRIVER_NAME); } diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h index 6147977994ff..26c320c0d924 100644 --- a/drivers/block/rsxx/rsxx_priv.h +++ b/drivers/block/rsxx/rsxx_priv.h @@ -154,7 +154,6 @@ struct rsxx_cardinfo { bool bdev_attached; int disk_id; int major; - struct request_queue *queue; struct gendisk *gendisk; struct { /* Used to convert a byte address to a device address. 
*/ From 7681750bd35fe92dd915f4df177d45265e78a933 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:00 +0200 Subject: [PATCH 022/129] zram: convert to blk_alloc_disk/blk_cleanup_disk Convert the zram driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index cf8deecc39ef..006416cc4969 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1890,7 +1890,6 @@ static const struct attribute_group *zram_disk_attr_groups[] = { static int zram_add(void) { struct zram *zram; - struct request_queue *queue; int ret, device_id; zram = kzalloc(sizeof(struct zram), GFP_KERNEL); @@ -1906,27 +1905,20 @@ static int zram_add(void) #ifdef CONFIG_ZRAM_WRITEBACK spin_lock_init(&zram->wb_limit_lock); #endif - queue = blk_alloc_queue(NUMA_NO_NODE); - if (!queue) { - pr_err("Error allocating disk queue for device %d\n", + + /* gendisk structure */ + zram->disk = blk_alloc_disk(NUMA_NO_NODE); + if (!zram->disk) { + pr_err("Error allocating disk structure for device %d\n", device_id); ret = -ENOMEM; goto out_free_idr; } - /* gendisk structure */ - zram->disk = alloc_disk(1); - if (!zram->disk) { - pr_err("Error allocating disk structure for device %d\n", - device_id); - ret = -ENOMEM; - goto out_free_queue; - } - zram->disk->major = zram_major; zram->disk->first_minor = device_id; + zram->disk->minors = 1; zram->disk->fops = &zram_devops; - zram->disk->queue = queue; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); @@ -1969,8 +1961,6 @@ static int zram_add(void) pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; -out_free_queue: - blk_cleanup_queue(queue); out_free_idr: idr_remove(&zram_index_idr, device_id); out_free_dev: @@ -2000,8 +1990,7 @@ static int zram_remove(struct zram *zram) pr_info("Removed device: %s\n", zram->disk->disk_name); del_gendisk(zram->disk); - blk_cleanup_queue(zram->disk->queue); - put_disk(zram->disk); + blk_cleanup_disk(zram->disk); kfree(zram); return 0; } From 1aabd53a4b49adaf65319aa622e612b6edb2b663 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:01 +0200 Subject: [PATCH 023/129] lightnvm: convert to blk_alloc_disk/blk_cleanup_disk Convert the lightnvm driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index e7dc539fc0ac..cf8a75494833 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -305,7 +305,6 @@ static int __nvm_config_extended(struct nvm_dev *dev, static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) { struct nvm_ioctl_create_extended e; - struct request_queue *tqueue; struct gendisk *tdisk; struct nvm_tgt_type *tt; struct nvm_target *t; @@ -370,23 +369,16 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) goto err_t; } - tdisk = alloc_disk(0); + tdisk = blk_alloc_disk(dev->q->node); if (!tdisk) { ret = -ENOMEM; goto err_dev; } - tqueue = blk_alloc_queue(dev->q->node); - if (!tqueue) { - ret = -ENOMEM; - goto err_disk; - } - strlcpy(tdisk->disk_name, create->tgtname, sizeof(tdisk->disk_name)); tdisk->major = 0; tdisk->first_minor = 0; tdisk->fops = tt->bops; - tdisk->queue = tqueue; targetdata = tt->init(tgt_dev, tdisk, create->flags); if (IS_ERR(targetdata)) { @@ -395,14 +387,14 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) } tdisk->private_data = targetdata; - tqueue->queuedata = targetdata; + tdisk->queue->queuedata = targetdata; mdts = (dev->geo.csecs >> 9) * NVM_MAX_VLBA; if (dev->geo.mdts) { mdts = min_t(u32, dev->geo.mdts, (dev->geo.csecs >> 9) * NVM_MAX_VLBA); } - blk_queue_max_hw_sectors(tqueue, mdts); + blk_queue_max_hw_sectors(tdisk->queue, mdts); set_capacity(tdisk, tt->capacity(targetdata)); add_disk(tdisk); @@ -427,10 +419,7 @@ err_sysfs: if (tt->exit) tt->exit(targetdata, true); err_init: - blk_cleanup_queue(tqueue); - tdisk->queue = NULL; -err_disk: - put_disk(tdisk); + blk_cleanup_disk(tdisk); err_dev: nvm_remove_tgt_dev(tgt_dev, 0); err_t: @@ -444,10 +433,8 @@ static void __nvm_remove_target(struct nvm_target *t, bool graceful) { struct nvm_tgt_type *tt = t->type; struct gendisk *tdisk = t->disk; - struct request_queue *q = tdisk->queue; del_gendisk(tdisk); - blk_cleanup_queue(q); if (tt->sysfs_exit) tt->sysfs_exit(tdisk); @@ -456,7 +443,7 @@ static void __nvm_remove_target(struct nvm_target *t, bool graceful) tt->exit(tdisk->private_data, graceful); nvm_remove_tgt_dev(t->dev, 1); - put_disk(tdisk); + blk_cleanup_disk(tdisk); module_put(t->type->owner); list_del(&t->list); From bc70852fd10415cda727577f12ea93e502eb1027 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:02 +0200 Subject: [PATCH 024/129] bcache: convert to blk_alloc_disk/blk_cleanup_disk Convert the bcache driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Acked-by: Coly Li Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-13-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index bea8c4429ae8..185246a0d855 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -890,13 +890,9 @@ static void bcache_device_free(struct bcache_device *d) if (disk_added) del_gendisk(disk); - if (disk->queue) - blk_cleanup_queue(disk->queue); - + blk_cleanup_disk(disk); ida_simple_remove(&bcache_device_idx, first_minor_to_idx(disk->first_minor)); - if (disk_added) - put_disk(disk); } bioset_exit(&d->bio_split); @@ -946,7 +942,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto err; - d->disk = alloc_disk(BCACHE_MINORS); + d->disk = blk_alloc_disk(NUMA_NO_NODE); if (!d->disk) goto err; @@ -955,14 +951,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->major = bcache_major; d->disk->first_minor = idx_to_first_minor(idx); + d->disk->minors = BCACHE_MINORS; d->disk->fops = ops; d->disk->private_data = d; - q = blk_alloc_queue(NUMA_NO_NODE); - if (!q) - return -ENOMEM; - - d->disk->queue = q; + q = d->disk->queue; q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; From 74fe6ba9239497e5fa383a15efa9f5ffc23b11f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:03 +0200 Subject: [PATCH 025/129] dm: convert to blk_alloc_disk/blk_cleanup_disk Convert the dm driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-14-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/dm.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ca2aedd8ee7d..3c7c2d257018 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1801,13 +1801,13 @@ static void cleanup_mapped_device(struct mapped_device *md) md->disk->private_data = NULL; spin_unlock(&_minor_lock); del_gendisk(md->disk); - put_disk(md->disk); } - if (md->queue) { + if (md->queue) dm_queue_destroy_keyslot_manager(md->queue); - blk_cleanup_queue(md->queue); - } + + if (md->disk) + blk_cleanup_disk(md->disk); cleanup_srcu_struct(&md->io_barrier); @@ -1869,13 +1869,10 @@ static struct mapped_device *alloc_dev(int minor) * established. If request-based table is loaded: blk-mq will * override accordingly. 
*/ - md->queue = blk_alloc_queue(numa_node_id); - if (!md->queue) - goto bad; - - md->disk = alloc_disk_node(1, md->numa_node_id); + md->disk = blk_alloc_disk(md->numa_node_id); if (!md->disk) goto bad; + md->queue = md->disk->queue; init_waitqueue_head(&md->wait); INIT_WORK(&md->work, dm_wq_work); @@ -1888,6 +1885,7 @@ static struct mapped_device *alloc_dev(int minor) md->disk->major = _major; md->disk->first_minor = minor; + md->disk->minors = 1; md->disk->fops = &dm_blk_dops; md->disk->queue = md->queue; md->disk->private_data = md; From 0f1d2e0643c544df50dbc436da930201218fa1e2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:04 +0200 Subject: [PATCH 026/129] md: convert to blk_alloc_disk/blk_cleanup_disk Convert the md driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-15-hch@lst.de Signed-off-by: Jens Axboe --- drivers/md/md.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 49f897fbb89b..d806be8cc210 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5598,12 +5598,10 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); - if (mddev->gendisk) + if (mddev->gendisk) { del_gendisk(mddev->gendisk); - if (mddev->queue) - blk_cleanup_queue(mddev->queue); - if (mddev->gendisk) - put_disk(mddev->gendisk); + blk_cleanup_disk(mddev->gendisk); + } percpu_ref_exit(&mddev->writes_pending); bioset_exit(&mddev->bio_set); @@ -5711,20 +5709,13 @@ static int md_alloc(dev_t dev, char *name) goto abort; error = -ENOMEM; - mddev->queue = blk_alloc_queue(NUMA_NO_NODE); - if (!mddev->queue) + disk = blk_alloc_disk(NUMA_NO_NODE); + if (!disk) goto abort; - blk_set_stacking_limits(&mddev->queue->limits); - - disk = alloc_disk(1 << shift); - if (!disk) { - blk_cleanup_queue(mddev->queue); - mddev->queue = NULL; - goto abort; - } disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; + disk->minors = 1 << shift; if (name) strcpy(disk->disk_name, name); else if (partitioned) @@ -5733,7 +5724,9 @@ static int md_alloc(dev_t dev, char *name) sprintf(disk->disk_name, "md%d", unit); disk->fops = &md_fops; disk->private_data = mddev; - disk->queue = mddev->queue; + + mddev->queue = disk->queue; + blk_set_stacking_limits(&mddev->queue->limits); blk_queue_write_cache(mddev->queue, true, true); /* Allow extended partitions. This makes the * 'mdp' device redundant, but we can't really From 6c552ceabf39797fba1d3088af9481511f02393f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:05 +0200 Subject: [PATCH 027/129] nvdimm-blk: convert to blk_alloc_disk/blk_cleanup_disk Convert the nvdimm-blk driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
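[Editor's sketch: the nvdimm conversions also show how the combined teardown folds into the devm machinery, with one release action unregistering and freeing the disk. The diff below registers nd_blk_release_disk via devm_add_action_or_reset; the consolidated shape is roughly the following, where sketch_release_disk and sketch_attach_disk are made-up names.]

#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/genhd.h>

static void sketch_release_disk(void *disk)
{
	del_gendisk(disk);
	blk_cleanup_disk(disk);
}

static int sketch_attach_disk(struct device *dev, struct gendisk *disk)
{
	/* fops, disk_name and queue limits are assumed set up by the caller */
	add_disk(disk);

	/* on failure this runs the action immediately, so the disk is still
	 * torn down; otherwise it runs when the driver detaches */
	return devm_add_action_or_reset(dev, sketch_release_disk, disk);
}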
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-16-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvdimm/blk.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 7ba446d224fb..088d3dd6f6fa 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -228,48 +228,34 @@ static const struct block_device_operations nd_blk_fops = { .submit_bio = nd_blk_submit_bio, }; -static void nd_blk_release_queue(void *q) -{ - blk_cleanup_queue(q); -} - static void nd_blk_release_disk(void *disk) { del_gendisk(disk); - put_disk(disk); + blk_cleanup_disk(disk); } static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) { struct device *dev = &nsblk->common.dev; resource_size_t available_disk_size; - struct request_queue *q; struct gendisk *disk; u64 internal_nlba; internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk)); available_disk_size = internal_nlba * nsblk_sector_size(nsblk); - q = blk_alloc_queue(NUMA_NO_NODE); - if (!q) - return -ENOMEM; - if (devm_add_action_or_reset(dev, nd_blk_release_queue, q)) - return -ENOMEM; - - blk_queue_max_hw_sectors(q, UINT_MAX); - blk_queue_logical_block_size(q, nsblk_sector_size(nsblk)); - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - - disk = alloc_disk(0); + disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) return -ENOMEM; - disk->first_minor = 0; disk->fops = &nd_blk_fops; - disk->queue = q; disk->private_data = nsblk; nvdimm_namespace_disk_name(&nsblk->common, disk->disk_name); + blk_queue_max_hw_sectors(disk->queue, UINT_MAX); + blk_queue_logical_block_size(disk->queue, nsblk_sector_size(nsblk)); + blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + if (devm_add_action_or_reset(dev, nd_blk_release_disk, disk)) return -ENOMEM; From d4e4e5835f29fa1a1dcdecc5bea125050274d0f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:06 +0200 Subject: [PATCH 028/129] nvdimm-btt: convert to blk_alloc_disk/blk_cleanup_disk Convert the nvdimm-btt driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
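[Editor's sketch of the ordering rule that the btt error path below makes explicit: before add_disk() has been called, blk_cleanup_disk() on its own is the complete unwind; once the disk is registered, del_gendisk() has to come first. setup_pre_add() and setup_post_add() are hypothetical helpers standing in for driver-specific steps.]

static int sketch_init(void)
{
	struct gendisk *disk = blk_alloc_disk(NUMA_NO_NODE);

	if (!disk)
		return -ENOMEM;

	if (setup_pre_add(disk)) {
		/* disk not registered yet: blk_cleanup_disk() alone unwinds */
		blk_cleanup_disk(disk);
		return -EIO;
	}

	add_disk(disk);

	if (setup_post_add(disk)) {
		/* disk is registered: unregister before freeing */
		del_gendisk(disk);
		blk_cleanup_disk(disk);
		return -EIO;
	}
	return 0;
}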
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-17-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvdimm/btt.c | 24 +++++++----------------- drivers/nvdimm/btt.h | 2 -- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 1741a7b0b30f..92dec4952297 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1521,34 +1521,25 @@ static int btt_blk_init(struct btt *btt) struct nd_btt *nd_btt = btt->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; - /* create a new disk and request queue for btt */ - btt->btt_queue = blk_alloc_queue(NUMA_NO_NODE); - if (!btt->btt_queue) + btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE); + if (!btt->btt_disk) return -ENOMEM; - btt->btt_disk = alloc_disk(0); - if (!btt->btt_disk) { - blk_cleanup_queue(btt->btt_queue); - return -ENOMEM; - } - nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); btt->btt_disk->first_minor = 0; btt->btt_disk->fops = &btt_fops; btt->btt_disk->private_data = btt; - btt->btt_disk->queue = btt->btt_queue; - blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); - blk_queue_max_hw_sectors(btt->btt_queue, UINT_MAX); - blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_queue); + blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size); + blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX); + blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); if (btt_meta_size(btt)) { int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt)); if (rc) { del_gendisk(btt->btt_disk); - put_disk(btt->btt_disk); - blk_cleanup_queue(btt->btt_queue); + blk_cleanup_disk(btt->btt_disk); return rc; } } @@ -1563,8 +1554,7 @@ static int btt_blk_init(struct btt *btt) static void btt_blk_cleanup(struct btt *btt) { del_gendisk(btt->btt_disk); - put_disk(btt->btt_disk); - blk_cleanup_queue(btt->btt_queue); + blk_cleanup_disk(btt->btt_disk); } /** diff --git a/drivers/nvdimm/btt.h b/drivers/nvdimm/btt.h index aa53e0b769bd..0c76c0333f6e 100644 --- a/drivers/nvdimm/btt.h +++ b/drivers/nvdimm/btt.h @@ -201,7 +201,6 @@ struct badblocks; /** * struct btt - handle for a BTT instance * @btt_disk: Pointer to the gendisk for BTT device - * @btt_queue: Pointer to the request queue for the BTT device * @arena_list: Head of the list of arenas * @debugfs_dir: Debugfs dentry * @nd_btt: Parent nd_btt struct @@ -219,7 +218,6 @@ struct badblocks; */ struct btt { struct gendisk *btt_disk; - struct request_queue *btt_queue; struct list_head arena_list; struct dentry *debugfs_dir; struct nd_btt *nd_btt; From 87eb73b2ca7c1b913e84d6efe46810fd301e7a66 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:07 +0200 Subject: [PATCH 029/129] nvdimm-pmem: convert to blk_alloc_disk/blk_cleanup_disk Convert the nvdimm-pmem driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-18-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvdimm/pmem.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 968b8483c763..9fcd05084564 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -338,7 +338,7 @@ static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap) struct request_queue *q = container_of(pgmap->ref, struct request_queue, q_usage_counter); - blk_cleanup_queue(q); + blk_cleanup_disk(queue_to_disk(q)); } static void pmem_release_queue(void *pgmap) @@ -361,7 +361,6 @@ static void pmem_release_disk(void *__pmem) kill_dax(pmem->dax_dev); put_dax(pmem->dax_dev); del_gendisk(pmem->disk); - put_disk(pmem->disk); } static const struct dev_pagemap_ops fsdax_pagemap_ops = { @@ -422,10 +421,12 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - q = blk_alloc_queue(dev_to_node(dev)); - if (!q) + disk = blk_alloc_disk(nid); + if (!disk) return -ENOMEM; + q = disk->queue; + pmem->disk = disk; pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; if (is_nd_pfn(dev)) { @@ -470,11 +471,6 @@ static int pmem_attach_disk(struct device *dev, if (pmem->pfn_flags & PFN_MAP) blk_queue_flag_set(QUEUE_FLAG_DAX, q); - disk = alloc_disk_node(0, nid); - if (!disk) - return -ENOMEM; - pmem->disk = disk; - disk->fops = &pmem_fops; disk->queue = q; disk->private_data = pmem; @@ -490,7 +486,6 @@ static int pmem_attach_disk(struct device *dev, flags = DAXDEV_F_SYNC; dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops, flags); if (IS_ERR(dax_dev)) { - put_disk(disk); return PTR_ERR(dax_dev); } dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); From f165fb89b71facbef833c6244abf8b9887b899d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:08 +0200 Subject: [PATCH 030/129] nvme-multipath: convert to blk_alloc_disk/blk_cleanup_disk Convert the nvme-multipath driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-19-hch@lst.de Signed-off-by: Jens Axboe --- drivers/nvdimm/pmem.c | 1 - drivers/nvme/host/multipath.c | 47 +++++++++++------------------------ 2 files changed, 14 insertions(+), 34 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 9fcd05084564..31f3c4bd6f72 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -472,7 +472,6 @@ static int pmem_attach_disk(struct device *dev, blk_queue_flag_set(QUEUE_FLAG_DAX, q); disk->fops = &pmem_fops; - disk->queue = q; disk->private_data = pmem; nvdimm_namespace_disk_name(ndns, disk->disk_name); set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index a5d02f236cca..b5fbdb416022 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -427,7 +427,6 @@ static void nvme_requeue_work(struct work_struct *work) int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) { - struct request_queue *q; bool vwc = false; mutex_init(&head->lock); @@ -443,33 +442,24 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) return 0; - q = blk_alloc_queue(ctrl->numa_node); - if (!q) - goto out; - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - /* set to a default value for 512 until disk is validated */ - blk_queue_logical_block_size(q, 512); - blk_set_stacking_limits(&q->limits); + head->disk = blk_alloc_disk(ctrl->numa_node); + if (!head->disk) + return -ENOMEM; + head->disk->fops = &nvme_ns_head_ops; + head->disk->private_data = head; + sprintf(head->disk->disk_name, "nvme%dn%d", + ctrl->subsys->instance, head->instance); + + blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); + /* set to a default value of 512 until the disk is validated */ + blk_queue_logical_block_size(head->disk->queue, 512); + blk_set_stacking_limits(&head->disk->queue->limits); /* we need to propagate up the VMC settings */ if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) vwc = true; - blk_queue_write_cache(q, vwc, vwc); - - head->disk = alloc_disk(0); - if (!head->disk) - goto out_cleanup_queue; - head->disk->fops = &nvme_ns_head_ops; - head->disk->private_data = head; - head->disk->queue = q; - sprintf(head->disk->disk_name, "nvme%dn%d", - ctrl->subsys->instance, head->instance); + blk_queue_write_cache(head->disk->queue, vwc, vwc); return 0; - -out_cleanup_queue: - blk_cleanup_queue(q); -out: - return -ENOMEM; } static void nvme_mpath_set_live(struct nvme_ns *ns) @@ -768,16 +758,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) /* make sure all pending bios are cleaned up */ kblockd_schedule_work(&head->requeue_work); flush_work(&head->requeue_work); - blk_cleanup_queue(head->disk->queue); - if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) { - /* - * if device_add_disk wasn't called, prevent - * disk release to put a bogus reference on the - * request queue - */ - head->disk->queue = NULL; - } - put_disk(head->disk); + blk_cleanup_disk(head->disk); } void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl) From c3e235383645de20796efd2821c60bf6a7ab5f2e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:09 +0200 Subject: [PATCH 031/129] nfblock: convert to blk_alloc_disk/blk_cleanup_disk Convert the nfblock driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and 
request_queue allocation. Signed-off-by: Christoph Hellwig Acked-by: Geert Uytterhoeven Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-20-hch@lst.de Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index ba808543161a..9a8394e96388 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -55,7 +55,6 @@ struct nfhd_device { int id; u32 blocks, bsize; int bshift; - struct request_queue *queue; struct gendisk *disk; }; @@ -119,32 +118,24 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->queue = blk_alloc_queue(NUMA_NO_NODE); - if (dev->queue == NULL) - goto free_dev; - - blk_queue_logical_block_size(dev->queue, bsize); - - dev->disk = alloc_disk(16); + dev->disk = blk_alloc_disk(NUMA_NO_NODE); if (!dev->disk) - goto free_queue; + goto free_dev; dev->disk->major = major_num; dev->disk->first_minor = dev_id * 16; + dev->disk->minors = 16; dev->disk->fops = &nfhd_ops; dev->disk->private_data = dev; sprintf(dev->disk->disk_name, "nfhd%u", dev_id); set_capacity(dev->disk, (sector_t)blocks * (bsize / 512)); - dev->disk->queue = dev->queue; - + blk_queue_logical_block_size(dev->disk->queue, bsize); add_disk(dev->disk); list_add_tail(&dev->list, &nfhd_list); return 0; -free_queue: - blk_cleanup_queue(dev->queue); free_dev: kfree(dev); out: @@ -186,8 +177,7 @@ static void __exit nfhd_exit(void) list_for_each_entry_safe(dev, next, &nfhd_list, list) { list_del(&dev->list); del_gendisk(dev->disk); - put_disk(dev->disk); - blk_cleanup_queue(dev->queue); + blk_cleanup_disk(dev->disk); kfree(dev); } unregister_blkdev(major_num, "nfhd"); From b1833edc4c95d801b249159be361af6d3c3ea44d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:10 +0200 Subject: [PATCH 032/129] simdisk: convert to blk_alloc_disk/blk_cleanup_disk Convert the simdisk driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-21-hch@lst.de Signed-off-by: Jens Axboe --- arch/xtensa/platforms/iss/simdisk.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index fc09be7b1347..3cdfa00738e0 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -27,7 +27,6 @@ struct simdisk { const char *filename; spinlock_t lock; - struct request_queue *queue; struct gendisk *gd; struct proc_dir_entry *procfile; int users; @@ -266,21 +265,13 @@ static int __init simdisk_setup(struct simdisk *dev, int which, spin_lock_init(&dev->lock); dev->users = 0; - dev->queue = blk_alloc_queue(NUMA_NO_NODE); - if (dev->queue == NULL) { - pr_err("blk_alloc_queue failed\n"); - goto out_alloc_queue; - } - - dev->gd = alloc_disk(SIMDISK_MINORS); - if (dev->gd == NULL) { - pr_err("alloc_disk failed\n"); - goto out_alloc_disk; - } + dev->gd = blk_alloc_disk(NUMA_NO_NODE); + if (!dev->gd) + return -ENOMEM; dev->gd->major = simdisk_major; dev->gd->first_minor = which; + dev->gd->minors = SIMDISK_MINORS; dev->gd->fops = &simdisk_ops; - dev->gd->queue = dev->queue; dev->gd->private_data = dev; snprintf(dev->gd->disk_name, 32, "simdisk%d", which); set_capacity(dev->gd, 0); @@ -288,12 +279,6 @@ static int __init simdisk_setup(struct simdisk *dev, int which, dev->procfile = proc_create_data(tmp, 0644, procdir, &simdisk_proc_ops, dev); return 0; - -out_alloc_disk: - blk_cleanup_queue(dev->queue); - dev->queue = NULL; -out_alloc_queue: - return -ENOMEM; } static int __init simdisk_init(void) @@ -343,10 +328,10 @@ static void simdisk_teardown(struct simdisk *dev, int which, char tmp[2] = { '0' + which, 0 }; simdisk_detach(dev); - if (dev->gd) + if (dev->gd) { del_gendisk(dev->gd); - if (dev->queue) - blk_cleanup_queue(dev->queue); + blk_cleanup_disk(dev->gd); + } remove_proc_entry(tmp, procdir); } From f9dc931de80664eb78cbc8c85052bd0856d4aa9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:11 +0200 Subject: [PATCH 033/129] n64cart: convert to blk_alloc_disk Convert the n64cart driver to use the blk_alloc_disk helper to simplify gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-22-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/n64cart.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index 3dae4b631dea..7b4dd10af9ec 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -132,14 +132,10 @@ static int __init n64cart_probe(struct platform_device *pdev) if (!reg_base) return -EINVAL; - disk = alloc_disk(0); + disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) return -ENOMEM; - disk->queue = blk_alloc_queue(NUMA_NO_NODE); - if (!disk->queue) - return -ENOMEM; - disk->first_minor = 0; disk->flags = GENHD_FL_NO_PART_SCAN; disk->fops = &n64cart_fops; From 684bf9cd8d29503ee47a6942e34e1d2f0a4774fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:12 +0200 Subject: [PATCH 034/129] ps3vram: convert to blk_alloc_disk/blk_cleanup_disk Convert the ps3vram driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-23-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ps3vram.c | 31 ++++++++----------------------- 1 file changed, 8 insertions(+), 23 deletions(-) diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 1d738999fb69..7fbf469651c4 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -67,7 +67,6 @@ struct ps3vram_cache { }; struct ps3vram_priv { - struct request_queue *queue; struct gendisk *gendisk; u64 size; @@ -613,7 +612,6 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) { struct ps3vram_priv *priv; int error, status; - struct request_queue *queue; struct gendisk *gendisk; u64 ddr_size, ddr_lpar, ctrl_lpar, info_lpar, reports_lpar, reports_size, xdr_lpar; @@ -736,33 +734,23 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) ps3vram_proc_init(dev); - queue = blk_alloc_queue(NUMA_NO_NODE); - if (!queue) { - dev_err(&dev->core, "blk_alloc_queue failed\n"); + gendisk = blk_alloc_disk(NUMA_NO_NODE); + if (!gendisk) { + dev_err(&dev->core, "blk_alloc_disk failed\n"); error = -ENOMEM; goto out_cache_cleanup; } - priv->queue = queue; - blk_queue_max_segments(queue, BLK_MAX_SEGMENTS); - blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE); - blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS); - - gendisk = alloc_disk(1); - if (!gendisk) { - dev_err(&dev->core, "alloc_disk failed\n"); - error = -ENOMEM; - goto fail_cleanup_queue; - } - priv->gendisk = gendisk; gendisk->major = ps3vram_major; - gendisk->first_minor = 0; + gendisk->minors = 1; gendisk->fops = &ps3vram_fops; - gendisk->queue = queue; gendisk->private_data = dev; strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); set_capacity(gendisk, priv->size >> 9); + blk_queue_max_segments(gendisk->queue, BLK_MAX_SEGMENTS); + blk_queue_max_segment_size(gendisk->queue, BLK_MAX_SEGMENT_SIZE); + blk_queue_max_hw_sectors(gendisk->queue, BLK_SAFE_MAX_SECTORS); dev_info(&dev->core, "%s: Using %llu MiB of GPU memory\n", gendisk->disk_name, get_capacity(gendisk) >> 11); @@ -770,8 +758,6 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) device_add_disk(&dev->core, gendisk, NULL); return 0; -fail_cleanup_queue: - blk_cleanup_queue(queue); out_cache_cleanup: remove_proc_entry(DEVICE_NAME, NULL); ps3vram_cache_cleanup(dev); @@ -802,8 +788,7 @@ static void ps3vram_remove(struct ps3_system_bus_device *dev) struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); del_gendisk(priv->gendisk); - put_disk(priv->gendisk); - blk_cleanup_queue(priv->queue); + blk_cleanup_disk(priv->gendisk); remove_proc_entry(DEVICE_NAME, NULL); ps3vram_cache_cleanup(dev); iounmap(priv->reports); From 0692ef289f067d1de416cd33c9cf8d7e006293dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:13 +0200 Subject: [PATCH 035/129] dcssblk: convert to blk_alloc_disk/blk_cleanup_disk Convert the dcssblk driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-24-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/dcssblk.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index da33cb4cba28..7faa56399999 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -90,7 +90,6 @@ struct dcssblk_dev_info { int segment_type; unsigned char save_pending; unsigned char is_shared; - struct request_queue *dcssblk_queue; int num_of_segments; struct list_head seg_list; struct dax_device *dax_dev; @@ -429,9 +428,7 @@ removeseg: kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); del_gendisk(dev_info->gd); - blk_cleanup_queue(dev_info->dcssblk_queue); - dev_info->gd->queue = NULL; - put_disk(dev_info->gd); + blk_cleanup_disk(dev_info->gd); up_write(&dcssblk_devices_sem); if (device_remove_file_self(dev, attr)) { @@ -644,18 +641,17 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->dev.release = dcssblk_release_segment; dev_info->dev.groups = dcssblk_dev_attr_groups; INIT_LIST_HEAD(&dev_info->lh); - dev_info->gd = alloc_disk(DCSSBLK_MINORS_PER_DISK); + dev_info->gd = blk_alloc_disk(NUMA_NO_NODE); if (dev_info->gd == NULL) { rc = -ENOMEM; goto seg_list_del; } dev_info->gd->major = dcssblk_major; + dev_info->gd->minors = DCSSBLK_MINORS_PER_DISK; dev_info->gd->fops = &dcssblk_devops; - dev_info->dcssblk_queue = blk_alloc_queue(NUMA_NO_NODE); - dev_info->gd->queue = dev_info->dcssblk_queue; dev_info->gd->private_data = dev_info; - blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); - blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->dcssblk_queue); + blk_queue_logical_block_size(dev_info->gd->queue, 4096); + blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->gd->queue); seg_byte_size = (dev_info->end - dev_info->start + 1); set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors @@ -719,9 +715,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char put_dev: list_del(&dev_info->lh); - blk_cleanup_queue(dev_info->dcssblk_queue); - dev_info->gd->queue = NULL; - put_disk(dev_info->gd); + blk_cleanup_disk(dev_info->gd); list_for_each_entry(seg_info, &dev_info->seg_list, lh) { segment_unload(seg_info->segment_name); } @@ -731,9 +725,7 @@ put_dev: dev_list_del: list_del(&dev_info->lh); release_gd: - blk_cleanup_queue(dev_info->dcssblk_queue); - dev_info->gd->queue = NULL; - put_disk(dev_info->gd); + blk_cleanup_disk(dev_info->gd); up_write(&dcssblk_devices_sem); seg_list_del: if (dev_info == NULL) @@ -801,9 +793,7 @@ dcssblk_remove_store(struct device *dev, struct device_attribute *attr, const ch kill_dax(dev_info->dax_dev); put_dax(dev_info->dax_dev); del_gendisk(dev_info->gd); - blk_cleanup_queue(dev_info->dcssblk_queue); - dev_info->gd->queue = NULL; - put_disk(dev_info->gd); + blk_cleanup_disk(dev_info->gd); /* unload all related segments */ list_for_each_entry(entry, &dev_info->seg_list, lh) From ef35885400481b46ede9cbdcc1fce902e06002c5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:14 +0200 Subject: [PATCH 036/129] xpram: convert to blk_alloc_disk/blk_cleanup_disk Convert the xpram driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-25-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/xpram.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index d1ed39162943..91ef710edfd2 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -56,7 +56,6 @@ typedef struct { static xpram_device_t xpram_devices[XPRAM_MAX_DEVS]; static unsigned int xpram_sizes[XPRAM_MAX_DEVS]; static struct gendisk *xpram_disks[XPRAM_MAX_DEVS]; -static struct request_queue *xpram_queues[XPRAM_MAX_DEVS]; static unsigned int xpram_pages; static int xpram_devs; @@ -341,17 +340,13 @@ static int __init xpram_setup_blkdev(void) int i, rc = -ENOMEM; for (i = 0; i < xpram_devs; i++) { - xpram_disks[i] = alloc_disk(1); + xpram_disks[i] = blk_alloc_disk(NUMA_NO_NODE); if (!xpram_disks[i]) goto out; - xpram_queues[i] = blk_alloc_queue(NUMA_NO_NODE); - if (!xpram_queues[i]) { - put_disk(xpram_disks[i]); - goto out; - } - blk_queue_flag_set(QUEUE_FLAG_NONROT, xpram_queues[i]); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, xpram_queues[i]); - blk_queue_logical_block_size(xpram_queues[i], 4096); + blk_queue_flag_set(QUEUE_FLAG_NONROT, xpram_disks[i]->queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, + xpram_disks[i]->queue); + blk_queue_logical_block_size(xpram_disks[i]->queue, 4096); } /* @@ -373,9 +368,9 @@ static int __init xpram_setup_blkdev(void) offset += xpram_devices[i].size; disk->major = XPRAM_MAJOR; disk->first_minor = i; + disk->minors = 1; disk->fops = &xpram_devops; disk->private_data = &xpram_devices[i]; - disk->queue = xpram_queues[i]; sprintf(disk->disk_name, "slram%d", i); set_capacity(disk, xpram_sizes[i] << 1); add_disk(disk); @@ -383,10 +378,8 @@ static int __init xpram_setup_blkdev(void) return 0; out: - while (i--) { - blk_cleanup_queue(xpram_queues[i]); - put_disk(xpram_disks[i]); - } + while (i--) + blk_cleanup_disk(xpram_disks[i]); return rc; } @@ -434,8 +427,7 @@ static void __exit xpram_exit(void) int i; for (i = 0; i < xpram_devs; i++) { del_gendisk(xpram_disks[i]); - blk_cleanup_queue(xpram_queues[i]); - put_disk(xpram_disks[i]); + blk_cleanup_disk(xpram_disks[i]); } unregister_blkdev(XPRAM_MAJOR, XPRAM_NAME); platform_device_unregister(xpram_pdev); From 132226b301b545198515fb8c6b7f537c13b71f4d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:15 +0200 Subject: [PATCH 037/129] null_blk: convert to blk_alloc_disk/blk_cleanup_disk Convert the null_blk driver to use the blk_alloc_disk and blk_cleanup_disk helpers to simplify gendisk and request_queue allocation. Note that the blk-mq mode is left with its own allocations scheme, to be handled later. 
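[Editor's sketch: the note about blk-mq above is the general rule rather than a null_blk quirk. blk_alloc_disk() covers BIO-based (->submit_bio) drivers only; a blk-mq driver still gets its queue from the tag set and attaches a separately allocated gendisk, as the diff below keeps doing. The struct sketch_dev and its fields are hypothetical.]

static int sketch_alloc(struct sketch_dev *dev, bool use_blk_mq)
{
	if (!use_blk_mq) {
		/* BIO-based: gendisk and request_queue in one allocation */
		dev->disk = blk_alloc_disk(dev->home_node);
		if (!dev->disk)
			return -ENOMEM;
		dev->q = dev->disk->queue;
		return 0;
	}

	/* blk-mq (left as-is by this series): the queue comes from the tag
	 * set, the gendisk is still allocated and wired up separately */
	dev->q = blk_mq_init_queue_data(&dev->tag_set, dev);
	if (IS_ERR(dev->q))
		return PTR_ERR(dev->q);

	dev->disk = alloc_disk_node(1, dev->home_node);
	if (!dev->disk) {
		blk_cleanup_queue(dev->q);
		return -ENOMEM;
	}
	dev->disk->queue = dev->q;
	return 0;
}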
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-26-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 38 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 5f006d9e1472..d8e098f1e5b5 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1597,11 +1597,10 @@ static void null_del_dev(struct nullb *nullb) null_restart_queue_async(nullb); } - blk_cleanup_queue(nullb->q); + blk_cleanup_disk(nullb->disk); if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); - put_disk(nullb->disk); cleanup_queues(nullb); if (null_cache_active(nullb)) null_free_device_storage(nullb->dev, true); @@ -1700,22 +1699,19 @@ static int init_driver_queues(struct nullb *nullb) static int null_gendisk_register(struct nullb *nullb) { sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; - struct gendisk *disk; + struct gendisk *disk = nullb->disk; - disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); - if (!disk) - return -ENOMEM; set_capacity(disk, size); disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; + disk->minors = 1; if (queue_is_mq(nullb->q)) disk->fops = &null_rq_ops; else disk->fops = &null_bio_ops; disk->private_data = nullb; - disk->queue = nullb->q; strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); if (nullb->dev->zoned) { @@ -1851,23 +1847,27 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_queues; if (!null_setup_fault()) - goto out_cleanup_queues; + goto out_cleanup_tags; + rv = -ENOMEM; nullb->tag_set->timeout = 5 * HZ; nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); - if (IS_ERR(nullb->q)) { - rv = -ENOMEM; + if (IS_ERR(nullb->q)) goto out_cleanup_tags; - } + nullb->disk = alloc_disk_node(1, nullb->dev->home_node); + if (!nullb->disk) + goto out_cleanup_disk; + nullb->disk->queue = nullb->q; } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue(dev->home_node); - if (!nullb->q) { - rv = -ENOMEM; + rv = -ENOMEM; + nullb->disk = blk_alloc_disk(nullb->dev->home_node); + if (!nullb->disk) goto out_cleanup_queues; - } + + nullb->q = nullb->disk->queue; rv = init_driver_queues(nullb); if (rv) - goto out_cleanup_blk_queue; + goto out_cleanup_disk; } if (dev->mbps) { @@ -1883,7 +1883,7 @@ static int null_add_dev(struct nullb_device *dev) if (dev->zoned) { rv = null_init_zoned_dev(dev, nullb->q); if (rv) - goto out_cleanup_blk_queue; + goto out_cleanup_disk; } nullb->q->queuedata = nullb; @@ -1921,8 +1921,8 @@ static int null_add_dev(struct nullb_device *dev) return 0; out_cleanup_zone: null_free_zoned_dev(dev); -out_cleanup_blk_queue: - blk_cleanup_queue(nullb->q); +out_cleanup_disk: + blk_cleanup_disk(nullb->disk); out_cleanup_tags: if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); From da7ba72960ca2a9b968e47fcf414d16f3d4c0c42 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 May 2021 07:51:16 +0200 Subject: [PATCH 038/129] block: unexport blk_alloc_queue blk_alloc_queue is just an internal helper now, unexport it and remove it from the public header. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ulf Hansson Link: https://lore.kernel.org/r/20210521055116.1053587-27-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 1 - block/blk.h | 2 ++ include/linux/blkdev.h | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 689aac2625d2..3515a66022d7 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -599,7 +599,6 @@ fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } -EXPORT_SYMBOL(blk_alloc_queue); /** * blk_get_queue - increment the request_queue refcount diff --git a/block/blk.h b/block/blk.h index cba3a94aabfa..3440142f029b 100644 --- a/block/blk.h +++ b/block/blk.h @@ -359,4 +359,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); +struct request_queue *blk_alloc_queue(int node_id); + #endif /* BLK_INTERNAL_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2c28577b50f4..d66d0da72529 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1213,7 +1213,6 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, extern void blk_dump_rq_flags(struct request *, char *); bool __must_check blk_get_queue(struct request_queue *); -struct request_queue *blk_alloc_queue(int node_id); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); From 362529d9280af4b08d2c25a4b39b8e5ae7658f9a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:12:54 +0200 Subject: [PATCH 039/129] block: split __blkdev_get Split __blkdev_get into one helper for the whole device, and one for opening partitions. This removes the (bounded) recursion when opening a partition. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210525061301.2242282-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/block_dev.c | 126 ++++++++++++++++++++++++------------------------- 1 file changed, 61 insertions(+), 65 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 6cc4d4cfe0c2..2b5073e3c923 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1290,78 +1290,68 @@ rescan: */ EXPORT_SYMBOL_GPL(bdev_disk_changed); -/* - * bd_mutex locking: - * - * mutex_lock(part->bd_mutex) - * mutex_lock_nested(whole->bd_mutex, 1) - */ -static int __blkdev_get(struct block_device *bdev, fmode_t mode) +static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; int ret = 0; - if (!(disk->flags & GENHD_FL_UP)) - return -ENXIO; - - if (!bdev->bd_openers) { - if (!bdev_is_partition(bdev)) { - ret = 0; - if (disk->fops->open) - ret = disk->fops->open(bdev, mode); - - if (!ret) - set_init_blocksize(bdev); - - /* - * If the device is invalidated, rescan partition - * if open succeeded or failed with -ENOMEDIUM. - * The latter is necessary to prevent ghost - * partitions on a removed medium. 
- */ - if (test_bit(GD_NEED_PART_SCAN, &disk->state) && - (!ret || ret == -ENOMEDIUM)) - bdev_disk_changed(bdev, ret == -ENOMEDIUM); - - if (ret) - return ret; - } else { - struct block_device *whole = bdgrab(disk->part0); - - mutex_lock_nested(&whole->bd_mutex, 1); - ret = __blkdev_get(whole, mode); - if (ret) { - mutex_unlock(&whole->bd_mutex); - bdput(whole); - return ret; - } - whole->bd_part_count++; - mutex_unlock(&whole->bd_mutex); - - if (!bdev_nr_sectors(bdev)) { - __blkdev_put(whole, mode, 1); - bdput(whole); - return -ENXIO; - } - set_init_blocksize(bdev); - } - - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); - } else { - if (!bdev_is_partition(bdev)) { - if (bdev->bd_disk->fops->open) - ret = bdev->bd_disk->fops->open(bdev, mode); - /* the same as first opener case, read comment there */ - if (test_bit(GD_NEED_PART_SCAN, &disk->state) && - (!ret || ret == -ENOMEDIUM)) - bdev_disk_changed(bdev, ret == -ENOMEDIUM); - if (ret) - return ret; + if (disk->fops->open) { + ret = disk->fops->open(bdev, mode); + if (ret) { + /* avoid ghost partitions on a removed medium */ + if (ret == -ENOMEDIUM && + test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(bdev, true); + return ret; } } + + if (!bdev->bd_openers) { + set_init_blocksize(bdev); + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); + } + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(bdev, false); bdev->bd_openers++; + return 0;; +} + +static int blkdev_get_part(struct block_device *part, fmode_t mode) +{ + struct gendisk *disk = part->bd_disk; + struct block_device *whole; + int ret; + + if (part->bd_openers) + goto done; + + whole = bdgrab(disk->part0); + mutex_lock_nested(&whole->bd_mutex, 1); + ret = blkdev_get_whole(whole, mode); + if (ret) { + mutex_unlock(&whole->bd_mutex); + goto out_put_whole; + } + whole->bd_part_count++; + mutex_unlock(&whole->bd_mutex); + + ret = -ENXIO; + if (!bdev_nr_sectors(part)) + goto out_blkdev_put; + + set_init_blocksize(part); + if (part->bd_bdi == &noop_backing_dev_info) + part->bd_bdi = bdi_get(disk->queue->backing_dev_info); +done: + part->bd_openers++; return 0; + +out_blkdev_put: + __blkdev_put(whole, mode, 1); +out_put_whole: + bdput(whole); + return ret; } struct block_device *blkdev_get_no_open(dev_t dev) @@ -1448,7 +1438,13 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) disk_block_events(disk); mutex_lock(&bdev->bd_mutex); - ret =__blkdev_get(bdev, mode); + ret = -ENXIO; + if (!(disk->flags & GENHD_FL_UP)) + goto abort_claiming; + if (bdev_is_partition(bdev)) + ret = blkdev_get_part(bdev, mode); + else + ret = blkdev_get_whole(bdev, mode); if (ret) goto abort_claiming; if (mode & FMODE_EXCL) { From 210a6d756f20f33fc546ec8682a538fbcb84ee8e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:12:55 +0200 Subject: [PATCH 040/129] block: move sync_blockdev from __blkdev_put to blkdev_put Do the early unlocked syncing even earlier to move more code out of the recursive path. 
Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210525061301.2242282-3-hch@lst.de Signed-off-by: Jens Axboe --- fs/block_dev.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 2b5073e3c923..41d2d9708bf8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1552,16 +1552,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; - /* - * Sync early if it looks like we're the last one. If someone else - * opens the block device between now and the decrement of bd_openers - * then we did a sync that we didn't need to, but that's not the end - * of the world and we want to avoid long (could be several minute) - * syncs while holding the mutex. - */ - if (bdev->bd_openers == 1) - sync_blockdev(bdev); - mutex_lock_nested(&bdev->bd_mutex, for_part); if (for_part) bdev->bd_part_count--; @@ -1588,6 +1578,16 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; + /* + * Sync early if it looks like we're the last one. If someone else + * opens the block device between now and the decrement of bd_openers + * then we did a sync that we didn't need to, but that's not the end + * of the world and we want to avoid long (could be several minute) + * syncs while holding the mutex. + */ + if (bdev->bd_openers == 1) + sync_blockdev(bdev); + mutex_lock(&bdev->bd_mutex); if (mode & FMODE_EXCL) { From a8698707a1835be3abd12a3b28079a80999f8dee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:12:56 +0200 Subject: [PATCH 041/129] block: move bd_mutex to struct gendisk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the per-block device bd_mutex with a per-gendisk open_mutex, thus simplifying locking wherever we deal with partitions. 
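For callers the conversion is mostly mechanical: any place that used to take a partition's or whole device's bd_mutex now takes the one disk-wide lock. A typical partition-rescan call site, as seen in the loop and DASD hunks below, becomes:

	mutex_lock(&bdev->bd_disk->open_mutex);
	rc = bdev_disk_changed(bdev, false);
	mutex_unlock(&bdev->bd_disk->open_mutex);

Because a partition and its whole device now share the same mutex, the mutex_lock_nested() dance between the two goes away as well.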
Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Acked-by: Roger Pau Monné Link: https://lore.kernel.org/r/20210525061301.2242282-4-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/filesystems/locking.rst | 2 +- block/genhd.c | 7 ++--- block/partitions/core.c | 24 ++++++++--------- drivers/block/loop.c | 14 +++++----- drivers/block/xen-blkfront.c | 8 +++--- drivers/block/zram/zram_drv.c | 18 ++++++------- drivers/block/zram/zram_drv.h | 2 +- drivers/md/md.h | 6 ++--- drivers/s390/block/dasd_genhd.c | 8 +++--- drivers/scsi/sd.c | 4 +-- fs/block_dev.c | 37 +++++++++++---------------- fs/btrfs/volumes.c | 2 +- fs/super.c | 8 +++--- include/linux/blk_types.h | 1 - include/linux/genhd.h | 3 +++ 15 files changed, 68 insertions(+), 76 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 1e894480115b..2183fd8cc350 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -480,7 +480,7 @@ prototypes:: locking rules: ======================= =================== -ops bd_mutex +ops open_mutex ======================= =================== open: yes release: yes diff --git a/block/genhd.c b/block/genhd.c index efe0db4d62f0..38d136a19484 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -591,10 +591,10 @@ void del_gendisk(struct gendisk *disk) blk_integrity_del(disk); disk_del_events(disk); - mutex_lock(&disk->part0->bd_mutex); + mutex_lock(&disk->open_mutex); disk->flags &= ~GENHD_FL_UP; blk_drop_partitions(disk); - mutex_unlock(&disk->part0->bd_mutex); + mutex_unlock(&disk->open_mutex); fsync_bdev(disk->part0); __invalidate_device(disk->part0, true); @@ -1273,6 +1273,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_free_disk; disk->node_id = node_id; + mutex_init(&disk->open_mutex); xa_init(&disk->part_tbl); if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) goto out_destroy_part_tbl; @@ -1525,7 +1526,7 @@ void disk_unblock_events(struct gendisk *disk) * doesn't clear the events from @disk->ev. * * CONTEXT: - * If @mask is non-zero must be called with bdev->bd_mutex held. + * If @mask is non-zero must be called with disk->open_mutex held. */ void disk_flush_events(struct gendisk *disk, unsigned int mask) { diff --git a/block/partitions/core.c b/block/partitions/core.c index ada3e1e66989..4fde8e0dd7cd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -283,7 +283,7 @@ struct device_type part_type = { }; /* - * Must be called either with bd_mutex held, before a disk can be opened or + * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. */ static void delete_partition(struct block_device *part) @@ -312,7 +312,7 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* - * Must be called either with bd_mutex held, before a disk can be opened or + * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ static struct block_device *add_partition(struct gendisk *disk, int partno, @@ -453,15 +453,15 @@ int bdev_add_partition(struct block_device *bdev, int partno, { struct block_device *part; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (partition_overlaps(bdev->bd_disk, start, length, -1)) { - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return -EBUSY; } part = add_partition(bdev->bd_disk, partno, start, length, ADDPART_FLAG_NONE, NULL); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return PTR_ERR_OR_ZERO(part); } @@ -474,8 +474,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) if (!part) return -ENXIO; - mutex_lock(&part->bd_mutex); - mutex_lock_nested(&bdev->bd_mutex, 1); + mutex_lock(&bdev->bd_disk->open_mutex); ret = -EBUSY; if (part->bd_openers) @@ -484,8 +483,7 @@ int bdev_del_partition(struct block_device *bdev, int partno) delete_partition(part); ret = 0; out_unlock: - mutex_unlock(&bdev->bd_mutex); - mutex_unlock(&part->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); bdput(part); return ret; } @@ -500,8 +498,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, if (!part) return -ENXIO; - mutex_lock(&part->bd_mutex); - mutex_lock_nested(&bdev->bd_mutex, 1); + mutex_lock(&bdev->bd_disk->open_mutex); ret = -EINVAL; if (start != part->bd_start_sect) goto out_unlock; @@ -514,8 +511,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, ret = 0; out_unlock: - mutex_unlock(&part->bd_mutex); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); bdput(part); return ret; } @@ -541,7 +537,7 @@ void blk_drop_partitions(struct gendisk *disk) struct block_device *part; unsigned long idx; - lockdep_assert_held(&disk->part0->bd_mutex); + lockdep_assert_held(&disk->open_mutex); xa_for_each_start(&disk->part_tbl, idx, part, 1) { if (!bdgrab(part)) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d58d68f3c7cd..95c570f5923f 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -652,9 +652,9 @@ static void loop_reread_partitions(struct loop_device *lo, { int rc; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); rc = bdev_disk_changed(bdev, false); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); @@ -747,7 +747,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, mutex_unlock(&lo->lo_mutex); /* * We must drop file reference outside of lo_mutex as dropping - * the file ref can take bd_mutex which creates circular locking + * the file ref can take open_mutex which creates circular locking * dependency. */ fput(old_file); @@ -1260,7 +1260,7 @@ out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) { /* - * bd_mutex has been held already in release path, so don't + * open_mutex has been held already in release path, so don't * acquire it if this function is called in such case. * * If the reread partition isn't from release path, lo_refcnt @@ -1268,10 +1268,10 @@ out_unlock: * current holder is released. 
*/ if (!release) - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); err = bdev_disk_changed(bdev, false); if (!release) - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo_number, err); @@ -1298,7 +1298,7 @@ out_unlock: /* * Need not hold lo_mutex to fput backing file. Calling fput holding * lo_mutex triggers a circular lock dependency possibility warning as - * fput can take bd_mutex which is usually taken before lo_mutex. + * fput can take open_mutex which is usually taken before lo_mutex. */ if (filp) fput(filp); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 10df39a8b18d..f2c1aedcdf5a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2163,7 +2163,7 @@ static void blkfront_closing(struct blkfront_info *info) return; } - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (bdev->bd_openers) { xenbus_dev_error(xbdev, -EBUSY, @@ -2174,7 +2174,7 @@ static void blkfront_closing(struct blkfront_info *info) xenbus_frontend_closed(xbdev); } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); bdput(bdev); } @@ -2531,7 +2531,7 @@ static int blkfront_remove(struct xenbus_device *xbdev) * isn't closed yet, we let release take care of it. */ - mutex_lock(&bdev->bd_mutex); + mutex_lock(&disk->open_mutex); info = disk->private_data; dev_warn(disk_to_dev(disk), @@ -2546,7 +2546,7 @@ static int blkfront_remove(struct xenbus_device *xbdev) mutex_unlock(&blkfront_mutex); } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); bdput(bdev); return 0; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 006416cc4969..fcaf2750f68f 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1781,24 +1781,24 @@ static ssize_t reset_store(struct device *dev, zram = dev_to_zram(dev); bdev = zram->disk->part0; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); /* Do not reset an active device or claimed device */ if (bdev->bd_openers || zram->claim) { - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return -EBUSY; } /* From now on, anyone can't open /dev/zram[0-9] */ zram->claim = true; - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); zram->claim = false; - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return len; } @@ -1808,7 +1808,7 @@ static int zram_open(struct block_device *bdev, fmode_t mode) int ret = 0; struct zram *zram; - WARN_ON(!mutex_is_locked(&bdev->bd_mutex)); + WARN_ON(!mutex_is_locked(&bdev->bd_disk->open_mutex)); zram = bdev->bd_disk->private_data; /* zram was claimed to reset so open request fails */ @@ -1972,14 +1972,14 @@ static int zram_remove(struct zram *zram) { struct block_device *bdev = zram->disk->part0; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (bdev->bd_openers || zram->claim) { - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return -EBUSY; } zram->claim = true; - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); zram_debugfs_unregister(zram); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 
419a7e8281ee..74c411911b6e 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -112,7 +112,7 @@ struct zram { /* * zram is claimed so open request will be failed */ - bool claim; /* Protected by bdev->bd_mutex */ + bool claim; /* Protected by disk->open_mutex */ struct file *backing_dev; #ifdef CONFIG_ZRAM_WRITEBACK spinlock_t wb_limit_lock; diff --git a/drivers/md/md.h b/drivers/md/md.h index fb7eab58cfd5..a88086d4110c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -395,10 +395,10 @@ struct mddev { * that we are never stopping an array while it is open. * 'reconfig_mutex' protects all other reconfiguration. * These locks are separate due to conflicting interactions - * with bdev->bd_mutex. + * with disk->open_mutex. * Lock ordering is: - * reconfig_mutex -> bd_mutex - * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open + * reconfig_mutex -> disk->open_mutex + * disk->open_mutex -> open_mutex: e.g. __blkdev_get -> md_open */ struct mutex open_mutex; struct mutex reconfig_mutex; diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 8d6587ec73e2..bf2082d461c7 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -109,9 +109,9 @@ int dasd_scan_partitions(struct dasd_block *block) return -ENODEV; } - mutex_lock(&bdev->bd_mutex); + mutex_lock(&block->gdp->open_mutex); rc = bdev_disk_changed(bdev, false); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&block->gdp->open_mutex); if (rc) DBF_DEV_EVENT(DBF_ERR, block->base, "scan partitions error, rc %d", rc); @@ -145,9 +145,9 @@ void dasd_destroy_partitions(struct dasd_block *block) bdev = block->bdev; block->bdev = NULL; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); bdev_disk_changed(bdev, true); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); /* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */ blkdev_put(bdev, FMODE_READ); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index cb3c37d1e009..d3ff723af879 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1400,7 +1400,7 @@ static void sd_uninit_command(struct scsi_cmnd *SCpnt) * In the latter case @inode and @filp carry an abridged amount * of information as noted above. * - * Locking: called with bdev->bd_mutex held. + * Locking: called with bdev->bd_disk->open_mutex held. **/ static int sd_open(struct block_device *bdev, fmode_t mode) { @@ -1476,7 +1476,7 @@ error_out: * Note: may block (uninterruptible) if error recovery is underway * on this disk. * - * Locking: called with bdev->bd_mutex held. + * Locking: called with bdev->bd_disk->open_mutex held. 
**/ static void sd_release(struct gendisk *disk, fmode_t mode) { diff --git a/fs/block_dev.c b/fs/block_dev.c index 41d2d9708bf8..e094806c3a0c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -895,7 +895,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) mapping_set_gfp_mask(&inode->i_data, GFP_USER); bdev = I_BDEV(inode); - mutex_init(&bdev->bd_mutex); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); bdev->bd_disk = disk; @@ -1154,7 +1153,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) struct bd_holder_disk *holder; int ret = 0; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); WARN_ON_ONCE(!bdev->bd_holder); @@ -1199,7 +1198,7 @@ out_del: out_free: kfree(holder); out_unlock: - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); @@ -1218,7 +1217,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); holder = bd_find_holder_disk(bdev, disk); @@ -1230,7 +1229,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) kfree(holder); } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif @@ -1242,7 +1241,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) struct gendisk *disk = bdev->bd_disk; int ret = 0; - lockdep_assert_held(&bdev->bd_mutex); + lockdep_assert_held(&disk->open_mutex); if (!(disk->flags & GENHD_FL_UP)) return -ENXIO; @@ -1327,14 +1326,10 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) goto done; whole = bdgrab(disk->part0); - mutex_lock_nested(&whole->bd_mutex, 1); ret = blkdev_get_whole(whole, mode); - if (ret) { - mutex_unlock(&whole->bd_mutex); + if (ret) goto out_put_whole; - } whole->bd_part_count++; - mutex_unlock(&whole->bd_mutex); ret = -ENXIO; if (!bdev_nr_sectors(part)) @@ -1437,7 +1432,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) disk_block_events(disk); - mutex_lock(&bdev->bd_mutex); + mutex_lock(&disk->open_mutex); ret = -ENXIO; if (!(disk->flags & GENHD_FL_UP)) goto abort_claiming; @@ -1463,7 +1458,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) unblock_events = false; } } - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); if (unblock_events) disk_unblock_events(disk); @@ -1472,7 +1467,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) abort_claiming: if (mode & FMODE_EXCL) bd_abort_claiming(bdev, holder); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); @@ -1552,7 +1547,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; - mutex_lock_nested(&bdev->bd_mutex, for_part); if (for_part) bdev->bd_part_count--; @@ -1567,7 +1561,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev_is_partition(bdev) && disk->fops->release) disk->fops->release(disk, mode); - mutex_unlock(&bdev->bd_mutex); if (victim) { __blkdev_put(victim, mode, 1); bdput(victim); @@ -1588,15 +1581,14 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) if (bdev->bd_openers == 1) sync_blockdev(bdev); - 
mutex_lock(&bdev->bd_mutex); - + mutex_lock(&disk->open_mutex); if (mode & FMODE_EXCL) { struct block_device *whole = bdev_whole(bdev); bool bdev_free; /* * Release a claim on the device. The holder fields - * are protected with bdev_lock. bd_mutex is to + * are protected with bdev_lock. open_mutex is to * synchronize disk_holder unlinking. */ spin_lock(&bdev_lock); @@ -1627,9 +1619,10 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * from userland - e.g. eject(1). */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - mutex_unlock(&bdev->bd_mutex); __blkdev_put(bdev, mode, 0); + mutex_unlock(&disk->open_mutex); + blkdev_put_no_open(bdev); } EXPORT_SYMBOL(blkdev_put); @@ -1936,10 +1929,10 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) old_inode = inode; bdev = I_BDEV(inode); - mutex_lock(&bdev->bd_mutex); + mutex_lock(&bdev->bd_disk->open_mutex); if (bdev->bd_openers) func(bdev, arg); - mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 47d27059d064..f246eb2772e9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1247,7 +1247,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, lockdep_assert_held(&uuid_mutex); /* * The device_list_mutex cannot be taken here in case opening the - * underlying device takes further locks like bd_mutex. + * underlying device takes further locks like open_mutex. * * We also don't need the lock here as this is called during mount and * exclusion is provided by uuid_mutex diff --git a/fs/super.c b/fs/super.c index 11b7e7213fd1..91b7f156735b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1277,9 +1277,9 @@ int get_tree_bdev(struct fs_context *fc, } /* - * s_umount nests inside bd_mutex during + * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop + * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. */ @@ -1352,9 +1352,9 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, } /* - * s_umount nests inside bd_mutex during + * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop + * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. 
*/ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index db026b6ec15a..a09660671fa4 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -29,7 +29,6 @@ struct block_device { int bd_openers; struct inode * bd_inode; /* will die */ struct super_block * bd_super; - struct mutex bd_mutex; /* open/close mutex */ void * bd_claiming; struct device bd_device; void * bd_holder; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 782f0171d104..1fabb1559110 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -154,6 +154,9 @@ struct gendisk { #define GD_NEED_PART_SCAN 0 #define GD_READ_ONLY 1 #define GD_QUEUE_REF 2 + + struct mutex open_mutex; /* open/close mutex */ + struct kobject *slave_dir; struct timer_rand_state *random; From e54069acac1a302c1adc26694963547f8b73c2b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:12:57 +0200 Subject: [PATCH 042/129] block: move adjusting bd_part_count out of __blkdev_get Keep in the callers and thus remove the for_part argument. This mirrors what is done on the blkdev_get side and slightly simplifies blkdev_get_part as well. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210525061301.2242282-5-hch@lst.de Signed-off-by: Jens Axboe --- fs/block_dev.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index e094806c3a0c..43dce929e7ee 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1234,7 +1234,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif -static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); +static void __blkdev_put(struct block_device *bdev, fmode_t mode); int bdev_disk_changed(struct block_device *bdev, bool invalidate) { @@ -1329,12 +1329,12 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) ret = blkdev_get_whole(whole, mode); if (ret) goto out_put_whole; - whole->bd_part_count++; ret = -ENXIO; if (!bdev_nr_sectors(part)) goto out_blkdev_put; + whole->bd_part_count++; set_init_blocksize(part); if (part->bd_bdi == &noop_backing_dev_info) part->bd_bdi = bdi_get(disk->queue->backing_dev_info); @@ -1343,7 +1343,7 @@ done: return 0; out_blkdev_put: - __blkdev_put(whole, mode, 1); + __blkdev_put(whole, mode); out_put_whole: bdput(whole); return ret; @@ -1542,14 +1542,11 @@ static int blkdev_open(struct inode * inode, struct file * filp) return 0; } -static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) +static void __blkdev_put(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; - if (for_part) - bdev->bd_part_count--; - if (!--bdev->bd_openers) { WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); @@ -1562,7 +1559,8 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) if (!bdev_is_partition(bdev) && disk->fops->release) disk->fops->release(disk, mode); if (victim) { - __blkdev_put(victim, mode, 1); + victim->bd_part_count--; + __blkdev_put(victim, mode); bdput(victim); } } @@ -1620,7 +1618,7 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - __blkdev_put(bdev, mode, 0); + __blkdev_put(bdev, mode); mutex_unlock(&disk->open_mutex); blkdev_put_no_open(bdev); From c8276b954d2dacbabe587c0421a9344529af5bad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig 
Date: Tue, 25 May 2021 08:12:58 +0200 Subject: [PATCH 043/129] block: split __blkdev_put Split __blkdev_put into one helper for the whole device, and one for partitions as well as another shared helper for flushing the block device inode mapping. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210525061301.2242282-6-hch@lst.de Signed-off-by: Jens Axboe --- fs/block_dev.c | 58 ++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 43dce929e7ee..cd45b54e86b4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1234,7 +1234,13 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif -static void __blkdev_put(struct block_device *bdev, fmode_t mode); +static void blkdev_flush_mapping(struct block_device *bdev) +{ + WARN_ON_ONCE(bdev->bd_holders); + sync_blockdev(bdev); + kill_bdev(bdev); + bdev_write_inode(bdev); +} int bdev_disk_changed(struct block_device *bdev, bool invalidate) { @@ -1316,6 +1322,14 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) return 0;; } +static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) +{ + if (!--bdev->bd_openers) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk, mode); +} + static int blkdev_get_part(struct block_device *part, fmode_t mode) { struct gendisk *disk = part->bd_disk; @@ -1343,12 +1357,24 @@ done: return 0; out_blkdev_put: - __blkdev_put(whole, mode); + blkdev_put_whole(whole, mode); out_put_whole: bdput(whole); return ret; } +static void blkdev_put_part(struct block_device *part, fmode_t mode) +{ + struct block_device *whole = bdev_whole(part); + + if (--part->bd_openers) + return; + blkdev_flush_mapping(part); + whole->bd_part_count--; + blkdev_put_whole(whole, mode); + bdput(whole); +} + struct block_device *blkdev_get_no_open(dev_t dev) { struct block_device *bdev; @@ -1542,29 +1568,6 @@ static int blkdev_open(struct inode * inode, struct file * filp) return 0; } -static void __blkdev_put(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - struct block_device *victim = NULL; - - if (!--bdev->bd_openers) { - WARN_ON_ONCE(bdev->bd_holders); - sync_blockdev(bdev); - kill_bdev(bdev); - bdev_write_inode(bdev); - if (bdev_is_partition(bdev)) - victim = bdev_whole(bdev); - } - - if (!bdev_is_partition(bdev) && disk->fops->release) - disk->fops->release(disk, mode); - if (victim) { - victim->bd_part_count--; - __blkdev_put(victim, mode); - bdput(victim); - } -} - void blkdev_put(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; @@ -1618,7 +1621,10 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) */ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - __blkdev_put(bdev, mode); + if (bdev_is_partition(bdev)) + blkdev_put_part(bdev, mode); + else + blkdev_put_whole(bdev, mode); mutex_unlock(&disk->open_mutex); blkdev_put_no_open(bdev); From ab4b57057d744861f670b47b163209727b26418b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:12:59 +0200 Subject: [PATCH 044/129] block: move bd_part_count to struct gendisk The bd_part_count value only makes sense for whole devices, so move it to struct gendisk and give it a more descriptive name. 
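Call sites that used to test the whole device's bd_part_count now check the gendisk instead; the BLKRRPART rescan guard, for example, ends up as (sketch of the ioctl.c change below):

	/* refuse to rescan while any partition is still open */
	if (bdev->bd_disk->open_partitions)
		return -EBUSY;

The counter itself is only touched in blkdev_get_part()/blkdev_put_part(), under disk->open_mutex.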
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210525061301.2242282-7-hch@lst.de Signed-off-by: Jens Axboe --- block/ioctl.c | 2 +- fs/block_dev.c | 6 +++--- include/linux/blk_types.h | 3 --- include/linux/genhd.h | 1 + 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index 8ba1ed8defd0..24beec9ca9c9 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -89,7 +89,7 @@ static int blkdev_reread_part(struct block_device *bdev, fmode_t mode) return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - if (bdev->bd_part_count) + if (bdev->bd_disk->open_partitions) return -EBUSY; /* diff --git a/fs/block_dev.c b/fs/block_dev.c index cd45b54e86b4..ac9b3c158a77 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1253,7 +1253,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) return -ENXIO; rescan: - if (bdev->bd_part_count) + if (disk->open_partitions) return -EBUSY; sync_blockdev(bdev); invalidate_bdev(bdev); @@ -1348,7 +1348,7 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) if (!bdev_nr_sectors(part)) goto out_blkdev_put; - whole->bd_part_count++; + disk->open_partitions++; set_init_blocksize(part); if (part->bd_bdi == &noop_backing_dev_info) part->bd_bdi = bdi_get(disk->queue->backing_dev_info); @@ -1370,7 +1370,7 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) if (--part->bd_openers) return; blkdev_flush_mapping(part); - whole->bd_part_count--; + whole->bd_disk->open_partitions--; blkdev_put_whole(whole, mode); bdput(whole); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index a09660671fa4..fd3860d18d7e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -39,9 +39,6 @@ struct block_device { #endif struct kobject *bd_holder_dir; u8 bd_partno; - /* number of times partitions within this device have been opened. */ - unsigned bd_part_count; - spinlock_t bd_size_lock; /* for bd_inode->i_size updates */ struct gendisk * bd_disk; struct backing_dev_info *bd_bdi; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 1fabb1559110..47d4605c0e7e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -156,6 +156,7 @@ struct gendisk { #define GD_QUEUE_REF 2 struct mutex open_mutex; /* open/close mutex */ + unsigned open_partitions; /* number of open partitions */ struct kobject *slave_dir; From c97d93c31e5734a16bfe663085ec91b8c9fb20f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:13:00 +0200 Subject: [PATCH 045/129] block: factor out a part_devt helper Add a helper to find the dev_t for a disk + partno tuple. 
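Usage is straightforward; a zero return means the partition does not exist (sketch, with disk and partno assumed to be in scope):

	dev_t devt = part_devt(disk, partno);

	if (!devt)
		return -ENODEV;	/* no such partition */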
Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210525061301.2242282-8-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 25 +++++++++++++++++-------- include/linux/genhd.h | 1 + init/do_mounts.c | 10 ++-------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 38d136a19484..3f7b1c92c7f3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1227,6 +1227,19 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ +dev_t part_devt(struct gendisk *disk, u8 partno) +{ + struct block_device *part = bdget_disk(disk, partno); + dev_t devt = 0; + + if (part) { + devt = part->bd_dev; + bdput(part); + } + + return devt; +} + dev_t blk_lookup_devt(const char *name, int partno) { dev_t devt = MKDEV(0, 0); @@ -1236,7 +1249,6 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct block_device *part; if (strcmp(dev_name(dev), name)) continue; @@ -1247,13 +1259,10 @@ dev_t blk_lookup_devt(const char *name, int partno) */ devt = MKDEV(MAJOR(dev->devt), MINOR(dev->devt) + partno); - break; - } - part = bdget_disk(disk, partno); - if (part) { - devt = part->bd_dev; - bdput(part); - break; + } else { + devt = part_devt(disk, partno); + if (devt) + break; } } class_dev_iter_exit(&iter); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 47d4605c0e7e..64a8431202b7 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -333,6 +333,7 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, } #endif /* CONFIG_SYSFS */ +dev_t part_devt(struct gendisk *disk, u8 partno); dev_t blk_lookup_devt(const char *name, int partno); void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK diff --git a/init/do_mounts.c b/init/do_mounts.c index a78e44ee6adb..74aede860de7 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -133,14 +133,8 @@ static dev_t devt_from_partuuid(const char *uuid_str) * Attempt to find the requested partition by adding an offset * to the partition number found by UUID. */ - struct block_device *part; - - part = bdget_disk(dev_to_disk(dev), - dev_to_bdev(dev)->bd_partno + offset); - if (part) { - devt = part->bd_dev; - bdput(part); - } + devt = part_devt(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); } else { devt = dev->devt; } From 0e0ccdecb3cff95a350b4364e7ebbaa754d0e47d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 May 2021 08:13:01 +0200 Subject: [PATCH 046/129] block: remove bdget_disk Just opencode the xa_load in the callers, as none of them actually needs a reference to the bdev. 
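The open-coded replacement is a plain xarray load. A reader that only peeks at a partition, as the new part_devt() does, takes rcu_read_lock() around the lookup (sketch):

	struct block_device *part;

	rcu_read_lock();
	part = xa_load(&disk->part_tbl, partno);
	if (part)
		devt = part->bd_dev;
	rcu_read_unlock();

Callers that already hold disk->open_mutex, such as bdev_del_partition() and bdev_resize_partition(), can do the xa_load() without RCU because the partition table cannot change underneath them.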
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20210525061301.2242282-9-hch@lst.de Signed-off-by: Jens Axboe --- block/genhd.c | 35 +++++------------------------------ block/partitions/core.c | 27 +++++++++++++-------------- include/linux/genhd.h | 1 - 3 files changed, 18 insertions(+), 45 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 3f7b1c92c7f3..5f5628216295 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -676,32 +676,6 @@ void blk_request_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -/** - * bdget_disk - do bdget() by gendisk and partition number - * @disk: gendisk of interest - * @partno: partition number - * - * Find partition @partno from @disk, do bdget() on it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Resulting block_device on success, NULL on failure. - */ -struct block_device *bdget_disk(struct gendisk *disk, int partno) -{ - struct block_device *bdev = NULL; - - rcu_read_lock(); - bdev = xa_load(&disk->part_tbl, partno); - if (bdev && !bdgrab(bdev)) - bdev = NULL; - rcu_read_unlock(); - - return bdev; -} - /* * print a full list of all partitions - intended for places where the root * filesystem can't be mounted and thus to give the victim some idea of what @@ -1229,13 +1203,14 @@ module_init(proc_genhd_init); dev_t part_devt(struct gendisk *disk, u8 partno) { - struct block_device *part = bdget_disk(disk, partno); + struct block_device *part; dev_t devt = 0; - if (part) { + rcu_read_lock(); + part = xa_load(&disk->part_tbl, partno); + if (part) devt = part->bd_dev; - bdput(part); - } + rcu_read_unlock(); return devt; } diff --git a/block/partitions/core.c b/block/partitions/core.c index 4fde8e0dd7cd..186d4fbd9f09 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -326,6 +326,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, const char *dname; int err; + lockdep_assert_held(&disk->open_mutex); + if (partno >= disk_max_parts(disk)) return ERR_PTR(-EINVAL); @@ -467,14 +469,13 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno) { - struct block_device *part; - int ret; - - part = bdget_disk(bdev->bd_disk, partno); - if (!part) - return -ENXIO; + struct block_device *part = NULL; + int ret = -ENXIO; mutex_lock(&bdev->bd_disk->open_mutex); + part = xa_load(&bdev->bd_disk->part_tbl, partno); + if (!part) + goto out_unlock; ret = -EBUSY; if (part->bd_openers) @@ -484,21 +485,20 @@ int bdev_del_partition(struct block_device *bdev, int partno) ret = 0; out_unlock: mutex_unlock(&bdev->bd_disk->open_mutex); - bdput(part); return ret; } int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct block_device *part; - int ret = 0; - - part = bdget_disk(bdev->bd_disk, partno); - if (!part) - return -ENXIO; + struct block_device *part = NULL; + int ret = -ENXIO; mutex_lock(&bdev->bd_disk->open_mutex); + part = xa_load(&bdev->bd_disk->part_tbl, partno); + if (!part) + goto out_unlock; + ret = -EINVAL; if (start != part->bd_start_sect) goto out_unlock; @@ -512,7 +512,6 @@ int bdev_resize_partition(struct block_device *bdev, int partno, ret = 0; out_unlock: mutex_unlock(&bdev->bd_disk->open_mutex); - bdput(part); return ret; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 64a8431202b7..03d684f0498f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -223,7 +223,6 @@ 
static inline void add_disk_no_queue_reg(struct gendisk *disk) } extern void del_gendisk(struct gendisk *gp); -extern struct block_device *bdget_disk(struct gendisk *disk, int partno); void set_disk_ro(struct gendisk *disk, bool read_only); From 90bf3e28ef51aa3f480d2f2151813be669ba69ce Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 2 Jun 2021 11:06:59 +0100 Subject: [PATCH 047/129] null_blk: Fix null pointer dereference on nullb->disk on blk_cleanup_disk call The error handling on a nullb->disk allocation currently jumps to out_cleanup_disk that calls blk_cleanup_disk with a null pointer causing a null pointer dereference issue. Fix this by jumping to out_cleanup_tags instead. Addresses-Coverity: ("Dereference after null check") Fixes: 132226b301b5 ("null_blk: convert to blk_alloc_disk/blk_cleanup_disk") Signed-off-by: Colin Ian King Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210602100659.11058-1-colin.king@canonical.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index d8e098f1e5b5..83d803cb57c8 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1856,7 +1856,7 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_tags; nullb->disk = alloc_disk_node(1, nullb->dev->home_node); if (!nullb->disk) - goto out_cleanup_disk; + goto out_cleanup_tags; nullb->disk->queue = nullb->q; } else if (dev->queue_mode == NULL_Q_BIO) { rv = -ENOMEM; From 613471549f366cdf4170b81ce0f99f3867ec4d16 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 3 Jun 2021 12:47:21 +0200 Subject: [PATCH 048/129] block: Do not pull requests from the scheduler when we cannot dispatch them Provided the device driver does not implement dispatch budget accounting (which only SCSI does) the loop in __blk_mq_do_dispatch_sched() pulls requests from the IO scheduler as long as it is willing to give out any. That defeats scheduling heuristics inside the scheduler by creating false impression that the device can take more IO when it in fact cannot. For example with BFQ IO scheduler on top of virtio-blk device setting blkio cgroup weight has barely any impact on observed throughput of async IO because __blk_mq_do_dispatch_sched() always sucks out all the IO queued in BFQ. BFQ first submits IO from higher weight cgroups but when that is all dispatched, it will give out IO of lower weight cgroups as well. And then we have to wait for all this IO to be dispatched to the disk (which means lot of it actually has to complete) before the IO scheduler is queried again for dispatching more requests. This completely destroys any service differentiation. So grab request tag for a request pulled out of the IO scheduler already in __blk_mq_do_dispatch_sched() and do not pull any more requests if we cannot get it because we are unlikely to be able to dispatch it. That way only single request is going to wait in the dispatch list for some tag to free. 
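In simplified form the dispatch loop in __blk_mq_do_dispatch_sched() now looks like this (sketch; budget accounting, the multi-hctx bookkeeping and the run_queue handling are omitted):

	do {
		rq = e->type->ops.dispatch_request(hctx);
		if (!rq)
			break;		/* scheduler has nothing more to offer */
		list_add_tail(&rq->queuelist, &rq_list);
		count++;
		/*
		 * No driver tag means the device cannot take this request
		 * right now, so stop draining the scheduler; only this one
		 * request will sit on the dispatch list waiting for a tag.
		 */
		if (!blk_mq_get_driver_tag(rq))
			break;
	} while (count < max_dispatch);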
Reviewed-by: Ming Lei Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210603104721.6309-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 12 +++++++++++- block/blk-mq.c | 2 +- block/blk-mq.h | 2 ++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 045b6878b8c5..a9182d2f8ad3 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -168,9 +168,19 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * in blk_mq_dispatch_rq_list(). */ list_add_tail(&rq->queuelist, &rq_list); + count++; if (rq->mq_hctx != hctx) multi_hctxs = true; - } while (++count < max_dispatch); + + /* + * If we cannot get tag for the request, stop dequeueing + * requests from the IO scheduler. We are unlikely to be able + * to submit them anyway and it creates false impression for + * scheduling heuristics that the device can take more IO. + */ + if (!blk_mq_get_driver_tag(rq)) + break; + } while (count < max_dispatch); if (!count) { if (run_queue) diff --git a/block/blk-mq.c b/block/blk-mq.c index f11d4018ce2e..4261adee9964 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1104,7 +1104,7 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return true; } -static bool blk_mq_get_driver_tag(struct request *rq) +bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; diff --git a/block/blk-mq.h b/block/blk-mq.h index 556368d2c5b6..4b1ca7b7bbeb 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -260,6 +260,8 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } +bool blk_mq_get_driver_tag(struct request *rq); + static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; From 7cc2623d1c84935f06fbdf727f41d70f4c779ef6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 19 May 2021 10:52:26 -0700 Subject: [PATCH 049/129] block: Update blk_update_request() documentation Although the original intent was to use blk_update_request() in stacking block drivers only, it is used much more widely today. Reflect this in the documentation block above this function. See also: * commit 32fab448e5e8 ("block: add request update interface"). * commit 2e60e02297cf ("block: clean up request completion API"). * commit ed6565e73424 ("block: handle partial completions for special payload requests"). Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210519175226.8853-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-core.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 3515a66022d7..514838ccab2d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1384,26 +1384,22 @@ void blk_steal_bios(struct bio_list *list, struct request *rq) EXPORT_SYMBOL_GPL(blk_steal_bios); /** - * blk_update_request - Special helper function for request stacking drivers + * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code - * @nr_bytes: number of bytes to complete @req + * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. 
* - * This special helper function is only for request stacking drivers - * (e.g. request-based dm) so that they can handle partial completion. - * Actual device drivers should use blk_mq_end_request instead. - * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: - * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both - * blk_rq_bytes() and in blk_update_request(). + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function + * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data From c9c9762d4d44dcb1b2ba90cfb4122dc11ceebf31 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 7 Jun 2021 12:34:05 -0700 Subject: [PATCH 050/129] block: return the correct bvec when checking for gaps After commit 07173c3ec276 ("block: enable multipage bvecs"), a bvec can have multiple pages. But bio_will_gap() still assumes one page bvec while checking for merging. If the pages in the bvec go across the seg_boundary_mask, this check for merging can potentially succeed if only the 1st page is tested, and can fail if all the pages are tested. Later, when SCSI builds the SG list the same check for merging is done in __blk_segment_map_sg_merge() with all the pages in the bvec tested. This time the check may fail if the pages in bvec go across the seg_boundary_mask (but tested okay in bio_will_gap() earlier, so those BIOs were merged). If this check fails, we end up with a broken SG list for drivers assuming the SG list not having offsets in intermediate pages. This results in incorrect pages written to the disk. Fix this by returning the multi-page bvec when testing gaps for merging. Cc: Jens Axboe Cc: Johannes Thumshirn Cc: Pavel Begunkov Cc: Ming Lei Cc: Tejun Heo Cc: "Matthew Wilcox (Oracle)" Cc: Jeffle Xu Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 07173c3ec276 ("block: enable multipage bvecs") Signed-off-by: Long Li Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/1623094445-22332-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Jens Axboe --- include/linux/bio.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index a0b4cfdf62a4..d2b98efb5cc5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -44,9 +44,6 @@ static inline unsigned int bio_max_segs(unsigned int nr_segs) #define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter) #define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter) -#define bio_multiple_segments(bio) \ - ((bio)->bi_iter.bi_size != bio_iovec(bio).bv_len) - #define bvec_iter_sectors(iter) ((iter).bi_size >> 9) #define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter))) @@ -271,7 +268,7 @@ static inline void bio_clear_flag(struct bio *bio, unsigned int bit) static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) { - *bv = bio_iovec(bio); + *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); } static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) @@ -279,10 +276,9 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) struct bvec_iter iter = bio->bi_iter; int idx; - if (unlikely(!bio_multiple_segments(bio))) { - *bv = bio_iovec(bio); - return; - } + bio_get_first_bvec(bio, bv); + if (bv->bv_len == bio->bi_iter.bi_size) + return; /* this bio only has a single bvec */ bio_advance_iter(bio, &iter, iter.bi_size); From 
11c7aa0ddea8611007768d3e6b58d45dc60a19e1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 7 Jun 2021 13:26:13 +0200 Subject: [PATCH 051/129] rq-qos: fix missed wake-ups in rq_qos_throttle try two Commit 545fbd0775ba ("rq-qos: fix missed wake-ups in rq_qos_throttle") tried to fix a problem that a process could be sleeping in rq_qos_wait() without anyone to wake it up. However the fix is not complete and the following can still happen: CPU1 (waiter1) CPU2 (waiter2) CPU3 (waker) rq_qos_wait() rq_qos_wait() acquire_inflight_cb() -> fails acquire_inflight_cb() -> fails completes IOs, inflight decreased prepare_to_wait_exclusive() prepare_to_wait_exclusive() has_sleeper = !wq_has_single_sleeper() -> true as there are two sleepers has_sleeper = !wq_has_single_sleeper() -> true io_schedule() io_schedule() Deadlock as now there's nobody to wakeup the two waiters. The logic automatically blocking when there are already sleepers is really subtle and the only way to make it work reliably is that we check whether there are some waiters in the queue when adding ourselves there. That way, we are guaranteed that at least the first process to enter the wait queue will recheck the waiting condition before going to sleep and thus guarantee forward progress. Fixes: 545fbd0775ba ("rq-qos: fix missed wake-ups in rq_qos_throttle") CC: stable@vger.kernel.org Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210607112613.25344-1-jack@suse.cz Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 4 ++-- include/linux/wait.h | 2 +- kernel/sched/wait.c | 9 +++++++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 656460636ad3..e83af7bc7591 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -266,8 +266,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return; - prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(&rqw->wait); + has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, + TASK_UNINTERRUPTIBLE); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) diff --git a/include/linux/wait.h b/include/linux/wait.h index fe10e8570a52..6598ae35e1b5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -1136,7 +1136,7 @@ do { \ * Waitqueues which are removed from the waitqueue_head at wakeup time */ void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); -void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 183cc6ae68a6..76577d1642a5 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -264,17 +264,22 @@ prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_ent } EXPORT_SYMBOL(prepare_to_wait); -void +/* Returns true if we are the first waiter in the queue, false otherwise. 
*/ +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) { unsigned long flags; + bool was_empty = false; wq_entry->flags |= WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - if (list_empty(&wq_entry->entry)) + if (list_empty(&wq_entry->entry)) { + was_empty = list_empty(&wq_head->head); __add_wait_queue_entry_tail(wq_head, wq_entry); + } set_current_state(state); spin_unlock_irqrestore(&wq_head->lock, flags); + return was_empty; } EXPORT_SYMBOL(prepare_to_wait_exclusive); From a624eb520390cecf644a8906c982fd53b2afcc49 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 7 Jun 2021 16:52:43 -0700 Subject: [PATCH 052/129] libnvdimm/pmem: Fix blk_cleanup_disk() usage The queue_to_disk() helper can not be used after del_gendisk() communicate @disk via the pgmap->owner. Otherwise, queue_to_disk() returns NULL resulting in the splat below. Kernel attempted to read user page (330) - exploit attempt? (uid: 0) BUG: Kernel NULL pointer dereference on read at 0x00000330 Faulting instruction address: 0xc000000000906344 Oops: Kernel access of bad area, sig: 11 [#1] [..] NIP [c000000000906344] pmem_pagemap_cleanup+0x24/0x40 LR [c0000000004701d4] memunmap_pages+0x1b4/0x4b0 Call Trace: [c000000022cbb9c0] [c0000000009063c8] pmem_pagemap_kill+0x28/0x40 (unreliable) [c000000022cbb9e0] [c0000000004701d4] memunmap_pages+0x1b4/0x4b0 [c000000022cbba90] [c0000000008b28a0] devm_action_release+0x30/0x50 [c000000022cbbab0] [c0000000008b39c8] release_nodes+0x2f8/0x3e0 [c000000022cbbb60] [c0000000008ac440] device_release_driver_internal+0x190/0x2b0 [c000000022cbbba0] [c0000000008a8450] unbind_store+0x130/0x170 Reported-by: Sachin Sant Fixes: 87eb73b2ca7c ("nvdimm-pmem: convert to blk_alloc_disk/blk_cleanup_disk") Link: http://lore.kernel.org/r/DFB75BA8-603F-4A35-880B-C5B23EF8FA7D@linux.vnet.ibm.com Cc: Christoph Hellwig Cc: Ulf Hansson Cc: Jens Axboe Signed-off-by: Dan Williams Reviewed-by: Christoph Hellwig Tested-by: Sachin Sant Link: https://lore.kernel.org/r/162310994435.1571616.334551212901820961.stgit@dwillia2-desk3.amr.corp.intel.com [axboe: fold in compile warning fix] Signed-off-by: Jens Axboe --- drivers/nvdimm/pmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 31f3c4bd6f72..1e0615b8565e 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -335,10 +335,9 @@ static const struct attribute_group *pmem_attribute_groups[] = { static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap) { - struct request_queue *q = - container_of(pgmap->ref, struct request_queue, q_usage_counter); + struct pmem_device *pmem = pgmap->owner; - blk_cleanup_disk(queue_to_disk(q)); + blk_cleanup_disk(pmem->disk); } static void pmem_release_queue(void *pgmap) @@ -427,6 +426,7 @@ static int pmem_attach_disk(struct device *dev, q = disk->queue; pmem->disk = disk; + pmem->pgmap.owner = pmem; pmem->pfn_flags = PFN_DEV; pmem->pgmap.ref = &q->q_usage_counter; if (is_nd_pfn(dev)) { From cdb14e0f7775e767484843e8ecd736bb21754c58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:16 +0300 Subject: [PATCH 053/129] blk-mq: factor out a blk_mq_alloc_sq_tag_set helper Factour out a helper to initialize a simple single hw queue tag_set from blk_mq_init_sq_queue. This will allow to phase out blk_mq_init_sq_queue in favor of a more symmetric and general API. 
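A driver that wants the simple single-hw-queue setup but manages the queue itself can now do roughly the following (sketch; foo_mq_ops and the queue depth are made up):

	err = blk_mq_alloc_sq_tag_set(&foo->tag_set, &foo_mq_ops, 16,
				      BLK_MQ_F_SHOULD_MERGE);
	if (err)
		return err;

	foo->queue = blk_mq_init_queue(&foo->tag_set);
	if (IS_ERR(foo->queue)) {
		blk_mq_free_tag_set(&foo->tag_set);
		return PTR_ERR(foo->queue);
	}

which is exactly the sequence blk_mq_init_sq_queue() is reduced to below.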
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 32 ++++++++++++++++++-------------- include/linux/blk-mq.h | 3 +++ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4261adee9964..867e5faf4f5b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3152,24 +3152,12 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - memset(set, 0, sizeof(*set)); - set->ops = ops; - set->nr_hw_queues = 1; - set->nr_maps = 1; - set->queue_depth = queue_depth; - set->numa_node = NUMA_NO_NODE; - set->flags = set_flags; - - ret = blk_mq_alloc_tag_set(set); + ret = blk_mq_alloc_sq_tag_set(set, ops, queue_depth, set_flags); if (ret) return ERR_PTR(ret); - q = blk_mq_init_queue(set); - if (IS_ERR(q)) { + if (IS_ERR(q)) blk_mq_free_tag_set(set); - return q; - } - return q; } EXPORT_SYMBOL(blk_mq_init_sq_queue); @@ -3589,6 +3577,22 @@ out_free_mq_map: } EXPORT_SYMBOL(blk_mq_alloc_tag_set); +/* allocate and initialize a tagset for a simple single-queue device */ +int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int queue_depth, + unsigned int set_flags) +{ + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->nr_hw_queues = 1; + set->nr_maps = 1; + set->queue_depth = queue_depth; + set->numa_node = NUMA_NO_NODE; + set->flags = set_flags; + return blk_mq_alloc_tag_set(set); +} +EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); + void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 359486940fa0..bb950fc669ef 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -439,6 +439,9 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); +int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int queue_depth, + unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); From 26a9750aa875126e4b7fc5ee6de652a529c5b7ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:17 +0300 Subject: [PATCH 054/129] blk-mq: improve the blk_mq_init_allocated_queue interface Don't return the passed in request_queue but a normal error code, and drop the elevator_init argument in favor of just calling elevator_init_mq directly from dm-rq. 
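For callers that manage their own request_queue (after this patch that is blk_mq_init_queue_data() and dm-rq), the resulting convention looks roughly like this; a sketch mirroring the two in-tree users rather than a new one:

  q = blk_alloc_queue(set->numa_node);
  if (!q)
          return -ENOMEM;

  ret = blk_mq_init_allocated_queue(set, q);
  if (ret) {
          blk_cleanup_queue(q);
          return ret;
  }

  /*
   * No elevator is set up here any more; dm-rq, the only caller that
   * wants one at this point, now calls elevator_init_mq(q) itself.
   */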
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 36 ++++++++++++++---------------------- block/blk.h | 1 - block/elevator.c | 2 +- drivers/md/dm-rq.c | 9 +++------ include/linux/blk-mq.h | 5 ++--- include/linux/elevator.h | 1 + 6 files changed, 21 insertions(+), 33 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 867e5faf4f5b..8550ad64982f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3115,21 +3115,18 @@ void blk_mq_release(struct request_queue *q) struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata) { - struct request_queue *uninit_q, *q; + struct request_queue *q; + int ret; - uninit_q = blk_alloc_queue(set->numa_node); - if (!uninit_q) + q = blk_alloc_queue(set->numa_node); + if (!q) return ERR_PTR(-ENOMEM); - uninit_q->queuedata = queuedata; - - /* - * Initialize the queue without an elevator. device_add_disk() will do - * the initialization. - */ - q = blk_mq_init_allocated_queue(set, uninit_q, false); - if (IS_ERR(q)) - blk_cleanup_queue(uninit_q); - + q->queuedata = queuedata; + ret = blk_mq_init_allocated_queue(set, q); + if (ret) { + blk_cleanup_queue(q); + return ERR_PTR(ret); + } return q; } EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); @@ -3273,9 +3270,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q, - bool elevator_init) +int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -3325,11 +3321,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); - - if (elevator_init) - elevator_init_mq(q); - - return q; + return 0; err_hctxs: kfree(q->queue_hw_ctx); @@ -3340,7 +3332,7 @@ err_poll: q->poll_cb = NULL; err_exit: q->mq_ops = NULL; - return ERR_PTR(-ENOMEM); + return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); diff --git a/block/blk.h b/block/blk.h index 3440142f029b..d3fa47af3607 100644 --- a/block/blk.h +++ b/block/blk.h @@ -192,7 +192,6 @@ void blk_account_io_done(struct request *req, u64 now); void blk_insert_flush(struct request *rq); -void elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); void __elevator_exit(struct request_queue *, struct elevator_queue *); diff --git a/block/elevator.c b/block/elevator.c index 440699c28119..06e203426410 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -693,7 +693,7 @@ void elevator_init_mq(struct request_queue *q) elevator_put(e); } } - +EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */ /* * switch to new_e io scheduler. 
be careful not to introduce deadlocks - diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9c3bc3711b33..0dbd48cbdff9 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -530,7 +530,6 @@ static const struct blk_mq_ops dm_mq_ops = { int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) { - struct request_queue *q; struct dm_target *immutable_tgt; int err; @@ -557,12 +556,10 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) if (err) goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(md->tag_set, md->queue, true); - if (IS_ERR(q)) { - err = PTR_ERR(q); + err = blk_mq_init_allocated_queue(md->tag_set, md->queue); + if (err) goto out_tag_set; - } - + elevator_init_mq(md->queue); return 0; out_tag_set: diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index bb950fc669ef..73750b2838d2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -429,9 +429,8 @@ enum { struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q, - bool elevator_init); +int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q); struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, diff --git a/include/linux/elevator.h b/include/linux/elevator.h index dcb2f9022c1d..783ecb3cb77a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -120,6 +120,7 @@ extern void elv_merged_request(struct request_queue *, struct request *, extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); +void elevator_init_mq(struct request_queue *q); /* * io scheduler registration From b461dfc49eb6fbabc60b9dad476e787ada56b7b4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:18 +0300 Subject: [PATCH 055/129] blk-mq: add the blk_mq_alloc_disk APIs Add a new API to allocate a gendisk including the request_queue for use with blk-mq based drivers. This is to avoid boilerplate code in drivers. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 19 +++++++++++++++++++ include/linux/blk-mq.h | 12 ++++++++++++ 2 files changed, 31 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8550ad64982f..b123077a0dc4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3137,6 +3137,25 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) +{ + struct request_queue *q; + struct gendisk *disk; + + q = blk_mq_init_queue_data(set, queuedata); + if (IS_ERR(q)) + return ERR_CAST(q); + + disk = __alloc_disk_node(0, set->numa_node); + if (!disk) { + blk_cleanup_queue(q); + return ERR_PTR(-ENOMEM); + } + disk->queue = q; + return disk; +} +EXPORT_SYMBOL(__blk_mq_alloc_disk); + /* * Helper for setting up a queue with mq ops, given queue depth, and * the passed in mq ops flags. 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 73750b2838d2..f496c6c5b5d2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -426,6 +426,18 @@ enum { ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) +#define blk_mq_alloc_disk(set, queuedata) \ +({ \ + static struct lock_class_key __key; \ + struct gendisk *__disk = __blk_mq_alloc_disk(set, queuedata); \ + \ + if (__disk) \ + lockdep_init_map(&__disk->lockdep_map, \ + "(bio completion)", &__key, 0); \ + __disk; \ +}) +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + void *queuedata); struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); From 89a5f065653bcf7d8c3e4101e025e6c7b03339e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:19 +0300 Subject: [PATCH 056/129] virtio-blk: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-5-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index b9fa3ef5b57c..e4bd3b1fc3c2 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -749,13 +749,6 @@ static int virtblk_probe(struct virtio_device *vdev) if (err) goto out_free_vblk; - /* FIXME: How many partitions? How long is a piece of string? */ - vblk->disk = alloc_disk(1 << PART_BITS); - if (!vblk->disk) { - err = -ENOMEM; - goto out_free_vq; - } - /* Default queue sizing is to fill the ring. */ if (likely(!virtblk_queue_depth)) { queue_depth = vblk->vqs[0].vq->num_free; @@ -779,21 +772,20 @@ static int virtblk_probe(struct virtio_device *vdev) err = blk_mq_alloc_tag_set(&vblk->tag_set); if (err) - goto out_put_disk; + goto out_free_vq; - q = blk_mq_init_queue(&vblk->tag_set); - if (IS_ERR(q)) { - err = -ENOMEM; + vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk); + if (IS_ERR(vblk->disk)) { + err = PTR_ERR(vblk->disk); goto out_free_tags; } - vblk->disk->queue = q; - - q->queuedata = vblk; + q = vblk->disk->queue; virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); vblk->disk->major = major; vblk->disk->first_minor = index_to_minor(index); + vblk->disk->minors = 1 << PART_BITS; vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; vblk->disk->flags |= GENHD_FL_EXT_DEVT; @@ -892,8 +884,6 @@ static int virtblk_probe(struct virtio_device *vdev) out_free_tags: blk_mq_free_tag_set(&vblk->tag_set); -out_put_disk: - put_disk(vblk->disk); out_free_vq: vdev->config->del_vqs(vdev); kfree(vblk->vqs); @@ -913,8 +903,7 @@ static void virtblk_remove(struct virtio_device *vdev) flush_work(&vblk->config_work); del_gendisk(vblk->disk); - blk_cleanup_queue(vblk->disk->queue); - + blk_cleanup_disk(vblk->disk); blk_mq_free_tag_set(&vblk->tag_set); mutex_lock(&vblk->vdev_mutex); @@ -925,7 +914,6 @@ static void virtblk_remove(struct virtio_device *vdev) /* Virtqueues are stopped, nothing can use vblk->vdev anymore. 
*/ vblk->vdev = NULL; - put_disk(vblk->disk); vdev->config->del_vqs(vdev); kfree(vblk->vqs); From 9c4f8971cc26cd485a39d02706533717eb3430dc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:20 +0300 Subject: [PATCH 057/129] pcd: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-6-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/paride/pcd.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 70da8b86ce58..f9cdd11f02f5 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -309,21 +309,19 @@ static void pcd_init_units(void) pcd_drive_count = 0; for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { - struct gendisk *disk = alloc_disk(1); + struct gendisk *disk; - if (!disk) + if (blk_mq_alloc_sq_tag_set(&cd->tag_set, &pcd_mq_ops, 1, + BLK_MQ_F_SHOULD_MERGE)) continue; - disk->queue = blk_mq_init_sq_queue(&cd->tag_set, &pcd_mq_ops, - 1, BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(disk->queue)) { - disk->queue = NULL; - put_disk(disk); + disk = blk_mq_alloc_disk(&cd->tag_set, cd); + if (IS_ERR(disk)) { + blk_mq_free_tag_set(&cd->tag_set); continue; } INIT_LIST_HEAD(&cd->rq_list); - disk->queue->queuedata = cd; blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); cd->disk = disk; cd->pi = &cd->pia; @@ -343,6 +341,7 @@ static void pcd_init_units(void) cd->info.mask = 0; disk->major = major; disk->first_minor = unit; + disk->minors = 1; strcpy(disk->disk_name, cd->name); /* umm... */ disk->fops = &pcd_bdops; disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; @@ -759,10 +758,8 @@ static int pcd_detect(void) for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) { if (!cd->disk) continue; - blk_cleanup_queue(cd->disk->queue); - cd->disk->queue = NULL; + blk_cleanup_disk(cd->disk); blk_mq_free_tag_set(&cd->tag_set); - put_disk(cd->disk); } pi_unregister_driver(par_drv); return -1; From c684b577968abeef96bf3c75d76d2dc19a1eb080 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:21 +0300 Subject: [PATCH 058/129] pf: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. 
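The paride conversions in this and the previous patch share one shape; sketched here with a generic "unit" standing in for the driver's per-device structure, as the body of the per-unit probe loop (unwinding in reverse order of setup):

  if (blk_mq_alloc_sq_tag_set(&unit->tag_set, &unit_mq_ops, 1,
                              BLK_MQ_F_SHOULD_MERGE))
          continue;

  disk = blk_mq_alloc_disk(&unit->tag_set, unit);
  if (IS_ERR(disk)) {
          blk_mq_free_tag_set(&unit->tag_set);
          continue;
  }

  /*
   * disk->queue is already allocated and its queuedata points at
   * 'unit', so the old "disk->queue->queuedata = unit" assignment
   * disappears; the old alloc_disk(1) argument becomes disk->minors.
   */
  disk->minors = 1;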
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-7-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/paride/pf.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index bb09f21ce21a..d5b9c88ba76f 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -294,20 +294,17 @@ static void __init pf_init_units(void) for (unit = 0, pf = units; unit < PF_UNITS; unit++, pf++) { struct gendisk *disk; - disk = alloc_disk(1); - if (!disk) + if (blk_mq_alloc_sq_tag_set(&pf->tag_set, &pf_mq_ops, 1, + BLK_MQ_F_SHOULD_MERGE)) continue; - disk->queue = blk_mq_init_sq_queue(&pf->tag_set, &pf_mq_ops, - 1, BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(disk->queue)) { - disk->queue = NULL; - put_disk(disk); + disk = blk_mq_alloc_disk(&pf->tag_set, pf); + if (IS_ERR(disk)) { + blk_mq_free_tag_set(&pf->tag_set); continue; } INIT_LIST_HEAD(&pf->rq_list); - disk->queue->queuedata = pf; blk_queue_max_segments(disk->queue, cluster); blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH); pf->disk = disk; @@ -318,6 +315,7 @@ static void __init pf_init_units(void) snprintf(pf->name, PF_NAMELEN, "%s%d", name, unit); disk->major = major; disk->first_minor = unit; + disk->minors = 1; strcpy(disk->disk_name, pf->name); disk->fops = &pf_fops; disk->events = DISK_EVENT_MEDIA_CHANGE; @@ -766,10 +764,8 @@ static int pf_detect(void) for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) { if (!pf->disk) continue; - blk_cleanup_queue(pf->disk->queue); - pf->disk->queue = NULL; + blk_cleanup_disk(pf->disk); blk_mq_free_tag_set(&pf->tag_set); - put_disk(pf->disk); } pi_unregister_driver(par_drv); return -1; From f368b7d7fa77768026d439ec9c32fe16a2d5eb52 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:22 +0300 Subject: [PATCH 059/129] ms_block: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. 
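Because the driver's private pointer is handed to blk_mq_alloc_disk() up front, nothing changes on the request side: queuedata is read back exactly as before. A minimal sketch, with "foo_card" standing in for the driver's own type:

  /* inside ->queue_rq(), unchanged by the conversion: */
  struct foo_card *card = hctx->queue->queuedata;
  /* same pointer that was passed as the second blk_mq_alloc_disk() argument */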
Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-8-hch@lst.de Signed-off-by: Jens Axboe --- drivers/memstick/core/ms_block.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 0bacf4268f83..dac258d12aca 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2110,21 +2110,17 @@ static int msb_init_disk(struct memstick_dev *card) if (msb->disk_id < 0) return msb->disk_id; - msb->disk = alloc_disk(0); - if (!msb->disk) { - rc = -ENOMEM; + rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &msb_mq_ops, 2, + BLK_MQ_F_SHOULD_MERGE); + if (rc) goto out_release_id; - } - msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &msb_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(msb->queue)) { - rc = PTR_ERR(msb->queue); - msb->queue = NULL; - goto out_put_disk; + msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); + if (IS_ERR(msb->disk)) { + rc = PTR_ERR(msb->disk); + goto out_free_tag_set; } - - msb->queue->queuedata = card; + msb->queue = msb->disk->queue; blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES); blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS); @@ -2135,7 +2131,6 @@ static int msb_init_disk(struct memstick_dev *card) sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id); msb->disk->fops = &msb_bdops; msb->disk->private_data = msb; - msb->disk->queue = msb->queue; capacity = msb->pages_in_block * msb->logical_block_count; capacity *= (msb->page_size / 512); @@ -2155,8 +2150,8 @@ static int msb_init_disk(struct memstick_dev *card) dbg("Disk added"); return 0; -out_put_disk: - put_disk(msb->disk); +out_free_tag_set: + blk_mq_free_tag_set(&msb->tag_set); out_release_id: mutex_lock(&msb_disk_lock); idr_remove(&msb_disk_idr, msb->disk_id); From 51ed5bd55eb602fd8b3531bb919bcb59849fa569 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:23 +0300 Subject: [PATCH 060/129] mspro: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Acked-by: Ulf Hansson Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-9-hch@lst.de Signed-off-by: Jens Axboe --- drivers/memstick/core/mspro_block.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index cf7fe0d58ee7..22778d0e24f5 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1205,21 +1205,17 @@ static int mspro_block_init_disk(struct memstick_dev *card) if (disk_id < 0) return disk_id; - msb->disk = alloc_disk(1 << MSPRO_BLOCK_PART_SHIFT); - if (!msb->disk) { - rc = -ENOMEM; + rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &mspro_mq_ops, 2, + BLK_MQ_F_SHOULD_MERGE); + if (rc) goto out_release_id; - } - msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &mspro_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(msb->queue)) { - rc = PTR_ERR(msb->queue); - msb->queue = NULL; - goto out_put_disk; + msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); + if (IS_ERR(msb->disk)) { + rc = PTR_ERR(msb->disk); + goto out_free_tag_set; } - - msb->queue->queuedata = card; + msb->queue = msb->disk->queue; blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES); blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS); @@ -1228,10 +1224,10 @@ static int mspro_block_init_disk(struct memstick_dev *card) msb->disk->major = major; msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT; + msb->disk->minors = 1 << MSPRO_BLOCK_PART_SHIFT; msb->disk->fops = &ms_block_bdops; msb->usage_count = 1; msb->disk->private_data = msb; - msb->disk->queue = msb->queue; sprintf(msb->disk->disk_name, "mspblk%d", disk_id); @@ -1247,8 +1243,8 @@ static int mspro_block_init_disk(struct memstick_dev *card) msb->active = 1; return 0; -out_put_disk: - put_disk(msb->disk); +out_free_tag_set: + blk_mq_free_tag_set(&msb->tag_set); out_release_id: mutex_lock(&mspro_block_disk_lock); idr_remove(&mspro_block_disk_idr, disk_id); From 6966bb921def0a4ef1e069f806c086efae6782ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:24 +0300 Subject: [PATCH 061/129] mtd_blkdevs: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-10-hch@lst.de Signed-off-by: Jens Axboe --- drivers/mtd/mtd_blkdevs.c | 48 ++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index fb8e12d590a1..5dc4c966ea73 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -30,11 +30,9 @@ static void blktrans_dev_release(struct kref *kref) struct mtd_blktrans_dev *dev = container_of(kref, struct mtd_blktrans_dev, ref); - dev->disk->private_data = NULL; - blk_cleanup_queue(dev->rq); + blk_cleanup_disk(dev->disk); blk_mq_free_tag_set(dev->tag_set); kfree(dev->tag_set); - put_disk(dev->disk); list_del(&dev->list); kfree(dev); } @@ -354,7 +352,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) if (new->devnum > (MINORMASK >> tr->part_bits) || (tr->part_bits && new->devnum >= 27 * 26)) { mutex_unlock(&blktrans_ref_mutex); - goto error1; + return ret; } list_add_tail(&new->list, &tr->devs); @@ -366,17 +364,28 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) if (!tr->writesect) new->readonly = 1; - /* Create gendisk */ ret = -ENOMEM; - gd = alloc_disk(1 << tr->part_bits); + new->tag_set = kzalloc(sizeof(*new->tag_set), GFP_KERNEL); + if (!new->tag_set) + goto out_list_del; - if (!gd) - goto error2; + ret = blk_mq_alloc_sq_tag_set(new->tag_set, &mtd_mq_ops, 2, + BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); + if (ret) + goto out_kfree_tag_set; + + /* Create gendisk */ + gd = blk_mq_alloc_disk(new->tag_set, new); + if (IS_ERR(gd)) { + ret = PTR_ERR(gd); + goto out_free_tag_set; + } new->disk = gd; gd->private_data = new; gd->major = tr->major; gd->first_minor = (new->devnum) << tr->part_bits; + gd->minors = 1 << tr->part_bits; gd->fops = &mtd_block_ops; if (tr->part_bits) @@ -398,22 +407,9 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) spin_lock_init(&new->queue_lock); INIT_LIST_HEAD(&new->rq_list); - new->tag_set = kzalloc(sizeof(*new->tag_set), GFP_KERNEL); - if (!new->tag_set) - goto error3; - - new->rq = blk_mq_init_sq_queue(new->tag_set, &mtd_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); - if (IS_ERR(new->rq)) { - ret = PTR_ERR(new->rq); - new->rq = NULL; - goto error4; - } - if (tr->flush) blk_queue_write_cache(new->rq, true, false); - new->rq->queuedata = new; blk_queue_logical_block_size(new->rq, tr->blksize); blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq); @@ -437,13 +433,13 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) WARN_ON(ret); } return 0; -error4: + +out_free_tag_set: + blk_mq_free_tag_set(new->tag_set); +out_kfree_tag_set: kfree(new->tag_set); -error3: - put_disk(new->disk); -error2: +out_list_del: list_del(&new->list); -error1: return ret; } From 89662ac55a204d82f9b0c1497e060d18b51fadc2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:25 +0300 Subject: [PATCH 062/129] ps3disk: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. 
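A side effect of embedding the queue in the gendisk is that the driver can drop its cached request_queue pointer entirely and reach the queue through the disk; schematically, mirroring what this patch does to the interrupt path:

  /* before: blk_mq_run_hw_queues(priv->queue, true); */
  blk_mq_run_hw_queues(priv->gendisk->queue, true);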
Signed-off-by: Christoph Hellwig Tested-by: Geoff Levand Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-11-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ps3disk.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index ba3ece56cbb3..f374ea2c67ce 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -29,7 +29,6 @@ struct ps3disk_private { spinlock_t lock; /* Request queue spinlock */ - struct request_queue *queue; struct blk_mq_tag_set tag_set; struct gendisk *gendisk; unsigned int blocking_factor; @@ -267,7 +266,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) blk_mq_end_request(req, error); spin_unlock(&priv->lock); - blk_mq_run_hw_queues(priv->queue, true); + blk_mq_run_hw_queues(priv->gendisk->queue, true); return IRQ_HANDLED; } @@ -441,17 +440,20 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) ps3disk_identify(dev); - queue = blk_mq_init_sq_queue(&priv->tag_set, &ps3disk_mq_ops, 1, + error = blk_mq_alloc_sq_tag_set(&priv->tag_set, &ps3disk_mq_ops, 1, BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(queue)) { - dev_err(&dev->sbd.core, "%s:%u: blk_mq_init_queue failed\n", - __func__, __LINE__); - error = PTR_ERR(queue); + if (error) goto fail_teardown; + + gendisk = blk_mq_alloc_disk(&priv->tag_set, dev); + if (IS_ERR(gendisk)) { + dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n", + __func__, __LINE__); + error = PTR_ERR(gendisk); + goto fail_free_tag_set; } - priv->queue = queue; - queue->queuedata = dev; + queue = gendisk->queue; blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9); blk_queue_dma_alignment(queue, dev->blk_size-1); @@ -462,19 +464,11 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) blk_queue_max_segments(queue, -1); blk_queue_max_segment_size(queue, dev->bounce_size); - gendisk = alloc_disk(PS3DISK_MINORS); - if (!gendisk) { - dev_err(&dev->sbd.core, "%s:%u: alloc_disk failed\n", __func__, - __LINE__); - error = -ENOMEM; - goto fail_cleanup_queue; - } - priv->gendisk = gendisk; gendisk->major = ps3disk_major; gendisk->first_minor = devidx * PS3DISK_MINORS; + gendisk->minors = PS3DISK_MINORS; gendisk->fops = &ps3disk_fops; - gendisk->queue = queue; gendisk->private_data = dev; snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME, devidx+'a'); @@ -490,8 +484,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) device_add_disk(&dev->sbd.core, gendisk, NULL); return 0; -fail_cleanup_queue: - blk_cleanup_queue(queue); +fail_free_tag_set: blk_mq_free_tag_set(&priv->tag_set); fail_teardown: ps3stor_teardown(dev); @@ -517,9 +510,8 @@ static void ps3disk_remove(struct ps3_system_bus_device *_dev) &ps3disk_mask); mutex_unlock(&ps3disk_mask_mutex); del_gendisk(priv->gendisk); - blk_cleanup_queue(priv->queue); + blk_cleanup_disk(priv->gendisk); blk_mq_free_tag_set(&priv->tag_set); - put_disk(priv->gendisk); dev_notice(&dev->sbd.core, "Synchronizing disk cache\n"); ps3disk_sync_cache(dev); ps3stor_teardown(dev); From 9c8463e8e19c442aaf896468ce72c1ed82655781 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:26 +0300 Subject: [PATCH 063/129] swim3: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-12-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/swim3.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index a515d0c1d2cb..965af0a3e95b 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1202,30 +1202,27 @@ static int swim3_attach(struct macio_dev *mdev, return rc; } - disk = alloc_disk(1); - if (disk == NULL) { - rc = -ENOMEM; - goto out_unregister; - } - fs = &floppy_states[floppy_count]; memset(fs, 0, sizeof(*fs)); - disk->queue = blk_mq_init_sq_queue(&fs->tag_set, &swim3_mq_ops, 2, - BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(disk->queue)) { - rc = PTR_ERR(disk->queue); - disk->queue = NULL; - goto out_put_disk; + rc = blk_mq_alloc_sq_tag_set(&fs->tag_set, &swim3_mq_ops, 2, + BLK_MQ_F_SHOULD_MERGE); + if (rc) + goto out_unregister; + + disk = blk_mq_alloc_disk(&fs->tag_set, fs); + if (IS_ERR(disk)) { + rc = PTR_ERR(disk); + goto out_free_tag_set; } - disk->queue->queuedata = fs; rc = swim3_add_device(mdev, floppy_count); if (rc) - goto out_cleanup_queue; + goto out_cleanup_disk; disk->major = FLOPPY_MAJOR; disk->first_minor = floppy_count; + disk->minors = 1; disk->fops = &floppy_fops; disk->private_data = fs; disk->events = DISK_EVENT_MEDIA_CHANGE; @@ -1237,12 +1234,10 @@ static int swim3_attach(struct macio_dev *mdev, disks[floppy_count++] = disk; return 0; -out_cleanup_queue: - blk_cleanup_queue(disk->queue); - disk->queue = NULL; +out_cleanup_disk: + blk_cleanup_disk(disk); +out_free_tag_set: blk_mq_free_tag_set(&fs->tag_set); -out_put_disk: - put_disk(disk); out_unregister: if (floppy_count == 0) unregister_blkdev(FLOPPY_MAJOR, "fd"); From 51fbfedfcc86273eded52f05a2c1aa75e91df8ff Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:27 +0300 Subject: [PATCH 064/129] swim: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-13-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/swim.c | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 2917b21f48ff..7ccc8d2a41bc 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -800,23 +800,20 @@ static int swim_floppy_init(struct swim_priv *swd) spin_lock_init(&swd->lock); for (drive = 0; drive < swd->floppy_count; drive++) { - struct request_queue *q; + err = blk_mq_alloc_sq_tag_set(&swd->unit[drive].tag_set, + &swim_mq_ops, 2, BLK_MQ_F_SHOULD_MERGE); + if (err) + goto exit_put_disks; - swd->unit[drive].disk = alloc_disk(1); - if (swd->unit[drive].disk == NULL) { - err = -ENOMEM; + swd->unit[drive].disk = + blk_mq_alloc_disk(&swd->unit[drive].tag_set, + &swd->unit[drive]); + if (IS_ERR(swd->unit[drive].disk)) { + blk_mq_free_tag_set(&swd->unit[drive].tag_set); + err = PTR_ERR(swd->unit[drive].disk); goto exit_put_disks; } - q = blk_mq_init_sq_queue(&swd->unit[drive].tag_set, &swim_mq_ops, - 2, BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(q)) { - err = PTR_ERR(q); - goto exit_put_disks; - } - - swd->unit[drive].disk->queue = q; - swd->unit[drive].disk->queue->queuedata = &swd->unit[drive]; swd->unit[drive].swd = swd; } @@ -824,6 +821,7 @@ static int swim_floppy_init(struct swim_priv *swd) swd->unit[drive].disk->flags = GENHD_FL_REMOVABLE; swd->unit[drive].disk->major = FLOPPY_MAJOR; swd->unit[drive].disk->first_minor = drive; + swd->unit[drive].disk->minors = 1; sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive); swd->unit[drive].disk->fops = &floppy_fops; swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE; @@ -839,14 +837,10 @@ exit_put_disks: do { struct gendisk *disk = swd->unit[drive].disk; - if (disk) { - if (disk->queue) { - blk_cleanup_queue(disk->queue); - disk->queue = NULL; - } - blk_mq_free_tag_set(&swd->unit[drive].tag_set); - put_disk(disk); - } + if (!disk) + continue; + blk_cleanup_disk(disk); + blk_mq_free_tag_set(&swd->unit[drive].tag_set); } while (drive--); return err; } From afea05a18d18673750bad33f7aa710ff71a78e91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:28 +0300 Subject: [PATCH 065/129] sunvdc: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-14-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/sunvdc.c | 47 ++++++++++++------------------------------ 1 file changed, 13 insertions(+), 34 deletions(-) diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 39aeebc6837d..c53b38578bb7 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -780,27 +780,6 @@ static const struct blk_mq_ops vdc_mq_ops = { .queue_rq = vdc_queue_rq, }; -static void cleanup_queue(struct request_queue *q) -{ - struct vdc_port *port = q->queuedata; - - blk_cleanup_queue(q); - blk_mq_free_tag_set(&port->tag_set); -} - -static struct request_queue *init_queue(struct vdc_port *port) -{ - struct request_queue *q; - - q = blk_mq_init_sq_queue(&port->tag_set, &vdc_mq_ops, VDC_TX_RING_SIZE, - BLK_MQ_F_SHOULD_MERGE); - if (IS_ERR(q)) - return q; - - q->queuedata = port; - return q; -} - static int probe_disk(struct vdc_port *port) { struct request_queue *q; @@ -838,21 +817,21 @@ static int probe_disk(struct vdc_port *port) (u64)geom.num_sec); } - q = init_queue(port); - if (IS_ERR(q)) { - printk(KERN_ERR PFX "%s: Could not allocate queue.\n", - port->vio.name); - return PTR_ERR(q); - } - g = alloc_disk(1 << PARTITION_SHIFT); - if (!g) { + err = blk_mq_alloc_sq_tag_set(&port->tag_set, &vdc_mq_ops, + VDC_TX_RING_SIZE, BLK_MQ_F_SHOULD_MERGE); + if (err) + return err; + + g = blk_mq_alloc_disk(&port->tag_set, port); + if (IS_ERR(g)) { printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", port->vio.name); - cleanup_queue(q); - return -ENOMEM; + blk_mq_free_tag_set(&port->tag_set); + return PTR_ERR(g); } port->disk = g; + q = g->queue; /* Each segment in a request is up to an aligned page in size. */ blk_queue_segment_boundary(q, PAGE_SIZE - 1); @@ -862,6 +841,7 @@ static int probe_disk(struct vdc_port *port) blk_queue_max_hw_sectors(q, port->max_xfer_size); g->major = vdc_major; g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; + g->minors = 1 << PARTITION_SHIFT; strcpy(g->disk_name, port->disk_name); g->fops = &vdc_fops; @@ -1083,9 +1063,8 @@ static int vdc_port_remove(struct vio_dev *vdev) del_timer_sync(&port->vio.timer); del_gendisk(port->disk); - cleanup_queue(port->disk->queue); - put_disk(port->disk); - port->disk = NULL; + blk_cleanup_disk(port->disk); + blk_mq_free_tag_set(&port->tag_set); vdc_free_tx_ring(port); vio_ldc_free(&port->vio); From 0592c3d166c967056faa03b944c6c9c9b4e8a0cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:29 +0300 Subject: [PATCH 066/129] gdrom: use blk_mq_alloc_disk Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue allocation. 
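Two of the conversions in this series (mtd_blkdevs and this one) have a ->queue_rq that may sleep, so they keep BLK_MQ_F_BLOCKING when building the tag set; the new helper simply passes the flags through:

  /* gdrom's request handler may sleep, hence BLK_MQ_F_BLOCKING */
  err = blk_mq_alloc_sq_tag_set(&gd.tag_set, &gdrom_mq_ops, 1,
                                BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);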
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-15-hch@lst.de Signed-off-by: Jens Axboe --- drivers/cdrom/gdrom.c | 45 ++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index c6d8c0f59722..8e1fe75af93f 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -772,53 +772,50 @@ static int probe_gdrom(struct platform_device *devptr) goto probe_fail_no_mem; } probe_gdrom_setupcd(); - gd.disk = alloc_disk(1); - if (!gd.disk) { - err = -ENODEV; - goto probe_fail_no_disk; + + err = blk_mq_alloc_sq_tag_set(&gd.tag_set, &gdrom_mq_ops, 1, + BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); + if (err) + goto probe_fail_free_cd_info; + + gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL); + if (IS_ERR(gd.disk)) { + err = PTR_ERR(gd.disk); + goto probe_fail_free_tag_set; } + gd.gdrom_rq = gd.disk->queue; probe_gdrom_setupdisk(); if (register_cdrom(gd.disk, gd.cd_info)) { err = -ENODEV; - goto probe_fail_cdrom_register; + goto probe_fail_cleanup_disk; } gd.disk->fops = &gdrom_bdops; gd.disk->events = DISK_EVENT_MEDIA_CHANGE; /* latch on to the interrupt */ err = gdrom_set_interrupt_handlers(); if (err) - goto probe_fail_cmdirq_register; - - gd.gdrom_rq = blk_mq_init_sq_queue(&gd.tag_set, &gdrom_mq_ops, 1, - BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); - if (IS_ERR(gd.gdrom_rq)) { - err = PTR_ERR(gd.gdrom_rq); - gd.gdrom_rq = NULL; - goto probe_fail_requestq; - } + goto probe_fail_cleanup_disk; err = probe_gdrom_setupqueue(); if (err) - goto probe_fail_toc; + goto probe_fail_free_irqs; gd.toc = kzalloc(sizeof(struct gdromtoc), GFP_KERNEL); if (!gd.toc) { err = -ENOMEM; - goto probe_fail_toc; + goto probe_fail_free_irqs; } add_disk(gd.disk); return 0; -probe_fail_toc: - blk_cleanup_queue(gd.gdrom_rq); - blk_mq_free_tag_set(&gd.tag_set); -probe_fail_requestq: +probe_fail_free_irqs: free_irq(HW_EVENT_GDROM_DMA, &gd); free_irq(HW_EVENT_GDROM_CMD, &gd); -probe_fail_cmdirq_register: -probe_fail_cdrom_register: - del_gendisk(gd.disk); -probe_fail_no_disk: +probe_fail_cleanup_disk: + blk_cleanup_disk(gd.disk); +probe_fail_free_tag_set: + blk_mq_free_tag_set(&gd.tag_set); +probe_fail_free_cd_info: kfree(gd.cd_info); probe_fail_no_mem: unregister_blkdev(gdrom_major, GDROM_DEV_NAME); From 08c1d480ed38995690a7d83f2c6a505f6cbbed9f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:30 +0300 Subject: [PATCH 067/129] blk-mq: remove blk_mq_init_sq_queue All users are gone now. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-16-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 22 ---------------------- include/linux/blk-mq.h | 4 ---- 2 files changed, 26 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index b123077a0dc4..3115ea2d0990 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3156,28 +3156,6 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata) } EXPORT_SYMBOL(__blk_mq_alloc_disk); -/* - * Helper for setting up a queue with mq ops, given queue depth, and - * the passed in mq ops flags. 
- */ -struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, - unsigned int queue_depth, - unsigned int set_flags) -{ - struct request_queue *q; - int ret; - - ret = blk_mq_alloc_sq_tag_set(set, ops, queue_depth, set_flags); - if (ret) - return ERR_PTR(ret); - q = blk_mq_init_queue(set); - if (IS_ERR(q)) - blk_mq_free_tag_set(set); - return q; -} -EXPORT_SYMBOL(blk_mq_init_sq_queue); - static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index f496c6c5b5d2..02a4aab0aeac 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -443,10 +443,6 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); -struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, - unsigned int queue_depth, - unsigned int set_flags); void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); From 6560ec961a080944f8d5e1fef17b771bfaf189cb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:31 +0300 Subject: [PATCH 068/129] aoe: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-17-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/aoe/aoeblk.c | 33 ++++++++++++--------------------- drivers/block/aoe/aoedev.c | 3 +-- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index c34e71b0c4a9..06b360f7123a 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -338,14 +338,13 @@ static const struct blk_mq_ops aoeblk_mq_ops = { .queue_rq = aoeblk_queue_rq, }; -/* alloc_disk and add_disk can sleep */ +/* blk_mq_alloc_disk and add_disk can sleep */ void aoeblk_gdalloc(void *vp) { struct aoedev *d = vp; struct gendisk *gd; mempool_t *mp; - struct request_queue *q; struct blk_mq_tag_set *set; ulong flags; int late = 0; @@ -362,19 +361,12 @@ aoeblk_gdalloc(void *vp) if (late) return; - gd = alloc_disk(AOE_PARTITIONS); - if (gd == NULL) { - pr_err("aoe: cannot allocate disk structure for %ld.%d\n", - d->aoemajor, d->aoeminor); - goto err; - } - mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab, buf_pool_cache); if (mp == NULL) { printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n", d->aoemajor, d->aoeminor); - goto err_disk; + goto err; } set = &d->tag_set; @@ -391,12 +383,11 @@ aoeblk_gdalloc(void *vp) goto err_mempool; } - q = blk_mq_init_queue(set); - if (IS_ERR(q)) { + gd = blk_mq_alloc_disk(set, d); + if (IS_ERR(gd)) { pr_err("aoe: cannot allocate block queue for %ld.%d\n", d->aoemajor, d->aoeminor); - blk_mq_free_tag_set(set); - goto err_mempool; + goto err_tagset; } spin_lock_irqsave(&d->lock, flags); @@ -405,16 +396,16 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); - blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); - blk_queue_io_opt(q, SZ_2M); + blk_queue_max_hw_sectors(gd->queue, BLK_DEF_MAX_SECTORS); + blk_queue_io_opt(gd->queue, SZ_2M); d->bufpool = mp; - d->blkq = gd->queue = q; - 
q->queuedata = d; + d->blkq = gd->queue; d->gd = gd; if (aoe_maxsectors) - blk_queue_max_hw_sectors(q, aoe_maxsectors); + blk_queue_max_hw_sectors(gd->queue, aoe_maxsectors); gd->major = AOE_MAJOR; gd->first_minor = d->sysminor; + gd->minors = AOE_PARTITIONS; gd->fops = &aoe_bdops; gd->private_data = d; set_capacity(gd, d->ssize); @@ -435,10 +426,10 @@ aoeblk_gdalloc(void *vp) spin_unlock_irqrestore(&d->lock, flags); return; +err_tagset: + blk_mq_free_tag_set(set); err_mempool: mempool_destroy(mp); -err_disk: - put_disk(gd); err: spin_lock_irqsave(&d->lock, flags); d->flags &= ~DEVFL_GD_NOW; diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index e2ea2356da06..c5753c6bfe80 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -277,9 +277,8 @@ freedev(struct aoedev *d) if (d->gd) { aoedisk_rm_debugfs(d); del_gendisk(d->gd); - put_disk(d->gd); + blk_cleanup_disk(d->gd); blk_mq_free_tag_set(&d->tag_set); - blk_cleanup_queue(d->blkq); } t = d->targets; e = t + d->ntargets; From 34f84aefe2ba0a77431cc0c0808c5c0239b6ba43 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:32 +0300 Subject: [PATCH 069/129] floppy: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-18-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 8a9d22207c59..cbed9776f285 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4491,23 +4491,15 @@ static bool floppy_available(int drive) static int floppy_alloc_disk(unsigned int drive, unsigned int type) { struct gendisk *disk; - int err; - disk = alloc_disk(1); - if (!disk) - return -ENOMEM; - - disk->queue = blk_mq_init_queue(&tag_sets[drive]); - if (IS_ERR(disk->queue)) { - err = PTR_ERR(disk->queue); - disk->queue = NULL; - put_disk(disk); - return err; - } + disk = blk_mq_alloc_disk(&tag_sets[drive], NULL); + if (IS_ERR(disk)) + return PTR_ERR(disk); blk_queue_max_hw_sectors(disk->queue, 64); disk->major = FLOPPY_MAJOR; disk->first_minor = TOMINOR(drive) | (type << 2); + disk->minors = 1; disk->fops = &floppy_fops; disk->events = DISK_EVENT_MEDIA_CHANGE; if (type) @@ -4727,10 +4719,8 @@ out_put_disk: if (!disks[drive][0]) break; del_timer_sync(&motor_off_timer[drive]); - blk_cleanup_queue(disks[drive][0]->queue); - disks[drive][0]->queue = NULL; + blk_cleanup_disk(disks[drive][0]); blk_mq_free_tag_set(&tag_sets[drive]); - put_disk(disks[drive][0]); } return err; } From 1c99502fae359182a93a1c9cf7406edc0e28b6b0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:33 +0300 Subject: [PATCH 070/129] loop: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. 
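The blk_cleanup_disk() half of the commit message is the teardown side of the pattern: it replaces the old blk_cleanup_queue()/put_disk() pair, while the tag set is still freed separately. Roughly, with a generic "dev" standing in for the driver's state (the relative order of the last two calls differs slightly between the drivers converted in this series):

  del_gendisk(dev->disk);
  blk_mq_free_tag_set(&dev->tag_set);
  blk_cleanup_disk(dev->disk);    /* was: blk_cleanup_queue(queue); put_disk(disk); */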
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-19-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/loop.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 95c570f5923f..3f40e673a101 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2117,12 +2117,12 @@ static int loop_add(struct loop_device **l, int i) if (err) goto out_free_idr; - lo->lo_queue = blk_mq_init_queue(&lo->tag_set); - if (IS_ERR(lo->lo_queue)) { - err = PTR_ERR(lo->lo_queue); + disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_cleanup_tags; } - lo->lo_queue->queuedata = lo; + lo->lo_queue = lo->lo_disk->queue; blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS); @@ -2134,11 +2134,6 @@ static int loop_add(struct loop_device **l, int i) */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue); - err = -ENOMEM; - disk = lo->lo_disk = alloc_disk(1 << part_shift); - if (!disk) - goto out_free_queue; - /* * Disable partition scanning by default. The in-kernel partition * scanning can be requested individually per-device during its @@ -2166,6 +2161,7 @@ static int loop_add(struct loop_device **l, int i) spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; + disk->minors = 1 << part_shift; disk->fops = &lo_fops; disk->private_data = lo; disk->queue = lo->lo_queue; @@ -2174,8 +2170,6 @@ static int loop_add(struct loop_device **l, int i) *l = lo; return lo->lo_number; -out_free_queue: - blk_cleanup_queue(lo->lo_queue); out_cleanup_tags: blk_mq_free_tag_set(&lo->tag_set); out_free_idr: @@ -2189,9 +2183,8 @@ out: static void loop_remove(struct loop_device *lo) { del_gendisk(lo->lo_disk); - blk_cleanup_queue(lo->lo_queue); blk_mq_free_tag_set(&lo->tag_set); - put_disk(lo->lo_disk); + blk_cleanup_disk(lo->lo_disk); mutex_destroy(&lo->lo_mutex); kfree(lo); } From 4af5f2e0301311f88c420fcfc5f3c8611ade20ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:34 +0300 Subject: [PATCH 071/129] nbd: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-20-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 57 ++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 45d2c28c8fc8..614d82e7fae4 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -219,15 +219,11 @@ static const struct device_attribute pid_attr = { static void nbd_dev_remove(struct nbd_device *nbd) { struct gendisk *disk = nbd->disk; - struct request_queue *q; if (disk) { - q = disk->queue; del_gendisk(disk); - blk_cleanup_queue(q); blk_mq_free_tag_set(&nbd->tag_set); - disk->private_data = NULL; - put_disk(disk); + blk_cleanup_disk(disk); } /* @@ -1646,15 +1642,24 @@ static int nbd_dev_add(int index) { struct nbd_device *nbd; struct gendisk *disk; - struct request_queue *q; int err = -ENOMEM; nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL); if (!nbd) goto out; - disk = alloc_disk(1 << part_shift); - if (!disk) + nbd->tag_set.ops = &nbd_mq_ops; + nbd->tag_set.nr_hw_queues = 1; + nbd->tag_set.queue_depth = 128; + nbd->tag_set.numa_node = NUMA_NO_NODE; + nbd->tag_set.cmd_size = sizeof(struct nbd_cmd); + nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | + BLK_MQ_F_BLOCKING; + nbd->tag_set.driver_data = nbd; + nbd->destroy_complete = NULL; + + err = blk_mq_alloc_tag_set(&nbd->tag_set); + if (err) goto out_free_nbd; if (index >= 0) { @@ -1668,30 +1673,15 @@ static int nbd_dev_add(int index) index = err; } if (err < 0) - goto out_free_disk; - - nbd->index = index; - nbd->disk = disk; - nbd->tag_set.ops = &nbd_mq_ops; - nbd->tag_set.nr_hw_queues = 1; - nbd->tag_set.queue_depth = 128; - nbd->tag_set.numa_node = NUMA_NO_NODE; - nbd->tag_set.cmd_size = sizeof(struct nbd_cmd); - nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | - BLK_MQ_F_BLOCKING; - nbd->tag_set.driver_data = nbd; - nbd->destroy_complete = NULL; - - err = blk_mq_alloc_tag_set(&nbd->tag_set); - if (err) - goto out_free_idr; - - q = blk_mq_init_queue(&nbd->tag_set); - if (IS_ERR(q)) { - err = PTR_ERR(q); goto out_free_tags; + nbd->index = index; + + disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); + goto out_free_idr; } - disk->queue = q; + nbd->disk = disk; /* * Tell the block layer that we are not a rotational device @@ -1712,6 +1702,7 @@ static int nbd_dev_add(int index) INIT_LIST_HEAD(&nbd->list); disk->major = NBD_MAJOR; disk->first_minor = index << part_shift; + disk->minors = 1 << part_shift; disk->fops = &nbd_fops; disk->private_data = nbd; sprintf(disk->disk_name, "nbd%d", index); @@ -1719,12 +1710,10 @@ static int nbd_dev_add(int index) nbd_total_devices++; return index; -out_free_tags: - blk_mq_free_tag_set(&nbd->tag_set); out_free_idr: idr_remove(&nbd_index_idr, index); -out_free_disk: - put_disk(disk); +out_free_tags: + blk_mq_free_tag_set(&nbd->tag_set); out_free_nbd: kfree(nbd); out: From 6759b1a2013ba6f65e97f0b9444fa1cf69654f7c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:35 +0300 Subject: [PATCH 072/129] nullb: use blk_mq_alloc_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. 
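null_blk is the one driver here with both a blk-mq and a bio-based mode, which makes the split between the two allocators visible: blk_mq_alloc_disk() (ERR_PTR-valued) for the former, the earlier blk_alloc_disk() for the latter, both cleaned up with blk_cleanup_disk(). Schematically, following the driver's queue_mode switch (the real code unwinds via goto labels rather than returning directly):

  if (dev->queue_mode == NULL_Q_MQ) {
          nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb);
          if (IS_ERR(nullb->disk))
                  return PTR_ERR(nullb->disk);
  } else {        /* NULL_Q_BIO */
          nullb->disk = blk_alloc_disk(nullb->dev->home_node);
          if (!nullb->disk)
                  return -ENOMEM;
  }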
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-21-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 83d803cb57c8..3b320b005aa8 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1851,13 +1851,12 @@ static int null_add_dev(struct nullb_device *dev) rv = -ENOMEM; nullb->tag_set->timeout = 5 * HZ; - nullb->q = blk_mq_init_queue_data(nullb->tag_set, nullb); - if (IS_ERR(nullb->q)) + nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb); + if (IS_ERR(nullb->disk)) { + rv = PTR_ERR(nullb->disk); goto out_cleanup_tags; - nullb->disk = alloc_disk_node(1, nullb->dev->home_node); - if (!nullb->disk) - goto out_cleanup_tags; - nullb->disk->queue = nullb->q; + } + nullb->q = nullb->disk->queue; } else if (dev->queue_mode == NULL_Q_BIO) { rv = -ENOMEM; nullb->disk = blk_alloc_disk(nullb->dev->home_node); From 262d431f90003b1a7d9585ef5465252317eb6bd7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:36 +0300 Subject: [PATCH 073/129] pd: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-22-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/paride/pd.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 828a45ffe0e7..3b2b8e872beb 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -879,18 +879,6 @@ static void pd_probe_drive(struct pd_unit *disk) { struct gendisk *p; - p = alloc_disk(1 << PD_BITS); - if (!p) - return; - - strcpy(p->disk_name, disk->name); - p->fops = &pd_fops; - p->major = major; - p->first_minor = (disk - pd) << PD_BITS; - p->events = DISK_EVENT_MEDIA_CHANGE; - disk->gd = p; - p->private_data = disk; - memset(&disk->tag_set, 0, sizeof(disk->tag_set)); disk->tag_set.ops = &pd_mq_ops; disk->tag_set.cmd_size = sizeof(struct pd_req); @@ -903,14 +891,21 @@ static void pd_probe_drive(struct pd_unit *disk) if (blk_mq_alloc_tag_set(&disk->tag_set)) return; - p->queue = blk_mq_init_queue(&disk->tag_set); - if (IS_ERR(p->queue)) { + p = blk_mq_alloc_disk(&disk->tag_set, disk); + if (!p) { blk_mq_free_tag_set(&disk->tag_set); - p->queue = NULL; return; } + disk->gd = p; + + strcpy(p->disk_name, disk->name); + p->fops = &pd_fops; + p->major = major; + p->first_minor = (disk - pd) << PD_BITS; + p->minors = 1 << PD_BITS; + p->events = DISK_EVENT_MEDIA_CHANGE; + p->private_data = disk; - p->queue->queuedata = disk; blk_queue_max_hw_sectors(p->queue, cluster); blk_queue_bounce_limit(p->queue, BLK_BOUNCE_HIGH); @@ -1019,9 +1014,8 @@ static void __exit pd_exit(void) if (p) { disk->gd = NULL; del_gendisk(p); - blk_cleanup_queue(p->queue); blk_mq_free_tag_set(&disk->tag_set); - put_disk(p); + blk_cleanup_disk(p); pi_release(disk->pi); } } From 195b1956b85baefc5049883fdcff249a8ff1911c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:37 +0300 Subject: [PATCH 074/129] rbd: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-23-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rbd.c | 52 ++++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index bbb88eb009e0..531d390902dd 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4750,9 +4750,8 @@ static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, static void rbd_free_disk(struct rbd_device *rbd_dev) { - blk_cleanup_queue(rbd_dev->disk->queue); + blk_cleanup_disk(rbd_dev->disk); blk_mq_free_tag_set(&rbd_dev->tag_set); - put_disk(rbd_dev->disk); rbd_dev->disk = NULL; } @@ -4922,22 +4921,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; int err; - /* create gendisk info */ - disk = alloc_disk(single_major ? - (1 << RBD_SINGLE_MAJOR_PART_SHIFT) : - RBD_MINORS_PER_MAJOR); - if (!disk) - return -ENOMEM; - - snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", - rbd_dev->dev_id); - disk->major = rbd_dev->major; - disk->first_minor = rbd_dev->minor; - if (single_major) - disk->flags |= GENHD_FL_EXT_DEVT; - disk->fops = &rbd_bd_ops; - disk->private_data = rbd_dev; - memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); rbd_dev->tag_set.ops = &rbd_mq_ops; rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth; @@ -4948,13 +4931,26 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) err = blk_mq_alloc_tag_set(&rbd_dev->tag_set); if (err) - goto out_disk; + return err; - q = blk_mq_init_queue(&rbd_dev->tag_set); - if (IS_ERR(q)) { - err = PTR_ERR(q); + disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_tag_set; } + q = disk->queue; + + snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", + rbd_dev->dev_id); + disk->major = rbd_dev->major; + disk->first_minor = rbd_dev->minor; + if (single_major) { + disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT); + disk->flags |= GENHD_FL_EXT_DEVT; + } else { + disk->minors = RBD_MINORS_PER_MAJOR; + } + disk->fops = &rbd_bd_ops; blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ @@ -4976,21 +4972,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); - /* - * disk_release() expects a queue ref from add_disk() and will - * put it. Hold an extra ref until add_disk() is called. - */ - WARN_ON(!blk_get_queue(q)); - disk->queue = q; - q->queuedata = rbd_dev; - rbd_dev->disk = disk; return 0; out_tag_set: blk_mq_free_tag_set(&rbd_dev->tag_set); -out_disk: - put_disk(disk); return err; } @@ -7088,8 +7074,6 @@ static ssize_t do_rbd_add(struct bus_type *bus, goto err_out_image_lock; device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); - /* see rbd_init_disk() */ - blk_put_queue(rbd_dev->disk->queue); spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); From 2c6ee0ae5fc7aed7b3309e725104ea60da2cc9c9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:38 +0300 Subject: [PATCH 075/129] rnbd: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jack Wang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-24-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index c604a402cd5c..f4fa45d24c0b 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1353,18 +1353,6 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev) } } -static int setup_mq_dev(struct rnbd_clt_dev *dev) -{ - dev->queue = blk_mq_init_queue(&dev->sess->tag_set); - if (IS_ERR(dev->queue)) { - rnbd_clt_err(dev, "Initializing multiqueue queue failed, err: %ld\n", - PTR_ERR(dev->queue)); - return PTR_ERR(dev->queue); - } - rnbd_init_mq_hw_queues(dev); - return 0; -} - static void setup_request_queue(struct rnbd_clt_dev *dev) { blk_queue_logical_block_size(dev->queue, dev->logical_block_size); @@ -1393,13 +1381,13 @@ static void setup_request_queue(struct rnbd_clt_dev *dev) blk_queue_io_opt(dev->queue, dev->sess->max_io_size); blk_queue_virt_boundary(dev->queue, SZ_4K - 1); blk_queue_write_cache(dev->queue, dev->wc, dev->fua); - dev->queue->queuedata = dev; } static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) { dev->gd->major = rnbd_client_major; dev->gd->first_minor = idx << RNBD_PART_BITS; + dev->gd->minors = 1 << RNBD_PART_BITS; dev->gd->fops = &rnbd_client_ops; dev->gd->queue = dev->queue; dev->gd->private_data = dev; @@ -1426,24 +1414,18 @@ static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) { - int err, idx = dev->clt_device_id; + int idx = dev->clt_device_id; dev->size = dev->nsectors * dev->logical_block_size; - err = setup_mq_dev(dev); - if (err) - return err; + dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev); + if (IS_ERR(dev->gd)) + return PTR_ERR(dev->gd); + dev->queue = dev->gd->queue; + rnbd_init_mq_hw_queues(dev); setup_request_queue(dev); - - dev->gd = alloc_disk_node(1 << RNBD_PART_BITS, NUMA_NO_NODE); - if (!dev->gd) { - blk_cleanup_queue(dev->queue); - return -ENOMEM; - } - rnbd_clt_setup_gen_disk(dev, idx); - return 0; } @@ -1650,8 +1632,7 @@ put_sess: static void destroy_gen_disk(struct rnbd_clt_dev *dev) { del_gendisk(dev->gd); - blk_cleanup_queue(dev->queue); - put_disk(dev->gd); + blk_cleanup_disk(dev->gd); } static void destroy_sysfs(struct rnbd_clt_dev *dev, From 693874035e6e54981771eb5f19e6eb0da2437175 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:39 +0300 Subject: [PATCH 076/129] sx8: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-25-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/sx8.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 2cdf2771f8e8..f01f860b0e62 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1343,32 +1343,25 @@ static int carm_init_disk(struct carm_host *host, unsigned int port_no) { struct carm_port *port = &host->port[port_no]; struct gendisk *disk; - struct request_queue *q; port->host = host; port->port_no = port_no; - disk = alloc_disk(CARM_MINORS_PER_MAJOR); - if (!disk) - return -ENOMEM; + disk = blk_mq_alloc_disk(&host->tag_set, port); + if (IS_ERR(disk)) + return PTR_ERR(disk); port->disk = disk; sprintf(disk->disk_name, DRV_NAME "/%u", (unsigned int)host->id * CARM_MAX_PORTS + port_no); disk->major = host->major; disk->first_minor = port_no * CARM_MINORS_PER_MAJOR; + disk->minors = CARM_MINORS_PER_MAJOR; disk->fops = &carm_bd_ops; disk->private_data = port; - q = blk_mq_init_queue(&host->tag_set); - if (IS_ERR(q)) - return PTR_ERR(q); - - blk_queue_max_segments(q, CARM_MAX_REQ_SG); - blk_queue_segment_boundary(q, CARM_SG_BOUNDARY); - - q->queuedata = port; - disk->queue = q; + blk_queue_max_segments(disk->queue, CARM_MAX_REQ_SG); + blk_queue_segment_boundary(disk->queue, CARM_SG_BOUNDARY); return 0; } @@ -1382,9 +1375,7 @@ static void carm_free_disk(struct carm_host *host, unsigned int port_no) if (disk->flags & GENHD_FL_UP) del_gendisk(disk); - if (disk->queue) - blk_cleanup_queue(disk->queue); - put_disk(disk); + blk_cleanup_disk(disk); } static int carm_init_shm(struct carm_host *host) From 3b62c140e93d32c825ed028faca45dee58dbe37f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:40 +0300 Subject: [PATCH 077/129] xen-blkfront: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-26-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 96 +++++++++++++++--------------------- 1 file changed, 39 insertions(+), 57 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index f2c1aedcdf5a..8d49f8fa98bb 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -968,48 +968,6 @@ static void blkif_set_queue_limits(struct blkfront_info *info) blk_queue_dma_alignment(rq, 511); } -static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, - unsigned int physical_sector_size) -{ - struct request_queue *rq; - struct blkfront_info *info = gd->private_data; - - memset(&info->tag_set, 0, sizeof(info->tag_set)); - info->tag_set.ops = &blkfront_mq_ops; - info->tag_set.nr_hw_queues = info->nr_rings; - if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { - /* - * When indirect descriptior is not supported, the I/O request - * will be split between multiple request in the ring. - * To avoid problems when sending the request, divide by - * 2 the depth of the queue. 
- */ - info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; - } else - info->tag_set.queue_depth = BLK_RING_SIZE(info); - info->tag_set.numa_node = NUMA_NO_NODE; - info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - info->tag_set.cmd_size = sizeof(struct blkif_req); - info->tag_set.driver_data = info; - - if (blk_mq_alloc_tag_set(&info->tag_set)) - return -EINVAL; - rq = blk_mq_init_queue(&info->tag_set); - if (IS_ERR(rq)) { - blk_mq_free_tag_set(&info->tag_set); - return PTR_ERR(rq); - } - - rq->queuedata = info; - info->rq = gd->queue = rq; - info->gd = gd; - info->sector_size = sector_size; - info->physical_sector_size = physical_sector_size; - blkif_set_queue_limits(info); - - return 0; -} - static const char *flush_info(struct blkfront_info *info) { if (info->feature_flush && info->feature_fua) @@ -1146,12 +1104,36 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, err = xlbd_reserve_minors(minor, nr_minors); if (err) - goto out; + return err; err = -ENODEV; - gd = alloc_disk(nr_minors); - if (gd == NULL) - goto release; + memset(&info->tag_set, 0, sizeof(info->tag_set)); + info->tag_set.ops = &blkfront_mq_ops; + info->tag_set.nr_hw_queues = info->nr_rings; + if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { + /* + * When indirect descriptior is not supported, the I/O request + * will be split between multiple request in the ring. + * To avoid problems when sending the request, divide by + * 2 the depth of the queue. + */ + info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; + } else + info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.numa_node = NUMA_NO_NODE; + info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + info->tag_set.cmd_size = sizeof(struct blkif_req); + info->tag_set.driver_data = info; + + err = blk_mq_alloc_tag_set(&info->tag_set); + if (err) + goto out_release_minors; + + gd = blk_mq_alloc_disk(&info->tag_set, info); + if (IS_ERR(gd)) { + err = PTR_ERR(gd); + goto out_free_tag_set; + } strcpy(gd->disk_name, DEV_NAME); ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); @@ -1164,14 +1146,16 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, gd->major = XENVBD_MAJOR; gd->first_minor = minor; + gd->minors = nr_minors; gd->fops = &xlvbd_block_fops; gd->private_data = info; set_capacity(gd, capacity); - if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size)) { - del_gendisk(gd); - goto release; - } + info->rq = gd->queue; + info->gd = gd; + info->sector_size = sector_size; + info->physical_sector_size = physical_sector_size; + blkif_set_queue_limits(info); xlvbd_flush(info); @@ -1186,9 +1170,10 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, return 0; - release: +out_free_tag_set: + blk_mq_free_tag_set(&info->tag_set); +out_release_minors: xlbd_release_minors(minor, nr_minors); - out: return err; } @@ -1217,12 +1202,9 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) nr_minors = info->gd->minors; xlbd_release_minors(minor, nr_minors); - blk_cleanup_queue(info->rq); - blk_mq_free_tag_set(&info->tag_set); - info->rq = NULL; - - put_disk(info->gd); + blk_cleanup_disk(info->gd); info->gd = NULL; + blk_mq_free_tag_set(&info->tag_set); } /* Already hold rinfo->ring_lock. */ From 77567b25ab9f06c6477a153e58ace6897c6794d1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:41 +0300 Subject: [PATCH 078/129] ubi: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. 
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-27-hch@lst.de Signed-off-by: Jens Axboe --- drivers/mtd/ubi/block.c | 70 ++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 40 deletions(-) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index e1a2ae21dfd3..e003b4b44ffa 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -394,28 +394,6 @@ int ubiblock_create(struct ubi_volume_info *vi) dev->vol_id = vi->vol_id; dev->leb_size = vi->usable_leb_size; - /* Initialize the gendisk of this ubiblock device */ - gd = alloc_disk(1); - if (!gd) { - pr_err("UBI: block: alloc_disk failed\n"); - ret = -ENODEV; - goto out_free_dev; - } - - gd->fops = &ubiblock_ops; - gd->major = ubiblock_major; - gd->first_minor = idr_alloc(&ubiblock_minor_idr, dev, 0, 0, GFP_KERNEL); - if (gd->first_minor < 0) { - dev_err(disk_to_dev(gd), - "block: dynamic minor allocation failed"); - ret = -ENODEV; - goto out_put_disk; - } - gd->private_data = dev; - sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id); - set_capacity(gd, disk_capacity); - dev->gd = gd; - dev->tag_set.ops = &ubiblock_mq_ops; dev->tag_set.queue_depth = 64; dev->tag_set.numa_node = NUMA_NO_NODE; @@ -427,19 +405,34 @@ int ubiblock_create(struct ubi_volume_info *vi) ret = blk_mq_alloc_tag_set(&dev->tag_set); if (ret) { dev_err(disk_to_dev(dev->gd), "blk_mq_alloc_tag_set failed"); - goto out_remove_minor; + goto out_free_dev;; } - dev->rq = blk_mq_init_queue(&dev->tag_set); - if (IS_ERR(dev->rq)) { - dev_err(disk_to_dev(gd), "blk_mq_init_queue failed"); - ret = PTR_ERR(dev->rq); + + /* Initialize the gendisk of this ubiblock device */ + gd = blk_mq_alloc_disk(&dev->tag_set, dev); + if (IS_ERR(gd)) { + ret = PTR_ERR(gd); goto out_free_tags; } - blk_queue_max_segments(dev->rq, UBI_MAX_SG_COUNT); - dev->rq->queuedata = dev; - dev->gd->queue = dev->rq; + gd->fops = &ubiblock_ops; + gd->major = ubiblock_major; + gd->minors = 1; + gd->first_minor = idr_alloc(&ubiblock_minor_idr, dev, 0, 0, GFP_KERNEL); + if (gd->first_minor < 0) { + dev_err(disk_to_dev(gd), + "block: dynamic minor allocation failed"); + ret = -ENODEV; + goto out_cleanup_disk; + } + gd->private_data = dev; + sprintf(gd->disk_name, "ubiblock%d_%d", dev->ubi_num, dev->vol_id); + set_capacity(gd, disk_capacity); + dev->gd = gd; + + dev->rq = gd->queue; + blk_queue_max_segments(dev->rq, UBI_MAX_SG_COUNT); /* * Create one workqueue per volume (per registered block device). 
@@ -448,7 +441,7 @@ int ubiblock_create(struct ubi_volume_info *vi) dev->wq = alloc_workqueue("%s", 0, 0, gd->disk_name); if (!dev->wq) { ret = -ENOMEM; - goto out_free_queue; + goto out_remove_minor; } list_add_tail(&dev->list, &ubiblock_devices); @@ -460,14 +453,12 @@ int ubiblock_create(struct ubi_volume_info *vi) mutex_unlock(&devices_mutex); return 0; -out_free_queue: - blk_cleanup_queue(dev->rq); -out_free_tags: - blk_mq_free_tag_set(&dev->tag_set); out_remove_minor: idr_remove(&ubiblock_minor_idr, gd->first_minor); -out_put_disk: - put_disk(dev->gd); +out_cleanup_disk: + blk_cleanup_disk(dev->gd); +out_free_tags: + blk_mq_free_tag_set(&dev->tag_set); out_free_dev: kfree(dev); out_unlock: @@ -483,11 +474,10 @@ static void ubiblock_cleanup(struct ubiblock *dev) /* Flush pending work */ destroy_workqueue(dev->wq); /* Finally destroy the blk queue */ - blk_cleanup_queue(dev->rq); - blk_mq_free_tag_set(&dev->tag_set); dev_info(disk_to_dev(dev->gd), "released"); + blk_cleanup_disk(dev->gd); + blk_mq_free_tag_set(&dev->tag_set); idr_remove(&ubiblock_minor_idr, dev->gd->first_minor); - put_disk(dev->gd); } int ubiblock_remove(struct ubi_volume_info *vi) From c06cf063b3e5d590781fec6e88ccc259384dc157 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:42 +0300 Subject: [PATCH 079/129] scm_blk: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Tested-by: Niklas Schnelle Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-28-hch@lst.de Signed-off-by: Jens Axboe --- drivers/s390/block/scm_blk.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index a4f6f2e62b1d..88cba6212ee2 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -462,12 +462,12 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) if (ret) goto out; - rq = blk_mq_init_queue(&bdev->tag_set); - if (IS_ERR(rq)) { - ret = PTR_ERR(rq); + bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, scmdev); + if (IS_ERR(bdev->gendisk)) { + ret = PTR_ERR(bdev->gendisk); goto out_tag; } - bdev->rq = rq; + rq = bdev->rq = bdev->gendisk->queue; nr_max_blk = min(scmdev->nr_max_block, (unsigned int) (PAGE_SIZE / sizeof(struct aidaw))); @@ -477,17 +477,11 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) blk_queue_flag_set(QUEUE_FLAG_NONROT, rq); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, rq); - bdev->gendisk = alloc_disk(SCM_NR_PARTS); - if (!bdev->gendisk) { - ret = -ENOMEM; - goto out_queue; - } - rq->queuedata = scmdev; bdev->gendisk->private_data = scmdev; bdev->gendisk->fops = &scm_blk_devops; - bdev->gendisk->queue = rq; bdev->gendisk->major = scm_major; bdev->gendisk->first_minor = devindex * SCM_NR_PARTS; + bdev->gendisk->minors = SCM_NR_PARTS; len = snprintf(bdev->gendisk->disk_name, DISK_NAME_LEN, "scm"); if (devindex > 25) { @@ -504,8 +498,6 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) device_add_disk(&scmdev->dev, bdev->gendisk, NULL); return 0; -out_queue: - blk_cleanup_queue(rq); out_tag: blk_mq_free_tag_set(&bdev->tag_set); out: @@ -516,9 +508,8 @@ out: void scm_blk_dev_cleanup(struct scm_blk_dev *bdev) { del_gendisk(bdev->gendisk); - blk_cleanup_queue(bdev->gendisk->queue); + blk_cleanup_disk(bdev->gendisk); blk_mq_free_tag_set(&bdev->tag_set); - 
put_disk(bdev->gendisk); } void scm_blk_set_available(struct scm_blk_dev *bdev) From f6d8297412f882a2eabbf026f0d98449ae14e0fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:43 +0300 Subject: [PATCH 080/129] amiflop: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-29-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/amiflop.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 9e2d0c6a3877..8b1714021498 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1781,15 +1781,13 @@ static int fd_alloc_disk(int drive, int system) { struct gendisk *disk; - disk = alloc_disk(1); - if (!disk) - goto out; - disk->queue = blk_mq_init_queue(&unit[drive].tag_set); - if (IS_ERR(disk->queue)) - goto out_put_disk; + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + if (IS_ERR(disk)) + return PTR_ERR(disk); disk->major = FLOPPY_MAJOR; disk->first_minor = drive + system; + disk->minors = 1; disk->fops = &floppy_fops; disk->events = DISK_EVENT_MEDIA_CHANGE; if (system) @@ -1802,12 +1800,6 @@ static int fd_alloc_disk(int drive, int system) unit[drive].gendisk[system] = disk; add_disk(disk); return 0; - -out_put_disk: - disk->queue = NULL; - put_disk(disk); -out: - return -ENOMEM; } static int fd_alloc_drive(int drive) From fd71c8a8ac77242661fff4af39593cd606a90a41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:44 +0300 Subject: [PATCH 081/129] ataflop: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-30-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/ataflop.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index d601e49f80e0..a093644ac39f 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1968,22 +1968,14 @@ static const struct blk_mq_ops ataflop_mq_ops = { static int ataflop_alloc_disk(unsigned int drive, unsigned int type) { struct gendisk *disk; - int ret; - disk = alloc_disk(1); - if (!disk) - return -ENOMEM; - - disk->queue = blk_mq_init_queue(&unit[drive].tag_set); - if (IS_ERR(disk->queue)) { - ret = PTR_ERR(disk->queue); - disk->queue = NULL; - put_disk(disk); - return ret; - } + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + if (IS_ERR(disk)) + return PTR_ERR(disk); disk->major = FLOPPY_MAJOR; disk->first_minor = drive + (type << 2); + disk->minors = 1; sprintf(disk->disk_name, "fd%d", drive); disk->fops = &floppy_fops; disk->events = DISK_EVENT_MEDIA_CHANGE; From ec06c989bb45acc28c7633703685dd684b1b5d9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Jun 2021 09:53:45 +0300 Subject: [PATCH 082/129] z2ram: use blk_mq_alloc_disk and blk_cleanup_disk Use blk_mq_alloc_disk and blk_cleanup_disk to simplify the gendisk and request_queue allocation. 
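These blk_mq_alloc_disk() conversions all follow the same shape: keep the tag set setup, replace the separate alloc_disk() + blk_mq_init_queue() pair with a single blk_mq_alloc_disk() call, move the minor count into disk->minors, and pair del_gendisk() with blk_cleanup_disk() plus blk_mq_free_tag_set() on teardown. A condensed sketch of that pattern for a hypothetical driver follows; the mydrv_* names, ops and fops are placeholders and not code from any of the converted drivers:

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical driver context; only the members used below. */
struct mydrv {
        struct blk_mq_tag_set tag_set;
        struct gendisk *disk;
        struct request_queue *queue;
};

static int mydrv_add_disk(struct mydrv *dev)
{
        struct gendisk *disk;
        int ret;

        /* Tag set setup is unchanged by the conversion. */
        memset(&dev->tag_set, 0, sizeof(dev->tag_set));
        dev->tag_set.ops = &mydrv_mq_ops;       /* assumed struct blk_mq_ops */
        dev->tag_set.nr_hw_queues = 1;
        dev->tag_set.queue_depth = 64;
        dev->tag_set.numa_node = NUMA_NO_NODE;
        dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        ret = blk_mq_alloc_tag_set(&dev->tag_set);
        if (ret)
                return ret;

        /* One call now replaces alloc_disk() + blk_mq_init_queue(). */
        disk = blk_mq_alloc_disk(&dev->tag_set, dev);
        if (IS_ERR(disk)) {
                ret = PTR_ERR(disk);
                goto out_free_tag_set;
        }
        dev->disk = disk;
        dev->queue = disk->queue;       /* queuedata already points to dev */

        disk->major = mydrv_major;      /* assumed previously registered major */
        disk->first_minor = 0;
        disk->minors = 1;               /* was the argument to alloc_disk() */
        disk->fops = &mydrv_fops;       /* assumed block_device_operations */
        disk->private_data = dev;
        sprintf(disk->disk_name, "mydrv0");

        add_disk(disk);
        return 0;

out_free_tag_set:
        blk_mq_free_tag_set(&dev->tag_set);
        return ret;
}

static void mydrv_del_disk(struct mydrv *dev)
{
        del_gendisk(dev->disk);
        blk_cleanup_disk(dev->disk);    /* replaces blk_cleanup_queue() + put_disk() */
        blk_mq_free_tag_set(&dev->tag_set);
}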
Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210602065345.355274-31-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/z2ram.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index c1d20818e649..a8968d9e759b 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -323,27 +323,20 @@ static const struct blk_mq_ops z2_mq_ops = { static int z2ram_register_disk(int minor) { - struct request_queue *q; struct gendisk *disk; - disk = alloc_disk(1); - if (!disk) - return -ENOMEM; - - q = blk_mq_init_queue(&tag_set); - if (IS_ERR(q)) { - put_disk(disk); - return PTR_ERR(q); - } + disk = blk_mq_alloc_disk(&tag_set, NULL); + if (IS_ERR(disk)) + return PTR_ERR(disk); disk->major = Z2RAM_MAJOR; disk->first_minor = minor; + disk->minors = 1; disk->fops = &z2_fops; if (minor) sprintf(disk->disk_name, "z2ram%d", minor); else sprintf(disk->disk_name, "z2ram"); - disk->queue = q; z2ram_gendisk[minor] = disk; add_disk(disk); From 07a719f8fdbe4ae0f825fa1a6d2755a63deb265e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Jun 2021 09:15:46 +0200 Subject: [PATCH 083/129] mtd_blkdevs: initialze new->rq in add_mtd_blktrans_dev Various places expect the request_queue in ->rq. Initialize it to avoid NULL pointer derefences. Fixes: 6966bb921def ("mtd_blkdevs: use blk_mq_alloc_disk") Reported-by: Marek Szyprowski Signed-off-by: Christoph Hellwig Tested-by: Marek Szyprowski Signed-off-by: Jens Axboe --- drivers/mtd/mtd_blkdevs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 5dc4c966ea73..6ce4bc57f919 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -382,6 +382,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) } new->disk = gd; + new->rq = new->disk->queue; gd->private_data = new; gd->major = tr->major; gd->first_minor = (new->devnum) << tr->part_bits; From 6a03cd9843fa4dcf827dc3ad60fa9b4217f3057c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 16 Jun 2021 09:15:47 +0200 Subject: [PATCH 084/129] loop: fix order of cleaning up the queue and freeing the tagset We must release the queue before freeing the tagset. Fixes: 1c99502fae35 ("loop: use blk_mq_alloc_disk and blk_cleanup_disk") Reported-by: Bruno Goncalves Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 3f40e673a101..e90f7d349816 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2183,8 +2183,8 @@ out: static void loop_remove(struct loop_device *lo) { del_gendisk(lo->lo_disk); - blk_mq_free_tag_set(&lo->tag_set); blk_cleanup_disk(lo->lo_disk); + blk_mq_free_tag_set(&lo->tag_set); mutex_destroy(&lo->lo_mutex); kfree(lo); } From 2cafe29a8d03f02a3d16193bdaae2f3e82a423f9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 9 Jun 2021 09:58:21 +0800 Subject: [PATCH 085/129] block: fix race between adding/removing rq qos and normal IO Yi reported several kernel panics on: [16687.001777] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008 ... [16687.163549] pc : __rq_qos_track+0x38/0x60 or [ 997.690455] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020 ... 
[ 997.850347] pc : __rq_qos_done+0x2c/0x50 Turns out it is caused by race between adding rq qos(wbt) and normal IO because rq_qos_add can be run when IO is being submitted, fix this issue by freezing queue before adding/deleting rq qos to queue. rq_qos_exit() needn't to freeze queue because it is called after queue has been frozen. iolatency calls rq_qos_add() during allocating queue, so freezing won't add delay because queue usage refcount works at atomic mode at that time. iocost calls rq_qos_add() when writing cgroup attribute file, that is fine to freeze queue at that time since we usually freeze queue when storing to queue sysfs attribute, meantime iocost only exists on the root cgroup. wbt_init calls it in blk_register_queue() and queue sysfs attribute store(queue_wb_lat_store() when write it 1st time in case of !BLK_WBT_MQ), the following patch will speedup the queue freezing in wbt_init. Reported-by: Yi Zhang Cc: Bart Van Assche Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Tested-by: Yi Zhang Link: https://lore.kernel.org/r/20210609015822.103433-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-rq-qos.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2bc43e94f4c4..2bcb3495e376 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "blk-mq-debugfs.h" @@ -99,8 +100,21 @@ static inline void rq_wait_init(struct rq_wait *rq_wait) static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) { + /* + * No IO can be in-flight when adding rqos, so freeze queue, which + * is fine since we only support rq_qos for blk-mq queue. + * + * Reuse ->queue_lock for protecting against other concurrent + * rq_qos adding/deleting + */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); rqos->next = q->rq_qos; q->rq_qos = rqos; + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); if (rqos->ops->debugfs_attrs) blk_mq_debugfs_register_rqos(rqos); @@ -110,12 +124,22 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) { struct rq_qos **cur; + /* + * See comment in rq_qos_add() about freezing queue & using + * ->queue_lock. + */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; break; } } + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); blk_mq_debugfs_unregister_rqos(rqos); } From a72c374f97a4c7b2f9dde5144c867fec4bdcd798 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 9 Jun 2021 09:58:22 +0800 Subject: [PATCH 086/129] block: mark queue init done at the end of blk_register_queue Mark queue init done when everything is done well in blk_register_queue(), so that wbt_enable_default() can be run quickly without any RCU period involved since adding rq qos requires to freeze queue. Also no any side effect by delaying to mark queue init done. 
Reported-by: Yi Zhang Cc: Bart Van Assche Signed-off-by: Ming Lei Reviewed-by: Bart Van Assche Tested-by: Yi Zhang Link: https://lore.kernel.org/r/20210609015822.103433-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f89e2fc3963b..370d83c18057 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -866,20 +866,6 @@ int blk_register_queue(struct gendisk *disk) "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - /* - * SCSI probing may synchronously create and destroy a lot of - * request_queues for non-existent devices. Shutting down a fully - * functional queue takes measureable wallclock time as RCU grace - * periods are involved. To avoid excessive latency in these - * cases, a request_queue starts out in a degraded mode which is - * faster to shut down and is made fully functional here as - * request_queues for non-existent devices never get registered. - */ - if (!blk_queue_init_done(q)) { - blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); - percpu_ref_switch_to_percpu(&q->q_usage_counter); - } - blk_queue_update_readahead(q); ret = blk_trace_init_sysfs(dev); @@ -938,6 +924,21 @@ int blk_register_queue(struct gendisk *disk) ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); + + /* + * SCSI probing may synchronously create and destroy a lot of + * request_queues for non-existent devices. Shutting down a fully + * functional queue takes measureable wallclock time as RCU grace + * periods are involved. To avoid excessive latency in these + * cases, a request_queue starts out in a degraded mode which is + * faster to shut down and is made fully functional here as + * request_queues for non-existent devices never get registered. + */ + if (!blk_queue_init_done(q)) { + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); + } + return ret; } EXPORT_SYMBOL_GPL(blk_register_queue); From fda0b5ba9d5a9f6bfab9bc195f7a8fce13aedf61 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 14 Jun 2021 14:41:09 -0700 Subject: [PATCH 087/129] docs: block/bfq: describe per-device weight The functionality of setting per-device weight for BFQ was added in v5.4 (commit 795fe54c2a828099), but the documentation was never updated. While at it, improve formatting a bit. Signed-off-by: Kir Kolyshkin Link: https://lore.kernel.org/r/20210614214109.207430-1-kolyshkin@gmail.com Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- Documentation/block/bfq-iosched.rst | 38 ++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/Documentation/block/bfq-iosched.rst b/Documentation/block/bfq-iosched.rst index 66c5a4e54130..df3a8a47f58c 100644 --- a/Documentation/block/bfq-iosched.rst +++ b/Documentation/block/bfq-iosched.rst @@ -553,20 +553,36 @@ throughput sustainable with bfq, because updating the blkio.bfq.* stats is rather costly, especially for some of the stats enabled by CONFIG_BFQ_CGROUP_DEBUG. -Parameters to set ------------------ +Parameters +---------- -For each group, there is only the following parameter to set. +For each group, the following parameters can be set: -weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the -group inside its parent. Available values: 1..1000 (default 100). 
The -linear mapping between ioprio and weights, described at the beginning -of the tunable section, is still valid, but all weights higher than -IOPRIO_BE_NR*10 are mapped to ioprio 0. + weight + This specifies the default weight for the cgroup inside its parent. + Available values: 1..1000 (default: 100). -Recall that, if low-latency is set, then BFQ automatically raises the -weight of the queues associated with interactive and soft real-time -applications. Unset this tunable if you need/want to control weights. + For cgroup v1, it is set by writing the value to `blkio.bfq.weight`. + + For cgroup v2, it is set by writing the value to `io.bfq.weight`. + (with an optional prefix of `default` and a space). + + The linear mapping between ioprio and weights, described at the beginning + of the tunable section, is still valid, but all weights higher than + IOPRIO_BE_NR*10 are mapped to ioprio 0. + + Recall that, if low-latency is set, then BFQ automatically raises the + weight of the queues associated with interactive and soft real-time + applications. Unset this tunable if you need/want to control weights. + + weight_device + This specifies a per-device weight for the cgroup. The syntax is + `minor:major weight`. A weight of `0` may be used to reset to the default + weight. + + For cgroup v1, it is set by writing the value to `blkio.bfq.weight_device`. + + For cgroup v2, the file name is `io.bfq.weight`. [1] From 37fe403898b2fa5fc9d3616fcda2ee6629318aab Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 10 Jun 2021 20:07:36 -0700 Subject: [PATCH 088/129] docs/cgroup-v1/blkio: stop abusing itemized list Fix many formatting issues by stop (ab)using itemized lists for everything (mostly replaced by definition lists). Acked-by: Tejun Heo Signed-off-by: Kir Kolyshkin Signed-off-by: Jens Axboe --- .../cgroup-v1/blkio-controller.rst | 130 +++++++++--------- 1 file changed, 67 insertions(+), 63 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst index 36d43ae7dc13..8101dcba381d 100644 --- a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst +++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst @@ -17,36 +17,37 @@ level logical devices like device mapper. HOWTO ===== + Throttling/Upper Limit policy ----------------------------- -- Enable Block IO controller:: +Enable Block IO controller:: CONFIG_BLK_CGROUP=y -- Enable throttling in block layer:: +Enable throttling in block layer:: CONFIG_BLK_DEV_THROTTLING=y -- Mount blkio controller (see cgroups.txt, Why are cgroups needed?):: +Mount blkio controller (see cgroups.txt, Why are cgroups needed?):: mount -t cgroup -o blkio none /sys/fs/cgroup/blkio -- Specify a bandwidth rate on particular device for root group. The format - for policy is ": ":: +Specify a bandwidth rate on particular device for root group. The format +for policy is ": ":: echo "8:16 1048576" > /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device - Above will put a limit of 1MB/second on reads happening for root group - on device having major/minor number 8:16. +This will put a limit of 1MB/second on reads happening for root group +on device having major/minor number 8:16. 
-- Run dd to read a file and see if rate is throttled to 1MB/s or not:: +Run dd to read a file and see if rate is throttled to 1MB/s or not:: # dd iflag=direct if=/mnt/common/zerofile of=/dev/null bs=4K count=1024 1024+0 records in 1024+0 records out 4194304 bytes (4.2 MB) copied, 4.0001 s, 1.0 MB/s - Limits for writes can be put using blkio.throttle.write_bps_device file. +Limits for writes can be put using blkio.throttle.write_bps_device file. Hierarchical Cgroups ==================== @@ -79,28 +80,31 @@ following:: Various user visible config options =================================== -CONFIG_BLK_CGROUP - - Block IO controller. -CONFIG_BFQ_CGROUP_DEBUG - - Debug help. Right now some additional stats file show up in cgroup + CONFIG_BLK_CGROUP + Block IO controller. + + CONFIG_BFQ_CGROUP_DEBUG + Debug help. Right now some additional stats file show up in cgroup if this option is enabled. -CONFIG_BLK_DEV_THROTTLING - - Enable block device throttling support in block layer. + CONFIG_BLK_DEV_THROTTLING + Enable block device throttling support in block layer. Details of cgroup files ======================= + Proportional weight policy files -------------------------------- -- blkio.weight - - Specifies per cgroup weight. This is default weight of the group + + blkio.weight + Specifies per cgroup weight. This is default weight of the group on all the devices until and unless overridden by per device rule. (See blkio.weight_device). Currently allowed range of weights is from 10 to 1000. -- blkio.weight_device - - One can specify per cgroup per device rules using this interface. + blkio.weight_device + One can specify per cgroup per device rules using this interface. These rules override the default value of group weight as specified by blkio.weight. @@ -130,34 +134,34 @@ Proportional weight policy files dev weight 8:16 300 -- blkio.time - - disk time allocated to cgroup per device in milliseconds. First + blkio.time + Disk time allocated to cgroup per device in milliseconds. First two fields specify the major and minor number of the device and third field specifies the disk time allocated to group in milliseconds. -- blkio.sectors - - number of sectors transferred to/from disk by the group. First + blkio.sectors + Number of sectors transferred to/from disk by the group. First two fields specify the major and minor number of the device and third field specifies the number of sectors transferred by the group to/from the device. -- blkio.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These + blkio.io_service_bytes + Number of bytes transferred to/from the disk by the group. These are further divided by the type of operation - read or write, sync or async. First two fields specify the major and minor number of the device, third field specifies the operation type and the fourth field specifies the number of bytes. -- blkio.io_serviced - - Number of IOs (bio) issued to the disk by the group. These + blkio.io_serviced + Number of IOs (bio) issued to the disk by the group. These are further divided by the type of operation - read or write, sync or async. First two fields specify the major and minor number of the device, third field specifies the operation type and the fourth field specifies the number of IOs. -- blkio.io_service_time - - Total amount of time between request dispatch and request completion + blkio.io_service_time + Total amount of time between request dispatch and request completion for the IOs done by this cgroup. 
This is in nanoseconds to make it meaningful for flash devices too. For devices with queue depth of 1, this time represents the actual service time. When queue_depth > 1, @@ -170,8 +174,8 @@ Proportional weight policy files specifies the operation type and the fourth field specifies the io_service_time in ns. -- blkio.io_wait_time - - Total amount of time the IOs for this cgroup spent waiting in the + blkio.io_wait_time + Total amount of time the IOs for this cgroup spent waiting in the scheduler queues for service. This can be greater than the total time elapsed since it is cumulative io_wait_time for all IOs. It is not a measure of total time the cgroup spent waiting but rather a measure of @@ -185,24 +189,24 @@ Proportional weight policy files minor number of the device, third field specifies the operation type and the fourth field specifies the io_wait_time in ns. -- blkio.io_merged - - Total number of bios/requests merged into requests belonging to this + blkio.io_merged + Total number of bios/requests merged into requests belonging to this cgroup. This is further divided by the type of operation - read or write, sync or async. -- blkio.io_queued - - Total number of requests queued up at any given instant for this + blkio.io_queued + Total number of requests queued up at any given instant for this cgroup. This is further divided by the type of operation - read or write, sync or async. -- blkio.avg_queue_size - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + blkio.avg_queue_size + Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. The average queue size for this cgroup over the entire time of this cgroup's existence. Queue size samples are taken each time one of the queues of this cgroup gets a timeslice. -- blkio.group_wait_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + blkio.group_wait_time + Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This is the amount of time the cgroup had to wait since it became busy (i.e., went from 0 to 1 request queued) to get a timeslice for one of its queues. This is different from the io_wait_time which is the @@ -212,8 +216,8 @@ Proportional weight policy files will only report the group_wait_time accumulated till the last time it got a timeslice and will not include the current delta. -- blkio.empty_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + blkio.empty_time + Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This is the amount of time a cgroup spends without any pending requests when not being served, i.e., it does not include any time spent idling for one of the queues of the cgroup. This is in @@ -221,8 +225,8 @@ Proportional weight policy files the stat will only report the empty_time accumulated till the last time it had a pending request and will not include the current delta. -- blkio.idle_time - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. + blkio.idle_time + Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This is the amount of time spent by the IO scheduler idling for a given cgroup in anticipation of a better request than the existing ones from other queues/cgroups. This is in nanoseconds. If this is read @@ -230,60 +234,60 @@ Proportional weight policy files idle_time accumulated till the last idle period and will not include the current delta. -- blkio.dequeue - - Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. This + blkio.dequeue + Debugging aid only enabled if CONFIG_BFQ_CGROUP_DEBUG=y. 
This gives the statistics about how many a times a group was dequeued from service tree of the device. First two fields specify the major and minor number of the device and third field specifies the number of times a group was dequeued from a particular device. -- blkio.*_recursive - - Recursive version of various stats. These files show the + blkio.*_recursive + Recursive version of various stats. These files show the same information as their non-recursive counterparts but include stats from all the descendant cgroups. Throttling/Upper limit policy files ----------------------------------- -- blkio.throttle.read_bps_device - - Specifies upper limit on READ rate from the device. IO rate is + blkio.throttle.read_bps_device + Specifies upper limit on READ rate from the device. IO rate is specified in bytes per second. Rules are per device. Following is the format:: echo ": " > /cgrp/blkio.throttle.read_bps_device -- blkio.throttle.write_bps_device - - Specifies upper limit on WRITE rate to the device. IO rate is + blkio.throttle.write_bps_device + Specifies upper limit on WRITE rate to the device. IO rate is specified in bytes per second. Rules are per device. Following is the format:: echo ": " > /cgrp/blkio.throttle.write_bps_device -- blkio.throttle.read_iops_device - - Specifies upper limit on READ rate from the device. IO rate is + blkio.throttle.read_iops_device + Specifies upper limit on READ rate from the device. IO rate is specified in IO per second. Rules are per device. Following is the format:: echo ": " > /cgrp/blkio.throttle.read_iops_device -- blkio.throttle.write_iops_device - - Specifies upper limit on WRITE rate to the device. IO rate is + blkio.throttle.write_iops_device + Specifies upper limit on WRITE rate to the device. IO rate is specified in io per second. Rules are per device. Following is the format:: echo ": " > /cgrp/blkio.throttle.write_iops_device -Note: If both BW and IOPS rules are specified for a device, then IO is - subjected to both the constraints. + Note: If both BW and IOPS rules are specified for a device, then IO is + subjected to both the constraints. -- blkio.throttle.io_serviced - - Number of IOs (bio) issued to the disk by the group. These + blkio.throttle.io_serviced + Number of IOs (bio) issued to the disk by the group. These are further divided by the type of operation - read or write, sync or async. First two fields specify the major and minor number of the device, third field specifies the operation type and the fourth field specifies the number of IOs. -- blkio.throttle.io_service_bytes - - Number of bytes transferred to/from the disk by the group. These + blkio.throttle.io_service_bytes + Number of bytes transferred to/from the disk by the group. These are further divided by the type of operation - read or write, sync or async. First two fields specify the major and minor number of the device, third field specifies the operation type and the fourth field @@ -291,6 +295,6 @@ Note: If both BW and IOPS rules are specified for a device, then IO is Common files among various policies ----------------------------------- -- blkio.reset_stats - - Writing an int to this file will result in resetting all the stats + blkio.reset_stats + Writing an int to this file will result in resetting all the stats for that cgroup. 
From 828615950b5876e75587fdd5e9d1185be9cabba7 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 10 Jun 2021 20:07:37 -0700 Subject: [PATCH 089/129] docs/cgroup-v1/blkio: update for 5.x kernels Commit bf382fb0bcef4 ("block: remove legacy IO schedulers", Oct 12 2018) removes the CFQ scheduler, together with blkio.weight and blkio.weight_device described in cgroup v1 documentation. Users are supposed to use the BFQ scheduler, which cgroup file for setting weight is blkio.bfq.weight, but there is no way to set per-device weight. Later, commit 795fe54c2a8 per-device weights for BFQ, meaning that blkio.bfq.weight and blkio.bfq.weight_device can be used in a way similar to the old CFQ cgroup interface. Yet, the cgroup v1 docs were never updated. Fix this: - use the new file names; - fix the range for weight (used to be 10..1000, now 1..1000); - link to BFQ scheduler docs. Acked-by: Tejun Heo Signed-off-by: Kir Kolyshkin Signed-off-by: Jens Axboe --- .../cgroup-v1/blkio-controller.rst | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst index 8101dcba381d..16253eda192e 100644 --- a/Documentation/admin-guide/cgroup-v1/blkio-controller.rst +++ b/Documentation/admin-guide/cgroup-v1/blkio-controller.rst @@ -97,40 +97,41 @@ Details of cgroup files Proportional weight policy files -------------------------------- - blkio.weight + blkio.bfq.weight Specifies per cgroup weight. This is default weight of the group - on all the devices until and unless overridden by per device rule. - (See blkio.weight_device). - Currently allowed range of weights is from 10 to 1000. + on all the devices until and unless overridden by per device rule + (see `blkio.bfq.weight_device` below). - blkio.weight_device - One can specify per cgroup per device rules using this interface. - These rules override the default value of group weight as specified - by blkio.weight. + Currently allowed range of weights is from 1 to 1000. For more details, + see Documentation/block/bfq-iosched.rst. + + blkio.bfq.weight_device + Specifes per cgroup per device weights, overriding the default group + weight. For more details, see Documentation/block/bfq-iosched.rst. Following is the format:: - # echo dev_maj:dev_minor weight > blkio.weight_device + # echo dev_maj:dev_minor weight > blkio.bfq.weight_device Configure weight=300 on /dev/sdb (8:16) in this cgroup:: - # echo 8:16 300 > blkio.weight_device - # cat blkio.weight_device + # echo 8:16 300 > blkio.bfq.weight_device + # cat blkio.bfq.weight_device dev weight 8:16 300 Configure weight=500 on /dev/sda (8:0) in this cgroup:: - # echo 8:0 500 > blkio.weight_device - # cat blkio.weight_device + # echo 8:0 500 > blkio.bfq.weight_device + # cat blkio.bfq.weight_device dev weight 8:0 500 8:16 300 Remove specific weight for /dev/sda in this cgroup:: - # echo 8:0 0 > blkio.weight_device - # cat blkio.weight_device + # echo 8:0 0 > blkio.bfq.weight_device + # cat blkio.bfq.weight_device dev weight 8:16 300 From f0c1c4d2864ed614f90d2da1bab1a1c42907b940 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 9 Jun 2021 14:30:46 +0800 Subject: [PATCH 090/129] blk-mq: fix use-after-free in blk_mq_exit_sched tagset can't be used after blk_cleanup_queue() is returned because freeing tagset usually follows blk_clenup_queue(). 
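In other words, with the common teardown order the tag set is owned by the driver and is released as soon as blk_cleanup_queue() has returned, while the final queue release (where the elevator is torn down) may still be pending. A rough sketch of that ordering, using hypothetical driver fields rather than code from the report:

        del_gendisk(dev->disk);
        blk_cleanup_queue(dev->disk->queue);    /* queue is shut down */
        blk_mq_free_tag_set(&dev->tag_set);     /* tag set memory is released */
        put_disk(dev->disk);                    /* dropping the last reference can
                                                 * trigger the queue release and
                                                 * blk_mq_exit_sched(); by now
                                                 * q->tag_set may point to freed
                                                 * memory, hence the switch to
                                                 * hctx->flags */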
Commit d97e594c5166 ("blk-mq: Use request queue-wide tags for tagset-wide sbitmap") adds check on q->tag_set->flags in blk_mq_exit_sched(), and causes use-after-free. Fixes it by using hctx->flags. Reported-by: syzbot+77ba3d171a25c56756ea@syzkaller.appspotmail.com Fixes: d97e594c5166 ("blk-mq: Use request queue-wide tags for tagset-wide sbitmap") Cc: John Garry Signed-off-by: Ming Lei Tested-by: John Garry Reviewed-by: John Garry Link: https://lore.kernel.org/r/20210609063046.122843-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a9182d2f8ad3..80273245d11a 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -680,6 +680,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; unsigned int i; + unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { blk_mq_debugfs_unregister_sched_hctx(hctx); @@ -687,12 +688,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } + flags = hctx->flags; } blk_mq_debugfs_unregister_sched(q); if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); - if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + if (blk_mq_is_sbitmap_shared(flags)) blk_mq_exit_sched_shared_sbitmap(q); q->elevator = NULL; } From e42cfb1da0bf33c313318da201730324c423351d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 18 Jun 2021 10:59:22 +0900 Subject: [PATCH 091/129] block: Remove unnecessary elevator operation checks The insert_requests and dispatch_request elevator operations are mandatory for the correct execution of an elevator, and all implemented elevators (bfq, kyber and mq-deadline) implement them. As a result, there is no need to check for these operations before calling them when a queue has an elevator set. This simplifies the code in __blk_mq_sched_dispatch_requests() and blk_mq_sched_insert_request(). To avoid out-of-tree elevators to crash the kernel in case of bad implementation, add a check in elv_register() to verify that these operations are implemented. A small, probably not significant, IOPS improvement of 0.1% is observed with this patch applied (4.117 MIOPS to 4.123 MIOPS, average of 20 fio runs doing 4K random direct reads with psync and 32 jobs). 
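Both hooks are already provided by bfq, kyber and mq-deadline, so the new check in elv_register() only guards out-of-tree code. A stripped-down sketch of the shape it now insists on; the my_* names are illustrative, loosely modelled on mq-deadline, and not an actual scheduler:

static void my_insert_requests(struct blk_mq_hw_ctx *hctx,
                               struct list_head *list, bool at_head)
{
        /* queue the requests on the scheduler's internal lists */
}

static struct request *my_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
        /* hand the next request back to blk-mq, or NULL if idle */
        return NULL;
}

static struct elevator_type my_sched = {
        .ops = {
                /* Both hooks are mandatory; elv_register() now WARNs and
                 * returns -EINVAL if either one is missing. */
                .insert_requests        = my_insert_requests,
                .dispatch_request       = my_dispatch_request,
        },
        .elevator_name  = "my-sched",
        .elevator_owner = THIS_MODULE,
};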
Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20210618015922.713999-1-damien.lemoal@wdc.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 13 ++++++------- block/elevator.c | 4 ++++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 80273245d11a..2403a5c2b053 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -294,8 +294,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; - struct elevator_queue *e = q->elevator; - const bool has_sched_dispatch = e && e->type->ops.dispatch_request; + const bool has_sched = q->elevator; int ret = 0; LIST_HEAD(rq_list); @@ -326,12 +325,12 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) { - if (has_sched_dispatch) + if (has_sched) ret = blk_mq_do_dispatch_sched(hctx); else ret = blk_mq_do_dispatch_ctx(hctx); } - } else if (has_sched_dispatch) { + } else if (has_sched) { ret = blk_mq_do_dispatch_sched(hctx); } else if (hctx->dispatch_busy) { /* dequeue request one by one from sw queue if queue is busy */ @@ -463,7 +462,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, goto run; } - if (e && e->type->ops.insert_requests) { + if (e) { LIST_HEAD(list); list_add(&rq->queuelist, &list); @@ -494,9 +493,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_get(&q->q_usage_counter); e = hctx->queue->elevator; - if (e && e->type->ops.insert_requests) + if (e) { e->type->ops.insert_requests(hctx, list, false); - else { + } else { /* * try to issue requests directly if the hw queue isn't * busy in case of 'none' scheduler, and this way may save diff --git a/block/elevator.c b/block/elevator.c index 06e203426410..85d0d4adbb64 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -522,6 +522,10 @@ void elv_unregister_queue(struct request_queue *q) int elv_register(struct elevator_type *e) { + /* insert_requests and dispatch_request are mandatory */ + if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request)) + return -EINVAL; + /* create icq_cache if requested */ if (e->icq_size) { if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || From 52d7e288444906aa5c99888e80a9cc1a1423ed92 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 18 Jun 2021 16:45:22 +0300 Subject: [PATCH 092/129] blk-mq: fix an IS_ERR() vs NULL bug The __blk_mq_alloc_disk() function doesn't return NULLs it returns error pointers. 
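Accordingly, callers have to test the return value with IS_ERR()/PTR_ERR() rather than a NULL check. A minimal sketch of correct usage, assuming hypothetical dev/tag_set fields:

        struct gendisk *disk;

        disk = blk_mq_alloc_disk(&dev->tag_set, dev);
        if (IS_ERR(disk))               /* not: if (!disk) */
                return PTR_ERR(disk);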
Fixes: b461dfc49eb6 ("blk-mq: add the blk_mq_alloc_disk APIs") Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/YMyjci35WBqrtqG+@mwanda Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 02a4aab0aeac..fd2de2b422ed 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -431,7 +431,7 @@ enum { static struct lock_class_key __key; \ struct gendisk *__disk = __blk_mq_alloc_disk(set, queuedata); \ \ - if (__disk) \ + if (!IS_ERR(__disk)) \ lockdep_init_map(&__disk->lockdep_map, \ "(bio completion)", &__key, 0); \ __disk; \ From a79da21b48cc5f81b047ae4e70b4d9cb49c93a6a Mon Sep 17 00:00:00 2001 From: lijiazi Date: Fri, 18 Jun 2021 11:17:20 +0800 Subject: [PATCH 093/129] blk-wbt: remove outdated comment Now wbt_wait() returns void, so remove now outdated comment. Signed-off-by: lijiazi Link: https://lore.kernel.org/r/1623986240-13878-1-git-send-email-lijiazi@xiaomi.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 42aed0160f86..b363b0532704 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -563,7 +563,6 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) } /* - * Returns true if the IO request should be accounted, false if not. * May sleep, if we have exceeded the writeback limits. Caller can pass * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. From 5f6776ba413ce273f7cb211f1cf8771f0cde7c81 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:41 -0700 Subject: [PATCH 094/129] block/Kconfig: Make the BLK_WBT and BLK_WBT_MQ entries consecutive These entries were consecutive at the time of their introduction but are no longer consecutive. Make these again consecutive. Additionally, modify the help text since it refers to blk-mq and since the legacy block layer has been removed. Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20210618004456.7280-2-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index a2297edfdde8..6685578b2a20 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -133,6 +133,13 @@ config BLK_WBT dynamically on an algorithm loosely based on CoDel, factoring in the realtime performance of the disk. +config BLK_WBT_MQ + bool "Enable writeback throttling by default" + default y + depends on BLK_WBT + help + Enable writeback throttling by default for request-based block devices. + config BLK_CGROUP_IOLATENCY bool "Enable support for latency based cgroup IO protection" depends on BLK_CGROUP=y @@ -155,13 +162,6 @@ config BLK_CGROUP_IOCOST distributes IO capacity between different groups based on their share of the overall weight distribution. -config BLK_WBT_MQ - bool "Multiqueue writeback throttling" - default y - depends on BLK_WBT - help - Enable writeback throttling by default on multiqueue devices. 
- config BLK_DEBUG_FS bool "Block layer debugging information in debugfs" default y From 19688d7f9592b8222f530037d9328fdc90fff14c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:42 -0700 Subject: [PATCH 095/129] block/blk-cgroup: Swap the blk_throtl_init() and blk_iolatency_init() calls Before adding more calls in this function, simplify the error path. Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Cc: Tejun Heo Cc: Christoph Hellwig Cc: Ming Lei Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Reviewed-by: Himanshu Madhani Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20210618004456.7280-3-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d169e2055158..3b0f6efaa2b6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1183,15 +1183,14 @@ int blkcg_init_queue(struct request_queue *q) if (preloaded) radix_tree_preload_end(); + ret = blk_iolatency_init(q); + if (ret) + goto err_destroy_all; + ret = blk_throtl_init(q); if (ret) goto err_destroy_all; - ret = blk_iolatency_init(q); - if (ret) { - blk_throtl_exit(q); - goto err_destroy_all; - } return 0; err_destroy_all: From fb44023e70224c3bd9eb949bd3ab66876bd14c56 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:43 -0700 Subject: [PATCH 096/129] block/blk-rq-qos: Move a function from a header file into a C file rq_qos_id_to_name() is only used in blk-mq-debugfs.c so move that function into in blk-mq-debugfs.c. Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20210618004456.7280-4-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 13 +++++++++++++ block/blk-rq-qos.h | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 2a75bc7401df..6ac1c86f62ef 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -937,6 +937,19 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q) q->sched_debugfs_dir = NULL; } +static const char *rq_qos_id_to_name(enum rq_qos_id id) +{ + switch (id) { + case RQ_QOS_WBT: + return "wbt"; + case RQ_QOS_LATENCY: + return "latency"; + case RQ_QOS_COST: + return "cost"; + } + return "unknown"; +} + void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { debugfs_remove_recursive(rqos->debugfs_dir); diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2bcb3495e376..a77afbdd472c 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -79,19 +79,6 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) return rq_qos_id(q, RQ_QOS_LATENCY); } -static inline const char *rq_qos_id_to_name(enum rq_qos_id id) -{ - switch (id) { - case RQ_QOS_WBT: - return "wbt"; - case RQ_QOS_LATENCY: - return "latency"; - case RQ_QOS_COST: - return "cost"; - } - return "unknown"; -} - static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); From 556910e39249d55e23deaec479f49e7d85bc0d24 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:44 -0700 Subject: [PATCH 097/129] block: Introduce the ioprio rq-qos policy Introduce an rq-qos policy that assigns an I/O priority to requests based on blk-cgroup configuration settings. 
This policy has the following advantages over the ioprio_set() system call: - This policy is cgroup based so it has all the advantages of cgroups. - While ioprio_set() does not affect page cache writeback I/O, this rq-qos controller affects page cache writeback I/O for filesystems that support assiociating a cgroup with writeback I/O. See also Documentation/admin-guide/cgroup-v2.rst. Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-5-bvanassche@acm.org Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 55 +++++ block/Kconfig | 9 + block/Makefile | 1 + block/blk-cgroup.c | 5 + block/blk-ioprio.c | 262 ++++++++++++++++++++++++ block/blk-ioprio.h | 19 ++ block/blk-mq-debugfs.c | 2 + block/blk-rq-qos.h | 1 + 8 files changed, 354 insertions(+) create mode 100644 block/blk-ioprio.c create mode 100644 block/blk-ioprio.h diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index b1e81aa8598a..4e59925e6583 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -56,6 +56,7 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst #include #include "blk.h" +#include "blk-ioprio.h" /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. @@ -1187,6 +1188,10 @@ int blkcg_init_queue(struct request_queue *q) if (ret) goto err_destroy_all; + ret = blk_ioprio_init(q); + if (ret) + goto err_destroy_all; + ret = blk_throtl_init(q); if (ret) goto err_destroy_all; diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c new file mode 100644 index 000000000000..332a07761bf8 --- /dev/null +++ b/block/blk-ioprio.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block rq-qos policy for assigning an I/O priority class to requests. + * + * Using an rq-qos policy for assigning I/O priority class has two advantages + * over using the ioprio_set() system call: + * + * - This policy is cgroup based so it has all the advantages of cgroups. + * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos + * controller affects page cache writeback I/O for filesystems that support + * assiociating a cgroup with writeback I/O. See also + * Documentation/admin-guide/cgroup-v2.rst. + */ + +#include +#include +#include +#include +#include +#include "blk-ioprio.h" +#include "blk-rq-qos.h" + +/** + * enum prio_policy - I/O priority class policy. + * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. + * @POLICY_NONE_TO_RT: modify IOPRIO_CLASS_NONE into IOPRIO_CLASS_RT. + * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into + * IOPRIO_CLASS_BE. + * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. + * + * See also . + */ +enum prio_policy { + POLICY_NO_CHANGE = 0, + POLICY_NONE_TO_RT = 1, + POLICY_RESTRICT_TO_BE = 2, + POLICY_ALL_TO_IDLE = 3, +}; + +static const char *policy_name[] = { + [POLICY_NO_CHANGE] = "no-change", + [POLICY_NONE_TO_RT] = "none-to-rt", + [POLICY_RESTRICT_TO_BE] = "restrict-to-be", + [POLICY_ALL_TO_IDLE] = "idle", +}; + +static struct blkcg_policy ioprio_policy; + +/** + * struct ioprio_blkg - Per (cgroup, request queue) data. + * @pd: blkg_policy_data structure. + */ +struct ioprio_blkg { + struct blkg_policy_data pd; +}; + +/** + * struct ioprio_blkcg - Per cgroup data. + * @cpd: blkcg_policy_data structure. 
+ * @prio_policy: One of the IOPRIO_CLASS_* values. See also . + */ +struct ioprio_blkcg { + struct blkcg_policy_data cpd; + enum prio_policy prio_policy; +}; + +static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL; +} + +static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) +{ + return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), + struct ioprio_blkcg, cpd); +} + +static struct ioprio_blkcg * +ioprio_blkcg_from_css(struct cgroup_subsys_state *css) +{ + return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); +} + +static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) +{ + struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); + + if (!pd) + return NULL; + + return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); +} + +static int ioprio_show_prio_policy(struct seq_file *sf, void *v) +{ + struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); + + seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); + return 0; +} + +static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); + int ret; + + if (off != 0) + return -EIO; + /* kernfs_fop_write_iter() terminates 'buf' with '\0'. */ + ret = sysfs_match_string(policy_name, buf); + if (ret < 0) + return ret; + blkcg->prio_policy = ret; + + return nbytes; +} + +static struct blkg_policy_data * +ioprio_alloc_pd(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) +{ + struct ioprio_blkg *ioprio_blkg; + + ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); + if (!ioprio_blkg) + return NULL; + + return &ioprio_blkg->pd; +} + +static void ioprio_free_pd(struct blkg_policy_data *pd) +{ + struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); + + kfree(ioprio_blkg); +} + +static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) +{ + struct ioprio_blkcg *blkcg; + + blkcg = kzalloc(sizeof(*blkcg), gfp); + if (!blkcg) + return NULL; + blkcg->prio_policy = POLICY_NO_CHANGE; + return &blkcg->cpd; +} + +static void ioprio_free_cpd(struct blkcg_policy_data *cpd) +{ + struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); + + kfree(blkcg); +} + +#define IOPRIO_ATTRS \ + { \ + .name = "prio.class", \ + .seq_show = ioprio_show_prio_policy, \ + .write = ioprio_set_prio_policy, \ + }, \ + { } /* sentinel */ + +/* cgroup v2 attributes */ +static struct cftype ioprio_files[] = { + IOPRIO_ATTRS +}; + +/* cgroup v1 attributes */ +static struct cftype ioprio_legacy_files[] = { + IOPRIO_ATTRS +}; + +static struct blkcg_policy ioprio_policy = { + .dfl_cftypes = ioprio_files, + .legacy_cftypes = ioprio_legacy_files, + + .cpd_alloc_fn = ioprio_alloc_cpd, + .cpd_free_fn = ioprio_free_cpd, + + .pd_alloc_fn = ioprio_alloc_pd, + .pd_free_fn = ioprio_free_pd, +}; + +struct blk_ioprio { + struct rq_qos rqos; +}; + +static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, + struct bio *bio) +{ + struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); + + /* + * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers + * correspond to a lower priority. Hence, the max_t() below selects + * the lower priority of bi_ioprio and the cgroup I/O priority class. + * If the cgroup policy has been set to POLICY_NO_CHANGE == 0, the + * bio I/O priority is not modified. If the bio I/O priority equals + * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. 
+ */ + bio->bi_ioprio = max_t(u16, bio->bi_ioprio, + IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); +} + +static void blkcg_ioprio_exit(struct rq_qos *rqos) +{ + struct blk_ioprio *blkioprio_blkg = + container_of(rqos, typeof(*blkioprio_blkg), rqos); + + blkcg_deactivate_policy(rqos->q, &ioprio_policy); + kfree(blkioprio_blkg); +} + +static struct rq_qos_ops blkcg_ioprio_ops = { + .track = blkcg_ioprio_track, + .exit = blkcg_ioprio_exit, +}; + +int blk_ioprio_init(struct request_queue *q) +{ + struct blk_ioprio *blkioprio_blkg; + struct rq_qos *rqos; + int ret; + + blkioprio_blkg = kzalloc(sizeof(*blkioprio_blkg), GFP_KERNEL); + if (!blkioprio_blkg) + return -ENOMEM; + + ret = blkcg_activate_policy(q, &ioprio_policy); + if (ret) { + kfree(blkioprio_blkg); + return ret; + } + + rqos = &blkioprio_blkg->rqos; + rqos->id = RQ_QOS_IOPRIO; + rqos->ops = &blkcg_ioprio_ops; + rqos->q = q; + + /* + * Registering the rq-qos policy after activating the blk-cgroup + * policy guarantees that ioprio_blkcg_from_bio(bio) != NULL in the + * rq-qos callbacks. + */ + rq_qos_add(q, rqos); + + return 0; +} + +static int __init ioprio_init(void) +{ + return blkcg_policy_register(&ioprio_policy); +} + +static void __exit ioprio_exit(void) +{ + blkcg_policy_unregister(&ioprio_policy); +} + +module_init(ioprio_init); +module_exit(ioprio_exit); diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h new file mode 100644 index 000000000000..a7785c2f1aea --- /dev/null +++ b/block/blk-ioprio.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BLK_IOPRIO_H_ +#define _BLK_IOPRIO_H_ + +#include + +struct request_queue; + +#ifdef CONFIG_BLK_CGROUP_IOPRIO +int blk_ioprio_init(struct request_queue *q); +#else +static inline int blk_ioprio_init(struct request_queue *q) +{ + return 0; +} +#endif + +#endif /* _BLK_IOPRIO_H_ */ diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 6ac1c86f62ef..4b66d2776eda 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -946,6 +946,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) return "latency"; case RQ_QOS_COST: return "cost"; + case RQ_QOS_IOPRIO: + return "ioprio"; } return "unknown"; } diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index a77afbdd472c..f000f83e0621 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -17,6 +17,7 @@ enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, + RQ_QOS_IOPRIO, }; struct rq_wait { From 46eae2e32a6adc368230b4df0501082c5233e99c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:45 -0700 Subject: [PATCH 098/129] block/mq-deadline: Add several comments Make the code easier to read by adding more comments. Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-6-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 8eea2cbf2bf4..31418e9ce9e2 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -139,6 +139,9 @@ static void dd_request_merged(struct request_queue *q, struct request *req, } } +/* + * Callback function that is invoked after @next has been merged into @req. 
+ */ static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { @@ -375,6 +378,8 @@ done: } /* + * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). + * * One confusing aspect here is that we get called for a specific * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared @@ -438,6 +443,10 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) return 0; } +/* + * Try to merge @bio into an existing request. If @bio has been merged into + * an existing request, store the pointer to that request into *@rq. + */ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { @@ -461,6 +470,10 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, return ELEVATOR_NO_MERGE; } +/* + * Attempt to merge a bio into an existing request. This function is called + * before @bio is associated with a request. + */ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { @@ -518,6 +531,9 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } +/* + * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests(). + */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool at_head) { @@ -544,6 +560,8 @@ static void dd_prepare_request(struct request *rq) } /* + * Callback from inside blk_mq_free_request(). + * * For zoned block devices, write unlock the target zone of * completed write requests. Do this while holding the zone lock * spinlock so that the zone is never unlocked while deadline_fifo_request() From 3bd473f41ae990815d6f75d285b161eebf361278 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:46 -0700 Subject: [PATCH 099/129] block/mq-deadline: Add two lockdep_assert_held() statements Document the locking strategy by adding two lockdep_assert_held() statements. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Damien Le Moal Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-7-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 31418e9ce9e2..191ff5ce629c 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -279,6 +279,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) bool reads, writes; int data_dir; + lockdep_assert_held(&dd->lock); + if (!list_empty(&dd->dispatch)) { rq = list_first_entry(&dd->dispatch, struct request, queuelist); list_del_init(&rq->queuelist); @@ -501,6 +503,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); + lockdep_assert_held(&dd->lock); + /* * This may be a requeue of a write request that has locked its * target zone. If it is the case, this releases the zone lock. From 2f295beab40f13ab93c004d45372238f2066a5ee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:47 -0700 Subject: [PATCH 100/129] block/mq-deadline: Remove two local variables Make __dd_dispatch_request() easier to read by removing two local variables. 
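Concretely, the booleans cached at the top of the function are replaced by direct list_empty() tests at the point where the answer is needed, e.g. (sketch only; see the diff below for the full change):

	/* Before: FIFO state cached in local variables early on. */
	reads = !list_empty(&dd->fifo_list[READ]);
	...
	if (reads) {

	/* After: test the FIFO list directly where it is consulted. */
	if (!list_empty(&dd->fifo_list[READ])) {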
Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Damien Le Moal Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-8-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 191ff5ce629c..caa438f62a4d 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -276,7 +276,6 @@ deadline_next_request(struct deadline_data *dd, int data_dir) static struct request *__dd_dispatch_request(struct deadline_data *dd) { struct request *rq, *next_rq; - bool reads, writes; int data_dir; lockdep_assert_held(&dd->lock); @@ -287,9 +286,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) goto done; } - reads = !list_empty(&dd->fifo_list[READ]); - writes = !list_empty(&dd->fifo_list[WRITE]); - /* * batches are currently reads XOR writes */ @@ -306,7 +302,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * data direction (read / write) */ - if (reads) { + if (!list_empty(&dd->fifo_list[READ])) { BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); if (deadline_fifo_request(dd, WRITE) && @@ -322,7 +318,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * there are either no reads or writes have been starved */ - if (writes) { + if (!list_empty(&dd->fifo_list[WRITE])) { dispatch_writes: BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); From 3e9a99eba058f79736dccaf25934f8d6ca380fb3 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:48 -0700 Subject: [PATCH 101/129] block/mq-deadline: Rename dd_init_queue() and dd_exit_queue() Change "queue" into "sched" to make the function names reflect better the purpose of these functions. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Damien Le Moal Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-9-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index caa438f62a4d..d823ba7cb084 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -395,7 +395,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) return rq; } -static void dd_exit_queue(struct elevator_queue *e) +static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -408,7 +408,7 @@ static void dd_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). 
*/ -static int dd_init_queue(struct request_queue *q, struct elevator_type *e) +static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; @@ -800,8 +800,8 @@ static struct elevator_type mq_deadline = { .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, - .init_sched = dd_init_queue, - .exit_sched = dd_exit_queue, + .init_sched = dd_init_sched, + .exit_sched = dd_exit_sched, }, #ifdef CONFIG_BLK_DEBUG_FS From 004a26b327c2e1ea88b2638cf16c0e30e82f297e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:49 -0700 Subject: [PATCH 102/129] block/mq-deadline: Improve compile-time argument checking Modern compilers complain if an out-of-range value is passed to a function argument that has an enumeration type. Let the compiler detect out-of-range data direction arguments instead of verifying the data_dir argument at runtime. Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Himanshu Madhani Cc: Damien Le Moal Cc: Christoph Hellwig Cc: Ming Lei Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-10-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 96 +++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index d823ba7cb084..69126beff77d 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -35,6 +35,13 @@ static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ +enum dd_data_dir { + DD_READ = READ, + DD_WRITE = WRITE, +}; + +enum { DD_DIR_COUNT = 2 }; + struct deadline_data { /* * run time data @@ -43,20 +50,20 @@ struct deadline_data { /* * requests (deadline_rq s) are present on both sort_list and fifo_list */ - struct rb_root sort_list[2]; - struct list_head fifo_list[2]; + struct rb_root sort_list[DD_DIR_COUNT]; + struct list_head fifo_list[DD_DIR_COUNT]; /* * next in sort order. 
read, write or both are NULL */ - struct request *next_rq[2]; + struct request *next_rq[DD_DIR_COUNT]; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ /* * settings that change how the i/o scheduler behaves */ - int fifo_expire[2]; + int fifo_expire[DD_DIR_COUNT]; int fifo_batch; int writes_starved; int front_merges; @@ -97,7 +104,7 @@ deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) static inline void deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) { - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); if (dd->next_rq[data_dir] == rq) dd->next_rq[data_dir] = deadline_latter_request(rq); @@ -169,10 +176,10 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, static void deadline_move_request(struct deadline_data *dd, struct request *rq) { - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); - dd->next_rq[READ] = NULL; - dd->next_rq[WRITE] = NULL; + dd->next_rq[DD_READ] = NULL; + dd->next_rq[DD_WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); /* @@ -185,9 +192,10 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) */ -static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) +static inline int deadline_check_fifo(struct deadline_data *dd, + enum dd_data_dir data_dir) { - struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); + struct request *rq = rq_entry_fifo(dd->fifo_list[data_dir].next); /* * rq is expired! @@ -203,19 +211,16 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) * dispatch using arrival ordered lists. */ static struct request * -deadline_fifo_request(struct deadline_data *dd, int data_dir) +deadline_fifo_request(struct deadline_data *dd, enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - if (list_empty(&dd->fifo_list[data_dir])) return NULL; rq = rq_entry_fifo(dd->fifo_list[data_dir].next); - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; /* @@ -223,7 +228,7 @@ deadline_fifo_request(struct deadline_data *dd, int data_dir) * an unlocked target zone. */ spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + list_for_each_entry(rq, &dd->fifo_list[DD_WRITE], queuelist) { if (blk_req_can_dispatch_to_zone(rq)) goto out; } @@ -239,19 +244,16 @@ out: * dispatch using sector position sorted lists. 
*/ static struct request * -deadline_next_request(struct deadline_data *dd, int data_dir) +deadline_next_request(struct deadline_data *dd, enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - rq = dd->next_rq[data_dir]; if (!rq) return NULL; - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; /* @@ -276,7 +278,7 @@ deadline_next_request(struct deadline_data *dd, int data_dir) static struct request *__dd_dispatch_request(struct deadline_data *dd) { struct request *rq, *next_rq; - int data_dir; + enum dd_data_dir data_dir; lockdep_assert_held(&dd->lock); @@ -289,9 +291,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) /* * batches are currently reads XOR writes */ - rq = deadline_next_request(dd, WRITE); + rq = deadline_next_request(dd, DD_WRITE); if (!rq) - rq = deadline_next_request(dd, READ); + rq = deadline_next_request(dd, DD_READ); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ @@ -302,14 +304,14 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * data direction (read / write) */ - if (!list_empty(&dd->fifo_list[READ])) { - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); + if (!list_empty(&dd->fifo_list[DD_READ])) { + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[DD_READ])); - if (deadline_fifo_request(dd, WRITE) && + if (deadline_fifo_request(dd, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; - data_dir = READ; + data_dir = DD_READ; goto dispatch_find_request; } @@ -318,13 +320,13 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * there are either no reads or writes have been starved */ - if (!list_empty(&dd->fifo_list[WRITE])) { + if (!list_empty(&dd->fifo_list[DD_WRITE])) { dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[DD_WRITE])); dd->starved = 0; - data_dir = WRITE; + data_dir = DD_WRITE; goto dispatch_find_request; } @@ -399,8 +401,8 @@ static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; - BUG_ON(!list_empty(&dd->fifo_list[READ])); - BUG_ON(!list_empty(&dd->fifo_list[WRITE])); + BUG_ON(!list_empty(&dd->fifo_list[DD_READ])); + BUG_ON(!list_empty(&dd->fifo_list[DD_WRITE])); kfree(dd); } @@ -424,12 +426,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) } eq->elevator_data = dd; - INIT_LIST_HEAD(&dd->fifo_list[READ]); - INIT_LIST_HEAD(&dd->fifo_list[WRITE]); - dd->sort_list[READ] = RB_ROOT; - dd->sort_list[WRITE] = RB_ROOT; - dd->fifo_expire[READ] = read_expire; - dd->fifo_expire[WRITE] = write_expire; + INIT_LIST_HEAD(&dd->fifo_list[DD_READ]); + INIT_LIST_HEAD(&dd->fifo_list[DD_WRITE]); + dd->sort_list[DD_READ] = RB_ROOT; + dd->sort_list[DD_WRITE] = RB_ROOT; + dd->fifo_expire[DD_READ] = read_expire; + dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; @@ -497,7 +499,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); lockdep_assert_held(&dd->lock); @@ -585,7 +587,7 @@ static void dd_finish_request(struct request *rq) spin_lock_irqsave(&dd->zone_lock, flags); 
blk_req_zone_write_unlock(rq); - if (!list_empty(&dd->fifo_list[WRITE])) + if (!list_empty(&dd->fifo_list[DD_WRITE])) blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); } @@ -626,8 +628,8 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ __data = jiffies_to_msecs(__data); \ return deadline_var_show(__data, (page)); \ } -SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); -SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[DD_READ], 1); +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[DD_WRITE], 1); SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); @@ -649,8 +651,8 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) *(__PTR) = __data; \ return count; \ } -STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX, 1); STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); @@ -717,8 +719,8 @@ static int deadline_##name##_next_rq_show(void *data, \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } -DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read) -DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write) +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_READ, read) +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_WRITE, write) #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) From d6d7f013d65491eaff477b9bd83b80111f5be9e4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:50 -0700 Subject: [PATCH 103/129] block/mq-deadline: Improve the sysfs show and store macros Define separate macros for integers and jiffies to improve readability. Use sysfs_emit() and kstrtoint() instead of sprintf() and simple_strtol(). The former functions are the recommended functions. 
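For reference, the recommended pattern looks roughly like this (hypothetical attribute, not part of this patch): sysfs_emit() bounds the output to the sysfs page buffer, and kstrtoint() reports malformed or out-of-range input instead of silently accepting it the way simple_strtol() does.

	#include <linux/kernel.h>	/* kstrtoint() */
	#include <linux/sysfs.h>	/* sysfs_emit() */

	static int example_value;	/* hypothetical tunable */

	static ssize_t example_show(char *page)
	{
		/* Bounded write into the page-sized sysfs buffer. */
		return sysfs_emit(page, "%d\n", example_value);
	}

	static ssize_t example_store(const char *page, size_t count)
	{
		int val, ret;

		/* Returns -EINVAL or -ERANGE on bad or overflowing input. */
		ret = kstrtoint(page, 0, &val);
		if (ret < 0)
			return ret;
		example_value = val;
		return count;
	}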
Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-11-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 64 ++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 36 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 69126beff77d..f92224ff0256 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -605,58 +605,50 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) /* * sysfs parts below */ -static ssize_t -deadline_var_show(int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static void -deadline_var_store(int *var, const char *page) -{ - char *p = (char *) page; - - *var = simple_strtol(p, &p, 10); -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +#define SHOW_INT(__FUNC, __VAR) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ - int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return deadline_var_show(__data, (page)); \ + \ + return sysfs_emit(page, "%d\n", __VAR); \ } -SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[DD_READ], 1); -SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[DD_WRITE], 1); -SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); -SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); -SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); -#undef SHOW_FUNCTION +#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) +SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); +SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_INT(deadline_writes_starved_show, dd->writes_starved); +SHOW_INT(deadline_front_merges_show, dd->front_merges); +SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); +#undef SHOW_INT +#undef SHOW_JIFFIES #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ - int __data; \ - deadline_var_store(&__data, (page)); \ + int __data, __ret; \ + \ + __ret = kstrtoint(page, 0, &__data); \ + if (__ret < 0) \ + return __ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ + *(__PTR) = __CONV(__data); \ return count; \ } -STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); -STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); -STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); +#define STORE_INT(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) +#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) +STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); +STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); +STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); 
+STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION +#undef STORE_INT +#undef STORE_JIFFIES #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) From 07757588e5076748308dd95ee2e3cd0b82ebb8c4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:51 -0700 Subject: [PATCH 104/129] block/mq-deadline: Reserve 25% of scheduler tags for synchronous requests For interactive workloads it is important that synchronous requests are not delayed. Hence reserve 25% of scheduler tags for synchronous requests. This patch still allows asynchronous requests to fill the hardware queues since blk_mq_init_sched() makes sure that the number of scheduler requests is the double of the hardware queue depth. From blk_mq_init_sched(): q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, BLKDEV_MAX_RQ); Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-12-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index f92224ff0256..44da481c3fea 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -67,6 +67,7 @@ struct deadline_data { int fifo_batch; int writes_starved; int front_merges; + u32 async_depth; spinlock_t lock; spinlock_t zone_lock; @@ -397,6 +398,44 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) return rq; } +/* + * Called by __blk_mq_alloc_request(). The shallow_depth value set by this + * function is used by __blk_mq_get_tag(). + */ +static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct deadline_data *dd = data->q->elevator->elevator_data; + + /* Do not throttle synchronous reads. */ + if (op_is_sync(op) && !op_is_write(op)) + return; + + /* + * Throttle asynchronous requests and writes such that these requests + * do not block the allocation of synchronous requests. + */ + data->shallow_depth = dd->async_depth; +} + +/* Called by blk_mq_update_nr_requests(). */ +static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + + dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); +} + +/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). 
*/ +static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + dd_depth_updated(hctx); + return 0; +} + static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; @@ -617,6 +656,7 @@ SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); +SHOW_INT(deadline_async_depth_show, dd->front_merges); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES @@ -645,6 +685,7 @@ STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX) STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); +STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT @@ -658,6 +699,7 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), + DD_ATTR(async_depth), DD_ATTR(fifo_batch), __ATTR_NULL }; @@ -733,6 +775,15 @@ static int deadline_starved_show(void *data, struct seq_file *m) return 0; } +static int dd_async_depth_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->async_depth); + return 0; +} + static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&dd->lock) { @@ -775,6 +826,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_QUEUE_DDIR_ATTRS(write), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, + {"async_depth", 0400, dd_async_depth_show}, {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, {}, }; @@ -783,6 +835,8 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { static struct elevator_type mq_deadline = { .ops = { + .depth_updated = dd_depth_updated, + .limit_depth = dd_limit_depth, .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, @@ -796,6 +850,7 @@ static struct elevator_type mq_deadline = { .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, + .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS From d672d325b1492f5b0e54b7226f01e2d57b58bfb4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:52 -0700 Subject: [PATCH 105/129] block/mq-deadline: Micro-optimize the batching algorithm When dispatching the first request of a batch, the deadline_move_request() call clears .next_rq[] for the opposite data direction. .next_rq[] is not restored when changing data direction. Fix this by not clearing .next_rq[] and by keeping track of the data direction of a batch in a variable instead. This patch is a micro-optimization because: - The number of deadline_next_request() calls for the read direction is halved. - The number of times that deadline_next_request() returns NULL is reduced. 
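In code terms, the hot path in __dd_dispatch_request() changes roughly as follows (excerpt; the full diff is below):

	/* Before: probe writes first, then fall back to reads, on every call. */
	rq = deadline_next_request(dd, DD_WRITE);
	if (!rq)
		rq = deadline_next_request(dd, DD_READ);

	/* After: remember the running batch's direction and probe only that. */
	rq = deadline_next_request(dd, dd->last_dir);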
Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-13-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 44da481c3fea..b09ae1f332a2 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -53,6 +53,8 @@ struct deadline_data { struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; + /* Data direction of latest dispatched request. */ + enum dd_data_dir last_dir; /* * next in sort order. read, write or both are NULL */ @@ -179,8 +181,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq) { const enum dd_data_dir data_dir = rq_data_dir(rq); - dd->next_rq[DD_READ] = NULL; - dd->next_rq[DD_WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); /* @@ -292,10 +292,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) /* * batches are currently reads XOR writes */ - rq = deadline_next_request(dd, DD_WRITE); - if (!rq) - rq = deadline_next_request(dd, DD_READ); - + rq = deadline_next_request(dd, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ goto dispatch_request; @@ -361,6 +358,7 @@ dispatch_find_request: if (!rq) return NULL; + dd->last_dir = data_dir; dd->batching = 0; dispatch_request: @@ -473,6 +471,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; + dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); From c807ab520fc3fd056c47c74ced63f9d3991a171b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:53 -0700 Subject: [PATCH 106/129] block/mq-deadline: Add I/O priority support Maintain one dispatch list and one FIFO list per I/O priority class: RT, BE and IDLE. Maintain statistics for each priority level. Split the debugfs attributes per priority level as follows: $ ls /sys/kernel/debug/block/.../sched/ async_depth dispatch2 read_next_rq write2_fifo_list batching read0_fifo_list starved write_next_rq dispatch0 read1_fifo_list write0_fifo_list dispatch1 read2_fifo_list write1_fifo_list Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-14-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 342 +++++++++++++++++++++++++++++--------------- 1 file changed, 228 insertions(+), 114 deletions(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b09ae1f332a2..aba672a5be1e 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -42,23 +42,36 @@ enum dd_data_dir { enum { DD_DIR_COUNT = 2 }; +enum dd_prio { + DD_RT_PRIO = 0, + DD_BE_PRIO = 1, + DD_IDLE_PRIO = 2, + DD_PRIO_MAX = 2, +}; + +enum { DD_PRIO_COUNT = 3 }; + +/* + * Deadline scheduler data per I/O priority (enum dd_prio). Requests are + * present on both sort_list[] and fifo_list[]. + */ +struct dd_per_prio { + struct list_head dispatch; + struct rb_root sort_list[DD_DIR_COUNT]; + struct list_head fifo_list[DD_DIR_COUNT]; + /* Next request in FIFO order. Read, write or both are NULL. 
*/ + struct request *next_rq[DD_DIR_COUNT]; +}; + struct deadline_data { /* * run time data */ - /* - * requests (deadline_rq s) are present on both sort_list and fifo_list - */ - struct rb_root sort_list[DD_DIR_COUNT]; - struct list_head fifo_list[DD_DIR_COUNT]; + struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ enum dd_data_dir last_dir; - /* - * next in sort order. read, write or both are NULL - */ - struct request *next_rq[DD_DIR_COUNT]; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ @@ -73,13 +86,29 @@ struct deadline_data { spinlock_t lock; spinlock_t zone_lock; - struct list_head dispatch; +}; + +/* Maps an I/O priority class to a deadline scheduler priority. */ +static const enum dd_prio ioprio_class_to_prio[] = { + [IOPRIO_CLASS_NONE] = DD_BE_PRIO, + [IOPRIO_CLASS_RT] = DD_RT_PRIO, + [IOPRIO_CLASS_BE] = DD_BE_PRIO, + [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; static inline struct rb_root * -deadline_rb_root(struct deadline_data *dd, struct request *rq) +deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { - return &dd->sort_list[rq_data_dir(rq)]; + return &per_prio->sort_list[rq_data_dir(rq)]; +} + +/* + * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a + * request. + */ +static u8 dd_rq_ioclass(struct request *rq) +{ + return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } /* @@ -97,38 +126,38 @@ deadline_latter_request(struct request *rq) } static void -deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) +deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { - struct rb_root *root = deadline_rb_root(dd, rq); + struct rb_root *root = deadline_rb_root(per_prio, rq); elv_rb_add(root, rq); } static inline void -deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) +deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { const enum dd_data_dir data_dir = rq_data_dir(rq); - if (dd->next_rq[data_dir] == rq) - dd->next_rq[data_dir] = deadline_latter_request(rq); + if (per_prio->next_rq[data_dir] == rq) + per_prio->next_rq[data_dir] = deadline_latter_request(rq); - elv_rb_del(deadline_rb_root(dd, rq), rq); + elv_rb_del(deadline_rb_root(per_prio, rq), rq); } /* * remove rq from rbtree and fifo. 
*/ -static void deadline_remove_request(struct request_queue *q, struct request *rq) +static void deadline_remove_request(struct request_queue *q, + struct dd_per_prio *per_prio, + struct request *rq) { - struct deadline_data *dd = q->elevator->elevator_data; - list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) - deadline_del_rq_rb(dd, rq); + deadline_del_rq_rb(per_prio, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) @@ -139,13 +168,16 @@ static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(req); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { - elv_rb_del(deadline_rb_root(dd, req), req); - deadline_add_rq_rb(dd, req); + elv_rb_del(deadline_rb_root(per_prio, req), req); + deadline_add_rq_rb(per_prio, req); } } @@ -155,6 +187,9 @@ static void dd_request_merged(struct request_queue *q, struct request *req, static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { + const u8 ioprio_class = dd_rq_ioclass(next); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo @@ -170,33 +205,34 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, /* * kill knowledge of next, this one is a goner */ - deadline_remove_request(q, next); + deadline_remove_request(q, &dd->per_prio[prio], next); } /* * move an entry to dispatch queue */ static void -deadline_move_request(struct deadline_data *dd, struct request *rq) +deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + struct request *rq) { const enum dd_data_dir data_dir = rq_data_dir(rq); - dd->next_rq[data_dir] = deadline_latter_request(rq); + per_prio->next_rq[data_dir] = deadline_latter_request(rq); /* * take it off the sort and fifo list */ - deadline_remove_request(rq->q, rq); + deadline_remove_request(rq->q, per_prio, rq); } /* * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) */ -static inline int deadline_check_fifo(struct deadline_data *dd, +static inline int deadline_check_fifo(struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); /* * rq is expired! @@ -212,15 +248,16 @@ static inline int deadline_check_fifo(struct deadline_data *dd, * dispatch using arrival ordered lists. 
*/ static struct request * -deadline_fifo_request(struct deadline_data *dd, enum dd_data_dir data_dir) +deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - if (list_empty(&dd->fifo_list[data_dir])) + if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; @@ -229,7 +266,7 @@ deadline_fifo_request(struct deadline_data *dd, enum dd_data_dir data_dir) * an unlocked target zone. */ spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &dd->fifo_list[DD_WRITE], queuelist) { + list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { if (blk_req_can_dispatch_to_zone(rq)) goto out; } @@ -245,12 +282,13 @@ out: * dispatch using sector position sorted lists. */ static struct request * -deadline_next_request(struct deadline_data *dd, enum dd_data_dir data_dir) +deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - rq = dd->next_rq[data_dir]; + rq = per_prio->next_rq[data_dir]; if (!rq) return NULL; @@ -276,15 +314,17 @@ deadline_next_request(struct deadline_data *dd, enum dd_data_dir data_dir) * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc */ -static struct request *__dd_dispatch_request(struct deadline_data *dd) +static struct request *__dd_dispatch_request(struct deadline_data *dd, + struct dd_per_prio *per_prio) { struct request *rq, *next_rq; enum dd_data_dir data_dir; lockdep_assert_held(&dd->lock); - if (!list_empty(&dd->dispatch)) { - rq = list_first_entry(&dd->dispatch, struct request, queuelist); + if (!list_empty(&per_prio->dispatch)) { + rq = list_first_entry(&per_prio->dispatch, struct request, + queuelist); list_del_init(&rq->queuelist); goto done; } @@ -292,7 +332,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) /* * batches are currently reads XOR writes */ - rq = deadline_next_request(dd, dd->last_dir); + rq = deadline_next_request(dd, per_prio, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ goto dispatch_request; @@ -302,10 +342,10 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * data direction (read / write) */ - if (!list_empty(&dd->fifo_list[DD_READ])) { - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[DD_READ])); + if (!list_empty(&per_prio->fifo_list[DD_READ])) { + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); - if (deadline_fifo_request(dd, DD_WRITE) && + if (deadline_fifo_request(dd, per_prio, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; @@ -318,9 +358,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * there are either no reads or writes have been starved */ - if (!list_empty(&dd->fifo_list[DD_WRITE])) { + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[DD_WRITE])); + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); dd->starved = 0; @@ -335,14 +375,14 @@ dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ - next_rq = deadline_next_request(dd, data_dir); - if (deadline_check_fifo(dd, data_dir) || !next_rq) { + next_rq = deadline_next_request(dd, 
per_prio, data_dir); + if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = deadline_fifo_request(dd, data_dir); + rq = deadline_fifo_request(dd, per_prio, data_dir); } else { /* * The last req was the same dir and we have a next request in @@ -366,7 +406,7 @@ dispatch_request: * rq is the selected appropriate request. */ dd->batching++; - deadline_move_request(dd, rq); + deadline_move_request(dd, per_prio, rq); done: /* * If the request needs its target zone locked, do it. @@ -388,9 +428,14 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq; + enum dd_prio prio; spin_lock(&dd->lock); - rq = __dd_dispatch_request(dd); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); + if (rq) + break; + } spin_unlock(&dd->lock); return rq; @@ -437,9 +482,14 @@ static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; + enum dd_prio prio; - BUG_ON(!list_empty(&dd->fifo_list[DD_READ])); - BUG_ON(!list_empty(&dd->fifo_list[DD_WRITE])); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); + } kfree(dd); } @@ -451,22 +501,28 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; + enum dd_prio prio; + int ret = -ENOMEM; eq = elevator_alloc(q, e); if (!eq) - return -ENOMEM; + return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); - if (!dd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } + if (!dd) + goto put_eq; + eq->elevator_data = dd; - INIT_LIST_HEAD(&dd->fifo_list[DD_READ]); - INIT_LIST_HEAD(&dd->fifo_list[DD_WRITE]); - dd->sort_list[DD_READ] = RB_ROOT; - dd->sort_list[DD_WRITE] = RB_ROOT; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + INIT_LIST_HEAD(&per_prio->dispatch); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); + per_prio->sort_list[DD_READ] = RB_ROOT; + per_prio->sort_list[DD_WRITE] = RB_ROOT; + } dd->fifo_expire[DD_READ] = read_expire; dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; @@ -475,10 +531,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); - INIT_LIST_HEAD(&dd->dispatch); q->elevator = eq; return 0; + +put_eq: + kobject_put(&eq->kobj); + return ret; } /* @@ -489,13 +548,16 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; - __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); + __rq = 
elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); @@ -538,6 +600,10 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); + u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); + struct dd_per_prio *per_prio; + enum dd_prio prio; lockdep_assert_held(&dd->lock); @@ -547,15 +613,18 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ blk_req_zone_write_unlock(rq); + prio = ioprio_class_to_prio[ioprio_class]; + if (blk_mq_sched_try_insert_merge(q, rq)) return; trace_block_rq_insert(rq); + per_prio = &dd->per_prio[prio]; if (at_head) { - list_add(&rq->queuelist, &dd->dispatch); + list_add(&rq->queuelist, &per_prio->dispatch); } else { - deadline_add_rq_rb(dd, rq); + deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -567,7 +636,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); } } @@ -618,26 +687,39 @@ static void dd_prepare_request(struct request *rq) static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(rq); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; if (blk_queue_is_zoned(q)) { - struct deadline_data *dd = q->elevator->elevator_data; unsigned long flags; spin_lock_irqsave(&dd->zone_lock, flags); blk_req_zone_write_unlock(rq); - if (!list_empty(&dd->fifo_list[DD_WRITE])) + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); } } +static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) +{ + return !list_empty_careful(&per_prio->dispatch) || + !list_empty_careful(&per_prio->fifo_list[DD_READ]) || + !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); +} + static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + enum dd_prio prio; - return !list_empty_careful(&dd->dispatch) || - !list_empty_careful(&dd->fifo_list[0]) || - !list_empty_careful(&dd->fifo_list[1]); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) + if (dd_has_work_for_prio(&dd->per_prio[prio])) + return true; + + return false; } /* @@ -704,16 +786,17 @@ static struct elv_fs_entry deadline_attrs[] = { }; #ifdef CONFIG_BLK_DEBUG_FS -#define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name) \ +#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ - return seq_list_start(&dd->fifo_list[ddir], *pos); \ + return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ @@ -721,8 +804,9 @@ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ { \ struct 
request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ - return seq_list_next(v, &dd->fifo_list[ddir], pos); \ + return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ @@ -746,14 +830,20 @@ static int deadline_##name##_next_rq_show(void *data, \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ - struct request *rq = dd->next_rq[ddir]; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + struct request *rq = per_prio->next_rq[data_dir]; \ \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_READ, read) -DEADLINE_DEBUGFS_DDIR_ATTRS(DD_WRITE, write) + +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) @@ -783,50 +873,74 @@ static int dd_async_depth_show(void *data, struct seq_file *m) return 0; } -static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) - __acquires(&dd->lock) -{ - struct request_queue *q = m->private; - struct deadline_data *dd = q->elevator->elevator_data; - - spin_lock(&dd->lock); - return seq_list_start(&dd->dispatch, *pos); +#define DEADLINE_DISPATCH_ATTR(prio) \ +static void *deadline_dispatch##prio##_start(struct seq_file *m, \ + loff_t *pos) \ + __acquires(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + spin_lock(&dd->lock); \ + return seq_list_start(&per_prio->dispatch, *pos); \ +} \ + \ +static void *deadline_dispatch##prio##_next(struct seq_file *m, \ + void *v, loff_t *pos) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + return seq_list_next(v, &per_prio->dispatch, pos); \ +} \ + \ +static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ + __releases(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_unlock(&dd->lock); \ +} \ + \ +static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ + .start = deadline_dispatch##prio##_start, \ + .next = deadline_dispatch##prio##_next, \ + .stop = deadline_dispatch##prio##_stop, \ + .show = blk_mq_debugfs_rq_show, \ } -static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct request_queue *q = m->private; - struct deadline_data *dd = q->elevator->elevator_data; +DEADLINE_DISPATCH_ATTR(0); +DEADLINE_DISPATCH_ATTR(1); +DEADLINE_DISPATCH_ATTR(2); +#undef DEADLINE_DISPATCH_ATTR - return seq_list_next(v, &dd->dispatch, pos); -} - -static void deadline_dispatch_stop(struct seq_file *m, void *v) - __releases(&dd->lock) -{ - struct request_queue *q = m->private; - struct deadline_data *dd = q->elevator->elevator_data; - - spin_unlock(&dd->lock); -} - -static const struct seq_operations deadline_dispatch_seq_ops = { - .start = deadline_dispatch_start, - .next = 
deadline_dispatch_next, - .stop = deadline_dispatch_stop, - .show = blk_mq_debugfs_rq_show, -}; - -#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ - {#name "_fifo_list", 0400, .seq_ops = &deadline_##name##_fifo_seq_ops}, \ +#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ + {#name "_fifo_list", 0400, \ + .seq_ops = &deadline_##name##_fifo_seq_ops} +#define DEADLINE_NEXT_RQ_ATTR(name) \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { - DEADLINE_QUEUE_DDIR_ATTRS(read), - DEADLINE_QUEUE_DDIR_ATTRS(write), + DEADLINE_QUEUE_DDIR_ATTRS(read0), + DEADLINE_QUEUE_DDIR_ATTRS(write0), + DEADLINE_QUEUE_DDIR_ATTRS(read1), + DEADLINE_QUEUE_DDIR_ATTRS(write1), + DEADLINE_QUEUE_DDIR_ATTRS(read2), + DEADLINE_QUEUE_DDIR_ATTRS(write2), + DEADLINE_NEXT_RQ_ATTR(read0), + DEADLINE_NEXT_RQ_ATTR(write0), + DEADLINE_NEXT_RQ_ATTR(read1), + DEADLINE_NEXT_RQ_ATTR(write1), + DEADLINE_NEXT_RQ_ATTR(read2), + DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"async_depth", 0400, dd_async_depth_show}, - {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, + {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, + {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, + {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS @@ -876,6 +990,6 @@ static void __exit deadline_exit(void) module_init(deadline_init); module_exit(deadline_exit); -MODULE_AUTHOR("Jens Axboe"); +MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler"); From 38ba64d12d4cf9fa260c45d7398e2a24afaceefa Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:54 -0700 Subject: [PATCH 107/129] block/mq-deadline: Track I/O statistics Track I/O statistics per I/O priority and export these statistics to debugfs. These statistics help developers of the deadline scheduler. Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-15-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 100 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index aba672a5be1e..04d9d6b3745b 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -51,6 +51,19 @@ enum dd_prio { enum { DD_PRIO_COUNT = 3 }; +/* I/O statistics per I/O priority. */ +struct io_stats_per_prio { + local_t inserted; + local_t merged; + local_t dispatched; + local_t completed; +}; + +/* I/O statistics for all I/O priorities (enum dd_prio). */ +struct io_stats { + struct io_stats_per_prio stats[DD_PRIO_COUNT]; +}; + /* * Deadline scheduler data per I/O priority (enum dd_prio). Requests are * present on both sort_list[] and fifo_list[]. 
@@ -75,6 +88,8 @@ struct deadline_data { unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ + struct io_stats __percpu *stats; + /* * settings that change how the i/o scheduler behaves */ @@ -88,6 +103,33 @@ struct deadline_data { spinlock_t zone_lock; }; +/* Count one event of type 'event_type' and with I/O priority 'prio' */ +#define dd_count(dd, event_type, prio) do { \ + struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + local_inc(&io_stats->stats[(prio)].event_type); \ + put_cpu_ptr(io_stats); \ +} while (0) + +/* + * Returns the total number of dd_count(dd, event_type, prio) calls across all + * CPUs. No locking or barriers since it is fine if the returned sum is slightly + * outdated. + */ +#define dd_sum(dd, event_type, prio) ({ \ + unsigned int cpu; \ + u32 sum = 0; \ + \ + BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ + BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ + for_each_present_cpu(cpu) \ + sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ + stats[(prio)].event_type); \ + sum; \ +}) + /* Maps an I/O priority class to a deadline scheduler priority. */ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_NONE] = DD_BE_PRIO, @@ -187,9 +229,12 @@ static void dd_request_merged(struct request_queue *q, struct request *req, static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { + struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, merged, prio); + /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo @@ -225,6 +270,12 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, deadline_remove_request(rq->q, per_prio, rq); } +/* Number of requests queued for a given priority level. */ +static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) +{ + return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); +} + /* * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) @@ -319,6 +370,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, { struct request *rq, *next_rq; enum dd_data_dir data_dir; + enum dd_prio prio; + u8 ioprio_class; lockdep_assert_held(&dd->lock); @@ -408,6 +461,9 @@ dispatch_request: dd->batching++; deadline_move_request(dd, per_prio, rq); done: + ioprio_class = dd_rq_ioclass(rq); + prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, dispatched, prio); /* * If the request needs its target zone locked, do it. 
*/ @@ -491,6 +547,8 @@ static void dd_exit_sched(struct elevator_queue *e) WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); } + free_percpu(dd->stats); + kfree(dd); } @@ -514,6 +572,11 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = dd; + dd->stats = alloc_percpu_gfp(typeof(*dd->stats), + GFP_KERNEL | __GFP_ZERO); + if (!dd->stats) + goto free_dd; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -535,6 +598,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) q->elevator = eq; return 0; +free_dd: + kfree(dd); + put_eq: kobject_put(&eq->kobj); return ret; @@ -614,6 +680,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_req_zone_write_unlock(rq); prio = ioprio_class_to_prio[ioprio_class]; + dd_count(dd, inserted, prio); if (blk_mq_sched_try_insert_merge(q, rq)) return; @@ -692,6 +759,8 @@ static void dd_finish_request(struct request *rq) const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; + dd_count(dd, completed, prio); + if (blk_queue_is_zoned(q)) { unsigned long flags; @@ -873,6 +942,35 @@ static int dd_async_depth_show(void *data, struct seq_file *m) return 0; } +static int dd_queued_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), + dd_queued(dd, DD_BE_PRIO), + dd_queued(dd, DD_IDLE_PRIO)); + return 0; +} + +/* Number of requests owned by the block driver for a given priority. */ +static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) +{ + return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) + - dd_sum(dd, completed, prio); +} + +static int dd_owned_by_driver_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), + dd_owned_by_driver(dd, DD_BE_PRIO), + dd_owned_by_driver(dd, DD_IDLE_PRIO)); + return 0; +} + #define DEADLINE_DISPATCH_ATTR(prio) \ static void *deadline_dispatch##prio##_start(struct seq_file *m, \ loff_t *pos) \ @@ -941,6 +1039,8 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, + {"owned_by_driver", 0400, dd_owned_by_driver_show}, + {"queued", 0400, dd_queued_show}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS From 08a9ad8bf607388d768a341957d53eae64250c2d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:55 -0700 Subject: [PATCH 108/129] block/mq-deadline: Add cgroup support Maintain statistics per cgroup and export these to user space. These statistics are essential for verifying whether the proper I/O priorities have been assigned to requests. 
An example of the statistics data with this patch applied: $ cat /sys/fs/cgroup/io.stat 11:2 rbytes=0 wbytes=0 rios=3 wios=0 dbytes=0 dios=0 [NONE] dispatched=0 inserted=0 merged=171 [RT] dispatched=0 inserted=0 merged=0 [BE] dispatched=0 inserted=0 merged=0 [IDLE] dispatched=0 inserted=0 merged=0 8:32 rbytes=2142720 wbytes=0 rios=105 wios=0 dbytes=0 dios=0 [NONE] dispatched=0 inserted=0 merged=171 [RT] dispatched=0 inserted=0 merged=0 [BE] dispatched=0 inserted=0 merged=0 [IDLE] dispatched=0 inserted=0 merged=0 Cc: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-16-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 6 + block/Makefile | 2 + block/mq-deadline-cgroup.c | 126 ++++++++++++++++++++ block/mq-deadline-cgroup.h | 114 ++++++++++++++++++ block/{mq-deadline.c => mq-deadline-main.c} | 74 +++++++++--- 5 files changed, 308 insertions(+), 14 deletions(-) create mode 100644 block/mq-deadline-cgroup.c create mode 100644 block/mq-deadline-cgroup.h rename block/{mq-deadline.c => mq-deadline-main.c} (95%) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 2f2158e05a91..64053d67a97b 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -9,6 +9,12 @@ config MQ_IOSCHED_DEADLINE help MQ version of the deadline IO scheduler. +config MQ_IOSCHED_DEADLINE_CGROUP + tristate + default y + depends on MQ_IOSCHED_DEADLINE + depends on BLK_CGROUP + config MQ_IOSCHED_KYBER tristate "Kyber I/O scheduler" default y diff --git a/block/Makefile b/block/Makefile index af3d044abaf1..b9db5d4edfc8 100644 --- a/block/Makefile +++ b/block/Makefile @@ -21,6 +21,8 @@ obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o +mq-deadline-y += mq-deadline-main.o +mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP)+= mq-deadline-cgroup.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c new file mode 100644 index 000000000000..3b4bfddec39f --- /dev/null +++ b/block/mq-deadline-cgroup.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "mq-deadline-cgroup.h" + +static struct blkcg_policy dd_blkcg_policy; + +static struct blkcg_policy_data *dd_cpd_alloc(gfp_t gfp) +{ + struct dd_blkcg *pd; + + pd = kzalloc(sizeof(*pd), gfp); + if (!pd) + return NULL; + pd->stats = alloc_percpu_gfp(typeof(*pd->stats), + GFP_KERNEL | __GFP_ZERO); + if (!pd->stats) { + kfree(pd); + return NULL; + } + return &pd->cpd; +} + +static void dd_cpd_free(struct blkcg_policy_data *cpd) +{ + struct dd_blkcg *dd_blkcg = container_of(cpd, typeof(*dd_blkcg), cpd); + + free_percpu(dd_blkcg->stats); + kfree(dd_blkcg); +} + +static struct dd_blkcg *dd_blkcg_from_pd(struct blkg_policy_data *pd) +{ + return container_of(blkcg_to_cpd(pd->blkg->blkcg, &dd_blkcg_policy), + struct dd_blkcg, cpd); +} + +/* + * Convert an association between a block cgroup and a request queue into a + * pointer to the mq-deadline information associated with a (blkcg, queue) pair. 
+ */ +struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) +{ + struct blkg_policy_data *pd; + + pd = blkg_to_pd(bio->bi_blkg, &dd_blkcg_policy); + if (!pd) + return NULL; + + return dd_blkcg_from_pd(pd); +} + +static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +{ + static const char *const prio_class_name[] = { + [IOPRIO_CLASS_NONE] = "NONE", + [IOPRIO_CLASS_RT] = "RT", + [IOPRIO_CLASS_BE] = "BE", + [IOPRIO_CLASS_IDLE] = "IDLE", + }; + struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd); + int res = 0; + u8 prio; + + for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++) + res += scnprintf(buf + res, size - res, + " [%s] dispatched=%u inserted=%u merged=%u", + prio_class_name[prio], + ddcg_sum(blkcg, dispatched, prio) + + ddcg_sum(blkcg, merged, prio) - + ddcg_sum(blkcg, completed, prio), + ddcg_sum(blkcg, inserted, prio) - + ddcg_sum(blkcg, completed, prio), + ddcg_sum(blkcg, merged, prio)); + + return res; +} + +static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) +{ + struct dd_blkg *pd; + + pd = kzalloc(sizeof(*pd), gfp); + if (!pd) + return NULL; + return &pd->pd; +} + +static void dd_pd_free(struct blkg_policy_data *pd) +{ + struct dd_blkg *dd_blkg = container_of(pd, typeof(*dd_blkg), pd); + + kfree(dd_blkg); +} + +static struct blkcg_policy dd_blkcg_policy = { + .cpd_alloc_fn = dd_cpd_alloc, + .cpd_free_fn = dd_cpd_free, + + .pd_alloc_fn = dd_pd_alloc, + .pd_free_fn = dd_pd_free, + .pd_stat_fn = dd_pd_stat, +}; + +int dd_activate_policy(struct request_queue *q) +{ + return blkcg_activate_policy(q, &dd_blkcg_policy); +} + +void dd_deactivate_policy(struct request_queue *q) +{ + blkcg_deactivate_policy(q, &dd_blkcg_policy); +} + +int __init dd_blkcg_init(void) +{ + return blkcg_policy_register(&dd_blkcg_policy); +} + +void __exit dd_blkcg_exit(void) +{ + blkcg_policy_unregister(&dd_blkcg_policy); +} diff --git a/block/mq-deadline-cgroup.h b/block/mq-deadline-cgroup.h new file mode 100644 index 000000000000..0143fd74f3ce --- /dev/null +++ b/block/mq-deadline-cgroup.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#if !defined(_MQ_DEADLINE_CGROUP_H_) +#define _MQ_DEADLINE_CGROUP_H_ + +#include + +struct request_queue; + +/** + * struct io_stats_per_prio - I/O statistics per I/O priority class. + * @inserted: Number of inserted requests. + * @merged: Number of merged requests. + * @dispatched: Number of dispatched requests. + * @completed: Number of I/O completions. + */ +struct io_stats_per_prio { + local_t inserted; + local_t merged; + local_t dispatched; + local_t completed; +}; + +/* I/O statistics per I/O cgroup per I/O priority class (IOPRIO_CLASS_*). */ +struct blkcg_io_stats { + struct io_stats_per_prio stats[4]; +}; + +/** + * struct dd_blkcg - Per cgroup data. + * @cpd: blkcg_policy_data structure. + * @stats: I/O statistics. + */ +struct dd_blkcg { + struct blkcg_policy_data cpd; /* must be the first member */ + struct blkcg_io_stats __percpu *stats; +}; + +/* + * Count one event of type 'event_type' and with I/O priority class + * 'prio_class'. 
+ */ +#define ddcg_count(ddcg, event_type, prio_class) do { \ +if (ddcg) { \ + struct blkcg_io_stats *io_stats = get_cpu_ptr((ddcg)->stats); \ + \ + BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ + BUILD_BUG_ON(!__same_type((prio_class), u8)); \ + local_inc(&io_stats->stats[(prio_class)].event_type); \ + put_cpu_ptr(io_stats); \ +} \ +} while (0) + +/* + * Returns the total number of ddcg_count(ddcg, event_type, prio_class) calls + * across all CPUs. No locking or barriers since it is fine if the returned + * sum is slightly outdated. + */ +#define ddcg_sum(ddcg, event_type, prio) ({ \ + unsigned int cpu; \ + u32 sum = 0; \ + \ + BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \ + BUILD_BUG_ON(!__same_type((prio), u8)); \ + for_each_present_cpu(cpu) \ + sum += local_read(&per_cpu_ptr((ddcg)->stats, cpu)-> \ + stats[(prio)].event_type); \ + sum; \ +}) + +#ifdef CONFIG_BLK_CGROUP + +/** + * struct dd_blkg - Per (cgroup, request queue) data. + * @pd: blkg_policy_data structure. + */ +struct dd_blkg { + struct blkg_policy_data pd; /* must be the first member */ +}; + +struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio); +int dd_activate_policy(struct request_queue *q); +void dd_deactivate_policy(struct request_queue *q); +int __init dd_blkcg_init(void); +void __exit dd_blkcg_exit(void); + +#else /* CONFIG_BLK_CGROUP */ + +static inline struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio) +{ + return NULL; +} + +static inline int dd_activate_policy(struct request_queue *q) +{ + return 0; +} + +static inline void dd_deactivate_policy(struct request_queue *q) +{ +} + +static inline int dd_blkcg_init(void) +{ + return 0; +} + +static inline void dd_blkcg_exit(void) +{ +} + +#endif /* CONFIG_BLK_CGROUP */ + +#endif /* _MQ_DEADLINE_CGROUP_H_ */ diff --git a/block/mq-deadline.c b/block/mq-deadline-main.c similarity index 95% rename from block/mq-deadline.c rename to block/mq-deadline-main.c index 04d9d6b3745b..58a401ea8f56 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline-main.c @@ -25,6 +25,7 @@ #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" +#include "mq-deadline-cgroup.h" /* * See Documentation/block/deadline-iosched.rst @@ -51,14 +52,6 @@ enum dd_prio { enum { DD_PRIO_COUNT = 3 }; -/* I/O statistics per I/O priority. */ -struct io_stats_per_prio { - local_t inserted; - local_t merged; - local_t dispatched; - local_t completed; -}; - /* I/O statistics for all I/O priorities (enum dd_prio). */ struct io_stats { struct io_stats_per_prio stats[DD_PRIO_COUNT]; @@ -81,6 +74,9 @@ struct deadline_data { * run time data */ + /* Request queue that owns this data structure. */ + struct request_queue *queue; + struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. 
*/ @@ -232,8 +228,10 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_blkcg *blkcg = next->elv.priv[0]; dd_count(dd, merged, prio); + ddcg_count(blkcg, merged, ioprio_class); /* * if next expires before rq, assign its expire time to rq @@ -370,6 +368,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, { struct request *rq, *next_rq; enum dd_data_dir data_dir; + struct dd_blkcg *blkcg; enum dd_prio prio; u8 ioprio_class; @@ -464,6 +463,8 @@ done: ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, dispatched, prio); + blkcg = rq->elv.priv[0]; + ddcg_count(blkcg, dispatched, ioprio_class); /* * If the request needs its target zone locked, do it. */ @@ -540,6 +541,8 @@ static void dd_exit_sched(struct elevator_queue *e) struct deadline_data *dd = e->elevator_data; enum dd_prio prio; + dd_deactivate_policy(dd->queue); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -553,7 +556,7 @@ static void dd_exit_sched(struct elevator_queue *e) } /* - * initialize elevator private data (deadline_data). + * Initialize elevator private data (deadline_data) and associate with blkcg. */ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { @@ -562,6 +565,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) enum dd_prio prio; int ret = -ENOMEM; + /* + * Initialization would be very tricky if the queue is not frozen, + * hence the warning statement below. + */ + WARN_ON_ONCE(!percpu_ref_is_zero(&q->q_usage_counter)); + eq = elevator_alloc(q, e); if (!eq) return ret; @@ -577,6 +586,8 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) if (!dd->stats) goto free_dd; + dd->queue = q; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -595,9 +606,17 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); + ret = dd_activate_policy(q); + if (ret) + goto free_stats; + + ret = 0; q->elevator = eq; return 0; +free_stats: + free_percpu(dd->stats); + free_dd: kfree(dd); @@ -670,6 +689,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; + struct dd_blkcg *blkcg; lockdep_assert_held(&dd->lock); @@ -679,8 +699,19 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ blk_req_zone_write_unlock(rq); + /* + * If a block cgroup has been associated with the submitter and if an + * I/O priority has been set in the associated block cgroup, use the + * lowest of the cgroup priority and the request priority for the + * request. If no priority has been set in the request, use the cgroup + * priority. + */ prio = ioprio_class_to_prio[ioprio_class]; dd_count(dd, inserted, prio); + blkcg = dd_blkcg_from_bio(rq->bio); + ddcg_count(blkcg, inserted, ioprio_class); + WARN_ON_ONCE(rq->elv.priv[0]); + rq->elv.priv[0] = blkcg; if (blk_mq_sched_try_insert_merge(q, rq)) return; @@ -727,12 +758,10 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, spin_unlock(&dd->lock); } -/* - * Nothing to do here. 
This is defined only to ensure that .finish_request - * method is called upon request completion. - */ +/* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { + rq->elv.priv[0] = NULL; } /* @@ -755,11 +784,13 @@ static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; struct deadline_data *dd = q->elevator->elevator_data; + struct dd_blkcg *blkcg = rq->elv.priv[0]; const u8 ioprio_class = dd_rq_ioclass(rq); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; dd_count(dd, completed, prio); + ddcg_count(blkcg, completed, ioprio_class); if (blk_queue_is_zoned(q)) { unsigned long flags; @@ -1079,11 +1110,26 @@ MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { - return elv_register(&mq_deadline); + int ret; + + ret = elv_register(&mq_deadline); + if (ret) + goto out; + ret = dd_blkcg_init(); + if (ret) + goto unreg; + +out: + return ret; + +unreg: + elv_unregister(&mq_deadline); + goto out; } static void __exit deadline_exit(void) { + dd_blkcg_exit(); elv_unregister(&mq_deadline); } From fb926032b3209300f9dc454a36b8299582ae545c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Jun 2021 17:44:56 -0700 Subject: [PATCH 109/129] block/mq-deadline: Prioritize high-priority requests While one or more requests with a certain I/O priority are pending, do not dispatch lower priority requests. Dispatch lower priority requests anyway after the "aging" time has expired. This patch has been tested as follows: modprobe scsi_debug ndelay=1000000 max_queue=16 && sd='' && while [ -z "$sd" ]; do sd=/dev/$(basename /sys/bus/pseudo/drivers/scsi_debug/adapter*/host*/target*/*/block/*) done && echo $((100*1000)) > /sys/block/$sd/queue/iosched/aging_expire && cd /sys/fs/cgroup/blkio/ && echo $$ >cgroup.procs && echo restrict-to-be >blkio.prio.class && mkdir -p hipri && cd hipri && echo none-to-rt >blkio.prio.class && { max-iops -a1 -d32 -j1 -e mq-deadline $sd >& ~/low-pri.txt & } && echo $$ >cgroup.procs && max-iops -a1 -d32 -j1 -e mq-deadline $sd >& ~/hi-pri.txt Result: * 11000 IOPS for the high-priority job * 40 IOPS for the low-priority job If the aging expiry time is changed from 100s into 0, the IOPS results change into 6712 and 6796 IOPS. The max-iops script is a script that runs fio with the following arguments: --bs=4K --gtod_reduce=1 --ioengine=libaio --ioscheduler=${arg_e} --runtime=60 --norandommap --rw=read --thread --buffered=0 --numjobs=${arg_j} --iodepth=${arg_d} --iodepth_batch_submit=${arg_a} --iodepth_batch_complete=$((arg_d / 2)) --name=${positional_argument_1} --filename=${positional_argument_1} Reviewed-by: Damien Le Moal Cc: Hannes Reinecke Cc: Christoph Hellwig Cc: Ming Lei Cc: Johannes Thumshirn Cc: Himanshu Madhani Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210618004456.7280-17-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline-main.c | 42 +++++++++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 5 deletions(-) diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c index 58a401ea8f56..4815e536091f 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline-main.c @@ -32,6 +32,11 @@ */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! 
*/ +/* + * Time after which to dispatch lower priority requests even if higher + * priority requests are pending. + */ +static const int aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ @@ -94,6 +99,7 @@ struct deadline_data { int writes_starved; int front_merges; u32 async_depth; + int aging_expire; spinlock_t lock; spinlock_t zone_lock; @@ -361,10 +367,11 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, /* * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc + * read/write expire, fifo_batch, etc and with a start time <= @latest. */ static struct request *__dd_dispatch_request(struct deadline_data *dd, - struct dd_per_prio *per_prio) + struct dd_per_prio *per_prio, + u64 latest_start_ns) { struct request *rq, *next_rq; enum dd_data_dir data_dir; @@ -377,6 +384,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, if (!list_empty(&per_prio->dispatch)) { rq = list_first_entry(&per_prio->dispatch, struct request, queuelist); + if (rq->start_time_ns > latest_start_ns) + return NULL; list_del_init(&rq->queuelist); goto done; } @@ -454,6 +463,8 @@ dispatch_find_request: dd->batching = 0; dispatch_request: + if (rq->start_time_ns > latest_start_ns) + return NULL; /* * rq is the selected appropriate request. */ @@ -484,15 +495,32 @@ done: static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; - struct request *rq; + const u64 now_ns = ktime_get_ns(); + struct request *rq = NULL; enum dd_prio prio; spin_lock(&dd->lock); - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); + /* + * Start with dispatching requests whose deadline expired more than + * aging_expire jiffies ago. + */ + for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns - + jiffies_to_nsecs(dd->aging_expire)); if (rq) + goto unlock; + } + /* + * Next, dispatch requests in priority order. Ignore lower priority + * requests if any higher priority requests are pending. 
+ */ for (prio = 0; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now_ns); if (rq || dd_queued(dd, prio)) break; } + +unlock: spin_unlock(&dd->lock); return rq; @@ -603,6 +631,7 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; + dd->aging_expire = aging_expire; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); @@ -835,6 +864,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_JIFFIES(deadline_aging_expire_show, dd->aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_async_depth_show, dd->front_merges); @@ -864,6 +894,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_JIFFIES(deadline_aging_expire_store, &dd->aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_async_depth_store, &dd->front_merges, 1, INT_MAX); @@ -882,6 +913,7 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(front_merges), DD_ATTR(async_depth), DD_ATTR(fifo_batch), + DD_ATTR(aging_expire), __ATTR_NULL }; From 1d0903d61e9645c6330b94247b96dd873dfc11c8 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 19 Jun 2021 17:36:59 +0800 Subject: [PATCH 110/129] blk-wbt: introduce a new disable state to prevent false positive by rwb_enabled() We currently disable wbt by simply zeroing out rwb->wb_normal in wbt_disable_default() when switching the elevator to bfq, but this is not safe because rwb_enabled() can become a false positive again if the queue depth is changed. If it becomes a false positive between wbt_wait() and wbt_track() while a write request is being submitted, rqw->inflight is dropped to -1 in wbt_done(), which ends up triggering an IO hang. Fix this issue by introducing a new state that marks wbt as disabled.
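As a side note, the state model described above can be sketched in plain user-space C; the enum and field names below mirror the patch that follows, but the simplified check and the main() driver are illustrative assumptions, not kernel code:

/*
 * Minimal user-space sketch (not kernel code): if "disabled" is only encoded
 * by wb_normal == 0, recomputing wb_normal after a queue depth change quietly
 * re-enables throttling; a dedicated OFF state survives that recalculation.
 */
#include <stdbool.h>
#include <stdio.h>

enum wbt_enable_state {
	WBT_STATE_ON_DEFAULT = 1,	/* enabled by default */
	WBT_STATE_ON_MANUAL = 2,	/* enabled through sysfs */
	WBT_STATE_OFF_DEFAULT,		/* disabled by wbt_disable_default() */
};

struct rwb_sketch {
	enum wbt_enable_state enable_state;
	unsigned int wb_normal;		/* recomputed when the queue depth changes */
};

static bool rwb_enabled_sketch(const struct rwb_sketch *rwb)
{
	/* the explicit OFF state keeps the check false even if wb_normal != 0 */
	return rwb->enable_state != WBT_STATE_OFF_DEFAULT && rwb->wb_normal != 0;
}

int main(void)
{
	struct rwb_sketch rwb = { .enable_state = WBT_STATE_OFF_DEFAULT, .wb_normal = 0 };

	rwb.wb_normal = 8;	/* e.g. recalculated after a queue depth update */
	printf("enabled after depth change: %d\n", rwb_enabled_sketch(&rwb));	/* prints 0 */
	return 0;
}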
Fixes: a79050434b45 ("blk-rq-qos: refactor out common elements of blk-wbt") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20210619093700.920393-2-yi.zhang@huawei.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 5 +++-- block/blk-wbt.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index b363b0532704..0ce0883df3d6 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -77,7 +77,8 @@ enum { static inline bool rwb_enabled(struct rq_wb *rwb) { - return rwb && rwb->wb_normal != 0; + return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && + rwb->wb_normal != 0; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) @@ -701,7 +702,7 @@ void wbt_disable_default(struct request_queue *q) rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); - rwb->wb_normal = 0; + rwb->enable_state = WBT_STATE_OFF_DEFAULT; } } EXPORT_SYMBOL_GPL(wbt_disable_default); diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 16bdc85b8df9..2eb01becde8c 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -34,6 +34,7 @@ enum { enum { WBT_STATE_ON_DEFAULT = 1, WBT_STATE_ON_MANUAL = 2, + WBT_STATE_OFF_DEFAULT }; struct rq_wb { From 76a8040817b4b9c69b53f9b326987fa891b4082a Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 19 Jun 2021 17:37:00 +0800 Subject: [PATCH 111/129] blk-wbt: make sure throttle is enabled properly After commit a79050434b45 ("blk-rq-qos: refactor out common elements of blk-wbt"), if throttling was disabled by wbt_disable_default(), it could not be enabled again. Fix this by setting enable_state back to WBT_STATE_ON_DEFAULT. Fixes: a79050434b45 ("blk-rq-qos: refactor out common elements of blk-wbt") Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20210619093700.920393-3-yi.zhang@huawei.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0ce0883df3d6..3ed71b8da887 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -636,9 +636,13 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) void wbt_enable_default(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); + /* Throttling already enabled? */ - if (rqos) + if (rqos) { + if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) + RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; + } /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) From 511a2699237611b062df7798476bf3a1392910b9 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:42 +0200 Subject: [PATCH 112/129] block, bfq: let also stably merged queues enjoy weight raising Merged bfq_queues are kept out of weight-raising (low-latency) mechanisms. The reason is that these queues are usually created for non-interactive and non-soft-real-time tasks. Yet this is not the case for stably-merged queues. These queues are merged just because they are created shortly after each other. So they may easily serve the I/O of an interactive or soft-real-time application, if the application happens to spawn multiple processes. To address this issue, this commit lets stably-merged queues also enjoy weight raising.
Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-2-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index acd1f881273e..da2363f12e53 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1729,10 +1729,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->entity.new_weight == 40; *interactive = !in_burst && idle_for_long_time && bfqq->entity.new_weight == 40; + /* + * Merged bfq_queues are kept out of weight-raising + * (low-latency) mechanisms. The reason is that these queues + * are usually created for non-interactive and + * non-soft-real-time tasks. Yet this is not the case for + * stably-merged queues. These queues are merged just because + * they are created shortly after each other. So they may + * easily serve the I/O of an interactive or soft-real time + * application, if the application happens to spawn multiple + * processes. So let also stably-merged queued enjoy weight + * raising. + */ wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || (bfq_bfqq_sync(bfqq) && - bfqq->bic && (*interactive || soft_rt))); + (bfqq->bic || RQ_BIC(rq)->stably_merged) && + (*interactive || soft_rt))); /* * Using the last flag, update budget and check whether bfqq From e03f2ab78a4a673e4af23c3b855591c48b9de4d7 Mon Sep 17 00:00:00 2001 From: Luca Mariotti Date: Sat, 19 Jun 2021 16:09:43 +0200 Subject: [PATCH 113/129] block, bfq: fix delayed stable merge check When attempting to schedule a merge of a given bfq_queue with the currently in-service bfq_queue or with a cooperating bfq_queue among the scheduled bfq_queues, delayed stable merge is checked for rotational or non-queueing devs. For this stable merge to be performed, some conditions must be met. If the current bfq_queue underwent some split from some merged bfq_queue, one of these conditions is that two hundred milliseconds must elapse from split, otherwise this condition is always met. Unfortunately, by mistake, time_is_after_jiffies() was written instead of time_is_before_jiffies() for this check, verifying that less than two hundred milliseconds have elapsed instead of verifying that at least two hundred milliseconds have elapsed. Fix this issue by replacing time_is_after_jiffies() with time_is_before_jiffies(). 
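For reference, a small user-space sketch of the two helpers' semantics, assuming the usual wrap-safe comparison pattern that <linux/jiffies.h> uses; the current time is passed explicitly here instead of reading the jiffies counter, so this is an illustration only:

#include <stdio.h>

/* wrap-safe "a is after b" comparison, as used by the jiffies helpers */
#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before(a, b)	time_after(b, a)

/* time_is_before_jiffies(a): the instant a lies in the past, i.e. it has already elapsed */
#define time_is_before(now, a)	time_after(now, a)
/* time_is_after_jiffies(a): the instant a lies in the future, i.e. it has not been reached yet */
#define time_is_after(now, a)	time_before(now, a)

int main(void)
{
	unsigned long split_time = 1000;
	unsigned long now = split_time + 250;	/* 250 ticks after the split */
	unsigned long deadline = split_time + 200;

	printf("fixed check (at least 200 ticks elapsed): %d\n", time_is_before(now, deadline));	/* 1 */
	printf("buggy check (less than 200 ticks elapsed): %d\n", time_is_after(now, deadline));	/* 0 */
	return 0;
}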
Signed-off-by: Luca Mariotti Signed-off-by: Paolo Valente Signed-off-by: Pietro Pedroni Link: https://lore.kernel.org/r/20210619140948.98712-3-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index da2363f12e53..c5c0e74977d4 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2710,7 +2710,7 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (unlikely(!bfqd->nonrot_with_queueing)) { if (bic->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && - time_is_after_jiffies(bfqq->split_time + + time_is_before_jiffies(bfqq->split_time + msecs_to_jiffies(200))) { struct bfq_queue *stable_merge_bfqq = bic->stable_merge_bfqq; From d4f49983fa3944416c28379c35fbe10c68455ea4 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:44 +0200 Subject: [PATCH 114/129] block, bfq: consider also creation time in delayed stable merge Since commit 430a67f9d616 ("block, bfq: merge bursts of newly-created queues"), BFQ may schedule a merge between a newly created sync bfq_queue and the last sync bfq_queue created. Such a merge is not performed immediately, because BFQ first needs to find out whether the newly created queue actually reaches a higher throughput if not merged at all (and in that case BFQ will not perform any stable merging). To check that, a little time must pass after the creation of the new queue, so that some I/O can flow in the queue and statistics on such I/O can be computed. Yet, to evaluate the above waiting time, the last split time is taken as the start time, instead of the creation time of the queue. This is a mistake, because considering the split time is correct only in the following scenario. The queue undergoes a non-stable merge on the arrival of its very first I/O request, due to close I/O with some other queue. While the queue is merged for close I/O, stable merging is not considered. Yet the queue may then happen to be split, if the close I/O finishes (or happens to be a false positive). From this time on, the queue can again be considered for stable merging. But, again, a little time must elapse, to let some new I/O flow in the queue and to get updated statistics. To wait for this time, the split time is to be taken into account. Yet, if the queue does not undergo a non-stable merge on the arrival of its very first request, then BFQ immediately checks whether the stable merge is to be performed. This happens because the split time for a queue is initialized to minus infinity when the queue is created. This commit fixes this mistake by adding the missing condition. Now the check for delayed stable merge is performed after a little time has elapsed not only from the last queue split time, but also from the creation time of the queue.
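To make the scenario above concrete, here is a minimal user-space sketch with made-up tick values: a queue that was never split keeps its default, far-in-the-past split_time, so a check against split_time alone passes immediately, while the added creation-time condition enforces the intended waiting period:

#include <stdio.h>

#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_is_before(now, a)	time_after(now, a)	/* instant a has already passed */

int main(void)
{
	unsigned long wait = 200;			/* ticks to wait before a stable merge */
	unsigned long now = 10000;
	unsigned long creation_time = now;		/* queue created just now */
	unsigned long split_time = now - 10 * wait;	/* "minus infinity" style default */

	int old_check = time_is_before(now, split_time + wait);
	int new_check = old_check && time_is_before(now, creation_time + wait);

	printf("split-time check only: %d\n", old_check);	/* 1: would merge immediately */
	printf("with creation time:    %d\n", new_check);	/* 0: waits as intended */
	return 0;
}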
Fixes: 430a67f9d616 ("block, bfq: merge bursts of newly-created queues") Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-4-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index c5c0e74977d4..2a5c1a660f3b 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2711,7 +2711,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bic->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && time_is_before_jiffies(bfqq->split_time + - msecs_to_jiffies(200))) { + msecs_to_jiffies(200)) && + time_is_before_jiffies(bfqq->creation_time + + msecs_to_jiffies(200))) { struct bfq_queue *stable_merge_bfqq = bic->stable_merge_bfqq; int proc_ref = min(bfqq_process_refs(bfqq), From 7812472f973047a886e4ed9a91d98d6627dd746f Mon Sep 17 00:00:00 2001 From: Pietro Pedroni Date: Sat, 19 Jun 2021 16:09:45 +0200 Subject: [PATCH 115/129] block, bfq: boost throughput by extending queue-merging times One of the methods with which bfq boosts throughput is by merging queues. One of the merging variants in bfq is the stable merge. This mechanism is activated between two queues only if they are created within a certain maximum time T1 of each other. Merging can happen soon or be delayed. In the second case, before merging, bfq needs to evaluate a throughput-boost parameter that indicates whether the queue generates a high throughput if served alone. Merging occurs when this throughput boost is not high enough. In particular, this parameter is evaluated, and late merging may occur, only after at least a time T2 has passed since the creation of the queue. Currently T1 and T2 are set to 180ms and 200ms, respectively. With these values the merging mechanism rarely triggers, because there is not enough time. This results in a noticeable lowering of the overall throughput with some workloads (see the example below). This commit introduces two constants, bfq_activation_stable_merging and bfq_late_stable_merging, in order to increase the duration of T1 and T2. Both the stable-merging activation time and the late merging time are set to 600ms. This value has been experimentally evaluated using the sqlite benchmark in the Phoronix Test Suite on an HDD. The duration of the benchmark before this fix was 111.02s, while now it has dropped to 97.02s, a better result than that of all the other schedulers.
Signed-off-by: Pietro Pedroni Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-5-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 2a5c1a660f3b..98a42ddb1760 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -364,6 +364,16 @@ static int ref_wr_duration[2]; */ static const unsigned long max_service_from_wr = 120000; +/* + * Maximum time between the creation of two queues, for stable merge + * to be activated (in ms) + */ +static const unsigned long bfq_activation_stable_merging = 600; +/* + * Minimum time to be waited before evaluating delayed stable merge (in ms) + */ +static const unsigned long bfq_late_stable_merging = 600; + #define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) @@ -2711,9 +2721,9 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bic->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && time_is_before_jiffies(bfqq->split_time + - msecs_to_jiffies(200)) && + msecs_to_jiffies(bfq_late_stable_merging)) && time_is_before_jiffies(bfqq->creation_time + - msecs_to_jiffies(200))) { + msecs_to_jiffies(bfq_late_stable_merging))) { struct bfq_queue *stable_merge_bfqq = bic->stable_merge_bfqq; int proc_ref = min(bfqq_process_refs(bfqq), @@ -5494,7 +5504,7 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, */ if (!last_bfqq_created || time_before(last_bfqq_created->creation_time + - bfqd->bfq_burst_interval, + msecs_to_jiffies(bfq_activation_stable_merging), bfqq->creation_time) || bfqq->entity.parent != last_bfqq_created->entity.parent || bfqq->ioprio != last_bfqq_created->ioprio || From bd3664b362381c4c1473753ebedf0ab242a60d1d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:46 +0200 Subject: [PATCH 116/129] block, bfq: avoid delayed merge of async queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit 430a67f9d616 ("block, bfq: merge bursts of newly-created queues"), BFQ may schedule a merge between a newly created sync bfq_queue, say Q2, and the last sync bfq_queue created, say Q1. To this goal, BFQ stores the address of Q1 in the field bic->stable_merge_bfqq of the bic associated with Q2. So, when the time for the possible merge arrives, BFQ knows which bfq_queue to merge Q2 with. In particular, BFQ checks for possible merges on request arrivals. Yet the same bic may also be associated with an async bfq_queue, say Q3. So, if a request for Q3 arrives, then the above check may happen to be executed while the bfq_queue at hand is Q3, instead of Q2. In this case, Q1 happens to be merged with an async bfq_queue. This is not only a conceptual mistake, because async queues are to be kept out of queue merging, but also a bug that leads to inconsistent states. This commits simply filters async queues out of delayed merges. 
Fixes: 430a67f9d616 ("block, bfq: merge bursts of newly-created queues") Tested-by: Holger Hoffstätte Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-6-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 98a42ddb1760..7bf073ef9443 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2718,7 +2718,13 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * costly and complicated. */ if (unlikely(!bfqd->nonrot_with_queueing)) { - if (bic->stable_merge_bfqq && + /* + * Make sure also that bfqq is sync, because + * bic->stable_merge_bfqq may point to some queue (for + * stable merging) also if bic is associated with a + * sync queue, but this bfqq is async + */ + if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq && !bfq_bfqq_just_created(bfqq) && time_is_before_jiffies(bfqq->split_time + msecs_to_jiffies(bfq_late_stable_merging)) && From efc72524b3a9e4e7bc7c07f756528736409ec1b7 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:47 +0200 Subject: [PATCH 117/129] block, bfq: check waker only for queues with no in-flight I/O Consider two bfq_queues, say Q1 and Q2, with Q2 empty. If a request of Q1 gets completed shortly before a new request arrives for Q2, then BFQ flags Q1 as a candidate waker for Q2. Yet, the arrival of this new request may have a different cause, in the following case. If also Q2 has requests in flight while waiting for the arrival of a new request, then the completion of its own requests may be the actual cause of the awakening of the process that sends I/O to Q2. So Q1 may be flagged wrongly as a candidate waker. This commit avoids this deceptive flagging, by disabling candidate-waker flagging for Q2, if Q2 has in-flight I/O. Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-7-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 7bf073ef9443..a273b2bcea2a 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1985,14 +1985,18 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) * Turning back to the detection of a waker queue, a queue Q is deemed * as a waker queue for bfqq if, for three consecutive times, bfqq * happens to become non empty right after a request of Q has been - * completed. In particular, on the first time, Q is tentatively set - * as a candidate waker queue, while on the third consecutive time - * that Q is detected, the field waker_bfqq is set to Q, to confirm - * that Q is a waker queue for bfqq. These detection steps are - * performed only if bfqq has a long think time, so as to make it more - * likely that bfqq's I/O is actually being blocked by a - * synchronization. This last filter, plus the above three-times - * requirement, make false positives less likely. + * completed. In this respect, even if bfqq is empty, we do not check + * for a waker if it still has some in-flight I/O. In fact, in this + * case bfqq is actually still being served by the drive, and may + * receive new I/O on the completion of some of the in-flight + * requests. 
In particular, on the first time, Q is tentatively set as + * a candidate waker queue, while on the third consecutive time that Q + * is detected, the field waker_bfqq is set to Q, to confirm that Q is + * a waker queue for bfqq. These detection steps are performed only if + * bfqq has a long think time, so as to make it more likely that + * bfqq's I/O is actually being blocked by a synchronization. This + * last filter, plus the above three-times requirement, make false + * positives less likely. * * NOTE * @@ -2018,6 +2022,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || + bfqq->dispatched > 0 || now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) return; From 9a2ac41b13c573703d6689f51f3e27dd658324be Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 19 Jun 2021 16:09:48 +0200 Subject: [PATCH 118/129] block, bfq: reset waker pointer with shared queues Commit 85686d0dc194 ("block, bfq: keep shared queues out of the waker mechanism") leaves shared bfq_queues out of the waker-detection mechanism. It attains this goal by not updating the pointer last_completed_rq_bfqq, if the last request completed belongs to a shared bfq_queue (so that the pointer will not point to the shared bfq_queue). Yet this has a side effect: the pointer last_completed_rq_bfqq keeps pointing, deceptively, to a bfq_queue that actually is not the last one to have had a request completed. As a consequence, such a bfq_queue may deceptively be considered as a waker of some bfq_queue, even of some shared bfq_queue. To address this issue, reset last_completed_rq_bfqq if the last request completed belongs to a shared queue. Fixes: 85686d0dc194 ("block, bfq: keep shared queues out of the waker mechanism") Signed-off-by: Paolo Valente Link: https://lore.kernel.org/r/20210619140948.98712-8-paolo.valente@linaro.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index a273b2bcea2a..fedb0a8fd388 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -6165,11 +6165,13 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) * of other queues. But a false waker will unjustly steal * bandwidth to its supposedly woken queue. So considering * also shared queues in the waking mechanism may cause more - * control troubles than throughput benefits. Then do not set - * last_completed_rq_bfqq to bfqq if bfqq is a shared queue. + * control troubles than throughput benefits. Then reset + * last_completed_rq_bfqq if bfqq is a shared queue. */ if (!bfq_bfqq_coop(bfqq)) bfqd->last_completed_rq_bfqq = bfqq; + else + bfqd->last_completed_rq_bfqq = NULL; /* * If we are waiting to discover whether the request pattern From ddcc5c544eb0991501761622b651cf43ce660a22 Mon Sep 17 00:00:00 2001 From: Thomas Bracht Laumann Jespersen Date: Sat, 19 Jun 2021 21:51:31 +0200 Subject: [PATCH 119/129] block/partitions/msdos: Fix typo inidicator -> indicator Just a fix for a small typo in msdos_partition(). 
Signed-off-by: Thomas Bracht Laumann Jespersen Link: https://lore.kernel.org/r/20210619195130.19348-1-t@laumann.xyz Signed-off-by: Jens Axboe --- block/partitions/msdos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 8f2fcc080264..63e4f6f8b6e9 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -622,7 +622,7 @@ int msdos_partition(struct parsed_partitions *state) for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* - * Even without a valid boot inidicator value + * Even without a valid boot indicator value * its still possible this is valid FAT filesystem * without a partition table. */ From 60b6a7e6a0f4382cd689f9afdac816964fec2921 Mon Sep 17 00:00:00 2001 From: Edward Hsieh Date: Thu, 24 Jun 2021 20:30:30 +0800 Subject: [PATCH 120/129] block: fix trace completion for chained bio For chained bio, trace_block_bio_complete in bio_endio is currently called only by the parent bio once upon all chained bio completed. However, the sector and size for the parent bio are modified in bio_split. Therefore, the size and sector of the complete events might not match the queue events in blktrace. The original fix of bio completion trace ("block: trace completion of all bios.") wants multiple complete events to correspond to one queue event but missed this. The issue can be reproduced by md/raid5 read with bio cross chunks. To fix, move trace completion into the loop for every chained bio to call. Fixes: fbbaf700e7b1 ("block: trace completion of all bios.") Reviewed-by: Wade Liang Reviewed-by: BingJing Chang Signed-off-by: Edward Hsieh Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210624123030.27014-1-edwardh@synology.com Signed-off-by: Jens Axboe --- block/bio.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/bio.c b/block/bio.c index 44205dfb6b60..1fab762e079b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1375,8 +1375,7 @@ static inline bool bio_remaining_done(struct bio *bio) * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the - * last time. At this point the BLK_TA_COMPLETE tracing event will be - * generated if BIO_TRACE_COMPLETION is set. + * last time. **/ void bio_endio(struct bio *bio) { @@ -1389,6 +1388,11 @@ again: if (bio->bi_bdev) rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + } + /* * Need to have a real endio function for chained bios, otherwise * various corner cases will break (like stacking block devices that @@ -1402,11 +1406,6 @@ again: goto again; } - if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); - bio_clear_flag(bio, BIO_TRACE_COMPLETION); - } - blk_throtl_bio_endio(bio); /* release cgroup info */ bio_uninit(bio); From d5870edfa3afc4608231267ea3b8e4beb3eab1ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 09:38:42 +0200 Subject: [PATCH 121/129] block: move the disk events code to a separate file Move the code for handling disk events from genhd.c into a new file as it isn't very related to the rest of the file while at the same time requiring lots of forward declarations. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210624073843.251178-2-hch@lst.de Signed-off-by: Jens Axboe --- block/Makefile | 3 +- block/blk.h | 5 + block/disk-events.c | 484 +++++++++++++++++++++++++++++++++++++++++++ block/genhd.c | 492 -------------------------------------------- 4 files changed, 491 insertions(+), 493 deletions(-) create mode 100644 block/disk-events.c diff --git a/block/Makefile b/block/Makefile index b9db5d4edfc8..bfbe4e13ca1e 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ - genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o + genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ + disk-events.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o diff --git a/block/blk.h b/block/blk.h index d3fa47af3607..f8d726429906 100644 --- a/block/blk.h +++ b/block/blk.h @@ -360,4 +360,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct request_queue *blk_alloc_queue(int node_id); +void disk_alloc_events(struct gendisk *disk); +void disk_add_events(struct gendisk *disk); +void disk_del_events(struct gendisk *disk); +void disk_release_events(struct gendisk *disk); + #endif /* BLK_INTERNAL_H */ diff --git a/block/disk-events.c b/block/disk-events.c new file mode 100644 index 000000000000..1bc5dcb75e4e --- /dev/null +++ b/block/disk-events.c @@ -0,0 +1,484 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Disk events - monitor disk events like media change and eject request. + */ +#include +#include +#include +#include "blk.h" + +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + struct mutex block_mutex; /* protects blocking */ + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll if the POLL flag is set. + */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->event_flags & DISK_EVENT_FLAG_POLL) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). 
Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + if (!ev) + return; + + /* + * Outer mutex ensures that the first blocker completes canceling + * the event work before further blockers are allowed to finish. + */ + mutex_lock(&ev->block_mutex); + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) + cancel_delayed_work_sync(&disk->ev->dwork); + + mutex_unlock(&ev->block_mutex); +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + intv = disk_events_poll_jiffies(disk); + if (check_now) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, false); +} + +/** + * disk_flush_events - schedule immediate event checking and flushing + * @disk: disk to check and flush events for + * @mask: events to flush + * + * Schedule immediate event checking on @disk if not blocked. Events in + * @mask are scheduled to be cleared from the driver. Note that this + * doesn't clear the events from @disk->ev. + * + * CONTEXT: + * If @mask is non-zero must be called with disk->open_mutex held. + */ +void disk_flush_events(struct gendisk *disk, unsigned int mask) +{ + struct disk_events *ev = disk->ev; + + if (!ev) + return; + + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + if (!ev->block) + mod_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + spin_unlock_irq(&ev->lock); +} + +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr) +{ + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = *clearing_ptr; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + *clearing_ptr &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* + * Tell userland about new events. Only the events listed in + * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT + * is set. Otherwise, events are processed internally but never + * get reported to userland. 
+ */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if ((events & disk->events & (1 << i)) && + (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and cleared + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. + */ +static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + struct disk_events *ev = disk->ev; + unsigned int pending; + unsigned int clearing = mask; + + if (!ev) + return 0; + + disk_block_events(disk); + + /* + * store the union of mask and ev->clearing on the stack so that the + * race with disk_flush_events does not cause ambiguity (ev->clearing + * can still be modified even if events are blocked). + */ + spin_lock_irq(&ev->lock); + clearing |= ev->clearing; + ev->clearing = 0; + spin_unlock_irq(&ev->lock); + + disk_check_events(ev, &clearing); + /* + * if ev->clearing is not 0, the disk_flush_events got called in the + * middle of this function, so we want to run the workfn without delay. + */ + __disk_unblock_events(disk, ev->clearing ? true : false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + WARN_ON_ONCE(clearing & mask); + + return pending; +} + +/** + * bdev_check_media_change - check if a removable media has been changed + * @bdev: block device to check + * + * Check whether a removable media has been changed, and attempt to free all + * dentries and inodes and invalidates all block device page cache entries in + * that case. + * + * Returns %true if the block device changed, or %false if not. + */ +bool bdev_check_media_change(struct block_device *bdev) +{ + unsigned int events; + + events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return false; + + if (__invalidate_device(bdev, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + bdev->bd_disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + return true; +} +EXPORT_SYMBOL(bdev_check_media_change); + +/* + * Separate this part out so that a different pointer for clearing_ptr can be + * passed in for disk_clear_events. + */ +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + + disk_check_events(ev, &ev->clearing); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. 
+ * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * (always empty, only for backwards compatibility) + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) + return 0; + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->ev) + return sprintf(buf, "-1\n"); + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + if (!disk->ev) + return -ENODEV; + + disk_block_events(disk); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + return count; +} + +static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, 0644, + disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +static const struct attribute *disk_events_attrs[] = { + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + NULL, +}; + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/parameters/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + list_for_each_entry(ev, &disk_events, node) + disk_flush_events(ev->disk, 0); + mutex_unlock(&disk_events_mutex); + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 
+ */ +void disk_alloc_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events || !disk->events) + return; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return; + } + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + mutex_init(&ev->block_mutex); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + disk->ev = ev; +} + +void disk_add_events(struct gendisk *disk) +{ + /* FIXME: error handling */ + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + + if (!disk->ev) + return; + + mutex_lock(&disk_events_mutex); + list_add_tail(&disk->ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +void disk_del_events(struct gendisk *disk) +{ + if (disk->ev) { + disk_block_events(disk); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + } + + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); +} + +void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} diff --git a/block/genhd.c b/block/genhd.c index 5f5628216295..4f879deede9a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -33,13 +33,6 @@ static struct kobject *block_depr; #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); -static void disk_check_events(struct disk_events *ev, - unsigned int *clearing_ptr); -static void disk_alloc_events(struct gendisk *disk); -static void disk_add_events(struct gendisk *disk); -static void disk_del_events(struct gendisk *disk); -static void disk_release_events(struct gendisk *disk); - void set_capacity(struct gendisk *disk, sector_t sectors) { struct block_device *bdev = disk->part0; @@ -1367,488 +1360,3 @@ int bdev_read_only(struct block_device *bdev) return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); } EXPORT_SYMBOL(bdev_read_only); - -/* - * Disk events - monitor disk events like media change and eject request. - */ -struct disk_events { - struct list_head node; /* all disk_event's */ - struct gendisk *disk; /* the associated disk */ - spinlock_t lock; - - struct mutex block_mutex; /* protects blocking */ - int block; /* event blocking depth */ - unsigned int pending; /* events already sent out */ - unsigned int clearing; /* events being cleared */ - - long poll_msecs; /* interval, -1 for default */ - struct delayed_work dwork; -}; - -static const char *disk_events_strs[] = { - [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", - [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", -}; - -static char *disk_uevents[] = { - [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", - [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", -}; - -/* list of all disk_events */ -static DEFINE_MUTEX(disk_events_mutex); -static LIST_HEAD(disk_events); - -/* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs; - -static unsigned long disk_events_poll_jiffies(struct gendisk *disk) -{ - struct disk_events *ev = disk->ev; - long intv_msecs = 0; - - /* - * If device-specific poll interval is set, always use it. 
If - * the default is being used, poll if the POLL flag is set. - */ - if (ev->poll_msecs >= 0) - intv_msecs = ev->poll_msecs; - else if (disk->event_flags & DISK_EVENT_FLAG_POLL) - intv_msecs = disk_events_dfl_poll_msecs; - - return msecs_to_jiffies(intv_msecs); -} - -/** - * disk_block_events - block and flush disk event checking - * @disk: disk to block events for - * - * On return from this function, it is guaranteed that event checking - * isn't in progress and won't happen until unblocked by - * disk_unblock_events(). Events blocking is counted and the actual - * unblocking happens after the matching number of unblocks are done. - * - * Note that this intentionally does not block event checking from - * disk_clear_events(). - * - * CONTEXT: - * Might sleep. - */ -void disk_block_events(struct gendisk *disk) -{ - struct disk_events *ev = disk->ev; - unsigned long flags; - bool cancel; - - if (!ev) - return; - - /* - * Outer mutex ensures that the first blocker completes canceling - * the event work before further blockers are allowed to finish. - */ - mutex_lock(&ev->block_mutex); - - spin_lock_irqsave(&ev->lock, flags); - cancel = !ev->block++; - spin_unlock_irqrestore(&ev->lock, flags); - - if (cancel) - cancel_delayed_work_sync(&disk->ev->dwork); - - mutex_unlock(&ev->block_mutex); -} - -static void __disk_unblock_events(struct gendisk *disk, bool check_now) -{ - struct disk_events *ev = disk->ev; - unsigned long intv; - unsigned long flags; - - spin_lock_irqsave(&ev->lock, flags); - - if (WARN_ON_ONCE(ev->block <= 0)) - goto out_unlock; - - if (--ev->block) - goto out_unlock; - - intv = disk_events_poll_jiffies(disk); - if (check_now) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, 0); - else if (intv) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, intv); -out_unlock: - spin_unlock_irqrestore(&ev->lock, flags); -} - -/** - * disk_unblock_events - unblock disk event checking - * @disk: disk to unblock events for - * - * Undo disk_block_events(). When the block count reaches zero, it - * starts events polling if configured. - * - * CONTEXT: - * Don't care. Safe to call from irq context. - */ -void disk_unblock_events(struct gendisk *disk) -{ - if (disk->ev) - __disk_unblock_events(disk, false); -} - -/** - * disk_flush_events - schedule immediate event checking and flushing - * @disk: disk to check and flush events for - * @mask: events to flush - * - * Schedule immediate event checking on @disk if not blocked. Events in - * @mask are scheduled to be cleared from the driver. Note that this - * doesn't clear the events from @disk->ev. - * - * CONTEXT: - * If @mask is non-zero must be called with disk->open_mutex held. - */ -void disk_flush_events(struct gendisk *disk, unsigned int mask) -{ - struct disk_events *ev = disk->ev; - - if (!ev) - return; - - spin_lock_irq(&ev->lock); - ev->clearing |= mask; - if (!ev->block) - mod_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, 0); - spin_unlock_irq(&ev->lock); -} - -/** - * disk_clear_events - synchronously check, clear and return pending events - * @disk: disk to fetch and clear events from - * @mask: mask of events to be fetched and cleared - * - * Disk events are synchronously checked and pending events in @mask - * are cleared and returned. This ignores the block count. - * - * CONTEXT: - * Might sleep. 
- */ -static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) -{ - struct disk_events *ev = disk->ev; - unsigned int pending; - unsigned int clearing = mask; - - if (!ev) - return 0; - - disk_block_events(disk); - - /* - * store the union of mask and ev->clearing on the stack so that the - * race with disk_flush_events does not cause ambiguity (ev->clearing - * can still be modified even if events are blocked). - */ - spin_lock_irq(&ev->lock); - clearing |= ev->clearing; - ev->clearing = 0; - spin_unlock_irq(&ev->lock); - - disk_check_events(ev, &clearing); - /* - * if ev->clearing is not 0, the disk_flush_events got called in the - * middle of this function, so we want to run the workfn without delay. - */ - __disk_unblock_events(disk, ev->clearing ? true : false); - - /* then, fetch and clear pending events */ - spin_lock_irq(&ev->lock); - pending = ev->pending & mask; - ev->pending &= ~mask; - spin_unlock_irq(&ev->lock); - WARN_ON_ONCE(clearing & mask); - - return pending; -} - -/** - * bdev_check_media_change - check if a removable media has been changed - * @bdev: block device to check - * - * Check whether a removable media has been changed, and attempt to free all - * dentries and inodes and invalidates all block device page cache entries in - * that case. - * - * Returns %true if the block device changed, or %false if not. - */ -bool bdev_check_media_change(struct block_device *bdev) -{ - unsigned int events; - - events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | - DISK_EVENT_EJECT_REQUEST); - if (!(events & DISK_EVENT_MEDIA_CHANGE)) - return false; - - if (__invalidate_device(bdev, true)) - pr_warn("VFS: busy inodes on changed media %s\n", - bdev->bd_disk->disk_name); - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - return true; -} -EXPORT_SYMBOL(bdev_check_media_change); - -/* - * Separate this part out so that a different pointer for clearing_ptr can be - * passed in for disk_clear_events. - */ -static void disk_events_workfn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct disk_events *ev = container_of(dwork, struct disk_events, dwork); - - disk_check_events(ev, &ev->clearing); -} - -static void disk_check_events(struct disk_events *ev, - unsigned int *clearing_ptr) -{ - struct gendisk *disk = ev->disk; - char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; - unsigned int clearing = *clearing_ptr; - unsigned int events; - unsigned long intv; - int nr_events = 0, i; - - /* check events */ - events = disk->fops->check_events(disk, clearing); - - /* accumulate pending events and schedule next poll if necessary */ - spin_lock_irq(&ev->lock); - - events &= ~ev->pending; - ev->pending |= events; - *clearing_ptr &= ~clearing; - - intv = disk_events_poll_jiffies(disk); - if (!ev->block && intv) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, intv); - - spin_unlock_irq(&ev->lock); - - /* - * Tell userland about new events. Only the events listed in - * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT - * is set. Otherwise, events are processed internally but never - * get reported to userland. 
- */ - for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if ((events & disk->events & (1 << i)) && - (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - envp[nr_events++] = disk_uevents[i]; - - if (nr_events) - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); -} - -/* - * A disk events enabled device has the following sysfs nodes under - * its /sys/block/X/ directory. - * - * events : list of all supported events - * events_async : list of events which can be detected w/o polling - * (always empty, only for backwards compatibility) - * events_poll_msecs : polling interval, 0: disable, -1: system default - */ -static ssize_t __disk_events_show(unsigned int events, char *buf) -{ - const char *delim = ""; - ssize_t pos = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) - if (events & (1 << i)) { - pos += sprintf(buf + pos, "%s%s", - delim, disk_events_strs[i]); - delim = " "; - } - if (pos) - pos += sprintf(buf + pos, "\n"); - return pos; -} - -static ssize_t disk_events_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - return 0; - - return __disk_events_show(disk->events, buf); -} - -static ssize_t disk_events_async_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return 0; -} - -static ssize_t disk_events_poll_msecs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (!disk->ev) - return sprintf(buf, "-1\n"); - - return sprintf(buf, "%ld\n", disk->ev->poll_msecs); -} - -static ssize_t disk_events_poll_msecs_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct gendisk *disk = dev_to_disk(dev); - long intv; - - if (!count || !sscanf(buf, "%ld", &intv)) - return -EINVAL; - - if (intv < 0 && intv != -1) - return -EINVAL; - - if (!disk->ev) - return -ENODEV; - - disk_block_events(disk); - disk->ev->poll_msecs = intv; - __disk_unblock_events(disk, true); - - return count; -} - -static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); -static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); -static const DEVICE_ATTR(events_poll_msecs, 0644, - disk_events_poll_msecs_show, - disk_events_poll_msecs_store); - -static const struct attribute *disk_events_attrs[] = { - &dev_attr_events.attr, - &dev_attr_events_async.attr, - &dev_attr_events_poll_msecs.attr, - NULL, -}; - -/* - * The default polling interval can be specified by the kernel - * parameter block.events_dfl_poll_msecs which defaults to 0 - * (disable). This can also be modified runtime by writing to - * /sys/module/block/parameters/events_dfl_poll_msecs. - */ -static int disk_events_set_dfl_poll_msecs(const char *val, - const struct kernel_param *kp) -{ - struct disk_events *ev; - int ret; - - ret = param_set_ulong(val, kp); - if (ret < 0) - return ret; - - mutex_lock(&disk_events_mutex); - - list_for_each_entry(ev, &disk_events, node) - disk_flush_events(ev->disk, 0); - - mutex_unlock(&disk_events_mutex); - - return 0; -} - -static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { - .set = disk_events_set_dfl_poll_msecs, - .get = param_get_ulong, -}; - -#undef MODULE_PARAM_PREFIX -#define MODULE_PARAM_PREFIX "block." 
- -module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, - &disk_events_dfl_poll_msecs, 0644); - -/* - * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. - */ -static void disk_alloc_events(struct gendisk *disk) -{ - struct disk_events *ev; - - if (!disk->fops->check_events || !disk->events) - return; - - ev = kzalloc(sizeof(*ev), GFP_KERNEL); - if (!ev) { - pr_warn("%s: failed to initialize events\n", disk->disk_name); - return; - } - - INIT_LIST_HEAD(&ev->node); - ev->disk = disk; - spin_lock_init(&ev->lock); - mutex_init(&ev->block_mutex); - ev->block = 1; - ev->poll_msecs = -1; - INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); - - disk->ev = ev; -} - -static void disk_add_events(struct gendisk *disk) -{ - /* FIXME: error handling */ - if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) - pr_warn("%s: failed to create sysfs files for events\n", - disk->disk_name); - - if (!disk->ev) - return; - - mutex_lock(&disk_events_mutex); - list_add_tail(&disk->ev->node, &disk_events); - mutex_unlock(&disk_events_mutex); - - /* - * Block count is initialized to 1 and the following initial - * unblock kicks it into action. - */ - __disk_unblock_events(disk, true); -} - -static void disk_del_events(struct gendisk *disk) -{ - if (disk->ev) { - disk_block_events(disk); - - mutex_lock(&disk_events_mutex); - list_del_init(&disk->ev->node); - mutex_unlock(&disk_events_mutex); - } - - sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); -} - -static void disk_release_events(struct gendisk *disk) -{ - /* the block count should be 1 from disk_del_events() */ - WARN_ON_ONCE(disk->ev && disk->ev->block != 1); - kfree(disk->ev); -} From 2bc8cda5ea4b42ff78be1b11011092d57b424d37 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 09:38:43 +0200 Subject: [PATCH 122/129] block: add the events* attributes to disk_attrs Add the events attributes to the disk_attrs array, which ensures they are added by the driver core when the device is created rather than adding them after the device has been added, which is racy versus uevents and requires more boilerplate code. 
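The race mentioned in the commit message above comes from creating sysfs files after device_add() has already emitted the KOBJ_ADD uevent: udev can process the event before the files exist. Listing the attributes in the device's attribute groups instead lets the driver core create them during registration, before the uevent fires. The sketch below shows the general driver-model pattern with a made-up "demo" attribute; it is illustrative only and is not the gendisk-specific code changed by this patch.

/* Hedged sketch of the attribute-group pattern; "demo" is a made-up name. */
#include <linux/device.h>
#include <linux/sysfs.h>

static ssize_t demo_show(struct device *dev, struct device_attribute *attr,
			 char *buf)
{
	return sprintf(buf, "42\n");
}
static DEVICE_ATTR_RO(demo);

static struct attribute *demo_attrs[] = {
	&dev_attr_demo.attr,
	NULL,
};
ATTRIBUTE_GROUPS(demo);

/*
 * Racy variant of the kind this series removes:
 *
 *	device_add(dev);                          // KOBJ_ADD uevent fires here
 *	device_create_file(dev, &dev_attr_demo);  // file only appears now
 *
 * Race-free variant: point dev->groups at demo_groups (or, for a gendisk,
 * list the attribute in disk_attrs[]) before the device is added, so the
 * core creates the file as part of device_add().
 */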
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20210624073843.251178-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 3 +++ block/disk-events.c | 23 ++++------------------- block/genhd.c | 3 +++ 3 files changed, 10 insertions(+), 19 deletions(-) diff --git a/block/blk.h b/block/blk.h index f8d726429906..4fcd7a032377 100644 --- a/block/blk.h +++ b/block/blk.h @@ -364,5 +364,8 @@ void disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); void disk_del_events(struct gendisk *disk); void disk_release_events(struct gendisk *disk); +extern struct device_attribute dev_attr_events; +extern struct device_attribute dev_attr_events_async; +extern struct device_attribute dev_attr_events_poll_msecs; #endif /* BLK_INTERNAL_H */ diff --git a/block/disk-events.c b/block/disk-events.c index 1bc5dcb75e4e..a75931ff5da4 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -368,18 +368,10 @@ static ssize_t disk_events_poll_msecs_store(struct device *dev, return count; } -static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); -static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); -static const DEVICE_ATTR(events_poll_msecs, 0644, - disk_events_poll_msecs_show, - disk_events_poll_msecs_store); - -static const struct attribute *disk_events_attrs[] = { - &dev_attr_events.attr, - &dev_attr_events_async.attr, - &dev_attr_events_poll_msecs.attr, - NULL, -}; +DEVICE_ATTR(events, 0444, disk_events_show, NULL); +DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); +DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show, + disk_events_poll_msecs_store); /* * The default polling interval can be specified by the kernel @@ -444,11 +436,6 @@ void disk_alloc_events(struct gendisk *disk) void disk_add_events(struct gendisk *disk) { - /* FIXME: error handling */ - if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) - pr_warn("%s: failed to create sysfs files for events\n", - disk->disk_name); - if (!disk->ev) return; @@ -472,8 +459,6 @@ void disk_del_events(struct gendisk *disk) list_del_init(&disk->ev->node); mutex_unlock(&disk_events_mutex); } - - sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); } void disk_release_events(struct gendisk *disk) diff --git a/block/genhd.c b/block/genhd.c index 4f879deede9a..79aa40b4c39c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1022,6 +1022,9 @@ static struct attribute *disk_attrs[] = { &dev_attr_stat.attr, &dev_attr_inflight.attr, &dev_attr_badblocks.attr, + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif From 630161cfdf5cdc696a82b59410d1ff00b23d946e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 14:32:39 +0200 Subject: [PATCH 123/129] block: move bdev_disk_changed Move bdev_disk_changed to block/partitions/core.c, together with the rest of the partition scanning code. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210624123240.441814-2-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 55 ++++++++++++++++++++++++++++++++++++++++- fs/block_dev.c | 53 --------------------------------------- include/linux/genhd.h | 1 - 3 files changed, 54 insertions(+), 55 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 186d4fbd9f09..b79785f7027c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -596,7 +596,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, return true; } -int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) +static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) { struct parsed_partitions *state; int ret = -EAGAIN, p; @@ -657,6 +657,59 @@ out_free_state: return ret; } +int bdev_disk_changed(struct block_device *bdev, bool invalidate) +{ + struct gendisk *disk = bdev->bd_disk; + int ret = 0; + + lockdep_assert_held(&disk->open_mutex); + + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + +rescan: + if (disk->open_partitions) + return -EBUSY; + sync_blockdev(bdev); + invalidate_bdev(bdev); + blk_drop_partitions(disk); + + clear_bit(GD_NEED_PART_SCAN, &disk->state); + + /* + * Historically we only set the capacity to zero for devices that + * support partitions (independ of actually having partitions created). + * Doing that is rather inconsistent, but changing it broke legacy + * udisks polling for legacy ide-cdrom devices. Use the crude check + * below to get the sane behavior for most device while not breaking + * userspace for this particular setup. + */ + if (invalidate) { + if (disk_part_scan_enabled(disk) || + !(disk->flags & GENHD_FL_REMOVABLE)) + set_capacity(disk, 0); + } + + if (get_capacity(disk)) { + ret = blk_add_partitions(disk, bdev); + if (ret == -EAGAIN) + goto rescan; + } else if (invalidate) { + /* + * Tell userspace that the media / partition table may have + * changed. + */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + } + + return ret; +} +/* + * Only exported for loop and dasd for historic reasons. Don't use in new + * code! + */ +EXPORT_SYMBOL_GPL(bdev_disk_changed); + void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { struct address_space *mapping = state->bdev->bd_inode->i_mapping; diff --git a/fs/block_dev.c b/fs/block_dev.c index ac9b3c158a77..5b3a73ecb696 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1242,59 +1242,6 @@ static void blkdev_flush_mapping(struct block_device *bdev) bdev_write_inode(bdev); } -int bdev_disk_changed(struct block_device *bdev, bool invalidate) -{ - struct gendisk *disk = bdev->bd_disk; - int ret = 0; - - lockdep_assert_held(&disk->open_mutex); - - if (!(disk->flags & GENHD_FL_UP)) - return -ENXIO; - -rescan: - if (disk->open_partitions) - return -EBUSY; - sync_blockdev(bdev); - invalidate_bdev(bdev); - blk_drop_partitions(disk); - - clear_bit(GD_NEED_PART_SCAN, &disk->state); - - /* - * Historically we only set the capacity to zero for devices that - * support partitions (independ of actually having partitions created). - * Doing that is rather inconsistent, but changing it broke legacy - * udisks polling for legacy ide-cdrom devices. Use the crude check - * below to get the sane behavior for most device while not breaking - * userspace for this particular setup. 
- */ - if (invalidate) { - if (disk_part_scan_enabled(disk) || - !(disk->flags & GENHD_FL_REMOVABLE)) - set_capacity(disk, 0); - } - - if (get_capacity(disk)) { - ret = blk_add_partitions(disk, bdev); - if (ret == -EAGAIN) - goto rescan; - } else if (invalidate) { - /* - * Tell userspace that the media / partition table may have - * changed. - */ - kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - } - - return ret; -} -/* - * Only exported for loop and dasd for historic reasons. Don't use in new - * code! - */ -EXPORT_SYMBOL_GPL(bdev_disk_changed); - static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03d684f0498f..f5f0c9bdf1d2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -257,7 +257,6 @@ static inline sector_t get_capacity(struct gendisk *disk) } int bdev_disk_changed(struct block_device *bdev, bool invalidate); -int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); void blk_drop_partitions(struct gendisk *disk); extern struct gendisk *__alloc_disk_node(int minors, int node_id); From 0384264ea8a39bd98c9a3158060565f650c056a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Jun 2021 14:32:40 +0200 Subject: [PATCH 124/129] block: pass a gendisk to bdev_disk_changed bdev_disk_changed can only operate on whole devices. Make that clear by passing a gendisk instead of the struct block_device. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210624123240.441814-3-hch@lst.de Signed-off-by: Jens Axboe --- block/partitions/core.c | 22 ++++++++++------------ drivers/block/loop.c | 21 ++++++++++----------- drivers/s390/block/dasd_genhd.c | 4 ++-- fs/block_dev.c | 4 ++-- include/linux/genhd.h | 2 +- 5 files changed, 25 insertions(+), 28 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index b79785f7027c..347c56a51d87 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -120,8 +120,7 @@ static void free_partitions(struct parsed_partitions *state) kfree(state); } -static struct parsed_partitions *check_partition(struct gendisk *hd, - struct block_device *bdev) +static struct parsed_partitions *check_partition(struct gendisk *hd) { struct parsed_partitions *state; int i, res, err; @@ -136,7 +135,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, } state->pp_buf[0] = '\0'; - state->bdev = bdev; + state->bdev = hd->part0; disk_name(hd, 0, state->name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) @@ -546,7 +545,7 @@ void blk_drop_partitions(struct gendisk *disk) } } -static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, +static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { sector_t size = state->parts[p].size; @@ -596,7 +595,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, return true; } -static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) +static int blk_add_partitions(struct gendisk *disk) { struct parsed_partitions *state; int ret = -EAGAIN, p; @@ -604,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) if (!disk_part_scan_enabled(disk)) return 0; - state = check_partition(disk, bdev); + state = check_partition(disk); if (!state) return 0; if (IS_ERR(state)) { @@ -648,7 +647,7 @@ static int blk_add_partitions(struct 
gendisk *disk, struct block_device *bdev) kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); for (p = 1; p < state->limit; p++) - if (!blk_add_partition(disk, bdev, state, p)) + if (!blk_add_partition(disk, state, p)) goto out_free_state; ret = 0; @@ -657,9 +656,8 @@ out_free_state: return ret; } -int bdev_disk_changed(struct block_device *bdev, bool invalidate) +int bdev_disk_changed(struct gendisk *disk, bool invalidate) { - struct gendisk *disk = bdev->bd_disk; int ret = 0; lockdep_assert_held(&disk->open_mutex); @@ -670,8 +668,8 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) rescan: if (disk->open_partitions) return -EBUSY; - sync_blockdev(bdev); - invalidate_bdev(bdev); + sync_blockdev(disk->part0); + invalidate_bdev(disk->part0); blk_drop_partitions(disk); clear_bit(GD_NEED_PART_SCAN, &disk->state); @@ -691,7 +689,7 @@ rescan: } if (get_capacity(disk)) { - ret = blk_add_partitions(disk, bdev); + ret = blk_add_partitions(disk); if (ret == -EAGAIN) goto rescan; } else if (invalidate) { diff --git a/drivers/block/loop.c b/drivers/block/loop.c index e90f7d349816..4fb1f9530d5a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -647,14 +647,13 @@ static inline void loop_update_dio(struct loop_device *lo) lo->use_dio); } -static void loop_reread_partitions(struct loop_device *lo, - struct block_device *bdev) +static void loop_reread_partitions(struct loop_device *lo) { int rc; - mutex_lock(&bdev->bd_disk->open_mutex); - rc = bdev_disk_changed(bdev, false); - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_lock(&lo->lo_disk->open_mutex); + rc = bdev_disk_changed(lo->lo_disk, false); + mutex_unlock(&lo->lo_disk->open_mutex); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); @@ -752,7 +751,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, */ fput(old_file); if (partscan) - loop_reread_partitions(lo, bdev); + loop_reread_partitions(lo); return 0; out_err: @@ -1174,7 +1173,7 @@ static int loop_configure(struct loop_device *lo, fmode_t mode, bdgrab(bdev); mutex_unlock(&lo->lo_mutex); if (partscan) - loop_reread_partitions(lo, bdev); + loop_reread_partitions(lo); if (!(mode & FMODE_EXCL)) bd_abort_claiming(bdev, loop_configure); return 0; @@ -1268,10 +1267,10 @@ out_unlock: * current holder is released. 
*/ if (!release) - mutex_lock(&bdev->bd_disk->open_mutex); - err = bdev_disk_changed(bdev, false); + mutex_lock(&lo->lo_disk->open_mutex); + err = bdev_disk_changed(lo->lo_disk, false); if (!release) - mutex_unlock(&bdev->bd_disk->open_mutex); + mutex_unlock(&lo->lo_disk->open_mutex); if (err) pr_warn("%s: partition scan of loop%d failed (rc=%d)\n", __func__, lo_number, err); @@ -1416,7 +1415,7 @@ out_unfreeze: out_unlock: mutex_unlock(&lo->lo_mutex); if (partscan) - loop_reread_partitions(lo, bdev); + loop_reread_partitions(lo); return err; } diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index bf2082d461c7..493e8469893c 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -110,7 +110,7 @@ int dasd_scan_partitions(struct dasd_block *block) } mutex_lock(&block->gdp->open_mutex); - rc = bdev_disk_changed(bdev, false); + rc = bdev_disk_changed(block->gdp, false); mutex_unlock(&block->gdp->open_mutex); if (rc) DBF_DEV_EVENT(DBF_ERR, block->base, @@ -146,7 +146,7 @@ void dasd_destroy_partitions(struct dasd_block *block) block->bdev = NULL; mutex_lock(&bdev->bd_disk->open_mutex); - bdev_disk_changed(bdev, true); + bdev_disk_changed(bdev->bd_disk, true); mutex_unlock(&bdev->bd_disk->open_mutex); /* Matching blkdev_put to the blkdev_get in dasd_scan_partitions. */ diff --git a/fs/block_dev.c b/fs/block_dev.c index 5b3a73ecb696..34253d155f5c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1253,7 +1253,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) /* avoid ghost partitions on a removed medium */ if (ret == -ENOMEDIUM && test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(bdev, true); + bdev_disk_changed(disk, true); return ret; } } @@ -1264,7 +1264,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(bdev, false); + bdev_disk_changed(disk, false); bdev->bd_openers++; return 0;; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index f5f0c9bdf1d2..13b34177cc85 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -256,7 +256,7 @@ static inline sector_t get_capacity(struct gendisk *disk) return bdev_nr_sectors(disk->part0); } -int bdev_disk_changed(struct block_device *bdev, bool invalidate); +int bdev_disk_changed(struct gendisk *disk, bool invalidate); void blk_drop_partitions(struct gendisk *disk); extern struct gendisk *__alloc_disk_node(int minors, int node_id); From a921c655f2033dd1ce1379128efe881dda23ea37 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Jun 2021 11:36:33 +0200 Subject: [PATCH 125/129] bfq: Remove merged request already in bfq_requests_merged() Currently, bfq does very little in bfq_requests_merged() and handles all the request cleanup in bfq_finish_requeue_request() called from blk_mq_free_request(). That is currently safe only because blk_mq_free_request() is called shortly after bfq_requests_merged() while bfqd->lock is still held. However to fix a lock inversion between bfqd->lock and ioc->lock, we need to call blk_mq_free_request() after dropping bfqd->lock. That would mean that already merged request could be seen by other processes inside bfq queues and possibly dispatched to the device which is wrong. So move cleanup of the request from bfq_finish_requeue_request() to bfq_requests_merged(). 
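The follow-up patch in this series ("blk: Fix lock inversion between ioc lock and bfqd lock") builds on this cleanup by having bfq collect merged requests on a local list while bfqd->lock is held and only calling blk_mq_free_request(), which can take ioc->lock, after the lock has been dropped. A condensed sketch of that shape, simplified from the real bfq_insert_request() and not meant to compile on its own:

/* Deferred-free pattern: nothing that may take ioc->lock runs under bfqd->lock. */
static void example_insert(struct request_queue *q, struct bfq_data *bfqd,
			   struct request *rq)
{
	LIST_HEAD(free);		/* merged requests to free once unlocked */

	spin_lock_irq(&bfqd->lock);
	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
		spin_unlock_irq(&bfqd->lock);
		blk_mq_free_requests(&free);	/* may take ioc->lock, lock dropped */
		return;
	}

	/* ... normal insertion into the scheduler queues under bfqd->lock ... */

	spin_unlock_irq(&bfqd->lock);
}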
Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210623093634.27879-2-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index fedb0a8fd388..9433d38e486c 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2433,7 +2433,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, *next_bfqq = bfq_init_rq(next); if (!bfqq) - return; + goto remove; /* * If next and rq belong to the same bfq_queue and next is older @@ -2456,6 +2456,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +remove: + /* Merged request may be in the IO scheduler. Remove it. */ + if (!RB_EMPTY_NODE(&next->rb_node)) { + bfq_remove_request(next->q, next); + if (next_bfqq) + bfqg_stats_update_io_remove(bfqq_group(next_bfqq), + next->cmd_flags); + } } /* Must be called with bfqq != NULL */ @@ -6414,6 +6422,7 @@ static void bfq_finish_requeue_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; + unsigned long flags; /* * rq either is not associated with any icq, or is an already @@ -6431,39 +6440,15 @@ static void bfq_finish_requeue_request(struct request *rq) rq->io_start_time_ns, rq->cmd_flags); + spin_lock_irqsave(&bfqd->lock, flags); if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - if (rq == bfqd->waited_rq) bfq_update_inject_limit(bfqd, bfqq); bfq_completed_request(bfqq, bfqd); - bfq_finish_requeue_request_body(bfqq); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, - * in which case we need to remove it (this should - * never happen in case of requeue). And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. - * This situation seems to occur only in process - * context, as a consequence of a merge. In the - * current version of the code, this implies that the - * lock is held. - */ - - if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } - bfq_finish_requeue_request_body(bfqq); } + bfq_finish_requeue_request_body(bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); /* * Reset private fields. 
In case of a requeue, this allows From fd2ef39cc9a6b9c4c41864ac506906c52f94b06a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Jun 2021 11:36:34 +0200 Subject: [PATCH 126/129] blk: Fix lock inversion between ioc lock and bfqd lock Lockdep complains about lock inversion between ioc->lock and bfqd->lock: bfqd -> ioc: put_io_context+0x33/0x90 -> ioc->lock grabbed blk_mq_free_request+0x51/0x140 blk_put_request+0xe/0x10 blk_attempt_req_merge+0x1d/0x30 elv_attempt_insert_merge+0x56/0xa0 blk_mq_sched_try_insert_merge+0x4b/0x60 bfq_insert_requests+0x9e/0x18c0 -> bfqd->lock grabbed blk_mq_sched_insert_requests+0xd6/0x2b0 blk_mq_flush_plug_list+0x154/0x280 blk_finish_plug+0x40/0x60 ext4_writepages+0x696/0x1320 do_writepages+0x1c/0x80 __filemap_fdatawrite_range+0xd7/0x120 sync_file_range+0xac/0xf0 ioc->bfqd: bfq_exit_icq+0xa3/0xe0 -> bfqd->lock grabbed put_io_context_active+0x78/0xb0 -> ioc->lock grabbed exit_io_context+0x48/0x50 do_exit+0x7e9/0xdd0 do_group_exit+0x54/0xc0 To avoid this inversion we change blk_mq_sched_try_insert_merge() to not free the merged request but rather leave that upto the caller similarly to blk_mq_sched_try_merge(). And in bfq_insert_requests() we make sure to free all the merged requests after dropping bfqd->lock. Fixes: aee69d78dec0 ("block, bfq: introduce the BFQ-v0 I/O scheduler as an extra scheduler") Reviewed-by: Ming Lei Acked-by: Paolo Valente Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20210623093634.27879-3-jack@suse.cz Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 6 ++++-- block/blk-merge.c | 19 ++++++++----------- block/blk-mq-sched.c | 5 +++-- block/blk-mq-sched.h | 3 ++- block/blk-mq.h | 11 +++++++++++ block/blk.h | 2 +- block/elevator.c | 11 ++++++++--- block/mq-deadline-main.c | 5 ++++- include/linux/elevator.h | 3 ++- 9 files changed, 43 insertions(+), 22 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 9433d38e486c..727955918563 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -2345,9 +2345,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irq(&bfqd->lock); if (free) blk_mq_free_request(free); - spin_unlock_irq(&bfqd->lock); return ret; } @@ -5969,14 +5969,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct bfq_queue *bfqq; bool idle_timer_disabled = false; unsigned int cmd_flags; + LIST_HEAD(free); #ifdef CONFIG_BFQ_GROUP_IOSCHED if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + blk_mq_free_requests(&free); return; } diff --git a/block/blk-merge.c b/block/blk-merge.c index 4d97fb6dd226..1398b52a24b4 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -846,18 +846,15 @@ static struct request *attempt_front_merge(struct request_queue *q, return NULL; } -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, - struct request *next) +/* + * Try to merge 'next' into 'rq'. Return true if the merge happened, false + * otherwise. The caller is responsible for freeing 'next' if the merge + * happened. 
+ */ +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) { - struct request *free; - - free = attempt_merge(q, rq, next); - if (free) { - blk_put_request(free); - return 1; - } - - return 0; + return attempt_merge(q, rq, next); } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 2403a5c2b053..c838d81ac058 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -399,9 +399,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, return ret; } -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { - return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index aff037cfd8e7..5246ae040704 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -13,7 +13,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); diff --git a/block/blk-mq.h b/block/blk-mq.h index 4b1ca7b7bbeb..d08779f77a26 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -302,6 +302,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, return NULL; } +/* Free all requests on the list */ +static inline void blk_mq_free_requests(struct list_head *list) +{ + while (!list_empty(list)) { + struct request *rq = list_entry_rq(list->next); + + list_del_init(&rq->queuelist); + blk_mq_free_request(rq); + } +} + /* * For shared tag users, we track the number of currently active users * and attempt to provide a fair share of the tag depth for each of them. diff --git a/block/blk.h b/block/blk.h index 4fcd7a032377..4b885c0f6708 100644 --- a/block/blk.h +++ b/block/blk.h @@ -224,7 +224,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *, void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); diff --git a/block/elevator.c b/block/elevator.c index 85d0d4adbb64..52ada14cfe45 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -350,9 +350,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. * - * Returns true if we merged, false otherwise + * Returns true if we merged, false otherwise. 'free' will contain all + * requests that need to be freed. 
*/ -bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { struct request *__rq; bool ret; @@ -363,8 +365,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) /* * First try one-hit cache. */ - if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { + list_add(&rq->queuelist, free); return true; + } if (blk_queue_noxmerges(q)) return false; @@ -378,6 +382,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break; + list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c index 4815e536091f..9db6da9ef4c6 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline-main.c @@ -719,6 +719,7 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct dd_per_prio *per_prio; enum dd_prio prio; struct dd_blkcg *blkcg; + LIST_HEAD(free); lockdep_assert_held(&dd->lock); @@ -742,8 +743,10 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, WARN_ON_ONCE(rq->elv.priv[0]); rq->elv.priv[0] = blkcg; - if (blk_mq_sched_try_insert_merge(q, rq)) + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { + blk_mq_free_requests(&free); return; + } trace_block_rq_insert(rq); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 783ecb3cb77a..ef9ceead3db1 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -117,7 +117,8 @@ extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, enum elv_merge); -extern bool elv_attempt_insert_merge(struct request_queue *, struct request *); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, + struct list_head *); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); void elevator_init_mq(struct request_queue *q); From cb9516be7708a2a18ec0a19fe3a225b5b3bc92c7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Jun 2021 10:02:48 +0800 Subject: [PATCH 127/129] blk-mq: update hctx->dispatch_busy in case of real scheduler Commit 6e6fcbc27e77 ("blk-mq: support batching dispatch in case of io") starts to support io batching submission by using hctx->dispatch_busy. However, blk_mq_update_dispatch_busy() isn't changed to update hctx->dispatch_busy in that commit, so fix the issue by updating hctx->dispatch_busy in case of real scheduler. 
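hctx->dispatch_busy is an exponentially weighted moving average of how often dispatch found the queue busy; the batching logic added by commit 6e6fcbc27e77 keys off it, but the early return removed by this patch meant the average was never updated when an I/O scheduler was attached, so it stayed at zero. The user-space model below shows how such an estimate ramps up and decays; the weight and shift values are assumptions for illustration and may not match the kernel's constants exactly.

/* build: gcc -o dispatch_busy_model dispatch_busy_model.c && ./dispatch_busy_model */
#include <stdio.h>
#include <stdbool.h>

#define EWMA_WEIGHT 8	/* assumed, for illustration */
#define EWMA_FACTOR 4	/* assumed, for illustration */

static unsigned int dispatch_busy;

static void update_dispatch_busy(bool busy)
{
	unsigned int ewma = dispatch_busy;

	if (!ewma && !busy)
		return;

	ewma *= EWMA_WEIGHT - 1;
	if (busy)
		ewma += 1 << EWMA_FACTOR;
	ewma /= EWMA_WEIGHT;

	dispatch_busy = ewma;
}

int main(void)
{
	/* A run of busy dispatches ramps the estimate up ... */
	for (int i = 0; i < 5; i++) {
		update_dispatch_busy(true);
		printf("busy -> dispatch_busy = %u\n", dispatch_busy);
	}
	/* ... and idle dispatches decay it back toward zero. */
	for (int i = 0; i < 5; i++) {
		update_dispatch_busy(false);
		printf("idle -> dispatch_busy = %u\n", dispatch_busy);
	}
	return 0;
}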
Reported-by: Jan Kara Reviewed-by: Jan Kara Fixes: 6e6fcbc27e77 ("blk-mq: support batching dispatch in case of io") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210625020248.1630497-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 3115ea2d0990..c2f3550337f7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1224,9 +1224,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; - if (hctx->queue->elevator) - return; - ewma = hctx->dispatch_busy; if (!ewma && !busy) From c06bc5a3fb42304d815a2dc41e324b5a97c9f7da Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 27 Jun 2021 14:11:12 -0700 Subject: [PATCH 128/129] block/mq-deadline: Remove a WARN_ON_ONCE() call The purpose of the WARN_ON_ONCE() statement in dd_insert_request() is to verify that dd_prepare_request() cleared rq->elv.priv[0]. Since dd_prepare_request() is called during request initialization but not if a request is requeued, a warning is triggered if a request is requeued. Fix this by removing the WARN_ON_ONCE() statement. This patch suppresses the following kernel warning: WARNING: CPU: 28 PID: 432 at block/mq-deadline-main.c:740 dd_insert_request+0x4d4/0x5b0 Workqueue: kblockd blk_mq_requeue_work Call Trace: dd_insert_requests+0xfa/0x130 blk_mq_sched_insert_request+0x22c/0x240 blk_mq_requeue_work+0x21c/0x2d0 process_one_work+0x4c2/0xa70 worker_thread+0x2e5/0x6d0 kthread+0x21c/0x250 ret_from_fork+0x1f/0x30 Reported-by: Sachin Sant Fixes: 08a9ad8bf607 ("block/mq-deadline: Add cgroup support") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20210627211112.12720-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline-main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c index 9db6da9ef4c6..6f612e6dc82b 100644 --- a/block/mq-deadline-main.c +++ b/block/mq-deadline-main.c @@ -740,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, dd_count(dd, inserted, prio); blkcg = dd_blkcg_from_bio(rq->bio); ddcg_count(blkcg, inserted, ioprio_class); - WARN_ON_ONCE(rq->elv.priv[0]); rq->elv.priv[0] = blkcg; if (blk_mq_sched_try_insert_merge(q, rq, &free)) { From 2705dfb2094777e405e065105e307074af8965c1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 28 Jun 2021 10:33:12 +0800 Subject: [PATCH 129/129] block: fix discard request merge ll_new_hw_segment() is reached only in case of single range discard merge, and we don't have max discard segment size limit actually, so it is wrong to run the following check: if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) it may be always false since req->nr_phys_segments is initialized as one, and bio's segment count is still 1, blk_rq_get_max_segments(reg) is 1 too. Fix the issue by not doing the check and bypassing the calculation of discard request's nr_phys_segments. Based on analysis from Wang Shanker. 
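The arithmetic above can be spelled out with a trivial user-space check (illustrative only, not kernel code): a single-range discard request starts with one physical segment, the bio being merged contributes one more, and the segment limit reported for the request is also one, so the old comparison rejects every such merge. The fix bypasses the segment accounting entirely for REQ_OP_DISCARD.

/* build: gcc -o discard_check discard_check.c && ./discard_check */
#include <stdio.h>

int main(void)
{
	unsigned int req_nr_phys_segments = 1;	/* request initialized with one segment */
	unsigned int bio_nr_phys_segs = 1;	/* single discard range in the bio */
	unsigned int max_segments = 1;		/* what blk_rq_get_max_segments() yields here */

	if (req_nr_phys_segments + bio_nr_phys_segs > max_segments)
		printf("old check: merge rejected (1 + 1 > 1, always)\n");
	else
		printf("old check: merge allowed\n");

	return 0;
}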
Cc: Christoph Hellwig Reported-by: Wang Shanker Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20210628023312.1903255-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 1398b52a24b4..a11b3b53717e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -559,10 +559,14 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq) static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { - if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) + if (blk_integrity_merge_bio(req->q, req, bio) == false) goto no_merge; - if (blk_integrity_merge_bio(req->q, req, bio) == false) + /* discard request merge won't add new segment */ + if (req_op(req) == REQ_OP_DISCARD) + return 1; + + if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) goto no_merge; /*