From 0b80f9866e6bbfb905140ed8787ff2af03652c0c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 4 May 2020 19:27:54 -0400 Subject: [PATCH 1/7] iocost: protect iocg->abs_vdebt with iocg->waitq.lock abs_vdebt is an atomic64_t which tracks how much over budget a given cgroup is and controls the activation of the use_delay mechanism. Once a cgroup goes over budget from forced IOs, it has to pay it back with its future budget. The progress guarantee on debt paying comes from the iocg being active - active iocgs are processed by the periodic timer, which ensures that as time passes the debts dissipate and the iocg returns to normal operation. However, both iocg activation and vdebt handling are asynchronous and a sequence like the following may happen. 1. The iocg is in the process of being deactivated by the periodic timer. 2. A bio enters ioc_rqos_throttle(), calls iocg_activate() which returns without doing anything because it still sees that the iocg is already active. 3. The iocg is deactivated. 4. The bio from #2 is over budget but needs to be forced. It increases abs_vdebt and goes over the threshold and enables use_delay. 5. IO control is enabled for the iocg's subtree and now IOs are attributed to the descendant cgroups and the iocg itself no longer issues IOs. This leaves the iocg with stuck abs_vdebt - it has debt but is inactive and there are no further IOs which can activate it. This can end up unduly punishing all the descendant cgroups. The usual throttling path has the same issue - the iocg must be active while throttled to ensure that a future event will wake it up - and solves the problem by synchronizing the throttling path with a spinlock. abs_vdebt handling is another form of overage handling and shares a lot of characteristics including the fact that it isn't in the hottest path. This patch fixes the above and other possible races by strictly synchronizing abs_vdebt and use_delay handling with iocg->waitq.lock. 
Signed-off-by: Tejun Heo Reported-by: Vlad Dmitriev Cc: stable@vger.kernel.org # v5.4+ Fixes: e1518f63f246 ("blk-iocost: Don't let merges push vtime into the future") Signed-off-by: Jens Axboe --- block/blk-iocost.c | 117 ++++++++++++++++++++------------- tools/cgroup/iocost_monitor.py | 7 +- 2 files changed, 77 insertions(+), 47 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 3ab0c1c704b6..7c1fe605d0d6 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -466,7 +466,7 @@ struct ioc_gq { */ atomic64_t vtime; atomic64_t done_vtime; - atomic64_t abs_vdebt; + u64 abs_vdebt; u64 last_vtime; /* @@ -1142,7 +1142,7 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) struct iocg_wake_ctx ctx = { .iocg = iocg }; u64 margin_ns = (u64)(ioc->period_us * WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; - u64 abs_vdebt, vdebt, vshortage, expires, oexpires; + u64 vdebt, vshortage, expires, oexpires; s64 vbudget; u32 hw_inuse; @@ -1152,18 +1152,15 @@ static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ - abs_vdebt = atomic64_read(&iocg->abs_vdebt); - vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse); + vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); if (vdebt && vbudget > 0) { u64 delta = min_t(u64, vbudget, vdebt); u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), - abs_vdebt); + iocg->abs_vdebt); atomic64_add(delta, &iocg->vtime); atomic64_add(delta, &iocg->done_vtime); - atomic64_sub(abs_delta, &iocg->abs_vdebt); - if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0)) - atomic64_set(&iocg->abs_vdebt, 0); + iocg->abs_vdebt -= abs_delta; } /* @@ -1219,12 +1216,18 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost) u64 expires, oexpires; u32 hw_inuse; + lockdep_assert_held(&iocg->waitq.lock); + /* debt-adjust vtime */ current_hweight(iocg, NULL, &hw_inuse); - vtime += 
abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse); + vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - /* clear or maintain depending on the overage */ - if (time_before_eq64(vtime, now->vnow)) { + /* + * Clear or maintain depending on the overage. Non-zero vdebt is what + * guarantees that @iocg is online and future iocg_kick_delay() will + * clear use_delay. Don't leave it on when there's no vdebt. + */ + if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { blkcg_clear_delay(blkg); return false; } @@ -1258,9 +1261,12 @@ static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); struct ioc_now now; + unsigned long flags; + spin_lock_irqsave(&iocg->waitq.lock, flags); ioc_now(iocg->ioc, &now); iocg_kick_delay(iocg, &now, 0); + spin_unlock_irqrestore(&iocg->waitq.lock, flags); return HRTIMER_NORESTART; } @@ -1368,14 +1374,13 @@ static void ioc_timer_fn(struct timer_list *timer) * should have woken up in the last period and expire idle iocgs. */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { - if (!waitqueue_active(&iocg->waitq) && - !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg)) + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && + !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); - if (waitqueue_active(&iocg->waitq) || - atomic64_read(&iocg->abs_vdebt)) { + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { /* might be oversleeping vtime / hweight changes, kick */ iocg_kick_waitq(iocg, &now); iocg_kick_delay(iocg, &now, 0); @@ -1718,28 +1723,49 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) * tests are racy but the races aren't systemic - we only miss once * in a while which is fine. 
*/ - if (!waitqueue_active(&iocg->waitq) && - !atomic64_read(&iocg->abs_vdebt) && + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { iocg_commit_bio(iocg, bio, cost); return; } /* - * We're over budget. If @bio has to be issued regardless, - * remember the abs_cost instead of advancing vtime. - * iocg_kick_waitq() will pay off the debt before waking more IOs. + * We activated above but w/o any synchronization. Deactivation is + * synchronized with waitq.lock and we won't get deactivated as long + * as we're waiting or has debt, so we're good if we're activated + * here. In the unlikely case that we aren't, just issue the IO. + */ + spin_lock_irq(&iocg->waitq.lock); + + if (unlikely(list_empty(&iocg->active_list))) { + spin_unlock_irq(&iocg->waitq.lock); + iocg_commit_bio(iocg, bio, cost); + return; + } + + /* + * We're over budget. If @bio has to be issued regardless, remember + * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay + * off the debt before waking more IOs. + * * This way, the debt is continuously paid off each period with the - * actual budget available to the cgroup. If we just wound vtime, - * we would incorrectly use the current hw_inuse for the entire - * amount which, for example, can lead to the cgroup staying - * blocked for a long time even with substantially raised hw_inuse. + * actual budget available to the cgroup. If we just wound vtime, we + * would incorrectly use the current hw_inuse for the entire amount + * which, for example, can lead to the cgroup staying blocked for a + * long time even with substantially raised hw_inuse. + * + * An iocg with vdebt should stay online so that the timer can keep + * deducting its vdebt and [de]activate use_delay mechanism + * accordingly. We don't want to race against the timer trying to + * clear them and leave @iocg inactive w/ dangling use_delay heavily + * penalizing the cgroup and its descendants. 
*/ if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { - atomic64_add(abs_cost, &iocg->abs_vdebt); + iocg->abs_vdebt += abs_cost; if (iocg_kick_delay(iocg, &now, cost)) blkcg_schedule_throttle(rqos->q, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); + spin_unlock_irq(&iocg->waitq.lock); return; } @@ -1756,20 +1782,6 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) * All waiters are on iocg->waitq and the wait states are * synchronized using waitq.lock. */ - spin_lock_irq(&iocg->waitq.lock); - - /* - * We activated above but w/o any synchronization. Deactivation is - * synchronized with waitq.lock and we won't get deactivated as - * long as we're waiting, so we're good if we're activated here. - * In the unlikely case that we are deactivated, just issue the IO. - */ - if (unlikely(list_empty(&iocg->active_list))) { - spin_unlock_irq(&iocg->waitq.lock); - iocg_commit_bio(iocg, bio, cost); - return; - } - init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); wait.wait.private = current; wait.bio = bio; @@ -1801,6 +1813,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct ioc_now now; u32 hw_inuse; u64 abs_cost, cost; + unsigned long flags; /* bypass if disabled or for root cgroup */ if (!ioc->enabled || !iocg->level) @@ -1820,15 +1833,28 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, iocg->cursor = bio_end; /* - * Charge if there's enough vtime budget and the existing request - * has cost assigned. Otherwise, account it as debt. See debt - * handling in ioc_rqos_throttle() for details. + * Charge if there's enough vtime budget and the existing request has + * cost assigned. 
*/ if (rq->bio && rq->bio->bi_iocost_cost && - time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) + time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { iocg_commit_bio(iocg, bio, cost); - else - atomic64_add(abs_cost, &iocg->abs_vdebt); + return; + } + + /* + * Otherwise, account it as debt if @iocg is online, which it should + * be for the vast majority of cases. See debt handling in + * ioc_rqos_throttle() for details. + */ + spin_lock_irqsave(&iocg->waitq.lock, flags); + if (likely(!list_empty(&iocg->active_list))) { + iocg->abs_vdebt += abs_cost; + iocg_kick_delay(iocg, &now, cost); + } else { + iocg_commit_bio(iocg, bio, cost); + } + spin_unlock_irqrestore(&iocg->waitq.lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) @@ -1998,7 +2024,6 @@ static void ioc_pd_init(struct blkg_policy_data *pd) iocg->ioc = ioc; atomic64_set(&iocg->vtime, now.vnow); atomic64_set(&iocg->done_vtime, now.vnow); - atomic64_set(&iocg->abs_vdebt, 0); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); iocg->hweight_active = HWEIGHT_WHOLE; diff --git a/tools/cgroup/iocost_monitor.py b/tools/cgroup/iocost_monitor.py index 7427a5ee761b..9d8e9613008a 100644 --- a/tools/cgroup/iocost_monitor.py +++ b/tools/cgroup/iocost_monitor.py @@ -159,7 +159,12 @@ class IocgStat: else: self.inflight_pct = 0 - self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 + # vdebt used to be an atomic64_t and is now u64, support both + try: + self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 + except: + self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 + self.use_delay = blkg.use_delay.counter.value_() self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 From 156c75737255b8db0aa887abcb66b709856cf453 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:53 +0200 Subject: [PATCH 2/7] vboxsf: don't use the source name in 
the bdi name Simplify the bdi name to mirror what we are doing elsewhere, and drop the name in favor of just using a number. This avoids a potentially very long bdi name. Signed-off-by: Christoph Hellwig Reviewed-by: Hans de Goede Signed-off-by: Jens Axboe --- fs/vboxsf/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 675e26989376..8fe03b4a0d2b 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -164,7 +164,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc) goto fail_free; } - err = super_setup_bdi_name(sb, "vboxsf-%s.%d", fc->source, sbi->bdi_id); + err = super_setup_bdi_name(sb, "vboxsf-%d", sbi->bdi_id); if (err) goto fail_free; From eb7ae5e06bb6e6ac6bb86872d27c43ebab92f6b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:54 +0200 Subject: [PATCH 3/7] bdi: move bdi_dev_name out of line bdi_dev_name is not a fast path function, move it out of line. This prepares for using it from modular callers without having to export an implementation detail like bdi_unknown_name. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 9 +-------- mm/backing-dev.c | 10 +++++++++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f88197c1ffc2..c9ad5c3b7b4b 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -505,13 +505,6 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << WB_async_congested)); } -extern const char *bdi_unknown_name; - -static inline const char *bdi_dev_name(struct backing_dev_info *bdi) -{ - if (!bdi || !bdi->dev) - return bdi_unknown_name; - return dev_name(bdi->dev); -} +const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c81b4f3a7268..c2c44c89ee5d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -21,7 +21,7 @@ struct backing_dev_info noop_backing_dev_info = { EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; -const char *bdi_unknown_name = "(unknown)"; +static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU @@ -1043,6 +1043,14 @@ void bdi_put(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_put); +const char *bdi_dev_name(struct backing_dev_info *bdi) +{ + if (!bdi || !bdi->dev) + return bdi_unknown_name; + return dev_name(bdi->dev); +} +EXPORT_SYMBOL_GPL(bdi_dev_name); + static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) From d51cfc53ade3189455a1b88ec7a2ff0c24597cf8 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Mon, 4 May 2020 14:47:55 +0200 Subject: [PATCH 4/7] bdi: use bdi_dev_name() to get device name Use the common interface bdi_dev_name() to get device name. 
Signed-off-by: Yufen Yu Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Bart Van Assche Add missing include BFQ Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 6 ++++-- block/blk-cgroup.c | 2 +- fs/ceph/debugfs.c | 2 +- include/trace/events/wbt.h | 8 ++++---- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 78ba57efd16b..3d411716d7ee 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -123,6 +123,7 @@ #include #include #include +#include #include "blk.h" #include "blk-mq.h" @@ -4976,8 +4977,9 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); switch (ioprio_class) { default: - dev_err(bfqq->bfqd->queue->backing_dev_info->dev, - "bfq: bad prio class %d\n", ioprio_class); + pr_err("bdi %s: bfq: bad prio class %d\n", + bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), + ioprio_class); /* fall through */ case IOPRIO_CLASS_NONE: /* diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c5dc833212e1..930212c1a512 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -496,7 +496,7 @@ const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info->dev) - return dev_name(blkg->q->backing_dev_info->dev); + return bdi_dev_name(blkg->q->backing_dev_info); return NULL; } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 481ac97b4d25..dcaed75de9e6 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -271,7 +271,7 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) &congestion_kb_fops); snprintf(name, sizeof(name), "../../bdi/%s", - dev_name(fsc->sb->s_bdi->dev)); + bdi_dev_name(fsc->sb->s_bdi)); fsc->debugfs_bdi = debugfs_create_symlink("bdi", fsc->client->debugfs_dir, diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h index 784814160197..9c66e59d859c 
100644 --- a/include/trace/events/wbt.h +++ b/include/trace/events/wbt.h @@ -33,7 +33,7 @@ TRACE_EVENT(wbt_stat, ), TP_fast_assign( - strlcpy(__entry->name, dev_name(bdi->dev), + strlcpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->rmean = stat[0].mean; __entry->rmin = stat[0].min; @@ -68,7 +68,7 @@ TRACE_EVENT(wbt_lat, ), TP_fast_assign( - strlcpy(__entry->name, dev_name(bdi->dev), + strlcpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->lat = div_u64(lat, 1000); ), @@ -105,7 +105,7 @@ TRACE_EVENT(wbt_step, ), TP_fast_assign( - strlcpy(__entry->name, dev_name(bdi->dev), + strlcpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->msg = msg; __entry->step = step; @@ -141,7 +141,7 @@ TRACE_EVENT(wbt_timer, ), TP_fast_assign( - strlcpy(__entry->name, dev_name(bdi->dev), + strlcpy(__entry->name, bdi_dev_name(bdi), ARRAY_SIZE(__entry->name)); __entry->status = status; __entry->step = step; From 6bd87eec23cbc9ed222bed0f5b5b02bf300e9a8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 14:47:56 +0200 Subject: [PATCH 5/7] bdi: add a ->dev_name field to struct backing_dev_info Cache a copy of the name for the life time of the backing_dev_info structure so that we can reference it even after unregistering. 
Fixes: 68f23b89067f ("memcg: fix a crash in wb_workfn when a device disappears") Reported-by: Yufen Yu Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ee577a83cfe6..7367150f962a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -219,6 +219,7 @@ struct backing_dev_info { wait_queue_head_t wb_waitq; struct device *dev; + char dev_name[64]; struct device *owner; struct timer_list laptop_mode_wb_timer; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c2c44c89ee5d..efc5b83acd2d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -938,7 +938,8 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; - dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args); + vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); + dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, "%s", bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); @@ -1047,7 +1048,7 @@ const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; - return dev_name(bdi->dev); + return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); From a8de6639169b90e3dc4f27e752a3c5abac5e90da Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 May 2020 23:07:04 +0300 Subject: [PATCH 6/7] nvme-pci: fix "slimmer CQ head update" Pre-incrementing ->cq_head can't be done in memory because OOB value can be observed by another context. 
This devalues space savings compared to original code :-\ $ ./scripts/bloat-o-meter ../vmlinux-000 ../obj/vmlinux add/remove: 0/0 grow/shrink: 0/4 up/down: 0/-32 (-32) Function old new delta nvme_poll_irqdisable 464 456 -8 nvme_poll 455 447 -8 nvme_irq 388 380 -8 nvme_dev_disable 955 947 -8 But the code is minimal now: one read for head, one read for q_depth, one increment, one comparison, single instruction phase bit update and one write for new head. Signed-off-by: Alexey Dobriyan Reported-by: John Garry Tested-by: John Garry Fixes: e2a366a4b0feaeb ("nvme-pci: slimmer CQ head update") Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4e79e412b276..e13c370de830 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -973,9 +973,13 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx) static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) { - if (++nvmeq->cq_head == nvmeq->q_depth) { + u16 tmp = nvmeq->cq_head + 1; + + if (tmp == nvmeq->q_depth) { nvmeq->cq_head = 0; nvmeq->cq_phase ^= 1; + } else { + nvmeq->cq_head = tmp; } } From 59c7c3caaaf8750df4ec3255082f15eb4e371514 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 6 May 2020 15:44:02 -0700 Subject: [PATCH 7/7] nvme: fix possible hang when ns scanning fails during error recovery When the controller is reconnecting, the host fails I/O and admin commands as the host cannot reach the controller. ns scanning may revalidate namespaces during that period and it is wrong to remove namespaces due to these failures as we may hang (see 205da2434301). One command that may fail is nvme_identify_ns_descs. 
Since we return success due to having ns identify descriptor list optional, we continue to compare ns identifiers in nvme_revalidate_disk, obviously fail and return -ENODEV to nvme_validate_ns, which will remove the namespace. Exactly what we don't want to happen. Fixes: 22802bf742c2 ("nvme: Namepace identification descriptor list is optional") Tested-by: Anton Eidelman Signed-off-by: Sagi Grimberg Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f2adea96b04c..f3c037f5a9ba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1110,7 +1110,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, * Don't treat an error as fatal, as we potentially already * have a NGUID or EUI-64. */ - if (status > 0) + if (status > 0 && !(status & NVME_SC_DNR)) status = 0; goto free_data; }