From bb8bd38b9a1685334b73e8c62e128cbedb875867 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 1 Jul 2015 12:57:40 -0400 Subject: [PATCH 01/10] bio integrity: do not assume bio_integrity_pool exists if bioset exists bio_integrity_alloc() and bio_integrity_free() assume that if a bio was allocated from a bioset that that bioset also had its bio_integrity_pool allocated using bioset_integrity_create(). This is a very bad assumption given that bioset_create() and bioset_integrity_create() are completely disjoint. Not all callers of bioset_create() have been trained to also call bioset_integrity_create() -- and they may not care to be. Fix this by falling back to kmalloc'ing 'struct bio_integrity_payload' rather than force all bioset consumers to (wastefully) preallocate a bio_integrity_pool that they very likely won't actually need (given the niche nature of the current block integrity support). Otherwise, a NULL pointer "Kernel BUG" with a trace like the following will be observed (as seen on s390x using zfcp storage) because dm-io doesn't use bioset_integrity_create() when creating its bioset: [ 791.643338] Call Trace: [ 791.643339] ([<00000003df98b848>] 0x3df98b848) [ 791.643341] [<00000000002c5de8>] bio_integrity_alloc+0x48/0xf8 [ 791.643348] [<00000000002c6486>] bio_integrity_prep+0xae/0x2f0 [ 791.643349] [<0000000000371e38>] blk_queue_bio+0x1c8/0x3d8 [ 791.643355] [<000000000036f8d0>] generic_make_request+0xc0/0x100 [ 791.643357] [<000000000036f9b2>] submit_bio+0xa2/0x198 [ 791.643406] [<000003ff801f9774>] dispatch_io+0x15c/0x3b0 [dm_mod] [ 791.643419] [<000003ff801f9b3e>] dm_io+0x176/0x2f0 [dm_mod] [ 791.643423] [<000003ff8074b28a>] do_reads+0x13a/0x1a8 [dm_mirror] [ 791.643425] [<000003ff8074b43a>] do_mirror+0x142/0x298 [dm_mirror] [ 791.643428] [<0000000000154fca>] process_one_work+0x18a/0x3f8 [ 791.643432] [<000000000015598a>] worker_thread+0x132/0x3b0 [ 791.643435] [<000000000015d49a>] kthread+0xd2/0xd8 [ 791.643438] [<00000000005bc0ca>] kernel_thread_starter+0x6/0xc [ 791.643446] [<00000000005bc0c4>] kernel_thread_starter+0x0/0xc Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- block/bio-integrity.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 0436c21db7f2..719b7152aed1 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -51,7 +51,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, unsigned long idx = BIO_POOL_NONE; unsigned inline_vecs; - if (!bs) { + if (!bs || !bs->bio_integrity_pool) { bip = kmalloc(sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * nr_vecs, gfp_mask); inline_vecs = nr_vecs; @@ -104,7 +104,7 @@ void bio_integrity_free(struct bio *bio) kfree(page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset); - if (bs) { + if (bs && bs->bio_integrity_pool) { if (bip->bip_slab != BIO_POOL_NONE) bvec_free(bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab); From 0762b23d23c1f23beab91a3af0fa89749b75f03c Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Tue, 7 Jul 2015 12:41:07 +0530 Subject: [PATCH 02/10] block: use FIELD_SIZEOF to calculate size of a field use FIELD_SIZEOF instead of open coding Signed-off-by: Maninder Singh Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 82819e68f58b..627ed0c593fb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -3370,7 +3370,7 @@ EXPORT_SYMBOL(blk_post_runtime_resume); int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * - sizeof(((struct request *)0)->cmd_flags)); + FIELD_SIZEOF(struct request, cmd_flags)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", From a322baad1003798312741b0cb97bd2c7511ccf61 Mon Sep 17 00:00:00 2001 From: Arianna Avanzini Date: Tue, 7 Jul 2015 03:08:15 +0200 Subject: [PATCH 03/10] block/blk-cgroup.c: free per-blkcg data when freeing the blkcg Currently, per-blkcg data is freed each time a policy is deactivated, that is also upon scheduler switch. However, when switching from a scheduler implementing a policy which requires per-blkcg data to another one, that same policy might be active on other devices, and therefore those same per-blkcg data could be still in use. This commit lets per-blkcg data be freed when the blkcg is freed instead of on policy deactivation. Signed-off-by: Arianna Avanzini Reported-and-tested-by: Michael Kaminsky Fixes: e48453c3 ("block, cgroup: implement policy-specific per-blkcg data") Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9f97da52d006..5e2723f2c6a3 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -822,8 +822,13 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); - if (blkcg != &blkcg_root) + if (blkcg != &blkcg_root) { + int i; + + for (i = 0; i < BLKCG_MAX_POLS; i++) + kfree(blkcg->pd[i]); kfree(blkcg); + } } static struct cgroup_subsys_state * @@ -1162,8 +1167,6 @@ void blkcg_deactivate_policy(struct request_queue *q, kfree(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; - kfree(blkg->blkcg->pd[pol->plid]); - blkg->blkcg->pd[pol->plid] = NULL; spin_unlock(&blkg->blkcg->lock); } From 838f13bf4b6737d4aec508558e45f81798fc2677 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jul 2015 16:39:47 -0400 Subject: [PATCH 04/10] blkcg: allow blkcg_pol_mutex to be grabbed from cgroup [file] methods blkcg_pol_mutex primarily protects the blkcg_policy array. It also protects cgroup file type [un]registration during policy addition / removal. This puts blkcg_pol_mutex outside cgroup internal synchronization and in turn makes it impossible to grab from blkcg's cgroup methods as that leads to cyclic dependency. Another problematic dependency arising from this is through cgroup interface file deactivation. Removing a cftype requires removing all files of the type which in turn involves draining all on-going invocations of the file methods. This means that an interface file implementation can't grab blkcg_pol_mutex as draining can lead to AA deadlock. blkcg_reset_stats() is already in this situation. It currently trylocks blkcg_pol_mutex and then unwinds and retries the whole operation on failure, which is cumbersome at best. It has a lengthy comment explaining how cgroup internal synchronization is involved and expected to be updated but as explained above this doesn't need cgroup internal locking to deadlock. It's a self-contained AA deadlock. The described circular dependencies can be easily broken by moving cftype [un]registration out of blkcg_pol_mutex and protect them with an outer mutex. This patch introduces blkcg_pol_register_mutex which wraps entire policy [un]registration including cftype operations and shrinks blkcg_pol_mutex critical section. This also makes the trylock dancing in blkcg_reset_stats() unnecessary. Removed. This patch is necessary for the following blkcg_policy_data allocation bug fixes. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5e2723f2c6a3..2ff74ffcbb27 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -29,6 +29,14 @@ #define MAX_KEY_LEN 100 +/* + * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. + * blkcg_pol_register_mutex nests outside of it and synchronizes entire + * policy [un]register operations including cgroup file additions / + * removals. Putting cgroup file registration outside blkcg_pol_mutex + * allows grabbing it from cgroup callbacks. + */ +static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); struct blkcg blkcg_root; @@ -453,20 +461,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkcg_gq *blkg; int i; - /* - * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex - * which ends up putting cgroup's internal cgroup_tree_mutex under - * it; however, cgroup_tree_mutex is nested above cgroup file - * active protection and grabbing blkcg_pol_mutex from a cgroup - * file operation creates a possible circular dependency. cgroup - * internal locking is planned to go through further simplification - * and this issue should go away soon. For now, let's trylock - * blkcg_pol_mutex and restart the write on failure. - * - * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com - */ - if (!mutex_trylock(&blkcg_pol_mutex)) - return restart_syscall(); + mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* @@ -1190,6 +1185,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) return -EINVAL; + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ @@ -1198,19 +1194,23 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) - goto out_unlock; + goto err_unlock; /* register and update blkgs */ pol->plid = i; blkcg_policy[i] = pol; + mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ if (pol->cftypes) WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, pol->cftypes)); - ret = 0; -out_unlock: + mutex_unlock(&blkcg_pol_register_mutex); + return 0; + +err_unlock: mutex_unlock(&blkcg_pol_mutex); + mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); @@ -1223,7 +1223,7 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register); */ void blkcg_policy_unregister(struct blkcg_policy *pol) { - mutex_lock(&blkcg_pol_mutex); + mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; @@ -1233,8 +1233,10 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) cgroup_rm_cftypes(pol->cftypes); /* unregister and update blkgs */ + mutex_lock(&blkcg_pol_mutex); blkcg_policy[pol->plid] = NULL; -out_unlock: mutex_unlock(&blkcg_pol_mutex); +out_unlock: + mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); From 144232b34258c1fc19729e077c6fb161e30da07b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jul 2015 16:39:48 -0400 Subject: [PATCH 05/10] blkcg: blkcg_css_alloc() should grab blkcg_pol_mutex while iterating blkcg_policy[] An entry in blkcg_policy[] is stable while there are non-bypassing in-flight IOs on a request_queue which has the policy activated. This is why most derefs of blkcg_policy[] don't need explicit locking; however, blkcg_css_alloc() isn't invoked from IO path and thus doesn't have this protection and may race policies being added and removed. Fix it by adding explicit blkcg_pol_mutex protection around blkcg_policy[] iteration in blkcg_css_alloc(). Signed-off-by: Tejun Heo Fixes: e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg data") Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2ff74ffcbb27..05b893de516b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -844,6 +844,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) goto free_blkcg; } + mutex_lock(&blkcg_pol_mutex); + for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; @@ -860,6 +862,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) BUG_ON(blkcg->pd[i]); cpd = kzalloc(pol->cpd_size, GFP_KERNEL); if (!cpd) { + mutex_unlock(&blkcg_pol_mutex); ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; } @@ -868,6 +871,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) pol->cpd_init_fn(blkcg); } + mutex_unlock(&blkcg_pol_mutex); done: spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); From 7876f930d0e78addc6bbdbba0d6c196a0788d545 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jul 2015 16:39:49 -0400 Subject: [PATCH 06/10] blkcg: implement all_blkcgs list Add all_blkcgs list goes through blkcg->all_blkcgs_node and is protected by blkcg_pol_mutex. This will be used to fix blkcg_policy_data allocation bug. Signed-off-by: Tejun Heo Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 17 ++++++++++++----- include/linux/blk-cgroup.h | 1 + 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 05b893de516b..42ff436ffaf4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -46,6 +46,8 @@ struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; +static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -817,6 +819,10 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + mutex_lock(&blkcg_pol_mutex); + list_del(&blkcg->all_blkcgs_node); + mutex_unlock(&blkcg_pol_mutex); + if (blkcg != &blkcg_root) { int i; @@ -833,6 +839,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) struct cgroup_subsys_state *ret; int i; + mutex_lock(&blkcg_pol_mutex); + if (!parent_css) { blkcg = &blkcg_root; goto done; @@ -844,8 +852,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) goto free_blkcg; } - mutex_lock(&blkcg_pol_mutex); - for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; @@ -862,7 +868,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) BUG_ON(blkcg->pd[i]); cpd = kzalloc(pol->cpd_size, GFP_KERNEL); if (!cpd) { - mutex_unlock(&blkcg_pol_mutex); ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; } @@ -871,7 +876,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) pol->cpd_init_fn(blkcg); } - mutex_unlock(&blkcg_pol_mutex); done: spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); @@ -879,14 +883,17 @@ done: #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); #endif + list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); + + mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; free_pd_blkcg: for (i--; i >= 0; i--) kfree(blkcg->pd[i]); - free_blkcg: kfree(blkcg); + mutex_unlock(&blkcg_pol_mutex); return ret; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 58cfab80dd70..cf3e7bc22ef3 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -47,6 +47,7 @@ struct blkcg { struct blkcg_policy_data *pd[BLKCG_MAX_POLS]; + struct list_head all_blkcgs_node; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; #endif From 06b285bd11257bccc5a1b85a835507e33656aff2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jul 2015 16:39:50 -0400 Subject: [PATCH 07/10] blkcg: fix blkcg_policy_data allocation bug e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg data") updated per-blkcg policy data to be dynamically allocated. When a policy is registered, its policy data aren't created. Instead, when the policy is activated on a queue, the policy data are allocated if there are blkg's (blkcg_gq's) which are attached to a given blkcg. This is buggy. Consider the following scenario. 1. A blkcg is created. No blkg's attached yet. 2. The policy is registered. No policy data is allocated. 3. The policy is activated on a queue. As the above blkcg doesn't have any blkg's, it won't allocate the matching blkcg_policy_data. 4. An IO is issued from the blkcg and blkg is created and the blkcg still doesn't have the matching policy data allocated. With cfq-iosched, this leads to an oops. It also doesn't free policy data on policy unregistration assuming that freeing of all policy data on blkcg destruction should take care of it; however, this also is incorrect. 1. A blkcg has policy data. 2. The policy gets unregistered but the policy data remains. 3. Another policy gets registered on the same slot. 4. Later, the new policy tries to allocate policy data on the previous blkcg but the slot is already occupied and gets skipped. The policy ends up operating on the policy data of the previous policy. There's no reason to manage blkcg_policy_data lazily. The reason we do lazy allocation of blkg's is that the number of all possible blkg's is the product of cgroups and block devices which can reach a surprising level. blkcg_policy_data is contrained by the number of cgroups and shouldn't be a problem. This patch makes blkcg_policy_data to be allocated for all existing blkcg's on policy registration and freed on unregistration and removes blkcg_policy_data handling from policy [de]activation paths. This makes that blkcg_policy_data are created and removed with the policy they belong to and fixes the above described problems. Signed-off-by: Tejun Heo Fixes: e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg data") Cc: Vivek Goyal Cc: Arianna Avanzini Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 78 ++++++++++++++++++++------------------ include/linux/blk-cgroup.h | 10 +---- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 42ff436ffaf4..9da02c021ebe 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1048,10 +1048,8 @@ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { LIST_HEAD(pds); - LIST_HEAD(cpds); struct blkcg_gq *blkg; struct blkg_policy_data *pd, *nd; - struct blkcg_policy_data *cpd, *cnd; int cnt = 0, ret; if (blkcg_policy_enabled(q, pol)) @@ -1064,10 +1062,7 @@ int blkcg_activate_policy(struct request_queue *q, cnt++; spin_unlock_irq(q->queue_lock); - /* - * Allocate per-blkg and per-blkcg policy data - * for all existing blkgs. - */ + /* allocate per-blkg policy data for all existing blkgs */ while (cnt--) { pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); if (!pd) { @@ -1075,15 +1070,6 @@ int blkcg_activate_policy(struct request_queue *q, goto out_free; } list_add_tail(&pd->alloc_node, &pds); - - if (!pol->cpd_size) - continue; - cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node); - if (!cpd) { - ret = -ENOMEM; - goto out_free; - } - list_add_tail(&cpd->alloc_node, &cpds); } /* @@ -1093,32 +1079,17 @@ int blkcg_activate_policy(struct request_queue *q, spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds)) || - WARN_ON(pol->cpd_size && list_empty(&cpds))) { + if (WARN_ON(list_empty(&pds))) { /* umm... this shouldn't happen, just abort */ ret = -ENOMEM; goto out_unlock; } - cpd = list_first_entry(&cpds, struct blkcg_policy_data, - alloc_node); - list_del_init(&cpd->alloc_node); pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); list_del_init(&pd->alloc_node); /* grab blkcg lock too while installing @pd on @blkg */ spin_lock(&blkg->blkcg->lock); - if (!pol->cpd_size) - goto no_cpd; - if (!blkg->blkcg->pd[pol->plid]) { - /* Per-policy per-blkcg data */ - blkg->blkcg->pd[pol->plid] = cpd; - cpd->plid = pol->plid; - pol->cpd_init_fn(blkg->blkcg); - } else { /* must free it as it has already been extracted */ - kfree(cpd); - } -no_cpd: blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; @@ -1135,8 +1106,6 @@ out_free: blk_queue_bypass_end(q); list_for_each_entry_safe(pd, nd, &pds, alloc_node) kfree(pd); - list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node) - kfree(cpd); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1191,6 +1160,7 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); */ int blkcg_policy_register(struct blkcg_policy *pol) { + struct blkcg *blkcg; int i, ret; if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) @@ -1207,9 +1177,27 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (i >= BLKCG_MAX_POLS) goto err_unlock; - /* register and update blkgs */ + /* register @pol */ pol->plid = i; - blkcg_policy[i] = pol; + blkcg_policy[pol->plid] = pol; + + /* allocate and install cpd's */ + if (pol->cpd_size) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + struct blkcg_policy_data *cpd; + + cpd = kzalloc(pol->cpd_size, GFP_KERNEL); + if (!cpd) { + mutex_unlock(&blkcg_pol_mutex); + goto err_free_cpds; + } + + blkcg->pd[pol->plid] = cpd; + cpd->plid = pol->plid; + pol->cpd_init_fn(blkcg); + } + } + mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ @@ -1219,6 +1207,14 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_register_mutex); return 0; +err_free_cpds: + if (pol->cpd_size) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + kfree(blkcg->pd[pol->plid]); + blkcg->pd[pol->plid] = NULL; + } + } + blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_register_mutex); @@ -1234,6 +1230,8 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register); */ void blkcg_policy_unregister(struct blkcg_policy *pol) { + struct blkcg *blkcg; + mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) @@ -1243,9 +1241,17 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) if (pol->cftypes) cgroup_rm_cftypes(pol->cftypes); - /* unregister and update blkgs */ + /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); + + if (pol->cpd_size) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + kfree(blkcg->pd[pol->plid]); + blkcg->pd[pol->plid] = NULL; + } + } blkcg_policy[pol->plid] = NULL; + mutex_unlock(&blkcg_pol_mutex); out_unlock: mutex_unlock(&blkcg_pol_register_mutex); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index cf3e7bc22ef3..1b62d768c7df 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -89,18 +89,12 @@ struct blkg_policy_data { * Policies that need to keep per-blkcg data which is independent * from any request_queue associated to it must specify its size * with the cpd_size field of the blkcg_policy structure and - * embed a blkcg_policy_data in it. blkcg core allocates - * policy-specific per-blkcg structures lazily the first time - * they are actually needed, so it handles them together with - * blkgs. cpd_init() is invoked to let each policy handle - * per-blkcg data. + * embed a blkcg_policy_data in it. cpd_init() is invoked to let + * each policy handle per-blkcg data. */ struct blkcg_policy_data { /* the policy id this per-policy data belongs to */ int plid; - - /* used during policy activation */ - struct list_head alloc_node; }; /* association between a blk cgroup and a request queue */ From 77b5a08427e87514c33730afc18cd02c9475e2c3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Mar 2015 08:37:46 -0700 Subject: [PATCH 08/10] bcache: don't embed 'return' statements in closure macros This is horribly confusing, it breaks the flow of the code without it being apparent in the caller. Signed-off-by: Jens Axboe Acked-by: Christoph Hellwig --- drivers/md/bcache/closure.h | 3 --- drivers/md/bcache/io.c | 1 + drivers/md/bcache/journal.c | 2 ++ drivers/md/bcache/request.c | 14 +++++++++++--- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index a08e3eeac3c5..79a6d63e8ed3 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -320,7 +320,6 @@ static inline void closure_wake_up(struct closure_waitlist *list) do { \ set_closure_fn(_cl, _fn, _wq); \ closure_sub(_cl, CLOSURE_RUNNING + 1); \ - return; \ } while (0) /** @@ -349,7 +348,6 @@ do { \ do { \ set_closure_fn(_cl, _fn, _wq); \ closure_queue(_cl); \ - return; \ } while (0) /** @@ -365,7 +363,6 @@ do { \ do { \ set_closure_fn(_cl, _destructor, NULL); \ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ - return; \ } while (0) /** diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c index cb64e64a4789..bf6a9ca18403 100644 --- a/drivers/md/bcache/io.c +++ b/drivers/md/bcache/io.c @@ -105,6 +105,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) } while (n != bio); continue_at(&s->cl, bch_bio_submit_split_done, NULL); + return; submit: generic_make_request(bio); } diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index ce64fc851251..418607a6ba33 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -592,12 +592,14 @@ static void journal_write_unlocked(struct closure *cl) if (!w->need_write) { closure_return_with_destructor(cl, journal_write_unlock); + return; } else if (journal_full(&c->journal)) { journal_reclaim(c); spin_unlock(&c->journal.lock); btree_flush_write(c); continue_at(cl, journal_write, system_wq); + return; } c->journal.blocks_free -= set_blocks(w->data, block_bytes(c)); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 4afb2d26b148..f292790997d7 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -88,8 +88,10 @@ static void bch_data_insert_keys(struct closure *cl) if (journal_ref) atomic_dec_bug(journal_ref); - if (!op->insert_data_done) + if (!op->insert_data_done) { continue_at(cl, bch_data_insert_start, op->wq); + return; + } bch_keylist_free(&op->insert_keys); closure_return(cl); @@ -216,8 +218,10 @@ static void bch_data_insert_start(struct closure *cl) /* 1 for the device pointer and 1 for the chksum */ if (bch_keylist_realloc(&op->insert_keys, 3 + (op->csum ? 1 : 0), - op->c)) + op->c)) { continue_at(cl, bch_data_insert_keys, op->wq); + return; + } k = op->insert_keys.top; bkey_init(k); @@ -255,6 +259,7 @@ static void bch_data_insert_start(struct closure *cl) op->insert_data_done = true; continue_at(cl, bch_data_insert_keys, op->wq); + return; err: /* bch_alloc_sectors() blocks if s->writeback = true */ BUG_ON(op->writeback); @@ -576,8 +581,10 @@ static void cache_lookup(struct closure *cl) ret = bch_btree_map_keys(&s->op, s->iop.c, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0), cache_lookup_fn, MAP_END_KEY); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { continue_at(cl, cache_lookup, bcache_wq); + return; + } closure_return(cl); } @@ -1085,6 +1092,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio) continue_at_nobarrier(&s->cl, flash_dev_nodata, bcache_wq); + return; } else if (rw) { bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &KEY(d->id, bio->bi_iter.bi_sector, 0), From 7bee607472aa2e5a36dfe143e5a625be06125f53 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 Jul 2015 11:57:48 -0600 Subject: [PATCH 09/10] NVMe: Reread partitions on metadata formats This patch has the driver automatically reread partitions if a namespace has a separate metadata format. Previously revalidating a disk was sufficient to get the correct capacity set on such formatted drives, but partitions that may exist would not have been surfaced. Reported-by: Paul Grabinar Signed-off-by: Keith Busch Cc: Matthew Wilcox Tested-by: Paul Grabinar Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d1d6141920d3..7920c2741b47 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -2108,8 +2108,17 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) goto out_free_disk; add_disk(ns->disk); - if (ns->ms) - revalidate_disk(ns->disk); + if (ns->ms) { + struct block_device *bd = bdget_disk(ns->disk, 0); + if (!bd) + return; + if (blkdev_get(bd, FMODE_READ, NULL)) { + bdput(bd); + return; + } + blkdev_reread_part(bd); + blkdev_put(bd, FMODE_READ); + } return; out_free_disk: kfree(disk); From e56f698bd0720e17f10f39e8b0b5b446ad0ab22c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 16 Jul 2015 19:53:22 +0800 Subject: [PATCH 10/10] blk-mq: set default timeout as 30 seconds It is reasonable to set default timeout of request as 30 seconds instead of 30000 ticks, which may be 300 seconds if HZ is 100, for example, some arm64 based systems may choose 100 HZ. Signed-off-by: Ming Lei Fixes: c76cbbcf4044 ("blk-mq: put blk_queue_rq_timeout together in blk_mq_init_queue()" Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index f53779692c77..7d842db59699 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1998,7 +1998,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, goto err_hctxs; setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); - blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30000); + blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; q->nr_hw_queues = set->nr_hw_queues;