From bb8bd38b9a1685334b73e8c62e128cbedb875867 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 1 Jul 2015 12:57:40 -0400
Subject: [PATCH 01/10] bio integrity: do not assume bio_integrity_pool exists
 if bioset exists

bio_integrity_alloc() and bio_integrity_free() assume that if a bio was
allocated from a bioset that that bioset also had its bio_integrity_pool
allocated using bioset_integrity_create().  This is a very bad
assumption given that bioset_create() and bioset_integrity_create() are
completely disjoint.  Not all callers of bioset_create() have been
trained to also call bioset_integrity_create() -- and they may not care
to be.

Fix this by falling back to kmalloc'ing 'struct bio_integrity_payload'
rather than force all bioset consumers to (wastefully) preallocate a
bio_integrity_pool that they very likely won't actually need (given the
niche nature of the current block integrity support).

Otherwise, a NULL pointer "Kernel BUG" with a trace like the following
will be observed (as seen on s390x using zfcp storage) because dm-io
doesn't use bioset_integrity_create() when creating its bioset:

    [  791.643338] Call Trace:
    [  791.643339] ([<00000003df98b848>] 0x3df98b848)
    [  791.643341]  [<00000000002c5de8>] bio_integrity_alloc+0x48/0xf8
    [  791.643348]  [<00000000002c6486>] bio_integrity_prep+0xae/0x2f0
    [  791.643349]  [<0000000000371e38>] blk_queue_bio+0x1c8/0x3d8
    [  791.643355]  [<000000000036f8d0>] generic_make_request+0xc0/0x100
    [  791.643357]  [<000000000036f9b2>] submit_bio+0xa2/0x198
    [  791.643406]  [<000003ff801f9774>] dispatch_io+0x15c/0x3b0 [dm_mod]
    [  791.643419]  [<000003ff801f9b3e>] dm_io+0x176/0x2f0 [dm_mod]
    [  791.643423]  [<000003ff8074b28a>] do_reads+0x13a/0x1a8 [dm_mirror]
    [  791.643425]  [<000003ff8074b43a>] do_mirror+0x142/0x298 [dm_mirror]
    [  791.643428]  [<0000000000154fca>] process_one_work+0x18a/0x3f8
    [  791.643432]  [<000000000015598a>] worker_thread+0x132/0x3b0
    [  791.643435]  [<000000000015d49a>] kthread+0xd2/0xd8
    [  791.643438]  [<00000000005bc0ca>] kernel_thread_starter+0x6/0xc
    [  791.643446]  [<00000000005bc0c4>] kernel_thread_starter+0x0/0xc

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio-integrity.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 0436c21db7f2..719b7152aed1 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -51,7 +51,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 	unsigned long idx = BIO_POOL_NONE;
 	unsigned inline_vecs;
 
-	if (!bs) {
+	if (!bs || !bs->bio_integrity_pool) {
 		bip = kmalloc(sizeof(struct bio_integrity_payload) +
 			      sizeof(struct bio_vec) * nr_vecs, gfp_mask);
 		inline_vecs = nr_vecs;
@@ -104,7 +104,7 @@ void bio_integrity_free(struct bio *bio)
 		kfree(page_address(bip->bip_vec->bv_page) +
 		      bip->bip_vec->bv_offset);
 
-	if (bs) {
+	if (bs && bs->bio_integrity_pool) {
 		if (bip->bip_slab != BIO_POOL_NONE)
 			bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
 				  bip->bip_slab);

From 0762b23d23c1f23beab91a3af0fa89749b75f03c Mon Sep 17 00:00:00 2001
From: Maninder Singh <maninder1.s@samsung.com>
Date: Tue, 7 Jul 2015 12:41:07 +0530
Subject: [PATCH 02/10] block: use FIELD_SIZEOF to calculate size of a field

use FIELD_SIZEOF instead of open coding

Signed-off-by: Maninder Singh <maninder1.s@samsung.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 82819e68f58b..627ed0c593fb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3370,7 +3370,7 @@ EXPORT_SYMBOL(blk_post_runtime_resume);
 int __init blk_dev_init(void)
 {
 	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
-			sizeof(((struct request *)0)->cmd_flags));
+			FIELD_SIZEOF(struct request, cmd_flags));
 
 	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
 	kblockd_workqueue = alloc_workqueue("kblockd",

From a322baad1003798312741b0cb97bd2c7511ccf61 Mon Sep 17 00:00:00 2001
From: Arianna Avanzini <avanzini.arianna@gmail.com>
Date: Tue, 7 Jul 2015 03:08:15 +0200
Subject: [PATCH 03/10] block/blk-cgroup.c: free per-blkcg data when freeing
 the blkcg

Currently, per-blkcg data is freed each time a policy is deactivated,
that is also upon scheduler switch. However, when switching from a
scheduler implementing a policy which requires per-blkcg data to
another one, that same policy might be active on other devices, and
therefore those same per-blkcg data could be still in use.
This commit lets per-blkcg data be freed when the blkcg is freed
instead of on policy deactivation.

Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com>
Reported-and-tested-by: Michael Kaminsky <kaminsky@cs.cmu.edu>
Fixes: e48453c3 ("block, cgroup: implement policy-specific per-blkcg data")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9f97da52d006..5e2723f2c6a3 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -822,8 +822,13 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 
-	if (blkcg != &blkcg_root)
+	if (blkcg != &blkcg_root) {
+		int i;
+
+		for (i = 0; i < BLKCG_MAX_POLS; i++)
+			kfree(blkcg->pd[i]);
 		kfree(blkcg);
+	}
 }
 
 static struct cgroup_subsys_state *
@@ -1162,8 +1167,6 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
 		kfree(blkg->pd[pol->plid]);
 		blkg->pd[pol->plid] = NULL;
-		kfree(blkg->blkcg->pd[pol->plid]);
-		blkg->blkcg->pd[pol->plid] = NULL;
 
 		spin_unlock(&blkg->blkcg->lock);
 	}

From 838f13bf4b6737d4aec508558e45f81798fc2677 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Jul 2015 16:39:47 -0400
Subject: [PATCH 04/10] blkcg: allow blkcg_pol_mutex to be grabbed from cgroup
 [file] methods

blkcg_pol_mutex primarily protects the blkcg_policy array.  It also
protects cgroup file type [un]registration during policy addition /
removal.  This puts blkcg_pol_mutex outside cgroup internal
synchronization and in turn makes it impossible to grab from blkcg's
cgroup methods as that leads to cyclic dependency.

Another problematic dependency arising from this is through cgroup
interface file deactivation.  Removing a cftype requires removing all
files of the type which in turn involves draining all on-going
invocations of the file methods.  This means that an interface file
implementation can't grab blkcg_pol_mutex as draining can lead to AA
deadlock.

blkcg_reset_stats() is already in this situation.  It currently
trylocks blkcg_pol_mutex and then unwinds and retries the whole
operation on failure, which is cumbersome at best.  It has a lengthy
comment explaining how cgroup internal synchronization is involved and
expected to be updated but as explained above this doesn't need cgroup
internal locking to deadlock.  It's a self-contained AA deadlock.

The described circular dependencies can be easily broken by moving
cftype [un]registration out of blkcg_pol_mutex and protect them with
an outer mutex.  This patch introduces blkcg_pol_register_mutex which
wraps entire policy [un]registration including cftype operations and
shrinks blkcg_pol_mutex critical section.  This also makes the trylock
dancing in blkcg_reset_stats() unnecessary.  Removed.

This patch is necessary for the following blkcg_policy_data allocation
bug fixes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5e2723f2c6a3..2ff74ffcbb27 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -29,6 +29,14 @@
 
 #define MAX_KEY_LEN 100
 
+/*
+ * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
+ * blkcg_pol_register_mutex nests outside of it and synchronizes entire
+ * policy [un]register operations including cgroup file additions /
+ * removals.  Putting cgroup file registration outside blkcg_pol_mutex
+ * allows grabbing it from cgroup callbacks.
+ */
+static DEFINE_MUTEX(blkcg_pol_register_mutex);
 static DEFINE_MUTEX(blkcg_pol_mutex);
 
 struct blkcg blkcg_root;
@@ -453,20 +461,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 	struct blkcg_gq *blkg;
 	int i;
 
-	/*
-	 * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex
-	 * which ends up putting cgroup's internal cgroup_tree_mutex under
-	 * it; however, cgroup_tree_mutex is nested above cgroup file
-	 * active protection and grabbing blkcg_pol_mutex from a cgroup
-	 * file operation creates a possible circular dependency.  cgroup
-	 * internal locking is planned to go through further simplification
-	 * and this issue should go away soon.  For now, let's trylock
-	 * blkcg_pol_mutex and restart the write on failure.
-	 *
-	 * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com
-	 */
-	if (!mutex_trylock(&blkcg_pol_mutex))
-		return restart_syscall();
+	mutex_lock(&blkcg_pol_mutex);
 	spin_lock_irq(&blkcg->lock);
 
 	/*
@@ -1190,6 +1185,7 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
 		return -EINVAL;
 
+	mutex_lock(&blkcg_pol_register_mutex);
 	mutex_lock(&blkcg_pol_mutex);
 
 	/* find an empty slot */
@@ -1198,19 +1194,23 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 		if (!blkcg_policy[i])
 			break;
 	if (i >= BLKCG_MAX_POLS)
-		goto out_unlock;
+		goto err_unlock;
 
 	/* register and update blkgs */
 	pol->plid = i;
 	blkcg_policy[i] = pol;
+	mutex_unlock(&blkcg_pol_mutex);
 
 	/* everything is in place, add intf files for the new policy */
 	if (pol->cftypes)
 		WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
 						  pol->cftypes));
-	ret = 0;
-out_unlock:
+	mutex_unlock(&blkcg_pol_register_mutex);
+	return 0;
+
+err_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
+	mutex_unlock(&blkcg_pol_register_mutex);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_register);
@@ -1223,7 +1223,7 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
  */
 void blkcg_policy_unregister(struct blkcg_policy *pol)
 {
-	mutex_lock(&blkcg_pol_mutex);
+	mutex_lock(&blkcg_pol_register_mutex);
 
 	if (WARN_ON(blkcg_policy[pol->plid] != pol))
 		goto out_unlock;
@@ -1233,8 +1233,10 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 		cgroup_rm_cftypes(pol->cftypes);
 
 	/* unregister and update blkgs */
+	mutex_lock(&blkcg_pol_mutex);
 	blkcg_policy[pol->plid] = NULL;
-out_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
+out_unlock:
+	mutex_unlock(&blkcg_pol_register_mutex);
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

From 144232b34258c1fc19729e077c6fb161e30da07b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Jul 2015 16:39:48 -0400
Subject: [PATCH 05/10] blkcg: blkcg_css_alloc() should grab blkcg_pol_mutex
 while iterating blkcg_policy[]

An entry in blkcg_policy[] is stable while there are non-bypassing
in-flight IOs on a request_queue which has the policy activated.  This
is why most derefs of blkcg_policy[] don't need explicit locking;
however, blkcg_css_alloc() isn't invoked from IO path and thus doesn't
have this protection and may race policies being added and removed.

Fix it by adding explicit blkcg_pol_mutex protection around
blkcg_policy[] iteration in blkcg_css_alloc().

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg data")
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2ff74ffcbb27..05b893de516b 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -844,6 +844,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		goto free_blkcg;
 	}
 
+	mutex_lock(&blkcg_pol_mutex);
+
 	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkcg_policy_data *cpd;
@@ -860,6 +862,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		BUG_ON(blkcg->pd[i]);
 		cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
 		if (!cpd) {
+			mutex_unlock(&blkcg_pol_mutex);
 			ret = ERR_PTR(-ENOMEM);
 			goto free_pd_blkcg;
 		}
@@ -868,6 +871,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		pol->cpd_init_fn(blkcg);
 	}
 
+	mutex_unlock(&blkcg_pol_mutex);
 done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);

From 7876f930d0e78addc6bbdbba0d6c196a0788d545 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Jul 2015 16:39:49 -0400
Subject: [PATCH 06/10] blkcg: implement all_blkcgs list

Add all_blkcgs list goes through blkcg->all_blkcgs_node and is
protected by blkcg_pol_mutex.  This will be used to fix
blkcg_policy_data allocation bug.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 17 ++++++++++++-----
 include/linux/blk-cgroup.h |  1 +
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 05b893de516b..42ff436ffaf4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -46,6 +46,8 @@ struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
 
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
+static LIST_HEAD(all_blkcgs);		/* protected by blkcg_pol_mutex */
+
 static bool blkcg_policy_enabled(struct request_queue *q,
 				 const struct blkcg_policy *pol)
 {
@@ -817,6 +819,10 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
 	struct blkcg *blkcg = css_to_blkcg(css);
 
+	mutex_lock(&blkcg_pol_mutex);
+	list_del(&blkcg->all_blkcgs_node);
+	mutex_unlock(&blkcg_pol_mutex);
+
 	if (blkcg != &blkcg_root) {
 		int i;
 
@@ -833,6 +839,8 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 	struct cgroup_subsys_state *ret;
 	int i;
 
+	mutex_lock(&blkcg_pol_mutex);
+
 	if (!parent_css) {
 		blkcg = &blkcg_root;
 		goto done;
@@ -844,8 +852,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		goto free_blkcg;
 	}
 
-	mutex_lock(&blkcg_pol_mutex);
-
 	for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkcg_policy_data *cpd;
@@ -862,7 +868,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		BUG_ON(blkcg->pd[i]);
 		cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
 		if (!cpd) {
-			mutex_unlock(&blkcg_pol_mutex);
 			ret = ERR_PTR(-ENOMEM);
 			goto free_pd_blkcg;
 		}
@@ -871,7 +876,6 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 		pol->cpd_init_fn(blkcg);
 	}
 
-	mutex_unlock(&blkcg_pol_mutex);
 done:
 	spin_lock_init(&blkcg->lock);
 	INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
@@ -879,14 +883,17 @@ done:
 #ifdef CONFIG_CGROUP_WRITEBACK
 	INIT_LIST_HEAD(&blkcg->cgwb_list);
 #endif
+	list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
+
+	mutex_unlock(&blkcg_pol_mutex);
 	return &blkcg->css;
 
 free_pd_blkcg:
 	for (i--; i >= 0; i--)
 		kfree(blkcg->pd[i]);
-
 free_blkcg:
 	kfree(blkcg);
+	mutex_unlock(&blkcg_pol_mutex);
 	return ret;
 }
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 58cfab80dd70..cf3e7bc22ef3 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -47,6 +47,7 @@ struct blkcg {
 
 	struct blkcg_policy_data	*pd[BLKCG_MAX_POLS];
 
+	struct list_head		all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct list_head		cgwb_list;
 #endif

From 06b285bd11257bccc5a1b85a835507e33656aff2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 9 Jul 2015 16:39:50 -0400
Subject: [PATCH 07/10] blkcg: fix blkcg_policy_data allocation bug

e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg
data") updated per-blkcg policy data to be dynamically allocated.
When a policy is registered, its policy data aren't created.  Instead,
when the policy is activated on a queue, the policy data are allocated
if there are blkg's (blkcg_gq's) which are attached to a given blkcg.
This is buggy.  Consider the following scenario.

1. A blkcg is created.  No blkg's attached yet.

2. The policy is registered.  No policy data is allocated.

3. The policy is activated on a queue.  As the above blkcg doesn't
   have any blkg's, it won't allocate the matching blkcg_policy_data.

4. An IO is issued from the blkcg and blkg is created and the blkcg
   still doesn't have the matching policy data allocated.

With cfq-iosched, this leads to an oops.

It also doesn't free policy data on policy unregistration assuming
that freeing of all policy data on blkcg destruction should take care
of it; however, this also is incorrect.

1. A blkcg has policy data.

2. The policy gets unregistered but the policy data remains.

3. Another policy gets registered on the same slot.

4. Later, the new policy tries to allocate policy data on the previous
   blkcg but the slot is already occupied and gets skipped.  The
   policy ends up operating on the policy data of the previous policy.

There's no reason to manage blkcg_policy_data lazily.  The reason we
do lazy allocation of blkg's is that the number of all possible blkg's
is the product of cgroups and block devices which can reach a
surprising level.  blkcg_policy_data is contrained by the number of
cgroups and shouldn't be a problem.

This patch makes blkcg_policy_data to be allocated for all existing
blkcg's on policy registration and freed on unregistration and removes
blkcg_policy_data handling from policy [de]activation paths.  This
makes that blkcg_policy_data are created and removed with the policy
they belong to and fixes the above described problems.

Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: e48453c386f3 ("block, cgroup: implement policy-specific per-blkcg data")
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Arianna Avanzini <avanzini.arianna@gmail.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c         | 78 ++++++++++++++++++++------------------
 include/linux/blk-cgroup.h | 10 +----
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 42ff436ffaf4..9da02c021ebe 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1048,10 +1048,8 @@ int blkcg_activate_policy(struct request_queue *q,
 			  const struct blkcg_policy *pol)
 {
 	LIST_HEAD(pds);
-	LIST_HEAD(cpds);
 	struct blkcg_gq *blkg;
 	struct blkg_policy_data *pd, *nd;
-	struct blkcg_policy_data *cpd, *cnd;
 	int cnt = 0, ret;
 
 	if (blkcg_policy_enabled(q, pol))
@@ -1064,10 +1062,7 @@ int blkcg_activate_policy(struct request_queue *q,
 		cnt++;
 	spin_unlock_irq(q->queue_lock);
 
-	/*
-	 * Allocate per-blkg and per-blkcg policy data
-	 * for all existing blkgs.
-	 */
+	/* allocate per-blkg policy data for all existing blkgs */
 	while (cnt--) {
 		pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
 		if (!pd) {
@@ -1075,15 +1070,6 @@ int blkcg_activate_policy(struct request_queue *q,
 			goto out_free;
 		}
 		list_add_tail(&pd->alloc_node, &pds);
-
-		if (!pol->cpd_size)
-			continue;
-		cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
-		if (!cpd) {
-			ret = -ENOMEM;
-			goto out_free;
-		}
-		list_add_tail(&cpd->alloc_node, &cpds);
 	}
 
 	/*
@@ -1093,32 +1079,17 @@ int blkcg_activate_policy(struct request_queue *q,
 	spin_lock_irq(q->queue_lock);
 
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		if (WARN_ON(list_empty(&pds)) ||
-		    WARN_ON(pol->cpd_size && list_empty(&cpds))) {
+		if (WARN_ON(list_empty(&pds))) {
 			/* umm... this shouldn't happen, just abort */
 			ret = -ENOMEM;
 			goto out_unlock;
 		}
-		cpd = list_first_entry(&cpds, struct blkcg_policy_data,
-				       alloc_node);
-		list_del_init(&cpd->alloc_node);
 		pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
 		list_del_init(&pd->alloc_node);
 
 		/* grab blkcg lock too while installing @pd on @blkg */
 		spin_lock(&blkg->blkcg->lock);
 
-		if (!pol->cpd_size)
-			goto no_cpd;
-		if (!blkg->blkcg->pd[pol->plid]) {
-			/* Per-policy per-blkcg data */
-			blkg->blkcg->pd[pol->plid] = cpd;
-			cpd->plid = pol->plid;
-			pol->cpd_init_fn(blkg->blkcg);
-		} else { /* must free it as it has already been extracted */
-			kfree(cpd);
-		}
-no_cpd:
 		blkg->pd[pol->plid] = pd;
 		pd->blkg = blkg;
 		pd->plid = pol->plid;
@@ -1135,8 +1106,6 @@ out_free:
 	blk_queue_bypass_end(q);
 	list_for_each_entry_safe(pd, nd, &pds, alloc_node)
 		kfree(pd);
-	list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
-		kfree(cpd);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1191,6 +1160,7 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
  */
 int blkcg_policy_register(struct blkcg_policy *pol)
 {
+	struct blkcg *blkcg;
 	int i, ret;
 
 	if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
@@ -1207,9 +1177,27 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	if (i >= BLKCG_MAX_POLS)
 		goto err_unlock;
 
-	/* register and update blkgs */
+	/* register @pol */
 	pol->plid = i;
-	blkcg_policy[i] = pol;
+	blkcg_policy[pol->plid] = pol;
+
+	/* allocate and install cpd's */
+	if (pol->cpd_size) {
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+			struct blkcg_policy_data *cpd;
+
+			cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
+			if (!cpd) {
+				mutex_unlock(&blkcg_pol_mutex);
+				goto err_free_cpds;
+			}
+
+			blkcg->pd[pol->plid] = cpd;
+			cpd->plid = pol->plid;
+			pol->cpd_init_fn(blkcg);
+		}
+	}
+
 	mutex_unlock(&blkcg_pol_mutex);
 
 	/* everything is in place, add intf files for the new policy */
@@ -1219,6 +1207,14 @@ int blkcg_policy_register(struct blkcg_policy *pol)
 	mutex_unlock(&blkcg_pol_register_mutex);
 	return 0;
 
+err_free_cpds:
+	if (pol->cpd_size) {
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+			kfree(blkcg->pd[pol->plid]);
+			blkcg->pd[pol->plid] = NULL;
+		}
+	}
+	blkcg_policy[pol->plid] = NULL;
 err_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
 	mutex_unlock(&blkcg_pol_register_mutex);
@@ -1234,6 +1230,8 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
  */
 void blkcg_policy_unregister(struct blkcg_policy *pol)
 {
+	struct blkcg *blkcg;
+
 	mutex_lock(&blkcg_pol_register_mutex);
 
 	if (WARN_ON(blkcg_policy[pol->plid] != pol))
@@ -1243,9 +1241,17 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
 	if (pol->cftypes)
 		cgroup_rm_cftypes(pol->cftypes);
 
-	/* unregister and update blkgs */
+	/* remove cpds and unregister */
 	mutex_lock(&blkcg_pol_mutex);
+
+	if (pol->cpd_size) {
+		list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+			kfree(blkcg->pd[pol->plid]);
+			blkcg->pd[pol->plid] = NULL;
+		}
+	}
 	blkcg_policy[pol->plid] = NULL;
+
 	mutex_unlock(&blkcg_pol_mutex);
 out_unlock:
 	mutex_unlock(&blkcg_pol_register_mutex);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index cf3e7bc22ef3..1b62d768c7df 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -89,18 +89,12 @@ struct blkg_policy_data {
  * Policies that need to keep per-blkcg data which is independent
  * from any request_queue associated to it must specify its size
  * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it. blkcg core allocates
- * policy-specific per-blkcg structures lazily the first time
- * they are actually needed, so it handles them together with
- * blkgs. cpd_init() is invoked to let each policy handle
- * per-blkcg data.
+ * embed a blkcg_policy_data in it.  cpd_init() is invoked to let
+ * each policy handle per-blkcg data.
  */
 struct blkcg_policy_data {
 	/* the policy id this per-policy data belongs to */
 	int				plid;
-
-	/* used during policy activation */
-	struct list_head		alloc_node;
 };
 
 /* association between a blk cgroup and a request queue */

From 77b5a08427e87514c33730afc18cd02c9475e2c3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 6 Mar 2015 08:37:46 -0700
Subject: [PATCH 08/10] bcache: don't embed 'return' statements in closure
 macros

This is horribly confusing, it breaks the flow of the code without
it being apparent in the caller.

Signed-off-by: Jens Axboe <axboe@fb.com>
Acked-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/bcache/closure.h |  3 ---
 drivers/md/bcache/io.c      |  1 +
 drivers/md/bcache/journal.c |  2 ++
 drivers/md/bcache/request.c | 14 +++++++++++---
 4 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index a08e3eeac3c5..79a6d63e8ed3 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -320,7 +320,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
 do {									\
 	set_closure_fn(_cl, _fn, _wq);					\
 	closure_sub(_cl, CLOSURE_RUNNING + 1);				\
-	return;								\
 } while (0)
 
 /**
@@ -349,7 +348,6 @@ do {									\
 do {									\
 	set_closure_fn(_cl, _fn, _wq);					\
 	closure_queue(_cl);						\
-	return;								\
 } while (0)
 
 /**
@@ -365,7 +363,6 @@ do {									\
 do {									\
 	set_closure_fn(_cl, _destructor, NULL);				\
 	closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\
-	return;								\
 } while (0)
 
 /**
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index cb64e64a4789..bf6a9ca18403 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -105,6 +105,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
 	} while (n != bio);
 
 	continue_at(&s->cl, bch_bio_submit_split_done, NULL);
+	return;
 submit:
 	generic_make_request(bio);
 }
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ce64fc851251..418607a6ba33 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -592,12 +592,14 @@ static void journal_write_unlocked(struct closure *cl)
 
 	if (!w->need_write) {
 		closure_return_with_destructor(cl, journal_write_unlock);
+		return;
 	} else if (journal_full(&c->journal)) {
 		journal_reclaim(c);
 		spin_unlock(&c->journal.lock);
 
 		btree_flush_write(c);
 		continue_at(cl, journal_write, system_wq);
+		return;
 	}
 
 	c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 4afb2d26b148..f292790997d7 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -88,8 +88,10 @@ static void bch_data_insert_keys(struct closure *cl)
 	if (journal_ref)
 		atomic_dec_bug(journal_ref);
 
-	if (!op->insert_data_done)
+	if (!op->insert_data_done) {
 		continue_at(cl, bch_data_insert_start, op->wq);
+		return;
+	}
 
 	bch_keylist_free(&op->insert_keys);
 	closure_return(cl);
@@ -216,8 +218,10 @@ static void bch_data_insert_start(struct closure *cl)
 		/* 1 for the device pointer and 1 for the chksum */
 		if (bch_keylist_realloc(&op->insert_keys,
 					3 + (op->csum ? 1 : 0),
-					op->c))
+					op->c)) {
 			continue_at(cl, bch_data_insert_keys, op->wq);
+			return;
+		}
 
 		k = op->insert_keys.top;
 		bkey_init(k);
@@ -255,6 +259,7 @@ static void bch_data_insert_start(struct closure *cl)
 
 	op->insert_data_done = true;
 	continue_at(cl, bch_data_insert_keys, op->wq);
+	return;
 err:
 	/* bch_alloc_sectors() blocks if s->writeback = true */
 	BUG_ON(op->writeback);
@@ -576,8 +581,10 @@ static void cache_lookup(struct closure *cl)
 	ret = bch_btree_map_keys(&s->op, s->iop.c,
 				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
 				 cache_lookup_fn, MAP_END_KEY);
-	if (ret == -EAGAIN)
+	if (ret == -EAGAIN) {
 		continue_at(cl, cache_lookup, bcache_wq);
+		return;
+	}
 
 	closure_return(cl);
 }
@@ -1085,6 +1092,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 		continue_at_nobarrier(&s->cl,
 				      flash_dev_nodata,
 				      bcache_wq);
+		return;
 	} else if (rw) {
 		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
 					&KEY(d->id, bio->bi_iter.bi_sector, 0),

From 7bee607472aa2e5a36dfe143e5a625be06125f53 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 14 Jul 2015 11:57:48 -0600
Subject: [PATCH 09/10] NVMe: Reread partitions on metadata formats

This patch has the driver automatically reread partitions if a namespace
has a separate metadata format. Previously revalidating a disk was
sufficient to get the correct capacity set on such formatted drives,
but partitions that may exist would not have been surfaced.

Reported-by: Paul Grabinar <paul.grabinar@ranbarg.com>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Cc: Matthew Wilcox <willy@linux.intel.com>
Tested-by: Paul Grabinar <paul.grabinar@ranbarg.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/block/nvme-core.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index d1d6141920d3..7920c2741b47 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -2108,8 +2108,17 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
 		goto out_free_disk;
 
 	add_disk(ns->disk);
-	if (ns->ms)
-		revalidate_disk(ns->disk);
+	if (ns->ms) {
+		struct block_device *bd = bdget_disk(ns->disk, 0);
+		if (!bd)
+			return;
+		if (blkdev_get(bd, FMODE_READ, NULL)) {
+			bdput(bd);
+			return;
+		}
+		blkdev_reread_part(bd);
+		blkdev_put(bd, FMODE_READ);
+	}
 	return;
  out_free_disk:
 	kfree(disk);

From e56f698bd0720e17f10f39e8b0b5b446ad0ab22c Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@canonical.com>
Date: Thu, 16 Jul 2015 19:53:22 +0800
Subject: [PATCH 10/10] blk-mq: set default timeout as 30 seconds

It is reasonable to set default timeout of request as 30 seconds instead of
30000 ticks, which may be 300 seconds if HZ is 100, for example, some arm64
based systems may choose 100 HZ.

Signed-off-by: Ming Lei <ming.lei@canonical.com>
Fixes: c76cbbcf4044 ("blk-mq: put blk_queue_rq_timeout together in blk_mq_init_queue()"
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f53779692c77..7d842db59699 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1998,7 +1998,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		goto err_hctxs;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
-	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30000);
+	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
 	q->nr_queues = nr_cpu_ids;
 	q->nr_hw_queues = set->nr_hw_queues;