From 98d5561bfbc3c7a53d6abc1812a2bd5344d36fa3 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Mon, 2 Apr 2012 09:48:37 +1000 Subject: [PATCH 1/8] md/linear: If md_integrity_register() fails, linear_run() must free the mem. Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/linear.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b0fcc7d02adb..fa211d80fc0a 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -198,6 +198,7 @@ out: static int linear_run (struct mddev *mddev) { struct linear_conf *conf; + int ret; if (md_check_no_bitmap(mddev)) return -EINVAL; @@ -211,7 +212,13 @@ static int linear_run (struct mddev *mddev) blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); mddev->queue->backing_dev_info.congested_fn = linear_congested; mddev->queue->backing_dev_info.congested_data = mddev; - return md_integrity_register(mddev); + + ret = md_integrity_register(mddev); + if (ret) { + kfree(conf); + mddev->private = NULL; + } + return ret; } static int linear_add(struct mddev *mddev, struct md_rdev *rdev) From 0366ef847581d692e197b88825867ca9ee00e358 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Mon, 2 Apr 2012 09:48:37 +1000 Subject: [PATCH 2/8] md/raid0: If md_integrity_register() fails, raid0_run() must free the mem. Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid0.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f31f5596e01..c9809453a346 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -407,6 +407,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } +static int raid0_stop(struct mddev *mddev); + static int raid0_run(struct mddev *mddev) { struct r0conf *conf; @@ -454,7 +456,12 @@ static int raid0_run(struct mddev *mddev) blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); dump_zones(mddev); - return md_integrity_register(mddev); + + ret = md_integrity_register(mddev); + if (ret) + raid0_stop(mddev); + + return ret; } static int raid0_stop(struct mddev *mddev) From 5220ea1e640869e70f894837678315c878c651fd Mon Sep 17 00:00:00 2001 From: majianpeng Date: Mon, 2 Apr 2012 09:48:38 +1000 Subject: [PATCH 3/8] md/raid1: If md_integrity_register() failed,run() must free the mem Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid1.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4a40a200d769..242440831b23 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2636,11 +2636,13 @@ static struct r1conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } +static int stop(struct mddev *mddev); static int run(struct mddev *mddev) { struct r1conf *conf; int i; struct md_rdev *rdev; + int ret; if (mddev->level != 1) { printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n", @@ -2705,7 +2707,11 @@ static int run(struct mddev *mddev) mddev->queue->backing_dev_info.congested_data = mddev; blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); } - return md_integrity_register(mddev); + + ret = md_integrity_register(mddev); + if (ret) + stop(mddev); + return ret; } static int stop(struct mddev *mddev) From 18b9837ea0dc3cf844c6c4196871ce91d047bddb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sun, 1 Apr 2012 23:48:38 +1000 Subject: [PATCH 4/8] md/raid5: fix handling of bad blocks during recovery. 1/ We can only treat a known-bad-block like a read-error if we have the data that belongs in that block. So fix that test. 2/ If we cannot recovery a stripe due to insufficient data, don't tell "md_done_sync" that the sync failed unless we really did fail something. If we successfully record bad blocks, that is success. Reported-by: "majianpeng" Signed-off-by: NeilBrown --- drivers/md/raid5.c | 55 ++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 23ac880bba9a..9799be80bf31 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2471,39 +2471,41 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, int abort = 0; int i; - md_done_sync(conf->mddev, STRIPE_SECTORS, 0); clear_bit(STRIPE_SYNCING, &sh->state); s->syncing = 0; s->replacing = 0; /* There is nothing more to do for sync/check/repair. + * Don't even need to abort as that is handled elsewhere + * if needed, and not always wanted e.g. if there is a known + * bad block here. * For recover/replace we need to record a bad block on all * non-sync devices, or abort the recovery */ - if (!test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) - return; - /* During recovery devices cannot be removed, so locking and - * refcounting of rdevs is not needed - */ - for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = conf->disks[i].rdev; - if (rdev - && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags) - && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) - abort = 1; - rdev = conf->disks[i].replacement; - if (rdev - && !test_bit(Faulty, &rdev->flags) - && !test_bit(In_sync, &rdev->flags) - && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) - abort = 1; - } - if (abort) { - conf->recovery_disabled = conf->mddev->recovery_disabled; - set_bit(MD_RECOVERY_INTR, &conf->mddev->recovery); + if (test_bit(MD_RECOVERY_RECOVER, &conf->mddev->recovery)) { + /* During recovery devices cannot be removed, so + * locking and refcounting of rdevs is not needed + */ + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = conf->disks[i].rdev; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + rdev = conf->disks[i].replacement; + if (rdev + && !test_bit(Faulty, &rdev->flags) + && !test_bit(In_sync, &rdev->flags) + && !rdev_set_badblocks(rdev, sh->sector, + STRIPE_SECTORS, 0)) + abort = 1; + } + if (abort) + conf->recovery_disabled = + conf->mddev->recovery_disabled; } + md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); } static int want_replace(struct stripe_head *sh, int disk_idx) @@ -3203,7 +3205,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) /* Not in-sync */; else if (is_bad) { /* also not in-sync */ - if (!test_bit(WriteErrorSeen, &rdev->flags)) { + if (!test_bit(WriteErrorSeen, &rdev->flags) && + test_bit(R5_UPTODATE, &dev->flags)) { /* treat as in-sync, but with a read error * which we can now try to correct */ From 24b961f811a3e790a9b93604d2594bfb6cce4fa4 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Sun, 1 Apr 2012 23:48:38 +1000 Subject: [PATCH 5/8] md: Avoid OOPS when reshaping raid1 to raid0 raid1 arrays do not have the notion of chunk size. Calculate the largest chunk sector size we can use to avoid a divide by zero OOPS when aligning the size of the new array to the chunk size. Signed-off-by: Jes Sorensen Signed-off-by: NeilBrown --- drivers/md/raid0.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c9809453a346..de63a1fc3737 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -632,6 +632,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev) static void *raid0_takeover_raid1(struct mddev *mddev) { struct r0conf *priv_conf; + int chunksect; /* Check layout: * - (N - 1) mirror drives must be already faulty @@ -642,10 +643,25 @@ static void *raid0_takeover_raid1(struct mddev *mddev) return ERR_PTR(-EINVAL); } + /* + * a raid1 doesn't have the notion of chunk size, so + * figure out the largest suitable size we can use. + */ + chunksect = 64 * 2; /* 64K by default */ + + /* The array must be an exact multiple of chunksize */ + while (chunksect && (mddev->array_sectors & (chunksect - 1))) + chunksect >>= 1; + + if ((chunksect << 9) < PAGE_SIZE) + /* array size does not allow a suitable chunk size */ + return ERR_PTR(-EINVAL); + /* Set new parameters */ mddev->new_level = 0; mddev->new_layout = 0; - mddev->new_chunk_sectors = 128; /* by default set chunk size to 64k */ + mddev->new_chunk_sectors = chunksect; + mddev->chunk_sectors = chunksect; mddev->delta_disks = 1 - mddev->raid_disks; mddev->raid_disks = 1; /* make sure it will be not marked as dirty */ From a42f9d83b5c05dc6e678a1f0cd9767502c2c58de Mon Sep 17 00:00:00 2001 From: majianpeng Date: Mon, 2 Apr 2012 01:04:19 +1000 Subject: [PATCH 6/8] md/raid1:Remove unnecessary rcu_dereference(conf->mirrors[i].rdev). Because rde->nr_pending > 0,so can not remove this disk. And in any case, we aren't holding rcu_read_lock() Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 242440831b23..8c420f178603 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2386,8 +2386,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp int ok = 1; for (i = 0 ; i < conf->raid_disks * 2 ; i++) if (r1_bio->bios[i]->bi_end_io == end_sync_write) { - struct md_rdev *rdev = - rcu_dereference(conf->mirrors[i].rdev); + struct md_rdev *rdev = conf->mirrors[i].rdev; ok = rdev_set_badblocks(rdev, sector_nr, min_bad, 0 ) && ok; From c6d2e084c7411f61f2b446d94989e5aaf9879b0f Mon Sep 17 00:00:00 2001 From: majianpeng Date: Mon, 2 Apr 2012 01:16:59 +1000 Subject: [PATCH 7/8] md/raid5: Fix a bug about judging if the operation is syncing or replacing When create a raid5 using assume-clean and echo check or repair to sync_action.Then component disks did not operated IO but the raid check/resync faster than normal. Because the judgement in function analyse_stripe(): if (do_recovery || sh->sector >= conf->mddev->recovery_cp) s->syncing = 1; else s->replacing = 1; When check or repair,the recovery_cp == MaxSectore,so syncing equal zero not one. This bug was introduced by commit 9a3e1101b827 md/raid5: detect and handle replacements during recovery. so this patch is suitable for 3.3-stable. Cc: stable@vger.kernel.org Signed-off-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9799be80bf31..f351422938e0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3279,12 +3279,14 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) /* If there is a failed device being replaced, * we must be recovering. * else if we are after recovery_cp, we must be syncing + * else if MD_RECOVERY_REQUESTED is set, we also are syncing. * else we can only be replacing * sync and recovery both need to read all devices, and so * use the same flag. */ if (do_recovery || - sh->sector >= conf->mddev->recovery_cp) + sh->sector >= conf->mddev->recovery_cp || + test_bit(MD_RECOVERY_REQUESTED, &(conf->mddev->recovery))) s->syncing = 1; else s->replacing = 1; From 5020ad7d143ccfcf8149974096220d59e5572120 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 2 Apr 2012 01:39:05 +1000 Subject: [PATCH 8/8] md/raid1,raid10: don't compare excess byte during consistency check. When comparing two pages read from different legs of a mirror, only compare the bytes that were read, not the whole page. In most cases we read a whole page, but in some cases with bad blocks or odd sizes devices we might read fewer than that. This bug has been present "forever" but at worst it might cause a report of two many mismatches and generate a little bit extra resync IO, so there is no need to back-port to -stable kernels. Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 8c420f178603..d35e4c991e38 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1738,7 +1738,7 @@ static int process_checks(struct r1bio *r1_bio) s = sbio->bi_io_vec[j].bv_page; if (memcmp(page_address(p), page_address(s), - PAGE_SIZE)) + sbio->bi_io_vec[j].bv_len)) break; } } else diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3540316886f2..fff782189e48 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1821,7 +1821,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) for (j = 0; j < vcnt; j++) if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), page_address(tbio->bi_io_vec[j].bv_page), - PAGE_SIZE)) + fbio->bi_io_vec[j].bv_len)) break; if (j == vcnt) continue;