md: Use REQ_FAILFAST_* on metadata writes where appropriate
This can only be supported on personalities which ensure that md_error() never causes an array to enter the 'failed' state: i.e. if marking a device Faulty would make some data inaccessible, the device's status is left as non-Faulty. This is true for RAID1 and RAID10.

If we get a failure writing metadata but the device doesn't fail, it must be the last working device, so we re-write without FAILFAST to improve the chance of success. We also flag the device as LastDev so that future metadata updates don't waste time on failfast writes.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
commit 46533ff7fe
parent 688834e6ae
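In rough outline, the patch makes md issue superblock and bitmap writes with REQ_FAILFAST_* the first time and, when such a write fails on a device that cannot be marked Faulty, repeat it as an ordinary write. The standalone sketch below is illustrative only (struct meta_dev, issue_write() and meta_write() are made-up names, not kernel APIs); it mirrors the control flow the patch wires into super_written(), md_super_wait() and their callers: a failed failfast attempt on a non-Faulty device sets a LastDev-style flag and returns an -EAGAIN-style result, and the caller loops just like the new do { ... } while (md_super_wait(mddev) < 0) sites.

/* failfast_meta_sketch.c - illustrative only, not kernel code.
 *
 * Models the policy this commit adds: try a metadata write with
 * FAILFAST first; if it fails but the device cannot be marked Faulty
 * (it looks like the last working device), set a LastDev-style flag
 * and ask the caller to repeat the write without FAILFAST.
 */
#include <stdbool.h>
#include <stdio.h>

struct meta_dev {
	bool faulty;	/* stands in for the Faulty rdev flag  */
	bool last_dev;	/* stands in for the LastDev rdev flag */
};

/* Fake block I/O: the first failfast attempt fails, everything else works. */
static bool issue_write(struct meta_dev *d, bool failfast)
{
	static int failfast_attempts;

	(void)d;
	if (failfast && failfast_attempts++ == 0)
		return false;
	return true;
}

/* Returns 0 on success, -1 (think -EAGAIN) when the caller must rewrite. */
static int meta_write(struct meta_dev *d)
{
	bool failfast = !d->last_dev;	/* never failfast again on LastDev */

	if (issue_write(d, failfast))
		return 0;
	if (d->faulty)
		return 0;	/* device was failed; another copy holds the data */
	d->last_dev = true;	/* last working device: stop using failfast */
	return -1;		/* ask the caller to repeat without failfast */
}

int main(void)
{
	struct meta_dev dev = { .faulty = false, .last_dev = false };
	int attempts = 0;

	do {		/* same shape as do { ... } while (md_super_wait(mddev) < 0) */
		attempts++;
	} while (meta_write(&dev) < 0);

	printf("metadata written after %d attempt(s), LastDev=%d\n",
	       attempts, dev.last_dev);
	return 0;
}

Keeping the retry in the callers rather than inside the wait matches the patch: md_super_wait() only reports, via -EAGAIN, that a metadata write has to be queued again.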
drivers/md/bitmap.c

@@ -209,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	struct md_rdev *rdev = NULL;
+	struct md_rdev *rdev;
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
 
+restart:
+	rdev = NULL;
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
 		loff_t offset = mddev->bitmap_info.offset;
@@ -269,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 				       page);
 	}
 
-	if (wait)
-		md_super_wait(mddev);
+	if (wait && md_super_wait(mddev) < 0)
+		goto restart;
 	return 0;
 
  bad_alignment:
@@ -428,6 +430,13 @@ static void bitmap_wait_writes(struct bitmap *bitmap)
 		wait_event(bitmap->write_wait,
 			   atomic_read(&bitmap->pending_writes)==0);
 	else
+		/* Note that we ignore the return value.  The writes
+		 * might have failed, but that would just mean that
+		 * some bits which should be cleared haven't been,
+		 * which is safe.  The relevant bitmap blocks will
+		 * probably get written again, but there is no great
+		 * loss if they aren't.
+		 */
 		md_super_wait(bitmap->mddev);
 }
 
drivers/md/md.c

@@ -727,7 +727,13 @@ static void super_written(struct bio *bio)
 	if (bio->bi_error) {
 		pr_err("md: super_written gets error=%d\n", bio->bi_error);
 		md_error(mddev, rdev);
-	}
+		if (!test_bit(Faulty, &rdev->flags)
+		    && (bio->bi_opf & MD_FAILFAST)) {
+			set_bit(MD_NEED_REWRITE, &mddev->flags);
+			set_bit(LastDev, &rdev->flags);
+		}
+	} else
+		clear_bit(LastDev, &rdev->flags);
 
 	if (atomic_dec_and_test(&mddev->pending_writes))
 		wake_up(&mddev->sb_wait);
@@ -744,7 +750,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	 * if zero is reached.
 	 * If an error occurred, call md_error
 	 */
-	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+	struct bio *bio;
+	int ff = 0;
+
+	if (test_bit(Faulty, &rdev->flags))
+		return;
+
+	bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
 
 	atomic_inc(&rdev->nr_pending);
 
@@ -753,16 +765,24 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA);
+
+	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
+	    test_bit(FailFast, &rdev->flags) &&
+	    !test_bit(LastDev, &rdev->flags))
+		ff = MD_FAILFAST;
+	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA | ff);
 
 	atomic_inc(&mddev->pending_writes);
 	submit_bio(bio);
 }
 
-void md_super_wait(struct mddev *mddev)
+int md_super_wait(struct mddev *mddev)
 {
 	/* wait for all superblock writes that were scheduled to complete */
 	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	if (test_and_clear_bit(MD_NEED_REWRITE, &mddev->flags))
+		return -EAGAIN;
+	return 0;
 }
 
 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
@@ -1334,9 +1354,10 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
 	    rdev->mddev->level >= 1)
 		num_sectors = (sector_t)(2ULL << 32) - 2;
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+	do {
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	} while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 }
 
@@ -1877,9 +1898,10 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	sb->data_size = cpu_to_le64(num_sectors);
 	sb->super_offset = rdev->sb_start;
 	sb->sb_csum = calc_sb_1_csum(sb);
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+	do {
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	} while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 
 }
@@ -2416,6 +2438,7 @@ repeat:
 
 	if (mddev->queue)
 		blk_add_trace_msg(mddev->queue, "md md_update_sb");
+rewrite:
 	bitmap_update_sb(mddev->bitmap);
 	rdev_for_each(rdev, mddev) {
 		char b[BDEVNAME_SIZE];
@@ -2447,7 +2470,8 @@ repeat:
 			/* only need to write one superblock... */
 			break;
 	}
-	md_super_wait(mddev);
+	if (md_super_wait(mddev) < 0)
+		goto rewrite;
 	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
 
 	if (mddev_is_clustered(mddev) && ret == 0)
drivers/md/md.h

@@ -29,6 +29,16 @@
 
 #define MaxSector (~(sector_t)0)
 
+/*
+ * These flags should really be called "NO_RETRY" rather than
+ * "FAILFAST" because they don't make any promise about time lapse,
+ * only about the number of retries, which will be zero.
+ * REQ_FAILFAST_DRIVER is not included because
+ * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
+ * seems to suggest that the errors it avoids retrying should usually
+ * be retried.
+ */
+#define	MD_FAILFAST	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
 /*
  * MD's 'extended' device
  */
@@ -177,6 +187,10 @@ enum flag_bits {
 				 * It is expects that no bad block log
 				 * is present.
 				 */
+	LastDev,		/* Seems to be the last working dev as
+				 * it didn't fail, so don't use FailFast
+				 * any more for metadata
+				 */
 };
 
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -213,6 +227,11 @@ enum mddev_flags {
 	MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
 				   * already took resync lock, need to
 				   * release the lock */
+	MD_FAILFAST_SUPPORTED,	/* Using MD_FAILFAST on metadata writes is
+				 * supported as calls to md_error() will
+				 * never cause the array to become failed.
+				 */
+	MD_NEED_REWRITE,	/* metadata write needs to be repeated */
 };
 #define MD_UPDATE_SB_FLAGS (BIT(MD_CHANGE_DEVS) | \
 			    BIT(MD_CHANGE_CLEAN) | \
@@ -628,7 +647,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 			   sector_t sector, int size, struct page *page);
-extern void md_super_wait(struct mddev *mddev);
+extern int md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			struct page *page, int op, int op_flags,
 			bool metadata_op);
drivers/md/raid1.c

@@ -2988,6 +2988,7 @@ static int raid1_run(struct mddev *mddev)
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 	mddev->private = conf;
+	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
 
 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
 
drivers/md/raid10.c

@@ -3729,6 +3729,7 @@ static int raid10_run(struct mddev *mddev)
 	size = raid10_size(mddev, 0, 0);
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
+	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
 
 	if (mddev->queue) {
 		int stripe = conf->geo.raid_disks *
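For completeness, the single set_bit(MD_FAILFAST_SUPPORTED, ...) opt-in above is only safe because raid1/raid10 never let md_error() fail the array: an error on the last working device leaves it non-Faulty, which is exactly the case the LastDev/rewrite path handles. Below is a minimal sketch of that invariant, using made-up sketch_* types rather than the real raid1_error():

/* raid1_error_sketch.c - illustrative only; not the real raid1_error().
 * Shows the invariant that makes MD_FAILFAST_SUPPORTED safe to set:
 * md_error() must never push the array into the 'failed' state, so the
 * last working device is left non-Faulty even when an I/O error is seen.
 */
#include <stdbool.h>
#include <stdio.h>

struct sketch_rdev  { bool faulty; };
struct sketch_mddev { int raid_disks; int degraded; };

static void sketch_error(struct sketch_mddev *mddev, struct sketch_rdev *rdev)
{
	/* Failing this device would leave no copy of the data: refuse. */
	if (!rdev->faulty && mddev->degraded == mddev->raid_disks - 1)
		return;
	if (!rdev->faulty) {
		rdev->faulty = true;
		mddev->degraded++;
	}
}

int main(void)
{
	struct sketch_mddev mddev = { .raid_disks = 2, .degraded = 1 };
	struct sketch_rdev last = { .faulty = false };

	sketch_error(&mddev, &last);	/* error on the only working mirror */
	printf("last device faulty=%d, degraded=%d of %d\n",
	       last.faulty, mddev.degraded, mddev.raid_disks);
	return 0;
}

With the device left non-Faulty, the failed failfast metadata write is simply repeated without FAILFAST, as in the md.c hunks above.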