btrfs-progs: allow read_data_from_disk() to rebuild RAID56 using P/Q

This new ability is added by:

- Allow btrfs_map_block() to return the chunk type
  This makes later work much easier

- Only reset stripe offset inside btrfs_map_block() when needed
  Currently, if @raid_map is not NULL, btrfs_map_block() assumes the
  call is for WRITE and resets the stripe offset.

  This is no longer the case, as for RAID56 read with mirror_num 1/0,
  we will still call btrfs_map_block() with non-NULL raid_map.

  Add a small check to make sure we won't reset stripe offset for
  mirror 1/0 read.

- Add new helper read_raid56() to handle rebuild
  We will read the full stripe (including all data and P/Q stripes),
  do the rebuild, then only copy the referred part to the caller.

  There is a catch for RAID6: we have no way to exhaust all combinations,
  so the current repair will assume the mirror = 0 data is corrupted,
  then try to find a missing device.

  But if no missing device can be found, it will assume P is corrupted.
  This is just a guess, and can be totally wrong, but we have no better
  idea.

Now btrfs-progs has full read ability for RAID56.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Qu Wenruo 2022-04-05 20:48:29 +08:00 committed by David Sterba
parent a99bece1cd
commit 4e9e978783
3 changed files with 127 additions and 13 deletions

View File

@ -26,6 +26,7 @@
#include "kerncompat.h"
#include "kernel-shared/extent_io.h"
#include "kernel-lib/list.h"
#include "kernel-lib/raid56.h"
#include "kernel-shared/ctree.h"
#include "kernel-shared/volumes.h"
#include "kernel-shared/disk-io.h"
@ -788,23 +789,131 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return ret;
}
/*
 * Rebuild the data at @logical from a RAID5/6 full stripe using P/Q.
 *
 * Reads BTRFS_STRIPE_LEN bytes from every stripe listed in @multi, treats the
 * data stripe that contains @logical as corrupted, calls raid56_recov() to
 * rebuild it, and copies @len bytes of the rebuilt stripe into @buf.
 *
 * @fs_info:  filesystem info; only ->zoned is read here (for btrfs_pread())
 * @buf:      destination buffer, must hold at least @len bytes
 * @logical:  logical address of the data to read
 * @len:      number of bytes to copy, at most BTRFS_STRIPE_LEN
 * @mirror:   mirror number, must be > 1 (read-repair path only)
 * @multi:    stripe mapping describing the full stripe
 * @raid_map: raid map; raid_map[0] is taken as the full stripe start
 *
 * Return 0 on success, -ENOMEM if a buffer cannot be allocated, -EIO if any
 * stripe cannot be read in full.
 */
static int read_raid56(struct btrfs_fs_info *fs_info, void *buf, u64 logical,
		       u64 len, int mirror, struct btrfs_multi_bio *multi,
		       u64 *raid_map)
{
	const int num_stripes = multi->num_stripes;
	u64 full_stripe_start;
	u64 stripe_offset;
	void **pointers;
	int failed_a;
	int failed_b = -1;
	int i;
	int ret;

	/* Only read repair should go this path */
	ASSERT(mirror > 1);
	ASSERT(raid_map);

	/* The read length should be inside one stripe */
	ASSERT(len <= BTRFS_STRIPE_LEN);

	/* Only touch @raid_map after it has been asserted to be non-NULL */
	full_stripe_start = raid_map[0];
	stripe_offset = (logical - full_stripe_start) % BTRFS_STRIPE_LEN;

	/*
	 * Return directly on failure: the cleanup at "out" walks the pointer
	 * array and must never run while the array itself is NULL.
	 */
	pointers = calloc(num_stripes, sizeof(*pointers));
	if (!pointers)
		return -ENOMEM;

	/*
	 * Allocate memory for the full stripe.  calloc() zeroed the array, so
	 * on a partial failure the not-yet-allocated slots are NULL and safe
	 * to pass to free().
	 */
	for (i = 0; i < num_stripes; i++) {
		pointers[i] = malloc(BTRFS_STRIPE_LEN);
		if (!pointers[i]) {
			ret = -ENOMEM;
			goto out;
		}
	}

	/*
	 * Read the full stripe.
	 *
	 * The stripes in @multi are not rotated, thus can be used to read from
	 * disk directly.
	 *
	 * NOTE(review): a failed or short read of any stripe aborts the
	 * rebuild here, so a stripe whose device fd is -1 would presumably
	 * fail before the missing-device scan below ever sees it -- confirm
	 * against btrfs_pread().
	 */
	for (i = 0; i < num_stripes; i++) {
		ret = btrfs_pread(multi->stripes[i].dev->fd, pointers[i],
				  BTRFS_STRIPE_LEN, multi->stripes[i].physical,
				  fs_info->zoned);
		if (ret < BTRFS_STRIPE_LEN) {
			ret = -EIO;
			goto out;
		}
	}

	/*
	 * Get the failed index.
	 *
	 * Since we're reading using mirror_num > 1 already, it means the data
	 * stripe where @logical lies in is definitely corrupted.
	 */
	failed_a = (logical - full_stripe_start) / BTRFS_STRIPE_LEN;

	/*
	 * For RAID6, we don't have a good way to exhaust all the combinations,
	 * so here we can only go through the map to see if we have missing
	 * devices.
	 */
	if (multi->type & BTRFS_BLOCK_GROUP_RAID6) {
		for (i = 0; i < num_stripes; i++) {
			/* Skip failed_a, as it's already marked failed */
			if (i == failed_a)
				continue;
			/* Missing dev */
			if (multi->stripes[i].dev->fd == -1) {
				failed_b = i;
				break;
			}
		}
		/*
		 * No missing device, we have no better idea, default to P
		 * corruption
		 */
		if (failed_b < 0)
			failed_b = num_stripes - 2;
	}

	/* Rebuild the full stripe */
	ret = raid56_recov(num_stripes, BTRFS_STRIPE_LEN, multi->type,
			   failed_a, failed_b, pointers);
	ASSERT(ret == 0);

	/*
	 * Now copy the data back to original buf.  Cast to char * so the
	 * offset arithmetic does not rely on the GNU void-pointer extension.
	 */
	memcpy(buf, (char *)pointers[failed_a] + stripe_offset, len);
	ret = 0;
out:
	for (i = 0; i < num_stripes; i++)
		free(pointers[i]);
	free(pointers);
	return ret;
}
int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 logical,
u64 *len, int mirror)
{
struct btrfs_multi_bio *multi = NULL;
struct btrfs_device *device;
u64 read_len = *len;
u64 *raid_map = NULL;
int ret;
ret = btrfs_map_block(info, READ, logical, &read_len, &multi, mirror,
NULL);
&raid_map);
if (ret) {
fprintf(stderr, "Couldn't map the block %llu\n", logical);
return -EIO;
}
read_len = min(*len, read_len);
/* We need to rebuild from P/Q */
if (mirror > 1 && multi->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = read_raid56(info, buf, logical, read_len, mirror, multi,
raid_map);
free(multi);
free(raid_map);
*len = read_len;
return ret;
}
free(raid_map);
device = multi->stripes[0].dev;
read_len = min(*len, read_len);
if (device->fd <= 0) {
kfree(multi);
return -EIO;
@ -824,6 +933,7 @@ int read_data_from_disk(struct btrfs_fs_info *info, void *buf, u64 logical,
logical, ret, read_len);
return -EIO;
}
*len = read_len;
return 0;
}

View File

@ -1811,6 +1811,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
int stripes_required = 1;
int stripe_index;
int i;
bool need_raid_map = false;
struct btrfs_multi_bio *multi = NULL;
if (multi_ret && rw == READ) {
@ -1848,17 +1849,18 @@ again:
}
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK
&& multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) {
/* RAID[56] write or recovery. Return all stripes */
stripes_required = map->num_stripes;
need_raid_map = true;
/* RAID[56] write or recovery. Return all stripes */
stripes_required = map->num_stripes;
/* Only allocate the map if we've already got a large enough multi_ret */
if (stripes_allocated >= stripes_required) {
raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
if (!raid_map) {
kfree(multi);
return -ENOMEM;
}
}
/* Only allocate the map if we've already got a large enough multi_ret */
if (stripes_allocated >= stripes_required) {
raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
if (!raid_map) {
kfree(multi);
return -ENOMEM;
}
}
}
/* if our multi bio struct is too small, back off and try again */
@ -1896,6 +1898,7 @@ again:
goto out;
multi->num_stripes = 1;
multi->type = map->type;
stripe_index = 0;
if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
if (rw == WRITE)
@ -1922,7 +1925,7 @@ again:
else if (mirror_num)
stripe_index = mirror_num - 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
if (raid_map) {
if (need_raid_map && raid_map) {
int rot;
u64 tmp;
u64 raid56_full_stripe_start;

View File

@ -106,6 +106,7 @@ struct btrfs_bio_stripe {
};
struct btrfs_multi_bio {
u64 type;
int error;
int num_stripes;
struct btrfs_bio_stripe stripes[];