2019-05-21 01:08:12 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
raid0.c : Multiple Devices driver for Linux
|
2014-09-30 12:23:59 +08:00
|
|
|
Copyright (C) 1994-96 Marc ZYNGIER
|
2005-04-17 06:20:36 +08:00
|
|
|
<zyngier@ufr-info-p7.ibp.fr> or
|
|
|
|
<maz@gloups.fdn.fr>
|
2014-09-30 12:23:59 +08:00
|
|
|
Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
RAID-0 management functions.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
2009-03-31 11:33:13 +08:00
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/seq_file.h>
|
2011-07-04 01:58:33 +08:00
|
|
|
#include <linux/module.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the followings.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
|
|
|
#include <linux/slab.h>
|
2016-11-18 10:22:04 +08:00
|
|
|
#include <trace/events/block.h>
|
2009-03-31 11:33:13 +08:00
|
|
|
#include "md.h"
|
2009-03-31 11:27:03 +08:00
|
|
|
#include "raid0.h"
|
2010-03-08 13:02:44 +08:00
|
|
|
#include "raid5.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2019-09-09 14:30:02 +08:00
|
|
|
static int default_layout = 0;
|
|
|
|
module_param(default_layout, int, 0644);
|
|
|
|
|
2017-01-05 08:10:19 +08:00
|
|
|
/*
 * Bitmask built from the MD_* flag bit numbers listed below.
 * NOTE(review): the user of this mask is outside this view; presumably these
 * are mddev flags that raid0 does not support - confirm against the caller.
 */
#define UNSUPPORTED_MDDEV_FLAGS			\
	((1L << MD_HAS_JOURNAL) |		\
	 (1L << MD_JOURNAL_CLEAN) |		\
	 (1L << MD_FAILFAST_SUPPORTED) |	\
	 (1L << MD_HAS_PPL) |			\
	 (1L << MD_HAS_MULTIPLE_PPLS))
|
2017-01-05 08:10:19 +08:00
|
|
|
|
2009-06-16 15:00:54 +08:00
|
|
|
/*
|
|
|
|
* inform the user of the raid configuration
|
|
|
|
*/
|
2011-10-11 13:47:53 +08:00
|
|
|
static void dump_zones(struct mddev *mddev)
|
2009-06-16 15:00:54 +08:00
|
|
|
{
|
2011-10-07 11:23:22 +08:00
|
|
|
int j, k;
|
2009-06-16 15:00:54 +08:00
|
|
|
sector_t zone_size = 0;
|
|
|
|
sector_t zone_start = 0;
|
2011-10-11 13:48:59 +08:00
|
|
|
struct r0conf *conf = mddev->private;
|
2010-03-16 14:23:35 +08:00
|
|
|
int raid_disks = conf->strip_zone[0].nb_dev;
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_debug("md: RAID0 configuration for %s - %d zone%s\n",
|
|
|
|
mdname(mddev),
|
|
|
|
conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
|
2009-06-16 15:00:54 +08:00
|
|
|
for (j = 0; j < conf->nr_strip_zones; j++) {
|
2016-11-02 11:16:50 +08:00
|
|
|
char line[200];
|
|
|
|
int len = 0;
|
|
|
|
|
2009-06-16 15:00:54 +08:00
|
|
|
for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
|
2022-08-24 02:51:04 +08:00
|
|
|
len += scnprintf(line+len, 200-len, "%s%pg", k?"/":"",
|
2022-05-12 14:19:13 +08:00
|
|
|
conf->devlist[j * raid_disks + k]->bdev);
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_debug("md: zone%d=[%s]\n", j, line);
|
2009-06-16 15:00:54 +08:00
|
|
|
|
|
|
|
zone_size = conf->strip_zone[j].zone_end - zone_start;
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_debug(" zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n",
|
2009-06-16 15:00:54 +08:00
|
|
|
(unsigned long long)zone_start>>1,
|
|
|
|
(unsigned long long)conf->strip_zone[j].dev_start>>1,
|
|
|
|
(unsigned long long)zone_size>>1);
|
|
|
|
zone_start = conf->strip_zone[j].zone_end;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-11 13:48:59 +08:00
|
|
|
static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2009-09-23 16:06:41 +08:00
|
|
|
int i, c, err;
|
2009-06-16 14:50:35 +08:00
|
|
|
sector_t curr_zone_end, sectors;
|
2011-10-11 13:45:26 +08:00
|
|
|
struct md_rdev *smallest, *rdev1, *rdev2, *rdev, **dev;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct strip_zone *zone;
|
|
|
|
int cnt;
|
2011-10-11 13:48:59 +08:00
|
|
|
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
|
2020-01-15 21:35:25 +08:00
|
|
|
unsigned blksize = 512;
|
2009-06-16 14:47:36 +08:00
|
|
|
|
2016-04-14 17:31:49 +08:00
|
|
|
*private_conf = ERR_PTR(-ENOMEM);
|
2009-06-16 14:47:36 +08:00
|
|
|
if (!conf)
|
|
|
|
return -ENOMEM;
|
2012-03-19 09:46:39 +08:00
|
|
|
rdev_for_each(rdev1, mddev) {
|
2022-05-12 14:19:13 +08:00
|
|
|
pr_debug("md/raid0:%s: looking at %pg\n",
|
2011-10-07 11:23:22 +08:00
|
|
|
mdname(mddev),
|
2022-05-12 14:19:13 +08:00
|
|
|
rdev1->bdev);
|
2005-04-17 06:20:36 +08:00
|
|
|
c = 0;
|
2009-06-18 06:48:55 +08:00
|
|
|
|
|
|
|
/* round size to chunk_size */
|
|
|
|
sectors = rdev1->sectors;
|
|
|
|
sector_div(sectors, mddev->chunk_sectors);
|
|
|
|
rdev1->sectors = sectors * mddev->chunk_sectors;
|
|
|
|
|
2015-08-03 11:11:47 +08:00
|
|
|
blksize = max(blksize, queue_logical_block_size(
|
|
|
|
rdev1->bdev->bd_disk->queue));
|
|
|
|
|
2012-03-19 09:46:39 +08:00
|
|
|
rdev_for_each(rdev2, mddev) {
|
2022-05-12 14:19:13 +08:00
|
|
|
pr_debug("md/raid0:%s: comparing %pg(%llu)"
|
|
|
|
" with %pg(%llu)\n",
|
2011-10-07 11:23:22 +08:00
|
|
|
mdname(mddev),
|
2022-05-12 14:19:13 +08:00
|
|
|
rdev1->bdev,
|
2011-10-07 11:23:22 +08:00
|
|
|
(unsigned long long)rdev1->sectors,
|
2022-05-12 14:19:13 +08:00
|
|
|
rdev2->bdev,
|
2011-10-07 11:23:22 +08:00
|
|
|
(unsigned long long)rdev2->sectors);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (rdev2 == rdev1) {
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: END\n",
|
|
|
|
mdname(mddev));
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
}
|
2009-03-31 11:33:13 +08:00
|
|
|
if (rdev2->sectors == rdev1->sectors) {
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Not unique, don't count it as a new
|
|
|
|
* group
|
|
|
|
*/
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: EQUAL\n",
|
|
|
|
mdname(mddev));
|
2005-04-17 06:20:36 +08:00
|
|
|
c = 1;
|
|
|
|
break;
|
|
|
|
}
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: NOT EQUAL\n",
|
|
|
|
mdname(mddev));
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
if (!c) {
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: ==> UNIQUE\n",
|
|
|
|
mdname(mddev));
|
2005-04-17 06:20:36 +08:00
|
|
|
conf->nr_strip_zones++;
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: %d zones\n",
|
|
|
|
mdname(mddev), conf->nr_strip_zones);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: FINAL %d zones\n",
|
|
|
|
mdname(mddev), conf->nr_strip_zones);
|
2019-09-09 14:30:02 +08:00
|
|
|
|
2015-08-03 11:11:47 +08:00
|
|
|
/*
|
|
|
|
* now since we have the hard sector sizes, we can make sure
|
|
|
|
* chunk size is a multiple of that sector size
|
|
|
|
*/
|
|
|
|
if ((mddev->chunk_sectors << 9) % blksize) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
|
|
|
|
mdname(mddev),
|
|
|
|
mddev->chunk_sectors << 9, blksize);
|
2015-08-03 11:11:47 +08:00
|
|
|
err = -EINVAL;
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
2009-06-16 14:47:36 +08:00
|
|
|
err = -ENOMEM;
|
treewide: kzalloc() -> kcalloc()
The kzalloc() function has a 2-factor argument form, kcalloc(). This
patch replaces cases of:
kzalloc(a * b, gfp)
with:
kcalloc(a * b, gfp)
as well as handling cases of:
kzalloc(a * b * c, gfp)
with:
kzalloc(array3_size(a, b, c), gfp)
as it's slightly less ugly than:
kzalloc_array(array_size(a, b), c, gfp)
This does, however, attempt to ignore constant size factors like:
kzalloc(4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kzalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kzalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kzalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kzalloc
+ kcalloc
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kzalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kzalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kzalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kzalloc(C1 * C2 * C3, ...)
|
kzalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kzalloc(sizeof(THING) * C2, ...)
|
kzalloc(sizeof(TYPE) * C2, ...)
|
kzalloc(C1 * C2 * C3, ...)
|
kzalloc(C1 * C2, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- (E1) * E2
+ E1, E2
, ...)
|
- kzalloc
+ kcalloc
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kzalloc
+ kcalloc
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-13 05:03:40 +08:00
|
|
|
conf->strip_zone = kcalloc(conf->nr_strip_zones,
|
|
|
|
sizeof(struct strip_zone),
|
|
|
|
GFP_KERNEL);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!conf->strip_zone)
|
2009-06-16 14:47:36 +08:00
|
|
|
goto abort;
|
treewide: kzalloc() -> kcalloc()
The kzalloc() function has a 2-factor argument form, kcalloc(). This
patch replaces cases of:
kzalloc(a * b, gfp)
with:
kcalloc(a * b, gfp)
as well as handling cases of:
kzalloc(a * b * c, gfp)
with:
kzalloc(array3_size(a, b, c), gfp)
as it's slightly less ugly than:
kzalloc_array(array_size(a, b), c, gfp)
This does, however, attempt to ignore constant size factors like:
kzalloc(4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
type TYPE;
expression THING, E;
@@
(
kzalloc(
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
kzalloc(
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression COUNT;
typedef u8;
typedef __u8;
@@
(
kzalloc(
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
kzalloc(
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(char) * COUNT
+ COUNT
, ...)
|
kzalloc(
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (COUNT_ID)
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * COUNT_ID
+ COUNT_ID, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (COUNT_CONST)
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * COUNT_CONST
+ COUNT_CONST, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (COUNT_ID)
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * COUNT_ID
+ COUNT_ID, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (COUNT_CONST)
+ COUNT_CONST, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * COUNT_CONST
+ COUNT_CONST, sizeof(THING)
, ...)
)
// 2-factor product, only identifiers.
@@
identifier SIZE, COUNT;
@@
- kzalloc
+ kcalloc
(
- SIZE * COUNT
+ COUNT, SIZE
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
kzalloc(
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
kzalloc(
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
kzalloc(
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
kzalloc(
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
kzalloc(
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
kzalloc(
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
identifier STRIDE, SIZE, COUNT;
@@
(
kzalloc(
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
kzalloc(
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products,
// when they're not all constants...
@@
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
kzalloc(C1 * C2 * C3, ...)
|
kzalloc(
- (E1) * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- (E1) * (E2) * E3
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- (E1) * (E2) * (E3)
+ array3_size(E1, E2, E3)
, ...)
|
kzalloc(
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants,
// keeping sizeof() as the second factor argument.
@@
expression THING, E1, E2;
type TYPE;
constant C1, C2, C3;
@@
(
kzalloc(sizeof(THING) * C2, ...)
|
kzalloc(sizeof(TYPE) * C2, ...)
|
kzalloc(C1 * C2 * C3, ...)
|
kzalloc(C1 * C2, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * (E2)
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(TYPE) * E2
+ E2, sizeof(TYPE)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * (E2)
+ E2, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- sizeof(THING) * E2
+ E2, sizeof(THING)
, ...)
|
- kzalloc
+ kcalloc
(
- (E1) * E2
+ E1, E2
, ...)
|
- kzalloc
+ kcalloc
(
- (E1) * (E2)
+ E1, E2
, ...)
|
- kzalloc
+ kcalloc
(
- E1 * E2
+ E1, E2
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-13 05:03:40 +08:00
|
|
|
conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *),
|
|
|
|
conf->nr_strip_zones,
|
|
|
|
mddev->raid_disks),
|
2005-04-17 06:20:36 +08:00
|
|
|
GFP_KERNEL);
|
|
|
|
if (!conf->devlist)
|
2009-06-16 14:47:36 +08:00
|
|
|
goto abort;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* The first zone must contain all devices, so here we check that
|
|
|
|
* there is a proper alignment of slots to devices and find them all
|
|
|
|
*/
|
|
|
|
zone = &conf->strip_zone[0];
|
|
|
|
cnt = 0;
|
|
|
|
smallest = NULL;
|
2009-06-16 14:50:52 +08:00
|
|
|
dev = conf->devlist;
|
2009-06-16 14:47:36 +08:00
|
|
|
err = -EINVAL;
|
2012-03-19 09:46:39 +08:00
|
|
|
rdev_for_each(rdev1, mddev) {
|
2005-04-17 06:20:36 +08:00
|
|
|
int j = rdev1->raid_disk;
|
|
|
|
|
2010-06-15 16:36:03 +08:00
|
|
|
if (mddev->level == 10) {
|
2010-03-08 13:02:44 +08:00
|
|
|
/* taking over a raid10-n2 array */
|
|
|
|
j /= 2;
|
2010-06-15 16:36:03 +08:00
|
|
|
rdev1->new_raid_disk = j;
|
|
|
|
}
|
2010-03-08 13:02:44 +08:00
|
|
|
|
2011-01-31 10:47:13 +08:00
|
|
|
if (mddev->level == 1) {
|
|
|
|
/* taiking over a raid1 array-
|
|
|
|
* we have only one active disk
|
|
|
|
*/
|
|
|
|
j = 0;
|
|
|
|
rdev1->new_raid_disk = j;
|
|
|
|
}
|
|
|
|
|
2013-02-21 12:50:07 +08:00
|
|
|
if (j < 0) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n",
|
|
|
|
mdname(mddev));
|
2013-02-21 12:50:07 +08:00
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
if (j >= mddev->raid_disks) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: bad disk number %d - aborting!\n",
|
|
|
|
mdname(mddev), j);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto abort;
|
|
|
|
}
|
2009-06-16 14:50:52 +08:00
|
|
|
if (dev[j]) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n",
|
|
|
|
mdname(mddev), j);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto abort;
|
|
|
|
}
|
2009-06-16 14:50:52 +08:00
|
|
|
dev[j] = rdev1;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-03-31 11:33:13 +08:00
|
|
|
if (!smallest || (rdev1->sectors < smallest->sectors))
|
2005-04-17 06:20:36 +08:00
|
|
|
smallest = rdev1;
|
|
|
|
cnt++;
|
|
|
|
}
|
|
|
|
if (cnt != mddev->raid_disks) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n",
|
|
|
|
mdname(mddev), cnt, mddev->raid_disks);
|
2005-04-17 06:20:36 +08:00
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
zone->nb_dev = cnt;
|
2009-06-16 14:50:35 +08:00
|
|
|
zone->zone_end = smallest->sectors * cnt;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-06-16 14:50:35 +08:00
|
|
|
curr_zone_end = zone->zone_end;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* now do the other zones */
|
|
|
|
for (i = 1; i < conf->nr_strip_zones; i++)
|
|
|
|
{
|
2009-09-23 16:06:41 +08:00
|
|
|
int j;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
zone = conf->strip_zone + i;
|
2009-06-16 14:50:52 +08:00
|
|
|
dev = conf->devlist + i * mddev->raid_disks;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: zone %d\n", mdname(mddev), i);
|
2009-06-16 14:46:46 +08:00
|
|
|
zone->dev_start = smallest->sectors;
|
2005-04-17 06:20:36 +08:00
|
|
|
smallest = NULL;
|
|
|
|
c = 0;
|
|
|
|
|
|
|
|
for (j=0; j<cnt; j++) {
|
2009-06-16 14:50:52 +08:00
|
|
|
rdev = conf->devlist[j];
|
2009-06-16 14:46:46 +08:00
|
|
|
if (rdev->sectors <= zone->dev_start) {
|
2022-05-12 14:19:13 +08:00
|
|
|
pr_debug("md/raid0:%s: checking %pg ... nope\n",
|
2011-10-07 11:23:22 +08:00
|
|
|
mdname(mddev),
|
2022-05-12 14:19:13 +08:00
|
|
|
rdev->bdev);
|
2009-03-31 11:33:13 +08:00
|
|
|
continue;
|
|
|
|
}
|
2022-05-12 14:19:13 +08:00
|
|
|
pr_debug("md/raid0:%s: checking %pg ..."
|
2011-10-07 11:23:22 +08:00
|
|
|
" contained as device %d\n",
|
|
|
|
mdname(mddev),
|
2022-05-12 14:19:13 +08:00
|
|
|
rdev->bdev, c);
|
2009-06-16 14:50:52 +08:00
|
|
|
dev[c] = rdev;
|
2009-03-31 11:33:13 +08:00
|
|
|
c++;
|
|
|
|
if (!smallest || rdev->sectors < smallest->sectors) {
|
|
|
|
smallest = rdev;
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: (%llu) is smallest!.\n",
|
|
|
|
mdname(mddev),
|
|
|
|
(unsigned long long)rdev->sectors);
|
2009-03-31 11:33:13 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
zone->nb_dev = c;
|
2009-06-16 14:50:35 +08:00
|
|
|
sectors = (smallest->sectors - zone->dev_start) * c;
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: zone->nb_dev: %d, sectors: %llu\n",
|
|
|
|
mdname(mddev),
|
|
|
|
zone->nb_dev, (unsigned long long)sectors);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-06-16 14:50:35 +08:00
|
|
|
curr_zone_end += sectors;
|
2009-06-16 14:46:46 +08:00
|
|
|
zone->zone_end = curr_zone_end;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: current zone start: %llu\n",
|
|
|
|
mdname(mddev),
|
|
|
|
(unsigned long long)smallest->sectors);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2022-04-13 14:53:56 +08:00
|
|
|
if (conf->nr_strip_zones == 1 || conf->strip_zone[1].nb_dev == 1) {
|
|
|
|
conf->layout = RAID0_ORIG_LAYOUT;
|
|
|
|
} else if (mddev->layout == RAID0_ORIG_LAYOUT ||
|
|
|
|
mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) {
|
|
|
|
conf->layout = mddev->layout;
|
|
|
|
} else if (default_layout == RAID0_ORIG_LAYOUT ||
|
|
|
|
default_layout == RAID0_ALT_MULTIZONE_LAYOUT) {
|
|
|
|
conf->layout = default_layout;
|
|
|
|
} else {
|
|
|
|
pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n",
|
|
|
|
mdname(mddev));
|
|
|
|
pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n");
|
|
|
|
err = -EOPNOTSUPP;
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they
layout in such a way the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This lead to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post 3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back to 5.4 discard support
for the 'original' layout was not added leading this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
if (conf->layout == RAID0_ORIG_LAYOUT) {
|
|
|
|
for (i = 1; i < conf->nr_strip_zones; i++) {
|
|
|
|
sector_t first_sector = conf->strip_zone[i-1].zone_end;
|
|
|
|
|
|
|
|
sector_div(first_sector, mddev->chunk_sectors);
|
|
|
|
zone = conf->strip_zone + i;
|
|
|
|
/* disk_shift is first disk index used in the zone */
|
|
|
|
zone->disk_shift = sector_div(first_sector,
|
|
|
|
zone->nb_dev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-07 11:23:22 +08:00
|
|
|
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
|
2010-03-08 13:02:44 +08:00
|
|
|
*private_conf = conf;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
2009-06-16 14:47:21 +08:00
|
|
|
abort:
|
2009-06-16 14:47:36 +08:00
|
|
|
kfree(conf->strip_zone);
|
|
|
|
kfree(conf->devlist);
|
|
|
|
kfree(conf);
|
2013-02-21 12:36:38 +08:00
|
|
|
*private_conf = ERR_PTR(err);
|
2009-06-16 14:47:36 +08:00
|
|
|
return err;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2012-03-19 09:46:39 +08:00
|
|
|
/* Find the zone which holds a particular offset
|
|
|
|
* Update *sectorp to be an offset in that zone
|
|
|
|
*/
|
|
|
|
static struct strip_zone *find_zone(struct r0conf *conf,
|
|
|
|
sector_t *sectorp)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct strip_zone *z = conf->strip_zone;
|
|
|
|
sector_t sector = *sectorp;
|
|
|
|
|
|
|
|
for (i = 0; i < conf->nr_strip_zones; i++)
|
|
|
|
if (sector < z[i].zone_end) {
|
|
|
|
if (i)
|
|
|
|
*sectorp = sector - z[i-1].zone_end;
|
|
|
|
return z + i;
|
|
|
|
}
|
|
|
|
BUG();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* remaps the bio to the target device. we separate two flows.
|
2015-04-10 11:19:04 +08:00
|
|
|
* power 2 flow and a general flow for the sake of performance
|
2012-03-19 09:46:39 +08:00
|
|
|
*/
|
|
|
|
static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
|
|
|
|
sector_t sector, sector_t *sector_offset)
|
|
|
|
{
|
|
|
|
unsigned int sect_in_chunk;
|
|
|
|
sector_t chunk;
|
|
|
|
struct r0conf *conf = mddev->private;
|
|
|
|
int raid_disks = conf->strip_zone[0].nb_dev;
|
|
|
|
unsigned int chunk_sects = mddev->chunk_sectors;
|
|
|
|
|
|
|
|
if (is_power_of_2(chunk_sects)) {
|
|
|
|
int chunksect_bits = ffz(~chunk_sects);
|
|
|
|
/* find the sector offset inside the chunk */
|
|
|
|
sect_in_chunk = sector & (chunk_sects - 1);
|
|
|
|
sector >>= chunksect_bits;
|
|
|
|
/* chunk in zone */
|
|
|
|
chunk = *sector_offset;
|
|
|
|
/* quotient is the chunk in real device*/
|
|
|
|
sector_div(chunk, zone->nb_dev << chunksect_bits);
|
|
|
|
} else{
|
|
|
|
sect_in_chunk = sector_div(sector, chunk_sects);
|
|
|
|
chunk = *sector_offset;
|
|
|
|
sector_div(chunk, chunk_sects * zone->nb_dev);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* position the bio over the real device
|
|
|
|
* real sector = chunk in device + starting of zone
|
|
|
|
* + the position in the chunk
|
|
|
|
*/
|
|
|
|
*sector_offset = (chunk * chunk_sects) + sect_in_chunk;
|
|
|
|
return conf->devlist[(zone - conf->strip_zone)*raid_disks
|
|
|
|
+ sector_div(sector, zone->nb_dev)];
|
|
|
|
}
|
|
|
|
|
2011-10-11 13:47:53 +08:00
|
|
|
/*
 * raid0_size() - total number of sectors provided by the array.
 * @mddev:      array to size.
 * @sectors:    expected to be 0; a non-zero value triggers a one-time warning.
 * @raid_disks: expected to be 0; a non-zero value triggers a one-time warning.
 *
 * Sums the size of every member device after masking off the bits below
 * the chunk size.
 *
 * NOTE(review): the mask only rounds down to a chunk multiple when
 * chunk_sectors is a power of two; non power-of-two chunk sizes are
 * accepted elsewhere in this file -- confirm whether this is intended.
 */
static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
	struct md_rdev *rdev;
	sector_t total = 0;

	WARN_ONCE(sectors || raid_disks,
		  "%s does not support generic reshape\n", __func__);

	rdev_for_each(rdev, mddev) {
		sector_t chunk_mask = ~(sector_t)(mddev->chunk_sectors - 1);

		total += rdev->sectors & chunk_mask;
	}

	return total;
}
|
|
|
|
|
2021-12-10 17:31:15 +08:00
|
|
|
static void free_conf(struct mddev *mddev, struct r0conf *conf)
|
|
|
|
{
|
|
|
|
kfree(conf->strip_zone);
|
|
|
|
kfree(conf->devlist);
|
|
|
|
kfree(conf);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * raid0_free() - release the private data of a raid0 array.
 * @mddev: array being torn down.
 * @priv:  the struct r0conf to release, passed as an opaque pointer.
 */
static void raid0_free(struct mddev *mddev, void *priv)
{
	free_conf(mddev, priv);
}
|
2012-04-02 07:48:37 +08:00
|
|
|
|
2011-10-11 13:47:53 +08:00
|
|
|
static int raid0_run(struct mddev *mddev)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2011-10-11 13:48:59 +08:00
|
|
|
struct r0conf *conf;
|
2009-06-16 14:47:21 +08:00
|
|
|
int ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-06-18 06:45:01 +08:00
|
|
|
if (mddev->chunk_sectors == 0) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev));
|
2006-01-06 16:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
2009-06-18 06:49:23 +08:00
|
|
|
if (md_check_no_bitmap(mddev))
|
|
|
|
return -EINVAL;
|
2015-02-14 02:48:01 +08:00
|
|
|
|
2010-03-08 13:02:44 +08:00
|
|
|
/* if private is not null, we are here after takeover */
|
|
|
|
if (mddev->private == NULL) {
|
|
|
|
ret = create_strip_zones(mddev, &conf);
|
|
|
|
if (ret < 0)
|
2023-06-22 00:51:03 +08:00
|
|
|
return ret;
|
2010-03-08 13:02:44 +08:00
|
|
|
mddev->private = conf;
|
|
|
|
}
|
|
|
|
conf = mddev->private;
|
2015-08-03 11:11:47 +08:00
|
|
|
if (mddev->queue) {
|
|
|
|
struct md_rdev *rdev;
|
|
|
|
|
|
|
|
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
|
2017-04-06 01:21:03 +08:00
|
|
|
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
|
2015-08-03 11:11:47 +08:00
|
|
|
|
|
|
|
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
|
|
|
|
blk_queue_io_opt(mddev->queue,
|
|
|
|
(mddev->chunk_sectors << 9) * mddev->raid_disks);
|
|
|
|
|
2015-09-24 13:47:47 +08:00
|
|
|
rdev_for_each(rdev, mddev) {
|
|
|
|
disk_stack_limits(mddev->gendisk, rdev->bdev,
|
|
|
|
rdev->data_offset << 9);
|
|
|
|
}
|
2015-08-03 11:11:47 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* calculate array device size */
|
2009-03-31 11:59:03 +08:00
|
|
|
md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_debug("md/raid0:%s: md_size is %llu sectors.\n",
|
|
|
|
mdname(mddev),
|
|
|
|
(unsigned long long)mddev->array_sectors);
|
2015-02-14 02:48:01 +08:00
|
|
|
|
2009-06-16 15:00:54 +08:00
|
|
|
dump_zones(mddev);
|
2012-04-02 07:48:37 +08:00
|
|
|
|
|
|
|
ret = md_integrity_register(mddev);
|
2021-12-10 17:31:15 +08:00
|
|
|
if (ret)
|
2023-06-22 00:51:03 +08:00
|
|
|
free_conf(mddev, conf);
|
2012-04-02 07:48:37 +08:00
|
|
|
|
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they
layout in such a way the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post 3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back to 5.4 discard support
for the 'original' layout was not added leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
/*
 * map_disk_shift() - translate a disk index into its position in the
 * order in which the disks of a zone are read/written.
 * @disk_index: index of the disk within the zone (0 .. num_disks - 1).
 * @num_disks:  number of disks in the zone.
 * @disk_shift: index of the disk that is read/written first.
 *
 * Example with 4 disks and disk_shift == 3: the read/write order is
 * disk 3, 0, 1, 2, so the mapping 0,1,2,3 => 1,2,3,0 is produced (disk 3
 * maps to position 0, disk 0 to position 1, and so on).  Comparing disks
 * in this output space reflects their read/write ordering.
 *
 * Returns the position of @disk_index in that order.
 */
static int map_disk_shift(int disk_index, int num_disks, int disk_shift)
{
	/* Add num_disks before subtracting so the dividend stays non-negative. */
	int rotated = disk_index + num_disks - disk_shift;

	return rotated % num_disks;
}
|
|
|
|
|
2017-05-08 08:36:24 +08:00
|
|
|
static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
|
|
|
|
{
|
|
|
|
struct r0conf *conf = mddev->private;
|
|
|
|
struct strip_zone *zone;
|
|
|
|
sector_t start = bio->bi_iter.bi_sector;
|
|
|
|
sector_t end;
|
|
|
|
unsigned int stripe_size;
|
|
|
|
sector_t first_stripe_index, last_stripe_index;
|
|
|
|
sector_t start_disk_offset;
|
|
|
|
unsigned int start_disk_index;
|
|
|
|
sector_t end_disk_offset;
|
|
|
|
unsigned int end_disk_index;
|
|
|
|
unsigned int disk;
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they
layout in such a way the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post 3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back to 5.4 discard support
for the 'original' layout was not added leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
sector_t orig_start, orig_end;
|
2017-05-08 08:36:24 +08:00
|
|
|
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they
layout in such a way the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post 3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back to 5.4 discard support
for the 'original' layout was not added leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
orig_start = start;
|
2017-05-08 08:36:24 +08:00
|
|
|
zone = find_zone(conf, &start);
|
|
|
|
|
|
|
|
if (bio_end_sector(bio) > zone->zone_end) {
|
|
|
|
struct bio *split = bio_split(bio,
|
|
|
|
zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
|
2018-05-21 06:25:52 +08:00
|
|
|
&mddev->bio_set);
|
2017-05-08 08:36:24 +08:00
|
|
|
bio_chain(split, bio);
|
2020-07-01 16:59:44 +08:00
|
|
|
submit_bio_noacct(bio);
|
2017-05-08 08:36:24 +08:00
|
|
|
bio = split;
|
|
|
|
end = zone->zone_end;
|
|
|
|
} else
|
|
|
|
end = bio_end_sector(bio);
|
|
|
|
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they
layout in such a way the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post 3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back to 5.4 discard support
for the 'original' layout was not added leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
orig_end = end;
|
2017-05-08 08:36:24 +08:00
|
|
|
if (zone != conf->strip_zone)
|
|
|
|
end = end - zone[-1].zone_end;
|
|
|
|
|
|
|
|
/* Now start and end is the offset in zone */
|
|
|
|
stripe_size = zone->nb_dev * mddev->chunk_sectors;
|
|
|
|
|
|
|
|
first_stripe_index = start;
|
|
|
|
sector_div(first_stripe_index, stripe_size);
|
|
|
|
last_stripe_index = end;
|
|
|
|
sector_div(last_stripe_index, stripe_size);
|
|
|
|
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they
layout in such a way the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post 3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back to 5.4 discard support
for the 'original' layout was not added leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
/* In the first zone the original and alternate layouts are the same */
|
|
|
|
if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) {
|
|
|
|
sector_div(orig_start, mddev->chunk_sectors);
|
|
|
|
start_disk_index = sector_div(orig_start, zone->nb_dev);
|
|
|
|
start_disk_index = map_disk_shift(start_disk_index,
|
|
|
|
zone->nb_dev,
|
|
|
|
zone->disk_shift);
|
|
|
|
sector_div(orig_end, mddev->chunk_sectors);
|
|
|
|
end_disk_index = sector_div(orig_end, zone->nb_dev);
|
|
|
|
end_disk_index = map_disk_shift(end_disk_index,
|
|
|
|
zone->nb_dev, zone->disk_shift);
|
|
|
|
} else {
|
|
|
|
start_disk_index = (int)(start - first_stripe_index * stripe_size) /
|
|
|
|
mddev->chunk_sectors;
|
|
|
|
end_disk_index = (int)(end - last_stripe_index * stripe_size) /
|
|
|
|
mddev->chunk_sectors;
|
|
|
|
}
|
2017-05-08 08:36:24 +08:00
|
|
|
start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
|
|
|
|
mddev->chunk_sectors) +
|
|
|
|
first_stripe_index * mddev->chunk_sectors;
|
|
|
|
end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
|
|
|
|
mddev->chunk_sectors) +
|
|
|
|
last_stripe_index * mddev->chunk_sectors;
|
|
|
|
|
|
|
|
for (disk = 0; disk < zone->nb_dev; disk++) {
|
|
|
|
sector_t dev_start, dev_end;
|
|
|
|
struct md_rdev *rdev;
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they
layout in such a way the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post 3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back to 5.4 discard support
for the 'original' layout was not added leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
int compare_disk;
|
|
|
|
|
|
|
|
compare_disk = map_disk_shift(disk, zone->nb_dev,
|
|
|
|
zone->disk_shift);
|
2017-05-08 08:36:24 +08:00
|
|
|
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they are
laid out in such a way that the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post-3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back in 5.4, discard support
for the 'original' layout was not added, leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
if (compare_disk < start_disk_index)
|
2017-05-08 08:36:24 +08:00
|
|
|
dev_start = (first_stripe_index + 1) *
|
|
|
|
mddev->chunk_sectors;
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they are
laid out in such a way that the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post-3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back in 5.4, discard support
for the 'original' layout was not added, leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
else if (compare_disk > start_disk_index)
|
2017-05-08 08:36:24 +08:00
|
|
|
dev_start = first_stripe_index * mddev->chunk_sectors;
|
|
|
|
else
|
|
|
|
dev_start = start_disk_offset;
|
|
|
|
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they are
laid out in such a way that the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post-3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back in 5.4, discard support
for the 'original' layout was not added, leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
if (compare_disk < end_disk_index)
|
2017-05-08 08:36:24 +08:00
|
|
|
dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
|
md/raid0: add discard support for the 'original' layout
We've found that using raid0 with the 'original' layout and discard
enabled with different disk sizes (such that at least two zones are
created) can result in data corruption. This is due to the fact that
the discard handling in 'raid0_handle_discard()' assumes the 'alternate'
layout. We've seen this corruption using ext4 but other filesystems are
likely susceptible as well.
More specifically, while multiple zones are necessary to create the
corruption, the corruption may not occur with multiple zones if they are
laid out in such a way that the layout matches what the 'alternate' layout
would have produced. Thus, not all raid0 devices with the 'original'
layout, different size disks and discard enabled will encounter this
corruption.
The 3.14 kernel inadvertently changed the raid0 disk layout for different
size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the
same raid0 array could corrupt data. This led to the creation of the
'original' layout (to match the pre-3.14 layout) and the 'alternate' layout
(to match the post-3.14 layout) in the 5.4 kernel time frame and an option
to tell the kernel which layout to use (since it couldn't be autodetected).
However, when the 'original' layout was added back in 5.4, discard support
for the 'original' layout was not added, leading to this issue.
I've been able to reliably reproduce the corruption with the following
test case:
1. create raid0 array with different size disks using original layout
2. mkfs
3. mount -o discard
4. create lots of files
5. remove 1/2 the files
6. fstrim -a (or just the mount point for the raid0 array)
7. umount
8. fsck -fn /dev/md0 (spews all sorts of corruptions)
Let's fix this by adding proper discard support to the 'original' layout.
The fix 'maps' the 'original' layout disks to the order in which they are
read/written such that we can compare the disks in the same way that the
current 'alternate' layout does. A 'disk_shift' field is added to
'struct strip_zone'. This could be computed on the fly in
raid0_handle_discard() but by adding this field, we save some computation
in the discard path.
Note we could also potentially fix this by re-ordering the disks in the
zones that follow the first one, and then always read/writing them using
the 'alternate' layout. However, that is seen as a more substantial change,
and we are attempting the least invasive fix at this time to remedy the
corruption.
I've verified the change using the reproducer mentioned above. Typically,
the corruption is seen after less than 3 iterations, while the patch has
run 500+ iterations.
Cc: NeilBrown <neilb@suse.de>
Cc: Song Liu <song@kernel.org>
Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.")
Cc: stable@vger.kernel.org
Signed-off-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com
2023-06-24 02:05:23 +08:00
|
|
|
else if (compare_disk > end_disk_index)
|
2017-05-08 08:36:24 +08:00
|
|
|
dev_end = last_stripe_index * mddev->chunk_sectors;
|
|
|
|
else
|
|
|
|
dev_end = end_disk_offset;
|
|
|
|
|
|
|
|
if (dev_end <= dev_start)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
rdev = conf->devlist[(zone - conf->strip_zone) *
|
|
|
|
conf->strip_zone[0].nb_dev + disk];
|
2021-02-04 15:50:43 +08:00
|
|
|
md_submit_discard_bio(mddev, rdev, bio,
|
2017-05-08 08:36:24 +08:00
|
|
|
dev_start + zone->dev_start + rdev->data_offset,
|
2021-02-04 15:50:43 +08:00
|
|
|
dev_end - dev_start);
|
2017-05-08 08:36:24 +08:00
|
|
|
}
|
|
|
|
bio_endio(bio);
|
|
|
|
}
|
|
|
|
|
2023-08-14 17:27:07 +08:00
|
|
|
static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
|
2009-06-16 15:02:05 +08:00
|
|
|
{
|
2019-09-09 14:30:02 +08:00
|
|
|
struct r0conf *conf = mddev->private;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct strip_zone *zone;
|
2011-10-11 13:45:26 +08:00
|
|
|
struct md_rdev *tmp_dev;
|
2023-08-14 17:27:07 +08:00
|
|
|
sector_t bio_sector = bio->bi_iter.bi_sector;
|
|
|
|
sector_t sector = bio_sector;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2023-08-17 02:13:55 +08:00
|
|
|
md_account_bio(mddev, &bio);
|
2021-05-25 17:46:17 +08:00
|
|
|
|
2017-04-05 12:05:51 +08:00
|
|
|
zone = find_zone(mddev->private, &sector);
|
2019-09-09 14:30:02 +08:00
|
|
|
switch (conf->layout) {
|
|
|
|
case RAID0_ORIG_LAYOUT:
|
2023-08-14 17:27:07 +08:00
|
|
|
tmp_dev = map_sector(mddev, zone, bio_sector, &sector);
|
2019-09-09 14:30:02 +08:00
|
|
|
break;
|
|
|
|
case RAID0_ALT_MULTIZONE_LAYOUT:
|
|
|
|
tmp_dev = map_sector(mddev, zone, sector, &sector);
|
|
|
|
break;
|
|
|
|
default:
|
2019-09-21 14:00:31 +08:00
|
|
|
WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev));
|
2019-09-09 14:30:02 +08:00
|
|
|
bio_io_error(bio);
|
2023-08-14 17:27:07 +08:00
|
|
|
return;
|
2019-09-09 14:30:02 +08:00
|
|
|
}
|
md raid0/linear: Mark array as 'broken' and fail BIOs if a member is gone
Currently md raid0/linear are not provided with any mechanism to validate
if an array member got removed or failed. The driver keeps sending BIOs
regardless of the state of array members, and kernel shows state 'clean'
in the 'array_state' sysfs attribute. This leads to the following
situation: if a raid0/linear array member is removed and the array is
mounted, some user writing to this array won't realize that errors are
happening unless they check dmesg or perform one fsync per written file.
Despite udev signaling the member device is gone, 'mdadm' cannot issue the
STOP_ARRAY ioctl successfully, given the array is mounted.
In other words, no -EIO is returned and writes (except direct ones) appear
normal. Meaning the user might think the written data is correctly stored in
the array, but instead garbage was written given that raid0 does striping
(and so, it requires all its members to be working in order to not corrupt
data). For md/linear, writes to the available members will work fine, but
if the writes go to the missing member(s), it'll cause a file corruption
situation, whereas the portion of the writes to the missing devices aren't
written effectively.
This patch changes this behavior: we check if the block device's gendisk
is UP when submitting the BIO to the array member, and if it isn't, we flag
the md device as MD_BROKEN and fail subsequent I/Os to that device; a read
request to the array requiring data from a valid member is still completed.
While flagging the device as MD_BROKEN, we also show a rate-limited warning
in the kernel log.
A new array state 'broken' was added too: it mimics the state 'clean' in
every aspect, being useful only to distinguish if the array has some member
missing. We rely on the MD_BROKEN flag to put the array in the 'broken'
state. This state cannot be written in 'array_state' as it just shows
one or more members of the array are missing but acts like 'clean', it
wouldn't make sense to write it.
With this patch, the filesystem reacts much faster to the event of missing
array member: after some I/O errors, ext4 for instance aborts the journal
and prevents corruption. Without this change, we're able to keep writing
in the disk and after a machine reboot, e2fsck shows some severe fs errors
that demand fixing. This patch was tested in ext4 and xfs filesystems, and
requires a 'mdadm' counterpart to handle the 'broken' state.
Cc: Song Liu <songliubraving@fb.com>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
2019-09-04 03:49:00 +08:00
|
|
|
|
2023-03-06 21:03:17 +08:00
|
|
|
if (unlikely(is_rdev_broken(tmp_dev))) {
|
md raid0/linear: Mark array as 'broken' and fail BIOs if a member is gone
Currently md raid0/linear are not provided with any mechanism to validate
if an array member got removed or failed. The driver keeps sending BIOs
regardless of the state of array members, and kernel shows state 'clean'
in the 'array_state' sysfs attribute. This leads to the following
situation: if a raid0/linear array member is removed and the array is
mounted, some user writing to this array won't realize that errors are
happening unless they check dmesg or perform one fsync per written file.
Despite udev signaling the member device is gone, 'mdadm' cannot issue the
STOP_ARRAY ioctl successfully, given the array is mounted.
In other words, no -EIO is returned and writes (except direct ones) appear
normal. Meaning the user might think the written data is correctly stored in
the array, but instead garbage was written given that raid0 does striping
(and so, it requires all its members to be working in order to not corrupt
data). For md/linear, writes to the available members will work fine, but
if the writes go to the missing member(s), it'll cause a file corruption
situation, whereas the portion of the writes to the missing devices aren't
written effectively.
This patch changes this behavior: we check if the block device's gendisk
is UP when submitting the BIO to the array member, and if it isn't, we flag
the md device as MD_BROKEN and fail subsequent I/Os to that device; a read
request to the array requiring data from a valid member is still completed.
While flagging the device as MD_BROKEN, we also show a rate-limited warning
in the kernel log.
A new array state 'broken' was added too: it mimics the state 'clean' in
every aspect, being useful only to distinguish if the array has some member
missing. We rely on the MD_BROKEN flag to put the array in the 'broken'
state. This state cannot be written in 'array_state' as it just shows
one or more members of the array are missing but acts like 'clean', it
wouldn't make sense to write it.
With this patch, the filesystem reacts much faster to the event of missing
array member: after some I/O errors, ext4 for instance aborts the journal
and prevents corruption. Without this change, we're able to keep writing
in the disk and after a machine reboot, e2fsck shows some severe fs errors
that demand fixing. This patch was tested in ext4 and xfs filesystems, and
requires a 'mdadm' counterpart to handle the 'broken' state.
Cc: Song Liu <songliubraving@fb.com>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
2019-09-04 03:49:00 +08:00
|
|
|
bio_io_error(bio);
|
2023-03-06 21:03:17 +08:00
|
|
|
md_error(mddev, tmp_dev);
|
2023-08-14 17:27:07 +08:00
|
|
|
return;
|
md raid0/linear: Mark array as 'broken' and fail BIOs if a member is gone
Currently md raid0/linear are not provided with any mechanism to validate
if an array member got removed or failed. The driver keeps sending BIOs
regardless of the state of array members, and kernel shows state 'clean'
in the 'array_state' sysfs attribute. This leads to the following
situation: if a raid0/linear array member is removed and the array is
mounted, some user writing to this array won't realize that errors are
happening unless they check dmesg or perform one fsync per written file.
Despite udev signaling the member device is gone, 'mdadm' cannot issue the
STOP_ARRAY ioctl successfully, given the array is mounted.
In other words, no -EIO is returned and writes (except direct ones) appear
normal. Meaning the user might think the written data is correctly stored in
the array, but instead garbage was written given that raid0 does striping
(and so, it requires all its members to be working in order to not corrupt
data). For md/linear, writes to the available members will work fine, but
if the writes go to the missing member(s), it'll cause a file corruption
situation, whereas the portion of the writes to the missing devices aren't
written effectively.
This patch changes this behavior: we check if the block device's gendisk
is UP when submitting the BIO to the array member, and if it isn't, we flag
the md device as MD_BROKEN and fail subsequent I/Os to that device; a read
request to the array requiring data from a valid member is still completed.
While flagging the device as MD_BROKEN, we also show a rate-limited warning
in the kernel log.
A new array state 'broken' was added too: it mimics the state 'clean' in
every aspect, being useful only to distinguish if the array has some member
missing. We rely on the MD_BROKEN flag to put the array in the 'broken'
state. This state cannot be written in 'array_state' as it just shows
one or more members of the array are missing but acts like 'clean', it
wouldn't make sense to write it.
With this patch, the filesystem reacts much faster to the event of missing
array member: after some I/O errors, ext4 for instance aborts the journal
and prevents corruption. Without this change, we're able to keep writing
in the disk and after a machine reboot, e2fsck shows some severe fs errors
that demand fixing. This patch was tested in ext4 and xfs filesystems, and
requires a 'mdadm' counterpart to handle the 'broken' state.
Cc: Song Liu <songliubraving@fb.com>
Reviewed-by: NeilBrown <neilb@suse.de>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@canonical.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
2019-09-04 03:49:00 +08:00
|
|
|
}
|
|
|
|
|
2017-08-24 01:10:32 +08:00
|
|
|
bio_set_dev(bio, tmp_dev->bdev);
|
2017-04-05 12:05:51 +08:00
|
|
|
bio->bi_iter.bi_sector = sector + zone->dev_start +
|
|
|
|
tmp_dev->data_offset;
|
|
|
|
|
2017-05-08 08:36:24 +08:00
|
|
|
if (mddev->gendisk)
|
2020-12-04 00:21:38 +08:00
|
|
|
trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
|
|
|
|
bio_sector);
|
2017-05-08 08:36:24 +08:00
|
|
|
mddev_check_write_zeroes(mddev, bio);
|
2020-07-01 16:59:44 +08:00
|
|
|
submit_bio_noacct(bio);
|
2023-08-14 17:27:07 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static bool raid0_make_request(struct mddev *mddev, struct bio *bio)
|
|
|
|
{
|
|
|
|
sector_t sector;
|
|
|
|
unsigned chunk_sects;
|
|
|
|
unsigned sectors;
|
|
|
|
|
|
|
|
if (unlikely(bio->bi_opf & REQ_PREFLUSH)
|
|
|
|
&& md_flush_request(mddev, bio))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
|
|
|
|
raid0_handle_discard(mddev, bio);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
sector = bio->bi_iter.bi_sector;
|
|
|
|
chunk_sects = mddev->chunk_sectors;
|
|
|
|
|
|
|
|
sectors = chunk_sects -
|
|
|
|
(likely(is_power_of_2(chunk_sects))
|
|
|
|
? (sector & (chunk_sects-1))
|
|
|
|
: sector_div(sector, chunk_sects));
|
|
|
|
|
|
|
|
if (sectors < bio_sectors(bio)) {
|
|
|
|
struct bio *split = bio_split(bio, sectors, GFP_NOIO,
|
|
|
|
&mddev->bio_set);
|
|
|
|
bio_chain(split, bio);
|
md/raid0: Fix performance regression for large sequential writes
Commit f00d7c85be9e ("md/raid0: fix up bio splitting.") among other
things changed how bio that needs to be split is submitted. Before this
commit, we have split the bio, mapped and submitted each part. After
this commit, we map only the first part of the split bio and submit the
second part unmapped. Due to bio sorting in __submit_bio_noacct() this
results in the following request ordering:
9,0 18 1181 0.525037895 15995 Q WS 1479315464 + 63392
Split off chunk-sized (1024 sectors) request:
9,0 18 1182 0.629019647 15995 X WS 1479315464 / 1479316488
Request is unaligned to the chunk so it's split in
raid0_make_request(). This is the first part mapped and punted to
bio_list:
8,0 18 7053 0.629020455 15995 A WS 739921928 + 1016 <- (9,0) 1479315464
Now raid0_make_request() returns, second part is postponed on
bio_list. __submit_bio_noacct() resorts the bio_list, mapped request
is submitted to the underlying device:
8,0 18 7054 0.629022782 15995 G WS 739921928 + 1016
Now we take another request from the bio_list which is the remainder
of the original huge request. Split off another chunk-sized bit from
it and the situation repeats:
9,0 18 1183 0.629024499 15995 X WS 1479316488 / 1479317512
8,16 18 6998 0.629025110 15995 A WS 739921928 + 1016 <- (9,0) 1479316488
8,16 18 6999 0.629026728 15995 G WS 739921928 + 1016
...
9,0 18 1184 0.629032940 15995 X WS 1479317512 / 1479318536 [libnetacq-write]
8,0 18 7059 0.629033294 15995 A WS 739922952 + 1016 <- (9,0) 1479317512
8,0 18 7060 0.629033902 15995 G WS 739922952 + 1016
...
This repeats until we consume the whole original huge request. Now we
finally get to processing the second parts of the split off requests
(in reverse order):
8,16 18 7181 0.629161384 15995 A WS 739952640 + 8 <- (9,0) 1479377920
8,0 18 7239 0.629162140 15995 A WS 739952640 + 8 <- (9,0) 1479376896
8,16 18 7186 0.629163881 15995 A WS 739951616 + 8 <- (9,0) 1479375872
8,0 18 7242 0.629164421 15995 A WS 739951616 + 8 <- (9,0) 1479374848
...
I guess it is obvious that this IO pattern is extremely inefficient way
to perform sequential IO. It also makes bio_list to grow to rather long
lengths.
Change raid0_make_request() to map both parts of the split bio. Since we
know we are provided with at most chunk-sized bios, we will always need
to split the incoming bio at most once.
Fixes: f00d7c85be9e ("md/raid0: fix up bio splitting.")
Signed-off-by: Jan Kara <jack@suse.cz>
Reviewed-by: Yu Kuai <yukuai3@huawei.com>
Link: https://lore.kernel.org/r/20230814092720.3931-2-jack@suse.cz
Signed-off-by: Song Liu <song@kernel.org>
2023-08-14 17:27:08 +08:00
|
|
|
raid0_map_submit_bio(mddev, bio);
|
2023-08-14 17:27:07 +08:00
|
|
|
bio = split;
|
|
|
|
}
|
|
|
|
|
|
|
|
raid0_map_submit_bio(mddev, bio);
|
2017-06-05 14:49:39 +08:00
|
|
|
return true;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-10-17 14:30:53 +08:00
|
|
|
|
2011-10-11 13:47:53 +08:00
|
|
|
static void raid0_status(struct seq_file *seq, struct mddev *mddev)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2009-06-18 06:45:01 +08:00
|
|
|
seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
|
2005-04-17 06:20:36 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2023-03-06 21:03:17 +08:00
|
|
|
static void raid0_error(struct mddev *mddev, struct md_rdev *rdev)
|
|
|
|
{
|
|
|
|
if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
|
|
|
|
char *md_name = mdname(mddev);
|
|
|
|
|
|
|
|
pr_crit("md/raid0%s: Disk failure on %pg detected, failing array.\n",
|
|
|
|
md_name, rdev->bdev);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-10-11 13:47:53 +08:00
|
|
|
static void *raid0_takeover_raid45(struct mddev *mddev)
|
2010-03-08 13:02:44 +08:00
|
|
|
{
|
2011-10-11 13:45:26 +08:00
|
|
|
struct md_rdev *rdev;
|
2011-10-11 13:48:59 +08:00
|
|
|
struct r0conf *priv_conf;
|
2010-03-08 13:02:44 +08:00
|
|
|
|
|
|
|
if (mddev->degraded != 1) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
|
|
|
|
mdname(mddev),
|
|
|
|
mddev->degraded);
|
2010-03-08 13:02:44 +08:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
|
|
|
|
2012-03-19 09:46:39 +08:00
|
|
|
rdev_for_each(rdev, mddev) {
|
2010-03-08 13:02:44 +08:00
|
|
|
/* check slot number for a disk */
|
|
|
|
if (rdev->raid_disk == mddev->raid_disks-1) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n",
|
|
|
|
mdname(mddev));
|
2010-03-08 13:02:44 +08:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
2013-06-26 09:55:20 +08:00
|
|
|
rdev->sectors = mddev->dev_sectors;
|
2010-03-08 13:02:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Set new parameters */
|
|
|
|
mddev->new_level = 0;
|
2010-06-16 18:55:14 +08:00
|
|
|
mddev->new_layout = 0;
|
2010-03-08 13:02:44 +08:00
|
|
|
mddev->new_chunk_sectors = mddev->chunk_sectors;
|
|
|
|
mddev->raid_disks--;
|
|
|
|
mddev->delta_disks = -1;
|
|
|
|
/* make sure it will be not marked as dirty */
|
|
|
|
mddev->recovery_cp = MaxSector;
|
2017-01-05 08:10:19 +08:00
|
|
|
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
|
2010-03-08 13:02:44 +08:00
|
|
|
|
|
|
|
create_strip_zones(mddev, &priv_conf);
|
2016-12-09 07:48:17 +08:00
|
|
|
|
2010-03-08 13:02:44 +08:00
|
|
|
return priv_conf;
|
|
|
|
}
|
|
|
|
|
2011-10-11 13:47:53 +08:00
|
|
|
static void *raid0_takeover_raid10(struct mddev *mddev)
|
2010-03-08 13:02:44 +08:00
|
|
|
{
|
2011-10-11 13:48:59 +08:00
|
|
|
struct r0conf *priv_conf;
|
2010-03-08 13:02:44 +08:00
|
|
|
|
|
|
|
/* Check layout:
|
|
|
|
* - far_copies must be 1
|
|
|
|
* - near_copies must be 2
|
|
|
|
* - disks number must be even
|
|
|
|
* - all mirrors must be already degraded
|
|
|
|
*/
|
|
|
|
if (mddev->layout != ((1 << 8) + 2)) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
|
|
|
|
mdname(mddev),
|
|
|
|
mddev->layout);
|
2010-03-08 13:02:44 +08:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
|
|
|
if (mddev->raid_disks & 1) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
|
|
|
|
mdname(mddev));
|
2010-03-08 13:02:44 +08:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
|
|
|
if (mddev->degraded != (mddev->raid_disks>>1)) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: All mirrors must be already degraded!\n",
|
|
|
|
mdname(mddev));
|
2010-03-08 13:02:44 +08:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Set new parameters */
|
|
|
|
mddev->new_level = 0;
|
2010-06-16 18:55:14 +08:00
|
|
|
mddev->new_layout = 0;
|
2010-03-08 13:02:44 +08:00
|
|
|
mddev->new_chunk_sectors = mddev->chunk_sectors;
|
|
|
|
mddev->delta_disks = - mddev->raid_disks / 2;
|
|
|
|
mddev->raid_disks += mddev->delta_disks;
|
|
|
|
mddev->degraded = 0;
|
|
|
|
/* make sure it will be not marked as dirty */
|
|
|
|
mddev->recovery_cp = MaxSector;
|
2017-01-05 08:10:19 +08:00
|
|
|
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
|
2010-03-08 13:02:44 +08:00
|
|
|
|
|
|
|
create_strip_zones(mddev, &priv_conf);
|
|
|
|
return priv_conf;
|
|
|
|
}
|
|
|
|
|
2011-10-11 13:47:53 +08:00
|
|
|
static void *raid0_takeover_raid1(struct mddev *mddev)
|
2011-01-31 10:47:13 +08:00
|
|
|
{
|
2011-10-11 13:48:59 +08:00
|
|
|
struct r0conf *priv_conf;
|
2012-04-01 21:48:38 +08:00
|
|
|
int chunksect;
|
2011-01-31 10:47:13 +08:00
|
|
|
|
|
|
|
/* Check layout:
|
|
|
|
* - (N - 1) mirror drives must be already faulty
|
|
|
|
*/
|
|
|
|
if ((mddev->raid_disks - 1) != mddev->degraded) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
|
2011-01-31 10:47:13 +08:00
|
|
|
mdname(mddev));
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
|
|
|
|
2012-04-01 21:48:38 +08:00
|
|
|
/*
|
|
|
|
* a raid1 doesn't have the notion of chunk size, so
|
|
|
|
* figure out the largest suitable size we can use.
|
|
|
|
*/
|
|
|
|
chunksect = 64 * 2; /* 64K by default */
|
|
|
|
|
|
|
|
/* The array must be an exact multiple of chunksize */
|
|
|
|
while (chunksect && (mddev->array_sectors & (chunksect - 1)))
|
|
|
|
chunksect >>= 1;
|
|
|
|
|
|
|
|
if ((chunksect << 9) < PAGE_SIZE)
|
|
|
|
/* array size does not allow a suitable chunk size */
|
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
|
2011-01-31 10:47:13 +08:00
|
|
|
/* Set new parameters */
|
|
|
|
mddev->new_level = 0;
|
|
|
|
mddev->new_layout = 0;
|
2012-04-01 21:48:38 +08:00
|
|
|
mddev->new_chunk_sectors = chunksect;
|
|
|
|
mddev->chunk_sectors = chunksect;
|
2011-01-31 10:47:13 +08:00
|
|
|
mddev->delta_disks = 1 - mddev->raid_disks;
|
2011-02-14 07:01:41 +08:00
|
|
|
mddev->raid_disks = 1;
|
2011-01-31 10:47:13 +08:00
|
|
|
/* make sure it will be not marked as dirty */
|
|
|
|
mddev->recovery_cp = MaxSector;
|
2017-01-05 08:10:19 +08:00
|
|
|
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);
|
2011-01-31 10:47:13 +08:00
|
|
|
|
|
|
|
create_strip_zones(mddev, &priv_conf);
|
|
|
|
return priv_conf;
|
|
|
|
}
|
|
|
|
|
2011-10-11 13:47:53 +08:00
|
|
|
static void *raid0_takeover(struct mddev *mddev)
|
2010-03-08 13:02:44 +08:00
|
|
|
{
|
|
|
|
/* raid0 can take over:
|
2010-06-16 18:56:12 +08:00
|
|
|
* raid4 - if all data disks are active.
|
2010-03-08 13:02:44 +08:00
|
|
|
* raid5 - providing it is Raid4 layout and one disk is faulty
|
|
|
|
* raid10 - assuming we have all necessary active disks
|
2011-01-31 10:47:13 +08:00
|
|
|
* raid1 - with (N -1) mirror drives faulty
|
2010-03-08 13:02:44 +08:00
|
|
|
*/
|
2014-08-06 14:34:27 +08:00
|
|
|
|
|
|
|
if (mddev->bitmap) {
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0: %s: cannot takeover array with bitmap\n",
|
|
|
|
mdname(mddev));
|
2014-08-06 14:34:27 +08:00
|
|
|
return ERR_PTR(-EBUSY);
|
|
|
|
}
|
2010-06-16 18:56:12 +08:00
|
|
|
if (mddev->level == 4)
|
|
|
|
return raid0_takeover_raid45(mddev);
|
|
|
|
|
2010-03-08 13:02:44 +08:00
|
|
|
if (mddev->level == 5) {
|
|
|
|
if (mddev->layout == ALGORITHM_PARITY_N)
|
2010-06-16 18:56:12 +08:00
|
|
|
return raid0_takeover_raid45(mddev);
|
2010-03-08 13:02:44 +08:00
|
|
|
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
|
|
|
|
mdname(mddev), ALGORITHM_PARITY_N);
|
2010-03-08 13:02:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (mddev->level == 10)
|
|
|
|
return raid0_takeover_raid10(mddev);
|
|
|
|
|
2011-01-31 10:47:13 +08:00
|
|
|
if (mddev->level == 1)
|
|
|
|
return raid0_takeover_raid1(mddev);
|
|
|
|
|
2016-11-02 11:16:50 +08:00
|
|
|
pr_warn("Takeover from raid%i to raid0 not supported\n",
|
2011-01-31 10:47:13 +08:00
|
|
|
mddev->level);
|
|
|
|
|
2010-03-08 13:02:44 +08:00
|
|
|
return ERR_PTR(-EINVAL);
|
|
|
|
}
|
|
|
|
|
2017-10-19 09:49:15 +08:00
|
|
|
/*
 * raid0_quiesce() - quiesce hook of the raid0 personality.
 *
 * Deliberately empty: neither @mddev nor @quiesce is used and no work is
 * performed for any value of @quiesce.
 */
static void raid0_quiesce(struct mddev *mddev, int quiesce)
{
	/* nothing to do */
}
|
|
|
|
|
2011-10-11 13:49:58 +08:00
|
|
|
static struct md_personality raid0_personality=
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
.name = "raid0",
|
2006-01-06 16:20:36 +08:00
|
|
|
.level = 0,
|
2005-04-17 06:20:36 +08:00
|
|
|
.owner = THIS_MODULE,
|
|
|
|
.make_request = raid0_make_request,
|
|
|
|
.run = raid0_run,
|
2014-12-15 09:56:58 +08:00
|
|
|
.free = raid0_free,
|
2005-04-17 06:20:36 +08:00
|
|
|
.status = raid0_status,
|
2009-03-18 09:10:40 +08:00
|
|
|
.size = raid0_size,
|
2010-03-08 13:02:44 +08:00
|
|
|
.takeover = raid0_takeover,
|
|
|
|
.quiesce = raid0_quiesce,
|
2023-03-06 21:03:17 +08:00
|
|
|
.error_handler = raid0_error,
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * raid0_init() - module entry point.
 *
 * Registers the raid0 personality and returns whatever
 * register_md_personality() reports.
 */
static int __init raid0_init(void)
{
	return register_md_personality(&raid0_personality);
}
|
|
|
|
|
|
|
|
static void raid0_exit (void)
|
|
|
|
{
|
2006-01-06 16:20:36 +08:00
|
|
|
unregister_md_personality (&raid0_personality);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
module_init(raid0_init);
|
|
|
|
module_exit(raid0_exit);
|
|
|
|
MODULE_LICENSE("GPL");
|
2009-12-14 09:49:58 +08:00
|
|
|
MODULE_DESCRIPTION("RAID0 (striping) personality for MD");
|
2005-04-17 06:20:36 +08:00
|
|
|
MODULE_ALIAS("md-personality-2"); /* RAID0 */
|
2006-01-06 16:20:51 +08:00
|
|
|
MODULE_ALIAS("md-raid0");
|
2006-01-06 16:20:36 +08:00
|
|
|
MODULE_ALIAS("md-level-0");
|