2018-08-11 13:19:58 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2013-03-24 07:11:31 +08:00
|
|
|
/*
|
|
|
|
* bcache setup/teardown code, and some metadata io - read a superblock and
|
|
|
|
* figure out what to do with it.
|
|
|
|
*
|
|
|
|
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
|
|
|
|
* Copyright 2012 Google, Inc.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "bcache.h"
|
|
|
|
#include "btree.h"
|
|
|
|
#include "debug.h"
|
2013-12-21 09:22:05 +08:00
|
|
|
#include "extents.h"
|
2013-03-24 07:11:31 +08:00
|
|
|
#include "request.h"
|
2013-06-05 21:21:07 +08:00
|
|
|
#include "writeback.h"
|
2020-07-25 20:00:27 +08:00
|
|
|
#include "features.h"
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-04-27 06:39:55 +08:00
|
|
|
#include <linux/blkdev.h>
|
2013-03-24 07:11:31 +08:00
|
|
|
#include <linux/debugfs.h>
|
|
|
|
#include <linux/genhd.h>
|
2013-07-31 16:12:02 +08:00
|
|
|
#include <linux/idr.h>
|
2013-07-11 09:31:58 +08:00
|
|
|
#include <linux/kthread.h>
|
2020-06-15 00:53:32 +08:00
|
|
|
#include <linux/workqueue.h>
|
2013-03-24 07:11:31 +08:00
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/random.h>
|
|
|
|
#include <linux/reboot.h>
|
|
|
|
#include <linux/sysfs.h>
|
|
|
|
|
2018-12-13 22:53:55 +08:00
|
|
|
unsigned int bch_cutoff_writeback;
|
|
|
|
unsigned int bch_cutoff_writeback_sync;
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
static const char bcache_magic[] = {
|
|
|
|
0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
|
|
|
|
0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
|
|
|
|
};
|
|
|
|
|
|
|
|
static const char invalid_uuid[] = {
|
|
|
|
0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
|
|
|
|
0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct kobject *bcache_kobj;
|
|
|
|
struct mutex bch_register_lock;
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
bool bcache_is_reboot;
|
2013-03-24 07:11:31 +08:00
|
|
|
LIST_HEAD(bch_cache_sets);
|
|
|
|
static LIST_HEAD(uncached_devices);
|
|
|
|
|
2013-07-31 16:12:02 +08:00
|
|
|
static int bcache_major;
|
bcache: rewrite multiple partitions support
Current partition support of bcache is confusing and buggy. It tries to
trace non-continuous device minor numbers by an ida bit string, and
mistakenly mixed bcache device index with minor numbers. This design
generates several negative results,
- Index of bcache device name is not consecutive under /dev/. If there are
3 bcache devices, they name will be,
/dev/bcache0, /dev/bcache16, /dev/bcache32
Only bcache code indexes bcache device name is such an interesting way.
- First minor number of each bcache device is traced by ida bit string.
One bcache device will occupy 16 bits, this is not a good idea. Indeed
only one bit is enough.
- Because minor number and bcache device index are mixed, a device index
is allocated by ida_simple_get(), but an first minor number is sent into
ida_simple_remove() to release the device. It confused original author
too.
Root cause of the above errors is, bcache code should not handle device
minor numbers at all! A standard process to support multiple partitions in
Linux kernel is,
- Device driver provides major device number, and indexes multiple device
instances.
- Device driver does not allocat nor trace device minor number, only
provides a first minor number of a given device instance, and sets how
many minor numbers (paritions) the device instance may have.
All rested stuffs are handled by block layer code, most of the details can
be found from block/{genhd, partition-generic}.c files.
This patch re-writes multiple partitions support for bcache. It makes
whole things to be more clear, and uses ida bit string in a more efficeint
way.
- Ida bit string only traces bcache device index, not minor number. For a
bcache device with 128 partitions, only one bit in ida bit string is
enough.
- Device minor number and device index are separated in concept. Device
index is used for /dev node naming, and ida bit string trace. Minor
number is calculated from device index and only used to initialize
first_minor of a bcache device.
- It does not follow any standard for 16 partitions on a bcache device.
This patch sets 128 partitions on single bcache device at max, this is
the limitation from GPT (GUID Partition Table) and supported by fdisk.
Considering a typical device minor number is 20 bits width, each bcache
device may have 128 partitions (7 bits), there can be 8192 bcache devices
existing on system. For most common deployment for a single server in
now days, it should be enough.
[minor spelling fixes in commit message by Michael Lyle]
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Eric Wheeler <bcache@lists.ewheeler.net>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-14 07:35:31 +08:00
|
|
|
static DEFINE_IDA(bcache_device_idx);
|
2013-03-24 07:11:31 +08:00
|
|
|
static wait_queue_head_t unregister_wait;
|
|
|
|
struct workqueue_struct *bcache_wq;
|
2018-09-27 23:41:46 +08:00
|
|
|
struct workqueue_struct *bch_journal_wq;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
|
bcache: rewrite multiple partitions support
Current partition support of bcache is confusing and buggy. It tries to
trace non-continuous device minor numbers by an ida bit string, and
mistakenly mixed bcache device index with minor numbers. This design
generates several negative results,
- Index of bcache device name is not consecutive under /dev/. If there are
3 bcache devices, they name will be,
/dev/bcache0, /dev/bcache16, /dev/bcache32
Only bcache code indexes bcache device name is such an interesting way.
- First minor number of each bcache device is traced by ida bit string.
One bcache device will occupy 16 bits, this is not a good idea. Indeed
only one bit is enough.
- Because minor number and bcache device index are mixed, a device index
is allocated by ida_simple_get(), but an first minor number is sent into
ida_simple_remove() to release the device. It confused original author
too.
Root cause of the above errors is, bcache code should not handle device
minor numbers at all! A standard process to support multiple partitions in
Linux kernel is,
- Device driver provides major device number, and indexes multiple device
instances.
- Device driver does not allocat nor trace device minor number, only
provides a first minor number of a given device instance, and sets how
many minor numbers (paritions) the device instance may have.
All rested stuffs are handled by block layer code, most of the details can
be found from block/{genhd, partition-generic}.c files.
This patch re-writes multiple partitions support for bcache. It makes
whole things to be more clear, and uses ida bit string in a more efficeint
way.
- Ida bit string only traces bcache device index, not minor number. For a
bcache device with 128 partitions, only one bit in ida bit string is
enough.
- Device minor number and device index are separated in concept. Device
index is used for /dev node naming, and ida bit string trace. Minor
number is calculated from device index and only used to initialize
first_minor of a bcache device.
- It does not follow any standard for 16 partitions on a bcache device.
This patch sets 128 partitions on single bcache device at max, this is
the limitation from GPT (GUID Partition Table) and supported by fdisk.
Considering a typical device minor number is 20 bits width, each bcache
device may have 128 partitions (7 bits), there can be 8192 bcache devices
existing on system. For most common deployment for a single server in
now days, it should be enough.
[minor spelling fixes in commit message by Michael Lyle]
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Eric Wheeler <bcache@lists.ewheeler.net>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-14 07:35:31 +08:00
|
|
|
/* limitation of partitions number on single bcache device */
|
|
|
|
#define BCACHE_MINORS 128
|
|
|
|
/* limitation of bcache devices number on single system */
|
|
|
|
#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
/* Superblock */
|
|
|
|
|
bcache: add bucket_size_hi into struct cache_sb_disk for large bucket
The large bucket feature is to extend bucket_size from 16bit to 32bit.
When create cache device on zoned device (e.g. zoned NVMe SSD), making
a single bucket cover one or more zones of the zoned device is the
simplest way to support zoned device as cache by bcache.
But current maximum bucket size is 16MB and a typical zone size of zoned
device is 256MB, this is the major motiviation to extend bucket size to
a larger bit width.
This patch is the basic and first change to support large bucket size,
the major changes it makes are,
- Add BCH_FEATURE_INCOMPAT_LARGE_BUCKET for the large bucket feature,
INCOMPAT means it introduces incompatible on-disk format change.
- Add BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET) routines.
- Adds __le16 bucket_size_hi into struct cache_sb_disk at offset 0x8d0
for the on-disk super block format.
- For the in-memory super block struct cache_sb, member bucket_size is
extended from __u16 to __32.
- Add get_bucket_size() to combine the bucket_size and bucket_size_hi
from struct cache_sb_disk into an unsigned int value.
Since we already have large bucket size helpers meta_bucket_pages(),
meta_bucket_bytes() and alloc_meta_bucket_pages(), they make sure when
bucket size > 8MB, the memory allocation for bcache meta data bucket
won't fail no matter how large the bucket size extended. So these meta
data buckets are handled properly when the bucket size width increase
from 16bit to 32bit, we don't need to worry about them.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-07-25 20:00:35 +08:00
|
|
|
static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
|
|
|
|
{
|
|
|
|
unsigned int bucket_size = le16_to_cpu(s->bucket_size);
|
|
|
|
|
|
|
|
if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES &&
|
|
|
|
bch_has_feature_large_bucket(sb))
|
|
|
|
bucket_size |= le16_to_cpu(s->bucket_size_hi) << 16;
|
|
|
|
|
|
|
|
return bucket_size;
|
|
|
|
}
|
|
|
|
|
2020-07-25 20:00:23 +08:00
|
|
|
static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev,
|
|
|
|
struct cache_sb_disk *s)
|
|
|
|
{
|
|
|
|
const char *err;
|
|
|
|
unsigned int i;
|
|
|
|
|
2020-07-25 20:00:28 +08:00
|
|
|
sb->first_bucket= le16_to_cpu(s->first_bucket);
|
2020-07-25 20:00:23 +08:00
|
|
|
sb->nbuckets = le64_to_cpu(s->nbuckets);
|
bcache: add bucket_size_hi into struct cache_sb_disk for large bucket
The large bucket feature is to extend bucket_size from 16bit to 32bit.
When create cache device on zoned device (e.g. zoned NVMe SSD), making
a single bucket cover one or more zones of the zoned device is the
simplest way to support zoned device as cache by bcache.
But current maximum bucket size is 16MB and a typical zone size of zoned
device is 256MB, this is the major motiviation to extend bucket size to
a larger bit width.
This patch is the basic and first change to support large bucket size,
the major changes it makes are,
- Add BCH_FEATURE_INCOMPAT_LARGE_BUCKET for the large bucket feature,
INCOMPAT means it introduces incompatible on-disk format change.
- Add BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET) routines.
- Adds __le16 bucket_size_hi into struct cache_sb_disk at offset 0x8d0
for the on-disk super block format.
- For the in-memory super block struct cache_sb, member bucket_size is
extended from __u16 to __32.
- Add get_bucket_size() to combine the bucket_size and bucket_size_hi
from struct cache_sb_disk into an unsigned int value.
Since we already have large bucket size helpers meta_bucket_pages(),
meta_bucket_bytes() and alloc_meta_bucket_pages(), they make sure when
bucket size > 8MB, the memory allocation for bcache meta data bucket
won't fail no matter how large the bucket size extended. So these meta
data buckets are handled properly when the bucket size width increase
from 16bit to 32bit, we don't need to worry about them.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-07-25 20:00:35 +08:00
|
|
|
sb->bucket_size = get_bucket_size(sb, s);
|
2020-07-25 20:00:23 +08:00
|
|
|
|
|
|
|
sb->nr_in_set = le16_to_cpu(s->nr_in_set);
|
|
|
|
sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
|
|
|
|
|
2020-07-25 20:00:28 +08:00
|
|
|
err = "Too many journal buckets";
|
|
|
|
if (sb->keys > SB_JOURNAL_BUCKETS)
|
|
|
|
goto err;
|
|
|
|
|
2020-07-25 20:00:23 +08:00
|
|
|
err = "Too many buckets";
|
|
|
|
if (sb->nbuckets > LONG_MAX)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Not enough buckets";
|
|
|
|
if (sb->nbuckets < 1 << 7)
|
|
|
|
goto err;
|
|
|
|
|
2020-07-25 20:00:24 +08:00
|
|
|
err = "Bad block size (not power of 2)";
|
|
|
|
if (!is_power_of_2(sb->block_size))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Bad block size (larger than page size)";
|
|
|
|
if (sb->block_size > PAGE_SECTORS)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Bad bucket size (not power of 2)";
|
|
|
|
if (!is_power_of_2(sb->bucket_size))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Bad bucket size (smaller than page size)";
|
|
|
|
if (sb->bucket_size < PAGE_SECTORS)
|
2020-07-25 20:00:23 +08:00
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Invalid superblock: device too small";
|
|
|
|
if (get_capacity(bdev->bd_disk) <
|
|
|
|
sb->bucket_size * sb->nbuckets)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Bad UUID";
|
|
|
|
if (bch_is_zero(sb->set_uuid, 16))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Bad cache device number in set";
|
|
|
|
if (!sb->nr_in_set ||
|
|
|
|
sb->nr_in_set <= sb->nr_this_dev ||
|
|
|
|
sb->nr_in_set > MAX_CACHES_PER_SET)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Journal buckets not sequential";
|
|
|
|
for (i = 0; i < sb->keys; i++)
|
|
|
|
if (sb->d[i] != sb->first_bucket + i)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Too many journal buckets";
|
|
|
|
if (sb->first_bucket + sb->keys > sb->nbuckets)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Invalid superblock: first bucket comes before end of super";
|
|
|
|
if (sb->first_bucket * sb->bucket_size < 16)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = NULL;
|
|
|
|
err:
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
|
2020-01-24 01:01:32 +08:00
|
|
|
struct cache_sb_disk **res)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
const char *err;
|
2020-01-24 01:01:27 +08:00
|
|
|
struct cache_sb_disk *s;
|
2020-01-24 01:01:34 +08:00
|
|
|
struct page *page;
|
2018-08-11 13:19:44 +08:00
|
|
|
unsigned int i;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-01-24 01:01:34 +08:00
|
|
|
page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
|
|
|
|
SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
|
|
|
|
if (IS_ERR(page))
|
2013-03-24 07:11:31 +08:00
|
|
|
return "IO error";
|
2020-01-24 01:01:34 +08:00
|
|
|
s = page_address(page) + offset_in_page(SB_OFFSET);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
sb->offset = le64_to_cpu(s->offset);
|
|
|
|
sb->version = le64_to_cpu(s->version);
|
|
|
|
|
|
|
|
memcpy(sb->magic, s->magic, 16);
|
|
|
|
memcpy(sb->uuid, s->uuid, 16);
|
|
|
|
memcpy(sb->set_uuid, s->set_uuid, 16);
|
|
|
|
memcpy(sb->label, s->label, SB_LABEL_SIZE);
|
|
|
|
|
|
|
|
sb->flags = le64_to_cpu(s->flags);
|
|
|
|
sb->seq = le64_to_cpu(s->seq);
|
|
|
|
sb->last_mount = le32_to_cpu(s->last_mount);
|
|
|
|
sb->keys = le16_to_cpu(s->keys);
|
|
|
|
|
|
|
|
for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
|
|
|
|
sb->d[i] = le64_to_cpu(s->d[i]);
|
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u\n",
|
2013-03-24 07:11:31 +08:00
|
|
|
sb->version, sb->flags, sb->seq, sb->keys);
|
|
|
|
|
2019-11-13 16:03:18 +08:00
|
|
|
err = "Not a bcache superblock (bad offset)";
|
2013-03-24 07:11:31 +08:00
|
|
|
if (sb->offset != SB_SECTOR)
|
|
|
|
goto err;
|
|
|
|
|
2019-11-13 16:03:18 +08:00
|
|
|
err = "Not a bcache superblock (bad magic)";
|
2013-03-24 07:11:31 +08:00
|
|
|
if (memcmp(sb->magic, bcache_magic, 16))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Bad checksum";
|
|
|
|
if (s->csum != csum_set(s))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "Bad UUID";
|
2013-03-29 02:50:55 +08:00
|
|
|
if (bch_is_zero(sb->uuid, 16))
|
2013-03-24 07:11:31 +08:00
|
|
|
goto err;
|
|
|
|
|
2013-04-24 12:51:48 +08:00
|
|
|
sb->block_size = le16_to_cpu(s->block_size);
|
|
|
|
|
|
|
|
err = "Superblock block size smaller than device block size";
|
|
|
|
if (sb->block_size << 9 < bdev_logical_block_size(bdev))
|
|
|
|
goto err;
|
|
|
|
|
2013-04-12 06:14:35 +08:00
|
|
|
switch (sb->version) {
|
|
|
|
case BCACHE_SB_VERSION_BDEV:
|
|
|
|
sb->data_offset = BDEV_DATA_START_DEFAULT;
|
|
|
|
break;
|
|
|
|
case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
|
2020-07-25 20:00:27 +08:00
|
|
|
case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
|
2013-04-12 06:14:35 +08:00
|
|
|
sb->data_offset = le64_to_cpu(s->data_offset);
|
|
|
|
|
|
|
|
err = "Bad data offset";
|
|
|
|
if (sb->data_offset < BDEV_DATA_START_DEFAULT)
|
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-04-12 06:14:35 +08:00
|
|
|
break;
|
|
|
|
case BCACHE_SB_VERSION_CDEV:
|
|
|
|
case BCACHE_SB_VERSION_CDEV_WITH_UUID:
|
2020-07-25 20:00:23 +08:00
|
|
|
err = read_super_common(sb, bdev, s);
|
|
|
|
if (err)
|
2013-04-12 06:14:35 +08:00
|
|
|
goto err;
|
|
|
|
break;
|
2020-07-25 20:00:27 +08:00
|
|
|
case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
|
bcache: add bucket_size_hi into struct cache_sb_disk for large bucket
The large bucket feature is to extend bucket_size from 16bit to 32bit.
When create cache device on zoned device (e.g. zoned NVMe SSD), making
a single bucket cover one or more zones of the zoned device is the
simplest way to support zoned device as cache by bcache.
But current maximum bucket size is 16MB and a typical zone size of zoned
device is 256MB, this is the major motiviation to extend bucket size to
a larger bit width.
This patch is the basic and first change to support large bucket size,
the major changes it makes are,
- Add BCH_FEATURE_INCOMPAT_LARGE_BUCKET for the large bucket feature,
INCOMPAT means it introduces incompatible on-disk format change.
- Add BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET) routines.
- Adds __le16 bucket_size_hi into struct cache_sb_disk at offset 0x8d0
for the on-disk super block format.
- For the in-memory super block struct cache_sb, member bucket_size is
extended from __u16 to __32.
- Add get_bucket_size() to combine the bucket_size and bucket_size_hi
from struct cache_sb_disk into an unsigned int value.
Since we already have large bucket size helpers meta_bucket_pages(),
meta_bucket_bytes() and alloc_meta_bucket_pages(), they make sure when
bucket size > 8MB, the memory allocation for bcache meta data bucket
won't fail no matter how large the bucket size extended. So these meta
data buckets are handled properly when the bucket size width increase
from 16bit to 32bit, we don't need to worry about them.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-07-25 20:00:35 +08:00
|
|
|
/*
|
|
|
|
* Feature bits are needed in read_super_common(),
|
|
|
|
* convert them firstly.
|
|
|
|
*/
|
2020-07-25 20:00:27 +08:00
|
|
|
sb->feature_compat = le64_to_cpu(s->feature_compat);
|
|
|
|
sb->feature_incompat = le64_to_cpu(s->feature_incompat);
|
|
|
|
sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
|
bcache: add bucket_size_hi into struct cache_sb_disk for large bucket
The large bucket feature is to extend bucket_size from 16bit to 32bit.
When create cache device on zoned device (e.g. zoned NVMe SSD), making
a single bucket cover one or more zones of the zoned device is the
simplest way to support zoned device as cache by bcache.
But current maximum bucket size is 16MB and a typical zone size of zoned
device is 256MB, this is the major motiviation to extend bucket size to
a larger bit width.
This patch is the basic and first change to support large bucket size,
the major changes it makes are,
- Add BCH_FEATURE_INCOMPAT_LARGE_BUCKET for the large bucket feature,
INCOMPAT means it introduces incompatible on-disk format change.
- Add BCH_FEATURE_INCOMPAT_FUNCS(large_bucket, LARGE_BUCKET) routines.
- Adds __le16 bucket_size_hi into struct cache_sb_disk at offset 0x8d0
for the on-disk super block format.
- For the in-memory super block struct cache_sb, member bucket_size is
extended from __u16 to __32.
- Add get_bucket_size() to combine the bucket_size and bucket_size_hi
from struct cache_sb_disk into an unsigned int value.
Since we already have large bucket size helpers meta_bucket_pages(),
meta_bucket_bytes() and alloc_meta_bucket_pages(), they make sure when
bucket size > 8MB, the memory allocation for bcache meta data bucket
won't fail no matter how large the bucket size extended. So these meta
data buckets are handled properly when the bucket size width increase
from 16bit to 32bit, we don't need to worry about them.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-07-25 20:00:35 +08:00
|
|
|
err = read_super_common(sb, bdev, s);
|
|
|
|
if (err)
|
2013-04-12 06:14:35 +08:00
|
|
|
goto err;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
err = "Unsupported superblock version";
|
2013-03-24 07:11:31 +08:00
|
|
|
goto err;
|
2013-04-12 06:14:35 +08:00
|
|
|
}
|
|
|
|
|
2018-07-26 12:17:41 +08:00
|
|
|
sb->last_mount = (u32)ktime_get_real_seconds();
|
2020-01-24 01:01:32 +08:00
|
|
|
*res = s;
|
2020-01-24 01:01:34 +08:00
|
|
|
return NULL;
|
2013-03-24 07:11:31 +08:00
|
|
|
err:
|
2020-01-24 01:01:34 +08:00
|
|
|
put_page(page);
|
2013-03-24 07:11:31 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
static void write_bdev_super_endio(struct bio *bio)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
struct cached_dev *dc = bio->bi_private;
|
2019-06-28 19:59:30 +08:00
|
|
|
|
|
|
|
if (bio->bi_status)
|
|
|
|
bch_count_backing_io_errors(dc, bio);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-12-17 07:27:25 +08:00
|
|
|
closure_put(&dc->sb_write);
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
2020-01-24 01:01:33 +08:00
|
|
|
static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
|
|
|
|
struct bio *bio)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
2018-08-11 13:19:44 +08:00
|
|
|
unsigned int i;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-01-24 01:01:33 +08:00
|
|
|
bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
|
2013-10-12 06:44:27 +08:00
|
|
|
bio->bi_iter.bi_sector = SB_SECTOR;
|
2020-01-24 01:01:33 +08:00
|
|
|
__bio_add_page(bio, virt_to_page(out), SB_SIZE,
|
|
|
|
offset_in_page(out));
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
out->offset = cpu_to_le64(sb->offset);
|
|
|
|
|
|
|
|
memcpy(out->uuid, sb->uuid, 16);
|
|
|
|
memcpy(out->set_uuid, sb->set_uuid, 16);
|
|
|
|
memcpy(out->label, sb->label, SB_LABEL_SIZE);
|
|
|
|
|
|
|
|
out->flags = cpu_to_le64(sb->flags);
|
|
|
|
out->seq = cpu_to_le64(sb->seq);
|
|
|
|
|
|
|
|
out->last_mount = cpu_to_le32(sb->last_mount);
|
|
|
|
out->first_bucket = cpu_to_le16(sb->first_bucket);
|
|
|
|
out->keys = cpu_to_le16(sb->keys);
|
|
|
|
|
|
|
|
for (i = 0; i < sb->keys; i++)
|
|
|
|
out->d[i] = cpu_to_le64(sb->d[i]);
|
|
|
|
|
2020-07-25 20:00:27 +08:00
|
|
|
if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
|
|
|
|
out->feature_compat = cpu_to_le64(sb->feature_compat);
|
|
|
|
out->feature_incompat = cpu_to_le64(sb->feature_incompat);
|
|
|
|
out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
|
|
|
|
}
|
|
|
|
|
|
|
|
out->version = cpu_to_le64(sb->version);
|
2013-03-24 07:11:31 +08:00
|
|
|
out->csum = csum_set(out);
|
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_debug("ver %llu, flags %llu, seq %llu\n",
|
2013-03-24 07:11:31 +08:00
|
|
|
sb->version, sb->flags, sb->seq);
|
|
|
|
|
2016-06-06 03:31:41 +08:00
|
|
|
submit_bio(bio);
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
2013-12-17 07:27:25 +08:00
|
|
|
static void bch_write_bdev_super_unlock(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
|
|
|
|
|
|
|
|
up(&dc->sb_write_mutex);
|
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
|
|
|
|
{
|
2013-12-17 07:27:25 +08:00
|
|
|
struct closure *cl = &dc->sb_write;
|
2013-03-24 07:11:31 +08:00
|
|
|
struct bio *bio = &dc->sb_bio;
|
|
|
|
|
2013-12-17 07:27:25 +08:00
|
|
|
down(&dc->sb_write_mutex);
|
|
|
|
closure_init(cl, parent);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-01-24 01:01:33 +08:00
|
|
|
bio_init(bio, dc->sb_bv, 1);
|
2017-08-24 01:10:32 +08:00
|
|
|
bio_set_dev(bio, dc->bdev);
|
2013-03-24 07:11:31 +08:00
|
|
|
bio->bi_end_io = write_bdev_super_endio;
|
|
|
|
bio->bi_private = dc;
|
|
|
|
|
|
|
|
closure_get(cl);
|
2018-03-19 08:36:24 +08:00
|
|
|
/* I/O request sent to backing device */
|
2020-01-24 01:01:33 +08:00
|
|
|
__write_super(&dc->sb, dc->sb_disk, bio);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-12-17 07:27:25 +08:00
|
|
|
closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
static void write_super_endio(struct bio *bio)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
struct cache *ca = bio->bi_private;
|
|
|
|
|
2018-01-09 04:21:29 +08:00
|
|
|
/* is_read = 0 */
|
|
|
|
bch_count_io_errors(ca, bio->bi_status, 0,
|
|
|
|
"writing superblock");
|
2013-12-17 07:27:25 +08:00
|
|
|
closure_put(&ca->set->sb_write);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void bcache_write_super_unlock(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct cache_set *c = container_of(cl, struct cache_set, sb_write);
|
|
|
|
|
|
|
|
up(&c->sb_write_mutex);
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void bcache_write_super(struct cache_set *c)
|
|
|
|
{
|
2013-12-17 07:27:25 +08:00
|
|
|
struct closure *cl = &c->sb_write;
|
2020-10-01 14:50:47 +08:00
|
|
|
struct cache *ca = c->cache;
|
|
|
|
struct bio *bio = &ca->sb_bio;
|
|
|
|
unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-12-17 07:27:25 +08:00
|
|
|
down(&c->sb_write_mutex);
|
|
|
|
closure_init(cl, &c->cl);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:56 +08:00
|
|
|
ca->sb.seq++;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:56 +08:00
|
|
|
if (ca->sb.version < version)
|
|
|
|
ca->sb.version = version;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
bio_init(bio, ca->sb_bv, 1);
|
|
|
|
bio_set_dev(bio, ca->bdev);
|
|
|
|
bio->bi_end_io = write_super_endio;
|
|
|
|
bio->bi_private = ca;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
closure_get(cl);
|
|
|
|
__write_super(&ca->sb, ca->sb_disk, bio);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-12-17 07:27:25 +08:00
|
|
|
closure_return_with_destructor(cl, bcache_write_super_unlock);
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* UUID io */
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
static void uuid_endio(struct bio *bio)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
struct closure *cl = bio->bi_private;
|
2013-12-17 07:27:25 +08:00
|
|
|
struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2017-06-03 15:38:06 +08:00
|
|
|
cache_set_err_on(bio->bi_status, c, "accessing uuids");
|
2013-03-24 07:11:31 +08:00
|
|
|
bch_bbio_free(bio, c);
|
|
|
|
closure_put(cl);
|
|
|
|
}
|
|
|
|
|
2013-12-17 07:27:25 +08:00
|
|
|
static void uuid_io_unlock(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
|
|
|
|
|
|
|
|
up(&c->uuid_write_mutex);
|
|
|
|
}
|
|
|
|
|
2016-06-06 03:32:05 +08:00
|
|
|
static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
|
2013-03-24 07:11:31 +08:00
|
|
|
struct bkey *k, struct closure *parent)
|
|
|
|
{
|
2013-12-17 07:27:25 +08:00
|
|
|
struct closure *cl = &c->uuid_write;
|
2013-03-24 07:11:31 +08:00
|
|
|
struct uuid_entry *u;
|
2018-08-11 13:19:44 +08:00
|
|
|
unsigned int i;
|
2013-05-15 11:33:16 +08:00
|
|
|
char buf[80];
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
BUG_ON(!parent);
|
2013-12-17 07:27:25 +08:00
|
|
|
down(&c->uuid_write_mutex);
|
|
|
|
closure_init(cl, parent);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
for (i = 0; i < KEY_PTRS(k); i++) {
|
|
|
|
struct bio *bio = bch_bbio_alloc(c);
|
|
|
|
|
2016-08-06 05:35:16 +08:00
|
|
|
bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
|
2013-10-12 06:44:27 +08:00
|
|
|
bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
bio->bi_end_io = uuid_endio;
|
|
|
|
bio->bi_private = cl;
|
2016-06-06 03:32:05 +08:00
|
|
|
bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
|
2013-03-29 02:50:55 +08:00
|
|
|
bch_bio_map(bio, c->uuids);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
bch_submit_bbio(bio, c, k, i);
|
|
|
|
|
2016-06-06 03:32:05 +08:00
|
|
|
if (op != REQ_OP_WRITE)
|
2013-03-24 07:11:31 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2013-12-18 15:47:33 +08:00
|
|
|
bch_extent_to_text(buf, sizeof(buf), k);
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_debug("%s UUIDs at %s\n", op == REQ_OP_WRITE ? "wrote" : "read", buf);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
|
2013-03-29 02:50:55 +08:00
|
|
|
if (!bch_is_zero(u->uuid, 16))
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u\n",
|
2013-03-24 07:11:31 +08:00
|
|
|
u - c->uuids, u->uuid, u->label,
|
|
|
|
u->first_reg, u->last_reg, u->invalidated);
|
|
|
|
|
2013-12-17 07:27:25 +08:00
|
|
|
closure_return_with_destructor(cl, uuid_io_unlock);
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
|
|
|
|
{
|
|
|
|
struct bkey *k = &j->uuid_bucket;
|
|
|
|
|
2013-12-21 09:22:05 +08:00
|
|
|
if (__bch_btree_ptr_invalid(c, k))
|
2013-03-24 07:11:31 +08:00
|
|
|
return "bad uuid pointer";
|
|
|
|
|
|
|
|
bkey_copy(&c->uuid_bucket, k);
|
2016-11-01 21:40:10 +08:00
|
|
|
uuid_io(c, REQ_OP_READ, 0, k, cl);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
|
|
|
|
struct uuid_entry_v0 *u0 = (void *) c->uuids;
|
|
|
|
struct uuid_entry *u1 = (void *) c->uuids;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
closure_sync(cl);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Since the new uuid entry is bigger than the old, we have to
|
|
|
|
* convert starting at the highest memory address and work down
|
|
|
|
* in order to do it in place
|
|
|
|
*/
|
|
|
|
|
|
|
|
for (i = c->nr_uuids - 1;
|
|
|
|
i >= 0;
|
|
|
|
--i) {
|
|
|
|
memcpy(u1[i].uuid, u0[i].uuid, 16);
|
|
|
|
memcpy(u1[i].label, u0[i].label, 32);
|
|
|
|
|
|
|
|
u1[i].first_reg = u0[i].first_reg;
|
|
|
|
u1[i].last_reg = u0[i].last_reg;
|
|
|
|
u1[i].invalidated = u0[i].invalidated;
|
|
|
|
|
|
|
|
u1[i].flags = 0;
|
|
|
|
u1[i].sectors = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __uuid_write(struct cache_set *c)
|
|
|
|
{
|
|
|
|
BKEY_PADDED(key) k;
|
|
|
|
struct closure cl;
|
2020-10-01 14:50:56 +08:00
|
|
|
struct cache *ca = c->cache;
|
2020-07-25 20:00:31 +08:00
|
|
|
unsigned int size;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2018-08-11 13:19:45 +08:00
|
|
|
closure_init_stack(&cl);
|
2013-03-24 07:11:31 +08:00
|
|
|
lockdep_assert_held(&bch_register_lock);
|
|
|
|
|
2020-10-01 14:50:45 +08:00
|
|
|
if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
|
2013-03-24 07:11:31 +08:00
|
|
|
return 1;
|
|
|
|
|
2020-10-01 14:50:56 +08:00
|
|
|
size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS;
|
2020-07-25 20:00:31 +08:00
|
|
|
SET_KEY_SIZE(&k.key, size);
|
2016-06-06 03:32:05 +08:00
|
|
|
uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_sync(&cl);
|
|
|
|
|
2018-10-08 20:41:07 +08:00
|
|
|
/* Only one bucket used for uuid write */
|
|
|
|
atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
bkey_copy(&c->uuid_bucket, &k.key);
|
2013-07-25 07:46:42 +08:00
|
|
|
bkey_put(c, &k.key);
|
2013-03-24 07:11:31 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int bch_uuid_write(struct cache_set *c)
|
|
|
|
{
|
|
|
|
int ret = __uuid_write(c);
|
|
|
|
|
|
|
|
if (!ret)
|
|
|
|
bch_journal_meta(c, NULL);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
|
|
|
|
{
|
|
|
|
struct uuid_entry *u;
|
|
|
|
|
|
|
|
for (u = c->uuids;
|
|
|
|
u < c->uuids + c->nr_uuids; u++)
|
|
|
|
if (!memcmp(u->uuid, uuid, 16))
|
|
|
|
return u;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
|
|
|
|
{
|
|
|
|
static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
return uuid_find(c, zero_uuid);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Bucket priorities/gens:
|
|
|
|
*
|
|
|
|
* For each bucket, we store on disk its
|
2018-08-11 13:19:55 +08:00
|
|
|
* 8 bit gen
|
|
|
|
* 16 bit priority
|
2013-03-24 07:11:31 +08:00
|
|
|
*
|
|
|
|
* See alloc.c for an explanation of the gen. The priority is used to implement
|
|
|
|
* lru (and in the future other) cache replacement policies; for most purposes
|
|
|
|
* it's just an opaque integer.
|
|
|
|
*
|
|
|
|
* The gens and the priorities don't have a whole lot to do with each other, and
|
|
|
|
* it's actually the gens that must be written out at specific times - it's no
|
|
|
|
* big deal if the priorities don't get written, if we lose them we just reuse
|
|
|
|
* buckets in suboptimal order.
|
|
|
|
*
|
|
|
|
* On disk they're stored in a packed array, and in as many buckets are required
|
|
|
|
* to fit them all. The buckets we use to store them form a list; the journal
|
|
|
|
* header points to the first bucket, the first bucket points to the second
|
|
|
|
* bucket, et cetera.
|
|
|
|
*
|
|
|
|
* This code is used by the allocation code; periodically (whenever it runs out
|
|
|
|
* of buckets to allocate from) the allocation code will invalidate some
|
|
|
|
* buckets, but it can't use those buckets until their new gens are safely on
|
|
|
|
* disk.
|
|
|
|
*/
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
static void prio_endio(struct bio *bio)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
struct cache *ca = bio->bi_private;
|
|
|
|
|
2017-06-03 15:38:06 +08:00
|
|
|
cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
|
2013-03-24 07:11:31 +08:00
|
|
|
bch_bbio_free(bio, ca->set);
|
|
|
|
closure_put(&ca->prio);
|
|
|
|
}
|
|
|
|
|
2016-06-06 03:32:05 +08:00
|
|
|
static void prio_io(struct cache *ca, uint64_t bucket, int op,
|
|
|
|
unsigned long op_flags)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
struct closure *cl = &ca->prio;
|
|
|
|
struct bio *bio = bch_bbio_alloc(ca->set);
|
|
|
|
|
|
|
|
closure_init_stack(cl);
|
|
|
|
|
2013-10-12 06:44:27 +08:00
|
|
|
bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
|
2017-08-24 01:10:32 +08:00
|
|
|
bio_set_dev(bio, ca->bdev);
|
2020-07-25 20:00:32 +08:00
|
|
|
bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
bio->bi_end_io = prio_endio;
|
|
|
|
bio->bi_private = ca;
|
2016-06-06 03:32:05 +08:00
|
|
|
bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
|
2013-03-29 02:50:55 +08:00
|
|
|
bch_bio_map(bio, ca->disk_buckets);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags
When too many I/Os failed on cache device, bch_cache_set_error() is called
in the error handling code path to retire whole problematic cache set. If
new I/O requests continue to come and take refcount dc->count, the cache
set won't be retired immediately, this is a problem.
Further more, there are several kernel thread and self-armed kernel work
may still running after bch_cache_set_error() is called. It needs to wait
quite a while for them to stop, or they won't stop at all. They also
prevent the cache set from being retired.
The solution in this patch is, to add per cache set flag to disable I/O
request on this cache and all attached backing devices. Then new coming I/O
requests can be rejected in *_make_request() before taking refcount, kernel
threads and self-armed kernel worker can stop very fast when flags bit
CACHE_SET_IO_DISABLE is set.
Because bcache also do internal I/Os for writeback, garbage collection,
bucket allocation, journaling, this kind of I/O should be disabled after
bch_cache_set_error() is called. So closure_bio_submit() is modified to
check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set,
closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and
return, generic_make_request() won't be called.
A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit
from cache_set->flags, to disable or enable cache set I/O for debugging. It
is helpful to trigger more corner case issues for failed cache device.
Changelog
v4, add wait_for_kthread_stop(), and call it before exits writeback and gc
kernel threads.
v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index.
remove "bcache: " prefix when printing out kernel message.
v2, more changes by previous review,
- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui.
- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this
is reported and inspired from origal patch of Pavel Vazharov.
v1, initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Pavel Vazharov <freakpv@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:17 +08:00
|
|
|
closure_bio_submit(ca->set, bio, &ca->prio);
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_sync(cl);
|
|
|
|
}
|
|
|
|
|
2019-11-13 16:03:21 +08:00
|
|
|
int bch_prio_write(struct cache *ca, bool wait)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
struct bucket *b;
|
|
|
|
struct closure cl;
|
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu\n",
|
2019-11-13 16:03:21 +08:00
|
|
|
fifo_used(&ca->free[RESERVE_PRIO]),
|
|
|
|
fifo_used(&ca->free[RESERVE_NONE]),
|
|
|
|
fifo_used(&ca->free_inc));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Pre-check if there are enough free buckets. In the non-blocking
|
|
|
|
* scenario it's better to fail early rather than starting to allocate
|
|
|
|
* buckets and do a cleanup later in case of failure.
|
|
|
|
*/
|
|
|
|
if (!wait) {
|
|
|
|
size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
|
|
|
|
fifo_used(&ca->free[RESERVE_NONE]);
|
|
|
|
if (prio_buckets(ca) > avail)
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_init_stack(&cl);
|
|
|
|
|
|
|
|
lockdep_assert_held(&ca->set->bucket_lock);
|
|
|
|
|
|
|
|
ca->disk_buckets->seq++;
|
|
|
|
|
|
|
|
atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
|
|
|
|
&ca->meta_sectors_written);
|
|
|
|
|
|
|
|
for (i = prio_buckets(ca) - 1; i >= 0; --i) {
|
|
|
|
long bucket;
|
|
|
|
struct prio_set *p = ca->disk_buckets;
|
2013-03-26 02:46:44 +08:00
|
|
|
struct bucket_disk *d = p->data;
|
|
|
|
struct bucket_disk *end = d + prios_per_bucket(ca);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
for (b = ca->buckets + i * prios_per_bucket(ca);
|
|
|
|
b < ca->buckets + ca->sb.nbuckets && d < end;
|
|
|
|
b++, d++) {
|
|
|
|
d->prio = cpu_to_le16(b->prio);
|
|
|
|
d->gen = b->gen;
|
|
|
|
}
|
|
|
|
|
|
|
|
p->next_bucket = ca->prio_buckets[i + 1];
|
2013-11-01 06:46:42 +08:00
|
|
|
p->magic = pset_magic(&ca->sb);
|
2020-07-25 20:00:32 +08:00
|
|
|
p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2019-11-13 16:03:21 +08:00
|
|
|
bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
|
2013-03-24 07:11:31 +08:00
|
|
|
BUG_ON(bucket == -1);
|
|
|
|
|
|
|
|
mutex_unlock(&ca->set->bucket_lock);
|
2016-06-06 03:32:05 +08:00
|
|
|
prio_io(ca, bucket, REQ_OP_WRITE, 0);
|
2013-03-24 07:11:31 +08:00
|
|
|
mutex_lock(&ca->set->bucket_lock);
|
|
|
|
|
|
|
|
ca->prio_buckets[i] = bucket;
|
|
|
|
atomic_dec_bug(&ca->buckets[bucket].pin);
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_unlock(&ca->set->bucket_lock);
|
|
|
|
|
|
|
|
bch_journal_meta(ca->set, &cl);
|
|
|
|
closure_sync(&cl);
|
|
|
|
|
|
|
|
mutex_lock(&ca->set->bucket_lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Don't want the old priorities to get garbage collected until after we
|
|
|
|
* finish writing the new ones, and they're journalled
|
|
|
|
*/
|
2014-03-18 07:55:55 +08:00
|
|
|
for (i = 0; i < prio_buckets(ca); i++) {
|
|
|
|
if (ca->prio_last_buckets[i])
|
|
|
|
__bch_bucket_free(ca,
|
|
|
|
&ca->buckets[ca->prio_last_buckets[i]]);
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
ca->prio_last_buckets[i] = ca->prio_buckets[i];
|
2014-03-18 07:55:55 +08:00
|
|
|
}
|
2019-11-13 16:03:21 +08:00
|
|
|
return 0;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
2020-02-01 22:42:35 +08:00
|
|
|
static int prio_read(struct cache *ca, uint64_t bucket)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
struct prio_set *p = ca->disk_buckets;
|
|
|
|
struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
|
|
|
|
struct bucket *b;
|
2018-08-11 13:19:44 +08:00
|
|
|
unsigned int bucket_nr = 0;
|
2020-02-01 22:42:35 +08:00
|
|
|
int ret = -EIO;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
for (b = ca->buckets;
|
|
|
|
b < ca->buckets + ca->sb.nbuckets;
|
|
|
|
b++, d++) {
|
|
|
|
if (d == end) {
|
|
|
|
ca->prio_buckets[bucket_nr] = bucket;
|
|
|
|
ca->prio_last_buckets[bucket_nr] = bucket;
|
|
|
|
bucket_nr++;
|
|
|
|
|
2016-11-01 21:40:10 +08:00
|
|
|
prio_io(ca, bucket, REQ_OP_READ, 0);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2018-08-11 13:19:47 +08:00
|
|
|
if (p->csum !=
|
2020-07-25 20:00:32 +08:00
|
|
|
bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("bad csum reading priorities\n");
|
2020-02-01 22:42:35 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-02-01 22:42:35 +08:00
|
|
|
if (p->magic != pset_magic(&ca->sb)) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("bad magic reading priorities\n");
|
2020-02-01 22:42:35 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
bucket = p->next_bucket;
|
|
|
|
d = p->data;
|
|
|
|
}
|
|
|
|
|
|
|
|
b->prio = le16_to_cpu(d->prio);
|
2014-02-28 09:51:12 +08:00
|
|
|
b->gen = b->last_gc = d->gen;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
2020-02-01 22:42:35 +08:00
|
|
|
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
|
|
return ret;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Bcache device */
|
|
|
|
|
|
|
|
static int open_dev(struct block_device *b, fmode_t mode)
|
|
|
|
{
|
|
|
|
struct bcache_device *d = b->bd_disk->private_data;
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2013-08-22 08:49:09 +08:00
|
|
|
if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
|
2013-03-24 07:11:31 +08:00
|
|
|
return -ENXIO;
|
|
|
|
|
|
|
|
closure_get(&d->cl);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-05-10 04:39:26 +08:00
|
|
|
static void release_dev(struct gendisk *b, fmode_t mode)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
struct bcache_device *d = b->private_data;
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_put(&d->cl);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int ioctl_dev(struct block_device *b, fmode_t mode,
|
|
|
|
unsigned int cmd, unsigned long arg)
|
|
|
|
{
|
|
|
|
struct bcache_device *d = b->bd_disk->private_data;
|
bcache: stop bcache device when backing device is offline
Currently bcache does not handle backing device failure, if backing
device is offline and disconnected from system, its bcache device can still
be accessible. If the bcache device is in writeback mode, I/O requests even
can success if the requests hit on cache device. That is to say, when and
how bcache handles offline backing device is undefined.
This patch tries to handle backing device offline in a rather simple way,
- Add cached_dev->status_update_thread kernel thread to update backing
device status in every 1 second.
- Add cached_dev->offline_seconds to record how many seconds the backing
device is observed to be offline. If the backing device is offline for
BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and
call bcache_device_stop() to stop the bache device which linked to the
offline backing device.
Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds,
its bcache device will be removed, then user space application writing on
it will get error immediately, and handler the device failure in time.
This patch is quite simple, does not handle more complicated situations.
Once the bcache device is stopped, users need to recovery the backing
device, register and attach it manually.
Changelog:
v3: call wait_for_kthread_stop() before exits kernel thread.
v2: remove "bcache: " prefix when calling pr_warn().
v1: initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-28 15:37:41 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
return d->ioctl(d, mode, cmd, arg);
|
|
|
|
}
|
|
|
|
|
2020-07-01 16:59:43 +08:00
|
|
|
static const struct block_device_operations bcache_cached_ops = {
|
|
|
|
.submit_bio = cached_dev_submit_bio,
|
|
|
|
.open = open_dev,
|
|
|
|
.release = release_dev,
|
|
|
|
.ioctl = ioctl_dev,
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
};
|
|
|
|
|
|
|
|
static const struct block_device_operations bcache_flash_ops = {
|
|
|
|
.submit_bio = flash_dev_submit_bio,
|
2013-03-24 07:11:31 +08:00
|
|
|
.open = open_dev,
|
|
|
|
.release = release_dev,
|
|
|
|
.ioctl = ioctl_dev,
|
|
|
|
.owner = THIS_MODULE,
|
|
|
|
};
|
|
|
|
|
|
|
|
void bcache_device_stop(struct bcache_device *d)
|
|
|
|
{
|
2013-08-22 08:49:09 +08:00
|
|
|
if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
|
2019-04-25 00:48:39 +08:00
|
|
|
/*
|
|
|
|
* closure_fn set to
|
|
|
|
* - cached device: cached_dev_flush()
|
|
|
|
* - flash dev: flash_dev_flush()
|
|
|
|
*/
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_queue(&d->cl);
|
|
|
|
}
|
|
|
|
|
2013-02-01 23:29:41 +08:00
|
|
|
static void bcache_device_unlink(struct bcache_device *d)
|
|
|
|
{
|
2013-08-22 08:49:09 +08:00
|
|
|
lockdep_assert_held(&bch_register_lock);
|
2013-02-01 23:29:41 +08:00
|
|
|
|
2013-08-22 08:49:09 +08:00
|
|
|
if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
|
2020-10-01 14:50:47 +08:00
|
|
|
struct cache *ca = d->c->cache;
|
2013-02-01 23:29:41 +08:00
|
|
|
|
2013-08-22 08:49:09 +08:00
|
|
|
sysfs_remove_link(&d->c->kobj, d->name);
|
|
|
|
sysfs_remove_link(&d->kobj, "cache");
|
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
bd_unlink_disk_holder(ca->bdev, d->disk);
|
2013-08-22 08:49:09 +08:00
|
|
|
}
|
2013-02-01 23:29:41 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
|
|
|
|
const char *name)
|
|
|
|
{
|
2020-10-01 14:50:47 +08:00
|
|
|
struct cache *ca = c->cache;
|
2019-06-28 19:59:37 +08:00
|
|
|
int ret;
|
2013-02-01 23:29:41 +08:00
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
bd_link_disk_holder(ca->bdev, d->disk);
|
2013-02-01 23:29:41 +08:00
|
|
|
|
|
|
|
snprintf(d->name, BCACHEDEVNAME_SIZE,
|
|
|
|
"%s%u", name, d->id);
|
|
|
|
|
2019-06-28 19:59:37 +08:00
|
|
|
ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
|
|
|
|
if (ret < 0)
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Couldn't create device -> cache set symlink\n");
|
2019-06-28 19:59:37 +08:00
|
|
|
|
|
|
|
ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
|
|
|
|
if (ret < 0)
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Couldn't create cache set -> device symlink\n");
|
2015-11-30 09:19:32 +08:00
|
|
|
|
|
|
|
clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
|
2013-02-01 23:29:41 +08:00
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
static void bcache_device_detach(struct bcache_device *d)
|
|
|
|
{
|
|
|
|
lockdep_assert_held(&bch_register_lock);
|
|
|
|
|
2018-08-09 15:48:49 +08:00
|
|
|
atomic_dec(&d->c->attached_dev_nr);
|
|
|
|
|
2013-08-22 08:49:09 +08:00
|
|
|
if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
|
2013-03-24 07:11:31 +08:00
|
|
|
struct uuid_entry *u = d->c->uuids + d->id;
|
|
|
|
|
|
|
|
SET_UUID_FLASH_ONLY(u, 0);
|
|
|
|
memcpy(u->uuid, invalid_uuid, 16);
|
2018-07-26 12:17:41 +08:00
|
|
|
u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
|
2013-03-24 07:11:31 +08:00
|
|
|
bch_uuid_write(d->c);
|
|
|
|
}
|
|
|
|
|
2013-08-22 08:49:09 +08:00
|
|
|
bcache_device_unlink(d);
|
2013-02-01 23:29:41 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
d->c->devices[d->id] = NULL;
|
|
|
|
closure_put(&d->c->caching);
|
|
|
|
d->c = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
|
2018-08-11 13:19:44 +08:00
|
|
|
unsigned int id)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
d->id = id;
|
|
|
|
d->c = c;
|
|
|
|
c->devices[id] = d;
|
|
|
|
|
2018-01-09 04:21:28 +08:00
|
|
|
if (id >= c->devices_max_used)
|
|
|
|
c->devices_max_used = id + 1;
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_get(&c->caching);
|
|
|
|
}
|
|
|
|
|
bcache: rewrite multiple partitions support
Current partition support of bcache is confusing and buggy. It tries to
trace non-continuous device minor numbers by an ida bit string, and
mistakenly mixed bcache device index with minor numbers. This design
generates several negative results,
- Index of bcache device name is not consecutive under /dev/. If there are
3 bcache devices, they name will be,
/dev/bcache0, /dev/bcache16, /dev/bcache32
Only bcache code indexes bcache device name is such an interesting way.
- First minor number of each bcache device is traced by ida bit string.
One bcache device will occupy 16 bits, this is not a good idea. Indeed
only one bit is enough.
- Because minor number and bcache device index are mixed, a device index
is allocated by ida_simple_get(), but an first minor number is sent into
ida_simple_remove() to release the device. It confused original author
too.
Root cause of the above errors is, bcache code should not handle device
minor numbers at all! A standard process to support multiple partitions in
Linux kernel is,
- Device driver provides major device number, and indexes multiple device
instances.
- Device driver does not allocat nor trace device minor number, only
provides a first minor number of a given device instance, and sets how
many minor numbers (paritions) the device instance may have.
All rested stuffs are handled by block layer code, most of the details can
be found from block/{genhd, partition-generic}.c files.
This patch re-writes multiple partitions support for bcache. It makes
whole things to be more clear, and uses ida bit string in a more efficeint
way.
- Ida bit string only traces bcache device index, not minor number. For a
bcache device with 128 partitions, only one bit in ida bit string is
enough.
- Device minor number and device index are separated in concept. Device
index is used for /dev node naming, and ida bit string trace. Minor
number is calculated from device index and only used to initialize
first_minor of a bcache device.
- It does not follow any standard for 16 partitions on a bcache device.
This patch sets 128 partitions on single bcache device at max, this is
the limitation from GPT (GUID Partition Table) and supported by fdisk.
Considering a typical device minor number is 20 bits width, each bcache
device may have 128 partitions (7 bits), there can be 8192 bcache devices
existing on system. For most common deployment for a single server in
now days, it should be enough.
[minor spelling fixes in commit message by Michael Lyle]
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Eric Wheeler <bcache@lists.ewheeler.net>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-14 07:35:31 +08:00
|
|
|
static inline int first_minor_to_idx(int first_minor)
|
|
|
|
{
|
|
|
|
return (first_minor/BCACHE_MINORS);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int idx_to_first_minor(int idx)
|
|
|
|
{
|
|
|
|
return (idx * BCACHE_MINORS);
|
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
static void bcache_device_free(struct bcache_device *d)
|
|
|
|
{
|
2019-11-13 16:03:17 +08:00
|
|
|
struct gendisk *disk = d->disk;
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
lockdep_assert_held(&bch_register_lock);
|
|
|
|
|
2019-11-13 16:03:17 +08:00
|
|
|
if (disk)
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("%s stopped\n", disk->disk_name);
|
2019-11-13 16:03:17 +08:00
|
|
|
else
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("bcache device (NULL gendisk) stopped\n");
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
if (d->c)
|
|
|
|
bcache_device_detach(d);
|
2019-11-13 16:03:17 +08:00
|
|
|
|
|
|
|
if (disk) {
|
bcache: fix refcount underflow in bcache_device_free()
The problematic code piece in bcache_device_free() is,
785 static void bcache_device_free(struct bcache_device *d)
786 {
787 struct gendisk *disk = d->disk;
[snipped]
799 if (disk) {
800 if (disk->flags & GENHD_FL_UP)
801 del_gendisk(disk);
802
803 if (disk->queue)
804 blk_cleanup_queue(disk->queue);
805
806 ida_simple_remove(&bcache_device_idx,
807 first_minor_to_idx(disk->first_minor));
808 put_disk(disk);
809 }
[snipped]
816 }
At line 808, put_disk(disk) may encounter kobject refcount of 'disk'
being underflow.
Here is how to reproduce the issue,
- Attche the backing device to a cache device and do random write to
make the cache being dirty.
- Stop the bcache device while the cache device has dirty data of the
backing device.
- Only register the backing device back, NOT register cache device.
- The bcache device node /dev/bcache0 won't show up, because backing
device waits for the cache device shows up for the missing dirty
data.
- Now echo 1 into /sys/fs/bcache/pendings_cleanup, to stop the pending
backing device.
- After the pending backing device stopped, use 'dmesg' to check kernel
message, a use-after-free warning from KASA reported the refcount of
kobject linked to the 'disk' is underflow.
The dropping refcount at line 808 in the above code piece is added by
add_disk(d->disk) in bch_cached_dev_run(). But in the above condition
the cache device is not registered, bch_cached_dev_run() has no chance
to be called and the refcount is not added. The put_disk() for a non-
added refcount of gendisk kobject triggers a underflow warning.
This patch checks whether GENHD_FL_UP is set in disk->flags, if it is
not set then the bcache device was not added, don't call put_disk()
and the the underflow issue can be avoided.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:53 +08:00
|
|
|
bool disk_added = (disk->flags & GENHD_FL_UP) != 0;
|
|
|
|
|
|
|
|
if (disk_added)
|
2019-11-13 16:03:17 +08:00
|
|
|
del_gendisk(disk);
|
|
|
|
|
|
|
|
if (disk->queue)
|
|
|
|
blk_cleanup_queue(disk->queue);
|
|
|
|
|
bcache: rewrite multiple partitions support
Current partition support of bcache is confusing and buggy. It tries to
trace non-continuous device minor numbers by an ida bit string, and
mistakenly mixed bcache device index with minor numbers. This design
generates several negative results,
- Index of bcache device name is not consecutive under /dev/. If there are
3 bcache devices, they name will be,
/dev/bcache0, /dev/bcache16, /dev/bcache32
Only bcache code indexes bcache device name is such an interesting way.
- First minor number of each bcache device is traced by ida bit string.
One bcache device will occupy 16 bits, this is not a good idea. Indeed
only one bit is enough.
- Because minor number and bcache device index are mixed, a device index
is allocated by ida_simple_get(), but an first minor number is sent into
ida_simple_remove() to release the device. It confused original author
too.
Root cause of the above errors is, bcache code should not handle device
minor numbers at all! A standard process to support multiple partitions in
Linux kernel is,
- Device driver provides major device number, and indexes multiple device
instances.
- Device driver does not allocat nor trace device minor number, only
provides a first minor number of a given device instance, and sets how
many minor numbers (paritions) the device instance may have.
All rested stuffs are handled by block layer code, most of the details can
be found from block/{genhd, partition-generic}.c files.
This patch re-writes multiple partitions support for bcache. It makes
whole things to be more clear, and uses ida bit string in a more efficeint
way.
- Ida bit string only traces bcache device index, not minor number. For a
bcache device with 128 partitions, only one bit in ida bit string is
enough.
- Device minor number and device index are separated in concept. Device
index is used for /dev node naming, and ida bit string trace. Minor
number is calculated from device index and only used to initialize
first_minor of a bcache device.
- It does not follow any standard for 16 partitions on a bcache device.
This patch sets 128 partitions on single bcache device at max, this is
the limitation from GPT (GUID Partition Table) and supported by fdisk.
Considering a typical device minor number is 20 bits width, each bcache
device may have 128 partitions (7 bits), there can be 8192 bcache devices
existing on system. For most common deployment for a single server in
now days, it should be enough.
[minor spelling fixes in commit message by Michael Lyle]
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Eric Wheeler <bcache@lists.ewheeler.net>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-14 07:35:31 +08:00
|
|
|
ida_simple_remove(&bcache_device_idx,
|
2019-11-13 16:03:17 +08:00
|
|
|
first_minor_to_idx(disk->first_minor));
|
bcache: fix refcount underflow in bcache_device_free()
The problematic code piece in bcache_device_free() is,
785 static void bcache_device_free(struct bcache_device *d)
786 {
787 struct gendisk *disk = d->disk;
[snipped]
799 if (disk) {
800 if (disk->flags & GENHD_FL_UP)
801 del_gendisk(disk);
802
803 if (disk->queue)
804 blk_cleanup_queue(disk->queue);
805
806 ida_simple_remove(&bcache_device_idx,
807 first_minor_to_idx(disk->first_minor));
808 put_disk(disk);
809 }
[snipped]
816 }
At line 808, put_disk(disk) may encounter kobject refcount of 'disk'
being underflow.
Here is how to reproduce the issue,
- Attche the backing device to a cache device and do random write to
make the cache being dirty.
- Stop the bcache device while the cache device has dirty data of the
backing device.
- Only register the backing device back, NOT register cache device.
- The bcache device node /dev/bcache0 won't show up, because backing
device waits for the cache device shows up for the missing dirty
data.
- Now echo 1 into /sys/fs/bcache/pendings_cleanup, to stop the pending
backing device.
- After the pending backing device stopped, use 'dmesg' to check kernel
message, a use-after-free warning from KASA reported the refcount of
kobject linked to the 'disk' is underflow.
The dropping refcount at line 808 in the above code piece is added by
add_disk(d->disk) in bch_cached_dev_run(). But in the above condition
the cache device is not registered, bch_cached_dev_run() has no chance
to be called and the refcount is not added. The put_disk() for a non-
added refcount of gendisk kobject triggers a underflow warning.
This patch checks whether GENHD_FL_UP is set in disk->flags, if it is
not set then the bcache device was not added, don't call put_disk()
and the the underflow issue can be avoided.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:53 +08:00
|
|
|
if (disk_added)
|
|
|
|
put_disk(disk);
|
2013-07-31 16:12:02 +08:00
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2018-05-21 06:25:51 +08:00
|
|
|
bioset_exit(&d->bio_split);
|
2015-07-01 05:59:30 +08:00
|
|
|
kvfree(d->full_dirty_stripes);
|
|
|
|
kvfree(d->stripe_sectors_dirty);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
closure_debug_destroy(&d->cl);
|
|
|
|
}
|
|
|
|
|
2018-08-11 13:19:44 +08:00
|
|
|
static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
|
2020-07-01 16:59:43 +08:00
|
|
|
sector_t sectors, struct block_device *cached_bdev,
|
|
|
|
const struct block_device_operations *ops)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
struct request_queue *q;
|
2018-03-19 08:36:33 +08:00
|
|
|
const size_t max_stripes = min_t(size_t, INT_MAX,
|
|
|
|
SIZE_MAX / sizeof(atomic_t));
|
2020-07-25 20:00:21 +08:00
|
|
|
uint64_t n;
|
bcache: rewrite multiple partitions support
Current partition support of bcache is confusing and buggy. It tries to
trace non-continuous device minor numbers by an ida bit string, and
mistakenly mixed bcache device index with minor numbers. This design
generates several negative results,
- Index of bcache device name is not consecutive under /dev/. If there are
3 bcache devices, they name will be,
/dev/bcache0, /dev/bcache16, /dev/bcache32
Only bcache code indexes bcache device name is such an interesting way.
- First minor number of each bcache device is traced by ida bit string.
One bcache device will occupy 16 bits, this is not a good idea. Indeed
only one bit is enough.
- Because minor number and bcache device index are mixed, a device index
is allocated by ida_simple_get(), but an first minor number is sent into
ida_simple_remove() to release the device. It confused original author
too.
Root cause of the above errors is, bcache code should not handle device
minor numbers at all! A standard process to support multiple partitions in
Linux kernel is,
- Device driver provides major device number, and indexes multiple device
instances.
- Device driver does not allocat nor trace device minor number, only
provides a first minor number of a given device instance, and sets how
many minor numbers (paritions) the device instance may have.
All rested stuffs are handled by block layer code, most of the details can
be found from block/{genhd, partition-generic}.c files.
This patch re-writes multiple partitions support for bcache. It makes
whole things to be more clear, and uses ida bit string in a more efficeint
way.
- Ida bit string only traces bcache device index, not minor number. For a
bcache device with 128 partitions, only one bit in ida bit string is
enough.
- Device minor number and device index are separated in concept. Device
index is used for /dev node naming, and ida bit string trace. Minor
number is calculated from device index and only used to initialize
first_minor of a bcache device.
- It does not follow any standard for 16 partitions on a bcache device.
This patch sets 128 partitions on single bcache device at max, this is
the limitation from GPT (GUID Partition Table) and supported by fdisk.
Considering a typical device minor number is 20 bits width, each bcache
device may have 128 partitions (7 bits), there can be 8192 bcache devices
existing on system. For most common deployment for a single server in
now days, it should be enough.
[minor spelling fixes in commit message by Michael Lyle]
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Eric Wheeler <bcache@lists.ewheeler.net>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-14 07:35:31 +08:00
|
|
|
int idx;
|
2013-06-05 21:21:07 +08:00
|
|
|
|
2013-08-17 17:13:15 +08:00
|
|
|
if (!d->stripe_size)
|
|
|
|
d->stripe_size = 1 << 31;
|
2013-06-05 21:21:07 +08:00
|
|
|
|
2020-07-25 20:00:21 +08:00
|
|
|
n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
|
|
|
|
if (!n || n > max_stripes) {
|
|
|
|
pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
|
|
|
|
n);
|
2013-06-05 21:21:07 +08:00
|
|
|
return -ENOMEM;
|
2013-11-01 06:43:22 +08:00
|
|
|
}
|
2020-07-25 20:00:21 +08:00
|
|
|
d->nr_stripes = n;
|
2013-06-05 21:21:07 +08:00
|
|
|
|
|
|
|
n = d->nr_stripes * sizeof(atomic_t);
|
2017-05-09 06:57:37 +08:00
|
|
|
d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
|
2013-06-05 21:21:07 +08:00
|
|
|
if (!d->stripe_sectors_dirty)
|
|
|
|
return -ENOMEM;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-11-01 06:43:22 +08:00
|
|
|
n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
|
2017-05-09 06:57:37 +08:00
|
|
|
d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
|
2013-11-01 06:43:22 +08:00
|
|
|
if (!d->full_dirty_stripes)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
bcache: rewrite multiple partitions support
Current partition support of bcache is confusing and buggy. It tries to
trace non-continuous device minor numbers by an ida bit string, and
mistakenly mixed bcache device index with minor numbers. This design
generates several negative results,
- Index of bcache device name is not consecutive under /dev/. If there are
3 bcache devices, they name will be,
/dev/bcache0, /dev/bcache16, /dev/bcache32
Only bcache code indexes bcache device name is such an interesting way.
- First minor number of each bcache device is traced by ida bit string.
One bcache device will occupy 16 bits, this is not a good idea. Indeed
only one bit is enough.
- Because minor number and bcache device index are mixed, a device index
is allocated by ida_simple_get(), but an first minor number is sent into
ida_simple_remove() to release the device. It confused original author
too.
Root cause of the above errors is, bcache code should not handle device
minor numbers at all! A standard process to support multiple partitions in
Linux kernel is,
- Device driver provides major device number, and indexes multiple device
instances.
- Device driver does not allocat nor trace device minor number, only
provides a first minor number of a given device instance, and sets how
many minor numbers (paritions) the device instance may have.
All rested stuffs are handled by block layer code, most of the details can
be found from block/{genhd, partition-generic}.c files.
This patch re-writes multiple partitions support for bcache. It makes
whole things to be more clear, and uses ida bit string in a more efficeint
way.
- Ida bit string only traces bcache device index, not minor number. For a
bcache device with 128 partitions, only one bit in ida bit string is
enough.
- Device minor number and device index are separated in concept. Device
index is used for /dev node naming, and ida bit string trace. Minor
number is calculated from device index and only used to initialize
first_minor of a bcache device.
- It does not follow any standard for 16 partitions on a bcache device.
This patch sets 128 partitions on single bcache device at max, this is
the limitation from GPT (GUID Partition Table) and supported by fdisk.
Considering a typical device minor number is 20 bits width, each bcache
device may have 128 partitions (7 bits), there can be 8192 bcache devices
existing on system. For most common deployment for a single server in
now days, it should be enough.
[minor spelling fixes in commit message by Michael Lyle]
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Eric Wheeler <bcache@lists.ewheeler.net>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-14 07:35:31 +08:00
|
|
|
idx = ida_simple_get(&bcache_device_idx, 0,
|
|
|
|
BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
|
|
|
|
if (idx < 0)
|
|
|
|
return idx;
|
2016-10-24 09:19:20 +08:00
|
|
|
|
2018-05-21 06:25:51 +08:00
|
|
|
if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
|
2018-07-26 12:17:40 +08:00
|
|
|
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
d->disk = alloc_disk(BCACHE_MINORS);
|
|
|
|
if (!d->disk)
|
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-06-05 21:21:07 +08:00
|
|
|
set_capacity(d->disk, sectors);
|
bcache: rewrite multiple partitions support
Current partition support of bcache is confusing and buggy. It tries to
trace non-continuous device minor numbers by an ida bit string, and
mistakenly mixed bcache device index with minor numbers. This design
generates several negative results,
- Index of bcache device name is not consecutive under /dev/. If there are
3 bcache devices, they name will be,
/dev/bcache0, /dev/bcache16, /dev/bcache32
Only bcache code indexes bcache device name is such an interesting way.
- First minor number of each bcache device is traced by ida bit string.
One bcache device will occupy 16 bits, this is not a good idea. Indeed
only one bit is enough.
- Because minor number and bcache device index are mixed, a device index
is allocated by ida_simple_get(), but an first minor number is sent into
ida_simple_remove() to release the device. It confused original author
too.
Root cause of the above errors is, bcache code should not handle device
minor numbers at all! A standard process to support multiple partitions in
Linux kernel is,
- Device driver provides major device number, and indexes multiple device
instances.
- Device driver does not allocat nor trace device minor number, only
provides a first minor number of a given device instance, and sets how
many minor numbers (paritions) the device instance may have.
All rested stuffs are handled by block layer code, most of the details can
be found from block/{genhd, partition-generic}.c files.
This patch re-writes multiple partitions support for bcache. It makes
whole things to be more clear, and uses ida bit string in a more efficeint
way.
- Ida bit string only traces bcache device index, not minor number. For a
bcache device with 128 partitions, only one bit in ida bit string is
enough.
- Device minor number and device index are separated in concept. Device
index is used for /dev node naming, and ida bit string trace. Minor
number is calculated from device index and only used to initialize
first_minor of a bcache device.
- It does not follow any standard for 16 partitions on a bcache device.
This patch sets 128 partitions on single bcache device at max, this is
the limitation from GPT (GUID Partition Table) and supported by fdisk.
Considering a typical device minor number is 20 bits width, each bcache
device may have 128 partitions (7 bits), there can be 8192 bcache devices
existing on system. For most common deployment for a single server in
now days, it should be enough.
[minor spelling fixes in commit message by Michael Lyle]
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Eric Wheeler <bcache@lists.ewheeler.net>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-14 07:35:31 +08:00
|
|
|
snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
d->disk->major = bcache_major;
|
bcache: rewrite multiple partitions support
Current partition support of bcache is confusing and buggy. It tries to
trace non-continuous device minor numbers by an ida bit string, and
mistakenly mixed bcache device index with minor numbers. This design
generates several negative results,
- Index of bcache device name is not consecutive under /dev/. If there are
3 bcache devices, they name will be,
/dev/bcache0, /dev/bcache16, /dev/bcache32
Only bcache code indexes bcache device name is such an interesting way.
- First minor number of each bcache device is traced by ida bit string.
One bcache device will occupy 16 bits, this is not a good idea. Indeed
only one bit is enough.
- Because minor number and bcache device index are mixed, a device index
is allocated by ida_simple_get(), but an first minor number is sent into
ida_simple_remove() to release the device. It confused original author
too.
Root cause of the above errors is, bcache code should not handle device
minor numbers at all! A standard process to support multiple partitions in
Linux kernel is,
- Device driver provides major device number, and indexes multiple device
instances.
- Device driver does not allocat nor trace device minor number, only
provides a first minor number of a given device instance, and sets how
many minor numbers (paritions) the device instance may have.
All rested stuffs are handled by block layer code, most of the details can
be found from block/{genhd, partition-generic}.c files.
This patch re-writes multiple partitions support for bcache. It makes
whole things to be more clear, and uses ida bit string in a more efficeint
way.
- Ida bit string only traces bcache device index, not minor number. For a
bcache device with 128 partitions, only one bit in ida bit string is
enough.
- Device minor number and device index are separated in concept. Device
index is used for /dev node naming, and ida bit string trace. Minor
number is calculated from device index and only used to initialize
first_minor of a bcache device.
- It does not follow any standard for 16 partitions on a bcache device.
This patch sets 128 partitions on single bcache device at max, this is
the limitation from GPT (GUID Partition Table) and supported by fdisk.
Considering a typical device minor number is 20 bits width, each bcache
device may have 128 partitions (7 bits), there can be 8192 bcache devices
existing on system. For most common deployment for a single server in
now days, it should be enough.
[minor spelling fixes in commit message by Michael Lyle]
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Eric Wheeler <bcache@lists.ewheeler.net>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2017-10-14 07:35:31 +08:00
|
|
|
d->disk->first_minor = idx_to_first_minor(idx);
|
2020-07-01 16:59:43 +08:00
|
|
|
d->disk->fops = ops;
|
2013-03-24 07:11:31 +08:00
|
|
|
d->disk->private_data = d;
|
|
|
|
|
2020-07-01 16:59:43 +08:00
|
|
|
q = blk_alloc_queue(NUMA_NO_NODE);
|
2013-07-31 16:12:02 +08:00
|
|
|
if (!q)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
d->disk->queue = q;
|
|
|
|
q->limits.max_hw_sectors = UINT_MAX;
|
|
|
|
q->limits.max_sectors = UINT_MAX;
|
|
|
|
q->limits.max_segment_size = UINT_MAX;
|
|
|
|
q->limits.max_segments = BIO_MAX_PAGES;
|
2015-07-14 22:15:12 +08:00
|
|
|
blk_queue_max_discard_sectors(q, UINT_MAX);
|
2014-02-11 09:26:40 +08:00
|
|
|
q->limits.discard_granularity = 512;
|
2013-03-24 07:11:31 +08:00
|
|
|
q->limits.io_min = block_size;
|
|
|
|
q->limits.logical_block_size = block_size;
|
|
|
|
q->limits.physical_block_size = block_size;
|
bcache: check and adjust logical block size for backing devices
It's possible for a block driver to set logical block size to
a value greater than page size incorrectly; e.g. bcache takes
the value from the superblock, set by the user w/ make-bcache.
This causes a BUG/NULL pointer dereference in the path:
__blkdev_get()
-> set_init_blocksize() // set i_blkbits based on ...
-> bdev_logical_block_size()
-> queue_logical_block_size() // ... this value
-> bdev_disk_changed()
...
-> blkdev_readpage()
-> block_read_full_page()
-> create_page_buffers() // size = 1 << i_blkbits
-> create_empty_buffers() // give size/take pointer
-> alloc_page_buffers() // return NULL
.. BUG!
Because alloc_page_buffers() is called with size > PAGE_SIZE,
thus it initializes head = NULL, skips the loop, return head;
then create_empty_buffers() gets (and uses) the NULL pointer.
This has been around longer than commit ad6bf88a6c19 ("block:
fix an integer overflow in logical block size"); however, it
increased the range of values that can trigger the issue.
Previously only 8k/16k/32k (on x86/4k page size) would do it,
as greater values overflow unsigned short to zero, and queue_
logical_block_size() would then use the default of 512.
Now the range with unsigned int is much larger, and users w/
the 512k value, which happened to be zero'ed previously and
work fine, started to hit this issue -- as the zero is gone,
and queue_logical_block_size() does return 512k (>PAGE_SIZE.)
Fix this by checking the bcache device's logical block size,
and if it's greater than page size, fallback to the backing/
cached device's logical page size.
This doesn't affect cache devices as those are still checked
for block/page size in read_super(); only the backing/cached
devices are not.
Apparently it's a regression from commit 2903381fce71 ("bcache:
Take data offset from the bdev superblock."), moving the check
into BCACHE_SB_VERSION_CDEV only. Now that we have superblocks
of backing devices out there with this larger value, we cannot
refuse to load them (i.e., have a similar check in _BDEV.)
Ideally perhaps bcache should use all values from the backing
device (physical/logical/io_min block size)? But for now just
fix the problematic case.
Test-case:
# IMG=/root/disk.img
# dd if=/dev/zero of=$IMG bs=1 count=0 seek=1G
# DEV=$(losetup --find --show $IMG)
# make-bcache --bdev $DEV --block 8k
< see dmesg >
Before:
# uname -r
5.7.0-rc7
[ 55.944046] BUG: kernel NULL pointer dereference, address: 0000000000000000
...
[ 55.949742] CPU: 3 PID: 610 Comm: bcache-register Not tainted 5.7.0-rc7 #4
...
[ 55.952281] RIP: 0010:create_empty_buffers+0x1a/0x100
...
[ 55.966434] Call Trace:
[ 55.967021] create_page_buffers+0x48/0x50
[ 55.967834] block_read_full_page+0x49/0x380
[ 55.972181] do_read_cache_page+0x494/0x610
[ 55.974780] read_part_sector+0x2d/0xaa
[ 55.975558] read_lba+0x10e/0x1e0
[ 55.977904] efi_partition+0x120/0x5a6
[ 55.980227] blk_add_partitions+0x161/0x390
[ 55.982177] bdev_disk_changed+0x61/0xd0
[ 55.982961] __blkdev_get+0x350/0x490
[ 55.983715] __device_add_disk+0x318/0x480
[ 55.984539] bch_cached_dev_run+0xc5/0x270
[ 55.986010] register_bcache.cold+0x122/0x179
[ 55.987628] kernfs_fop_write+0xbc/0x1a0
[ 55.988416] vfs_write+0xb1/0x1a0
[ 55.989134] ksys_write+0x5a/0xd0
[ 55.989825] do_syscall_64+0x43/0x140
[ 55.990563] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 55.991519] RIP: 0033:0x7f7d60ba3154
...
After:
# uname -r
5.7.0.bcachelbspgsz
[ 31.672460] bcache: bcache_device_init() bcache0: sb/logical block size (8192) greater than page size (4096) falling back to device logical block size (512)
[ 31.675133] bcache: register_bdev() registered backing device loop0
# grep ^ /sys/block/bcache0/queue/*_block_size
/sys/block/bcache0/queue/logical_block_size:512
/sys/block/bcache0/queue/physical_block_size:8192
Reported-by: Ryan Finnie <ryan@finnie.org>
Reported-by: Sebastian Marsching <sebastian@marsching.com>
Signed-off-by: Mauricio Faria de Oliveira <mfo@canonical.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-06-15 00:53:31 +08:00
|
|
|
|
|
|
|
if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
|
|
|
|
/*
|
|
|
|
* This should only happen with BCACHE_SB_VERSION_BDEV.
|
|
|
|
* Block/page size is checked for BCACHE_SB_VERSION_CDEV.
|
|
|
|
*/
|
2020-06-15 00:53:33 +08:00
|
|
|
pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
|
bcache: check and adjust logical block size for backing devices
It's possible for a block driver to set logical block size to
a value greater than page size incorrectly; e.g. bcache takes
the value from the superblock, set by the user w/ make-bcache.
This causes a BUG/NULL pointer dereference in the path:
__blkdev_get()
-> set_init_blocksize() // set i_blkbits based on ...
-> bdev_logical_block_size()
-> queue_logical_block_size() // ... this value
-> bdev_disk_changed()
...
-> blkdev_readpage()
-> block_read_full_page()
-> create_page_buffers() // size = 1 << i_blkbits
-> create_empty_buffers() // give size/take pointer
-> alloc_page_buffers() // return NULL
.. BUG!
Because alloc_page_buffers() is called with size > PAGE_SIZE,
thus it initializes head = NULL, skips the loop, return head;
then create_empty_buffers() gets (and uses) the NULL pointer.
This has been around longer than commit ad6bf88a6c19 ("block:
fix an integer overflow in logical block size"); however, it
increased the range of values that can trigger the issue.
Previously only 8k/16k/32k (on x86/4k page size) would do it,
as greater values overflow unsigned short to zero, and queue_
logical_block_size() would then use the default of 512.
Now the range with unsigned int is much larger, and users w/
the 512k value, which happened to be zero'ed previously and
work fine, started to hit this issue -- as the zero is gone,
and queue_logical_block_size() does return 512k (>PAGE_SIZE.)
Fix this by checking the bcache device's logical block size,
and if it's greater than page size, fallback to the backing/
cached device's logical page size.
This doesn't affect cache devices as those are still checked
for block/page size in read_super(); only the backing/cached
devices are not.
Apparently it's a regression from commit 2903381fce71 ("bcache:
Take data offset from the bdev superblock."), moving the check
into BCACHE_SB_VERSION_CDEV only. Now that we have superblocks
of backing devices out there with this larger value, we cannot
refuse to load them (i.e., have a similar check in _BDEV.)
Ideally perhaps bcache should use all values from the backing
device (physical/logical/io_min block size)? But for now just
fix the problematic case.
Test-case:
# IMG=/root/disk.img
# dd if=/dev/zero of=$IMG bs=1 count=0 seek=1G
# DEV=$(losetup --find --show $IMG)
# make-bcache --bdev $DEV --block 8k
< see dmesg >
Before:
# uname -r
5.7.0-rc7
[ 55.944046] BUG: kernel NULL pointer dereference, address: 0000000000000000
...
[ 55.949742] CPU: 3 PID: 610 Comm: bcache-register Not tainted 5.7.0-rc7 #4
...
[ 55.952281] RIP: 0010:create_empty_buffers+0x1a/0x100
...
[ 55.966434] Call Trace:
[ 55.967021] create_page_buffers+0x48/0x50
[ 55.967834] block_read_full_page+0x49/0x380
[ 55.972181] do_read_cache_page+0x494/0x610
[ 55.974780] read_part_sector+0x2d/0xaa
[ 55.975558] read_lba+0x10e/0x1e0
[ 55.977904] efi_partition+0x120/0x5a6
[ 55.980227] blk_add_partitions+0x161/0x390
[ 55.982177] bdev_disk_changed+0x61/0xd0
[ 55.982961] __blkdev_get+0x350/0x490
[ 55.983715] __device_add_disk+0x318/0x480
[ 55.984539] bch_cached_dev_run+0xc5/0x270
[ 55.986010] register_bcache.cold+0x122/0x179
[ 55.987628] kernfs_fop_write+0xbc/0x1a0
[ 55.988416] vfs_write+0xb1/0x1a0
[ 55.989134] ksys_write+0x5a/0xd0
[ 55.989825] do_syscall_64+0x43/0x140
[ 55.990563] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 55.991519] RIP: 0033:0x7f7d60ba3154
...
After:
# uname -r
5.7.0.bcachelbspgsz
[ 31.672460] bcache: bcache_device_init() bcache0: sb/logical block size (8192) greater than page size (4096) falling back to device logical block size (512)
[ 31.675133] bcache: register_bdev() registered backing device loop0
# grep ^ /sys/block/bcache0/queue/*_block_size
/sys/block/bcache0/queue/logical_block_size:512
/sys/block/bcache0/queue/physical_block_size:8192
Reported-by: Ryan Finnie <ryan@finnie.org>
Reported-by: Sebastian Marsching <sebastian@marsching.com>
Signed-off-by: Mauricio Faria de Oliveira <mfo@canonical.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-06-15 00:53:31 +08:00
|
|
|
d->disk->disk_name, q->limits.logical_block_size,
|
|
|
|
PAGE_SIZE, bdev_logical_block_size(cached_bdev));
|
|
|
|
|
|
|
|
/* This also adjusts physical block size/min io size if needed */
|
|
|
|
blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
|
|
|
|
}
|
|
|
|
|
2018-03-08 09:10:07 +08:00
|
|
|
blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
|
|
|
|
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
|
|
|
|
blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2016-03-31 00:13:22 +08:00
|
|
|
blk_queue_write_cache(q, true, true);
|
2013-07-11 09:44:40 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
return 0;
|
2018-07-26 12:17:40 +08:00
|
|
|
|
|
|
|
err:
|
|
|
|
ida_simple_remove(&bcache_device_idx, idx);
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Cached device */
|
|
|
|
|
|
|
|
static void calc_cached_dev_sectors(struct cache_set *c)
|
|
|
|
{
|
|
|
|
uint64_t sectors = 0;
|
|
|
|
struct cached_dev *dc;
|
|
|
|
|
|
|
|
list_for_each_entry(dc, &c->cached_devs, list)
|
|
|
|
sectors += bdev_sectors(dc->bdev);
|
|
|
|
|
|
|
|
c->cached_dev_sectors = sectors;
|
|
|
|
}
|
|
|
|
|
bcache: stop bcache device when backing device is offline
Currently bcache does not handle backing device failure, if backing
device is offline and disconnected from system, its bcache device can still
be accessible. If the bcache device is in writeback mode, I/O requests even
can success if the requests hit on cache device. That is to say, when and
how bcache handles offline backing device is undefined.
This patch tries to handle backing device offline in a rather simple way,
- Add cached_dev->status_update_thread kernel thread to update backing
device status in every 1 second.
- Add cached_dev->offline_seconds to record how many seconds the backing
device is observed to be offline. If the backing device is offline for
BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and
call bcache_device_stop() to stop the bache device which linked to the
offline backing device.
Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds,
its bcache device will be removed, then user space application writing on
it will get error immediately, and handler the device failure in time.
This patch is quite simple, does not handle more complicated situations.
Once the bcache device is stopped, users need to recovery the backing
device, register and attach it manually.
Changelog:
v3: call wait_for_kthread_stop() before exits kernel thread.
v2: remove "bcache: " prefix when calling pr_warn().
v1: initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-28 15:37:41 +08:00
|
|
|
#define BACKING_DEV_OFFLINE_TIMEOUT 5
|
|
|
|
static int cached_dev_status_update(void *arg)
|
|
|
|
{
|
|
|
|
struct cached_dev *dc = arg;
|
|
|
|
struct request_queue *q;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this delayed worker is stopping outside, directly quit here.
|
|
|
|
* dc->io_disable might be set via sysfs interface, so check it
|
|
|
|
* here too.
|
|
|
|
*/
|
|
|
|
while (!kthread_should_stop() && !dc->io_disable) {
|
|
|
|
q = bdev_get_queue(dc->bdev);
|
|
|
|
if (blk_queue_dying(q))
|
|
|
|
dc->offline_seconds++;
|
|
|
|
else
|
|
|
|
dc->offline_seconds = 0;
|
|
|
|
|
|
|
|
if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("%s: device offline for %d seconds\n",
|
bcache: stop bcache device when backing device is offline
Currently bcache does not handle backing device failure, if backing
device is offline and disconnected from system, its bcache device can still
be accessible. If the bcache device is in writeback mode, I/O requests even
can success if the requests hit on cache device. That is to say, when and
how bcache handles offline backing device is undefined.
This patch tries to handle backing device offline in a rather simple way,
- Add cached_dev->status_update_thread kernel thread to update backing
device status in every 1 second.
- Add cached_dev->offline_seconds to record how many seconds the backing
device is observed to be offline. If the backing device is offline for
BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and
call bcache_device_stop() to stop the bache device which linked to the
offline backing device.
Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds,
its bcache device will be removed, then user space application writing on
it will get error immediately, and handler the device failure in time.
This patch is quite simple, does not handle more complicated situations.
Once the bcache device is stopped, users need to recovery the backing
device, register and attach it manually.
Changelog:
v3: call wait_for_kthread_stop() before exits kernel thread.
v2: remove "bcache: " prefix when calling pr_warn().
v1: initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-28 15:37:41 +08:00
|
|
|
dc->backing_dev_name,
|
|
|
|
BACKING_DEV_OFFLINE_TIMEOUT);
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("%s: disable I/O request due to backing device offline\n",
|
|
|
|
dc->disk.name);
|
bcache: stop bcache device when backing device is offline
Currently bcache does not handle backing device failure, if backing
device is offline and disconnected from system, its bcache device can still
be accessible. If the bcache device is in writeback mode, I/O requests even
can success if the requests hit on cache device. That is to say, when and
how bcache handles offline backing device is undefined.
This patch tries to handle backing device offline in a rather simple way,
- Add cached_dev->status_update_thread kernel thread to update backing
device status in every 1 second.
- Add cached_dev->offline_seconds to record how many seconds the backing
device is observed to be offline. If the backing device is offline for
BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and
call bcache_device_stop() to stop the bache device which linked to the
offline backing device.
Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds,
its bcache device will be removed, then user space application writing on
it will get error immediately, and handler the device failure in time.
This patch is quite simple, does not handle more complicated situations.
Once the bcache device is stopped, users need to recovery the backing
device, register and attach it manually.
Changelog:
v3: call wait_for_kthread_stop() before exits kernel thread.
v2: remove "bcache: " prefix when calling pr_warn().
v1: initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-28 15:37:41 +08:00
|
|
|
dc->io_disable = true;
|
|
|
|
/* let others know earlier that io_disable is true */
|
|
|
|
smp_mb();
|
|
|
|
bcache_device_stop(&dc->disk);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
schedule_timeout_interruptible(HZ);
|
|
|
|
}
|
|
|
|
|
|
|
|
wait_for_kthread_stop();
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2019-06-28 19:59:33 +08:00
|
|
|
int bch_cached_dev_run(struct cached_dev *dc)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
struct bcache_device *d = &dc->disk;
|
2019-04-25 00:48:29 +08:00
|
|
|
char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
|
2013-06-08 05:27:01 +08:00
|
|
|
char *env[] = {
|
|
|
|
"DRIVER=bcache",
|
|
|
|
kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
|
2019-04-25 00:48:29 +08:00
|
|
|
kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
|
2013-06-09 06:54:48 +08:00
|
|
|
NULL,
|
2013-06-08 05:27:01 +08:00
|
|
|
};
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2019-06-28 19:59:39 +08:00
|
|
|
if (dc->io_disable) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("I/O disabled on cached dev %s\n",
|
2019-06-28 19:59:39 +08:00
|
|
|
dc->backing_dev_name);
|
2019-07-22 22:12:36 +08:00
|
|
|
kfree(env[1]);
|
|
|
|
kfree(env[2]);
|
|
|
|
kfree(buf);
|
2019-06-28 19:59:33 +08:00
|
|
|
return -EIO;
|
2019-06-28 19:59:39 +08:00
|
|
|
}
|
2019-06-28 19:59:33 +08:00
|
|
|
|
2015-11-30 09:20:59 +08:00
|
|
|
if (atomic_xchg(&dc->running, 1)) {
|
|
|
|
kfree(env[1]);
|
|
|
|
kfree(env[2]);
|
2019-04-25 00:48:29 +08:00
|
|
|
kfree(buf);
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("cached dev %s is running already\n",
|
2019-06-28 19:59:39 +08:00
|
|
|
dc->backing_dev_name);
|
2019-06-28 19:59:33 +08:00
|
|
|
return -EBUSY;
|
2015-11-30 09:20:59 +08:00
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
if (!d->c &&
|
|
|
|
BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
|
|
|
|
struct closure cl;
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_init_stack(&cl);
|
|
|
|
|
|
|
|
SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
|
|
|
|
bch_write_bdev_super(dc, &cl);
|
|
|
|
closure_sync(&cl);
|
|
|
|
}
|
|
|
|
|
|
|
|
add_disk(d->disk);
|
2013-02-01 23:29:41 +08:00
|
|
|
bd_link_disk_holder(dc->bdev, dc->disk.disk);
|
2018-08-11 13:19:55 +08:00
|
|
|
/*
|
|
|
|
* won't show up in the uevent file, use udevadm monitor -e instead
|
|
|
|
* only class / kset properties are persistent
|
|
|
|
*/
|
2013-03-24 07:11:31 +08:00
|
|
|
kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
|
2013-06-08 05:27:01 +08:00
|
|
|
kfree(env[1]);
|
2013-06-09 06:54:48 +08:00
|
|
|
kfree(env[2]);
|
2019-04-25 00:48:29 +08:00
|
|
|
kfree(buf);
|
2013-06-08 05:27:01 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
|
2019-06-28 19:59:33 +08:00
|
|
|
sysfs_create_link(&disk_to_dev(d->disk)->kobj,
|
|
|
|
&d->kobj, "bcache")) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
|
2019-06-28 19:59:33 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
bcache: stop bcache device when backing device is offline
Currently bcache does not handle backing device failure, if backing
device is offline and disconnected from system, its bcache device can still
be accessible. If the bcache device is in writeback mode, I/O requests even
can success if the requests hit on cache device. That is to say, when and
how bcache handles offline backing device is undefined.
This patch tries to handle backing device offline in a rather simple way,
- Add cached_dev->status_update_thread kernel thread to update backing
device status in every 1 second.
- Add cached_dev->offline_seconds to record how many seconds the backing
device is observed to be offline. If the backing device is offline for
BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and
call bcache_device_stop() to stop the bache device which linked to the
offline backing device.
Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds,
its bcache device will be removed, then user space application writing on
it will get error immediately, and handler the device failure in time.
This patch is quite simple, does not handle more complicated situations.
Once the bcache device is stopped, users need to recovery the backing
device, register and attach it manually.
Changelog:
v3: call wait_for_kthread_stop() before exits kernel thread.
v2: remove "bcache: " prefix when calling pr_warn().
v1: initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-28 15:37:41 +08:00
|
|
|
|
|
|
|
dc->status_update_thread = kthread_run(cached_dev_status_update,
|
|
|
|
dc, "bcache_status_update");
|
|
|
|
if (IS_ERR(dc->status_update_thread)) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
|
bcache: stop bcache device when backing device is offline
Currently bcache does not handle backing device failure, if backing
device is offline and disconnected from system, its bcache device can still
be accessible. If the bcache device is in writeback mode, I/O requests even
can success if the requests hit on cache device. That is to say, when and
how bcache handles offline backing device is undefined.
This patch tries to handle backing device offline in a rather simple way,
- Add cached_dev->status_update_thread kernel thread to update backing
device status in every 1 second.
- Add cached_dev->offline_seconds to record how many seconds the backing
device is observed to be offline. If the backing device is offline for
BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and
call bcache_device_stop() to stop the bache device which linked to the
offline backing device.
Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds,
its bcache device will be removed, then user space application writing on
it will get error immediately, and handler the device failure in time.
This patch is quite simple, does not handle more complicated situations.
Once the bcache device is stopped, users need to recovery the backing
device, register and attach it manually.
Changelog:
v3: call wait_for_kthread_stop() before exits kernel thread.
v2: remove "bcache: " prefix when calling pr_warn().
v1: initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-28 15:37:41 +08:00
|
|
|
}
|
2019-06-28 19:59:33 +08:00
|
|
|
|
|
|
|
return 0;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
bcache: stop dc->writeback_rate_update properly
struct delayed_work writeback_rate_update in struct cache_dev is a delayed
worker to call function update_writeback_rate() in period (the interval is
defined by dc->writeback_rate_update_seconds).
When a metadate I/O error happens on cache device, bcache error handling
routine bch_cache_set_error() will call bch_cache_set_unregister() to
retire whole cache set. On the unregister code path, this delayed work is
stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update).
dc->writeback_rate_update is a special delayed work from others in bcache.
In its routine update_writeback_rate(), this delayed work is re-armed
itself. That means when cancel_delayed_work_sync() returns, this delayed
work can still be executed after several seconds defined by
dc->writeback_rate_update_seconds.
The problem is, after cancel_delayed_work_sync() returns, the cache set
unregister code path will continue and release memory of struct cache set.
Then the delayed work is scheduled to run, __update_writeback_rate()
will reference the already released cache_set memory, and trigger a NULL
pointer deference fault.
This patch introduces two more bcache device flags,
- BCACHE_DEV_WB_RUNNING
bit set: bcache device is in writeback mode and running, it is OK for
dc->writeback_rate_update to re-arm itself.
bit clear:bcache device is trying to stop dc->writeback_rate_update,
this delayed work should not re-arm itself and quit.
- BCACHE_DEV_RATE_DW_RUNNING
bit set: routine update_writeback_rate() is executing.
bit clear: routine update_writeback_rate() quits.
This patch also adds a function cancel_writeback_rate_update_dwork() to
wait for dc->writeback_rate_update quits before cancel it by calling
cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected
quit dc->writeback_rate_update, after time_out seconds this function will
give up and continue to call cancel_delayed_work_sync().
And here I explain how this patch stops self re-armed delayed work properly
with the above stuffs.
update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning
and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling
cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING.
Before calling cancel_delayed_work_sync() wait utill flag
BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling
cancel_delayed_work_sync(), dc->writeback_rate_update must be already re-
armed, or quite by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases
delayed work routine update_writeback_rate() won't be executed after
cancel_delayed_work_sync() returns.
Inside update_writeback_rate() before calling schedule_delayed_work(), flag
BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means
someone is about to stop the delayed work. Because flag
BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync()
has to wait for this flag to be cleared, we don't need to worry about race
condition here.
If update_writeback_rate() is scheduled to run after checking
BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync()
in cancel_writeback_rate_update_dwork(), it is also safe. Because at this
moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned
previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear
and quit immediately.
Because there are more dependences inside update_writeback_rate() to struct
cache_set memory, dc->writeback_rate_update is not a simple self re-arm
delayed work. After trying many different methods (e.g. hold dc->count, or
use locks), this is the only way I can find which works to properly stop
dc->writeback_rate_update delayed work.
Changelog:
v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING
to bit index, for test_bit().
v2: Try to fix the race issue which is pointed out by Junhui.
v1: The initial version for review
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:16 +08:00
|
|
|
/*
|
|
|
|
* If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
|
|
|
|
* work dc->writeback_rate_update is running. Wait until the routine
|
|
|
|
* quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
|
|
|
|
* cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
|
|
|
|
* seconds, give up waiting here and continue to cancel it too.
|
|
|
|
*/
|
|
|
|
static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
|
|
|
|
{
|
|
|
|
int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
|
|
|
|
|
|
|
|
do {
|
|
|
|
if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
|
|
|
|
&dc->disk.flags))
|
|
|
|
break;
|
|
|
|
time_out--;
|
|
|
|
schedule_timeout_interruptible(1);
|
|
|
|
} while (time_out > 0);
|
|
|
|
|
|
|
|
if (time_out == 0)
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("give up waiting for dc->writeback_write_update to quit\n");
|
bcache: stop dc->writeback_rate_update properly
struct delayed_work writeback_rate_update in struct cache_dev is a delayed
worker to call function update_writeback_rate() in period (the interval is
defined by dc->writeback_rate_update_seconds).
When a metadate I/O error happens on cache device, bcache error handling
routine bch_cache_set_error() will call bch_cache_set_unregister() to
retire whole cache set. On the unregister code path, this delayed work is
stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update).
dc->writeback_rate_update is a special delayed work from others in bcache.
In its routine update_writeback_rate(), this delayed work is re-armed
itself. That means when cancel_delayed_work_sync() returns, this delayed
work can still be executed after several seconds defined by
dc->writeback_rate_update_seconds.
The problem is, after cancel_delayed_work_sync() returns, the cache set
unregister code path will continue and release memory of struct cache set.
Then the delayed work is scheduled to run, __update_writeback_rate()
will reference the already released cache_set memory, and trigger a NULL
pointer deference fault.
This patch introduces two more bcache device flags,
- BCACHE_DEV_WB_RUNNING
bit set: bcache device is in writeback mode and running, it is OK for
dc->writeback_rate_update to re-arm itself.
bit clear:bcache device is trying to stop dc->writeback_rate_update,
this delayed work should not re-arm itself and quit.
- BCACHE_DEV_RATE_DW_RUNNING
bit set: routine update_writeback_rate() is executing.
bit clear: routine update_writeback_rate() quits.
This patch also adds a function cancel_writeback_rate_update_dwork() to
wait for dc->writeback_rate_update quits before cancel it by calling
cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected
quit dc->writeback_rate_update, after time_out seconds this function will
give up and continue to call cancel_delayed_work_sync().
And here I explain how this patch stops self re-armed delayed work properly
with the above stuffs.
update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning
and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling
cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING.
Before calling cancel_delayed_work_sync() wait utill flag
BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling
cancel_delayed_work_sync(), dc->writeback_rate_update must be already re-
armed, or quite by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases
delayed work routine update_writeback_rate() won't be executed after
cancel_delayed_work_sync() returns.
Inside update_writeback_rate() before calling schedule_delayed_work(), flag
BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means
someone is about to stop the delayed work. Because flag
BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync()
has to wait for this flag to be cleared, we don't need to worry about race
condition here.
If update_writeback_rate() is scheduled to run after checking
BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync()
in cancel_writeback_rate_update_dwork(), it is also safe. Because at this
moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned
previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear
and quit immediately.
Because there are more dependences inside update_writeback_rate() to struct
cache_set memory, dc->writeback_rate_update is not a simple self re-arm
delayed work. After trying many different methods (e.g. hold dc->count, or
use locks), this is the only way I can find which works to properly stop
dc->writeback_rate_update delayed work.
Changelog:
v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING
to bit index, for test_bit().
v2: Try to fix the race issue which is pointed out by Junhui.
v1: The initial version for review
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:16 +08:00
|
|
|
|
|
|
|
cancel_delayed_work_sync(&dc->writeback_rate_update);
|
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
static void cached_dev_detach_finish(struct work_struct *w)
|
|
|
|
{
|
|
|
|
struct cached_dev *dc = container_of(w, struct cached_dev, detach);
|
|
|
|
|
2013-08-22 08:49:09 +08:00
|
|
|
BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
|
2017-10-31 05:46:32 +08:00
|
|
|
BUG_ON(refcount_read(&dc->count));
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
|
bcache: stop dc->writeback_rate_update properly
struct delayed_work writeback_rate_update in struct cache_dev is a delayed
worker to call function update_writeback_rate() in period (the interval is
defined by dc->writeback_rate_update_seconds).
When a metadate I/O error happens on cache device, bcache error handling
routine bch_cache_set_error() will call bch_cache_set_unregister() to
retire whole cache set. On the unregister code path, this delayed work is
stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update).
dc->writeback_rate_update is a special delayed work from others in bcache.
In its routine update_writeback_rate(), this delayed work is re-armed
itself. That means when cancel_delayed_work_sync() returns, this delayed
work can still be executed after several seconds defined by
dc->writeback_rate_update_seconds.
The problem is, after cancel_delayed_work_sync() returns, the cache set
unregister code path will continue and release memory of struct cache set.
Then the delayed work is scheduled to run, __update_writeback_rate()
will reference the already released cache_set memory, and trigger a NULL
pointer deference fault.
This patch introduces two more bcache device flags,
- BCACHE_DEV_WB_RUNNING
bit set: bcache device is in writeback mode and running, it is OK for
dc->writeback_rate_update to re-arm itself.
bit clear:bcache device is trying to stop dc->writeback_rate_update,
this delayed work should not re-arm itself and quit.
- BCACHE_DEV_RATE_DW_RUNNING
bit set: routine update_writeback_rate() is executing.
bit clear: routine update_writeback_rate() quits.
This patch also adds a function cancel_writeback_rate_update_dwork() to
wait for dc->writeback_rate_update quits before cancel it by calling
cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected
quit dc->writeback_rate_update, after time_out seconds this function will
give up and continue to call cancel_delayed_work_sync().
And here I explain how this patch stops self re-armed delayed work properly
with the above stuffs.
update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning
and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling
cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING.
Before calling cancel_delayed_work_sync() wait utill flag
BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling
cancel_delayed_work_sync(), dc->writeback_rate_update must be already re-
armed, or quite by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases
delayed work routine update_writeback_rate() won't be executed after
cancel_delayed_work_sync() returns.
Inside update_writeback_rate() before calling schedule_delayed_work(), flag
BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means
someone is about to stop the delayed work. Because flag
BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync()
has to wait for this flag to be cleared, we don't need to worry about race
condition here.
If update_writeback_rate() is scheduled to run after checking
BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync()
in cancel_writeback_rate_update_dwork(), it is also safe. Because at this
moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned
previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear
and quit immediately.
Because there are more dependences inside update_writeback_rate() to struct
cache_set memory, dc->writeback_rate_update is not a simple self re-arm
delayed work. After trying many different methods (e.g. hold dc->count, or
use locks), this is the only way I can find which works to properly stop
dc->writeback_rate_update delayed work.
Changelog:
v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING
to bit index, for test_bit().
v2: Try to fix the race issue which is pointed out by Junhui.
v1: The initial version for review
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:16 +08:00
|
|
|
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
|
|
|
|
cancel_writeback_rate_update_dwork(dc);
|
|
|
|
|
2018-01-09 04:21:19 +08:00
|
|
|
if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
|
|
|
|
kthread_stop(dc->writeback_thread);
|
|
|
|
dc->writeback_thread = NULL;
|
|
|
|
}
|
|
|
|
|
2019-06-28 19:59:47 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
|
|
|
|
2018-10-08 20:41:15 +08:00
|
|
|
calc_cached_dev_sectors(dc->disk.c);
|
2013-03-24 07:11:31 +08:00
|
|
|
bcache_device_detach(&dc->disk);
|
|
|
|
list_move(&dc->list, &uncached_devices);
|
|
|
|
|
2013-08-22 08:49:09 +08:00
|
|
|
clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
|
2014-03-20 08:49:37 +08:00
|
|
|
clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
|
2013-08-22 08:49:09 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("Caching disabled for %s\n", dc->backing_dev_name);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
/* Drop ref we took in cached_dev_detach() */
|
|
|
|
closure_put(&dc->disk.cl);
|
|
|
|
}
|
|
|
|
|
|
|
|
void bch_cached_dev_detach(struct cached_dev *dc)
|
|
|
|
{
|
|
|
|
lockdep_assert_held(&bch_register_lock);
|
|
|
|
|
2013-08-22 08:49:09 +08:00
|
|
|
if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
|
2013-03-24 07:11:31 +08:00
|
|
|
return;
|
|
|
|
|
2013-08-22 08:49:09 +08:00
|
|
|
if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
|
2013-03-24 07:11:31 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Block the device from being closed and freed until we're finished
|
|
|
|
* detaching
|
|
|
|
*/
|
|
|
|
closure_get(&dc->disk.cl);
|
|
|
|
|
|
|
|
bch_writeback_queue(dc);
|
bcache: stop dc->writeback_rate_update properly
struct delayed_work writeback_rate_update in struct cache_dev is a delayed
worker to call function update_writeback_rate() in period (the interval is
defined by dc->writeback_rate_update_seconds).
When a metadate I/O error happens on cache device, bcache error handling
routine bch_cache_set_error() will call bch_cache_set_unregister() to
retire whole cache set. On the unregister code path, this delayed work is
stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update).
dc->writeback_rate_update is a special delayed work from others in bcache.
In its routine update_writeback_rate(), this delayed work is re-armed
itself. That means when cancel_delayed_work_sync() returns, this delayed
work can still be executed after several seconds defined by
dc->writeback_rate_update_seconds.
The problem is, after cancel_delayed_work_sync() returns, the cache set
unregister code path will continue and release memory of struct cache set.
Then the delayed work is scheduled to run, __update_writeback_rate()
will reference the already released cache_set memory, and trigger a NULL
pointer deference fault.
This patch introduces two more bcache device flags,
- BCACHE_DEV_WB_RUNNING
bit set: bcache device is in writeback mode and running, it is OK for
dc->writeback_rate_update to re-arm itself.
bit clear:bcache device is trying to stop dc->writeback_rate_update,
this delayed work should not re-arm itself and quit.
- BCACHE_DEV_RATE_DW_RUNNING
bit set: routine update_writeback_rate() is executing.
bit clear: routine update_writeback_rate() quits.
This patch also adds a function cancel_writeback_rate_update_dwork() to
wait for dc->writeback_rate_update quits before cancel it by calling
cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected
quit dc->writeback_rate_update, after time_out seconds this function will
give up and continue to call cancel_delayed_work_sync().
And here I explain how this patch stops self re-armed delayed work properly
with the above stuffs.
update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning
and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling
cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING.
Before calling cancel_delayed_work_sync() wait utill flag
BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling
cancel_delayed_work_sync(), dc->writeback_rate_update must be already re-
armed, or quite by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases
delayed work routine update_writeback_rate() won't be executed after
cancel_delayed_work_sync() returns.
Inside update_writeback_rate() before calling schedule_delayed_work(), flag
BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means
someone is about to stop the delayed work. Because flag
BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync()
has to wait for this flag to be cleared, we don't need to worry about race
condition here.
If update_writeback_rate() is scheduled to run after checking
BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync()
in cancel_writeback_rate_update_dwork(), it is also safe. Because at this
moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned
previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear
and quit immediately.
Because there are more dependences inside update_writeback_rate() to struct
cache_set memory, dc->writeback_rate_update is not a simple self re-arm
delayed work. After trying many different methods (e.g. hold dc->count, or
use locks), this is the only way I can find which works to properly stop
dc->writeback_rate_update delayed work.
Changelog:
v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING
to bit index, for test_bit().
v2: Try to fix the race issue which is pointed out by Junhui.
v1: The initial version for review
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:16 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
cached_dev_put(dc);
|
|
|
|
}
|
|
|
|
|
bcache: fix for data collapse after re-attaching an attached device
back-end device sdm has already attached a cache_set with ID
f67ebe1f-f8bc-4d73-bfe5-9dc88607f119, then try to attach with
another cache set, and it returns with an error:
[root]# cd /sys/block/sdm/bcache
[root]# echo 5ccd0a63-148e-48b8-afa2-aca9cbd6279f > attach
-bash: echo: write error: Invalid argument
After that, execute a command to modify the label of bcache
device:
[root]# echo data_disk1 > label
Then we reboot the system, when the system power on, the back-end
device can not attach to cache_set, a messages show in the log:
Feb 5 12:05:52 ceph152 kernel: [922385.508498] bcache:
bch_cached_dev_attach() couldn't find uuid for sdm in set
In sysfs_attach(), dc->sb.set_uuid was assigned to the value
which input through sysfs, no matter whether it is success
or not in bch_cached_dev_attach(). For example, If the back-end
device has already attached to an cache set, bch_cached_dev_attach()
would fail, but dc->sb.set_uuid was changed. Then modify the
label of bcache device, it will call bch_write_bdev_super(),
which would write the dc->sb.set_uuid to the super block, so we
record a wrong cache set ID in the super block, after the system
reboot, the cache set couldn't find the uuid of the back-end
device, so the bcache device couldn't exist and use any more.
In this patch, we don't assigned cache set ID to dc->sb.set_uuid
in sysfs_attach() directly, but input it into bch_cached_dev_attach(),
and assigned dc->sb.set_uuid to the cache set ID after the back-end
device attached to the cache set successful.
Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-08 03:41:46 +08:00
|
|
|
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
|
|
|
|
uint8_t *set_uuid)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
2018-07-26 12:17:41 +08:00
|
|
|
uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
|
2013-03-24 07:11:31 +08:00
|
|
|
struct uuid_entry *u;
|
2018-03-06 05:41:55 +08:00
|
|
|
struct cached_dev *exist_dc, *t;
|
2019-06-28 19:59:33 +08:00
|
|
|
int ret = 0;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:48 +08:00
|
|
|
if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) ||
|
|
|
|
(!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16)))
|
2013-03-24 07:11:31 +08:00
|
|
|
return -ENOENT;
|
|
|
|
|
|
|
|
if (dc->disk.c) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Can't attach %s: already attached\n",
|
2018-05-03 18:51:32 +08:00
|
|
|
dc->backing_dev_name);
|
2013-03-24 07:11:31 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Can't attach %s: shutting down\n",
|
2018-05-03 18:51:32 +08:00
|
|
|
dc->backing_dev_name);
|
2013-03-24 07:11:31 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2020-10-01 14:50:56 +08:00
|
|
|
if (dc->sb.block_size < c->cache->sb.block_size) {
|
2013-03-24 07:11:31 +08:00
|
|
|
/* Will die */
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Couldn't attach %s: block size less than set's block size\n",
|
2018-05-03 18:51:32 +08:00
|
|
|
dc->backing_dev_name);
|
2013-03-24 07:11:31 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2018-03-06 05:41:55 +08:00
|
|
|
/* Check whether already attached */
|
|
|
|
list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
|
|
|
|
if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Tried to attach %s but duplicate UUID already attached\n",
|
2018-05-03 18:51:32 +08:00
|
|
|
dc->backing_dev_name);
|
2018-03-06 05:41:55 +08:00
|
|
|
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
u = uuid_find(c, dc->sb.uuid);
|
|
|
|
|
|
|
|
if (u &&
|
|
|
|
(BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
|
|
|
|
BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
|
|
|
|
memcpy(u->uuid, invalid_uuid, 16);
|
2018-07-26 12:17:41 +08:00
|
|
|
u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
|
2013-03-24 07:11:31 +08:00
|
|
|
u = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!u) {
|
|
|
|
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Couldn't find uuid for %s in set\n",
|
2018-05-03 18:51:32 +08:00
|
|
|
dc->backing_dev_name);
|
2013-03-24 07:11:31 +08:00
|
|
|
return -ENOENT;
|
|
|
|
}
|
|
|
|
|
|
|
|
u = uuid_find_empty(c);
|
|
|
|
if (!u) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Not caching %s, no room for UUID\n",
|
2018-05-03 18:51:32 +08:00
|
|
|
dc->backing_dev_name);
|
2013-03-24 07:11:31 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-08-11 13:19:55 +08:00
|
|
|
/*
|
|
|
|
* Deadlocks since we're called via sysfs...
|
|
|
|
* sysfs_remove_file(&dc->kobj, &sysfs_attach);
|
2013-03-24 07:11:31 +08:00
|
|
|
*/
|
|
|
|
|
2013-03-29 02:50:55 +08:00
|
|
|
if (bch_is_zero(u->uuid, 16)) {
|
2013-03-24 07:11:31 +08:00
|
|
|
struct closure cl;
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_init_stack(&cl);
|
|
|
|
|
|
|
|
memcpy(u->uuid, dc->sb.uuid, 16);
|
|
|
|
memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
|
|
|
|
u->first_reg = u->last_reg = rtime;
|
|
|
|
bch_uuid_write(c);
|
|
|
|
|
2020-10-01 14:50:48 +08:00
|
|
|
memcpy(dc->sb.set_uuid, c->set_uuid, 16);
|
2013-03-24 07:11:31 +08:00
|
|
|
SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
|
|
|
|
|
|
|
|
bch_write_bdev_super(dc, &cl);
|
|
|
|
closure_sync(&cl);
|
|
|
|
} else {
|
|
|
|
u->last_reg = rtime;
|
|
|
|
bch_uuid_write(c);
|
|
|
|
}
|
|
|
|
|
|
|
|
bcache_device_attach(&dc->disk, c, u - c->uuids);
|
|
|
|
list_move(&dc->list, &c->cached_devs);
|
|
|
|
calc_cached_dev_sectors(c);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* dc->c must be set before dc->count != 0 - paired with the mb in
|
|
|
|
* cached_dev_get()
|
|
|
|
*/
|
2018-08-11 13:20:00 +08:00
|
|
|
smp_wmb();
|
2017-10-31 05:46:32 +08:00
|
|
|
refcount_set(&dc->count, 1);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2016-02-27 06:39:06 +08:00
|
|
|
/* Block writeback thread, but spawn it */
|
|
|
|
down_write(&dc->writeback_lock);
|
|
|
|
if (bch_cached_dev_writeback_start(dc)) {
|
|
|
|
up_write(&dc->writeback_lock);
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Couldn't start writeback facilities for %s\n",
|
2019-06-28 19:59:38 +08:00
|
|
|
dc->disk.disk->disk_name);
|
2014-05-02 04:48:57 +08:00
|
|
|
return -ENOMEM;
|
2016-02-27 06:39:06 +08:00
|
|
|
}
|
2014-05-02 04:48:57 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
|
|
|
|
atomic_set(&dc->has_dirty, 1);
|
|
|
|
bch_writeback_queue(dc);
|
|
|
|
}
|
|
|
|
|
2018-10-08 20:41:12 +08:00
|
|
|
bch_sectors_dirty_init(&dc->disk);
|
|
|
|
|
2019-06-28 19:59:33 +08:00
|
|
|
ret = bch_cached_dev_run(dc);
|
|
|
|
if (ret && (ret != -EBUSY)) {
|
|
|
|
up_write(&dc->writeback_lock);
|
2019-06-28 19:59:45 +08:00
|
|
|
/*
|
|
|
|
* bch_register_lock is held, bcache_device_stop() is not
|
|
|
|
* able to be directly called. The kthread and kworker
|
|
|
|
* created previously in bch_cached_dev_writeback_start()
|
|
|
|
* have to be stopped manually here.
|
|
|
|
*/
|
|
|
|
kthread_stop(dc->writeback_thread);
|
|
|
|
cancel_writeback_rate_update_dwork(dc);
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Couldn't run cached device %s\n",
|
2019-06-28 19:59:38 +08:00
|
|
|
dc->backing_dev_name);
|
2019-06-28 19:59:33 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-02-01 23:29:41 +08:00
|
|
|
bcache_device_link(&dc->disk, c, "bdev");
|
2018-08-09 15:48:49 +08:00
|
|
|
atomic_inc(&c->attached_dev_nr);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2016-02-27 06:39:06 +08:00
|
|
|
/* Allow the writeback thread to proceed */
|
|
|
|
up_write(&dc->writeback_lock);
|
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("Caching %s as %s on set %pU\n",
|
2018-05-03 18:51:32 +08:00
|
|
|
dc->backing_dev_name,
|
|
|
|
dc->disk.disk->disk_name,
|
2020-10-01 14:50:48 +08:00
|
|
|
dc->disk.c->set_uuid);
|
2013-03-24 07:11:31 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-04-25 00:48:35 +08:00
|
|
|
/* when dc->disk.kobj released */
|
2013-03-24 07:11:31 +08:00
|
|
|
void bch_cached_dev_release(struct kobject *kobj)
|
|
|
|
{
|
|
|
|
struct cached_dev *dc = container_of(kobj, struct cached_dev,
|
|
|
|
disk.kobj);
|
|
|
|
kfree(dc);
|
|
|
|
module_put(THIS_MODULE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cached_dev_free(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
|
|
|
|
|
bcache: stop dc->writeback_rate_update properly
struct delayed_work writeback_rate_update in struct cache_dev is a delayed
worker to call function update_writeback_rate() in period (the interval is
defined by dc->writeback_rate_update_seconds).
When a metadate I/O error happens on cache device, bcache error handling
routine bch_cache_set_error() will call bch_cache_set_unregister() to
retire whole cache set. On the unregister code path, this delayed work is
stopped by calling cancel_delayed_work_sync(&dc->writeback_rate_update).
dc->writeback_rate_update is a special delayed work from others in bcache.
In its routine update_writeback_rate(), this delayed work is re-armed
itself. That means when cancel_delayed_work_sync() returns, this delayed
work can still be executed after several seconds defined by
dc->writeback_rate_update_seconds.
The problem is, after cancel_delayed_work_sync() returns, the cache set
unregister code path will continue and release memory of struct cache set.
Then the delayed work is scheduled to run, __update_writeback_rate()
will reference the already released cache_set memory, and trigger a NULL
pointer deference fault.
This patch introduces two more bcache device flags,
- BCACHE_DEV_WB_RUNNING
bit set: bcache device is in writeback mode and running, it is OK for
dc->writeback_rate_update to re-arm itself.
bit clear:bcache device is trying to stop dc->writeback_rate_update,
this delayed work should not re-arm itself and quit.
- BCACHE_DEV_RATE_DW_RUNNING
bit set: routine update_writeback_rate() is executing.
bit clear: routine update_writeback_rate() quits.
This patch also adds a function cancel_writeback_rate_update_dwork() to
wait for dc->writeback_rate_update quits before cancel it by calling
cancel_delayed_work_sync(). In order to avoid a deadlock by unexpected
quit dc->writeback_rate_update, after time_out seconds this function will
give up and continue to call cancel_delayed_work_sync().
And here I explain how this patch stops self re-armed delayed work properly
with the above stuffs.
update_writeback_rate() sets BCACHE_DEV_RATE_DW_RUNNING at its beginning
and clears BCACHE_DEV_RATE_DW_RUNNING at its end. Before calling
cancel_writeback_rate_update_dwork() clear flag BCACHE_DEV_WB_RUNNING.
Before calling cancel_delayed_work_sync() wait utill flag
BCACHE_DEV_RATE_DW_RUNNING is clear. So when calling
cancel_delayed_work_sync(), dc->writeback_rate_update must be already re-
armed, or quite by seeing BCACHE_DEV_WB_RUNNING cleared. In both cases
delayed work routine update_writeback_rate() won't be executed after
cancel_delayed_work_sync() returns.
Inside update_writeback_rate() before calling schedule_delayed_work(), flag
BCACHE_DEV_WB_RUNNING is checked before. If this flag is cleared, it means
someone is about to stop the delayed work. Because flag
BCACHE_DEV_RATE_DW_RUNNING is set already and cancel_delayed_work_sync()
has to wait for this flag to be cleared, we don't need to worry about race
condition here.
If update_writeback_rate() is scheduled to run after checking
BCACHE_DEV_RATE_DW_RUNNING and before calling cancel_delayed_work_sync()
in cancel_writeback_rate_update_dwork(), it is also safe. Because at this
moment BCACHE_DEV_WB_RUNNING is cleared with memory barrier. As I mentioned
previously, update_writeback_rate() will see BCACHE_DEV_WB_RUNNING is clear
and quit immediately.
Because there are more dependences inside update_writeback_rate() to struct
cache_set memory, dc->writeback_rate_update is not a simple self re-arm
delayed work. After trying many different methods (e.g. hold dc->count, or
use locks), this is the only way I can find which works to properly stop
dc->writeback_rate_update delayed work.
Changelog:
v3: change values of BCACHE_DEV_WB_RUNNING and BCACHE_DEV_RATE_DW_RUNNING
to bit index, for test_bit().
v2: Try to fix the race issue which is pointed out by Junhui.
v1: The initial version for review
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Junhui Tang <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:16 +08:00
|
|
|
if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
|
|
|
|
cancel_writeback_rate_update_dwork(dc);
|
|
|
|
|
2014-05-21 03:20:28 +08:00
|
|
|
if (!IS_ERR_OR_NULL(dc->writeback_thread))
|
|
|
|
kthread_stop(dc->writeback_thread);
|
bcache: stop bcache device when backing device is offline
Currently bcache does not handle backing device failure, if backing
device is offline and disconnected from system, its bcache device can still
be accessible. If the bcache device is in writeback mode, I/O requests even
can success if the requests hit on cache device. That is to say, when and
how bcache handles offline backing device is undefined.
This patch tries to handle backing device offline in a rather simple way,
- Add cached_dev->status_update_thread kernel thread to update backing
device status in every 1 second.
- Add cached_dev->offline_seconds to record how many seconds the backing
device is observed to be offline. If the backing device is offline for
BACKING_DEV_OFFLINE_TIMEOUT (30) seconds, set dc->io_disable to 1 and
call bcache_device_stop() to stop the bache device which linked to the
offline backing device.
Now if a backing device is offline for BACKING_DEV_OFFLINE_TIMEOUT seconds,
its bcache device will be removed, then user space application writing on
it will get error immediately, and handler the device failure in time.
This patch is quite simple, does not handle more complicated situations.
Once the bcache device is stopped, users need to recovery the backing
device, register and attach it manually.
Changelog:
v3: call wait_for_kthread_stop() before exits kernel thread.
v2: remove "bcache: " prefix when calling pr_warn().
v1: initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-05-28 15:37:41 +08:00
|
|
|
if (!IS_ERR_OR_NULL(dc->status_update_thread))
|
|
|
|
kthread_stop(dc->status_update_thread);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
bcache: acquire bch_register_lock later in cached_dev_free()
When enable lockdep engine, a lockdep warning can be observed when
reboot or shutdown system,
[ 3142.764557][ T1] bcache: bcache_reboot() Stopping all devices:
[ 3142.776265][ T2649]
[ 3142.777159][ T2649] ======================================================
[ 3142.780039][ T2649] WARNING: possible circular locking dependency detected
[ 3142.782869][ T2649] 5.2.0-rc4-lp151.20-default+ #1 Tainted: G W
[ 3142.785684][ T2649] ------------------------------------------------------
[ 3142.788479][ T2649] kworker/3:67/2649 is trying to acquire lock:
[ 3142.790738][ T2649] 00000000aaf02291 ((wq_completion)bcache_writeback_wq){+.+.}, at: flush_workqueue+0x87/0x4c0
[ 3142.794678][ T2649]
[ 3142.794678][ T2649] but task is already holding lock:
[ 3142.797402][ T2649] 000000004fcf89c5 (&bch_register_lock){+.+.}, at: cached_dev_free+0x17/0x120 [bcache]
[ 3142.801462][ T2649]
[ 3142.801462][ T2649] which lock already depends on the new lock.
[ 3142.801462][ T2649]
[ 3142.805277][ T2649]
[ 3142.805277][ T2649] the existing dependency chain (in reverse order) is:
[ 3142.808902][ T2649]
[ 3142.808902][ T2649] -> #2 (&bch_register_lock){+.+.}:
[ 3142.812396][ T2649] __mutex_lock+0x7a/0x9d0
[ 3142.814184][ T2649] cached_dev_free+0x17/0x120 [bcache]
[ 3142.816415][ T2649] process_one_work+0x2a4/0x640
[ 3142.818413][ T2649] worker_thread+0x39/0x3f0
[ 3142.820276][ T2649] kthread+0x125/0x140
[ 3142.822061][ T2649] ret_from_fork+0x3a/0x50
[ 3142.823965][ T2649]
[ 3142.823965][ T2649] -> #1 ((work_completion)(&cl->work)#2){+.+.}:
[ 3142.827244][ T2649] process_one_work+0x277/0x640
[ 3142.829160][ T2649] worker_thread+0x39/0x3f0
[ 3142.830958][ T2649] kthread+0x125/0x140
[ 3142.832674][ T2649] ret_from_fork+0x3a/0x50
[ 3142.834915][ T2649]
[ 3142.834915][ T2649] -> #0 ((wq_completion)bcache_writeback_wq){+.+.}:
[ 3142.838121][ T2649] lock_acquire+0xb4/0x1c0
[ 3142.840025][ T2649] flush_workqueue+0xae/0x4c0
[ 3142.842035][ T2649] drain_workqueue+0xa9/0x180
[ 3142.844042][ T2649] destroy_workqueue+0x17/0x250
[ 3142.846142][ T2649] cached_dev_free+0x52/0x120 [bcache]
[ 3142.848530][ T2649] process_one_work+0x2a4/0x640
[ 3142.850663][ T2649] worker_thread+0x39/0x3f0
[ 3142.852464][ T2649] kthread+0x125/0x140
[ 3142.854106][ T2649] ret_from_fork+0x3a/0x50
[ 3142.855880][ T2649]
[ 3142.855880][ T2649] other info that might help us debug this:
[ 3142.855880][ T2649]
[ 3142.859663][ T2649] Chain exists of:
[ 3142.859663][ T2649] (wq_completion)bcache_writeback_wq --> (work_completion)(&cl->work)#2 --> &bch_register_lock
[ 3142.859663][ T2649]
[ 3142.865424][ T2649] Possible unsafe locking scenario:
[ 3142.865424][ T2649]
[ 3142.868022][ T2649] CPU0 CPU1
[ 3142.869885][ T2649] ---- ----
[ 3142.871751][ T2649] lock(&bch_register_lock);
[ 3142.873379][ T2649] lock((work_completion)(&cl->work)#2);
[ 3142.876399][ T2649] lock(&bch_register_lock);
[ 3142.879727][ T2649] lock((wq_completion)bcache_writeback_wq);
[ 3142.882064][ T2649]
[ 3142.882064][ T2649] *** DEADLOCK ***
[ 3142.882064][ T2649]
[ 3142.885060][ T2649] 3 locks held by kworker/3:67/2649:
[ 3142.887245][ T2649] #0: 00000000e774cdd0 ((wq_completion)events){+.+.}, at: process_one_work+0x21e/0x640
[ 3142.890815][ T2649] #1: 00000000f7df89da ((work_completion)(&cl->work)#2){+.+.}, at: process_one_work+0x21e/0x640
[ 3142.894884][ T2649] #2: 000000004fcf89c5 (&bch_register_lock){+.+.}, at: cached_dev_free+0x17/0x120 [bcache]
[ 3142.898797][ T2649]
[ 3142.898797][ T2649] stack backtrace:
[ 3142.900961][ T2649] CPU: 3 PID: 2649 Comm: kworker/3:67 Tainted: G W 5.2.0-rc4-lp151.20-default+ #1
[ 3142.904789][ T2649] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018
[ 3142.909168][ T2649] Workqueue: events cached_dev_free [bcache]
[ 3142.911422][ T2649] Call Trace:
[ 3142.912656][ T2649] dump_stack+0x85/0xcb
[ 3142.914181][ T2649] print_circular_bug+0x19a/0x1f0
[ 3142.916193][ T2649] __lock_acquire+0x16cd/0x1850
[ 3142.917936][ T2649] ? __lock_acquire+0x6a8/0x1850
[ 3142.919704][ T2649] ? lock_acquire+0xb4/0x1c0
[ 3142.921335][ T2649] ? find_held_lock+0x34/0xa0
[ 3142.923052][ T2649] lock_acquire+0xb4/0x1c0
[ 3142.924635][ T2649] ? flush_workqueue+0x87/0x4c0
[ 3142.926375][ T2649] flush_workqueue+0xae/0x4c0
[ 3142.928047][ T2649] ? flush_workqueue+0x87/0x4c0
[ 3142.929824][ T2649] ? drain_workqueue+0xa9/0x180
[ 3142.931686][ T2649] drain_workqueue+0xa9/0x180
[ 3142.933534][ T2649] destroy_workqueue+0x17/0x250
[ 3142.935787][ T2649] cached_dev_free+0x52/0x120 [bcache]
[ 3142.937795][ T2649] process_one_work+0x2a4/0x640
[ 3142.939803][ T2649] worker_thread+0x39/0x3f0
[ 3142.941487][ T2649] ? process_one_work+0x640/0x640
[ 3142.943389][ T2649] kthread+0x125/0x140
[ 3142.944894][ T2649] ? kthread_create_worker_on_cpu+0x70/0x70
[ 3142.947744][ T2649] ret_from_fork+0x3a/0x50
[ 3142.970358][ T2649] bcache: bcache_device_free() bcache0 stopped
Here is how the deadlock happens.
1) bcache_reboot() calls bcache_device_stop(), then inside
bcache_device_stop() BCACHE_DEV_CLOSING bit is set on d->flags.
Then closure_queue(&d->cl) is called to invoke cached_dev_flush().
2) In cached_dev_flush(), cached_dev_free() is called by continu_at().
3) In cached_dev_free(), when stopping the writeback kthread of the
cached device by kthread_stop(), dc->writeback_thread will be waken
up to quite the kthread while-loop, then cached_dev_put() is called
in bch_writeback_thread().
4) Calling cached_dev_put() in writeback kthread may drop dc->count to
0, then dc->detach kworker is scheduled, which is initialized as
cached_dev_detach_finish().
5) Inside cached_dev_detach_finish(), the last line of code is to call
closure_put(&dc->disk.cl), which drops the last reference counter of
closrure dc->disk.cl, then the callback cached_dev_flush() gets
called.
Now cached_dev_flush() is called for second time in the code path, the
first time is in step 2). And again bch_register_lock will be acquired
again, and a A-A lock (lockdep terminology) is happening.
The root cause of the above A-A lock is in cached_dev_free(), mutex
bch_register_lock is held before stopping writeback kthread and other
kworkers. Fortunately now we have variable 'bcache_is_reboot', which may
prevent device registration or unregistration during reboot/shutdown
time, so it is unncessary to hold bch_register_lock such early now.
This is how this patch fixes the reboot/shutdown time A-A lock issue:
After moving mutex_lock(&bch_register_lock) to a later location where
before atomic_read(&dc->running) in cached_dev_free(), such A-A lock
problem can be solved without any reboot time registration race.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:48 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
|
|
|
|
2013-05-15 15:11:26 +08:00
|
|
|
if (atomic_read(&dc->running))
|
|
|
|
bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
|
2013-03-24 07:11:31 +08:00
|
|
|
bcache_device_free(&dc->disk);
|
|
|
|
list_del(&dc->list);
|
|
|
|
|
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
|
2020-01-24 01:01:33 +08:00
|
|
|
if (dc->sb_disk)
|
|
|
|
put_page(virt_to_page(dc->sb_disk));
|
2020-01-24 01:01:26 +08:00
|
|
|
|
2014-07-08 04:03:36 +08:00
|
|
|
if (!IS_ERR_OR_NULL(dc->bdev))
|
2013-03-24 07:11:31 +08:00
|
|
|
blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
|
|
|
|
|
|
|
|
wake_up(&unregister_wait);
|
|
|
|
|
|
|
|
kobject_put(&dc->disk.kobj);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cached_dev_flush(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
|
|
|
|
struct bcache_device *d = &dc->disk;
|
|
|
|
|
2013-07-11 12:25:02 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
2013-08-22 08:49:09 +08:00
|
|
|
bcache_device_unlink(d);
|
2013-07-11 12:25:02 +08:00
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
bch_cache_accounting_destroy(&dc->accounting);
|
|
|
|
kobject_del(&d->kobj);
|
|
|
|
|
|
|
|
continue_at(cl, cached_dev_free, system_wq);
|
|
|
|
}
|
|
|
|
|
2018-08-11 13:19:44 +08:00
|
|
|
static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
2013-05-15 15:11:26 +08:00
|
|
|
int ret;
|
2013-03-24 07:11:31 +08:00
|
|
|
struct io *io;
|
2013-05-15 15:11:26 +08:00
|
|
|
struct request_queue *q = bdev_get_queue(dc->bdev);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
__module_get(THIS_MODULE);
|
|
|
|
INIT_LIST_HEAD(&dc->list);
|
2013-05-15 15:11:26 +08:00
|
|
|
closure_init(&dc->disk.cl, NULL);
|
|
|
|
set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
|
2013-03-24 07:11:31 +08:00
|
|
|
kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
|
|
|
|
INIT_WORK(&dc->detach, cached_dev_detach_finish);
|
2013-12-17 07:27:25 +08:00
|
|
|
sema_init(&dc->sb_write_mutex, 1);
|
2013-05-15 15:11:26 +08:00
|
|
|
INIT_LIST_HEAD(&dc->io_lru);
|
|
|
|
spin_lock_init(&dc->io_lock);
|
|
|
|
bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
dc->sequential_cutoff = 4 << 20;
|
|
|
|
|
|
|
|
for (io = dc->io; io < dc->io + RECENT_IO; io++) {
|
|
|
|
list_add(&io->lru, &dc->io_lru);
|
|
|
|
hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
|
|
|
|
}
|
|
|
|
|
2013-07-12 13:39:53 +08:00
|
|
|
dc->disk.stripe_size = q->limits.io_opt >> 9;
|
|
|
|
|
|
|
|
if (dc->disk.stripe_size)
|
|
|
|
dc->partial_stripes_expensive =
|
|
|
|
q->limits.raid_partial_stripes_expensive;
|
|
|
|
|
2013-06-05 21:21:07 +08:00
|
|
|
ret = bcache_device_init(&dc->disk, block_size,
|
2020-11-27 01:43:37 +08:00
|
|
|
bdev_nr_sectors(dc->bdev) - dc->sb.data_offset,
|
2020-07-01 16:59:43 +08:00
|
|
|
dc->bdev, &bcache_cached_ops);
|
2013-05-15 15:11:26 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2020-09-24 14:51:30 +08:00
|
|
|
blk_queue_io_opt(dc->disk.disk->queue,
|
|
|
|
max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
|
2013-05-15 15:11:26 +08:00
|
|
|
|
bcache: add io_disable to struct cached_dev
If a bcache device is configured to writeback mode, current code does not
handle write I/O errors on backing devices properly.
In writeback mode, write request is written to cache device, and
latter being flushed to backing device. If I/O failed when writing from
cache device to the backing device, bcache code just ignores the error and
upper layer code is NOT noticed that the backing device is broken.
This patch tries to handle backing device failure like how the cache device
failure is handled,
- Add a error counter 'io_errors' and error limit 'error_limit' in struct
cached_dev. Add another io_disable to struct cached_dev to disable I/Os
on the problematic backing device.
- When I/O error happens on backing device, increase io_errors counter. And
if io_errors reaches error_limit, set cache_dev->io_disable to true, and
stop the bcache device.
The result is, if backing device is broken of disconnected, and I/O errors
reach its error limit, backing device will be disabled and the associated
bcache device will be removed from system.
Changelog:
v2: remove "bcache: " prefix in pr_error(), and use correct name string to
print out bcache device gendisk name.
v1: indeed this is new added in v2 patch set.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:25 +08:00
|
|
|
atomic_set(&dc->io_errors, 0);
|
|
|
|
dc->io_disable = false;
|
|
|
|
dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
|
bcache: add stop_when_cache_set_failed option to backing device
When there are too many I/O errors on cache device, current bcache code
will retire the whole cache set, and detach all bcache devices. But the
detached bcache devices are not stopped, which is problematic when bcache
is in writeback mode.
If the retired cache set has dirty data of backing devices, continue
writing to bcache device will write to backing device directly. If the
LBA of write request has a dirty version cached on cache device, next time
when the cache device is re-registered and backing device re-attached to
it again, the stale dirty data on cache device will be written to backing
device, and overwrite latest directly written data. This situation causes
a quite data corruption.
But we cannot simply stop all attached bcache devices when the cache set is
broken or disconnected. For example, use bcache to accelerate performance
of an email service. In such workload, if cache device is broken but no
dirty data lost, keep the bcache device alive and permit email service
continue to access user data might be a better solution for the cache
device failure.
Nix <nix@esperi.org.uk> points out the issue and provides the above example
to explain why it might be necessary to not stop bcache device for broken
cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
suggestion to provide "always" and "auto" options to per-cached device
sysfs file stop_when_cache_set_failed. If cache set is retiring and the
backing device has no dirty data on cache, it should be safe to keep the
bcache device alive. In this case, if stop_when_cache_set_failed is set to
"auto", the device failure handling code will not stop this bcache device
and permit application to access the backing device with a unattached
bcache device.
Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
"auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
(always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:18 +08:00
|
|
|
/* default to auto */
|
|
|
|
dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
|
|
|
|
|
2013-05-15 15:11:26 +08:00
|
|
|
bch_cached_dev_request_init(dc);
|
|
|
|
bch_cached_dev_writeback_init(dc);
|
2013-03-24 07:11:31 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Cached device - bcache superblock */
|
|
|
|
|
2020-01-24 01:01:32 +08:00
|
|
|
static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
|
2013-03-24 07:11:31 +08:00
|
|
|
struct block_device *bdev,
|
|
|
|
struct cached_dev *dc)
|
|
|
|
{
|
|
|
|
const char *err = "cannot allocate memory";
|
|
|
|
struct cache_set *c;
|
2019-06-28 19:59:33 +08:00
|
|
|
int ret = -ENOMEM;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2018-05-03 18:51:32 +08:00
|
|
|
bdevname(bdev, dc->backing_dev_name);
|
2013-03-24 07:11:31 +08:00
|
|
|
memcpy(&dc->sb, sb, sizeof(struct cache_sb));
|
|
|
|
dc->bdev = bdev;
|
|
|
|
dc->bdev->bd_holder = dc;
|
2020-01-24 01:01:33 +08:00
|
|
|
dc->sb_disk = sb_disk;
|
2018-05-03 18:51:32 +08:00
|
|
|
|
2013-05-15 15:11:26 +08:00
|
|
|
if (cached_dev_init(dc, sb->block_size << 9))
|
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
err = "error creating kobject";
|
2020-11-17 15:18:55 +08:00
|
|
|
if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache"))
|
2013-03-24 07:11:31 +08:00
|
|
|
goto err;
|
|
|
|
if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
|
|
|
|
goto err;
|
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("registered backing device %s\n", dc->backing_dev_name);
|
2013-05-15 15:11:26 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
list_add(&dc->list, &uncached_devices);
|
2018-08-09 15:48:45 +08:00
|
|
|
/* attach to a matched cache set if it exists */
|
2013-03-24 07:11:31 +08:00
|
|
|
list_for_each_entry(c, &bch_cache_sets, list)
|
bcache: fix for data collapse after re-attaching an attached device
back-end device sdm has already attached a cache_set with ID
f67ebe1f-f8bc-4d73-bfe5-9dc88607f119, then try to attach with
another cache set, and it returns with an error:
[root]# cd /sys/block/sdm/bcache
[root]# echo 5ccd0a63-148e-48b8-afa2-aca9cbd6279f > attach
-bash: echo: write error: Invalid argument
After that, execute a command to modify the label of bcache
device:
[root]# echo data_disk1 > label
Then we reboot the system, when the system power on, the back-end
device can not attach to cache_set, a messages show in the log:
Feb 5 12:05:52 ceph152 kernel: [922385.508498] bcache:
bch_cached_dev_attach() couldn't find uuid for sdm in set
In sysfs_attach(), dc->sb.set_uuid was assigned to the value
which input through sysfs, no matter whether it is success
or not in bch_cached_dev_attach(). For example, If the back-end
device has already attached to an cache set, bch_cached_dev_attach()
would fail, but dc->sb.set_uuid was changed. Then modify the
label of bcache device, it will call bch_write_bdev_super(),
which would write the dc->sb.set_uuid to the super block, so we
record a wrong cache set ID in the super block, after the system
reboot, the cache set couldn't find the uuid of the back-end
device, so the bcache device couldn't exist and use any more.
In this patch, we don't assigned cache set ID to dc->sb.set_uuid
in sysfs_attach() directly, but input it into bch_cached_dev_attach(),
and assigned dc->sb.set_uuid to the cache set ID after the back-end
device attached to the cache set successful.
Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-08 03:41:46 +08:00
|
|
|
bch_cached_dev_attach(dc, c, NULL);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
|
2019-06-28 19:59:33 +08:00
|
|
|
BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
|
|
|
|
err = "failed to run cached device";
|
|
|
|
ret = bch_cached_dev_run(dc);
|
|
|
|
if (ret)
|
|
|
|
goto err;
|
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2019-04-25 00:48:37 +08:00
|
|
|
return 0;
|
2013-03-24 07:11:31 +08:00
|
|
|
err:
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_notice("error %s: %s\n", dc->backing_dev_name, err);
|
2013-05-15 15:11:26 +08:00
|
|
|
bcache_device_stop(&dc->disk);
|
2019-06-28 19:59:33 +08:00
|
|
|
return ret;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Flash only volumes */
|
|
|
|
|
2019-04-25 00:48:35 +08:00
|
|
|
/* When d->kobj released */
|
2013-03-24 07:11:31 +08:00
|
|
|
void bch_flash_dev_release(struct kobject *kobj)
|
|
|
|
{
|
|
|
|
struct bcache_device *d = container_of(kobj, struct bcache_device,
|
|
|
|
kobj);
|
|
|
|
kfree(d);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void flash_dev_free(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2014-04-30 06:39:27 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
2018-07-26 12:17:33 +08:00
|
|
|
atomic_long_sub(bcache_dev_sectors_dirty(d),
|
|
|
|
&d->c->flash_dev_dirty_sectors);
|
2013-03-24 07:11:31 +08:00
|
|
|
bcache_device_free(d);
|
2014-04-30 06:39:27 +08:00
|
|
|
mutex_unlock(&bch_register_lock);
|
2013-03-24 07:11:31 +08:00
|
|
|
kobject_put(&d->kobj);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void flash_dev_flush(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct bcache_device *d = container_of(cl, struct bcache_device, cl);
|
|
|
|
|
2014-04-30 06:39:27 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
2013-02-01 23:29:41 +08:00
|
|
|
bcache_device_unlink(d);
|
2014-04-30 06:39:27 +08:00
|
|
|
mutex_unlock(&bch_register_lock);
|
2013-03-24 07:11:31 +08:00
|
|
|
kobject_del(&d->kobj);
|
|
|
|
continue_at(cl, flash_dev_free, system_wq);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
|
|
|
|
{
|
|
|
|
struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
|
|
|
|
GFP_KERNEL);
|
|
|
|
if (!d)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
closure_init(&d->cl, NULL);
|
|
|
|
set_closure_fn(&d->cl, flash_dev_flush, system_wq);
|
|
|
|
|
|
|
|
kobject_init(&d->kobj, &bch_flash_dev_ktype);
|
|
|
|
|
2020-10-01 14:50:49 +08:00
|
|
|
if (bcache_device_init(d, block_bytes(c->cache), u->sectors,
|
2020-07-01 16:59:43 +08:00
|
|
|
NULL, &bcache_flash_ops))
|
2013-03-24 07:11:31 +08:00
|
|
|
goto err;
|
|
|
|
|
|
|
|
bcache_device_attach(d, c, u - c->uuids);
|
2017-09-07 01:28:53 +08:00
|
|
|
bch_sectors_dirty_init(d);
|
2013-03-24 07:11:31 +08:00
|
|
|
bch_flash_dev_request_init(d);
|
|
|
|
add_disk(d->disk);
|
|
|
|
|
|
|
|
if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
bcache_device_link(d, c, "volume");
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
err:
|
|
|
|
kobject_put(&d->kobj);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int flash_devs_run(struct cache_set *c)
|
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
struct uuid_entry *u;
|
|
|
|
|
|
|
|
for (u = c->uuids;
|
2018-02-28 01:49:29 +08:00
|
|
|
u < c->uuids + c->nr_uuids && !ret;
|
2013-03-24 07:11:31 +08:00
|
|
|
u++)
|
|
|
|
if (UUID_FLASH_ONLY(u))
|
|
|
|
ret = flash_dev_run(c, u);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
int bch_flash_dev_create(struct cache_set *c, uint64_t size)
|
|
|
|
{
|
|
|
|
struct uuid_entry *u;
|
|
|
|
|
|
|
|
if (test_bit(CACHE_SET_STOPPING, &c->flags))
|
|
|
|
return -EINTR;
|
|
|
|
|
2014-07-12 03:17:41 +08:00
|
|
|
if (!test_bit(CACHE_SET_RUNNING, &c->flags))
|
|
|
|
return -EPERM;
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
u = uuid_find_empty(c);
|
|
|
|
if (!u) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_err("Can't create volume, no room for UUID\n");
|
2013-03-24 07:11:31 +08:00
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
get_random_bytes(u->uuid, 16);
|
|
|
|
memset(u->label, 0, 32);
|
2018-07-26 12:17:41 +08:00
|
|
|
u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
SET_UUID_FLASH_ONLY(u, 1);
|
|
|
|
u->sectors = size >> 9;
|
|
|
|
|
|
|
|
bch_uuid_write(c);
|
|
|
|
|
|
|
|
return flash_dev_run(c, u);
|
|
|
|
}
|
|
|
|
|
bcache: add io_disable to struct cached_dev
If a bcache device is configured to writeback mode, current code does not
handle write I/O errors on backing devices properly.
In writeback mode, write request is written to cache device, and
latter being flushed to backing device. If I/O failed when writing from
cache device to the backing device, bcache code just ignores the error and
upper layer code is NOT noticed that the backing device is broken.
This patch tries to handle backing device failure like how the cache device
failure is handled,
- Add a error counter 'io_errors' and error limit 'error_limit' in struct
cached_dev. Add another io_disable to struct cached_dev to disable I/Os
on the problematic backing device.
- When I/O error happens on backing device, increase io_errors counter. And
if io_errors reaches error_limit, set cache_dev->io_disable to true, and
stop the bcache device.
The result is, if backing device is broken of disconnected, and I/O errors
reach its error limit, backing device will be disabled and the associated
bcache device will be removed from system.
Changelog:
v2: remove "bcache: " prefix in pr_error(), and use correct name string to
print out bcache device gendisk name.
v1: indeed this is new added in v2 patch set.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:25 +08:00
|
|
|
bool bch_cached_dev_error(struct cached_dev *dc)
|
|
|
|
{
|
|
|
|
if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
dc->io_disable = true;
|
|
|
|
/* make others know io_disable is true earlier */
|
|
|
|
smp_mb();
|
|
|
|
|
|
|
|
pr_err("stop %s: too many IO errors on backing device %s\n",
|
2020-05-27 12:01:52 +08:00
|
|
|
dc->disk.disk->disk_name, dc->backing_dev_name);
|
bcache: add io_disable to struct cached_dev
If a bcache device is configured to writeback mode, current code does not
handle write I/O errors on backing devices properly.
In writeback mode, write request is written to cache device, and
latter being flushed to backing device. If I/O failed when writing from
cache device to the backing device, bcache code just ignores the error and
upper layer code is NOT noticed that the backing device is broken.
This patch tries to handle backing device failure like how the cache device
failure is handled,
- Add a error counter 'io_errors' and error limit 'error_limit' in struct
cached_dev. Add another io_disable to struct cached_dev to disable I/Os
on the problematic backing device.
- When I/O error happens on backing device, increase io_errors counter. And
if io_errors reaches error_limit, set cache_dev->io_disable to true, and
stop the bcache device.
The result is, if backing device is broken of disconnected, and I/O errors
reach its error limit, backing device will be disabled and the associated
bcache device will be removed from system.
Changelog:
v2: remove "bcache: " prefix in pr_error(), and use correct name string to
print out bcache device gendisk name.
v1: indeed this is new added in v2 patch set.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:25 +08:00
|
|
|
|
|
|
|
bcache_device_stop(&dc->disk);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
/* Cache set */
|
|
|
|
|
|
|
|
__printf(2, 3)
|
|
|
|
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
|
|
|
|
{
|
2020-05-27 12:01:52 +08:00
|
|
|
struct va_format vaf;
|
2013-03-24 07:11:31 +08:00
|
|
|
va_list args;
|
|
|
|
|
2013-07-12 10:42:51 +08:00
|
|
|
if (c->on_error != ON_ERROR_PANIC &&
|
|
|
|
test_bit(CACHE_SET_STOPPING, &c->flags))
|
2013-03-24 07:11:31 +08:00
|
|
|
return false;
|
|
|
|
|
bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags
When too many I/Os failed on cache device, bch_cache_set_error() is called
in the error handling code path to retire whole problematic cache set. If
new I/O requests continue to come and take refcount dc->count, the cache
set won't be retired immediately, this is a problem.
Further more, there are several kernel thread and self-armed kernel work
may still running after bch_cache_set_error() is called. It needs to wait
quite a while for them to stop, or they won't stop at all. They also
prevent the cache set from being retired.
The solution in this patch is, to add per cache set flag to disable I/O
request on this cache and all attached backing devices. Then new coming I/O
requests can be rejected in *_make_request() before taking refcount, kernel
threads and self-armed kernel worker can stop very fast when flags bit
CACHE_SET_IO_DISABLE is set.
Because bcache also do internal I/Os for writeback, garbage collection,
bucket allocation, journaling, this kind of I/O should be disabled after
bch_cache_set_error() is called. So closure_bio_submit() is modified to
check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set,
closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and
return, generic_make_request() won't be called.
A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit
from cache_set->flags, to disable or enable cache set I/O for debugging. It
is helpful to trigger more corner case issues for failed cache device.
Changelog
v4, add wait_for_kthread_stop(), and call it before exits writeback and gc
kernel threads.
v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index.
remove "bcache: " prefix when printing out kernel message.
v2, more changes by previous review,
- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui.
- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this
is reported and inspired from origal patch of Pavel Vazharov.
v1, initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Pavel Vazharov <freakpv@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:17 +08:00
|
|
|
if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("CACHE_SET_IO_DISABLE already set\n");
|
bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags
When too many I/Os failed on cache device, bch_cache_set_error() is called
in the error handling code path to retire whole problematic cache set. If
new I/O requests continue to come and take refcount dc->count, the cache
set won't be retired immediately, this is a problem.
Further more, there are several kernel thread and self-armed kernel work
may still running after bch_cache_set_error() is called. It needs to wait
quite a while for them to stop, or they won't stop at all. They also
prevent the cache set from being retired.
The solution in this patch is, to add per cache set flag to disable I/O
request on this cache and all attached backing devices. Then new coming I/O
requests can be rejected in *_make_request() before taking refcount, kernel
threads and self-armed kernel worker can stop very fast when flags bit
CACHE_SET_IO_DISABLE is set.
Because bcache also do internal I/Os for writeback, garbage collection,
bucket allocation, journaling, this kind of I/O should be disabled after
bch_cache_set_error() is called. So closure_bio_submit() is modified to
check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set,
closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and
return, generic_make_request() won't be called.
A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit
from cache_set->flags, to disable or enable cache set I/O for debugging. It
is helpful to trigger more corner case issues for failed cache device.
Changelog
v4, add wait_for_kthread_stop(), and call it before exits writeback and gc
kernel threads.
v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index.
remove "bcache: " prefix when printing out kernel message.
v2, more changes by previous review,
- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui.
- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this
is reported and inspired from origal patch of Pavel Vazharov.
v1, initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Pavel Vazharov <freakpv@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:17 +08:00
|
|
|
|
2018-08-11 13:19:55 +08:00
|
|
|
/*
|
|
|
|
* XXX: we can be called from atomic context
|
|
|
|
* acquire_console_sem();
|
|
|
|
*/
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
va_start(args, fmt);
|
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
vaf.fmt = fmt;
|
|
|
|
vaf.va = &args;
|
|
|
|
|
|
|
|
pr_err("error on %pU: %pV, disabling caching\n",
|
2020-10-01 14:50:48 +08:00
|
|
|
c->set_uuid, &vaf);
|
2020-05-27 12:01:52 +08:00
|
|
|
|
|
|
|
va_end(args);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-07-12 10:42:51 +08:00
|
|
|
if (c->on_error == ON_ERROR_PANIC)
|
|
|
|
panic("panic forced after error\n");
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
bch_cache_set_unregister(c);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2019-04-25 00:48:35 +08:00
|
|
|
/* When c->kobj released */
|
2013-03-24 07:11:31 +08:00
|
|
|
void bch_cache_set_release(struct kobject *kobj)
|
|
|
|
{
|
|
|
|
struct cache_set *c = container_of(kobj, struct cache_set, kobj);
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
kfree(c);
|
|
|
|
module_put(THIS_MODULE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cache_set_free(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct cache_set *c = container_of(cl, struct cache_set, cl);
|
|
|
|
struct cache *ca;
|
|
|
|
|
2018-12-13 22:53:47 +08:00
|
|
|
debugfs_remove(c->debug);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
bch_open_buckets_free(c);
|
|
|
|
bch_btree_cache_free(c);
|
|
|
|
bch_journal_free(c);
|
|
|
|
|
2019-04-25 00:48:31 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
2020-10-01 14:50:56 +08:00
|
|
|
bch_bset_sort_state_free(&c->sort);
|
|
|
|
free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));
|
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
ca = c->cache;
|
|
|
|
if (ca) {
|
|
|
|
ca->set = NULL;
|
|
|
|
c->cache = NULL;
|
|
|
|
kobject_put(&ca->kobj);
|
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
|
2014-01-10 08:03:04 +08:00
|
|
|
if (c->moving_gc_wq)
|
|
|
|
destroy_workqueue(c->moving_gc_wq);
|
2018-05-21 06:25:51 +08:00
|
|
|
bioset_exit(&c->bio_split);
|
|
|
|
mempool_exit(&c->fill_iter);
|
|
|
|
mempool_exit(&c->bio_meta);
|
|
|
|
mempool_exit(&c->search);
|
2013-03-24 07:11:31 +08:00
|
|
|
kfree(c->devices);
|
|
|
|
|
|
|
|
list_del(&c->list);
|
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
|
2020-10-01 14:50:48 +08:00
|
|
|
pr_info("Cache set %pU unregistered\n", c->set_uuid);
|
2013-03-24 07:11:31 +08:00
|
|
|
wake_up(&unregister_wait);
|
|
|
|
|
|
|
|
closure_debug_destroy(&c->cl);
|
|
|
|
kobject_put(&c->kobj);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void cache_set_flush(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct cache_set *c = container_of(cl, struct cache_set, caching);
|
2020-10-01 14:50:47 +08:00
|
|
|
struct cache *ca = c->cache;
|
2013-03-24 07:11:31 +08:00
|
|
|
struct btree *b;
|
|
|
|
|
|
|
|
bch_cache_accounting_destroy(&c->accounting);
|
|
|
|
|
|
|
|
kobject_put(&c->internal);
|
|
|
|
kobject_del(&c->kobj);
|
|
|
|
|
bcache: check c->gc_thread by IS_ERR_OR_NULL in cache_set_flush()
When system memory is in heavy pressure, bch_gc_thread_start() from
run_cache_set() may fail due to out of memory. In such condition,
c->gc_thread is assigned to -ENOMEM, not NULL pointer. Then in following
failure code path bch_cache_set_error(), when cache_set_flush() gets
called, the code piece to stop c->gc_thread is broken,
if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
And KASAN catches such NULL pointer deference problem, with the warning
information:
[ 561.207881] ==================================================================
[ 561.207900] BUG: KASAN: null-ptr-deref in kthread_stop+0x3b/0x440
[ 561.207904] Write of size 4 at addr 000000000000001c by task kworker/15:1/313
[ 561.207913] CPU: 15 PID: 313 Comm: kworker/15:1 Tainted: G W 5.0.0-vanilla+ #3
[ 561.207916] Hardware name: Lenovo ThinkSystem SR650 -[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE136T-2.10]- 03/22/2019
[ 561.207935] Workqueue: events cache_set_flush [bcache]
[ 561.207940] Call Trace:
[ 561.207948] dump_stack+0x9a/0xeb
[ 561.207955] ? kthread_stop+0x3b/0x440
[ 561.207960] ? kthread_stop+0x3b/0x440
[ 561.207965] kasan_report+0x176/0x192
[ 561.207973] ? kthread_stop+0x3b/0x440
[ 561.207981] kthread_stop+0x3b/0x440
[ 561.207995] cache_set_flush+0xd4/0x6d0 [bcache]
[ 561.208008] process_one_work+0x856/0x1620
[ 561.208015] ? find_held_lock+0x39/0x1d0
[ 561.208028] ? drain_workqueue+0x380/0x380
[ 561.208048] worker_thread+0x87/0xb80
[ 561.208058] ? __kthread_parkme+0xb6/0x180
[ 561.208067] ? process_one_work+0x1620/0x1620
[ 561.208072] kthread+0x326/0x3e0
[ 561.208079] ? kthread_create_worker_on_cpu+0xc0/0xc0
[ 561.208090] ret_from_fork+0x3a/0x50
[ 561.208110] ==================================================================
[ 561.208113] Disabling lock debugging due to kernel taint
[ 561.208115] irq event stamp: 11800231
[ 561.208126] hardirqs last enabled at (11800231): [<ffffffff83008538>] do_syscall_64+0x18/0x410
[ 561.208127] BUG: unable to handle kernel NULL pointer dereference at 000000000000001c
[ 561.208129] #PF error: [WRITE]
[ 561.312253] hardirqs last disabled at (11800230): [<ffffffff830052ff>] trace_hardirqs_off_thunk+0x1a/0x1c
[ 561.312259] softirqs last enabled at (11799832): [<ffffffff850005c7>] __do_softirq+0x5c7/0x8c3
[ 561.405975] PGD 0 P4D 0
[ 561.442494] softirqs last disabled at (11799821): [<ffffffff831add2c>] irq_exit+0x1ac/0x1e0
[ 561.791359] Oops: 0002 [#1] SMP KASAN NOPTI
[ 561.791362] CPU: 15 PID: 313 Comm: kworker/15:1 Tainted: G B W 5.0.0-vanilla+ #3
[ 561.791363] Hardware name: Lenovo ThinkSystem SR650 -[7X05CTO1WW]-/-[7X05CTO1WW]-, BIOS -[IVE136T-2.10]- 03/22/2019
[ 561.791371] Workqueue: events cache_set_flush [bcache]
[ 561.791374] RIP: 0010:kthread_stop+0x3b/0x440
[ 561.791376] Code: 00 00 65 8b 05 26 d5 e0 7c 89 c0 48 0f a3 05 ec aa df 02 0f 82 dc 02 00 00 4c 8d 63 20 be 04 00 00 00 4c 89 e7 e8 65 c5 53 00 <f0> ff 43 20 48 8d 7b 24 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48
[ 561.791377] RSP: 0018:ffff88872fc8fd10 EFLAGS: 00010286
[ 561.838895] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 561.838916] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 561.838934] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 561.838948] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 561.838966] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 561.838979] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 561.838996] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 563.067028] RAX: 0000000000000000 RBX: fffffffffffffffc RCX: ffffffff832dd314
[ 563.067030] RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000297
[ 563.067032] RBP: ffff88872fc8fe88 R08: fffffbfff0b8213d R09: fffffbfff0b8213d
[ 563.067034] R10: 0000000000000001 R11: fffffbfff0b8213c R12: 000000000000001c
[ 563.408618] R13: ffff88dc61cc0f68 R14: ffff888102b94900 R15: ffff88dc61cc0f68
[ 563.408620] FS: 0000000000000000(0000) GS:ffff888f7dc00000(0000) knlGS:0000000000000000
[ 563.408622] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 563.408623] CR2: 000000000000001c CR3: 0000000f48a1a004 CR4: 00000000007606e0
[ 563.408625] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 563.408627] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 563.904795] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 563.915796] PKRU: 55555554
[ 563.915797] Call Trace:
[ 563.915807] cache_set_flush+0xd4/0x6d0 [bcache]
[ 563.915812] process_one_work+0x856/0x1620
[ 564.001226] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 564.033563] ? find_held_lock+0x39/0x1d0
[ 564.033567] ? drain_workqueue+0x380/0x380
[ 564.033574] worker_thread+0x87/0xb80
[ 564.062823] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 564.118042] ? __kthread_parkme+0xb6/0x180
[ 564.118046] ? process_one_work+0x1620/0x1620
[ 564.118048] kthread+0x326/0x3e0
[ 564.118050] ? kthread_create_worker_on_cpu+0xc0/0xc0
[ 564.167066] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 564.252441] ret_from_fork+0x3a/0x50
[ 564.252447] Modules linked in: msr rpcrdma sunrpc rdma_ucm ib_iser ib_umad rdma_cm ib_ipoib i40iw configfs iw_cm ib_cm libiscsi scsi_transport_iscsi mlx4_ib ib_uverbs mlx4_en ib_core nls_iso8859_1 nls_cp437 vfat fat intel_rapl skx_edac x86_pkg_temp_thermal coretemp iTCO_wdt iTCO_vendor_support crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel ses raid0 aesni_intel cdc_ether enclosure usbnet ipmi_ssif joydev aes_x86_64 i40e scsi_transport_sas mii bcache md_mod crypto_simd mei_me ioatdma crc64 ptp cryptd pcspkr i2c_i801 mlx4_core glue_helper pps_core mei lpc_ich dca wmi ipmi_si ipmi_devintf nd_pmem dax_pmem nd_btt ipmi_msghandler device_dax pcc_cpufreq button hid_generic usbhid mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect xhci_pci sysimgblt fb_sys_fops xhci_hcd ttm megaraid_sas drm usbcore nfit libnvdimm sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua efivarfs
[ 564.299390] bcache: bch_count_io_errors() nvme0n1: IO error on writing btree.
[ 564.348360] CR2: 000000000000001c
[ 564.348362] ---[ end trace b7f0e5cc7b2103b0 ]---
Therefore, it is not enough to only check whether c->gc_thread is NULL,
we should use IS_ERR_OR_NULL() to check both NULL pointer and error
value.
This patch changes the above buggy code piece in this way,
if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:25 +08:00
|
|
|
if (!IS_ERR_OR_NULL(c->gc_thread))
|
2013-10-25 08:19:26 +08:00
|
|
|
kthread_stop(c->gc_thread);
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
if (!IS_ERR_OR_NULL(c->root))
|
|
|
|
list_add(&c->root->list, &c->btree_cache);
|
|
|
|
|
2019-06-28 19:59:28 +08:00
|
|
|
/*
|
|
|
|
* Avoid flushing cached nodes if cache set is retiring
|
|
|
|
* due to too many I/O errors detected.
|
|
|
|
*/
|
|
|
|
if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
|
|
|
|
list_for_each_entry(b, &c->btree_cache, list) {
|
|
|
|
mutex_lock(&b->write_lock);
|
|
|
|
if (btree_node_dirty(b))
|
|
|
|
__bch_btree_node_write(b, NULL);
|
|
|
|
mutex_unlock(&b->write_lock);
|
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
if (ca->alloc_thread)
|
|
|
|
kthread_stop(ca->alloc_thread);
|
2013-07-11 09:31:58 +08:00
|
|
|
|
2014-03-20 08:49:37 +08:00
|
|
|
if (c->journal.cur) {
|
|
|
|
cancel_delayed_work_sync(&c->journal.work);
|
|
|
|
/* flush last journal entry if needed */
|
|
|
|
c->journal.work.work.func(&c->journal.work.work);
|
|
|
|
}
|
2014-02-20 11:48:26 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_return(cl);
|
|
|
|
}
|
|
|
|
|
bcache: add stop_when_cache_set_failed option to backing device
When there are too many I/O errors on cache device, current bcache code
will retire the whole cache set, and detach all bcache devices. But the
detached bcache devices are not stopped, which is problematic when bcache
is in writeback mode.
If the retired cache set has dirty data of backing devices, continue
writing to bcache device will write to backing device directly. If the
LBA of write request has a dirty version cached on cache device, next time
when the cache device is re-registered and backing device re-attached to
it again, the stale dirty data on cache device will be written to backing
device, and overwrite latest directly written data. This situation causes
a quite data corruption.
But we cannot simply stop all attached bcache devices when the cache set is
broken or disconnected. For example, use bcache to accelerate performance
of an email service. In such workload, if cache device is broken but no
dirty data lost, keep the bcache device alive and permit email service
continue to access user data might be a better solution for the cache
device failure.
Nix <nix@esperi.org.uk> points out the issue and provides the above example
to explain why it might be necessary to not stop bcache device for broken
cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
suggestion to provide "always" and "auto" options to per-cached device
sysfs file stop_when_cache_set_failed. If cache set is retiring and the
backing device has no dirty data on cache, it should be safe to keep the
bcache device alive. In this case, if stop_when_cache_set_failed is set to
"auto", the device failure handling code will not stop this bcache device
and permit application to access the backing device with a unattached
bcache device.
Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
"auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
(always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:18 +08:00
|
|
|
/*
|
|
|
|
* This function is only called when CACHE_SET_IO_DISABLE is set, which means
|
|
|
|
* cache set is unregistering due to too many I/O errors. In this condition,
|
|
|
|
* the bcache device might be stopped, it depends on stop_when_cache_set_failed
|
|
|
|
* value and whether the broken cache has dirty data:
|
|
|
|
*
|
|
|
|
* dc->stop_when_cache_set_failed dc->has_dirty stop bcache device
|
|
|
|
* BCH_CACHED_STOP_AUTO 0 NO
|
|
|
|
* BCH_CACHED_STOP_AUTO 1 YES
|
|
|
|
* BCH_CACHED_DEV_STOP_ALWAYS 0 YES
|
|
|
|
* BCH_CACHED_DEV_STOP_ALWAYS 1 YES
|
|
|
|
*
|
|
|
|
* The expected behavior is, if stop_when_cache_set_failed is configured to
|
|
|
|
* "auto" via sysfs interface, the bcache device will not be stopped if the
|
|
|
|
* backing device is clean on the broken cache device.
|
|
|
|
*/
|
|
|
|
static void conditional_stop_bcache_device(struct cache_set *c,
|
|
|
|
struct bcache_device *d,
|
|
|
|
struct cached_dev *dc)
|
|
|
|
{
|
|
|
|
if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n",
|
2020-10-01 14:50:48 +08:00
|
|
|
d->disk->disk_name, c->set_uuid);
|
bcache: add stop_when_cache_set_failed option to backing device
When there are too many I/O errors on cache device, current bcache code
will retire the whole cache set, and detach all bcache devices. But the
detached bcache devices are not stopped, which is problematic when bcache
is in writeback mode.
If the retired cache set has dirty data of backing devices, continue
writing to bcache device will write to backing device directly. If the
LBA of write request has a dirty version cached on cache device, next time
when the cache device is re-registered and backing device re-attached to
it again, the stale dirty data on cache device will be written to backing
device, and overwrite latest directly written data. This situation causes
a quite data corruption.
But we cannot simply stop all attached bcache devices when the cache set is
broken or disconnected. For example, use bcache to accelerate performance
of an email service. In such workload, if cache device is broken but no
dirty data lost, keep the bcache device alive and permit email service
continue to access user data might be a better solution for the cache
device failure.
Nix <nix@esperi.org.uk> points out the issue and provides the above example
to explain why it might be necessary to not stop bcache device for broken
cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
suggestion to provide "always" and "auto" options to per-cached device
sysfs file stop_when_cache_set_failed. If cache set is retiring and the
backing device has no dirty data on cache, it should be safe to keep the
bcache device alive. In this case, if stop_when_cache_set_failed is set to
"auto", the device failure handling code will not stop this bcache device
and permit application to access the backing device with a unattached
bcache device.
Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
"auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
(always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:18 +08:00
|
|
|
bcache_device_stop(d);
|
|
|
|
} else if (atomic_read(&dc->has_dirty)) {
|
|
|
|
/*
|
|
|
|
* dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
|
|
|
|
* and dc->has_dirty == 1
|
|
|
|
*/
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n",
|
bcache: add stop_when_cache_set_failed option to backing device
When there are too many I/O errors on cache device, current bcache code
will retire the whole cache set, and detach all bcache devices. But the
detached bcache devices are not stopped, which is problematic when bcache
is in writeback mode.
If the retired cache set has dirty data of backing devices, continue
writing to bcache device will write to backing device directly. If the
LBA of write request has a dirty version cached on cache device, next time
when the cache device is re-registered and backing device re-attached to
it again, the stale dirty data on cache device will be written to backing
device, and overwrite latest directly written data. This situation causes
a quite data corruption.
But we cannot simply stop all attached bcache devices when the cache set is
broken or disconnected. For example, use bcache to accelerate performance
of an email service. In such workload, if cache device is broken but no
dirty data lost, keep the bcache device alive and permit email service
continue to access user data might be a better solution for the cache
device failure.
Nix <nix@esperi.org.uk> points out the issue and provides the above example
to explain why it might be necessary to not stop bcache device for broken
cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
suggestion to provide "always" and "auto" options to per-cached device
sysfs file stop_when_cache_set_failed. If cache set is retiring and the
backing device has no dirty data on cache, it should be safe to keep the
bcache device alive. In this case, if stop_when_cache_set_failed is set to
"auto", the device failure handling code will not stop this bcache device
and permit application to access the backing device with a unattached
bcache device.
Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
"auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
(always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:18 +08:00
|
|
|
d->disk->disk_name);
|
2019-02-09 12:52:57 +08:00
|
|
|
/*
|
|
|
|
* There might be a small time gap that cache set is
|
|
|
|
* released but bcache device is not. Inside this time
|
|
|
|
* gap, regular I/O requests will directly go into
|
|
|
|
* backing device as no cache set attached to. This
|
|
|
|
* behavior may also introduce potential inconsistence
|
|
|
|
* data in writeback mode while cache is dirty.
|
|
|
|
* Therefore before calling bcache_device_stop() due
|
|
|
|
* to a broken cache device, dc->io_disable should be
|
|
|
|
* explicitly set to true.
|
|
|
|
*/
|
|
|
|
dc->io_disable = true;
|
|
|
|
/* make others know io_disable is true earlier */
|
|
|
|
smp_mb();
|
|
|
|
bcache_device_stop(d);
|
bcache: add stop_when_cache_set_failed option to backing device
When there are too many I/O errors on cache device, current bcache code
will retire the whole cache set, and detach all bcache devices. But the
detached bcache devices are not stopped, which is problematic when bcache
is in writeback mode.
If the retired cache set has dirty data of backing devices, continue
writing to bcache device will write to backing device directly. If the
LBA of write request has a dirty version cached on cache device, next time
when the cache device is re-registered and backing device re-attached to
it again, the stale dirty data on cache device will be written to backing
device, and overwrite latest directly written data. This situation causes
a quite data corruption.
But we cannot simply stop all attached bcache devices when the cache set is
broken or disconnected. For example, use bcache to accelerate performance
of an email service. In such workload, if cache device is broken but no
dirty data lost, keep the bcache device alive and permit email service
continue to access user data might be a better solution for the cache
device failure.
Nix <nix@esperi.org.uk> points out the issue and provides the above example
to explain why it might be necessary to not stop bcache device for broken
cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
suggestion to provide "always" and "auto" options to per-cached device
sysfs file stop_when_cache_set_failed. If cache set is retiring and the
backing device has no dirty data on cache, it should be safe to keep the
bcache device alive. In this case, if stop_when_cache_set_failed is set to
"auto", the device failure handling code will not stop this bcache device
and permit application to access the backing device with a unattached
bcache device.
Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
"auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
(always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:18 +08:00
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* dc->stop_when_cache_set_failed == BCH_CACHED_STOP_AUTO
|
|
|
|
* and dc->has_dirty == 0
|
|
|
|
*/
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n",
|
bcache: add stop_when_cache_set_failed option to backing device
When there are too many I/O errors on cache device, current bcache code
will retire the whole cache set, and detach all bcache devices. But the
detached bcache devices are not stopped, which is problematic when bcache
is in writeback mode.
If the retired cache set has dirty data of backing devices, continue
writing to bcache device will write to backing device directly. If the
LBA of write request has a dirty version cached on cache device, next time
when the cache device is re-registered and backing device re-attached to
it again, the stale dirty data on cache device will be written to backing
device, and overwrite latest directly written data. This situation causes
a quite data corruption.
But we cannot simply stop all attached bcache devices when the cache set is
broken or disconnected. For example, use bcache to accelerate performance
of an email service. In such workload, if cache device is broken but no
dirty data lost, keep the bcache device alive and permit email service
continue to access user data might be a better solution for the cache
device failure.
Nix <nix@esperi.org.uk> points out the issue and provides the above example
to explain why it might be necessary to not stop bcache device for broken
cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
suggestion to provide "always" and "auto" options to per-cached device
sysfs file stop_when_cache_set_failed. If cache set is retiring and the
backing device has no dirty data on cache, it should be safe to keep the
bcache device alive. In this case, if stop_when_cache_set_failed is set to
"auto", the device failure handling code will not stop this bcache device
and permit application to access the backing device with a unattached
bcache device.
Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
"auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
(always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:18 +08:00
|
|
|
d->disk->disk_name);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
static void __cache_set_unregister(struct closure *cl)
|
|
|
|
{
|
|
|
|
struct cache_set *c = container_of(cl, struct cache_set, caching);
|
2013-07-11 12:03:25 +08:00
|
|
|
struct cached_dev *dc;
|
bcache: add stop_when_cache_set_failed option to backing device
When there are too many I/O errors on cache device, current bcache code
will retire the whole cache set, and detach all bcache devices. But the
detached bcache devices are not stopped, which is problematic when bcache
is in writeback mode.
If the retired cache set has dirty data of backing devices, continue
writing to bcache device will write to backing device directly. If the
LBA of write request has a dirty version cached on cache device, next time
when the cache device is re-registered and backing device re-attached to
it again, the stale dirty data on cache device will be written to backing
device, and overwrite latest directly written data. This situation causes
a quite data corruption.
But we cannot simply stop all attached bcache devices when the cache set is
broken or disconnected. For example, use bcache to accelerate performance
of an email service. In such workload, if cache device is broken but no
dirty data lost, keep the bcache device alive and permit email service
continue to access user data might be a better solution for the cache
device failure.
Nix <nix@esperi.org.uk> points out the issue and provides the above example
to explain why it might be necessary to not stop bcache device for broken
cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
suggestion to provide "always" and "auto" options to per-cached device
sysfs file stop_when_cache_set_failed. If cache set is retiring and the
backing device has no dirty data on cache, it should be safe to keep the
bcache device alive. In this case, if stop_when_cache_set_failed is set to
"auto", the device failure handling code will not stop this bcache device
and permit application to access the backing device with a unattached
bcache device.
Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
"auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
(always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:18 +08:00
|
|
|
struct bcache_device *d;
|
2013-03-24 07:11:31 +08:00
|
|
|
size_t i;
|
|
|
|
|
|
|
|
mutex_lock(&bch_register_lock);
|
|
|
|
|
bcache: add stop_when_cache_set_failed option to backing device
When there are too many I/O errors on cache device, current bcache code
will retire the whole cache set, and detach all bcache devices. But the
detached bcache devices are not stopped, which is problematic when bcache
is in writeback mode.
If the retired cache set has dirty data of backing devices, continue
writing to bcache device will write to backing device directly. If the
LBA of write request has a dirty version cached on cache device, next time
when the cache device is re-registered and backing device re-attached to
it again, the stale dirty data on cache device will be written to backing
device, and overwrite latest directly written data. This situation causes
a quite data corruption.
But we cannot simply stop all attached bcache devices when the cache set is
broken or disconnected. For example, use bcache to accelerate performance
of an email service. In such workload, if cache device is broken but no
dirty data lost, keep the bcache device alive and permit email service
continue to access user data might be a better solution for the cache
device failure.
Nix <nix@esperi.org.uk> points out the issue and provides the above example
to explain why it might be necessary to not stop bcache device for broken
cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
suggestion to provide "always" and "auto" options to per-cached device
sysfs file stop_when_cache_set_failed. If cache set is retiring and the
backing device has no dirty data on cache, it should be safe to keep the
bcache device alive. In this case, if stop_when_cache_set_failed is set to
"auto", the device failure handling code will not stop this bcache device
and permit application to access the backing device with a unattached
bcache device.
Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
"auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
(always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:18 +08:00
|
|
|
for (i = 0; i < c->devices_max_used; i++) {
|
|
|
|
d = c->devices[i];
|
|
|
|
if (!d)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
|
|
|
|
test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
|
|
|
|
dc = container_of(d, struct cached_dev, disk);
|
|
|
|
bch_cached_dev_detach(dc);
|
|
|
|
if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
|
|
|
|
conditional_stop_bcache_device(c, d, dc);
|
|
|
|
} else {
|
|
|
|
bcache_device_stop(d);
|
2013-07-11 12:03:25 +08:00
|
|
|
}
|
bcache: add stop_when_cache_set_failed option to backing device
When there are too many I/O errors on cache device, current bcache code
will retire the whole cache set, and detach all bcache devices. But the
detached bcache devices are not stopped, which is problematic when bcache
is in writeback mode.
If the retired cache set has dirty data of backing devices, continue
writing to bcache device will write to backing device directly. If the
LBA of write request has a dirty version cached on cache device, next time
when the cache device is re-registered and backing device re-attached to
it again, the stale dirty data on cache device will be written to backing
device, and overwrite latest directly written data. This situation causes
a quite data corruption.
But we cannot simply stop all attached bcache devices when the cache set is
broken or disconnected. For example, use bcache to accelerate performance
of an email service. In such workload, if cache device is broken but no
dirty data lost, keep the bcache device alive and permit email service
continue to access user data might be a better solution for the cache
device failure.
Nix <nix@esperi.org.uk> points out the issue and provides the above example
to explain why it might be necessary to not stop bcache device for broken
cache device. Pavel Goran <via-bcache@pvgoran.name> provides a brilliant
suggestion to provide "always" and "auto" options to per-cached device
sysfs file stop_when_cache_set_failed. If cache set is retiring and the
backing device has no dirty data on cache, it should be safe to keep the
bcache device alive. In this case, if stop_when_cache_set_failed is set to
"auto", the device failure handling code will not stop this bcache device
and permit application to access the backing device with a unattached
bcache device.
Changelog:
[mlyle: edited to not break string constants across lines]
v3: fix typos pointed out by Nix.
v2: change option values of stop_when_cache_set_failed from 1/0 to
"auto"/"always".
v1: initial version, stop_when_cache_set_failed can be 0 (not stop) or 1
(always stop).
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Michael Lyle <mlyle@lyle.org>
Cc: Nix <nix@esperi.org.uk>
Cc: Pavel Goran <via-bcache@pvgoran.name>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:18 +08:00
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
|
|
|
|
continue_at(cl, cache_set_flush, system_wq);
|
|
|
|
}
|
|
|
|
|
|
|
|
void bch_cache_set_stop(struct cache_set *c)
|
|
|
|
{
|
|
|
|
if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
|
2019-04-25 00:48:39 +08:00
|
|
|
/* closure_fn set to __cache_set_unregister() */
|
2013-03-24 07:11:31 +08:00
|
|
|
closure_queue(&c->caching);
|
|
|
|
}
|
|
|
|
|
|
|
|
void bch_cache_set_unregister(struct cache_set *c)
|
|
|
|
{
|
|
|
|
set_bit(CACHE_SET_UNREGISTERING, &c->flags);
|
|
|
|
bch_cache_set_stop(c);
|
|
|
|
}
|
|
|
|
|
bcache: introduce meta_bucket_pages() related helper routines
Currently the in-memory meta data like c->uuids or c->disk_buckets
are allocated by alloc_bucket_pages(). The macro alloc_bucket_pages()
calls __get_free_pages() to allocated continuous pages with order
indicated by ilog2(bucket_pages(c)),
#define alloc_bucket_pages(gfp, c) \
((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
The maximum order is defined as MAX_ORDER, the default value is 11 (and
can be overwritten by CONFIG_FORCE_MAX_ZONEORDER). In bcache code the
maximum bucket size width is 16bits, this is restricted both by KEY_SIZE
size and bucket_size size from struct cache_sb_disk. The maximum 16bits
width and power-of-2 value is (1<<15) in unit of sector (512byte). It
means the maximum value of bucket size in bytes is (1<<24) bytes a.k.a
4096 pages.
When the bucket size is set to maximum permitted value, ilog2(4096) is
12, which exceeds the default maximum order __get_free_pages() can
accepted, the failed pages allocation will fail cache set registration
procedure and print a kernel oops message for the exceeded pages order.
This patch introduces meta_bucket_pages(), meta_bucket_bytes(), and
alloc_bucket_pages() helper routines. meta_bucket_pages() indicates the
maximum pages can be allocated to meta data bucket, meta_bucket_bytes()
indicates the according maximum bytes, and alloc_bucket_pages() does
the pages allocation for meta bucket. Because meta_bucket_pages()
chooses the smaller value among the bucket size and MAX_ORDER_NR_PAGES,
it still works when MAX_ORDER overwritten by CONFIG_FORCE_MAX_ZONEORDER.
Following patches will use these helper routines to decide maximum pages
can be allocated for different meta data buckets. If the bucket size is
larger than meta_bucket_bytes(), the bcache registration can continue to
success, just the space more than meta_bucket_bytes() inside the bucket
is wasted. Comparing bcache failed for large bucket size, wasting some
space for meta data buckets is acceptable at this moment.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-07-25 20:00:30 +08:00
|
|
|
#define alloc_meta_bucket_pages(gfp, sb) \
|
|
|
|
((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
|
|
|
|
{
|
|
|
|
int iter_size;
|
2020-10-01 14:50:56 +08:00
|
|
|
struct cache *ca = container_of(sb, struct cache, sb);
|
2013-03-24 07:11:31 +08:00
|
|
|
struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
if (!c)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
__module_get(THIS_MODULE);
|
|
|
|
closure_init(&c->cl, NULL);
|
|
|
|
set_closure_fn(&c->cl, cache_set_free, system_wq);
|
|
|
|
|
|
|
|
closure_init(&c->caching, &c->cl);
|
|
|
|
set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
|
|
|
|
|
|
|
|
/* Maybe create continue_at_noreturn() and use it here? */
|
|
|
|
closure_set_stopped(&c->cl);
|
|
|
|
closure_put(&c->cl);
|
|
|
|
|
|
|
|
kobject_init(&c->kobj, &bch_cache_set_ktype);
|
|
|
|
kobject_init(&c->internal, &bch_cache_set_internal_ktype);
|
|
|
|
|
|
|
|
bch_cache_accounting_init(&c->accounting, &c->cl);
|
|
|
|
|
2020-10-01 14:50:48 +08:00
|
|
|
memcpy(c->set_uuid, sb->set_uuid, 16);
|
2020-07-25 20:00:27 +08:00
|
|
|
|
2020-10-01 14:50:56 +08:00
|
|
|
c->cache = ca;
|
|
|
|
c->cache->set = c;
|
2013-03-24 07:11:31 +08:00
|
|
|
c->bucket_bits = ilog2(sb->bucket_size);
|
|
|
|
c->block_bits = ilog2(sb->block_size);
|
2020-10-01 14:50:56 +08:00
|
|
|
c->nr_uuids = meta_bucket_bytes(sb) / sizeof(struct uuid_entry);
|
2018-01-09 04:21:28 +08:00
|
|
|
c->devices_max_used = 0;
|
2018-08-09 15:48:49 +08:00
|
|
|
atomic_set(&c->attached_dev_nr, 0);
|
2020-10-01 14:50:56 +08:00
|
|
|
c->btree_pages = meta_bucket_pages(sb);
|
2013-03-24 07:11:31 +08:00
|
|
|
if (c->btree_pages > BTREE_MAX_PAGES)
|
|
|
|
c->btree_pages = max_t(int, c->btree_pages / 4,
|
|
|
|
BTREE_MAX_PAGES);
|
|
|
|
|
2013-12-17 07:27:25 +08:00
|
|
|
sema_init(&c->sb_write_mutex, 1);
|
2013-07-25 08:27:07 +08:00
|
|
|
mutex_init(&c->bucket_lock);
|
2014-03-18 08:15:53 +08:00
|
|
|
init_waitqueue_head(&c->btree_cache_wait);
|
2019-11-13 16:03:16 +08:00
|
|
|
spin_lock_init(&c->btree_cannibalize_lock);
|
2013-07-25 08:29:09 +08:00
|
|
|
init_waitqueue_head(&c->bucket_wait);
|
2016-10-27 11:31:17 +08:00
|
|
|
init_waitqueue_head(&c->gc_wait);
|
2013-12-17 07:27:25 +08:00
|
|
|
sema_init(&c->uuid_write_mutex, 1);
|
2013-07-31 15:03:54 +08:00
|
|
|
|
|
|
|
spin_lock_init(&c->btree_gc_time.lock);
|
|
|
|
spin_lock_init(&c->btree_split_time.lock);
|
|
|
|
spin_lock_init(&c->btree_read_time.lock);
|
2013-07-25 08:27:07 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
bch_moving_init_cache_set(c);
|
|
|
|
|
|
|
|
INIT_LIST_HEAD(&c->list);
|
|
|
|
INIT_LIST_HEAD(&c->cached_devs);
|
|
|
|
INIT_LIST_HEAD(&c->btree_cache);
|
|
|
|
INIT_LIST_HEAD(&c->btree_cache_freeable);
|
|
|
|
INIT_LIST_HEAD(&c->btree_cache_freed);
|
|
|
|
INIT_LIST_HEAD(&c->data_buckets);
|
|
|
|
|
bcache: avoid extra memory allocation from mempool c->fill_iter
Mempool c->fill_iter is used to allocate memory for struct btree_iter in
bch_btree_node_read_done() to iterate all keys of a read-in btree node.
The allocation size is defined in bch_cache_set_alloc() by,
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
where iter_size is defined by a calculation,
(sb->bucket_size / sb->block_size + 1) * sizeof(struct btree_iter_set)
For 16bit width bucket_size the calculation is OK, but now the bucket
size is extended to 32bit, the bucket size can be 2GB. By the above
calculation, iter_size can be 2048 pages (order 11 is still accepted by
buddy allocator).
But the actual size holds the bkeys in meta data bucket is limited to
meta_bucket_pages() already, which is 16MB. By the above calculation,
if replace sb->bucket_size by meta_bucket_pages() * PAGE_SECTORS, the
result is 16 pages. This is the size large enough for the mempool
allocation to struct btree_iter.
Therefore in worst case every time mempool c->fill_iter allocates, at
most 4080 pages are wasted and won't be used. Therefore this patch uses
meta_bucket_pages() * PAGE_SECTORS to calculate the iter size in
bch_cache_set_alloc(), to avoid extra memory allocation from mempool
c->fill_iter.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-07-25 20:00:37 +08:00
|
|
|
iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
|
2013-03-24 07:11:31 +08:00
|
|
|
sizeof(struct btree_iter_set);
|
|
|
|
|
2020-07-25 20:00:25 +08:00
|
|
|
c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
|
|
|
|
if (!c->devices)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
|
|
|
|
sizeof(struct bbio) +
|
2020-10-01 14:50:56 +08:00
|
|
|
sizeof(struct bio_vec) * meta_bucket_pages(sb)))
|
2020-07-25 20:00:25 +08:00
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
|
|
|
|
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
|
|
|
|
goto err;
|
|
|
|
|
2020-10-01 14:50:56 +08:00
|
|
|
c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
|
2020-07-25 20:00:25 +08:00
|
|
|
if (!c->uuids)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
|
|
|
|
if (!c->moving_gc_wq)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (bch_journal_alloc(c))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (bch_btree_cache_alloc(c))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (bch_open_buckets_alloc(c))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
|
2013-03-24 07:11:31 +08:00
|
|
|
goto err;
|
|
|
|
|
|
|
|
c->congested_read_threshold_us = 2000;
|
|
|
|
c->congested_write_threshold_us = 20000;
|
bcache: set error_limit correctly
Struct cache uses io_errors for two purposes,
- Error decay: when cache set error_decay is set, io_errors is used to
generate a small piece of delay when I/O error happens.
- I/O errors counter: in order to generate big enough value for error
decay, I/O errors counter value is stored by left shifting 20 bits (a.k.a
IO_ERROR_SHIFT).
In function bch_count_io_errors(), if I/O errors counter reaches cache set
error limit, bch_cache_set_error() will be called to retire the whold cache
set. But current code is problematic when checking the error limit, see the
following code piece from bch_count_io_errors(),
90 if (error) {
91 char buf[BDEVNAME_SIZE];
92 unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT,
93 &ca->io_errors);
94 errors >>= IO_ERROR_SHIFT;
95
96 if (errors < ca->set->error_limit)
97 pr_err("%s: IO error on %s, recovering",
98 bdevname(ca->bdev, buf), m);
99 else
100 bch_cache_set_error(ca->set,
101 "%s: too many IO errors %s",
102 bdevname(ca->bdev, buf), m);
103 }
At line 94, errors is right shifting IO_ERROR_SHIFT bits, now it is real
errors counter to compare at line 96. But ca->set->error_limit is initia-
lized with an amplified value in bch_cache_set_alloc(),
1545 c->error_limit = 8 << IO_ERROR_SHIFT;
It means by default, in bch_count_io_errors(), before 8<<20 errors happened
bch_cache_set_error() won't be called to retire the problematic cache
device. If the average request size is 64KB, it means bcache won't handle
failed device until 512GB data is requested. This is too large to be an I/O
threashold. So I believe the correct error limit should be much less.
This patch sets default cache set error limit to 8, then in
bch_count_io_errors() when errors counter reaches 8 (if it is default
value), function bch_cache_set_error() will be called to retire the whole
cache set. This patch also removes bits shifting when store or show
io_error_limit value via sysfs interface.
Nowadays most of SSDs handle internal flash failure automatically by LBA
address re-indirect mapping. If an I/O error can be observed by upper layer
code, it will be a notable error because that SSD can not re-indirect
map the problematic LBA address to an available flash block. This situation
indicates the whole SSD will be failed very soon. Therefore setting 8 as
the default io error limit value makes sense, it is enough for most of
cache devices.
Changelog:
v2: add reviewed-by from Hannes.
v1: initial version for review.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-08 03:41:42 +08:00
|
|
|
c->error_limit = DEFAULT_IO_ERROR_LIMIT;
|
2019-11-13 16:03:23 +08:00
|
|
|
c->idle_max_writeback_rate_enabled = 1;
|
bcache: add CACHE_SET_IO_DISABLE to struct cache_set flags
When too many I/Os failed on cache device, bch_cache_set_error() is called
in the error handling code path to retire whole problematic cache set. If
new I/O requests continue to come and take refcount dc->count, the cache
set won't be retired immediately, this is a problem.
Further more, there are several kernel thread and self-armed kernel work
may still running after bch_cache_set_error() is called. It needs to wait
quite a while for them to stop, or they won't stop at all. They also
prevent the cache set from being retired.
The solution in this patch is, to add per cache set flag to disable I/O
request on this cache and all attached backing devices. Then new coming I/O
requests can be rejected in *_make_request() before taking refcount, kernel
threads and self-armed kernel worker can stop very fast when flags bit
CACHE_SET_IO_DISABLE is set.
Because bcache also do internal I/Os for writeback, garbage collection,
bucket allocation, journaling, this kind of I/O should be disabled after
bch_cache_set_error() is called. So closure_bio_submit() is modified to
check whether CACHE_SET_IO_DISABLE is set on cache_set->flags. If set,
closure_bio_submit() will set bio->bi_status to BLK_STS_IOERR and
return, generic_make_request() won't be called.
A sysfs interface is also added to set or clear CACHE_SET_IO_DISABLE bit
from cache_set->flags, to disable or enable cache set I/O for debugging. It
is helpful to trigger more corner case issues for failed cache device.
Changelog
v4, add wait_for_kthread_stop(), and call it before exits writeback and gc
kernel threads.
v3, change CACHE_SET_IO_DISABLE from 4 to 3, since it is bit index.
remove "bcache: " prefix when printing out kernel message.
v2, more changes by previous review,
- Use CACHE_SET_IO_DISABLE of cache_set->flags, suggested by Junhui.
- Check CACHE_SET_IO_DISABLE in bch_btree_gc() to stop a while-loop, this
is reported and inspired from origal patch of Pavel Vazharov.
v1, initial version.
Signed-off-by: Coly Li <colyli@suse.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Cc: Junhui Tang <tang.junhui@zte.com.cn>
Cc: Michael Lyle <mlyle@lyle.org>
Cc: Pavel Vazharov <freakpv@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-03-19 08:36:17 +08:00
|
|
|
WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
return c;
|
|
|
|
err:
|
|
|
|
bch_cache_set_unregister(c);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2019-04-25 00:48:34 +08:00
|
|
|
static int run_cache_set(struct cache_set *c)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
const char *err = "cannot allocate memory";
|
|
|
|
struct cached_dev *dc, *t;
|
2020-10-01 14:50:47 +08:00
|
|
|
struct cache *ca = c->cache;
|
2013-07-25 08:44:17 +08:00
|
|
|
struct closure cl;
|
2019-04-25 00:48:43 +08:00
|
|
|
LIST_HEAD(journal);
|
|
|
|
struct journal_replay *l;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-07-25 08:44:17 +08:00
|
|
|
closure_init_stack(&cl);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
c->nbuckets = ca->sb.nbuckets;
|
2016-10-27 11:31:17 +08:00
|
|
|
set_gc_sectors(c);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:55 +08:00
|
|
|
if (CACHE_SYNC(&c->cache->sb)) {
|
2013-03-24 07:11:31 +08:00
|
|
|
struct bkey *k;
|
|
|
|
struct jset *j;
|
|
|
|
|
|
|
|
err = "cannot allocate memory for journal";
|
2013-07-25 08:44:17 +08:00
|
|
|
if (bch_journal_read(c, &journal))
|
2013-03-24 07:11:31 +08:00
|
|
|
goto err;
|
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_debug("btree_journal_read() done\n");
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
err = "no journal entries found";
|
|
|
|
if (list_empty(&journal))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
j = &list_entry(journal.prev, struct journal_replay, list)->j;
|
|
|
|
|
|
|
|
err = "IO error reading priorities";
|
2020-10-01 14:50:47 +08:00
|
|
|
if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
|
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If prio_read() fails it'll call cache_set_error and we'll
|
|
|
|
* tear everything down right away, but if we perhaps checked
|
|
|
|
* sooner we could avoid journal replay.
|
|
|
|
*/
|
|
|
|
|
|
|
|
k = &j->btree_root;
|
|
|
|
|
|
|
|
err = "bad btree root";
|
2013-12-21 09:22:05 +08:00
|
|
|
if (__bch_btree_ptr_invalid(c, k))
|
2013-03-24 07:11:31 +08:00
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "error reading btree root";
|
2018-08-11 13:19:47 +08:00
|
|
|
c->root = bch_btree_node_get(c, NULL, k,
|
|
|
|
j->btree_level,
|
|
|
|
true, NULL);
|
2013-03-24 07:11:31 +08:00
|
|
|
if (IS_ERR_OR_NULL(c->root))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
list_del_init(&c->root->list);
|
|
|
|
rw_unlock(true, c->root);
|
|
|
|
|
2013-07-25 08:44:17 +08:00
|
|
|
err = uuid_read(c, j, &cl);
|
2013-03-24 07:11:31 +08:00
|
|
|
if (err)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = "error in recovery";
|
2013-07-25 08:44:17 +08:00
|
|
|
if (bch_btree_check(c))
|
2013-03-24 07:11:31 +08:00
|
|
|
goto err;
|
|
|
|
|
|
|
|
bch_journal_mark(c, &journal);
|
2014-03-18 07:55:55 +08:00
|
|
|
bch_initial_gc_finish(c);
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_debug("btree_check() done\n");
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* bcache_journal_next() can't happen sooner, or
|
|
|
|
* btree_gc_finish() will give spurious errors about last_gc >
|
|
|
|
* gc_gen - this is a hack but oh well.
|
|
|
|
*/
|
|
|
|
bch_journal_next(&c->journal);
|
|
|
|
|
2013-04-25 10:01:12 +08:00
|
|
|
err = "error starting allocator thread";
|
2020-10-01 14:50:47 +08:00
|
|
|
if (bch_cache_allocator_start(ca))
|
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* First place it's safe to allocate: btree_check() and
|
|
|
|
* btree_gc_finish() have to run before we have buckets to
|
|
|
|
* allocate, and bch_bucket_alloc_set() might cause a journal
|
|
|
|
* entry to be written so bcache_journal_next() has to be called
|
|
|
|
* first.
|
|
|
|
*
|
|
|
|
* If the uuids were in the old format we have to rewrite them
|
|
|
|
* before the next journal entry is written:
|
|
|
|
*/
|
|
|
|
if (j->version < BCACHE_JSET_VERSION_UUID)
|
|
|
|
__uuid_write(c);
|
|
|
|
|
2019-04-25 00:48:34 +08:00
|
|
|
err = "bcache: replay journal failed";
|
|
|
|
if (bch_journal_replay(c, &journal))
|
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
} else {
|
2020-10-01 14:50:47 +08:00
|
|
|
unsigned int j;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
pr_notice("invalidating existing data\n");
|
|
|
|
ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
|
|
|
|
2, SB_JOURNAL_BUCKETS);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
for (j = 0; j < ca->sb.keys; j++)
|
|
|
|
ca->sb.d[j] = ca->sb.first_bucket + j;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2014-03-18 07:55:55 +08:00
|
|
|
bch_initial_gc_finish(c);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2013-04-25 10:01:12 +08:00
|
|
|
err = "error starting allocator thread";
|
2020-10-01 14:50:47 +08:00
|
|
|
if (bch_cache_allocator_start(ca))
|
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
mutex_lock(&c->bucket_lock);
|
2020-10-01 14:50:47 +08:00
|
|
|
bch_prio_write(ca, true);
|
2013-03-24 07:11:31 +08:00
|
|
|
mutex_unlock(&c->bucket_lock);
|
|
|
|
|
|
|
|
err = "cannot allocate new UUID bucket";
|
|
|
|
if (__uuid_write(c))
|
2013-10-25 08:19:26 +08:00
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
err = "cannot allocate new btree root";
|
2014-07-12 15:22:53 +08:00
|
|
|
c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
|
2013-03-24 07:11:31 +08:00
|
|
|
if (IS_ERR_OR_NULL(c->root))
|
2013-10-25 08:19:26 +08:00
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2014-03-05 08:42:42 +08:00
|
|
|
mutex_lock(&c->root->write_lock);
|
2013-03-24 07:11:31 +08:00
|
|
|
bkey_copy_key(&c->root->key, &MAX_KEY);
|
2013-07-25 08:44:17 +08:00
|
|
|
bch_btree_node_write(c->root, &cl);
|
2014-03-05 08:42:42 +08:00
|
|
|
mutex_unlock(&c->root->write_lock);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
bch_btree_set_root(c->root);
|
|
|
|
rw_unlock(true, c->root);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We don't want to write the first journal entry until
|
|
|
|
* everything is set up - fortunately journal entries won't be
|
|
|
|
* written until the SET_CACHE_SYNC() here:
|
|
|
|
*/
|
2020-10-01 14:50:55 +08:00
|
|
|
SET_CACHE_SYNC(&c->cache->sb, true);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
bch_journal_next(&c->journal);
|
2013-07-25 08:44:17 +08:00
|
|
|
bch_journal_meta(c, &cl);
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
2013-10-25 08:19:26 +08:00
|
|
|
err = "error starting gc thread";
|
|
|
|
if (bch_gc_thread_start(c))
|
|
|
|
goto err;
|
|
|
|
|
2013-07-25 08:44:17 +08:00
|
|
|
closure_sync(&cl);
|
2020-10-01 14:50:56 +08:00
|
|
|
c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
|
2013-03-24 07:11:31 +08:00
|
|
|
bcache_write_super(c);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(dc, t, &uncached_devices, list)
|
bcache: fix for data collapse after re-attaching an attached device
back-end device sdm has already attached a cache_set with ID
f67ebe1f-f8bc-4d73-bfe5-9dc88607f119, then try to attach with
another cache set, and it returns with an error:
[root]# cd /sys/block/sdm/bcache
[root]# echo 5ccd0a63-148e-48b8-afa2-aca9cbd6279f > attach
-bash: echo: write error: Invalid argument
After that, execute a command to modify the label of bcache
device:
[root]# echo data_disk1 > label
Then we reboot the system, when the system power on, the back-end
device can not attach to cache_set, a messages show in the log:
Feb 5 12:05:52 ceph152 kernel: [922385.508498] bcache:
bch_cached_dev_attach() couldn't find uuid for sdm in set
In sysfs_attach(), dc->sb.set_uuid was assigned to the value
which input through sysfs, no matter whether it is success
or not in bch_cached_dev_attach(). For example, If the back-end
device has already attached to an cache set, bch_cached_dev_attach()
would fail, but dc->sb.set_uuid was changed. Then modify the
label of bcache device, it will call bch_write_bdev_super(),
which would write the dc->sb.set_uuid to the super block, so we
record a wrong cache set ID in the super block, after the system
reboot, the cache set couldn't find the uuid of the back-end
device, so the bcache device couldn't exist and use any more.
In this patch, we don't assigned cache set ID to dc->sb.set_uuid
in sysfs_attach() directly, but input it into bch_cached_dev_attach(),
and assigned dc->sb.set_uuid to the cache set ID after the back-end
device attached to the cache set successful.
Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-08 03:41:46 +08:00
|
|
|
bch_cached_dev_attach(dc, c, NULL);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
flash_devs_run(c);
|
|
|
|
|
2014-07-12 03:17:41 +08:00
|
|
|
set_bit(CACHE_SET_RUNNING, &c->flags);
|
2019-04-25 00:48:34 +08:00
|
|
|
return 0;
|
2013-03-24 07:11:31 +08:00
|
|
|
err:
|
2019-04-25 00:48:43 +08:00
|
|
|
while (!list_empty(&journal)) {
|
|
|
|
l = list_first_entry(&journal, struct journal_replay, list);
|
|
|
|
list_del(&l->list);
|
|
|
|
kfree(l);
|
|
|
|
}
|
|
|
|
|
2013-07-25 08:44:17 +08:00
|
|
|
closure_sync(&cl);
|
2019-06-28 19:59:40 +08:00
|
|
|
|
2013-09-11 12:41:34 +08:00
|
|
|
bch_cache_set_error(c, "%s", err);
|
2019-04-25 00:48:34 +08:00
|
|
|
|
|
|
|
return -EIO;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static const char *register_cache_set(struct cache *ca)
|
|
|
|
{
|
|
|
|
char buf[12];
|
|
|
|
const char *err = "cannot allocate memory";
|
|
|
|
struct cache_set *c;
|
|
|
|
|
|
|
|
list_for_each_entry(c, &bch_cache_sets, list)
|
2020-10-01 14:50:48 +08:00
|
|
|
if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) {
|
2020-10-01 14:50:46 +08:00
|
|
|
if (c->cache)
|
2013-03-24 07:11:31 +08:00
|
|
|
return "duplicate cache set member";
|
|
|
|
|
|
|
|
goto found;
|
|
|
|
}
|
|
|
|
|
|
|
|
c = bch_cache_set_alloc(&ca->sb);
|
|
|
|
if (!c)
|
|
|
|
return err;
|
|
|
|
|
|
|
|
err = "error creating kobject";
|
2020-10-01 14:50:48 +08:00
|
|
|
if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) ||
|
2013-03-24 07:11:31 +08:00
|
|
|
kobject_add(&c->internal, &c->kobj, "internal"))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
bch_debug_init_cache_set(c);
|
|
|
|
|
|
|
|
list_add(&c->list, &bch_cache_sets);
|
|
|
|
found:
|
|
|
|
sprintf(buf, "cache%i", ca->sb.nr_this_dev);
|
|
|
|
if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
|
|
|
|
sysfs_create_link(&c->kobj, &ca->kobj, buf))
|
|
|
|
goto err;
|
|
|
|
|
2014-06-12 10:44:49 +08:00
|
|
|
kobject_get(&ca->kobj);
|
2013-03-24 07:11:31 +08:00
|
|
|
ca->set = c;
|
2020-10-01 14:50:46 +08:00
|
|
|
ca->set->cache = ca;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-10-01 14:50:46 +08:00
|
|
|
err = "failed to run cache set";
|
|
|
|
if (run_cache_set(c) < 0)
|
|
|
|
goto err;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
return NULL;
|
|
|
|
err:
|
|
|
|
bch_cache_set_unregister(c);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Cache device */
|
|
|
|
|
2019-04-25 00:48:35 +08:00
|
|
|
/* When ca->kobj released */
|
2013-03-24 07:11:31 +08:00
|
|
|
void bch_cache_release(struct kobject *kobj)
|
|
|
|
{
|
|
|
|
struct cache *ca = container_of(kobj, struct cache, kobj);
|
2018-08-11 13:19:44 +08:00
|
|
|
unsigned int i;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2014-06-20 06:05:59 +08:00
|
|
|
if (ca->set) {
|
2020-10-01 14:50:46 +08:00
|
|
|
BUG_ON(ca->set->cache != ca);
|
|
|
|
ca->set->cache = NULL;
|
2014-06-20 06:05:59 +08:00
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-07-25 20:00:32 +08:00
|
|
|
free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
|
2013-03-24 07:11:31 +08:00
|
|
|
kfree(ca->prio_buckets);
|
|
|
|
vfree(ca->buckets);
|
|
|
|
|
|
|
|
free_heap(&ca->heap);
|
|
|
|
free_fifo(&ca->free_inc);
|
2013-12-17 17:29:34 +08:00
|
|
|
|
|
|
|
for (i = 0; i < RESERVE_NR; i++)
|
|
|
|
free_fifo(&ca->free[i]);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-01-24 01:01:33 +08:00
|
|
|
if (ca->sb_disk)
|
|
|
|
put_page(virt_to_page(ca->sb_disk));
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2014-07-08 04:03:36 +08:00
|
|
|
if (!IS_ERR_OR_NULL(ca->bdev))
|
2013-03-24 07:11:31 +08:00
|
|
|
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
|
|
|
|
|
|
|
|
kfree(ca);
|
|
|
|
module_put(THIS_MODULE);
|
|
|
|
}
|
|
|
|
|
2016-07-04 09:23:25 +08:00
|
|
|
static int cache_alloc(struct cache *ca)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
|
|
|
size_t free;
|
bcache: fix for allocator and register thread race
After long time running of random small IO writing,
I reboot the machine, and after the machine power on,
I found bcache got stuck, the stack is:
[root@ceph153 ~]# cat /proc/2510/task/*/stack
[<ffffffffa06b2455>] closure_sync+0x25/0x90 [bcache]
[<ffffffffa06b6be8>] bch_journal+0x118/0x2b0 [bcache]
[<ffffffffa06b6dc7>] bch_journal_meta+0x47/0x70 [bcache]
[<ffffffffa06be8f7>] bch_prio_write+0x237/0x340 [bcache]
[<ffffffffa06a8018>] bch_allocator_thread+0x3c8/0x3d0 [bcache]
[<ffffffff810a631f>] kthread+0xcf/0xe0
[<ffffffff8164c318>] ret_from_fork+0x58/0x90
[<ffffffffffffffff>] 0xffffffffffffffff
[root@ceph153 ~]# cat /proc/2038/task/*/stack
[<ffffffffa06b1abd>] __bch_btree_map_nodes+0x12d/0x150 [bcache]
[<ffffffffa06b1bd1>] bch_btree_insert+0xf1/0x170 [bcache]
[<ffffffffa06b637f>] bch_journal_replay+0x13f/0x230 [bcache]
[<ffffffffa06c75fe>] run_cache_set+0x79a/0x7c2 [bcache]
[<ffffffffa06c0cf8>] register_bcache+0xd48/0x1310 [bcache]
[<ffffffff812f702f>] kobj_attr_store+0xf/0x20
[<ffffffff8125b216>] sysfs_write_file+0xc6/0x140
[<ffffffff811dfbfd>] vfs_write+0xbd/0x1e0
[<ffffffff811e069f>] SyS_write+0x7f/0xe0
[<ffffffff8164c3c9>] system_call_fastpath+0x16/0x1
The stack shows the register thread and allocator thread
were getting stuck when registering cache device.
I reboot the machine several times, the issue always
exsit in this machine.
I debug the code, and found the call trace as bellow:
register_bcache()
==>run_cache_set()
==>bch_journal_replay()
==>bch_btree_insert()
==>__bch_btree_map_nodes()
==>btree_insert_fn()
==>btree_split() //node need split
==>btree_check_reserve()
In btree_check_reserve(), It will check if there is enough buckets
of RESERVE_BTREE type, since allocator thread did not work yet, so
no buckets of RESERVE_BTREE type allocated, so the register thread
waits on c->btree_cache_wait, and goes to sleep.
Then the allocator thread initialized, the call trace is bellow:
bch_allocator_thread()
==>bch_prio_write()
==>bch_journal_meta()
==>bch_journal()
==>journal_wait_for_write()
In journal_wait_for_write(), It will check if journal is full by
journal_full(), but the long time random small IO writing
causes the exhaustion of journal buckets(journal.blocks_free=0),
In order to release the journal buckets,
the allocator calls btree_flush_write() to flush keys to
btree nodes, and waits on c->journal.wait until btree nodes writing
over or there has already some journal buckets space, then the
allocator thread goes to sleep. but in btree_flush_write(), since
bch_journal_replay() is not finished, so no btree nodes have journal
(condition "if (btree_current_write(b)->journal)" never satisfied),
so we got no btree node to flush, no journal bucket released,
and allocator sleep all the times.
Through the above analysis, we can see that:
1) Register thread wait for allocator thread to allocate buckets of
RESERVE_BTREE type;
2) Alloctor thread wait for register thread to replay journal, so it
can flush btree nodes and get journal bucket.
then they are all got stuck by waiting for each other.
Hua Rui provided a patch for me, by allocating some buckets of
RESERVE_BTREE type in advance, so the register thread can get bucket
when btree node splitting and no need to waiting for the allocator
thread. I tested it, it has effect, and register thread run a step
forward, but finally are still got stuck, the reason is only 8 bucket
of RESERVE_BTREE type were allocated, and in bch_journal_replay(),
after 2 btree nodes splitting, only 4 bucket of RESERVE_BTREE type left,
then btree_check_reserve() is not satisfied anymore, so it goes to sleep
again, and in the same time, alloctor thread did not flush enough btree
nodes to release a journal bucket, so they all got stuck again.
So we need to allocate more buckets of RESERVE_BTREE type in advance,
but how much is enough? By experience and test, I think it should be
as much as journal buckets. Then I modify the code as this patch,
and test in the machine, and it works.
This patch modified base on Hua Rui’s patch, and allocate more buckets
of RESERVE_BTREE type in advance to avoid register thread and allocate
thread going to wait for each other.
[patch v2] ca->sb.njournal_buckets would be 0 in the first time after
cache creation, and no journal exists, so just 8 btree buckets is OK.
Signed-off-by: Hua Rui <huarui.dev@gmail.com>
Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-08 03:41:43 +08:00
|
|
|
size_t btree_buckets;
|
2013-03-24 07:11:31 +08:00
|
|
|
struct bucket *b;
|
2018-10-08 20:41:20 +08:00
|
|
|
int ret = -ENOMEM;
|
|
|
|
const char *err = NULL;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
__module_get(THIS_MODULE);
|
|
|
|
kobject_init(&ca->kobj, &bch_cache_ktype);
|
|
|
|
|
2016-11-22 23:57:21 +08:00
|
|
|
bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
bcache: fix for allocator and register thread race
After long time running of random small IO writing,
I reboot the machine, and after the machine power on,
I found bcache got stuck, the stack is:
[root@ceph153 ~]# cat /proc/2510/task/*/stack
[<ffffffffa06b2455>] closure_sync+0x25/0x90 [bcache]
[<ffffffffa06b6be8>] bch_journal+0x118/0x2b0 [bcache]
[<ffffffffa06b6dc7>] bch_journal_meta+0x47/0x70 [bcache]
[<ffffffffa06be8f7>] bch_prio_write+0x237/0x340 [bcache]
[<ffffffffa06a8018>] bch_allocator_thread+0x3c8/0x3d0 [bcache]
[<ffffffff810a631f>] kthread+0xcf/0xe0
[<ffffffff8164c318>] ret_from_fork+0x58/0x90
[<ffffffffffffffff>] 0xffffffffffffffff
[root@ceph153 ~]# cat /proc/2038/task/*/stack
[<ffffffffa06b1abd>] __bch_btree_map_nodes+0x12d/0x150 [bcache]
[<ffffffffa06b1bd1>] bch_btree_insert+0xf1/0x170 [bcache]
[<ffffffffa06b637f>] bch_journal_replay+0x13f/0x230 [bcache]
[<ffffffffa06c75fe>] run_cache_set+0x79a/0x7c2 [bcache]
[<ffffffffa06c0cf8>] register_bcache+0xd48/0x1310 [bcache]
[<ffffffff812f702f>] kobj_attr_store+0xf/0x20
[<ffffffff8125b216>] sysfs_write_file+0xc6/0x140
[<ffffffff811dfbfd>] vfs_write+0xbd/0x1e0
[<ffffffff811e069f>] SyS_write+0x7f/0xe0
[<ffffffff8164c3c9>] system_call_fastpath+0x16/0x1
The stack shows the register thread and allocator thread
were getting stuck when registering cache device.
I reboot the machine several times, the issue always
exsit in this machine.
I debug the code, and found the call trace as bellow:
register_bcache()
==>run_cache_set()
==>bch_journal_replay()
==>bch_btree_insert()
==>__bch_btree_map_nodes()
==>btree_insert_fn()
==>btree_split() //node need split
==>btree_check_reserve()
In btree_check_reserve(), It will check if there is enough buckets
of RESERVE_BTREE type, since allocator thread did not work yet, so
no buckets of RESERVE_BTREE type allocated, so the register thread
waits on c->btree_cache_wait, and goes to sleep.
Then the allocator thread initialized, the call trace is bellow:
bch_allocator_thread()
==>bch_prio_write()
==>bch_journal_meta()
==>bch_journal()
==>journal_wait_for_write()
In journal_wait_for_write(), It will check if journal is full by
journal_full(), but the long time random small IO writing
causes the exhaustion of journal buckets(journal.blocks_free=0),
In order to release the journal buckets,
the allocator calls btree_flush_write() to flush keys to
btree nodes, and waits on c->journal.wait until btree nodes writing
over or there has already some journal buckets space, then the
allocator thread goes to sleep. but in btree_flush_write(), since
bch_journal_replay() is not finished, so no btree nodes have journal
(condition "if (btree_current_write(b)->journal)" never satisfied),
so we got no btree node to flush, no journal bucket released,
and allocator sleep all the times.
Through the above analysis, we can see that:
1) Register thread wait for allocator thread to allocate buckets of
RESERVE_BTREE type;
2) Alloctor thread wait for register thread to replay journal, so it
can flush btree nodes and get journal bucket.
then they are all got stuck by waiting for each other.
Hua Rui provided a patch for me, by allocating some buckets of
RESERVE_BTREE type in advance, so the register thread can get bucket
when btree node splitting and no need to waiting for the allocator
thread. I tested it, it has effect, and register thread run a step
forward, but finally are still got stuck, the reason is only 8 bucket
of RESERVE_BTREE type were allocated, and in bch_journal_replay(),
after 2 btree nodes splitting, only 4 bucket of RESERVE_BTREE type left,
then btree_check_reserve() is not satisfied anymore, so it goes to sleep
again, and in the same time, alloctor thread did not flush enough btree
nodes to release a journal bucket, so they all got stuck again.
So we need to allocate more buckets of RESERVE_BTREE type in advance,
but how much is enough? By experience and test, I think it should be
as much as journal buckets. Then I modify the code as this patch,
and test in the machine, and it works.
This patch modified base on Hua Rui’s patch, and allocate more buckets
of RESERVE_BTREE type in advance to avoid register thread and allocate
thread going to wait for each other.
[patch v2] ca->sb.njournal_buckets would be 0 in the first time after
cache creation, and no journal exists, so just 8 btree buckets is OK.
Signed-off-by: Hua Rui <huarui.dev@gmail.com>
Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Michael Lyle <mlyle@lyle.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2018-02-08 03:41:43 +08:00
|
|
|
/*
|
|
|
|
* when ca->sb.njournal_buckets is not zero, journal exists,
|
|
|
|
* and in bch_journal_replay(), tree node may split,
|
|
|
|
* so bucket of RESERVE_BTREE type is needed,
|
|
|
|
* the worst situation is all journal buckets are valid journal,
|
|
|
|
* and all the keys need to replay,
|
|
|
|
* so the number of RESERVE_BTREE type buckets should be as much
|
|
|
|
* as journal buckets
|
|
|
|
*/
|
|
|
|
btree_buckets = ca->sb.njournal_buckets ?: 8;
|
2013-12-17 17:29:34 +08:00
|
|
|
free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
|
2018-10-08 20:41:21 +08:00
|
|
|
if (!free) {
|
|
|
|
ret = -EPERM;
|
|
|
|
err = "ca->sb.nbuckets is too small";
|
|
|
|
goto err_free;
|
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2018-10-08 20:41:20 +08:00
|
|
|
if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
|
|
|
|
GFP_KERNEL)) {
|
|
|
|
err = "ca->free[RESERVE_BTREE] alloc failed";
|
|
|
|
goto err_btree_alloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
|
|
|
|
GFP_KERNEL)) {
|
|
|
|
err = "ca->free[RESERVE_PRIO] alloc failed";
|
|
|
|
goto err_prio_alloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
|
|
|
|
err = "ca->free[RESERVE_MOVINGGC] alloc failed";
|
|
|
|
goto err_movinggc_alloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
|
|
|
|
err = "ca->free[RESERVE_NONE] alloc failed";
|
|
|
|
goto err_none_alloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
|
|
|
|
err = "ca->free_inc alloc failed";
|
|
|
|
goto err_free_inc_alloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
|
|
|
|
err = "ca->heap alloc failed";
|
|
|
|
goto err_heap_alloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
ca->buckets = vzalloc(array_size(sizeof(struct bucket),
|
|
|
|
ca->sb.nbuckets));
|
|
|
|
if (!ca->buckets) {
|
|
|
|
err = "ca->buckets alloc failed";
|
|
|
|
goto err_buckets_alloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
|
|
|
|
prio_buckets(ca), 2),
|
|
|
|
GFP_KERNEL);
|
|
|
|
if (!ca->prio_buckets) {
|
|
|
|
err = "ca->prio_buckets alloc failed";
|
|
|
|
goto err_prio_buckets_alloc;
|
|
|
|
}
|
|
|
|
|
2020-07-25 20:00:32 +08:00
|
|
|
ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
|
2018-10-08 20:41:20 +08:00
|
|
|
if (!ca->disk_buckets) {
|
|
|
|
err = "ca->disk_buckets alloc failed";
|
|
|
|
goto err_disk_buckets_alloc;
|
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
|
|
|
|
|
|
|
|
for_each_bucket(b, ca)
|
|
|
|
atomic_set(&b->pin, 0);
|
|
|
|
return 0;
|
2018-10-08 20:41:20 +08:00
|
|
|
|
|
|
|
err_disk_buckets_alloc:
|
|
|
|
kfree(ca->prio_buckets);
|
|
|
|
err_prio_buckets_alloc:
|
|
|
|
vfree(ca->buckets);
|
|
|
|
err_buckets_alloc:
|
|
|
|
free_heap(&ca->heap);
|
|
|
|
err_heap_alloc:
|
|
|
|
free_fifo(&ca->free_inc);
|
|
|
|
err_free_inc_alloc:
|
|
|
|
free_fifo(&ca->free[RESERVE_NONE]);
|
|
|
|
err_none_alloc:
|
|
|
|
free_fifo(&ca->free[RESERVE_MOVINGGC]);
|
|
|
|
err_movinggc_alloc:
|
|
|
|
free_fifo(&ca->free[RESERVE_PRIO]);
|
|
|
|
err_prio_alloc:
|
|
|
|
free_fifo(&ca->free[RESERVE_BTREE]);
|
|
|
|
err_btree_alloc:
|
2018-10-08 20:41:21 +08:00
|
|
|
err_free:
|
2018-10-08 20:41:20 +08:00
|
|
|
module_put(THIS_MODULE);
|
|
|
|
if (err)
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_notice("error %s: %s\n", ca->cache_dev_name, err);
|
2018-10-08 20:41:20 +08:00
|
|
|
return ret;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
2020-01-24 01:01:32 +08:00
|
|
|
static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
|
2014-06-20 06:05:59 +08:00
|
|
|
struct block_device *bdev, struct cache *ca)
|
2013-03-24 07:11:31 +08:00
|
|
|
{
|
2016-06-18 06:01:54 +08:00
|
|
|
const char *err = NULL; /* must be set for any error case */
|
2016-02-27 06:33:56 +08:00
|
|
|
int ret = 0;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2018-05-03 18:51:32 +08:00
|
|
|
bdevname(bdev, ca->cache_dev_name);
|
2013-05-15 15:11:26 +08:00
|
|
|
memcpy(&ca->sb, sb, sizeof(struct cache_sb));
|
2013-03-24 07:11:31 +08:00
|
|
|
ca->bdev = bdev;
|
|
|
|
ca->bdev->bd_holder = ca;
|
2020-01-24 01:01:33 +08:00
|
|
|
ca->sb_disk = sb_disk;
|
2013-05-15 15:11:26 +08:00
|
|
|
|
2018-03-06 05:41:54 +08:00
|
|
|
if (blk_queue_discard(bdev_get_queue(bdev)))
|
2013-03-24 07:11:31 +08:00
|
|
|
ca->discard = CACHE_DISCARD(&ca->sb);
|
|
|
|
|
2016-07-04 09:23:25 +08:00
|
|
|
ret = cache_alloc(ca);
|
2016-06-18 06:01:54 +08:00
|
|
|
if (ret != 0) {
|
2019-04-25 00:48:38 +08:00
|
|
|
/*
|
|
|
|
* If we failed here, it means ca->kobj is not initialized yet,
|
|
|
|
* kobject_put() won't be called and there is no chance to
|
|
|
|
* call blkdev_put() to bdev in bch_cache_release(). So we
|
|
|
|
* explicitly call blkdev_put() here.
|
|
|
|
*/
|
2018-03-06 05:41:54 +08:00
|
|
|
blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
|
2016-06-18 06:01:54 +08:00
|
|
|
if (ret == -ENOMEM)
|
|
|
|
err = "cache_alloc(): -ENOMEM";
|
2018-10-08 20:41:21 +08:00
|
|
|
else if (ret == -EPERM)
|
|
|
|
err = "cache_alloc(): cache device is too small";
|
2016-06-18 06:01:54 +08:00
|
|
|
else
|
|
|
|
err = "cache_alloc(): unknown error";
|
2013-05-15 15:11:26 +08:00
|
|
|
goto err;
|
2016-06-18 06:01:54 +08:00
|
|
|
}
|
2013-05-15 15:11:26 +08:00
|
|
|
|
2020-11-17 15:18:55 +08:00
|
|
|
if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) {
|
2016-02-27 06:33:56 +08:00
|
|
|
err = "error calling kobject_add";
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2014-03-18 09:58:55 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
2013-03-24 07:11:31 +08:00
|
|
|
err = register_cache_set(ca);
|
2014-03-18 09:58:55 +08:00
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
|
2016-02-27 06:33:56 +08:00
|
|
|
if (err) {
|
|
|
|
ret = -ENODEV;
|
|
|
|
goto out;
|
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("registered cache device %s\n", ca->cache_dev_name);
|
2016-02-27 06:33:56 +08:00
|
|
|
|
2014-06-12 10:44:49 +08:00
|
|
|
out:
|
|
|
|
kobject_put(&ca->kobj);
|
2016-02-27 06:33:56 +08:00
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
err:
|
2016-02-27 06:33:56 +08:00
|
|
|
if (err)
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_notice("error %s: %s\n", ca->cache_dev_name, err);
|
2016-02-27 06:33:56 +08:00
|
|
|
|
|
|
|
return ret;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Global interfaces/init */
|
|
|
|
|
2018-08-11 13:19:46 +08:00
|
|
|
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
|
|
|
|
const char *buffer, size_t size);
|
2019-06-28 19:59:42 +08:00
|
|
|
static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
|
|
|
|
struct kobj_attribute *attr,
|
|
|
|
const char *buffer, size_t size);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
|
|
|
kobj_attribute_write(register, register_bcache);
|
|
|
|
kobj_attribute_write(register_quiet, register_bcache);
|
2019-06-28 19:59:42 +08:00
|
|
|
kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-11-23 20:38:40 +08:00
|
|
|
static bool bch_is_open_backing(dev_t dev)
|
2018-08-11 13:19:57 +08:00
|
|
|
{
|
2013-05-04 18:19:41 +08:00
|
|
|
struct cache_set *c, *tc;
|
|
|
|
struct cached_dev *dc, *t;
|
|
|
|
|
|
|
|
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
|
|
|
|
list_for_each_entry_safe(dc, t, &c->cached_devs, list)
|
2020-11-23 20:38:40 +08:00
|
|
|
if (dc->bdev->bd_dev == dev)
|
2013-05-04 18:19:41 +08:00
|
|
|
return true;
|
|
|
|
list_for_each_entry_safe(dc, t, &uncached_devices, list)
|
2020-11-23 20:38:40 +08:00
|
|
|
if (dc->bdev->bd_dev == dev)
|
2013-05-04 18:19:41 +08:00
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-11-23 20:38:40 +08:00
|
|
|
static bool bch_is_open_cache(dev_t dev)
|
2018-08-11 13:19:57 +08:00
|
|
|
{
|
2013-05-04 18:19:41 +08:00
|
|
|
struct cache_set *c, *tc;
|
|
|
|
|
2020-10-01 14:50:47 +08:00
|
|
|
list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
|
|
|
|
struct cache *ca = c->cache;
|
|
|
|
|
2020-11-23 20:38:40 +08:00
|
|
|
if (ca->bdev->bd_dev == dev)
|
2020-10-01 14:50:47 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2013-05-04 18:19:41 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-11-23 20:38:40 +08:00
|
|
|
static bool bch_is_open(dev_t dev)
|
2018-08-11 13:19:57 +08:00
|
|
|
{
|
2020-11-23 20:38:40 +08:00
|
|
|
return bch_is_open_cache(dev) || bch_is_open_backing(dev);
|
2013-05-04 18:19:41 +08:00
|
|
|
}
|
|
|
|
|
bcache: asynchronous devices registration
When there is a lot of data cached on cache device, the bcach internal
btree can take a very long to validate during the backing device and
cache device registration. In my test, it may takes 55+ minutes to check
all the internal btree nodes.
The problem is that the registration is invoked by udev rules and the
udevd has 180 seconds timeout by default. If the btree node checking
time is longer than udevd timeout, the registering process will be
killed by udevd with SIGKILL. If the registering process has pending
sigal, creating kthread for bcache will fail and the device registration
will fail. The result is, for bcache device which cached a lot of data
on cache device, the bcache device node like /dev/bcache<N> won't create
always due to the very long btree checking time.
A solution to avoid the udevd 180 seconds timeout is to register devices
in an asynchronous way. Which is, after writing cache or backing device
path into /sys/fs/bcache/register_async, the kernel code will create a
kworker and move all the btree node checking (for cache device) or dirty
data counting (for cached device) in the kwork context. Then the kworder
is scheduled on system_wq and the registration code just returned to
user space udev rule task. By this asynchronous way, the udev task for
bcache rule will complete in seconds, no matter how long time spent in
the kworker context, it won't be killed by udevd for a timeout.
After all the checking and counting are done asynchronously in the
kworker, the bcache device will eventually be created successfully.
This patch does the above chagne and add a register sysfs file
/sys/fs/bcache/register_async. Writing the registering device path into
this sysfs file will do the asynchronous registration.
The register_async interface is for very rare condition and won't be
used for common users. In future I plan to make the asynchronous
registration as default behavior, which depends on feedback for this
patch.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:54 +08:00
|
|
|
struct async_reg_args {
|
2020-06-15 00:53:32 +08:00
|
|
|
struct delayed_work reg_work;
|
bcache: asynchronous devices registration
When there is a lot of data cached on cache device, the bcach internal
btree can take a very long to validate during the backing device and
cache device registration. In my test, it may takes 55+ minutes to check
all the internal btree nodes.
The problem is that the registration is invoked by udev rules and the
udevd has 180 seconds timeout by default. If the btree node checking
time is longer than udevd timeout, the registering process will be
killed by udevd with SIGKILL. If the registering process has pending
sigal, creating kthread for bcache will fail and the device registration
will fail. The result is, for bcache device which cached a lot of data
on cache device, the bcache device node like /dev/bcache<N> won't create
always due to the very long btree checking time.
A solution to avoid the udevd 180 seconds timeout is to register devices
in an asynchronous way. Which is, after writing cache or backing device
path into /sys/fs/bcache/register_async, the kernel code will create a
kworker and move all the btree node checking (for cache device) or dirty
data counting (for cached device) in the kwork context. Then the kworder
is scheduled on system_wq and the registration code just returned to
user space udev rule task. By this asynchronous way, the udev task for
bcache rule will complete in seconds, no matter how long time spent in
the kworker context, it won't be killed by udevd for a timeout.
After all the checking and counting are done asynchronously in the
kworker, the bcache device will eventually be created successfully.
This patch does the above chagne and add a register sysfs file
/sys/fs/bcache/register_async. Writing the registering device path into
this sysfs file will do the asynchronous registration.
The register_async interface is for very rare condition and won't be
used for common users. In future I plan to make the asynchronous
registration as default behavior, which depends on feedback for this
patch.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:54 +08:00
|
|
|
char *path;
|
|
|
|
struct cache_sb *sb;
|
|
|
|
struct cache_sb_disk *sb_disk;
|
|
|
|
struct block_device *bdev;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void register_bdev_worker(struct work_struct *work)
|
|
|
|
{
|
|
|
|
int fail = false;
|
|
|
|
struct async_reg_args *args =
|
2020-06-15 00:53:32 +08:00
|
|
|
container_of(work, struct async_reg_args, reg_work.work);
|
bcache: asynchronous devices registration
When there is a lot of data cached on cache device, the bcach internal
btree can take a very long to validate during the backing device and
cache device registration. In my test, it may takes 55+ minutes to check
all the internal btree nodes.
The problem is that the registration is invoked by udev rules and the
udevd has 180 seconds timeout by default. If the btree node checking
time is longer than udevd timeout, the registering process will be
killed by udevd with SIGKILL. If the registering process has pending
sigal, creating kthread for bcache will fail and the device registration
will fail. The result is, for bcache device which cached a lot of data
on cache device, the bcache device node like /dev/bcache<N> won't create
always due to the very long btree checking time.
A solution to avoid the udevd 180 seconds timeout is to register devices
in an asynchronous way. Which is, after writing cache or backing device
path into /sys/fs/bcache/register_async, the kernel code will create a
kworker and move all the btree node checking (for cache device) or dirty
data counting (for cached device) in the kwork context. Then the kworder
is scheduled on system_wq and the registration code just returned to
user space udev rule task. By this asynchronous way, the udev task for
bcache rule will complete in seconds, no matter how long time spent in
the kworker context, it won't be killed by udevd for a timeout.
After all the checking and counting are done asynchronously in the
kworker, the bcache device will eventually be created successfully.
This patch does the above chagne and add a register sysfs file
/sys/fs/bcache/register_async. Writing the registering device path into
this sysfs file will do the asynchronous registration.
The register_async interface is for very rare condition and won't be
used for common users. In future I plan to make the asynchronous
registration as default behavior, which depends on feedback for this
patch.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:54 +08:00
|
|
|
struct cached_dev *dc;
|
|
|
|
|
|
|
|
dc = kzalloc(sizeof(*dc), GFP_KERNEL);
|
|
|
|
if (!dc) {
|
|
|
|
fail = true;
|
|
|
|
put_page(virt_to_page(args->sb_disk));
|
|
|
|
blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_lock(&bch_register_lock);
|
|
|
|
if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
|
|
|
|
fail = true;
|
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (fail)
|
|
|
|
pr_info("error %s: fail to register backing device\n",
|
|
|
|
args->path);
|
|
|
|
kfree(args->sb);
|
|
|
|
kfree(args->path);
|
|
|
|
kfree(args);
|
|
|
|
module_put(THIS_MODULE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void register_cache_worker(struct work_struct *work)
|
|
|
|
{
|
|
|
|
int fail = false;
|
|
|
|
struct async_reg_args *args =
|
2020-06-15 00:53:32 +08:00
|
|
|
container_of(work, struct async_reg_args, reg_work.work);
|
bcache: asynchronous devices registration
When there is a lot of data cached on cache device, the bcach internal
btree can take a very long to validate during the backing device and
cache device registration. In my test, it may takes 55+ minutes to check
all the internal btree nodes.
The problem is that the registration is invoked by udev rules and the
udevd has 180 seconds timeout by default. If the btree node checking
time is longer than udevd timeout, the registering process will be
killed by udevd with SIGKILL. If the registering process has pending
sigal, creating kthread for bcache will fail and the device registration
will fail. The result is, for bcache device which cached a lot of data
on cache device, the bcache device node like /dev/bcache<N> won't create
always due to the very long btree checking time.
A solution to avoid the udevd 180 seconds timeout is to register devices
in an asynchronous way. Which is, after writing cache or backing device
path into /sys/fs/bcache/register_async, the kernel code will create a
kworker and move all the btree node checking (for cache device) or dirty
data counting (for cached device) in the kwork context. Then the kworder
is scheduled on system_wq and the registration code just returned to
user space udev rule task. By this asynchronous way, the udev task for
bcache rule will complete in seconds, no matter how long time spent in
the kworker context, it won't be killed by udevd for a timeout.
After all the checking and counting are done asynchronously in the
kworker, the bcache device will eventually be created successfully.
This patch does the above chagne and add a register sysfs file
/sys/fs/bcache/register_async. Writing the registering device path into
this sysfs file will do the asynchronous registration.
The register_async interface is for very rare condition and won't be
used for common users. In future I plan to make the asynchronous
registration as default behavior, which depends on feedback for this
patch.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:54 +08:00
|
|
|
struct cache *ca;
|
|
|
|
|
|
|
|
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
|
|
|
|
if (!ca) {
|
|
|
|
fail = true;
|
|
|
|
put_page(virt_to_page(args->sb_disk));
|
|
|
|
blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* blkdev_put() will be called in bch_cache_release() */
|
|
|
|
if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
|
|
|
|
fail = true;
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (fail)
|
|
|
|
pr_info("error %s: fail to register cache device\n",
|
|
|
|
args->path);
|
|
|
|
kfree(args->sb);
|
|
|
|
kfree(args->path);
|
|
|
|
kfree(args);
|
|
|
|
module_put(THIS_MODULE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void register_device_aync(struct async_reg_args *args)
|
|
|
|
{
|
|
|
|
if (SB_IS_BDEV(args->sb))
|
2020-06-15 00:53:32 +08:00
|
|
|
INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
|
bcache: asynchronous devices registration
When there is a lot of data cached on cache device, the bcach internal
btree can take a very long to validate during the backing device and
cache device registration. In my test, it may takes 55+ minutes to check
all the internal btree nodes.
The problem is that the registration is invoked by udev rules and the
udevd has 180 seconds timeout by default. If the btree node checking
time is longer than udevd timeout, the registering process will be
killed by udevd with SIGKILL. If the registering process has pending
sigal, creating kthread for bcache will fail and the device registration
will fail. The result is, for bcache device which cached a lot of data
on cache device, the bcache device node like /dev/bcache<N> won't create
always due to the very long btree checking time.
A solution to avoid the udevd 180 seconds timeout is to register devices
in an asynchronous way. Which is, after writing cache or backing device
path into /sys/fs/bcache/register_async, the kernel code will create a
kworker and move all the btree node checking (for cache device) or dirty
data counting (for cached device) in the kwork context. Then the kworder
is scheduled on system_wq and the registration code just returned to
user space udev rule task. By this asynchronous way, the udev task for
bcache rule will complete in seconds, no matter how long time spent in
the kworker context, it won't be killed by udevd for a timeout.
After all the checking and counting are done asynchronously in the
kworker, the bcache device will eventually be created successfully.
This patch does the above chagne and add a register sysfs file
/sys/fs/bcache/register_async. Writing the registering device path into
this sysfs file will do the asynchronous registration.
The register_async interface is for very rare condition and won't be
used for common users. In future I plan to make the asynchronous
registration as default behavior, which depends on feedback for this
patch.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:54 +08:00
|
|
|
else
|
2020-06-15 00:53:32 +08:00
|
|
|
INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
|
bcache: asynchronous devices registration
When there is a lot of data cached on cache device, the bcach internal
btree can take a very long to validate during the backing device and
cache device registration. In my test, it may takes 55+ minutes to check
all the internal btree nodes.
The problem is that the registration is invoked by udev rules and the
udevd has 180 seconds timeout by default. If the btree node checking
time is longer than udevd timeout, the registering process will be
killed by udevd with SIGKILL. If the registering process has pending
sigal, creating kthread for bcache will fail and the device registration
will fail. The result is, for bcache device which cached a lot of data
on cache device, the bcache device node like /dev/bcache<N> won't create
always due to the very long btree checking time.
A solution to avoid the udevd 180 seconds timeout is to register devices
in an asynchronous way. Which is, after writing cache or backing device
path into /sys/fs/bcache/register_async, the kernel code will create a
kworker and move all the btree node checking (for cache device) or dirty
data counting (for cached device) in the kwork context. Then the kworder
is scheduled on system_wq and the registration code just returned to
user space udev rule task. By this asynchronous way, the udev task for
bcache rule will complete in seconds, no matter how long time spent in
the kworker context, it won't be killed by udevd for a timeout.
After all the checking and counting are done asynchronously in the
kworker, the bcache device will eventually be created successfully.
This patch does the above chagne and add a register sysfs file
/sys/fs/bcache/register_async. Writing the registering device path into
this sysfs file will do the asynchronous registration.
The register_async interface is for very rare condition and won't be
used for common users. In future I plan to make the asynchronous
registration as default behavior, which depends on feedback for this
patch.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:54 +08:00
|
|
|
|
2020-06-15 00:53:32 +08:00
|
|
|
/* 10 jiffies is enough for a delay */
|
|
|
|
queue_delayed_work(system_wq, &args->reg_work, 10);
|
bcache: asynchronous devices registration
When there is a lot of data cached on cache device, the bcach internal
btree can take a very long to validate during the backing device and
cache device registration. In my test, it may takes 55+ minutes to check
all the internal btree nodes.
The problem is that the registration is invoked by udev rules and the
udevd has 180 seconds timeout by default. If the btree node checking
time is longer than udevd timeout, the registering process will be
killed by udevd with SIGKILL. If the registering process has pending
sigal, creating kthread for bcache will fail and the device registration
will fail. The result is, for bcache device which cached a lot of data
on cache device, the bcache device node like /dev/bcache<N> won't create
always due to the very long btree checking time.
A solution to avoid the udevd 180 seconds timeout is to register devices
in an asynchronous way. Which is, after writing cache or backing device
path into /sys/fs/bcache/register_async, the kernel code will create a
kworker and move all the btree node checking (for cache device) or dirty
data counting (for cached device) in the kwork context. Then the kworder
is scheduled on system_wq and the registration code just returned to
user space udev rule task. By this asynchronous way, the udev task for
bcache rule will complete in seconds, no matter how long time spent in
the kworker context, it won't be killed by udevd for a timeout.
After all the checking and counting are done asynchronously in the
kworker, the bcache device will eventually be created successfully.
This patch does the above chagne and add a register sysfs file
/sys/fs/bcache/register_async. Writing the registering device path into
this sysfs file will do the asynchronous registration.
The register_async interface is for very rare condition and won't be
used for common users. In future I plan to make the asynchronous
registration as default behavior, which depends on feedback for this
patch.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:54 +08:00
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
|
|
|
|
const char *buffer, size_t size)
|
|
|
|
{
|
2020-01-24 01:01:28 +08:00
|
|
|
const char *err;
|
2020-01-24 01:01:29 +08:00
|
|
|
char *path = NULL;
|
2020-01-24 01:01:28 +08:00
|
|
|
struct cache_sb *sb;
|
2020-01-24 01:01:32 +08:00
|
|
|
struct cache_sb_disk *sb_disk;
|
2020-01-24 01:01:31 +08:00
|
|
|
struct block_device *bdev;
|
2020-01-24 01:01:28 +08:00
|
|
|
ssize_t ret;
|
2020-10-01 14:50:42 +08:00
|
|
|
bool async_registration = false;
|
|
|
|
|
|
|
|
#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
|
|
|
|
async_registration = true;
|
|
|
|
#endif
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-01-24 01:01:28 +08:00
|
|
|
ret = -EBUSY;
|
2020-01-24 01:01:29 +08:00
|
|
|
err = "failed to reference bcache module";
|
2013-03-24 07:11:31 +08:00
|
|
|
if (!try_module_get(THIS_MODULE))
|
2020-01-24 01:01:28 +08:00
|
|
|
goto out;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
/* For latest state of bcache_is_reboot */
|
|
|
|
smp_mb();
|
2020-01-24 01:01:29 +08:00
|
|
|
err = "bcache is in reboot";
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
if (bcache_is_reboot)
|
2020-01-24 01:01:28 +08:00
|
|
|
goto out_module_put;
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
|
2020-01-24 01:01:28 +08:00
|
|
|
ret = -ENOMEM;
|
|
|
|
err = "cannot allocate memory";
|
2018-07-26 12:17:37 +08:00
|
|
|
path = kstrndup(buffer, size, GFP_KERNEL);
|
|
|
|
if (!path)
|
2020-01-24 01:01:28 +08:00
|
|
|
goto out_module_put;
|
2018-07-26 12:17:37 +08:00
|
|
|
|
|
|
|
sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
|
|
|
|
if (!sb)
|
2020-01-24 01:01:28 +08:00
|
|
|
goto out_free_path;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-01-24 01:01:28 +08:00
|
|
|
ret = -EINVAL;
|
2013-03-24 07:11:31 +08:00
|
|
|
err = "failed to open device";
|
|
|
|
bdev = blkdev_get_by_path(strim(path),
|
|
|
|
FMODE_READ|FMODE_WRITE|FMODE_EXCL,
|
|
|
|
sb);
|
2013-05-15 15:11:26 +08:00
|
|
|
if (IS_ERR(bdev)) {
|
2013-05-04 18:19:41 +08:00
|
|
|
if (bdev == ERR_PTR(-EBUSY)) {
|
2020-11-23 20:38:40 +08:00
|
|
|
dev_t dev;
|
|
|
|
|
2014-07-14 00:08:59 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
2020-11-23 20:38:40 +08:00
|
|
|
if (lookup_bdev(strim(path), &dev) == 0 &&
|
|
|
|
bch_is_open(dev))
|
2013-05-04 18:19:41 +08:00
|
|
|
err = "device already registered";
|
|
|
|
else
|
|
|
|
err = "device busy";
|
2014-07-14 00:08:59 +08:00
|
|
|
mutex_unlock(&bch_register_lock);
|
2017-09-06 14:25:51 +08:00
|
|
|
if (!IS_ERR(bdev))
|
|
|
|
bdput(bdev);
|
2015-11-30 10:40:23 +08:00
|
|
|
if (attr == &ksysfs_register_quiet)
|
2020-01-24 01:01:28 +08:00
|
|
|
goto done;
|
2013-05-04 18:19:41 +08:00
|
|
|
}
|
2020-01-24 01:01:28 +08:00
|
|
|
goto out_free_sb;
|
2013-05-15 15:11:26 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
err = "failed to set blocksize";
|
|
|
|
if (set_blocksize(bdev, 4096))
|
2020-01-24 01:01:28 +08:00
|
|
|
goto out_blkdev_put;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2020-01-24 01:01:32 +08:00
|
|
|
err = read_super(sb, bdev, &sb_disk);
|
2013-03-24 07:11:31 +08:00
|
|
|
if (err)
|
2020-01-24 01:01:28 +08:00
|
|
|
goto out_blkdev_put;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2018-03-06 05:41:54 +08:00
|
|
|
err = "failed to register device";
|
2020-10-01 14:50:42 +08:00
|
|
|
|
|
|
|
if (async_registration) {
|
bcache: asynchronous devices registration
When there is a lot of data cached on cache device, the bcach internal
btree can take a very long to validate during the backing device and
cache device registration. In my test, it may takes 55+ minutes to check
all the internal btree nodes.
The problem is that the registration is invoked by udev rules and the
udevd has 180 seconds timeout by default. If the btree node checking
time is longer than udevd timeout, the registering process will be
killed by udevd with SIGKILL. If the registering process has pending
sigal, creating kthread for bcache will fail and the device registration
will fail. The result is, for bcache device which cached a lot of data
on cache device, the bcache device node like /dev/bcache<N> won't create
always due to the very long btree checking time.
A solution to avoid the udevd 180 seconds timeout is to register devices
in an asynchronous way. Which is, after writing cache or backing device
path into /sys/fs/bcache/register_async, the kernel code will create a
kworker and move all the btree node checking (for cache device) or dirty
data counting (for cached device) in the kwork context. Then the kworder
is scheduled on system_wq and the registration code just returned to
user space udev rule task. By this asynchronous way, the udev task for
bcache rule will complete in seconds, no matter how long time spent in
the kworker context, it won't be killed by udevd for a timeout.
After all the checking and counting are done asynchronously in the
kworker, the bcache device will eventually be created successfully.
This patch does the above chagne and add a register sysfs file
/sys/fs/bcache/register_async. Writing the registering device path into
this sysfs file will do the asynchronous registration.
The register_async interface is for very rare condition and won't be
used for common users. In future I plan to make the asynchronous
registration as default behavior, which depends on feedback for this
patch.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:54 +08:00
|
|
|
/* register in asynchronous way */
|
|
|
|
struct async_reg_args *args =
|
|
|
|
kzalloc(sizeof(struct async_reg_args), GFP_KERNEL);
|
|
|
|
|
|
|
|
if (!args) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
err = "cannot allocate memory";
|
|
|
|
goto out_put_sb_page;
|
|
|
|
}
|
|
|
|
|
|
|
|
args->path = path;
|
|
|
|
args->sb = sb;
|
|
|
|
args->sb_disk = sb_disk;
|
|
|
|
args->bdev = bdev;
|
|
|
|
register_device_aync(args);
|
|
|
|
/* No wait and returns to user space */
|
|
|
|
goto async_done;
|
|
|
|
}
|
|
|
|
|
2013-04-12 06:14:35 +08:00
|
|
|
if (SB_IS_BDEV(sb)) {
|
2013-03-24 07:11:31 +08:00
|
|
|
struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2013-05-15 15:11:26 +08:00
|
|
|
if (!dc)
|
2020-01-24 01:01:28 +08:00
|
|
|
goto out_put_sb_page;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2014-03-18 09:58:55 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
2020-01-24 01:01:32 +08:00
|
|
|
ret = register_bdev(sb, sb_disk, bdev, dc);
|
2014-03-18 09:58:55 +08:00
|
|
|
mutex_unlock(&bch_register_lock);
|
2019-04-25 00:48:38 +08:00
|
|
|
/* blkdev_put() will be called in cached_dev_free() */
|
2020-01-24 01:01:31 +08:00
|
|
|
if (ret < 0)
|
|
|
|
goto out_free_sb;
|
2013-03-24 07:11:31 +08:00
|
|
|
} else {
|
|
|
|
struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
|
2018-08-11 13:19:45 +08:00
|
|
|
|
2013-05-15 15:11:26 +08:00
|
|
|
if (!ca)
|
2020-01-24 01:01:28 +08:00
|
|
|
goto out_put_sb_page;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2019-04-25 00:48:38 +08:00
|
|
|
/* blkdev_put() will be called in bch_cache_release() */
|
2020-01-24 01:01:32 +08:00
|
|
|
if (register_cache(sb, sb_disk, bdev, ca) != 0)
|
2020-01-24 01:01:31 +08:00
|
|
|
goto out_free_sb;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
2020-01-24 01:01:28 +08:00
|
|
|
|
|
|
|
done:
|
2013-03-24 07:11:31 +08:00
|
|
|
kfree(sb);
|
|
|
|
kfree(path);
|
|
|
|
module_put(THIS_MODULE);
|
bcache: asynchronous devices registration
When there is a lot of data cached on cache device, the bcach internal
btree can take a very long to validate during the backing device and
cache device registration. In my test, it may takes 55+ minutes to check
all the internal btree nodes.
The problem is that the registration is invoked by udev rules and the
udevd has 180 seconds timeout by default. If the btree node checking
time is longer than udevd timeout, the registering process will be
killed by udevd with SIGKILL. If the registering process has pending
sigal, creating kthread for bcache will fail and the device registration
will fail. The result is, for bcache device which cached a lot of data
on cache device, the bcache device node like /dev/bcache<N> won't create
always due to the very long btree checking time.
A solution to avoid the udevd 180 seconds timeout is to register devices
in an asynchronous way. Which is, after writing cache or backing device
path into /sys/fs/bcache/register_async, the kernel code will create a
kworker and move all the btree node checking (for cache device) or dirty
data counting (for cached device) in the kwork context. Then the kworder
is scheduled on system_wq and the registration code just returned to
user space udev rule task. By this asynchronous way, the udev task for
bcache rule will complete in seconds, no matter how long time spent in
the kworker context, it won't be killed by udevd for a timeout.
After all the checking and counting are done asynchronously in the
kworker, the bcache device will eventually be created successfully.
This patch does the above chagne and add a register sysfs file
/sys/fs/bcache/register_async. Writing the registering device path into
this sysfs file will do the asynchronous registration.
The register_async interface is for very rare condition and won't be
used for common users. In future I plan to make the asynchronous
registration as default behavior, which depends on feedback for this
patch.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2020-05-27 12:01:54 +08:00
|
|
|
async_done:
|
2020-01-24 01:01:28 +08:00
|
|
|
return size;
|
|
|
|
|
|
|
|
out_put_sb_page:
|
2020-01-24 01:01:32 +08:00
|
|
|
put_page(virt_to_page(sb_disk));
|
2020-01-24 01:01:28 +08:00
|
|
|
out_blkdev_put:
|
2020-01-24 01:01:31 +08:00
|
|
|
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
|
2020-01-24 01:01:28 +08:00
|
|
|
out_free_sb:
|
|
|
|
kfree(sb);
|
|
|
|
out_free_path:
|
|
|
|
kfree(path);
|
2020-01-24 01:01:30 +08:00
|
|
|
path = NULL;
|
2020-01-24 01:01:28 +08:00
|
|
|
out_module_put:
|
|
|
|
module_put(THIS_MODULE);
|
|
|
|
out:
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("error %s: %s\n", path?path:"", err);
|
2020-01-24 01:01:28 +08:00
|
|
|
return ret;
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
2019-06-28 19:59:42 +08:00
|
|
|
|
|
|
|
struct pdev {
|
|
|
|
struct list_head list;
|
|
|
|
struct cached_dev *dc;
|
|
|
|
};
|
|
|
|
|
|
|
|
static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
|
|
|
|
struct kobj_attribute *attr,
|
|
|
|
const char *buffer,
|
|
|
|
size_t size)
|
|
|
|
{
|
|
|
|
LIST_HEAD(pending_devs);
|
|
|
|
ssize_t ret = size;
|
|
|
|
struct cached_dev *dc, *tdc;
|
|
|
|
struct pdev *pdev, *tpdev;
|
|
|
|
struct cache_set *c, *tc;
|
|
|
|
|
|
|
|
mutex_lock(&bch_register_lock);
|
|
|
|
list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
|
|
|
|
pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
|
|
|
|
if (!pdev)
|
|
|
|
break;
|
|
|
|
pdev->dc = dc;
|
|
|
|
list_add(&pdev->list, &pending_devs);
|
|
|
|
}
|
|
|
|
|
|
|
|
list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
|
|
|
|
list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
|
|
|
|
char *pdev_set_uuid = pdev->dc->sb.set_uuid;
|
2020-10-01 14:50:48 +08:00
|
|
|
char *set_uuid = c->set_uuid;
|
2019-06-28 19:59:42 +08:00
|
|
|
|
|
|
|
if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
|
|
|
|
list_del(&pdev->list);
|
|
|
|
kfree(pdev);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("delete pdev %p\n", pdev);
|
2019-06-28 19:59:42 +08:00
|
|
|
list_del(&pdev->list);
|
|
|
|
bcache_device_stop(&pdev->dc->disk);
|
|
|
|
kfree(pdev);
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
|
|
|
|
{
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
if (bcache_is_reboot)
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
if (code == SYS_DOWN ||
|
|
|
|
code == SYS_HALT ||
|
|
|
|
code == SYS_POWER_OFF) {
|
|
|
|
DEFINE_WAIT(wait);
|
|
|
|
unsigned long start = jiffies;
|
|
|
|
bool stopped = false;
|
|
|
|
|
|
|
|
struct cache_set *c, *tc;
|
|
|
|
struct cached_dev *dc, *tdc;
|
|
|
|
|
|
|
|
mutex_lock(&bch_register_lock);
|
|
|
|
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
if (bcache_is_reboot)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* New registration is rejected since now */
|
|
|
|
bcache_is_reboot = true;
|
|
|
|
/*
|
|
|
|
* Make registering caller (if there is) on other CPU
|
|
|
|
* core know bcache_is_reboot set to true earlier
|
|
|
|
*/
|
|
|
|
smp_mb();
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
if (list_empty(&bch_cache_sets) &&
|
|
|
|
list_empty(&uncached_devices))
|
|
|
|
goto out;
|
|
|
|
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("Stopping all devices:\n");
|
2013-03-24 07:11:31 +08:00
|
|
|
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
/*
|
|
|
|
* The reason bch_register_lock is not held to call
|
|
|
|
* bch_cache_set_stop() and bcache_device_stop() is to
|
|
|
|
* avoid potential deadlock during reboot, because cache
|
|
|
|
* set or bcache device stopping process will acqurie
|
|
|
|
* bch_register_lock too.
|
|
|
|
*
|
|
|
|
* We are safe here because bcache_is_reboot sets to
|
|
|
|
* true already, register_bcache() will reject new
|
|
|
|
* registration now. bcache_is_reboot also makes sure
|
|
|
|
* bcache_reboot() won't be re-entered on by other thread,
|
|
|
|
* so there is no race in following list iteration by
|
|
|
|
* list_for_each_entry_safe().
|
|
|
|
*/
|
2013-03-24 07:11:31 +08:00
|
|
|
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
|
|
|
|
bch_cache_set_stop(c);
|
|
|
|
|
|
|
|
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
|
|
|
|
bcache_device_stop(&dc->disk);
|
|
|
|
|
2019-04-25 00:48:40 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Give an early chance for other kthreads and
|
|
|
|
* kworkers to stop themselves
|
|
|
|
*/
|
|
|
|
schedule();
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
/* What's a condition variable? */
|
|
|
|
while (1) {
|
2019-04-25 00:48:40 +08:00
|
|
|
long timeout = start + 10 * HZ - jiffies;
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2019-04-25 00:48:40 +08:00
|
|
|
mutex_lock(&bch_register_lock);
|
2013-03-24 07:11:31 +08:00
|
|
|
stopped = list_empty(&bch_cache_sets) &&
|
|
|
|
list_empty(&uncached_devices);
|
|
|
|
|
|
|
|
if (timeout < 0 || stopped)
|
|
|
|
break;
|
|
|
|
|
|
|
|
prepare_to_wait(&unregister_wait, &wait,
|
|
|
|
TASK_UNINTERRUPTIBLE);
|
|
|
|
|
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
schedule_timeout(timeout);
|
|
|
|
}
|
|
|
|
|
|
|
|
finish_wait(&unregister_wait, &wait);
|
|
|
|
|
|
|
|
if (stopped)
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_info("All devices stopped\n");
|
2013-03-24 07:11:31 +08:00
|
|
|
else
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_notice("Timeout waiting for devices to be closed\n");
|
2013-03-24 07:11:31 +08:00
|
|
|
out:
|
|
|
|
mutex_unlock(&bch_register_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
return NOTIFY_DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct notifier_block reboot = {
|
|
|
|
.notifier_call = bcache_reboot,
|
|
|
|
.priority = INT_MAX, /* before any real devices */
|
|
|
|
};
|
|
|
|
|
|
|
|
static void bcache_exit(void)
|
|
|
|
{
|
|
|
|
bch_debug_exit();
|
|
|
|
bch_request_exit();
|
|
|
|
if (bcache_kobj)
|
|
|
|
kobject_put(bcache_kobj);
|
|
|
|
if (bcache_wq)
|
|
|
|
destroy_workqueue(bcache_wq);
|
2018-09-27 23:41:46 +08:00
|
|
|
if (bch_journal_wq)
|
|
|
|
destroy_workqueue(bch_journal_wq);
|
|
|
|
|
2013-07-09 08:53:26 +08:00
|
|
|
if (bcache_major)
|
|
|
|
unregister_blkdev(bcache_major, "bcache");
|
2013-03-24 07:11:31 +08:00
|
|
|
unregister_reboot_notifier(&reboot);
|
2017-10-31 05:46:35 +08:00
|
|
|
mutex_destroy(&bch_register_lock);
|
2013-03-24 07:11:31 +08:00
|
|
|
}
|
|
|
|
|
2018-12-13 22:53:55 +08:00
|
|
|
/* Check and fixup module parameters */
|
|
|
|
static void check_module_parameters(void)
|
|
|
|
{
|
|
|
|
if (bch_cutoff_writeback_sync == 0)
|
|
|
|
bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
|
|
|
|
else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u\n",
|
2018-12-13 22:53:55 +08:00
|
|
|
bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
|
|
|
|
bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bch_cutoff_writeback == 0)
|
|
|
|
bch_cutoff_writeback = CUTOFF_WRITEBACK;
|
|
|
|
else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("set bch_cutoff_writeback (%u) to max value %u\n",
|
2018-12-13 22:53:55 +08:00
|
|
|
bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
|
|
|
|
bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
|
2020-05-27 12:01:52 +08:00
|
|
|
pr_warn("set bch_cutoff_writeback (%u) to %u\n",
|
2018-12-13 22:53:55 +08:00
|
|
|
bch_cutoff_writeback, bch_cutoff_writeback_sync);
|
|
|
|
bch_cutoff_writeback = bch_cutoff_writeback_sync;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
static int __init bcache_init(void)
|
|
|
|
{
|
|
|
|
static const struct attribute *files[] = {
|
|
|
|
&ksysfs_register.attr,
|
|
|
|
&ksysfs_register_quiet.attr,
|
2019-06-28 19:59:42 +08:00
|
|
|
&ksysfs_pendings_cleanup.attr,
|
2013-03-24 07:11:31 +08:00
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2018-12-13 22:53:55 +08:00
|
|
|
check_module_parameters();
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
mutex_init(&bch_register_lock);
|
|
|
|
init_waitqueue_head(&unregister_wait);
|
|
|
|
register_reboot_notifier(&reboot);
|
|
|
|
|
|
|
|
bcache_major = register_blkdev(0, "bcache");
|
2015-11-30 09:21:57 +08:00
|
|
|
if (bcache_major < 0) {
|
|
|
|
unregister_reboot_notifier(&reboot);
|
2017-10-31 05:46:35 +08:00
|
|
|
mutex_destroy(&bch_register_lock);
|
2013-03-24 07:11:31 +08:00
|
|
|
return bcache_major;
|
2015-11-30 09:21:57 +08:00
|
|
|
}
|
2013-03-24 07:11:31 +08:00
|
|
|
|
2018-07-26 12:17:39 +08:00
|
|
|
bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
|
|
|
|
if (!bcache_wq)
|
|
|
|
goto err;
|
|
|
|
|
2018-09-27 23:41:46 +08:00
|
|
|
bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
|
|
|
|
if (!bch_journal_wq)
|
|
|
|
goto err;
|
|
|
|
|
2018-07-26 12:17:39 +08:00
|
|
|
bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
|
|
|
|
if (!bcache_kobj)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
if (bch_request_init() ||
|
2017-10-31 05:46:35 +08:00
|
|
|
sysfs_create_files(bcache_kobj, files))
|
2013-03-24 07:11:31 +08:00
|
|
|
goto err;
|
|
|
|
|
2018-10-08 20:41:17 +08:00
|
|
|
bch_debug_init();
|
2018-08-09 15:48:42 +08:00
|
|
|
closure_debug_init();
|
|
|
|
|
bcache: avoid a deadlock in bcache_reboot()
It is quite frequently to observe deadlock in bcache_reboot() happens
and hang the system reboot process. The reason is, in bcache_reboot()
when calling bch_cache_set_stop() and bcache_device_stop() the mutex
bch_register_lock is held. But in the process to stop cache set and
bcache device, bch_register_lock will be acquired again. If this mutex
is held here, deadlock will happen inside the stopping process. The
aftermath of the deadlock is, whole system reboot gets hung.
The fix is to avoid holding bch_register_lock for the following loops
in bcache_reboot(),
list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
bch_cache_set_stop(c);
list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
bcache_device_stop(&dc->disk);
A module range variable 'bcache_is_reboot' is added, it sets to true
in bcache_reboot(). In register_bcache(), if bcache_is_reboot is checked
to be true, reject the registration by returning -EBUSY immediately.
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2019-06-28 19:59:46 +08:00
|
|
|
bcache_is_reboot = false;
|
|
|
|
|
2013-03-24 07:11:31 +08:00
|
|
|
return 0;
|
|
|
|
err:
|
|
|
|
bcache_exit();
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
2018-12-13 22:53:55 +08:00
|
|
|
/*
|
|
|
|
* Module hooks
|
|
|
|
*/
|
2013-03-24 07:11:31 +08:00
|
|
|
module_exit(bcache_exit);
|
|
|
|
module_init(bcache_init);
|
2018-12-13 22:53:54 +08:00
|
|
|
|
2018-12-13 22:53:55 +08:00
|
|
|
module_param(bch_cutoff_writeback, uint, 0);
|
|
|
|
MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
|
|
|
|
|
|
|
|
module_param(bch_cutoff_writeback_sync, uint, 0);
|
|
|
|
MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
|
|
|
|
|
2018-12-13 22:53:54 +08:00
|
|
|
MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
|
|
|
|
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
|
|
|
|
MODULE_LICENSE("GPL");
|