mirror of
https://mirrors.bfsu.edu.cn/git/linux.git
synced 2025-01-18 20:04:16 +08:00
- Various DM persistent-data library improvements and fixes that
benefit both the DM thinp and cache targets. - A few small DM kcopyd efficiency improvements. - Significant zoned related block core, DM core and DM zoned target changes that culminate with adding zoned append emulation (which is required to properly fix DM crypt's zoned support). - Various DM writecache target changes that improve efficiency. Adds an optional "metadata_only" feature that only promotes bios flagged with REQ_META. But the most significant improvement is writecache's ability to pause writeback, for a configurable time, if/when the working set is larger than the cache (and the cache is full) -- this ensures performance is no worse than the slower origin device. -----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEJfWUX4UqZ4x1O2wixSPxCi2dA1oFAmDcpWgACgkQxSPxCi2d A1pD0AgAmySdpJxQBzBMOqnKaClErfxiWXDtvzBxFupG/jmqaN/k/kCFdKyDk89M 9r2rlv4+teZReEGjqjJ0umQgbX62x5y6f7vy4CeoE/+EQAUiZYXNARW8Uubu/Sgy mmvsgAdiuJqfJCX5TiQDwZIdll/QV8isteddMpOdrdM0fpCNlTvRao4S9UE2Rfni fPoPu7KNGDhKORvy/NloYFSHuxTaOSv6A44z15T2SoXPw9hLloFoXegE9Vrcfr/j gwLX3ponp4+K91BzPWz0QIQ7Wh+7O4xrmcXtBIvuIGNcfV+oGMZMtq/zEX8T6sDh GDlclxh/76iGgvINAQ437mXBINbPYQ== =8dUv -----END PGP SIGNATURE----- Merge tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm Pull device mapper updates from Mike Snitzer: - Various DM persistent-data library improvements and fixes that benefit both the DM thinp and cache targets. - A few small DM kcopyd efficiency improvements. - Significant zoned related block core, DM core and DM zoned target changes that culminate with adding zoned append emulation (which is required to properly fix DM crypt's zoned support). - Various DM writecache target changes that improve efficiency. Adds an optional "metadata_only" feature that only promotes bios flagged with REQ_META. 
But the most significant improvement is writecache's ability to pause writeback, for a configurable time, if/when the working set is larger than the cache (and the cache is full) -- this ensures performance is no worse than the slower origin device. * tag 'for-5.14/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (35 commits) dm writecache: make writeback pause configurable dm writecache: pause writeback if cache full and origin being written directly dm io tracker: factor out IO tracker dm btree remove: assign new_root only when removal succeeds dm zone: fix dm_revalidate_zones() memory allocation dm ps io affinity: remove redundant continue statement dm writecache: add optional "metadata_only" parameter dm writecache: add "cleaner" and "max_age" to Documentation dm writecache: write at least 4k when committing dm writecache: flush origin device when writing and cache is full dm writecache: have ssd writeback wait if the kcopyd workqueue is busy dm writecache: use list_move instead of list_del/list_add in writecache_writeback() dm writecache: commit just one block, not a full page dm writecache: remove unused gfp_t argument from wc_add_block() dm crypt: Fix zoned block device support dm: introduce zone append emulation dm: rearrange core declarations for extended use from dm-zone.c block: introduce BIO_ZONE_WRITE_LOCKED bio flag block: introduce bio zone helpers block: improve handling of all zones reset operation ...
This commit is contained in:
commit
2cfa582be8
@ -12,7 +12,6 @@ first sector should contain valid superblock from previous invocation.
|
||||
Constructor parameters:
|
||||
|
||||
1. type of the cache device - "p" or "s"
|
||||
|
||||
- p - persistent memory
|
||||
- s - SSD
|
||||
2. the underlying device that will be cached
|
||||
@ -21,7 +20,6 @@ Constructor parameters:
|
||||
size)
|
||||
5. the number of optional parameters (the parameters with an argument
|
||||
count as two)
|
||||
|
||||
start_sector n (default: 0)
|
||||
offset from the start of cache device in 512-byte sectors
|
||||
high_watermark n (default: 50)
|
||||
@ -53,6 +51,27 @@ Constructor parameters:
|
||||
|
||||
- some underlying devices perform better with fua, some
|
||||
with nofua. The user should test it
|
||||
cleaner
|
||||
when this option is activated (either in the constructor
|
||||
arguments or by a message), the cache will not promote
|
||||
new writes (however, writes to already cached blocks are
|
||||
promoted, to avoid data corruption due to misordered
|
||||
writes) and it will gradually writeback any cached
|
||||
data. The userspace can then monitor the cleaning
|
||||
process with "dmsetup status". When the number of cached
|
||||
blocks drops to zero, userspace can unload the
|
||||
dm-writecache target and replace it with dm-linear or
|
||||
other targets.
|
||||
max_age n
|
||||
specifies the maximum age of a block in milliseconds. If
|
||||
a block is stored in the cache for too long, it will be
|
||||
written to the underlying device and cleaned up.
|
||||
metadata_only
|
||||
only metadata is promoted to the cache. This option
|
||||
improves performance for heavier REQ_META workloads.
|
||||
pause_writeback n (default: 3000)
|
||||
pause writeback if there was some write I/O redirected to
|
||||
the origin volume in the last n milliseconds
|
||||
|
||||
Status:
|
||||
1. error indicator - 0 if there was no error, otherwise error number
|
||||
@ -77,3 +96,5 @@ Messages:
|
||||
5. resume the device, so that it will use the linear
|
||||
target
|
||||
6. the cache device is now inactive and it can be deleted
|
||||
cleaner
|
||||
See above "cleaner" constructor documentation.
|
||||
|
@ -161,18 +161,89 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blkdev_report_zones);
|
||||
|
||||
static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
|
||||
sector_t sector,
|
||||
sector_t nr_sectors)
|
||||
static inline unsigned long *blk_alloc_zone_bitmap(int node,
|
||||
unsigned int nr_zones)
|
||||
{
|
||||
if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
|
||||
return false;
|
||||
return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
|
||||
GFP_NOIO, node);
|
||||
}
|
||||
|
||||
static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
|
||||
void *data)
|
||||
{
|
||||
/*
|
||||
* REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors
|
||||
* of the applicable zone range is the entire disk.
|
||||
* For an all-zones reset, ignore conventional, empty, read-only
|
||||
* and offline zones.
|
||||
*/
|
||||
return !sector && nr_sectors == get_capacity(bdev->bd_disk);
|
||||
switch (zone->cond) {
|
||||
case BLK_ZONE_COND_NOT_WP:
|
||||
case BLK_ZONE_COND_EMPTY:
|
||||
case BLK_ZONE_COND_READONLY:
|
||||
case BLK_ZONE_COND_OFFLINE:
|
||||
return 0;
|
||||
default:
|
||||
set_bit(idx, (unsigned long *)data);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
sector_t capacity = get_capacity(bdev->bd_disk);
|
||||
sector_t zone_sectors = blk_queue_zone_sectors(q);
|
||||
unsigned long *need_reset;
|
||||
struct bio *bio = NULL;
|
||||
sector_t sector = 0;
|
||||
int ret;
|
||||
|
||||
need_reset = blk_alloc_zone_bitmap(q->node, q->nr_zones);
|
||||
if (!need_reset)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = bdev->bd_disk->fops->report_zones(bdev->bd_disk, 0,
|
||||
q->nr_zones, blk_zone_need_reset_cb,
|
||||
need_reset);
|
||||
if (ret < 0)
|
||||
goto out_free_need_reset;
|
||||
|
||||
ret = 0;
|
||||
while (sector < capacity) {
|
||||
if (!test_bit(blk_queue_zone_no(q, sector), need_reset)) {
|
||||
sector += zone_sectors;
|
||||
continue;
|
||||
}
|
||||
|
||||
bio = blk_next_bio(bio, 0, gfp_mask);
|
||||
bio_set_dev(bio, bdev);
|
||||
bio->bi_opf = REQ_OP_ZONE_RESET | REQ_SYNC;
|
||||
bio->bi_iter.bi_sector = sector;
|
||||
sector += zone_sectors;
|
||||
|
||||
/* This may take a while, so be nice to others */
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
if (bio) {
|
||||
ret = submit_bio_wait(bio);
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
out_free_need_reset:
|
||||
kfree(need_reset);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
|
||||
{
|
||||
struct bio bio;
|
||||
|
||||
bio_init(&bio, NULL, 0);
|
||||
bio_set_dev(&bio, bdev);
|
||||
bio.bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
|
||||
|
||||
return submit_bio_wait(&bio);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -200,7 +271,7 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
|
||||
sector_t capacity = get_capacity(bdev->bd_disk);
|
||||
sector_t end_sector = sector + nr_sectors;
|
||||
struct bio *bio = NULL;
|
||||
int ret;
|
||||
int ret = 0;
|
||||
|
||||
if (!blk_queue_is_zoned(q))
|
||||
return -EOPNOTSUPP;
|
||||
@ -222,20 +293,21 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
|
||||
if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* In the case of a zone reset operation over all zones,
|
||||
* REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
|
||||
* command. For other devices, we emulate this command behavior by
|
||||
* identifying the zones needing a reset.
|
||||
*/
|
||||
if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
|
||||
if (!blk_queue_zone_resetall(q))
|
||||
return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
|
||||
return blkdev_zone_reset_all(bdev, gfp_mask);
|
||||
}
|
||||
|
||||
while (sector < end_sector) {
|
||||
bio = blk_next_bio(bio, 0, gfp_mask);
|
||||
bio_set_dev(bio, bdev);
|
||||
|
||||
/*
|
||||
* Special case for the zone reset operation that reset all
|
||||
* zones, this is useful for applications like mkfs.
|
||||
*/
|
||||
if (op == REQ_OP_ZONE_RESET &&
|
||||
blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
|
||||
bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
|
||||
break;
|
||||
}
|
||||
|
||||
bio->bi_opf = op | REQ_SYNC;
|
||||
bio->bi_iter.bi_sector = sector;
|
||||
sector += zone_sectors;
|
||||
@ -396,13 +468,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline unsigned long *blk_alloc_zone_bitmap(int node,
|
||||
unsigned int nr_zones)
|
||||
{
|
||||
return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
|
||||
GFP_NOIO, node);
|
||||
}
|
||||
|
||||
void blk_queue_free_zone_bitmaps(struct request_queue *q)
|
||||
{
|
||||
kfree(q->conv_zones_bitmap);
|
||||
|
@ -92,6 +92,10 @@ ifeq ($(CONFIG_DM_UEVENT),y)
|
||||
dm-mod-objs += dm-uevent.o
|
||||
endif
|
||||
|
||||
ifeq ($(CONFIG_BLK_DEV_ZONED),y)
|
||||
dm-mod-objs += dm-zone.o
|
||||
endif
|
||||
|
||||
ifeq ($(CONFIG_DM_VERITY_FEC),y)
|
||||
dm-verity-objs += dm-verity-fec.o
|
||||
endif
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include "dm-bio-prison-v2.h"
|
||||
#include "dm-bio-record.h"
|
||||
#include "dm-cache-metadata.h"
|
||||
#include "dm-io-tracker.h"
|
||||
|
||||
#include <linux/dm-io.h>
|
||||
#include <linux/dm-kcopyd.h>
|
||||
@ -39,77 +40,6 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
struct io_tracker {
|
||||
spinlock_t lock;
|
||||
|
||||
/*
|
||||
* Sectors of in-flight IO.
|
||||
*/
|
||||
sector_t in_flight;
|
||||
|
||||
/*
|
||||
* The time, in jiffies, when this device became idle (if it is
|
||||
* indeed idle).
|
||||
*/
|
||||
unsigned long idle_time;
|
||||
unsigned long last_update_time;
|
||||
};
|
||||
|
||||
static void iot_init(struct io_tracker *iot)
|
||||
{
|
||||
spin_lock_init(&iot->lock);
|
||||
iot->in_flight = 0ul;
|
||||
iot->idle_time = 0ul;
|
||||
iot->last_update_time = jiffies;
|
||||
}
|
||||
|
||||
static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
|
||||
{
|
||||
if (iot->in_flight)
|
||||
return false;
|
||||
|
||||
return time_after(jiffies, iot->idle_time + jifs);
|
||||
}
|
||||
|
||||
static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
|
||||
{
|
||||
bool r;
|
||||
|
||||
spin_lock_irq(&iot->lock);
|
||||
r = __iot_idle_for(iot, jifs);
|
||||
spin_unlock_irq(&iot->lock);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static void iot_io_begin(struct io_tracker *iot, sector_t len)
|
||||
{
|
||||
spin_lock_irq(&iot->lock);
|
||||
iot->in_flight += len;
|
||||
spin_unlock_irq(&iot->lock);
|
||||
}
|
||||
|
||||
static void __iot_io_end(struct io_tracker *iot, sector_t len)
|
||||
{
|
||||
if (!len)
|
||||
return;
|
||||
|
||||
iot->in_flight -= len;
|
||||
if (!iot->in_flight)
|
||||
iot->idle_time = jiffies;
|
||||
}
|
||||
|
||||
static void iot_io_end(struct io_tracker *iot, sector_t len)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&iot->lock, flags);
|
||||
__iot_io_end(iot, len);
|
||||
spin_unlock_irqrestore(&iot->lock, flags);
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* Represents a chunk of future work. 'input' allows continuations to pass
|
||||
* values between themselves, typically error values.
|
||||
@ -470,7 +400,7 @@ struct cache {
|
||||
struct batcher committer;
|
||||
struct work_struct commit_ws;
|
||||
|
||||
struct io_tracker tracker;
|
||||
struct dm_io_tracker tracker;
|
||||
|
||||
mempool_t migration_pool;
|
||||
|
||||
@ -866,7 +796,7 @@ static void accounted_begin(struct cache *cache, struct bio *bio)
|
||||
if (accountable_bio(cache, bio)) {
|
||||
pb = get_per_bio_data(bio);
|
||||
pb->len = bio_sectors(bio);
|
||||
iot_io_begin(&cache->tracker, pb->len);
|
||||
dm_iot_io_begin(&cache->tracker, pb->len);
|
||||
}
|
||||
}
|
||||
|
||||
@ -874,7 +804,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio)
|
||||
{
|
||||
struct per_bio_data *pb = get_per_bio_data(bio);
|
||||
|
||||
iot_io_end(&cache->tracker, pb->len);
|
||||
dm_iot_io_end(&cache->tracker, pb->len);
|
||||
}
|
||||
|
||||
static void accounted_request(struct cache *cache, struct bio *bio)
|
||||
@ -1642,7 +1572,7 @@ enum busy {
|
||||
|
||||
static enum busy spare_migration_bandwidth(struct cache *cache)
|
||||
{
|
||||
bool idle = iot_idle_for(&cache->tracker, HZ);
|
||||
bool idle = dm_iot_idle_for(&cache->tracker, HZ);
|
||||
sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
|
||||
cache->sectors_per_block;
|
||||
|
||||
@ -2603,7 +2533,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
|
||||
|
||||
batcher_init(&cache->committer, commit_op, cache,
|
||||
issue_op, cache, cache->wq);
|
||||
iot_init(&cache->tracker);
|
||||
dm_iot_init(&cache->tracker);
|
||||
|
||||
init_rwsem(&cache->background_work_lock);
|
||||
prevent_background_work(cache);
|
||||
|
@ -114,8 +114,27 @@ struct mapped_device {
|
||||
bool init_tio_pdu:1;
|
||||
|
||||
struct srcu_struct io_barrier;
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
unsigned int nr_zones;
|
||||
unsigned int *zwp_offset;
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
* Bits for the flags field of struct mapped_device.
|
||||
*/
|
||||
#define DMF_BLOCK_IO_FOR_SUSPEND 0
|
||||
#define DMF_SUSPENDED 1
|
||||
#define DMF_FROZEN 2
|
||||
#define DMF_FREEING 3
|
||||
#define DMF_DELETING 4
|
||||
#define DMF_NOFLUSH_SUSPENDING 5
|
||||
#define DMF_DEFERRED_REMOVE 6
|
||||
#define DMF_SUSPENDED_INTERNALLY 7
|
||||
#define DMF_POST_SUSPENDING 8
|
||||
#define DMF_EMULATE_ZONE_APPEND 9
|
||||
|
||||
void disable_discard(struct mapped_device *md);
|
||||
void disable_write_same(struct mapped_device *md);
|
||||
void disable_write_zeroes(struct mapped_device *md);
|
||||
@ -130,6 +149,13 @@ static inline struct dm_stats *dm_get_stats(struct mapped_device *md)
|
||||
return &md->stats;
|
||||
}
|
||||
|
||||
static inline bool dm_emulate_zone_append(struct mapped_device *md)
|
||||
{
|
||||
if (blk_queue_is_zoned(md->queue))
|
||||
return test_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
|
||||
return false;
|
||||
}
|
||||
|
||||
#define DM_TABLE_MAX_DEPTH 16
|
||||
|
||||
struct dm_table {
|
||||
@ -173,6 +199,45 @@ struct dm_table {
|
||||
#endif
|
||||
};
|
||||
|
||||
/*
|
||||
* One of these is allocated per clone bio.
|
||||
*/
|
||||
#define DM_TIO_MAGIC 7282014
|
||||
struct dm_target_io {
|
||||
unsigned int magic;
|
||||
struct dm_io *io;
|
||||
struct dm_target *ti;
|
||||
unsigned int target_bio_nr;
|
||||
unsigned int *len_ptr;
|
||||
bool inside_dm_io;
|
||||
struct bio clone;
|
||||
};
|
||||
|
||||
/*
|
||||
* One of these is allocated per original bio.
|
||||
* It contains the first clone used for that original.
|
||||
*/
|
||||
#define DM_IO_MAGIC 5191977
|
||||
struct dm_io {
|
||||
unsigned int magic;
|
||||
struct mapped_device *md;
|
||||
blk_status_t status;
|
||||
atomic_t io_count;
|
||||
struct bio *orig_bio;
|
||||
unsigned long start_time;
|
||||
spinlock_t endio_lock;
|
||||
struct dm_stats_aux stats_aux;
|
||||
/* last member of dm_target_io is 'struct bio' */
|
||||
struct dm_target_io tio;
|
||||
};
|
||||
|
||||
static inline void dm_io_inc_pending(struct dm_io *io)
|
||||
{
|
||||
atomic_inc(&io->io_count);
|
||||
}
|
||||
|
||||
void dm_io_dec_pending(struct dm_io *io, blk_status_t error);
|
||||
|
||||
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
|
||||
{
|
||||
return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
|
||||
|
@ -3138,11 +3138,10 @@ static int crypt_report_zones(struct dm_target *ti,
|
||||
struct dm_report_zones_args *args, unsigned int nr_zones)
|
||||
{
|
||||
struct crypt_config *cc = ti->private;
|
||||
sector_t sector = cc->start + dm_target_offset(ti, args->next_sector);
|
||||
|
||||
args->start = cc->start;
|
||||
return blkdev_report_zones(cc->dev->bdev, sector, nr_zones,
|
||||
dm_report_zones_cb, args);
|
||||
return dm_report_zones(cc->dev->bdev, cc->start,
|
||||
cc->start + dm_target_offset(ti, args->next_sector),
|
||||
args, nr_zones);
|
||||
}
|
||||
#else
|
||||
#define crypt_report_zones NULL
|
||||
@ -3281,14 +3280,28 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
||||
}
|
||||
cc->start = tmpll;
|
||||
|
||||
/*
|
||||
* For zoned block devices, we need to preserve the issuer write
|
||||
* ordering. To do so, disable write workqueues and force inline
|
||||
* encryption completion.
|
||||
*/
|
||||
if (bdev_is_zoned(cc->dev->bdev)) {
|
||||
/*
|
||||
* For zoned block devices, we need to preserve the issuer write
|
||||
* ordering. To do so, disable write workqueues and force inline
|
||||
* encryption completion.
|
||||
*/
|
||||
set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags);
|
||||
set_bit(DM_CRYPT_WRITE_INLINE, &cc->flags);
|
||||
|
||||
/*
|
||||
* All zone append writes to a zone of a zoned block device will
|
||||
* have the same BIO sector, the start of the zone. When the
|
||||
* cypher IV mode uses sector values, all data targeting a
|
||||
* zone will be encrypted using the first sector numbers of the
|
||||
* zone. This will not result in write errors but will
|
||||
* cause most reads to fail as reads will use the sector values
|
||||
* for the actual data locations, resulting in IV mismatch.
|
||||
* To avoid this problem, ask DM core to emulate zone append
|
||||
* operations with regular writes.
|
||||
*/
|
||||
DMDEBUG("Zone append operations will be emulated");
|
||||
ti->emulate_zone_append = true;
|
||||
}
|
||||
|
||||
if (crypt_integrity_aead(cc) || cc->integrity_iv_size) {
|
||||
|
@ -363,28 +363,32 @@ static void ws_unpack(const struct writeset_disk *disk, struct writeset_metadata
|
||||
core->root = le64_to_cpu(disk->root);
|
||||
}
|
||||
|
||||
static void ws_inc(void *context, const void *value)
|
||||
static void ws_inc(void *context, const void *value, unsigned count)
|
||||
{
|
||||
struct era_metadata *md = context;
|
||||
struct writeset_disk ws_d;
|
||||
dm_block_t b;
|
||||
unsigned i;
|
||||
|
||||
memcpy(&ws_d, value, sizeof(ws_d));
|
||||
b = le64_to_cpu(ws_d.root);
|
||||
|
||||
dm_tm_inc(md->tm, b);
|
||||
for (i = 0; i < count; i++) {
|
||||
memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
|
||||
b = le64_to_cpu(ws_d.root);
|
||||
dm_tm_inc(md->tm, b);
|
||||
}
|
||||
}
|
||||
|
||||
static void ws_dec(void *context, const void *value)
|
||||
static void ws_dec(void *context, const void *value, unsigned count)
|
||||
{
|
||||
struct era_metadata *md = context;
|
||||
struct writeset_disk ws_d;
|
||||
dm_block_t b;
|
||||
unsigned i;
|
||||
|
||||
memcpy(&ws_d, value, sizeof(ws_d));
|
||||
b = le64_to_cpu(ws_d.root);
|
||||
|
||||
dm_bitset_del(&md->bitset_info, b);
|
||||
for (i = 0; i < count; i++) {
|
||||
memcpy(&ws_d, value + (i * sizeof(ws_d)), sizeof(ws_d));
|
||||
b = le64_to_cpu(ws_d.root);
|
||||
dm_bitset_del(&md->bitset_info, b);
|
||||
}
|
||||
}
|
||||
|
||||
static int ws_eq(void *context, const void *value1, const void *value2)
|
||||
|
@ -463,11 +463,10 @@ static int flakey_report_zones(struct dm_target *ti,
|
||||
struct dm_report_zones_args *args, unsigned int nr_zones)
|
||||
{
|
||||
struct flakey_c *fc = ti->private;
|
||||
sector_t sector = flakey_map_sector(ti, args->next_sector);
|
||||
|
||||
args->start = fc->start;
|
||||
return blkdev_report_zones(fc->dev->bdev, sector, nr_zones,
|
||||
dm_report_zones_cb, args);
|
||||
return dm_report_zones(fc->dev->bdev, fc->start,
|
||||
flakey_map_sector(ti, args->next_sector),
|
||||
args, nr_zones);
|
||||
}
|
||||
#else
|
||||
#define flakey_report_zones NULL
|
||||
|
81
drivers/md/dm-io-tracker.h
Normal file
81
drivers/md/dm-io-tracker.h
Normal file
@ -0,0 +1,81 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Red Hat, Inc. All rights reserved.
|
||||
*
|
||||
* This file is released under the GPL.
|
||||
*/
|
||||
|
||||
#ifndef DM_IO_TRACKER_H
|
||||
#define DM_IO_TRACKER_H
|
||||
|
||||
#include <linux/jiffies.h>
|
||||
|
||||
struct dm_io_tracker {
|
||||
spinlock_t lock;
|
||||
|
||||
/*
|
||||
* Sectors of in-flight IO.
|
||||
*/
|
||||
sector_t in_flight;
|
||||
|
||||
/*
|
||||
* The time, in jiffies, when this device became idle
|
||||
* (if it is indeed idle).
|
||||
*/
|
||||
unsigned long idle_time;
|
||||
unsigned long last_update_time;
|
||||
};
|
||||
|
||||
static inline void dm_iot_init(struct dm_io_tracker *iot)
|
||||
{
|
||||
spin_lock_init(&iot->lock);
|
||||
iot->in_flight = 0ul;
|
||||
iot->idle_time = 0ul;
|
||||
iot->last_update_time = jiffies;
|
||||
}
|
||||
|
||||
static inline bool dm_iot_idle_for(struct dm_io_tracker *iot, unsigned long j)
|
||||
{
|
||||
bool r = false;
|
||||
|
||||
spin_lock_irq(&iot->lock);
|
||||
if (!iot->in_flight)
|
||||
r = time_after(jiffies, iot->idle_time + j);
|
||||
spin_unlock_irq(&iot->lock);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline unsigned long dm_iot_idle_time(struct dm_io_tracker *iot)
|
||||
{
|
||||
unsigned long r = 0;
|
||||
|
||||
spin_lock_irq(&iot->lock);
|
||||
if (!iot->in_flight)
|
||||
r = jiffies - iot->idle_time;
|
||||
spin_unlock_irq(&iot->lock);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static inline void dm_iot_io_begin(struct dm_io_tracker *iot, sector_t len)
|
||||
{
|
||||
spin_lock_irq(&iot->lock);
|
||||
iot->in_flight += len;
|
||||
spin_unlock_irq(&iot->lock);
|
||||
}
|
||||
|
||||
static inline void dm_iot_io_end(struct dm_io_tracker *iot, sector_t len)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
if (!len)
|
||||
return;
|
||||
|
||||
spin_lock_irqsave(&iot->lock, flags);
|
||||
iot->in_flight -= len;
|
||||
if (!iot->in_flight)
|
||||
iot->idle_time = jiffies;
|
||||
spin_unlock_irqrestore(&iot->lock, flags);
|
||||
}
|
||||
|
||||
#endif
|
@ -341,7 +341,7 @@ static void client_free_pages(struct dm_kcopyd_client *kc)
|
||||
struct kcopyd_job {
|
||||
struct dm_kcopyd_client *kc;
|
||||
struct list_head list;
|
||||
unsigned long flags;
|
||||
unsigned flags;
|
||||
|
||||
/*
|
||||
* Error state of the job.
|
||||
@ -418,7 +418,7 @@ static struct kcopyd_job *pop_io_job(struct list_head *jobs,
|
||||
* constraint and sequential writes that are at the right position.
|
||||
*/
|
||||
list_for_each_entry(job, jobs, list) {
|
||||
if (job->rw == READ || !test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
|
||||
if (job->rw == READ || !(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
|
||||
list_del(&job->list);
|
||||
return job;
|
||||
}
|
||||
@ -437,9 +437,8 @@ static struct kcopyd_job *pop(struct list_head *jobs,
|
||||
struct dm_kcopyd_client *kc)
|
||||
{
|
||||
struct kcopyd_job *job = NULL;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&kc->job_lock, flags);
|
||||
spin_lock_irq(&kc->job_lock);
|
||||
|
||||
if (!list_empty(jobs)) {
|
||||
if (jobs == &kc->io_jobs)
|
||||
@ -449,7 +448,7 @@ static struct kcopyd_job *pop(struct list_head *jobs,
|
||||
list_del(&job->list);
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&kc->job_lock, flags);
|
||||
spin_unlock_irq(&kc->job_lock);
|
||||
|
||||
return job;
|
||||
}
|
||||
@ -467,12 +466,11 @@ static void push(struct list_head *jobs, struct kcopyd_job *job)
|
||||
|
||||
static void push_head(struct list_head *jobs, struct kcopyd_job *job)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct dm_kcopyd_client *kc = job->kc;
|
||||
|
||||
spin_lock_irqsave(&kc->job_lock, flags);
|
||||
spin_lock_irq(&kc->job_lock);
|
||||
list_add(&job->list, jobs);
|
||||
spin_unlock_irqrestore(&kc->job_lock, flags);
|
||||
spin_unlock_irq(&kc->job_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -525,7 +523,7 @@ static void complete_io(unsigned long error, void *context)
|
||||
else
|
||||
job->read_err = 1;
|
||||
|
||||
if (!test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
|
||||
if (!(job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))) {
|
||||
push(&kc->complete_jobs, job);
|
||||
wake(kc);
|
||||
return;
|
||||
@ -565,7 +563,7 @@ static int run_io_job(struct kcopyd_job *job)
|
||||
* If we need to write sequentially and some reads or writes failed,
|
||||
* no point in continuing.
|
||||
*/
|
||||
if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
|
||||
if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
|
||||
job->master_job->write_err) {
|
||||
job->write_err = job->master_job->write_err;
|
||||
return -EIO;
|
||||
@ -648,7 +646,6 @@ static void do_work(struct work_struct *work)
|
||||
struct dm_kcopyd_client *kc = container_of(work,
|
||||
struct dm_kcopyd_client, kcopyd_work);
|
||||
struct blk_plug plug;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* The order that these are called is *very* important.
|
||||
@ -657,9 +654,9 @@ static void do_work(struct work_struct *work)
|
||||
* list. io jobs call wake when they complete and it all
|
||||
* starts again.
|
||||
*/
|
||||
spin_lock_irqsave(&kc->job_lock, flags);
|
||||
spin_lock_irq(&kc->job_lock);
|
||||
list_splice_tail_init(&kc->callback_jobs, &kc->complete_jobs);
|
||||
spin_unlock_irqrestore(&kc->job_lock, flags);
|
||||
spin_unlock_irq(&kc->job_lock);
|
||||
|
||||
blk_start_plug(&plug);
|
||||
process_jobs(&kc->complete_jobs, kc, run_complete_job);
|
||||
@ -709,7 +706,7 @@ static void segment_complete(int read_err, unsigned long write_err,
|
||||
* Only dispatch more work if there hasn't been an error.
|
||||
*/
|
||||
if ((!job->read_err && !job->write_err) ||
|
||||
test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags)) {
|
||||
job->flags & BIT(DM_KCOPYD_IGNORE_ERROR)) {
|
||||
/* get the next chunk of work */
|
||||
progress = job->progress;
|
||||
count = job->source.count - progress;
|
||||
@ -801,10 +798,10 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
|
||||
* we need to write sequentially. If one of the destination is a
|
||||
* host-aware device, then leave it to the caller to choose what to do.
|
||||
*/
|
||||
if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
|
||||
if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
|
||||
for (i = 0; i < job->num_dests; i++) {
|
||||
if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
|
||||
set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags);
|
||||
job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -813,9 +810,9 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
|
||||
/*
|
||||
* If we need to write sequentially, errors cannot be ignored.
|
||||
*/
|
||||
if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
|
||||
test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags))
|
||||
clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags);
|
||||
if (job->flags & BIT(DM_KCOPYD_WRITE_SEQ) &&
|
||||
job->flags & BIT(DM_KCOPYD_IGNORE_ERROR))
|
||||
job->flags &= ~BIT(DM_KCOPYD_IGNORE_ERROR);
|
||||
|
||||
if (from) {
|
||||
job->source = *from;
|
||||
@ -983,3 +980,9 @@ void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc)
|
||||
kfree(kc);
|
||||
}
|
||||
EXPORT_SYMBOL(dm_kcopyd_client_destroy);
|
||||
|
||||
void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc)
|
||||
{
|
||||
flush_workqueue(kc->kcopyd_wq);
|
||||
}
|
||||
EXPORT_SYMBOL(dm_kcopyd_client_flush);
|
||||
|
@ -140,11 +140,10 @@ static int linear_report_zones(struct dm_target *ti,
|
||||
struct dm_report_zones_args *args, unsigned int nr_zones)
|
||||
{
|
||||
struct linear_c *lc = ti->private;
|
||||
sector_t sector = linear_map_sector(ti, args->next_sector);
|
||||
|
||||
args->start = lc->start;
|
||||
return blkdev_report_zones(lc->dev->bdev, sector, nr_zones,
|
||||
dm_report_zones_cb, args);
|
||||
return dm_report_zones(lc->dev->bdev, lc->start,
|
||||
linear_map_sector(ti, args->next_sector),
|
||||
args, nr_zones);
|
||||
}
|
||||
#else
|
||||
#define linear_report_zones NULL
|
||||
|
@ -91,7 +91,6 @@ static int ioa_add_path(struct path_selector *ps, struct dm_path *path,
|
||||
cpumask_set_cpu(cpu, s->path_mask);
|
||||
s->path_map[cpu] = pi;
|
||||
refcount_inc(&pi->refcount);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (refcount_dec_and_test(&pi->refcount)) {
|
||||
|
@ -364,7 +364,7 @@ static void recover(struct mirror_set *ms, struct dm_region *reg)
|
||||
|
||||
/* hand to kcopyd */
|
||||
if (!errors_handled(ms))
|
||||
set_bit(DM_KCOPYD_IGNORE_ERROR, &flags);
|
||||
flags |= BIT(DM_KCOPYD_IGNORE_ERROR);
|
||||
|
||||
dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
|
||||
flags, recovery_complete, reg);
|
||||
|
@ -249,7 +249,7 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
|
||||
* If the target is mapped to zoned block device(s), check
|
||||
* that the zones are not partially mapped.
|
||||
*/
|
||||
if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) {
|
||||
if (bdev_is_zoned(bdev)) {
|
||||
unsigned int zone_sectors = bdev_zone_sectors(bdev);
|
||||
|
||||
if (start & (zone_sectors - 1)) {
|
||||
@ -1244,7 +1244,7 @@ static int dm_keyslot_evict(struct blk_keyslot_manager *ksm,
|
||||
return args.err;
|
||||
}
|
||||
|
||||
static struct blk_ksm_ll_ops dm_ksm_ll_ops = {
|
||||
static const struct blk_ksm_ll_ops dm_ksm_ll_ops = {
|
||||
.keyslot_evict = dm_keyslot_evict,
|
||||
};
|
||||
|
||||
@ -1981,11 +1981,12 @@ static int device_requires_stable_pages(struct dm_target *ti,
|
||||
return blk_queue_stable_writes(q);
|
||||
}
|
||||
|
||||
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
|
||||
struct queue_limits *limits)
|
||||
int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
|
||||
struct queue_limits *limits)
|
||||
{
|
||||
bool wc = false, fua = false;
|
||||
int page_size = PAGE_SIZE;
|
||||
int r;
|
||||
|
||||
/*
|
||||
* Copy table's limits to the DM device's request_queue
|
||||
@ -2065,19 +2066,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
|
||||
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
|
||||
|
||||
/*
|
||||
* For a zoned target, the number of zones should be updated for the
|
||||
* correct value to be exposed in sysfs queue/nr_zones. For a BIO based
|
||||
* target, this is all that is needed.
|
||||
* For a zoned target, setup the zones related queue attributes
|
||||
* and resources necessary for zone append emulation if necessary.
|
||||
*/
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
if (blk_queue_is_zoned(q)) {
|
||||
WARN_ON_ONCE(queue_is_mq(q));
|
||||
q->nr_zones = blkdev_nr_zones(t->md->disk);
|
||||
r = dm_set_zones_restrictions(t, q);
|
||||
if (r)
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
|
||||
dm_update_keyslot_manager(q, t);
|
||||
blk_queue_update_readahead(q);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
unsigned int dm_table_get_num_targets(struct dm_table *t)
|
||||
|
@ -311,28 +311,53 @@ static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
|
||||
*t = v & ((1 << 24) - 1);
|
||||
}
|
||||
|
||||
static void data_block_inc(void *context, const void *value_le)
|
||||
{
|
||||
struct dm_space_map *sm = context;
|
||||
__le64 v_le;
|
||||
uint64_t b;
|
||||
uint32_t t;
|
||||
/*
|
||||
* It's more efficient to call dm_sm_{inc,dec}_blocks as few times as
|
||||
* possible. 'with_runs' reads contiguous runs of blocks, and calls the
|
||||
* given sm function.
|
||||
*/
|
||||
typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);
|
||||
|
||||
memcpy(&v_le, value_le, sizeof(v_le));
|
||||
unpack_block_time(le64_to_cpu(v_le), &b, &t);
|
||||
dm_sm_inc_block(sm, b);
|
||||
/*
 * Unpack 'count' consecutive packed block/time values starting at value_le
 * and call 'fn' once for every maximal run of consecutive block numbers,
 * passing the run as the half-open range [first, past_last).  The time part
 * of each value is unpacked but otherwise unused, and the value returned by
 * 'fn' is not examined.
 */
static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned count, run_fn fn)
{
	uint64_t blk, first, past_last;
	uint32_t time;
	bool have_run = false;
	unsigned n;

	for (n = 0; n < count; n++) {
		/* We know value_le is 8 byte aligned */
		unpack_block_time(le64_to_cpu(value_le[n]), &blk, &time);

		/* Block extends the run currently being collected. */
		if (have_run && blk == past_last) {
			past_last++;
			continue;
		}

		/* Block breaks the current run: hand it over before restarting. */
		if (have_run)
			fn(sm, first, past_last);

		have_run = true;
		first = blk;
		past_last = blk + 1;
	}

	/* Flush the trailing run, if any value was seen at all. */
	if (have_run)
		fn(sm, first, past_last);
}
|
||||
|
||||
static void data_block_dec(void *context, const void *value_le)
|
||||
static void data_block_inc(void *context, const void *value_le, unsigned count)
|
||||
{
|
||||
struct dm_space_map *sm = context;
|
||||
__le64 v_le;
|
||||
uint64_t b;
|
||||
uint32_t t;
|
||||
with_runs((struct dm_space_map *) context,
|
||||
(const __le64 *) value_le, count, dm_sm_inc_blocks);
|
||||
}
|
||||
|
||||
memcpy(&v_le, value_le, sizeof(v_le));
|
||||
unpack_block_time(le64_to_cpu(v_le), &b, &t);
|
||||
dm_sm_dec_block(sm, b);
|
||||
static void data_block_dec(void *context, const void *value_le, unsigned count)
|
||||
{
|
||||
with_runs((struct dm_space_map *) context,
|
||||
(const __le64 *) value_le, count, dm_sm_dec_blocks);
|
||||
}
|
||||
|
||||
static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
|
||||
@ -349,27 +374,25 @@ static int data_block_equal(void *context, const void *value1_le, const void *va
|
||||
return b1 == b2;
|
||||
}
|
||||
|
||||
static void subtree_inc(void *context, const void *value)
|
||||
static void subtree_inc(void *context, const void *value, unsigned count)
|
||||
{
|
||||
struct dm_btree_info *info = context;
|
||||
__le64 root_le;
|
||||
uint64_t root;
|
||||
const __le64 *root_le = value;
|
||||
unsigned i;
|
||||
|
||||
memcpy(&root_le, value, sizeof(root_le));
|
||||
root = le64_to_cpu(root_le);
|
||||
dm_tm_inc(info->tm, root);
|
||||
for (i = 0; i < count; i++, root_le++)
|
||||
dm_tm_inc(info->tm, le64_to_cpu(*root_le));
|
||||
}
|
||||
|
||||
static void subtree_dec(void *context, const void *value)
|
||||
static void subtree_dec(void *context, const void *value, unsigned count)
|
||||
{
|
||||
struct dm_btree_info *info = context;
|
||||
__le64 root_le;
|
||||
uint64_t root;
|
||||
const __le64 *root_le = value;
|
||||
unsigned i;
|
||||
|
||||
memcpy(&root_le, value, sizeof(root_le));
|
||||
root = le64_to_cpu(root_le);
|
||||
if (dm_btree_del(info, root))
|
||||
DMERR("btree delete failed");
|
||||
for (i = 0; i < count; i++, root_le++)
|
||||
if (dm_btree_del(info, le64_to_cpu(*root_le)))
|
||||
DMERR("btree delete failed");
|
||||
}
|
||||
|
||||
static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
|
||||
@ -1761,11 +1784,7 @@ int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
|
||||
int r = 0;
|
||||
|
||||
pmd_write_lock(pmd);
|
||||
for (; b != e; b++) {
|
||||
r = dm_sm_inc_block(pmd->data_sm, b);
|
||||
if (r)
|
||||
break;
|
||||
}
|
||||
r = dm_sm_inc_blocks(pmd->data_sm, b, e);
|
||||
pmd_write_unlock(pmd);
|
||||
|
||||
return r;
|
||||
@ -1776,11 +1795,7 @@ int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_
|
||||
int r = 0;
|
||||
|
||||
pmd_write_lock(pmd);
|
||||
for (; b != e; b++) {
|
||||
r = dm_sm_dec_block(pmd->data_sm, b);
|
||||
if (r)
|
||||
break;
|
||||
}
|
||||
r = dm_sm_dec_blocks(pmd->data_sm, b, e);
|
||||
pmd_write_unlock(pmd);
|
||||
|
||||
return r;
|
||||
|
@ -15,6 +15,8 @@
|
||||
#include <linux/dax.h>
|
||||
#include <linux/pfn_t.h>
|
||||
#include <linux/libnvdimm.h>
|
||||
#include <linux/delay.h>
|
||||
#include "dm-io-tracker.h"
|
||||
|
||||
#define DM_MSG_PREFIX "writecache"
|
||||
|
||||
@ -28,6 +30,7 @@
|
||||
#define AUTOCOMMIT_MSEC 1000
|
||||
#define MAX_AGE_DIV 16
|
||||
#define MAX_AGE_UNSPECIFIED -1UL
|
||||
#define PAUSE_WRITEBACK (HZ * 3)
|
||||
|
||||
#define BITMAP_GRANULARITY 65536
|
||||
#if BITMAP_GRANULARITY < PAGE_SIZE
|
||||
@ -123,6 +126,7 @@ struct dm_writecache {
|
||||
size_t freelist_high_watermark;
|
||||
size_t freelist_low_watermark;
|
||||
unsigned long max_age;
|
||||
unsigned long pause;
|
||||
|
||||
unsigned uncommitted_blocks;
|
||||
unsigned autocommit_blocks;
|
||||
@ -171,17 +175,22 @@ struct dm_writecache {
|
||||
bool flush_on_suspend:1;
|
||||
bool cleaner:1;
|
||||
bool cleaner_set:1;
|
||||
bool metadata_only:1;
|
||||
bool pause_set:1;
|
||||
|
||||
unsigned high_wm_percent_value;
|
||||
unsigned low_wm_percent_value;
|
||||
unsigned autocommit_time_value;
|
||||
unsigned max_age_value;
|
||||
unsigned pause_value;
|
||||
|
||||
unsigned writeback_all;
|
||||
struct workqueue_struct *writeback_wq;
|
||||
struct work_struct writeback_work;
|
||||
struct work_struct flush_work;
|
||||
|
||||
struct dm_io_tracker iot;
|
||||
|
||||
struct dm_io_client *dm_io;
|
||||
|
||||
raw_spinlock_t endio_list_lock;
|
||||
@ -532,7 +541,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc)
|
||||
|
||||
region.bdev = wc->ssd_dev->bdev;
|
||||
region.sector = 0;
|
||||
region.count = PAGE_SIZE >> SECTOR_SHIFT;
|
||||
region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
|
||||
|
||||
if (unlikely(region.sector + region.count > wc->metadata_sectors))
|
||||
region.count = wc->metadata_sectors - region.sector;
|
||||
@ -1301,8 +1310,12 @@ static int writecache_map(struct dm_target *ti, struct bio *bio)
|
||||
writecache_flush(wc);
|
||||
if (writecache_has_error(wc))
|
||||
goto unlock_error;
|
||||
if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
|
||||
goto unlock_remap_origin;
|
||||
goto unlock_submit;
|
||||
} else {
|
||||
if (dm_bio_get_target_bio_nr(bio))
|
||||
goto unlock_remap_origin;
|
||||
writecache_offload_bio(wc, bio);
|
||||
goto unlock_return;
|
||||
}
|
||||
@ -1360,24 +1373,29 @@ read_next_block:
|
||||
} else {
|
||||
do {
|
||||
bool found_entry = false;
|
||||
bool search_used = false;
|
||||
if (writecache_has_error(wc))
|
||||
goto unlock_error;
|
||||
e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
|
||||
if (e) {
|
||||
if (!writecache_entry_is_committed(wc, e))
|
||||
if (!writecache_entry_is_committed(wc, e)) {
|
||||
search_used = true;
|
||||
goto bio_copy;
|
||||
}
|
||||
if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
|
||||
wc->overwrote_committed = true;
|
||||
search_used = true;
|
||||
goto bio_copy;
|
||||
}
|
||||
found_entry = true;
|
||||
} else {
|
||||
if (unlikely(wc->cleaner))
|
||||
if (unlikely(wc->cleaner) ||
|
||||
(wc->metadata_only && !(bio->bi_opf & REQ_META)))
|
||||
goto direct_write;
|
||||
}
|
||||
e = writecache_pop_from_freelist(wc, (sector_t)-1);
|
||||
if (unlikely(!e)) {
|
||||
if (!found_entry) {
|
||||
if (!WC_MODE_PMEM(wc) && !found_entry) {
|
||||
direct_write:
|
||||
e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
|
||||
if (e) {
|
||||
@ -1404,13 +1422,31 @@ bio_copy:
|
||||
sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
|
||||
|
||||
while (bio_size < bio->bi_iter.bi_size) {
|
||||
struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
|
||||
if (!f)
|
||||
break;
|
||||
write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
|
||||
(bio_size >> SECTOR_SHIFT), wc->seq_count);
|
||||
writecache_insert_entry(wc, f);
|
||||
wc->uncommitted_blocks++;
|
||||
if (!search_used) {
|
||||
struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
|
||||
if (!f)
|
||||
break;
|
||||
write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
|
||||
(bio_size >> SECTOR_SHIFT), wc->seq_count);
|
||||
writecache_insert_entry(wc, f);
|
||||
wc->uncommitted_blocks++;
|
||||
} else {
|
||||
struct wc_entry *f;
|
||||
struct rb_node *next = rb_next(&e->rb_node);
|
||||
if (!next)
|
||||
break;
|
||||
f = container_of(next, struct wc_entry, rb_node);
|
||||
if (f != e + 1)
|
||||
break;
|
||||
if (read_original_sector(wc, f) !=
|
||||
read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
|
||||
break;
|
||||
if (unlikely(f->write_in_progress))
|
||||
break;
|
||||
if (writecache_entry_is_committed(wc, f))
|
||||
wc->overwrote_committed = true;
|
||||
e = f;
|
||||
}
|
||||
bio_size += wc->block_size;
|
||||
current_cache_sec += wc->block_size >> SECTOR_SHIFT;
|
||||
}
|
||||
@ -1438,6 +1474,12 @@ bio_copy:
|
||||
}
|
||||
|
||||
unlock_remap_origin:
|
||||
if (likely(wc->pause != 0)) {
|
||||
if (bio_op(bio) == REQ_OP_WRITE) {
|
||||
dm_iot_io_begin(&wc->iot, 1);
|
||||
bio->bi_private = (void *)2;
|
||||
}
|
||||
}
|
||||
bio_set_dev(bio, wc->dev->bdev);
|
||||
wc_unlock(wc);
|
||||
return DM_MAPIO_REMAPPED;
|
||||
@ -1468,11 +1510,13 @@ static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t
|
||||
{
|
||||
struct dm_writecache *wc = ti->private;
|
||||
|
||||
if (bio->bi_private != NULL) {
|
||||
if (bio->bi_private == (void *)1) {
|
||||
int dir = bio_data_dir(bio);
|
||||
if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
|
||||
if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
|
||||
wake_up(&wc->bio_in_progress_wait[dir]);
|
||||
} else if (bio->bi_private == (void *)2) {
|
||||
dm_iot_io_end(&wc->iot, 1);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -1642,7 +1686,7 @@ pop_from_list:
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
|
||||
static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
|
||||
{
|
||||
struct dm_writecache *wc = wb->wc;
|
||||
unsigned block_size = wc->block_size;
|
||||
@ -1703,7 +1747,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
|
||||
max_pages = WB_LIST_INLINE;
|
||||
}
|
||||
|
||||
BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
|
||||
BUG_ON(!wc_add_block(wb, e));
|
||||
|
||||
wb->wc_list[0] = e;
|
||||
wb->wc_list_n = 1;
|
||||
@ -1713,7 +1757,7 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba
|
||||
if (read_original_sector(wc, f) !=
|
||||
read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
|
||||
break;
|
||||
if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
|
||||
if (!wc_add_block(wb, f))
|
||||
break;
|
||||
wbl->size--;
|
||||
list_del(&f->lru);
|
||||
@ -1794,6 +1838,27 @@ static void writecache_writeback(struct work_struct *work)
|
||||
struct writeback_list wbl;
|
||||
unsigned long n_walked;
|
||||
|
||||
if (!WC_MODE_PMEM(wc)) {
|
||||
/* Wait for any active kcopyd work on behalf of ssd writeback */
|
||||
dm_kcopyd_client_flush(wc->dm_kcopyd);
|
||||
}
|
||||
|
||||
if (likely(wc->pause != 0)) {
|
||||
while (1) {
|
||||
unsigned long idle;
|
||||
if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
|
||||
unlikely(dm_suspended(wc->ti)))
|
||||
break;
|
||||
idle = dm_iot_idle_time(&wc->iot);
|
||||
if (idle >= wc->pause)
|
||||
break;
|
||||
idle = wc->pause - idle;
|
||||
if (idle > HZ)
|
||||
idle = HZ;
|
||||
schedule_timeout_idle(idle);
|
||||
}
|
||||
}
|
||||
|
||||
wc_lock(wc);
|
||||
restart:
|
||||
if (writecache_has_error(wc)) {
|
||||
@ -1822,8 +1887,9 @@ restart:
|
||||
|
||||
n_walked++;
|
||||
if (unlikely(n_walked > WRITEBACK_LATENCY) &&
|
||||
likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) {
|
||||
queue_work(wc->writeback_wq, &wc->writeback_work);
|
||||
likely(!wc->writeback_all)) {
|
||||
if (likely(!dm_suspended(wc->ti)))
|
||||
queue_work(wc->writeback_wq, &wc->writeback_work);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1845,15 +1911,13 @@ restart:
|
||||
if (unlikely(read_original_sector(wc, f) ==
|
||||
read_original_sector(wc, e))) {
|
||||
BUG_ON(!f->write_in_progress);
|
||||
list_del(&e->lru);
|
||||
list_add(&e->lru, &skipped);
|
||||
list_move(&e->lru, &skipped);
|
||||
cond_resched();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
wc->writeback_size++;
|
||||
list_del(&e->lru);
|
||||
list_add(&e->lru, &wbl.list);
|
||||
list_move(&e->lru, &wbl.list);
|
||||
wbl.size++;
|
||||
e->write_in_progress = true;
|
||||
e->wc_list_contiguous = 1;
|
||||
@ -1888,8 +1952,7 @@ restart:
|
||||
// break;
|
||||
|
||||
wc->writeback_size++;
|
||||
list_del(&g->lru);
|
||||
list_add(&g->lru, &wbl.list);
|
||||
list_move(&g->lru, &wbl.list);
|
||||
wbl.size++;
|
||||
g->write_in_progress = true;
|
||||
g->wc_list_contiguous = BIO_MAX_VECS;
|
||||
@ -2065,7 +2128,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
|
||||
struct wc_memory_superblock s;
|
||||
|
||||
static struct dm_arg _args[] = {
|
||||
{0, 16, "Invalid number of feature args"},
|
||||
{0, 18, "Invalid number of feature args"},
|
||||
};
|
||||
|
||||
as.argc = argc;
|
||||
@ -2109,6 +2172,8 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
|
||||
INIT_WORK(&wc->writeback_work, writecache_writeback);
|
||||
INIT_WORK(&wc->flush_work, writecache_flush_work);
|
||||
|
||||
dm_iot_init(&wc->iot);
|
||||
|
||||
raw_spin_lock_init(&wc->endio_list_lock);
|
||||
INIT_LIST_HEAD(&wc->endio_list);
|
||||
wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
|
||||
@ -2156,6 +2221,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
|
||||
goto bad;
|
||||
}
|
||||
} else {
|
||||
wc->pause = PAUSE_WRITEBACK;
|
||||
r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
|
||||
if (r) {
|
||||
ti->error = "Could not allocate mempool";
|
||||
@ -2292,6 +2358,20 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
|
||||
wc->writeback_fua = false;
|
||||
wc->writeback_fua_set = true;
|
||||
} else goto invalid_optional;
|
||||
} else if (!strcasecmp(string, "metadata_only")) {
|
||||
wc->metadata_only = true;
|
||||
} else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
|
||||
unsigned pause_msecs;
|
||||
if (WC_MODE_PMEM(wc))
|
||||
goto invalid_optional;
|
||||
string = dm_shift_arg(&as), opt_params--;
|
||||
if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
|
||||
goto invalid_optional;
|
||||
if (pause_msecs > 60000)
|
||||
goto invalid_optional;
|
||||
wc->pause = msecs_to_jiffies(pause_msecs);
|
||||
wc->pause_set = true;
|
||||
wc->pause_value = pause_msecs;
|
||||
} else {
|
||||
invalid_optional:
|
||||
r = -EINVAL;
|
||||
@ -2463,7 +2543,7 @@ overflow:
|
||||
goto bad;
|
||||
}
|
||||
|
||||
ti->num_flush_bios = 1;
|
||||
ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
|
||||
ti->flush_supported = true;
|
||||
ti->num_discard_bios = 1;
|
||||
|
||||
@ -2515,6 +2595,10 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
|
||||
extra_args++;
|
||||
if (wc->writeback_fua_set)
|
||||
extra_args++;
|
||||
if (wc->metadata_only)
|
||||
extra_args++;
|
||||
if (wc->pause_set)
|
||||
extra_args += 2;
|
||||
|
||||
DMEMIT("%u", extra_args);
|
||||
if (wc->start_sector_set)
|
||||
@ -2535,13 +2619,17 @@ static void writecache_status(struct dm_target *ti, status_type_t type,
|
||||
DMEMIT(" cleaner");
|
||||
if (wc->writeback_fua_set)
|
||||
DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
|
||||
if (wc->metadata_only)
|
||||
DMEMIT(" metadata_only");
|
||||
if (wc->pause_set)
|
||||
DMEMIT(" pause_writeback %u", wc->pause_value);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static struct target_type writecache_target = {
|
||||
.name = "writecache",
|
||||
.version = {1, 4, 0},
|
||||
.version = {1, 5, 0},
|
||||
.module = THIS_MODULE,
|
||||
.ctr = writecache_ctr,
|
||||
.dtr = writecache_dtr,
|
||||
|
660
drivers/md/dm-zone.c
Normal file
660
drivers/md/dm-zone.c
Normal file
@ -0,0 +1,660 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
/*
|
||||
* Copyright (C) 2021 Western Digital Corporation or its affiliates.
|
||||
*/
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/slab.h>
|
||||
|
||||
#include "dm-core.h"
|
||||
|
||||
#define DM_MSG_PREFIX "zone"
|
||||
|
||||
#define DM_ZONE_INVALID_WP_OFST UINT_MAX
|
||||
|
||||
/*
|
||||
* For internal zone reports bypassing the top BIO submission path.
|
||||
*/
|
||||
static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
|
||||
sector_t sector, unsigned int nr_zones,
|
||||
report_zones_cb cb, void *data)
|
||||
{
|
||||
struct gendisk *disk = md->disk;
|
||||
int ret;
|
||||
struct dm_report_zones_args args = {
|
||||
.next_sector = sector,
|
||||
.orig_data = data,
|
||||
.orig_cb = cb,
|
||||
};
|
||||
|
||||
do {
|
||||
struct dm_target *tgt;
|
||||
|
||||
tgt = dm_table_find_target(t, args.next_sector);
|
||||
if (WARN_ON_ONCE(!tgt->type->report_zones))
|
||||
return -EIO;
|
||||
|
||||
args.tgt = tgt;
|
||||
ret = tgt->type->report_zones(tgt, &args,
|
||||
nr_zones - args.zone_idx);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
} while (args.zone_idx < nr_zones &&
|
||||
args.next_sector < get_capacity(disk));
|
||||
|
||||
return args.zone_idx;
|
||||
}
|
||||
|
||||
/*
|
||||
* User facing dm device block device report zone operation. This calls the
|
||||
* report_zones operation for each target of a device table. This operation is
|
||||
* generally implemented by targets using dm_report_zones().
|
||||
*/
|
||||
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
|
||||
unsigned int nr_zones, report_zones_cb cb, void *data)
|
||||
{
|
||||
struct mapped_device *md = disk->private_data;
|
||||
struct dm_table *map;
|
||||
int srcu_idx, ret;
|
||||
|
||||
if (dm_suspended_md(md))
|
||||
return -EAGAIN;
|
||||
|
||||
map = dm_get_live_table(md, &srcu_idx);
|
||||
if (!map)
|
||||
return -EIO;
|
||||
|
||||
ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
|
||||
|
||||
dm_put_live_table(md, srcu_idx);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
|
||||
void *data)
|
||||
{
|
||||
struct dm_report_zones_args *args = data;
|
||||
sector_t sector_diff = args->tgt->begin - args->start;
|
||||
|
||||
/*
|
||||
* Ignore zones beyond the target range.
|
||||
*/
|
||||
if (zone->start >= args->start + args->tgt->len)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Remap the start sector and write pointer position of the zone
|
||||
* to match its position in the target range.
|
||||
*/
|
||||
zone->start += sector_diff;
|
||||
if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
|
||||
if (zone->cond == BLK_ZONE_COND_FULL)
|
||||
zone->wp = zone->start + zone->len;
|
||||
else if (zone->cond == BLK_ZONE_COND_EMPTY)
|
||||
zone->wp = zone->start;
|
||||
else
|
||||
zone->wp += sector_diff;
|
||||
}
|
||||
|
||||
args->next_sector = zone->start + zone->len;
|
||||
return args->orig_cb(zone, args->zone_idx++, args->orig_data);
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper for drivers of zoned targets to implement struct target_type
|
||||
* report_zones operation.
|
||||
*/
|
||||
int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
|
||||
struct dm_report_zones_args *args, unsigned int nr_zones)
|
||||
{
|
||||
/*
|
||||
* Set the target mapping start sector first so that
|
||||
* dm_report_zones_cb() can correctly remap zone information.
|
||||
*/
|
||||
args->start = start;
|
||||
|
||||
return blkdev_report_zones(bdev, sector, nr_zones,
|
||||
dm_report_zones_cb, args);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dm_report_zones);
|
||||
|
||||
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
|
||||
{
|
||||
struct request_queue *q = md->queue;
|
||||
|
||||
if (!blk_queue_is_zoned(q))
|
||||
return false;
|
||||
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
case REQ_OP_WRITE_SAME:
|
||||
case REQ_OP_WRITE:
|
||||
return !op_is_flush(bio->bi_opf) && bio_sectors(bio);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void dm_cleanup_zoned_dev(struct mapped_device *md)
|
||||
{
|
||||
struct request_queue *q = md->queue;
|
||||
|
||||
if (q) {
|
||||
kfree(q->conv_zones_bitmap);
|
||||
q->conv_zones_bitmap = NULL;
|
||||
kfree(q->seq_zones_wlock);
|
||||
q->seq_zones_wlock = NULL;
|
||||
}
|
||||
|
||||
kvfree(md->zwp_offset);
|
||||
md->zwp_offset = NULL;
|
||||
md->nr_zones = 0;
|
||||
}
|
||||
|
||||
static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
|
||||
{
|
||||
switch (zone->cond) {
|
||||
case BLK_ZONE_COND_IMP_OPEN:
|
||||
case BLK_ZONE_COND_EXP_OPEN:
|
||||
case BLK_ZONE_COND_CLOSED:
|
||||
return zone->wp - zone->start;
|
||||
case BLK_ZONE_COND_FULL:
|
||||
return zone->len;
|
||||
case BLK_ZONE_COND_EMPTY:
|
||||
case BLK_ZONE_COND_NOT_WP:
|
||||
case BLK_ZONE_COND_OFFLINE:
|
||||
case BLK_ZONE_COND_READONLY:
|
||||
default:
|
||||
/*
|
||||
* Conventional, offline and read-only zones do not have a valid
|
||||
* write pointer. Use 0 as for an empty zone.
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
|
||||
void *data)
|
||||
{
|
||||
struct mapped_device *md = data;
|
||||
struct request_queue *q = md->queue;
|
||||
|
||||
switch (zone->type) {
|
||||
case BLK_ZONE_TYPE_CONVENTIONAL:
|
||||
if (!q->conv_zones_bitmap) {
|
||||
q->conv_zones_bitmap =
|
||||
kcalloc(BITS_TO_LONGS(q->nr_zones),
|
||||
sizeof(unsigned long), GFP_NOIO);
|
||||
if (!q->conv_zones_bitmap)
|
||||
return -ENOMEM;
|
||||
}
|
||||
set_bit(idx, q->conv_zones_bitmap);
|
||||
break;
|
||||
case BLK_ZONE_TYPE_SEQWRITE_REQ:
|
||||
case BLK_ZONE_TYPE_SEQWRITE_PREF:
|
||||
if (!q->seq_zones_wlock) {
|
||||
q->seq_zones_wlock =
|
||||
kcalloc(BITS_TO_LONGS(q->nr_zones),
|
||||
sizeof(unsigned long), GFP_NOIO);
|
||||
if (!q->seq_zones_wlock)
|
||||
return -ENOMEM;
|
||||
}
|
||||
if (!md->zwp_offset) {
|
||||
md->zwp_offset =
|
||||
kvcalloc(q->nr_zones, sizeof(unsigned int),
|
||||
GFP_KERNEL);
|
||||
if (!md->zwp_offset)
|
||||
return -ENOMEM;
|
||||
}
|
||||
md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);
|
||||
|
||||
break;
|
||||
default:
|
||||
DMERR("Invalid zone type 0x%x at sectors %llu",
|
||||
(int)zone->type, zone->start);
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Revalidate the zones of a mapped device to initialize resource necessary
|
||||
* for zone append emulation. Note that we cannot simply use the block layer
|
||||
* blk_revalidate_disk_zones() function here as the mapped device is suspended
|
||||
* (this is called from __bind() context).
|
||||
*/
|
||||
static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
|
||||
{
|
||||
struct request_queue *q = md->queue;
|
||||
unsigned int noio_flag;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Check if something changed. If yes, cleanup the current resources
|
||||
* and reallocate everything.
|
||||
*/
|
||||
if (!q->nr_zones || q->nr_zones != md->nr_zones)
|
||||
dm_cleanup_zoned_dev(md);
|
||||
if (md->nr_zones)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Scan all zones to initialize everything. Ensure that all vmalloc
|
||||
* operations in this context are done as if GFP_NOIO was specified.
|
||||
*/
|
||||
noio_flag = memalloc_noio_save();
|
||||
ret = dm_blk_do_report_zones(md, t, 0, q->nr_zones,
|
||||
dm_zone_revalidate_cb, md);
|
||||
memalloc_noio_restore(noio_flag);
|
||||
if (ret < 0)
|
||||
goto err;
|
||||
if (ret != q->nr_zones) {
|
||||
ret = -EIO;
|
||||
goto err;
|
||||
}
|
||||
|
||||
md->nr_zones = q->nr_zones;
|
||||
|
||||
return 0;
|
||||
|
||||
err:
|
||||
DMERR("Revalidate zones failed %d", ret);
|
||||
dm_cleanup_zoned_dev(md);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * iterate_devices callback: return non-zero when the request queue of the
 * underlying device @dev is not zoned.  @ti, @start, @len and @data are
 * not used.
 */
static int device_not_zone_append_capable(struct dm_target *ti,
					  struct dm_dev *dev, sector_t start,
					  sector_t len, void *data)
{
	struct request_queue *dev_q = bdev_get_queue(dev->bdev);

	return !blk_queue_is_zoned(dev_q);
}
|
||||
|
||||
static bool dm_table_supports_zone_append(struct dm_table *t)
|
||||
{
|
||||
struct dm_target *ti;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < dm_table_get_num_targets(t); i++) {
|
||||
ti = dm_table_get_target(t, i);
|
||||
|
||||
if (ti->emulate_zone_append)
|
||||
return false;
|
||||
|
||||
if (!ti->type->iterate_devices ||
|
||||
ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
|
||||
{
|
||||
struct mapped_device *md = t->md;
|
||||
|
||||
/*
|
||||
* For a zoned target, the number of zones should be updated for the
|
||||
* correct value to be exposed in sysfs queue/nr_zones.
|
||||
*/
|
||||
WARN_ON_ONCE(queue_is_mq(q));
|
||||
q->nr_zones = blkdev_nr_zones(md->disk);
|
||||
|
||||
/* Check if zone append is natively supported */
|
||||
if (dm_table_supports_zone_append(t)) {
|
||||
clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
|
||||
dm_cleanup_zoned_dev(md);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark the mapped device as needing zone append emulation and
|
||||
* initialize the emulation resources once the capacity is set.
|
||||
*/
|
||||
set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
|
||||
if (!get_capacity(md->disk))
|
||||
return 0;
|
||||
|
||||
return dm_revalidate_zones(md, t);
|
||||
}
|
||||
|
||||
/*
 * Zone report callback: store the write pointer offset of @zone into the
 * unsigned int that @data points to.  @idx is not used.  Always returns 0.
 */
static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
				       void *data)
{
	*(unsigned int *)data = dm_get_zone_wp_offset(zone);

	return 0;
}
|
||||
|
||||
static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
|
||||
unsigned int *wp_ofst)
|
||||
{
|
||||
sector_t sector = zno * blk_queue_zone_sectors(md->queue);
|
||||
unsigned int noio_flag;
|
||||
struct dm_table *t;
|
||||
int srcu_idx, ret;
|
||||
|
||||
t = dm_get_live_table(md, &srcu_idx);
|
||||
if (!t)
|
||||
return -EIO;
|
||||
|
||||
/*
|
||||
* Ensure that all memory allocations in this context are done as if
|
||||
* GFP_NOIO was specified.
|
||||
*/
|
||||
noio_flag = memalloc_noio_save();
|
||||
ret = dm_blk_do_report_zones(md, t, sector, 1,
|
||||
dm_update_zone_wp_offset_cb, wp_ofst);
|
||||
memalloc_noio_restore(noio_flag);
|
||||
|
||||
dm_put_live_table(md, srcu_idx);
|
||||
|
||||
if (ret != 1)
|
||||
return -EIO;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* First phase of BIO mapping for targets with zone append emulation:
|
||||
* check all BIO that change a zone writer pointer and change zone
|
||||
* append operations into regular write operations.
|
||||
*/
|
||||
static bool dm_zone_map_bio_begin(struct mapped_device *md,
|
||||
struct bio *orig_bio, struct bio *clone)
|
||||
{
|
||||
sector_t zsectors = blk_queue_zone_sectors(md->queue);
|
||||
unsigned int zno = bio_zone_no(orig_bio);
|
||||
unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
|
||||
|
||||
/*
|
||||
* If the target zone is in an error state, recover by inspecting the
|
||||
* zone to get its current write pointer position. Note that since the
|
||||
* target zone is already locked, a BIO issuing context should never
|
||||
* see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
|
||||
*/
|
||||
if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
|
||||
if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
|
||||
return false;
|
||||
WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
|
||||
}
|
||||
|
||||
switch (bio_op(orig_bio)) {
|
||||
case REQ_OP_ZONE_RESET:
|
||||
case REQ_OP_ZONE_FINISH:
|
||||
return true;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
case REQ_OP_WRITE_SAME:
|
||||
case REQ_OP_WRITE:
|
||||
/* Writes must be aligned to the zone write pointer */
|
||||
if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
|
||||
return false;
|
||||
break;
|
||||
case REQ_OP_ZONE_APPEND:
|
||||
/*
|
||||
* Change zone append operations into a non-mergeable regular
|
||||
* writes directed at the current write pointer position of the
|
||||
* target zone.
|
||||
*/
|
||||
clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
|
||||
(orig_bio->bi_opf & (~REQ_OP_MASK));
|
||||
clone->bi_iter.bi_sector =
|
||||
orig_bio->bi_iter.bi_sector + zwp_offset;
|
||||
break;
|
||||
default:
|
||||
DMWARN_LIMIT("Invalid BIO operation");
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Cannot write to a full zone */
|
||||
if (zwp_offset >= zsectors)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Second phase of BIO mapping for targets with zone append emulation:
|
||||
* update the zone write pointer offset array to account for the additional
|
||||
* data written to a zone. Note that at this point, the remapped clone BIO
|
||||
* may already have completed, so we do not touch it.
|
||||
*/
|
||||
static blk_status_t dm_zone_map_bio_end(struct mapped_device *md,
|
||||
struct bio *orig_bio,
|
||||
unsigned int nr_sectors)
|
||||
{
|
||||
unsigned int zno = bio_zone_no(orig_bio);
|
||||
unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
|
||||
|
||||
/* The clone BIO may already have been completed and failed */
|
||||
if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
|
||||
return BLK_STS_IOERR;
|
||||
|
||||
/* Update the zone wp offset */
|
||||
switch (bio_op(orig_bio)) {
|
||||
case REQ_OP_ZONE_RESET:
|
||||
WRITE_ONCE(md->zwp_offset[zno], 0);
|
||||
return BLK_STS_OK;
|
||||
case REQ_OP_ZONE_FINISH:
|
||||
WRITE_ONCE(md->zwp_offset[zno],
|
||||
blk_queue_zone_sectors(md->queue));
|
||||
return BLK_STS_OK;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
case REQ_OP_WRITE_SAME:
|
||||
case REQ_OP_WRITE:
|
||||
WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
|
||||
return BLK_STS_OK;
|
||||
case REQ_OP_ZONE_APPEND:
|
||||
/*
|
||||
* Check that the target did not truncate the write operation
|
||||
* emulating a zone append.
|
||||
*/
|
||||
if (nr_sectors != bio_sectors(orig_bio)) {
|
||||
DMWARN_LIMIT("Truncated write for zone append");
|
||||
return BLK_STS_IOERR;
|
||||
}
|
||||
WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
|
||||
return BLK_STS_OK;
|
||||
default:
|
||||
DMWARN_LIMIT("Invalid BIO operation");
|
||||
return BLK_STS_IOERR;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void dm_zone_lock(struct request_queue *q,
|
||||
unsigned int zno, struct bio *clone)
|
||||
{
|
||||
if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
|
||||
return;
|
||||
|
||||
wait_on_bit_lock_io(q->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
|
||||
bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
|
||||
}
|
||||
|
||||
static inline void dm_zone_unlock(struct request_queue *q,
|
||||
unsigned int zno, struct bio *clone)
|
||||
{
|
||||
if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(!test_bit(zno, q->seq_zones_wlock));
|
||||
clear_bit_unlock(zno, q->seq_zones_wlock);
|
||||
smp_mb__after_atomic();
|
||||
wake_up_bit(q->seq_zones_wlock, zno);
|
||||
|
||||
bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
|
||||
}
|
||||
|
||||
static bool dm_need_zone_wp_tracking(struct bio *orig_bio)
|
||||
{
|
||||
/*
|
||||
* Special processing is not needed for operations that do not need the
|
||||
* zone write lock, that is, all operations that target conventional
|
||||
* zones and all operations that do not modify directly a sequential
|
||||
* zone write pointer.
|
||||
*/
|
||||
if (op_is_flush(orig_bio->bi_opf) && !bio_sectors(orig_bio))
|
||||
return false;
|
||||
switch (bio_op(orig_bio)) {
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
case REQ_OP_WRITE_SAME:
|
||||
case REQ_OP_WRITE:
|
||||
case REQ_OP_ZONE_RESET:
|
||||
case REQ_OP_ZONE_FINISH:
|
||||
case REQ_OP_ZONE_APPEND:
|
||||
return bio_zone_is_seq(orig_bio);
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Special IO mapping for targets needing zone append emulation.
|
||||
*/
|
||||
int dm_zone_map_bio(struct dm_target_io *tio)
|
||||
{
|
||||
struct dm_io *io = tio->io;
|
||||
struct dm_target *ti = tio->ti;
|
||||
struct mapped_device *md = io->md;
|
||||
struct request_queue *q = md->queue;
|
||||
struct bio *orig_bio = io->orig_bio;
|
||||
struct bio *clone = &tio->clone;
|
||||
unsigned int zno;
|
||||
blk_status_t sts;
|
||||
int r;
|
||||
|
||||
/*
|
||||
* IOs that do not change a zone write pointer do not need
|
||||
* any additional special processing.
|
||||
*/
|
||||
if (!dm_need_zone_wp_tracking(orig_bio))
|
||||
return ti->type->map(ti, clone);
|
||||
|
||||
/* Lock the target zone */
|
||||
zno = bio_zone_no(orig_bio);
|
||||
dm_zone_lock(q, zno, clone);
|
||||
|
||||
/*
|
||||
* Check that the bio and the target zone write pointer offset are
|
||||
* both valid, and if the bio is a zone append, remap it to a write.
|
||||
*/
|
||||
if (!dm_zone_map_bio_begin(md, orig_bio, clone)) {
|
||||
dm_zone_unlock(q, zno, clone);
|
||||
return DM_MAPIO_KILL;
|
||||
}
|
||||
|
||||
/*
|
||||
* The target map function may issue and complete the IO quickly.
|
||||
* Take an extra reference on the IO to make sure it does disappear
|
||||
* until we run dm_zone_map_bio_end().
|
||||
*/
|
||||
dm_io_inc_pending(io);
|
||||
|
||||
/* Let the target do its work */
|
||||
r = ti->type->map(ti, clone);
|
||||
switch (r) {
|
||||
case DM_MAPIO_SUBMITTED:
|
||||
/*
|
||||
* The target submitted the clone BIO. The target zone will
|
||||
* be unlocked on completion of the clone.
|
||||
*/
|
||||
sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr);
|
||||
break;
|
||||
case DM_MAPIO_REMAPPED:
|
||||
/*
|
||||
* The target only remapped the clone BIO. In case of error,
|
||||
* unlock the target zone here as the clone will not be
|
||||
* submitted.
|
||||
*/
|
||||
sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr);
|
||||
if (sts != BLK_STS_OK)
|
||||
dm_zone_unlock(q, zno, clone);
|
||||
break;
|
||||
case DM_MAPIO_REQUEUE:
|
||||
case DM_MAPIO_KILL:
|
||||
default:
|
||||
dm_zone_unlock(q, zno, clone);
|
||||
sts = BLK_STS_IOERR;
|
||||
break;
|
||||
}
|
||||
|
||||
/* Drop the extra reference on the IO */
|
||||
dm_io_dec_pending(io, sts);
|
||||
|
||||
if (sts != BLK_STS_OK)
|
||||
return DM_MAPIO_KILL;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
|
||||
* IO completion callback called from clone_endio().
|
||||
*/
|
||||
void dm_zone_endio(struct dm_io *io, struct bio *clone)
|
||||
{
|
||||
struct mapped_device *md = io->md;
|
||||
struct request_queue *q = md->queue;
|
||||
struct bio *orig_bio = io->orig_bio;
|
||||
unsigned int zwp_offset;
|
||||
unsigned int zno;
|
||||
|
||||
/*
|
||||
* For targets that do not emulate zone append, we only need to
|
||||
* handle native zone-append bios.
|
||||
*/
|
||||
if (!dm_emulate_zone_append(md)) {
|
||||
/*
|
||||
* Get the offset within the zone of the written sector
|
||||
* and add that to the original bio sector position.
|
||||
*/
|
||||
if (clone->bi_status == BLK_STS_OK &&
|
||||
bio_op(clone) == REQ_OP_ZONE_APPEND) {
|
||||
sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1;
|
||||
|
||||
orig_bio->bi_iter.bi_sector +=
|
||||
clone->bi_iter.bi_sector & mask;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* For targets that do emulate zone append, if the clone BIO does not
|
||||
* own the target zone write lock, we have nothing to do.
|
||||
*/
|
||||
if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
|
||||
return;
|
||||
|
||||
zno = bio_zone_no(orig_bio);
|
||||
|
||||
if (clone->bi_status != BLK_STS_OK) {
|
||||
/*
|
||||
* BIOs that modify a zone write pointer may leave the zone
|
||||
* in an unknown state in case of failure (e.g. the write
|
||||
* pointer was only partially advanced). In this case, set
|
||||
* the target zone write pointer as invalid unless it is
|
||||
* already being updated.
|
||||
*/
|
||||
WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
|
||||
} else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
|
||||
/*
|
||||
* Get the written sector for zone append operation that were
|
||||
* emulated using regular write operations.
|
||||
*/
|
||||
zwp_offset = READ_ONCE(md->zwp_offset[zno]);
|
||||
if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
|
||||
WRITE_ONCE(md->zwp_offset[zno],
|
||||
DM_ZONE_INVALID_WP_OFST);
|
||||
else
|
||||
orig_bio->bi_iter.bi_sector +=
|
||||
zwp_offset - bio_sectors(orig_bio);
|
||||
}
|
||||
|
||||
dm_zone_unlock(q, zno, clone);
|
||||
}
|
@ -1390,6 +1390,13 @@ static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
|
||||
return -ENXIO;
|
||||
}
|
||||
|
||||
/*
|
||||
* Devices that have zones with a capacity smaller than the zone size
|
||||
* (e.g. NVMe zoned namespaces) are not supported.
|
||||
*/
|
||||
if (blkz->capacity != blkz->len)
|
||||
return -ENXIO;
|
||||
|
||||
switch (blkz->type) {
|
||||
case BLK_ZONE_TYPE_CONVENTIONAL:
|
||||
set_bit(DMZ_RND, &zone->flags);
|
||||
|
@ -134,7 +134,7 @@ static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
|
||||
dst_zone_block = dmz_start_block(zmd, dst_zone);
|
||||
|
||||
if (dmz_is_seq(dst_zone))
|
||||
set_bit(DM_KCOPYD_WRITE_SEQ, &flags);
|
||||
flags |= BIT(DM_KCOPYD_WRITE_SEQ);
|
||||
|
||||
while (block < end_block) {
|
||||
if (src_zone->dev->flags & DMZ_BDEV_DYING)
|
||||
|
208
drivers/md/dm.c
208
drivers/md/dm.c
@ -74,38 +74,6 @@ struct clone_info {
|
||||
unsigned sector_count;
|
||||
};
|
||||
|
||||
/*
|
||||
* One of these is allocated per clone bio.
|
||||
*/
|
||||
#define DM_TIO_MAGIC 7282014
|
||||
struct dm_target_io {
|
||||
unsigned magic;
|
||||
struct dm_io *io;
|
||||
struct dm_target *ti;
|
||||
unsigned target_bio_nr;
|
||||
unsigned *len_ptr;
|
||||
bool inside_dm_io;
|
||||
struct bio clone;
|
||||
};
|
||||
|
||||
/*
|
||||
* One of these is allocated per original bio.
|
||||
* It contains the first clone used for that original.
|
||||
*/
|
||||
#define DM_IO_MAGIC 5191977
|
||||
struct dm_io {
|
||||
unsigned magic;
|
||||
struct mapped_device *md;
|
||||
blk_status_t status;
|
||||
atomic_t io_count;
|
||||
struct bio *orig_bio;
|
||||
unsigned long start_time;
|
||||
spinlock_t endio_lock;
|
||||
struct dm_stats_aux stats_aux;
|
||||
/* last member of dm_target_io is 'struct bio' */
|
||||
struct dm_target_io tio;
|
||||
};
|
||||
|
||||
#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
|
||||
#define DM_IO_BIO_OFFSET \
|
||||
(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
|
||||
@ -137,19 +105,6 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
|
||||
|
||||
#define MINOR_ALLOCED ((void *)-1)
|
||||
|
||||
/*
|
||||
* Bits for the md->flags field.
|
||||
*/
|
||||
#define DMF_BLOCK_IO_FOR_SUSPEND 0
|
||||
#define DMF_SUSPENDED 1
|
||||
#define DMF_FROZEN 2
|
||||
#define DMF_FREEING 3
|
||||
#define DMF_DELETING 4
|
||||
#define DMF_NOFLUSH_SUSPENDING 5
|
||||
#define DMF_DEFERRED_REMOVE 6
|
||||
#define DMF_SUSPENDED_INTERNALLY 7
|
||||
#define DMF_POST_SUSPENDING 8
|
||||
|
||||
#define DM_NUMA_NODE NUMA_NO_NODE
|
||||
static int dm_numa_node = DM_NUMA_NODE;
|
||||
|
||||
@ -444,84 +399,6 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
|
||||
return dm_get_geometry(md, geo);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
|
||||
{
|
||||
struct dm_report_zones_args *args = data;
|
||||
sector_t sector_diff = args->tgt->begin - args->start;
|
||||
|
||||
/*
|
||||
* Ignore zones beyond the target range.
|
||||
*/
|
||||
if (zone->start >= args->start + args->tgt->len)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Remap the start sector and write pointer position of the zone
|
||||
* to match its position in the target range.
|
||||
*/
|
||||
zone->start += sector_diff;
|
||||
if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
|
||||
if (zone->cond == BLK_ZONE_COND_FULL)
|
||||
zone->wp = zone->start + zone->len;
|
||||
else if (zone->cond == BLK_ZONE_COND_EMPTY)
|
||||
zone->wp = zone->start;
|
||||
else
|
||||
zone->wp += sector_diff;
|
||||
}
|
||||
|
||||
args->next_sector = zone->start + zone->len;
|
||||
return args->orig_cb(zone, args->zone_idx++, args->orig_data);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dm_report_zones_cb);
|
||||
|
||||
/*
 * Report zones of a mapped device by walking the targets of its live table.
 *
 * Starting at @sector, the target covering args.next_sector is looked up
 * and asked to report its zones, until @nr_zones zones were reported or
 * the end of the disk is reached. @cb and @data are the caller callback and
 * its private data, invoked through the dm_report_zones_args wrapper.
 *
 * Returns the number of zones reported, -EAGAIN if the device is suspended,
 * -EIO if there is no live table, if no target covers the next sector or if
 * a target lacks a report_zones method, or the negative error returned by
 * a target.
 */
static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct mapped_device *md = disk->private_data;
	struct dm_table *map;
	int srcu_idx, ret;
	struct dm_report_zones_args args = {
		.next_sector = sector,
		.orig_data = data,
		.orig_cb = cb,
	};

	if (dm_suspended_md(md))
		return -EAGAIN;

	map = dm_get_live_table(md, &srcu_idx);
	if (!map) {
		ret = -EIO;
		goto out;
	}

	do {
		struct dm_target *tgt;

		/*
		 * dm_table_find_target() yields NULL for a sector beyond the
		 * end of the table: check the result before dereferencing it.
		 */
		tgt = dm_table_find_target(map, args.next_sector);
		if (!tgt || WARN_ON_ONCE(!tgt->type->report_zones)) {
			ret = -EIO;
			goto out;
		}

		args.tgt = tgt;
		ret = tgt->type->report_zones(tgt, &args,
					      nr_zones - args.zone_idx);
		if (ret < 0)
			goto out;
	} while (args.zone_idx < nr_zones &&
		 args.next_sector < get_capacity(disk));

	ret = args.zone_idx;
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}
|
||||
#else
|
||||
#define dm_blk_report_zones NULL
|
||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
|
||||
static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
|
||||
struct block_device **bdev)
|
||||
{
|
||||
@ -903,7 +780,7 @@ static int __noflush_suspending(struct mapped_device *md)
|
||||
* Decrements the number of outstanding ios that a bio has been
|
||||
* cloned into, completing the original io if necessary.
|
||||
*/
|
||||
static void dec_pending(struct dm_io *io, blk_status_t error)
|
||||
void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
|
||||
{
|
||||
unsigned long flags;
|
||||
blk_status_t io_error;
|
||||
@ -919,22 +796,27 @@ static void dec_pending(struct dm_io *io, blk_status_t error)
|
||||
}
|
||||
|
||||
if (atomic_dec_and_test(&io->io_count)) {
|
||||
bio = io->orig_bio;
|
||||
if (io->status == BLK_STS_DM_REQUEUE) {
|
||||
/*
|
||||
* Target requested pushing back the I/O.
|
||||
*/
|
||||
spin_lock_irqsave(&md->deferred_lock, flags);
|
||||
if (__noflush_suspending(md))
|
||||
if (__noflush_suspending(md) &&
|
||||
!WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
|
||||
/* NOTE early return due to BLK_STS_DM_REQUEUE below */
|
||||
bio_list_add_head(&md->deferred, io->orig_bio);
|
||||
else
|
||||
/* noflush suspend was interrupted. */
|
||||
bio_list_add_head(&md->deferred, bio);
|
||||
} else {
|
||||
/*
|
||||
* noflush suspend was interrupted or this is
|
||||
* a write to a zoned target.
|
||||
*/
|
||||
io->status = BLK_STS_IOERR;
|
||||
}
|
||||
spin_unlock_irqrestore(&md->deferred_lock, flags);
|
||||
}
|
||||
|
||||
io_error = io->status;
|
||||
bio = io->orig_bio;
|
||||
end_io_acct(io);
|
||||
free_io(md, io);
|
||||
|
||||
@ -994,7 +876,6 @@ static void clone_endio(struct bio *bio)
|
||||
struct dm_io *io = tio->io;
|
||||
struct mapped_device *md = tio->io->md;
|
||||
dm_endio_fn endio = tio->ti->type->end_io;
|
||||
struct bio *orig_bio = io->orig_bio;
|
||||
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
|
||||
|
||||
if (unlikely(error == BLK_STS_TARGET)) {
|
||||
@ -1009,23 +890,22 @@ static void clone_endio(struct bio *bio)
|
||||
disable_write_zeroes(md);
|
||||
}
|
||||
|
||||
/*
|
||||
* For zone-append bios get offset in zone of the written
|
||||
* sector and add that to the original bio sector pos.
|
||||
*/
|
||||
if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
|
||||
sector_t written_sector = bio->bi_iter.bi_sector;
|
||||
struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue;
|
||||
u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
|
||||
|
||||
orig_bio->bi_iter.bi_sector += written_sector & mask;
|
||||
}
|
||||
if (blk_queue_is_zoned(q))
|
||||
dm_zone_endio(io, bio);
|
||||
|
||||
if (endio) {
|
||||
int r = endio(tio->ti, bio, &error);
|
||||
switch (r) {
|
||||
case DM_ENDIO_REQUEUE:
|
||||
error = BLK_STS_DM_REQUEUE;
|
||||
/*
|
||||
* Requeuing writes to a sequential zone of a zoned
|
||||
* target will break the sequential write pattern:
|
||||
* fail such IO.
|
||||
*/
|
||||
if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
|
||||
error = BLK_STS_IOERR;
|
||||
else
|
||||
error = BLK_STS_DM_REQUEUE;
|
||||
fallthrough;
|
||||
case DM_ENDIO_DONE:
|
||||
break;
|
||||
@ -1044,7 +924,7 @@ static void clone_endio(struct bio *bio)
|
||||
}
|
||||
|
||||
free_tio(tio);
|
||||
dec_pending(io, error);
|
||||
dm_io_dec_pending(io, error);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1237,8 +1117,8 @@ static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
|
||||
|
||||
/*
|
||||
* A target may call dm_accept_partial_bio only from the map routine. It is
|
||||
* allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_RESET,
|
||||
* REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH.
|
||||
* allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
|
||||
* operations and REQ_OP_ZONE_APPEND (zone append writes).
|
||||
*
|
||||
* dm_accept_partial_bio informs the dm that the target only wants to process
|
||||
* additional n_sectors sectors of the bio and the rest of the data should be
|
||||
@ -1268,9 +1148,13 @@ void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
|
||||
{
|
||||
struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
|
||||
unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
|
||||
|
||||
BUG_ON(bio->bi_opf & REQ_PREFLUSH);
|
||||
BUG_ON(op_is_zone_mgmt(bio_op(bio)));
|
||||
BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
|
||||
BUG_ON(bi_size > *tio->len_ptr);
|
||||
BUG_ON(n_sectors > bi_size);
|
||||
|
||||
*tio->len_ptr -= bi_size - n_sectors;
|
||||
bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
|
||||
}
|
||||
@ -1308,7 +1192,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
|
||||
* anything, the target has assumed ownership of
|
||||
* this io.
|
||||
*/
|
||||
atomic_inc(&io->io_count);
|
||||
dm_io_inc_pending(io);
|
||||
sector = clone->bi_iter.bi_sector;
|
||||
|
||||
if (unlikely(swap_bios_limit(ti, clone))) {
|
||||
@ -1319,7 +1203,16 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
|
||||
down(&md->swap_bios_semaphore);
|
||||
}
|
||||
|
||||
r = ti->type->map(ti, clone);
|
||||
/*
|
||||
* Check if the IO needs a special mapping due to zone append emulation
|
||||
* on zoned target. In this case, dm_zone_map_bio() calls the target
|
||||
* map operation.
|
||||
*/
|
||||
if (dm_emulate_zone_append(io->md))
|
||||
r = dm_zone_map_bio(tio);
|
||||
else
|
||||
r = ti->type->map(ti, clone);
|
||||
|
||||
switch (r) {
|
||||
case DM_MAPIO_SUBMITTED:
|
||||
break;
|
||||
@ -1334,7 +1227,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
|
||||
up(&md->swap_bios_semaphore);
|
||||
}
|
||||
free_tio(tio);
|
||||
dec_pending(io, BLK_STS_IOERR);
|
||||
dm_io_dec_pending(io, BLK_STS_IOERR);
|
||||
break;
|
||||
case DM_MAPIO_REQUEUE:
|
||||
if (unlikely(swap_bios_limit(ti, clone))) {
|
||||
@ -1342,7 +1235,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio)
|
||||
up(&md->swap_bios_semaphore);
|
||||
}
|
||||
free_tio(tio);
|
||||
dec_pending(io, BLK_STS_DM_REQUEUE);
|
||||
dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
|
||||
break;
|
||||
default:
|
||||
DMWARN("unimplemented target map return value: %d", r);
|
||||
@ -1631,7 +1524,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
|
||||
|
||||
if (bio->bi_opf & REQ_PREFLUSH) {
|
||||
error = __send_empty_flush(&ci);
|
||||
/* dec_pending submits any data associated with flush */
|
||||
/* dm_io_dec_pending submits any data associated with flush */
|
||||
} else if (op_is_zone_mgmt(bio_op(bio))) {
|
||||
ci.bio = bio;
|
||||
ci.sector_count = 0;
|
||||
@ -1672,7 +1565,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md,
|
||||
}
|
||||
|
||||
/* drop the extra reference count */
|
||||
dec_pending(ci.io, errno_to_blk_status(error));
|
||||
dm_io_dec_pending(ci.io, errno_to_blk_status(error));
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1817,6 +1710,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
|
||||
mutex_destroy(&md->swap_bios_lock);
|
||||
|
||||
dm_mq_cleanup_mapped_device(md);
|
||||
dm_cleanup_zoned_dev(md);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2060,11 +1954,16 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = dm_table_set_restrictions(t, q, limits);
|
||||
if (ret) {
|
||||
old_map = ERR_PTR(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
|
||||
rcu_assign_pointer(md->map, (void *)t);
|
||||
md->immutable_target_type = dm_table_get_immutable_target_type(t);
|
||||
|
||||
dm_table_set_restrictions(t, q, limits);
|
||||
if (old_map)
|
||||
dm_sync_table(md);
|
||||
|
||||
@ -2183,7 +2082,10 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
|
||||
DMERR("Cannot calculate initial queue limits");
|
||||
return r;
|
||||
}
|
||||
dm_table_set_restrictions(t, md->queue, &limits);
|
||||
r = dm_table_set_restrictions(t, md->queue, &limits);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
blk_register_queue(md->disk);
|
||||
|
||||
return 0;
|
||||
|
@ -45,6 +45,8 @@ struct dm_dev_internal {
|
||||
|
||||
struct dm_table;
|
||||
struct dm_md_mempools;
|
||||
struct dm_target_io;
|
||||
struct dm_io;
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* Internal table functions.
|
||||
@ -56,8 +58,8 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
|
||||
bool dm_table_has_no_data_devices(struct dm_table *table);
|
||||
int dm_calculate_queue_limits(struct dm_table *table,
|
||||
struct queue_limits *limits);
|
||||
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
|
||||
struct queue_limits *limits);
|
||||
int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
|
||||
struct queue_limits *limits);
|
||||
struct list_head *dm_table_get_devices(struct dm_table *t);
|
||||
void dm_table_presuspend_targets(struct dm_table *t);
|
||||
void dm_table_presuspend_undo_targets(struct dm_table *t);
|
||||
@ -100,6 +102,30 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
|
||||
*/
|
||||
#define dm_target_hybrid(t) (dm_target_bio_based(t) && dm_target_request_based(t))
|
||||
|
||||
/*
|
||||
* Zoned targets related functions.
|
||||
*/
|
||||
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q);
|
||||
void dm_zone_endio(struct dm_io *io, struct bio *clone);
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
void dm_cleanup_zoned_dev(struct mapped_device *md);
|
||||
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
|
||||
unsigned int nr_zones, report_zones_cb cb, void *data);
|
||||
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio);
|
||||
int dm_zone_map_bio(struct dm_target_io *io);
|
||||
#else
|
||||
static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {}
|
||||
#define dm_blk_report_zones NULL
|
||||
static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
static inline int dm_zone_map_bio(struct dm_target_io *tio)
|
||||
{
|
||||
return DM_MAPIO_KILL;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*-----------------------------------------------------------------
|
||||
* A registry of target types.
|
||||
*---------------------------------------------------------------*/
|
||||
|
@ -108,12 +108,10 @@ static void *element_at(struct dm_array_info *info, struct array_block *ab,
|
||||
* in an array block.
|
||||
*/
|
||||
static void on_entries(struct dm_array_info *info, struct array_block *ab,
|
||||
void (*fn)(void *, const void *))
|
||||
void (*fn)(void *, const void *, unsigned))
|
||||
{
|
||||
unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);
|
||||
|
||||
for (i = 0; i < nr_entries; i++)
|
||||
fn(info->value_type.context, element_at(info, ab, i));
|
||||
unsigned nr_entries = le32_to_cpu(ab->nr_entries);
|
||||
fn(info->value_type.context, element_at(info, ab, 0), nr_entries);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -175,19 +173,18 @@ static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
|
||||
static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
|
||||
const void *value, unsigned new_nr)
|
||||
{
|
||||
unsigned i;
|
||||
uint32_t nr_entries;
|
||||
uint32_t nr_entries, delta, i;
|
||||
struct dm_btree_value_type *vt = &info->value_type;
|
||||
|
||||
BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
|
||||
BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));
|
||||
|
||||
nr_entries = le32_to_cpu(ab->nr_entries);
|
||||
for (i = nr_entries; i < new_nr; i++) {
|
||||
if (vt->inc)
|
||||
vt->inc(vt->context, value);
|
||||
delta = new_nr - nr_entries;
|
||||
if (vt->inc)
|
||||
vt->inc(vt->context, value, delta);
|
||||
for (i = nr_entries; i < new_nr; i++)
|
||||
memcpy(element_at(info, ab, i), value, vt->size);
|
||||
}
|
||||
ab->nr_entries = cpu_to_le32(new_nr);
|
||||
}
|
||||
|
||||
@ -199,17 +196,16 @@ static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
|
||||
static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
|
||||
unsigned new_nr)
|
||||
{
|
||||
unsigned i;
|
||||
uint32_t nr_entries;
|
||||
uint32_t nr_entries, delta;
|
||||
struct dm_btree_value_type *vt = &info->value_type;
|
||||
|
||||
BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
|
||||
BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));
|
||||
|
||||
nr_entries = le32_to_cpu(ab->nr_entries);
|
||||
for (i = nr_entries; i > new_nr; i--)
|
||||
if (vt->dec)
|
||||
vt->dec(vt->context, element_at(info, ab, i - 1));
|
||||
delta = nr_entries - new_nr;
|
||||
if (vt->dec)
|
||||
vt->dec(vt->context, element_at(info, ab, new_nr - 1), delta);
|
||||
ab->nr_entries = cpu_to_le32(new_nr);
|
||||
}
|
||||
|
||||
@ -573,16 +569,17 @@ static int grow(struct resize *resize)
|
||||
* These are the value_type functions for the btree elements, which point
|
||||
* to array blocks.
|
||||
*/
|
||||
static void block_inc(void *context, const void *value)
|
||||
static void block_inc(void *context, const void *value, unsigned count)
|
||||
{
|
||||
__le64 block_le;
|
||||
const __le64 *block_le = value;
|
||||
struct dm_array_info *info = context;
|
||||
unsigned i;
|
||||
|
||||
memcpy(&block_le, value, sizeof(block_le));
|
||||
dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
|
||||
for (i = 0; i < count; i++, block_le++)
|
||||
dm_tm_inc(info->btree_info.tm, le64_to_cpu(*block_le));
|
||||
}
|
||||
|
||||
static void block_dec(void *context, const void *value)
|
||||
static void __block_dec(void *context, const void *value)
|
||||
{
|
||||
int r;
|
||||
uint64_t b;
|
||||
@ -621,6 +618,13 @@ static void block_dec(void *context, const void *value)
|
||||
dm_tm_dec(info->btree_info.tm, b);
|
||||
}
|
||||
|
||||
static void block_dec(void *context, const void *value, unsigned count)
|
||||
{
|
||||
unsigned i;
|
||||
for (i = 0; i < count; i++, value += sizeof(__le64))
|
||||
__block_dec(context, value);
|
||||
}
|
||||
|
||||
static int block_equal(void *context, const void *value1, const void *value2)
|
||||
{
|
||||
return !memcmp(value1, value2, sizeof(__le64));
|
||||
@ -711,7 +715,7 @@ static int populate_ablock_with_values(struct dm_array_info *info, struct array_
|
||||
return r;
|
||||
|
||||
if (vt->inc)
|
||||
vt->inc(vt->context, element_at(info, ab, i));
|
||||
vt->inc(vt->context, element_at(info, ab, i), 1);
|
||||
}
|
||||
|
||||
ab->nr_entries = cpu_to_le32(new_nr);
|
||||
@ -822,9 +826,9 @@ static int array_set_value(struct dm_array_info *info, dm_block_t root,
|
||||
old_value = element_at(info, ab, entry);
|
||||
if (vt->dec &&
|
||||
(!vt->equal || !vt->equal(vt->context, old_value, value))) {
|
||||
vt->dec(vt->context, old_value);
|
||||
vt->dec(vt->context, old_value, 1);
|
||||
if (vt->inc)
|
||||
vt->inc(vt->context, value);
|
||||
vt->inc(vt->context, value, 1);
|
||||
}
|
||||
|
||||
memcpy(old_value, value, info->value_type.size);
|
||||
|
@ -144,4 +144,17 @@ extern struct dm_block_validator btree_node_validator;
|
||||
extern void init_le64_type(struct dm_transaction_manager *tm,
|
||||
struct dm_btree_value_type *vt);
|
||||
|
||||
/*
|
||||
* This returns a shadowed btree leaf that you may modify. In practice
|
||||
* this means overwrites only, since an insert could cause a node to
|
||||
* be split. Useful if you need access to the old value to calculate the
|
||||
* new one.
|
||||
*
|
||||
* This only works with single level btrees. The given key must be present in
|
||||
* the tree, otherwise -EINVAL will be returned.
|
||||
*/
|
||||
int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
|
||||
uint64_t key, int *index,
|
||||
dm_block_t *new_root, struct dm_block **leaf);
|
||||
|
||||
#endif /* DM_BTREE_INTERNAL_H */
|
||||
|
@ -544,12 +544,13 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
|
||||
|
||||
if (info->value_type.dec)
|
||||
info->value_type.dec(info->value_type.context,
|
||||
value_ptr(n, index));
|
||||
value_ptr(n, index), 1);
|
||||
|
||||
delete_at(n, index);
|
||||
}
|
||||
|
||||
*new_root = shadow_root(&spine);
|
||||
if (!r)
|
||||
*new_root = shadow_root(&spine);
|
||||
exit_shadow_spine(&spine);
|
||||
|
||||
return r;
|
||||
@ -653,7 +654,7 @@ static int remove_one(struct dm_btree_info *info, dm_block_t root,
|
||||
if (k >= keys[last_level] && k < end_key) {
|
||||
if (info->value_type.dec)
|
||||
info->value_type.dec(info->value_type.context,
|
||||
value_ptr(n, index));
|
||||
value_ptr(n, index), 1);
|
||||
|
||||
delete_at(n, index);
|
||||
keys[last_level] = k + 1ull;
|
||||
|
@ -236,22 +236,14 @@ dm_block_t shadow_root(struct shadow_spine *s)
|
||||
return s->root;
|
||||
}
|
||||
|
||||
static void le64_inc(void *context, const void *value_le)
|
||||
static void le64_inc(void *context, const void *value_le, unsigned count)
|
||||
{
|
||||
struct dm_transaction_manager *tm = context;
|
||||
__le64 v_le;
|
||||
|
||||
memcpy(&v_le, value_le, sizeof(v_le));
|
||||
dm_tm_inc(tm, le64_to_cpu(v_le));
|
||||
dm_tm_with_runs(context, value_le, count, dm_tm_inc_range);
|
||||
}
|
||||
|
||||
static void le64_dec(void *context, const void *value_le)
|
||||
static void le64_dec(void *context, const void *value_le, unsigned count)
|
||||
{
|
||||
struct dm_transaction_manager *tm = context;
|
||||
__le64 v_le;
|
||||
|
||||
memcpy(&v_le, value_le, sizeof(v_le));
|
||||
dm_tm_dec(tm, le64_to_cpu(v_le));
|
||||
dm_tm_with_runs(context, value_le, count, dm_tm_dec_range);
|
||||
}
|
||||
|
||||
static int le64_equal(void *context, const void *value1_le, const void *value2_le)
|
||||
|
@ -71,15 +71,13 @@ static int upper_bound(struct btree_node *n, uint64_t key)
|
||||
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
|
||||
struct dm_btree_value_type *vt)
|
||||
{
|
||||
unsigned i;
|
||||
uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
|
||||
|
||||
if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
|
||||
for (i = 0; i < nr_entries; i++)
|
||||
dm_tm_inc(tm, value64(n, i));
|
||||
dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);
|
||||
|
||||
else if (vt->inc)
|
||||
for (i = 0; i < nr_entries; i++)
|
||||
vt->inc(vt->context, value_ptr(n, i));
|
||||
vt->inc(vt->context, value_ptr(n, 0), nr_entries);
|
||||
}
|
||||
|
||||
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
|
||||
@ -318,13 +316,9 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
|
||||
goto out;
|
||||
|
||||
} else {
|
||||
if (info->value_type.dec) {
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < f->nr_children; i++)
|
||||
info->value_type.dec(info->value_type.context,
|
||||
value_ptr(f->n, i));
|
||||
}
|
||||
if (info->value_type.dec)
|
||||
info->value_type.dec(info->value_type.context,
|
||||
value_ptr(f->n, 0), f->nr_children);
|
||||
pop_frame(s);
|
||||
}
|
||||
}
|
||||
@ -500,6 +494,122 @@ out:
|
||||
|
||||
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* Copies entries from one region of a btree node to another. The regions
|
||||
* must not overlap.
|
||||
*/
|
||||
static void copy_entries(struct btree_node *dest, unsigned dest_offset,
|
||||
struct btree_node *src, unsigned src_offset,
|
||||
unsigned count)
|
||||
{
|
||||
size_t value_size = le32_to_cpu(dest->header.value_size);
|
||||
memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
|
||||
memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
|
||||
}
|
||||
|
||||
/*
|
||||
* Moves entries from one region fo a btree node to another. The regions
|
||||
* may overlap.
|
||||
*/
|
||||
static void move_entries(struct btree_node *dest, unsigned dest_offset,
|
||||
struct btree_node *src, unsigned src_offset,
|
||||
unsigned count)
|
||||
{
|
||||
size_t value_size = le32_to_cpu(dest->header.value_size);
|
||||
memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
|
||||
memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
|
||||
}
|
||||
|
||||
/*
|
||||
* Erases the first 'count' entries of a btree node, shifting following
|
||||
* entries down into their place.
|
||||
*/
|
||||
static void shift_down(struct btree_node *n, unsigned count)
|
||||
{
|
||||
move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
|
||||
}
|
||||
|
||||
/*
|
||||
* Moves entries in a btree node up 'count' places, making space for
|
||||
* new entries at the start of the node.
|
||||
*/
|
||||
static void shift_up(struct btree_node *n, unsigned count)
|
||||
{
|
||||
move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
|
||||
}
|
||||
|
||||
/*
|
||||
* Redistributes entries between two btree nodes to make them
|
||||
* have similar numbers of entries.
|
||||
*/
|
||||
static void redistribute2(struct btree_node *left, struct btree_node *right)
|
||||
{
|
||||
unsigned nr_left = le32_to_cpu(left->header.nr_entries);
|
||||
unsigned nr_right = le32_to_cpu(right->header.nr_entries);
|
||||
unsigned total = nr_left + nr_right;
|
||||
unsigned target_left = total / 2;
|
||||
unsigned target_right = total - target_left;
|
||||
|
||||
if (nr_left < target_left) {
|
||||
unsigned delta = target_left - nr_left;
|
||||
copy_entries(left, nr_left, right, 0, delta);
|
||||
shift_down(right, delta);
|
||||
} else if (nr_left > target_left) {
|
||||
unsigned delta = nr_left - target_left;
|
||||
if (nr_right)
|
||||
shift_up(right, delta);
|
||||
copy_entries(right, 0, left, target_left, delta);
|
||||
}
|
||||
|
||||
left->header.nr_entries = cpu_to_le32(target_left);
|
||||
right->header.nr_entries = cpu_to_le32(target_right);
|
||||
}
|
||||
|
||||
/*
|
||||
* Redistribute entries between three nodes. Assumes the central
|
||||
* node is empty.
|
||||
*/
|
||||
static void redistribute3(struct btree_node *left, struct btree_node *center,
|
||||
struct btree_node *right)
|
||||
{
|
||||
unsigned nr_left = le32_to_cpu(left->header.nr_entries);
|
||||
unsigned nr_center = le32_to_cpu(center->header.nr_entries);
|
||||
unsigned nr_right = le32_to_cpu(right->header.nr_entries);
|
||||
unsigned total, target_left, target_center, target_right;
|
||||
|
||||
BUG_ON(nr_center);
|
||||
|
||||
total = nr_left + nr_right;
|
||||
target_left = total / 3;
|
||||
target_center = (total - target_left) / 2;
|
||||
target_right = (total - target_left - target_center);
|
||||
|
||||
if (nr_left < target_left) {
|
||||
unsigned left_short = target_left - nr_left;
|
||||
copy_entries(left, nr_left, right, 0, left_short);
|
||||
copy_entries(center, 0, right, left_short, target_center);
|
||||
shift_down(right, nr_right - target_right);
|
||||
|
||||
} else if (nr_left < (target_left + target_center)) {
|
||||
unsigned left_to_center = nr_left - target_left;
|
||||
copy_entries(center, 0, left, target_left, left_to_center);
|
||||
copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
|
||||
shift_down(right, nr_right - target_right);
|
||||
|
||||
} else {
|
||||
unsigned right_short = target_right - nr_right;
|
||||
shift_up(right, right_short);
|
||||
copy_entries(right, 0, left, nr_left - right_short, right_short);
|
||||
copy_entries(center, 0, left, target_left, nr_left - target_left);
|
||||
}
|
||||
|
||||
left->header.nr_entries = cpu_to_le32(target_left);
|
||||
center->header.nr_entries = cpu_to_le32(target_center);
|
||||
right->header.nr_entries = cpu_to_le32(target_right);
|
||||
}
|
||||
|
||||
/*
|
||||
* Splits a node by creating a sibling node and shifting half the nodes
|
||||
* contents across. Assumes there is a parent node, and it has room for
|
||||
@ -530,12 +640,10 @@ EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
|
||||
*
|
||||
* Where A* is a shadow of A.
|
||||
*/
|
||||
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
|
||||
uint64_t key)
|
||||
static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
|
||||
struct dm_btree_value_type *vt, uint64_t key)
|
||||
{
|
||||
int r;
|
||||
size_t size;
|
||||
unsigned nr_left, nr_right;
|
||||
struct dm_block *left, *right, *parent;
|
||||
struct btree_node *ln, *rn, *pn;
|
||||
__le64 location;
|
||||
@ -549,36 +657,18 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
|
||||
ln = dm_block_data(left);
|
||||
rn = dm_block_data(right);
|
||||
|
||||
nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
|
||||
nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;
|
||||
|
||||
ln->header.nr_entries = cpu_to_le32(nr_left);
|
||||
|
||||
rn->header.flags = ln->header.flags;
|
||||
rn->header.nr_entries = cpu_to_le32(nr_right);
|
||||
rn->header.nr_entries = cpu_to_le32(0);
|
||||
rn->header.max_entries = ln->header.max_entries;
|
||||
rn->header.value_size = ln->header.value_size;
|
||||
memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));
|
||||
redistribute2(ln, rn);
|
||||
|
||||
size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
|
||||
sizeof(uint64_t) : s->info->value_type.size;
|
||||
memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
|
||||
size * nr_right);
|
||||
|
||||
/*
|
||||
* Patch up the parent
|
||||
*/
|
||||
/* patch up the parent */
|
||||
parent = shadow_parent(s);
|
||||
|
||||
pn = dm_block_data(parent);
|
||||
location = cpu_to_le64(dm_block_location(left));
|
||||
__dm_bless_for_disk(&location);
|
||||
memcpy_disk(value_ptr(pn, parent_index),
|
||||
&location, sizeof(__le64));
|
||||
|
||||
location = cpu_to_le64(dm_block_location(right));
|
||||
__dm_bless_for_disk(&location);
|
||||
|
||||
r = insert_at(sizeof(__le64), pn, parent_index + 1,
|
||||
le64_to_cpu(rn->keys[0]), &location);
|
||||
if (r) {
|
||||
@ -586,6 +676,7 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
|
||||
return r;
|
||||
}
|
||||
|
||||
/* patch up the spine */
|
||||
if (key < le64_to_cpu(rn->keys[0])) {
|
||||
unlock_block(s->info, right);
|
||||
s->nodes[1] = left;
|
||||
@ -597,6 +688,121 @@ static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* We often need to modify a sibling node. This function shadows a particular
|
||||
* child of the given parent node. Making sure to update the parent to point
|
||||
* to the new shadow.
|
||||
*/
|
||||
static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
|
||||
struct btree_node *parent, unsigned index,
|
||||
struct dm_block **result)
|
||||
{
|
||||
int r, inc;
|
||||
dm_block_t root;
|
||||
struct btree_node *node;
|
||||
|
||||
root = value64(parent, index);
|
||||
|
||||
r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
|
||||
result, &inc);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
node = dm_block_data(*result);
|
||||
|
||||
if (inc)
|
||||
inc_children(info->tm, node, vt);
|
||||
|
||||
*((__le64 *) value_ptr(parent, index)) =
|
||||
cpu_to_le64(dm_block_location(*result));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Splits two nodes into three. This is more work, but results in fuller
|
||||
* nodes, so saves metadata space.
|
||||
*/
|
||||
static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
|
||||
struct dm_btree_value_type *vt, uint64_t key)
|
||||
{
|
||||
int r;
|
||||
unsigned middle_index;
|
||||
struct dm_block *left, *middle, *right, *parent;
|
||||
struct btree_node *ln, *rn, *mn, *pn;
|
||||
__le64 location;
|
||||
|
||||
parent = shadow_parent(s);
|
||||
pn = dm_block_data(parent);
|
||||
|
||||
if (parent_index == 0) {
|
||||
middle_index = 1;
|
||||
left = shadow_current(s);
|
||||
r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
|
||||
if (r)
|
||||
return r;
|
||||
} else {
|
||||
middle_index = parent_index;
|
||||
right = shadow_current(s);
|
||||
r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
|
||||
if (r)
|
||||
return r;
|
||||
}
|
||||
|
||||
r = new_block(s->info, &middle);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
ln = dm_block_data(left);
|
||||
mn = dm_block_data(middle);
|
||||
rn = dm_block_data(right);
|
||||
|
||||
mn->header.nr_entries = cpu_to_le32(0);
|
||||
mn->header.flags = ln->header.flags;
|
||||
mn->header.max_entries = ln->header.max_entries;
|
||||
mn->header.value_size = ln->header.value_size;
|
||||
|
||||
redistribute3(ln, mn, rn);
|
||||
|
||||
/* patch up the parent */
|
||||
pn->keys[middle_index] = rn->keys[0];
|
||||
location = cpu_to_le64(dm_block_location(middle));
|
||||
__dm_bless_for_disk(&location);
|
||||
r = insert_at(sizeof(__le64), pn, middle_index,
|
||||
le64_to_cpu(mn->keys[0]), &location);
|
||||
if (r) {
|
||||
if (shadow_current(s) != left)
|
||||
unlock_block(s->info, left);
|
||||
|
||||
unlock_block(s->info, middle);
|
||||
|
||||
if (shadow_current(s) != right)
|
||||
unlock_block(s->info, right);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
/* patch up the spine */
|
||||
if (key < le64_to_cpu(mn->keys[0])) {
|
||||
unlock_block(s->info, middle);
|
||||
unlock_block(s->info, right);
|
||||
s->nodes[1] = left;
|
||||
} else if (key < le64_to_cpu(rn->keys[0])) {
|
||||
unlock_block(s->info, left);
|
||||
unlock_block(s->info, right);
|
||||
s->nodes[1] = middle;
|
||||
} else {
|
||||
unlock_block(s->info, left);
|
||||
unlock_block(s->info, middle);
|
||||
s->nodes[1] = right;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* Splits a node by creating two new children beneath the given node.
|
||||
*
|
||||
@ -690,6 +896,186 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* Redistributes a node's entries with its left sibling.
|
||||
*/
|
||||
static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
|
||||
unsigned parent_index, uint64_t key)
|
||||
{
|
||||
int r;
|
||||
struct dm_block *sib;
|
||||
struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));
|
||||
|
||||
r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
left = dm_block_data(sib);
|
||||
right = dm_block_data(shadow_current(s));
|
||||
redistribute2(left, right);
|
||||
*key_ptr(parent, parent_index) = right->keys[0];
|
||||
|
||||
if (key < le64_to_cpu(right->keys[0])) {
|
||||
unlock_block(s->info, s->nodes[1]);
|
||||
s->nodes[1] = sib;
|
||||
} else {
|
||||
unlock_block(s->info, sib);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Redistributes a nodes entries with its right sibling.
|
||||
*/
|
||||
static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
|
||||
unsigned parent_index, uint64_t key)
|
||||
{
|
||||
int r;
|
||||
struct dm_block *sib;
|
||||
struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));
|
||||
|
||||
r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
left = dm_block_data(shadow_current(s));
|
||||
right = dm_block_data(sib);
|
||||
redistribute2(left, right);
|
||||
*key_ptr(parent, parent_index + 1) = right->keys[0];
|
||||
|
||||
if (key < le64_to_cpu(right->keys[0])) {
|
||||
unlock_block(s->info, sib);
|
||||
} else {
|
||||
unlock_block(s->info, s->nodes[1]);
|
||||
s->nodes[1] = sib;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the number of spare entries in a node.
|
||||
*/
|
||||
static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
|
||||
{
|
||||
int r;
|
||||
unsigned nr_entries;
|
||||
struct dm_block *block;
|
||||
struct btree_node *node;
|
||||
|
||||
r = bn_read_lock(info, b, &block);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
node = dm_block_data(block);
|
||||
nr_entries = le32_to_cpu(node->header.nr_entries);
|
||||
*space = le32_to_cpu(node->header.max_entries) - nr_entries;
|
||||
|
||||
unlock_block(info, block);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make space in a node, either by moving some entries to a sibling,
|
||||
* or creating a new sibling node. SPACE_THRESHOLD defines the minimum
|
||||
* number of free entries that must be in the sibling to make the move
|
||||
* worth while. If the siblings are shared (eg, part of a snapshot),
|
||||
* then they are not touched, since this break sharing and so consume
|
||||
* more space than we save.
|
||||
*/
|
||||
#define SPACE_THRESHOLD 8
|
||||
static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
|
||||
unsigned parent_index, uint64_t key)
|
||||
{
|
||||
int r;
|
||||
struct btree_node *parent = dm_block_data(shadow_parent(s));
|
||||
unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
|
||||
unsigned free_space;
|
||||
int left_shared = 0, right_shared = 0;
|
||||
|
||||
/* Should we move entries to the left sibling? */
|
||||
if (parent_index > 0) {
|
||||
dm_block_t left_b = value64(parent, parent_index - 1);
|
||||
r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
if (!left_shared) {
|
||||
r = get_node_free_space(s->info, left_b, &free_space);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
if (free_space >= SPACE_THRESHOLD)
|
||||
return rebalance_left(s, vt, parent_index, key);
|
||||
}
|
||||
}
|
||||
|
||||
/* Should we move entries to the right sibling? */
|
||||
if (parent_index < (nr_parent - 1)) {
|
||||
dm_block_t right_b = value64(parent, parent_index + 1);
|
||||
r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
if (!right_shared) {
|
||||
r = get_node_free_space(s->info, right_b, &free_space);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
if (free_space >= SPACE_THRESHOLD)
|
||||
return rebalance_right(s, vt, parent_index, key);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We need to split the node, normally we split two nodes
|
||||
* into three. But when inserting a sequence that is either
|
||||
* monotonically increasing or decreasing it's better to split
|
||||
* a single node into two.
|
||||
*/
|
||||
if (left_shared || right_shared || (nr_parent <= 2) ||
|
||||
(parent_index == 0) || (parent_index + 1 == nr_parent)) {
|
||||
return split_one_into_two(s, parent_index, vt, key);
|
||||
} else {
|
||||
return split_two_into_three(s, parent_index, vt, key);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Does the node contain a particular key?
|
||||
*/
|
||||
static bool contains_key(struct btree_node *node, uint64_t key)
|
||||
{
|
||||
int i = lower_bound(node, key);
|
||||
|
||||
if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* In general we preemptively make sure there's a free entry in every
|
||||
* node on the spine when doing an insert. But we can avoid that with
|
||||
* leaf nodes if we know it's an overwrite.
|
||||
*/
|
||||
static bool has_space_for_insert(struct btree_node *node, uint64_t key)
|
||||
{
|
||||
if (node->header.nr_entries == node->header.max_entries) {
|
||||
if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
|
||||
/* we don't need space if it's an overwrite */
|
||||
return contains_key(node, key);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
|
||||
struct dm_btree_value_type *vt,
|
||||
uint64_t key, unsigned *index)
|
||||
@ -719,17 +1105,18 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
|
||||
|
||||
node = dm_block_data(shadow_current(s));
|
||||
|
||||
if (node->header.nr_entries == node->header.max_entries) {
|
||||
if (!has_space_for_insert(node, key)) {
|
||||
if (top)
|
||||
r = btree_split_beneath(s, key);
|
||||
else
|
||||
r = btree_split_sibling(s, i, key);
|
||||
r = rebalance_or_split(s, vt, i, key);
|
||||
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
|
||||
node = dm_block_data(shadow_current(s));
|
||||
/* making space can cause the current node to change */
|
||||
node = dm_block_data(shadow_current(s));
|
||||
}
|
||||
|
||||
i = lower_bound(node, key);
|
||||
|
||||
@ -753,6 +1140,77 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
|
||||
uint64_t key, int *index)
|
||||
{
|
||||
int r, i = -1;
|
||||
struct btree_node *node;
|
||||
|
||||
*index = 0;
|
||||
for (;;) {
|
||||
r = shadow_step(s, root, &s->info->value_type);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
node = dm_block_data(shadow_current(s));
|
||||
|
||||
/*
|
||||
* We have to patch up the parent node, ugly, but I don't
|
||||
* see a way to do this automatically as part of the spine
|
||||
* op.
|
||||
*/
|
||||
if (shadow_has_parent(s) && i >= 0) {
|
||||
__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
|
||||
|
||||
__dm_bless_for_disk(&location);
|
||||
memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
|
||||
&location, sizeof(__le64));
|
||||
}
|
||||
|
||||
node = dm_block_data(shadow_current(s));
|
||||
i = lower_bound(node, key);
|
||||
|
||||
BUG_ON(i < 0);
|
||||
BUG_ON(i >= le32_to_cpu(node->header.nr_entries));
|
||||
|
||||
if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
|
||||
if (key != le64_to_cpu(node->keys[i]))
|
||||
return -EINVAL;
|
||||
break;
|
||||
}
|
||||
|
||||
root = value64(node, i);
|
||||
}
|
||||
|
||||
*index = i;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
|
||||
uint64_t key, int *index,
|
||||
dm_block_t *new_root, struct dm_block **leaf)
|
||||
{
|
||||
int r;
|
||||
struct shadow_spine spine;
|
||||
|
||||
BUG_ON(info->levels > 1);
|
||||
init_shadow_spine(&spine, info);
|
||||
r = __btree_get_overwrite_leaf(&spine, root, key, index);
|
||||
if (!r) {
|
||||
*new_root = shadow_root(&spine);
|
||||
*leaf = shadow_current(&spine);
|
||||
|
||||
/*
|
||||
* Decrement the count so exit_shadow_spine() doesn't
|
||||
* unlock the leaf.
|
||||
*/
|
||||
spine.count--;
|
||||
}
|
||||
exit_shadow_spine(&spine);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static bool need_insert(struct btree_node *node, uint64_t *keys,
|
||||
unsigned level, unsigned index)
|
||||
{
|
||||
@ -829,7 +1287,7 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
|
||||
value_ptr(n, index),
|
||||
value))) {
|
||||
info->value_type.dec(info->value_type.context,
|
||||
value_ptr(n, index));
|
||||
value_ptr(n, index), 1);
|
||||
}
|
||||
memcpy_disk(value_ptr(n, index),
|
||||
value, info->value_type.size);
|
||||
|
@ -51,21 +51,21 @@ struct dm_btree_value_type {
|
||||
*/
|
||||
|
||||
/*
|
||||
* The btree is making a duplicate of the value, for instance
|
||||
* The btree is making a duplicate of a run of values, for instance
|
||||
* because previously-shared btree nodes have now diverged.
|
||||
* @value argument is the new copy that the copy function may modify.
|
||||
* (Probably it just wants to increment a reference count
|
||||
* somewhere.) This method is _not_ called for insertion of a new
|
||||
* value: It is assumed the ref count is already 1.
|
||||
*/
|
||||
void (*inc)(void *context, const void *value);
|
||||
void (*inc)(void *context, const void *value, unsigned count);
|
||||
|
||||
/*
|
||||
* This value is being deleted. The btree takes care of freeing
|
||||
* These values are being deleted. The btree takes care of freeing
|
||||
* the memory pointed to by @value. Often the del function just
|
||||
* needs to decrement a reference count somewhere.
|
||||
* needs to decrement reference counts somewhere.
|
||||
*/
|
||||
void (*dec)(void *context, const void *value);
|
||||
void (*dec)(void *context, const void *value, unsigned count);
|
||||
|
||||
/*
|
||||
* A test for equality between two values. When a value is
|
||||
|
@ -6,6 +6,8 @@
|
||||
|
||||
#include "dm-space-map-common.h"
|
||||
#include "dm-transaction-manager.h"
|
||||
#include "dm-btree-internal.h"
|
||||
#include "dm-persistent-data-internal.h"
|
||||
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/device-mapper.h>
|
||||
@ -409,12 +411,13 @@ int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
|
||||
return r;
|
||||
}
|
||||
|
||||
static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
|
||||
int (*mutator)(void *context, uint32_t old, uint32_t *new),
|
||||
void *context, enum allocation_event *ev)
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
|
||||
uint32_t ref_count, int32_t *nr_allocations)
|
||||
{
|
||||
int r;
|
||||
uint32_t bit, old, ref_count;
|
||||
uint32_t bit, old;
|
||||
struct dm_block *nb;
|
||||
dm_block_t index = b;
|
||||
struct disk_index_entry ie_disk;
|
||||
@ -433,10 +436,9 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
|
||||
return r;
|
||||
}
|
||||
ie_disk.blocknr = cpu_to_le64(dm_block_location(nb));
|
||||
|
||||
bm_le = dm_bitmap_data(nb);
|
||||
old = sm_lookup_bitmap(bm_le, bit);
|
||||
|
||||
old = sm_lookup_bitmap(bm_le, bit);
|
||||
if (old > 2) {
|
||||
r = sm_ll_lookup_big_ref_count(ll, b, &old);
|
||||
if (r < 0) {
|
||||
@ -445,7 +447,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
|
||||
}
|
||||
}
|
||||
|
||||
r = mutator(context, old, &ref_count);
|
||||
if (r) {
|
||||
dm_tm_unlock(ll->tm, nb);
|
||||
return r;
|
||||
@ -453,7 +454,6 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
|
||||
|
||||
if (ref_count <= 2) {
|
||||
sm_set_bitmap(bm_le, bit, ref_count);
|
||||
|
||||
dm_tm_unlock(ll->tm, nb);
|
||||
|
||||
if (old > 2) {
|
||||
@ -480,62 +480,459 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
|
||||
}
|
||||
|
||||
if (ref_count && !old) {
|
||||
*ev = SM_ALLOC;
|
||||
*nr_allocations = 1;
|
||||
ll->nr_allocated++;
|
||||
le32_add_cpu(&ie_disk.nr_free, -1);
|
||||
if (le32_to_cpu(ie_disk.none_free_before) == bit)
|
||||
ie_disk.none_free_before = cpu_to_le32(bit + 1);
|
||||
|
||||
} else if (old && !ref_count) {
|
||||
*ev = SM_FREE;
|
||||
*nr_allocations = -1;
|
||||
ll->nr_allocated--;
|
||||
le32_add_cpu(&ie_disk.nr_free, 1);
|
||||
ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
|
||||
} else
|
||||
*ev = SM_NONE;
|
||||
*nr_allocations = 0;
|
||||
|
||||
return ll->save_ie(ll, index, &ie_disk);
|
||||
}
|
||||
|
||||
static int set_ref_count(void *context, uint32_t old, uint32_t *new)
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
/*
|
||||
* Holds useful intermediate results for the range based inc and dec
|
||||
* operations.
|
||||
*/
|
||||
struct inc_context {
|
||||
struct disk_index_entry ie_disk;
|
||||
struct dm_block *bitmap_block;
|
||||
void *bitmap;
|
||||
|
||||
struct dm_block *overflow_leaf;
|
||||
};
|
||||
|
||||
static inline void init_inc_context(struct inc_context *ic)
|
||||
{
|
||||
*new = *((uint32_t *) context);
|
||||
return 0;
|
||||
ic->bitmap_block = NULL;
|
||||
ic->bitmap = NULL;
|
||||
ic->overflow_leaf = NULL;
|
||||
}
|
||||
|
||||
int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
|
||||
uint32_t ref_count, enum allocation_event *ev)
|
||||
static inline void exit_inc_context(struct ll_disk *ll, struct inc_context *ic)
|
||||
{
|
||||
return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
|
||||
if (ic->bitmap_block)
|
||||
dm_tm_unlock(ll->tm, ic->bitmap_block);
|
||||
if (ic->overflow_leaf)
|
||||
dm_tm_unlock(ll->tm, ic->overflow_leaf);
|
||||
}
|
||||
|
||||
static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
|
||||
static inline void reset_inc_context(struct ll_disk *ll, struct inc_context *ic)
|
||||
{
|
||||
*new = old + 1;
|
||||
return 0;
|
||||
exit_inc_context(ll, ic);
|
||||
init_inc_context(ic);
|
||||
}
|
||||
|
||||
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
|
||||
/*
|
||||
* Confirms a btree node contains a particular key at an index.
|
||||
*/
|
||||
static bool contains_key(struct btree_node *n, uint64_t key, int index)
|
||||
{
|
||||
return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
|
||||
return index >= 0 &&
|
||||
index < le32_to_cpu(n->header.nr_entries) &&
|
||||
le64_to_cpu(n->keys[index]) == key;
|
||||
}
|
||||
|
||||
static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
|
||||
static int __sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic)
|
||||
{
|
||||
if (!old) {
|
||||
DMERR_LIMIT("unable to decrement a reference count below 0");
|
||||
int r;
|
||||
int index;
|
||||
struct btree_node *n;
|
||||
__le32 *v_ptr;
|
||||
uint32_t rc;
|
||||
|
||||
/*
|
||||
* bitmap_block needs to be unlocked because getting the
|
||||
* overflow_leaf may need to allocate, and thus use the space map.
|
||||
*/
|
||||
reset_inc_context(ll, ic);
|
||||
|
||||
r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root,
|
||||
b, &index, &ll->ref_count_root, &ic->overflow_leaf);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
n = dm_block_data(ic->overflow_leaf);
|
||||
|
||||
if (!contains_key(n, b, index)) {
|
||||
DMERR("overflow btree is missing an entry");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
*new = old - 1;
|
||||
v_ptr = value_ptr(n, index);
|
||||
rc = le32_to_cpu(*v_ptr) + 1;
|
||||
*v_ptr = cpu_to_le32(rc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
|
||||
static int sm_ll_inc_overflow(struct ll_disk *ll, dm_block_t b, struct inc_context *ic)
|
||||
{
|
||||
return sm_ll_mutate(ll, b, dec_ref_count, NULL, ev);
|
||||
int index;
|
||||
struct btree_node *n;
|
||||
__le32 *v_ptr;
|
||||
uint32_t rc;
|
||||
|
||||
/*
|
||||
* Do we already have the correct overflow leaf?
|
||||
*/
|
||||
if (ic->overflow_leaf) {
|
||||
n = dm_block_data(ic->overflow_leaf);
|
||||
index = lower_bound(n, b);
|
||||
if (contains_key(n, b, index)) {
|
||||
v_ptr = value_ptr(n, index);
|
||||
rc = le32_to_cpu(*v_ptr) + 1;
|
||||
*v_ptr = cpu_to_le32(rc);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return __sm_ll_inc_overflow(ll, b, ic);
|
||||
}
|
||||
|
||||
static inline int shadow_bitmap(struct ll_disk *ll, struct inc_context *ic)
|
||||
{
|
||||
int r, inc;
|
||||
r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ic->ie_disk.blocknr),
|
||||
&dm_sm_bitmap_validator, &ic->bitmap_block, &inc);
|
||||
if (r < 0) {
|
||||
DMERR("dm_tm_shadow_block() failed");
|
||||
return r;
|
||||
}
|
||||
ic->ie_disk.blocknr = cpu_to_le64(dm_block_location(ic->bitmap_block));
|
||||
ic->bitmap = dm_bitmap_data(ic->bitmap_block);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Once shadow_bitmap has been called, which always happens at the start of inc/dec,
|
||||
* we can reopen the bitmap with a simple write lock, rather than re calling
|
||||
* dm_tm_shadow_block().
|
||||
*/
|
||||
static inline int ensure_bitmap(struct ll_disk *ll, struct inc_context *ic)
|
||||
{
|
||||
if (!ic->bitmap_block) {
|
||||
int r = dm_bm_write_lock(dm_tm_get_bm(ll->tm), le64_to_cpu(ic->ie_disk.blocknr),
|
||||
&dm_sm_bitmap_validator, &ic->bitmap_block);
|
||||
if (r) {
|
||||
DMERR("unable to re-get write lock for bitmap");
|
||||
return r;
|
||||
}
|
||||
ic->bitmap = dm_bitmap_data(ic->bitmap_block);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Loops round incrementing entries in a single bitmap.
|
||||
*/
|
||||
static inline int sm_ll_inc_bitmap(struct ll_disk *ll, dm_block_t b,
|
||||
uint32_t bit, uint32_t bit_end,
|
||||
int32_t *nr_allocations, dm_block_t *new_b,
|
||||
struct inc_context *ic)
|
||||
{
|
||||
int r;
|
||||
__le32 le_rc;
|
||||
uint32_t old;
|
||||
|
||||
for (; bit != bit_end; bit++, b++) {
|
||||
/*
|
||||
* We only need to drop the bitmap if we need to find a new btree
|
||||
* leaf for the overflow. So if it was dropped last iteration,
|
||||
* we now re-get it.
|
||||
*/
|
||||
r = ensure_bitmap(ll, ic);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
old = sm_lookup_bitmap(ic->bitmap, bit);
|
||||
switch (old) {
|
||||
case 0:
|
||||
/* inc bitmap, adjust nr_allocated */
|
||||
sm_set_bitmap(ic->bitmap, bit, 1);
|
||||
(*nr_allocations)++;
|
||||
ll->nr_allocated++;
|
||||
le32_add_cpu(&ic->ie_disk.nr_free, -1);
|
||||
if (le32_to_cpu(ic->ie_disk.none_free_before) == bit)
|
||||
ic->ie_disk.none_free_before = cpu_to_le32(bit + 1);
|
||||
break;
|
||||
|
||||
case 1:
|
||||
/* inc bitmap */
|
||||
sm_set_bitmap(ic->bitmap, bit, 2);
|
||||
break;
|
||||
|
||||
case 2:
|
||||
/* inc bitmap and insert into overflow */
|
||||
sm_set_bitmap(ic->bitmap, bit, 3);
|
||||
reset_inc_context(ll, ic);
|
||||
|
||||
le_rc = cpu_to_le32(3);
|
||||
__dm_bless_for_disk(&le_rc);
|
||||
r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
|
||||
&b, &le_rc, &ll->ref_count_root);
|
||||
if (r < 0) {
|
||||
DMERR("ref count insert failed");
|
||||
return r;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
/*
|
||||
* inc within the overflow tree only.
|
||||
*/
|
||||
r = sm_ll_inc_overflow(ll, b, ic);
|
||||
if (r < 0)
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
*new_b = b;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Finds a bitmap that contains entries in the block range, and increments
|
||||
* them.
|
||||
*/
|
||||
static int __sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e,
|
||||
int32_t *nr_allocations, dm_block_t *new_b)
|
||||
{
|
||||
int r;
|
||||
struct inc_context ic;
|
||||
uint32_t bit, bit_end;
|
||||
dm_block_t index = b;
|
||||
|
||||
init_inc_context(&ic);
|
||||
|
||||
bit = do_div(index, ll->entries_per_block);
|
||||
r = ll->load_ie(ll, index, &ic.ie_disk);
|
||||
if (r < 0)
|
||||
return r;
|
||||
|
||||
r = shadow_bitmap(ll, &ic);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
bit_end = min(bit + (e - b), (dm_block_t) ll->entries_per_block);
|
||||
r = sm_ll_inc_bitmap(ll, b, bit, bit_end, nr_allocations, new_b, &ic);
|
||||
|
||||
exit_inc_context(ll, &ic);
|
||||
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
return ll->save_ie(ll, index, &ic.ie_disk);
|
||||
}
|
||||
|
||||
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e,
|
||||
int32_t *nr_allocations)
|
||||
{
|
||||
*nr_allocations = 0;
|
||||
while (b != e) {
|
||||
int r = __sm_ll_inc(ll, b, e, nr_allocations, &b);
|
||||
if (r)
|
||||
return r;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
/*
 * Removes the entry for block 'b' from the overflow ref count btree.  The
 * cached context is reset first, then dm_btree_remove() updates
 * ll->ref_count_root.  Returns the result of dm_btree_remove().
 */
static int __sm_ll_del_overflow(struct ll_disk *ll, dm_block_t b,
				struct inc_context *ic)
{
	int r;

	reset_inc_context(ll, ic);
	r = dm_btree_remove(&ll->ref_count_info, ll->ref_count_root,
			    &b, &ll->ref_count_root);
	return r;
}
|
||||
|
||||
/*
 * Decrements the count held for block 'b' in the overflow btree, fetching
 * the leaf that holds it.  The count found before the decrement is
 * reported through 'old_rc'.  A count of 3 cannot be decremented inside
 * the btree, so the entry is deleted instead.
 *
 * Returns 0 on success, -EINVAL if the btree has no entry for 'b', or the
 * error from the btree calls.
 */
static int __sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b,
				struct inc_context *ic, uint32_t *old_rc)
{
	int r;
	int index = -1;
	struct btree_node *leaf;
	__le32 *slot;
	uint32_t count;

	reset_inc_context(ll, ic);
	r = btree_get_overwrite_leaf(&ll->ref_count_info, ll->ref_count_root,
				     b, &index, &ll->ref_count_root, &ic->overflow_leaf);
	if (r < 0)
		return r;

	leaf = dm_block_data(ic->overflow_leaf);
	if (!contains_key(leaf, b, index)) {
		DMERR("overflow btree is missing an entry");
		return -EINVAL;
	}

	slot = value_ptr(leaf, index);
	count = le32_to_cpu(*slot);
	*old_rc = count;

	if (count != 3) {
		*slot = cpu_to_le32(count - 1);
		return 0;
	}

	return __sm_ll_del_overflow(ll, b, ic);
}
|
||||
|
||||
/*
 * Decrements the overflow count for block 'b', reusing the overflow leaf
 * cached in the context when it already holds 'b'.  The count found
 * before the decrement is reported through 'old_rc'.  Otherwise the work
 * is handed to __sm_ll_dec_overflow(), which looks the leaf up.
 */
static int sm_ll_dec_overflow(struct ll_disk *ll, dm_block_t b,
			      struct inc_context *ic, uint32_t *old_rc)
{
	/*
	 * Do we already have the correct overflow leaf?
	 */
	if (ic->overflow_leaf) {
		struct btree_node *leaf = dm_block_data(ic->overflow_leaf);
		int pos = lower_bound(leaf, b);

		if (contains_key(leaf, b, pos)) {
			__le32 *slot = value_ptr(leaf, pos);
			uint32_t count = le32_to_cpu(*slot);

			*old_rc = count;

			/* counts of 3 or less are not decremented in the btree */
			if (count <= 3)
				return __sm_ll_del_overflow(ll, b, ic);

			*slot = cpu_to_le32(count - 1);
			return 0;
		}
	}

	return __sm_ll_dec_overflow(ll, b, ic, old_rc);
}
|
||||
|
||||
/*
|
||||
* Loops round decrementing entries in a single bitmap.
|
||||
*/
|
||||
/*
 * Decrements the reference count of each block whose bitmap position lies
 * in [bit, bit_end); b is the block number corresponding to 'bit' and is
 * advanced in step with it.
 *
 * Per entry, depending on the value currently stored in the bitmap:
 *   0 - the block is not referenced; the call fails with -EINVAL.
 *   1 - the entry becomes 0: *nr_allocations and ll->nr_allocated are
 *       decremented, nr_free in the index entry is incremented and
 *       none_free_before is lowered to 'bit' if 'bit' is smaller.
 *   2 - the entry becomes 1.
 *   3 - the count is decremented via sm_ll_dec_overflow(); if the count
 *       reported back was exactly 3, the bitmap entry becomes 2.
 *
 * On success *new_b is set to the block following the last one processed.
 * On failure the error is returned immediately and *new_b is not written.
 */
static inline int sm_ll_dec_bitmap(struct ll_disk *ll, dm_block_t b,
				   uint32_t bit, uint32_t bit_end,
				   struct inc_context *ic,
				   int32_t *nr_allocations, dm_block_t *new_b)
{
	int r;
	uint32_t old;

	for (; bit != bit_end; bit++, b++) {
		/*
		 * The bitmap is only given up when a new overflow leaf has to
		 * be found, so it may be missing after the previous iteration;
		 * make sure it is available again.
		 */
		r = ensure_bitmap(ll, ic);
		if (r)
			return r;

		old = sm_lookup_bitmap(ic->bitmap, bit);

		if (old == 0) {
			DMERR("unable to decrement block");
			return -EINVAL;
		} else if (old == 1) {
			/* last reference gone: the block becomes free */
			sm_set_bitmap(ic->bitmap, bit, 0);
			(*nr_allocations)--;
			ll->nr_allocated--;
			le32_add_cpu(&ic->ie_disk.nr_free, 1);
			ic->ie_disk.none_free_before =
				cpu_to_le32(min(le32_to_cpu(ic->ie_disk.none_free_before), bit));
		} else if (old == 2) {
			/* plain bitmap decrement, the overflow btree is not involved */
			sm_set_bitmap(ic->bitmap, bit, 1);
		} else if (old == 3) {
			/* 'old' is reused to receive the previous overflow count */
			r = sm_ll_dec_overflow(ll, b, ic, &old);
			if (r < 0)
				return r;

			if (old == 3) {
				/* re-acquire the bitmap before recording the new value */
				r = ensure_bitmap(ll, ic);
				if (r)
					return r;

				sm_set_bitmap(ic->bitmap, bit, 2);
			}
		}
	}

	*new_b = b;
	return 0;
}
|
||||
|
||||
/*
 * Decrements the blocks of [b, e) that fall within the bitmap containing b.
 *
 * The index entry for that bitmap is loaded through ll->load_ie, the bitmap
 * is shadowed, and sm_ll_dec_bitmap() is run from b's position up to either
 * the end of the requested range or the end of the bitmap
 * (ll->entries_per_block), whichever comes first.  The inc context is torn
 * down in all cases once the bitmap pass has run; the index entry is only
 * saved back through ll->save_ie when that pass succeeded.
 *
 * *new_b receives the block at which the caller should continue.
 */
static int __sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e,
		       int32_t *nr_allocations, dm_block_t *new_b)
{
	int r;
	uint32_t first_bit, last_bit;
	struct inc_context ic;
	dm_block_t ie_index = b;

	init_inc_context(&ic);

	/* do_div() leaves the quotient in ie_index and returns the remainder */
	first_bit = do_div(ie_index, ll->entries_per_block);

	r = ll->load_ie(ll, ie_index, &ic.ie_disk);
	if (r < 0)
		return r;

	r = shadow_bitmap(ll, &ic);
	if (r)
		return r;

	last_bit = min(first_bit + (e - b), (dm_block_t) ll->entries_per_block);
	r = sm_ll_dec_bitmap(ll, b, first_bit, last_bit, &ic, nr_allocations, new_b);

	exit_inc_context(ll, &ic);

	if (!r)
		r = ll->save_ie(ll, ie_index, &ic.ie_disk);

	return r;
}
|
||||
|
||||
/*
 * Decrements the reference count of every block in the half-open range
 * [b, e).
 *
 * The range is handled one bitmap at a time: each __sm_ll_dec() call
 * processes the part of the range inside a single bitmap and moves b on to
 * where the next call must start.  *nr_allocations is zeroed here and then
 * accumulated across the whole range (it goes down for every block that
 * becomes free).
 *
 * Returns 0 on success, or the first non-zero value from __sm_ll_dec().
 */
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e,
	      int32_t *nr_allocations)
{
	int r = 0;

	*nr_allocations = 0;

	/* b is advanced by __sm_ll_dec() via its new_b out-parameter */
	while (!r && b != e)
		r = __sm_ll_dec(ll, b, e, nr_allocations, &b);

	return r;
}
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
int sm_ll_commit(struct ll_disk *ll)
|
||||
{
|
||||
int r = 0;
|
||||
@ -687,28 +1084,92 @@ int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
static inline int ie_cache_writeback(struct ll_disk *ll, struct ie_cache *iec)
|
||||
{
|
||||
iec->dirty = false;
|
||||
__dm_bless_for_disk(iec->ie);
|
||||
return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root,
|
||||
&iec->index, &iec->ie, &ll->bitmap_root);
|
||||
}
|
||||
|
||||
/*
 * Maps an index-entry number to its slot in the index-entry cache by
 * hashing it with dm_hash_block() under IE_CACHE_MASK.
 */
static inline unsigned hash_index(dm_block_t index)
{
	unsigned slot = dm_hash_block(index, IE_CACHE_MASK);

	return slot;
}
|
||||
|
||||
/*
 * Loads the index entry numbered 'index' into *ie, going through the
 * index-entry cache.
 *
 * Each index maps to exactly one cache slot (see hash_index()):
 *  - if the slot already holds this index, the cached copy is returned;
 *  - otherwise, a dirty occupant is written back first, then the entry is
 *    looked up in the bitmap index btree and, on success, installed in the
 *    slot as a clean entry.
 *
 * The leftover pre-cache statement (an unconditional return of the btree
 * lookup ahead of the declarations) has been dropped; it made everything
 * below it unreachable.
 *
 * Returns 0 on success, or the error from ie_cache_writeback() or
 * dm_btree_lookup().  On a failed lookup the slot is left unchanged.
 */
static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index,
			   struct disk_index_entry *ie)
{
	int r;
	unsigned h = hash_index(index);
	struct ie_cache *iec = ll->ie_cache + h;

	if (iec->valid) {
		/* cache hit */
		if (iec->index == index) {
			memcpy(ie, &iec->ie, sizeof(*ie));
			return 0;
		}

		/* slot is about to be reused: flush its current occupant */
		if (iec->dirty) {
			r = ie_cache_writeback(ll, iec);
			if (r)
				return r;
		}
	}

	r = dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie);
	if (!r) {
		iec->valid = true;
		iec->dirty = false;
		iec->index = index;
		memcpy(&iec->ie, ie, sizeof(*ie));
	}

	return r;
}
|
||||
|
||||
/*
 * Records a new value for the index entry numbered 'index'.
 *
 * The value is only stored in the index-entry cache and marked dirty; it
 * reaches the bitmap index btree when the slot is later written back
 * (on eviction here or in disk_ll_load_ie(), or from disk_ll_commit()).
 * ll->bitmap_index_changed is set on every call.
 *
 * If the slot for this index currently holds a different, dirty entry,
 * that entry is written back first; a failure there is returned and the
 * new value is not stored.
 *
 * The leftover pre-cache statements (bless + unconditional return of a
 * direct btree insert ahead of the declarations) have been dropped; they
 * made everything below them unreachable.
 *
 * Returns 0 on success, or the error from ie_cache_writeback().
 */
static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index,
			   struct disk_index_entry *ie)
{
	int r;
	unsigned h = hash_index(index);
	struct ie_cache *iec = ll->ie_cache + h;

	ll->bitmap_index_changed = true;
	if (iec->valid) {
		/* same entry already cached: overwrite in place */
		if (iec->index == index) {
			memcpy(&iec->ie, ie, sizeof(*ie));
			iec->dirty = true;
			return 0;
		}

		/* slot is about to be reused: flush its current occupant */
		if (iec->dirty) {
			r = ie_cache_writeback(ll, iec);
			if (r)
				return r;
		}
	}

	iec->valid = true;
	iec->dirty = true;
	iec->index = index;
	memcpy(&iec->ie, ie, sizeof(*ie));
	return 0;
}
|
||||
|
||||
static int disk_ll_init_index(struct ll_disk *ll)
|
||||
{
|
||||
unsigned i;
|
||||
for (i = 0; i < IE_CACHE_SIZE; i++) {
|
||||
struct ie_cache *iec = ll->ie_cache + i;
|
||||
iec->valid = false;
|
||||
iec->dirty = false;
|
||||
}
|
||||
return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root);
|
||||
}
|
||||
|
||||
/*
 * Open hook for the disk index: performs no work and always reports
 * success.
 */
static int disk_ll_open(struct ll_disk *ll)
{
	/* nothing to do */
	return 0;
}
|
||||
|
||||
@ -719,7 +1180,16 @@ static dm_block_t disk_ll_max_entries(struct ll_disk *ll)
|
||||
|
||||
static int disk_ll_commit(struct ll_disk *ll)
|
||||
{
|
||||
return 0;
|
||||
int r = 0;
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < IE_CACHE_SIZE; i++) {
|
||||
struct ie_cache *iec = ll->ie_cache + i;
|
||||
if (iec->valid && iec->dirty)
|
||||
r = ie_cache_writeback(ll, iec);
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm)
|
||||
|
@ -54,6 +54,20 @@ typedef int (*open_index_fn)(struct ll_disk *ll);
|
||||
typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll);
|
||||
typedef int (*commit_fn)(struct ll_disk *ll);
|
||||
|
||||
/*
|
||||
* A lot of time can be wasted reading and writing the same
|
||||
* index entry. So we cache a few entries.
|
||||
*/
|
||||
#define IE_CACHE_SIZE 64
|
||||
#define IE_CACHE_MASK (IE_CACHE_SIZE - 1)
|
||||
|
||||
/*
 * One slot of the index-entry cache embedded in struct ll_disk.
 */
struct ie_cache {
|
||||
/* true once the slot holds an entry; cleared by disk_ll_init_index() */
bool valid;
|
||||
/* true when 'ie' has been changed by disk_ll_save_ie() and not yet written back */
bool dirty;
|
||||
/* number of the index entry held in this slot (key into the bitmap index btree) */
dm_block_t index;
|
||||
/* cached copy of the index entry itself */
struct disk_index_entry ie;
|
||||
};
|
||||
|
||||
struct ll_disk {
|
||||
struct dm_transaction_manager *tm;
|
||||
struct dm_btree_info bitmap_info;
|
||||
@ -79,6 +93,8 @@ struct ll_disk {
|
||||
max_index_entries_fn max_entries;
|
||||
commit_fn commit;
|
||||
bool bitmap_index_changed:1;
|
||||
|
||||
struct ie_cache ie_cache[IE_CACHE_SIZE];
|
||||
};
|
||||
|
||||
struct disk_sm_root {
|
||||
@ -96,12 +112,6 @@ struct disk_bitmap_header {
|
||||
__le64 blocknr;
|
||||
} __attribute__ ((packed, aligned(8)));
|
||||
|
||||
enum allocation_event {
|
||||
SM_NONE,
|
||||
SM_ALLOC,
|
||||
SM_FREE,
|
||||
};
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
|
||||
int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks);
|
||||
@ -111,9 +121,15 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
|
||||
dm_block_t end, dm_block_t *result);
|
||||
int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll,
|
||||
dm_block_t begin, dm_block_t end, dm_block_t *result);
|
||||
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
|
||||
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
|
||||
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
|
||||
|
||||
/*
|
||||
* The next three functions return (via nr_allocations) the net number of
|
||||
* allocations that were made. This number may be negative if there were
|
||||
* more frees than allocs.
|
||||
*/
|
||||
int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, int32_t *nr_allocations);
|
||||
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations);
|
||||
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, dm_block_t e, int32_t *nr_allocations);
|
||||
int sm_ll_commit(struct ll_disk *ll);
|
||||
|
||||
int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm);
|
||||
|
@ -87,76 +87,39 @@ static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
|
||||
uint32_t count)
|
||||
{
|
||||
int r;
|
||||
uint32_t old_count;
|
||||
enum allocation_event ev;
|
||||
int32_t nr_allocations;
|
||||
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
|
||||
|
||||
r = sm_ll_insert(&smd->ll, b, count, &ev);
|
||||
r = sm_ll_insert(&smd->ll, b, count, &nr_allocations);
|
||||
if (!r) {
|
||||
switch (ev) {
|
||||
case SM_NONE:
|
||||
break;
|
||||
|
||||
case SM_ALLOC:
|
||||
/*
|
||||
* This _must_ be free in the prior transaction
|
||||
* otherwise we've lost atomicity.
|
||||
*/
|
||||
smd->nr_allocated_this_transaction++;
|
||||
break;
|
||||
|
||||
case SM_FREE:
|
||||
/*
|
||||
* It's only free if it's also free in the last
|
||||
* transaction.
|
||||
*/
|
||||
r = sm_ll_lookup(&smd->old_ll, b, &old_count);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
if (!old_count)
|
||||
smd->nr_allocated_this_transaction--;
|
||||
break;
|
||||
}
|
||||
smd->nr_allocated_this_transaction += nr_allocations;
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
|
||||
static int sm_disk_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
|
||||
{
|
||||
int r;
|
||||
enum allocation_event ev;
|
||||
int32_t nr_allocations;
|
||||
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
|
||||
|
||||
r = sm_ll_inc(&smd->ll, b, &ev);
|
||||
if (!r && (ev == SM_ALLOC))
|
||||
/*
|
||||
* This _must_ be free in the prior transaction
|
||||
* otherwise we've lost atomicity.
|
||||
*/
|
||||
smd->nr_allocated_this_transaction++;
|
||||
r = sm_ll_inc(&smd->ll, b, e, &nr_allocations);
|
||||
if (!r)
|
||||
smd->nr_allocated_this_transaction += nr_allocations;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
|
||||
static int sm_disk_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
|
||||
{
|
||||
int r;
|
||||
uint32_t old_count;
|
||||
enum allocation_event ev;
|
||||
int32_t nr_allocations;
|
||||
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
|
||||
|
||||
r = sm_ll_dec(&smd->ll, b, &ev);
|
||||
if (!r && (ev == SM_FREE)) {
|
||||
/*
|
||||
* It's only free if it's also free in the last
|
||||
* transaction.
|
||||
*/
|
||||
r = sm_ll_lookup(&smd->old_ll, b, &old_count);
|
||||
if (!r && !old_count)
|
||||
smd->nr_allocated_this_transaction--;
|
||||
}
|
||||
r = sm_ll_dec(&smd->ll, b, e, &nr_allocations);
|
||||
if (!r)
|
||||
smd->nr_allocated_this_transaction += nr_allocations;
|
||||
|
||||
return r;
|
||||
}
|
||||
@ -164,21 +127,28 @@ static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
|
||||
static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
|
||||
{
|
||||
int r;
|
||||
enum allocation_event ev;
|
||||
int32_t nr_allocations;
|
||||
struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
|
||||
|
||||
/*
|
||||
* Any block we allocate has to be free in both the old and current ll.
|
||||
*/
|
||||
r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b);
|
||||
if (r == -ENOSPC) {
|
||||
/*
|
||||
* There's no free block between smd->begin and the end of the metadata device.
|
||||
* We search before smd->begin in case something has been freed.
|
||||
*/
|
||||
r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b);
|
||||
}
|
||||
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
smd->begin = *b + 1;
|
||||
r = sm_ll_inc(&smd->ll, *b, &ev);
|
||||
r = sm_ll_inc(&smd->ll, *b, *b + 1, &nr_allocations);
|
||||
if (!r) {
|
||||
BUG_ON(ev != SM_ALLOC);
|
||||
smd->nr_allocated_this_transaction++;
|
||||
smd->nr_allocated_this_transaction += nr_allocations;
|
||||
}
|
||||
|
||||
return r;
|
||||
@ -194,7 +164,6 @@ static int sm_disk_commit(struct dm_space_map *sm)
|
||||
return r;
|
||||
|
||||
memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
|
||||
smd->begin = 0;
|
||||
smd->nr_allocated_this_transaction = 0;
|
||||
|
||||
return 0;
|
||||
@ -235,8 +204,8 @@ static struct dm_space_map ops = {
|
||||
.get_count = sm_disk_get_count,
|
||||
.count_is_more_than_one = sm_disk_count_is_more_than_one,
|
||||
.set_count = sm_disk_set_count,
|
||||
.inc_block = sm_disk_inc_block,
|
||||
.dec_block = sm_disk_dec_block,
|
||||
.inc_blocks = sm_disk_inc_blocks,
|
||||
.dec_blocks = sm_disk_dec_blocks,
|
||||
.new_block = sm_disk_new_block,
|
||||
.commit = sm_disk_commit,
|
||||
.root_size = sm_disk_root_size,
|
||||
|
@ -89,7 +89,8 @@ enum block_op_type {
|
||||
|
||||
struct block_op {
|
||||
enum block_op_type type;
|
||||
dm_block_t block;
|
||||
dm_block_t b;
|
||||
dm_block_t e;
|
||||
};
|
||||
|
||||
struct bop_ring_buffer {
|
||||
@ -116,7 +117,7 @@ static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
|
||||
}
|
||||
|
||||
static int brb_push(struct bop_ring_buffer *brb,
|
||||
enum block_op_type type, dm_block_t b)
|
||||
enum block_op_type type, dm_block_t b, dm_block_t e)
|
||||
{
|
||||
struct block_op *bop;
|
||||
unsigned next = brb_next(brb, brb->end);
|
||||
@ -130,7 +131,8 @@ static int brb_push(struct bop_ring_buffer *brb,
|
||||
|
||||
bop = brb->bops + brb->end;
|
||||
bop->type = type;
|
||||
bop->block = b;
|
||||
bop->b = b;
|
||||
bop->e = e;
|
||||
|
||||
brb->end = next;
|
||||
|
||||
@ -145,9 +147,7 @@ static int brb_peek(struct bop_ring_buffer *brb, struct block_op *result)
|
||||
return -ENODATA;
|
||||
|
||||
bop = brb->bops + brb->begin;
|
||||
result->type = bop->type;
|
||||
result->block = bop->block;
|
||||
|
||||
memcpy(result, bop, sizeof(*result));
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -178,10 +178,9 @@ struct sm_metadata {
|
||||
struct threshold threshold;
|
||||
};
|
||||
|
||||
static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
|
||||
static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b, dm_block_t e)
|
||||
{
|
||||
int r = brb_push(&smm->uncommitted, type, b);
|
||||
|
||||
int r = brb_push(&smm->uncommitted, type, b, e);
|
||||
if (r) {
|
||||
DMERR("too many recursive allocations");
|
||||
return -ENOMEM;
|
||||
@ -193,15 +192,15 @@ static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t
|
||||
static int commit_bop(struct sm_metadata *smm, struct block_op *op)
|
||||
{
|
||||
int r = 0;
|
||||
enum allocation_event ev;
|
||||
int32_t nr_allocations;
|
||||
|
||||
switch (op->type) {
|
||||
case BOP_INC:
|
||||
r = sm_ll_inc(&smm->ll, op->block, &ev);
|
||||
r = sm_ll_inc(&smm->ll, op->b, op->e, &nr_allocations);
|
||||
break;
|
||||
|
||||
case BOP_DEC:
|
||||
r = sm_ll_dec(&smm->ll, op->block, &ev);
|
||||
r = sm_ll_dec(&smm->ll, op->b, op->e, &nr_allocations);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -314,7 +313,7 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
|
||||
i = brb_next(&smm->uncommitted, i)) {
|
||||
struct block_op *op = smm->uncommitted.bops + i;
|
||||
|
||||
if (op->block != b)
|
||||
if (b < op->b || b >= op->e)
|
||||
continue;
|
||||
|
||||
switch (op->type) {
|
||||
@ -355,7 +354,7 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
|
||||
|
||||
struct block_op *op = smm->uncommitted.bops + i;
|
||||
|
||||
if (op->block != b)
|
||||
if (b < op->b || b >= op->e)
|
||||
continue;
|
||||
|
||||
switch (op->type) {
|
||||
@ -393,7 +392,7 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
|
||||
uint32_t count)
|
||||
{
|
||||
int r, r2;
|
||||
enum allocation_event ev;
|
||||
int32_t nr_allocations;
|
||||
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
|
||||
|
||||
if (smm->recursion_count) {
|
||||
@ -402,40 +401,42 @@ static int sm_metadata_set_count(struct dm_space_map *sm, dm_block_t b,
|
||||
}
|
||||
|
||||
in(smm);
|
||||
r = sm_ll_insert(&smm->ll, b, count, &ev);
|
||||
r = sm_ll_insert(&smm->ll, b, count, &nr_allocations);
|
||||
r2 = out(smm);
|
||||
|
||||
return combine_errors(r, r2);
|
||||
}
|
||||
|
||||
static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
|
||||
static int sm_metadata_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
|
||||
{
|
||||
int r, r2 = 0;
|
||||
enum allocation_event ev;
|
||||
int32_t nr_allocations;
|
||||
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
|
||||
|
||||
if (recursing(smm))
|
||||
r = add_bop(smm, BOP_INC, b);
|
||||
else {
|
||||
if (recursing(smm)) {
|
||||
r = add_bop(smm, BOP_INC, b, e);
|
||||
if (r)
|
||||
return r;
|
||||
} else {
|
||||
in(smm);
|
||||
r = sm_ll_inc(&smm->ll, b, &ev);
|
||||
r = sm_ll_inc(&smm->ll, b, e, &nr_allocations);
|
||||
r2 = out(smm);
|
||||
}
|
||||
|
||||
return combine_errors(r, r2);
|
||||
}
|
||||
|
||||
static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
|
||||
static int sm_metadata_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
|
||||
{
|
||||
int r, r2 = 0;
|
||||
enum allocation_event ev;
|
||||
int32_t nr_allocations;
|
||||
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
|
||||
|
||||
if (recursing(smm))
|
||||
r = add_bop(smm, BOP_DEC, b);
|
||||
r = add_bop(smm, BOP_DEC, b, e);
|
||||
else {
|
||||
in(smm);
|
||||
r = sm_ll_dec(&smm->ll, b, &ev);
|
||||
r = sm_ll_dec(&smm->ll, b, e, &nr_allocations);
|
||||
r2 = out(smm);
|
||||
}
|
||||
|
||||
@ -445,23 +446,31 @@ static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
|
||||
static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
|
||||
{
|
||||
int r, r2 = 0;
|
||||
enum allocation_event ev;
|
||||
int32_t nr_allocations;
|
||||
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
|
||||
|
||||
/*
|
||||
* Any block we allocate has to be free in both the old and current ll.
|
||||
*/
|
||||
r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b);
|
||||
if (r == -ENOSPC) {
|
||||
/*
|
||||
* There's no free block between smm->begin and the end of the metadata device.
|
||||
* We search before smm->begin in case something has been freed.
|
||||
*/
|
||||
r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b);
|
||||
}
|
||||
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
smm->begin = *b + 1;
|
||||
|
||||
if (recursing(smm))
|
||||
r = add_bop(smm, BOP_INC, *b);
|
||||
r = add_bop(smm, BOP_INC, *b, *b + 1);
|
||||
else {
|
||||
in(smm);
|
||||
r = sm_ll_inc(&smm->ll, *b, &ev);
|
||||
r = sm_ll_inc(&smm->ll, *b, *b + 1, &nr_allocations);
|
||||
r2 = out(smm);
|
||||
}
|
||||
|
||||
@ -503,7 +512,6 @@ static int sm_metadata_commit(struct dm_space_map *sm)
|
||||
return r;
|
||||
|
||||
memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
|
||||
smm->begin = 0;
|
||||
smm->allocated_this_transaction = 0;
|
||||
|
||||
return 0;
|
||||
@ -556,8 +564,8 @@ static const struct dm_space_map ops = {
|
||||
.get_count = sm_metadata_get_count,
|
||||
.count_is_more_than_one = sm_metadata_count_is_more_than_one,
|
||||
.set_count = sm_metadata_set_count,
|
||||
.inc_block = sm_metadata_inc_block,
|
||||
.dec_block = sm_metadata_dec_block,
|
||||
.inc_blocks = sm_metadata_inc_blocks,
|
||||
.dec_blocks = sm_metadata_dec_blocks,
|
||||
.new_block = sm_metadata_new_block,
|
||||
.commit = sm_metadata_commit,
|
||||
.root_size = sm_metadata_root_size,
|
||||
@ -641,18 +649,28 @@ static int sm_bootstrap_new_block(struct dm_space_map *sm, dm_block_t *b)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sm_bootstrap_inc_block(struct dm_space_map *sm, dm_block_t b)
|
||||
static int sm_bootstrap_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
|
||||
{
|
||||
int r;
|
||||
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
|
||||
|
||||
return add_bop(smm, BOP_INC, b);
|
||||
r = add_bop(smm, BOP_INC, b, e);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sm_bootstrap_dec_block(struct dm_space_map *sm, dm_block_t b)
|
||||
static int sm_bootstrap_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
|
||||
{
|
||||
int r;
|
||||
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
|
||||
|
||||
return add_bop(smm, BOP_DEC, b);
|
||||
r = add_bop(smm, BOP_DEC, b, e);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sm_bootstrap_commit(struct dm_space_map *sm)
|
||||
@ -683,8 +701,8 @@ static const struct dm_space_map bootstrap_ops = {
|
||||
.get_count = sm_bootstrap_get_count,
|
||||
.count_is_more_than_one = sm_bootstrap_count_is_more_than_one,
|
||||
.set_count = sm_bootstrap_set_count,
|
||||
.inc_block = sm_bootstrap_inc_block,
|
||||
.dec_block = sm_bootstrap_dec_block,
|
||||
.inc_blocks = sm_bootstrap_inc_blocks,
|
||||
.dec_blocks = sm_bootstrap_dec_blocks,
|
||||
.new_block = sm_bootstrap_new_block,
|
||||
.commit = sm_bootstrap_commit,
|
||||
.root_size = sm_bootstrap_root_size,
|
||||
@ -696,7 +714,7 @@ static const struct dm_space_map bootstrap_ops = {
|
||||
|
||||
static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
|
||||
{
|
||||
int r, i;
|
||||
int r;
|
||||
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
|
||||
dm_block_t old_len = smm->ll.nr_blocks;
|
||||
|
||||
@ -718,9 +736,7 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
|
||||
* allocate any new blocks.
|
||||
*/
|
||||
do {
|
||||
for (i = old_len; !r && i < smm->begin; i++)
|
||||
r = add_bop(smm, BOP_INC, i);
|
||||
|
||||
r = add_bop(smm, BOP_INC, old_len, smm->begin);
|
||||
if (r)
|
||||
goto out;
|
||||
|
||||
@ -767,7 +783,6 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
|
||||
dm_block_t superblock)
|
||||
{
|
||||
int r;
|
||||
dm_block_t i;
|
||||
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
|
||||
|
||||
smm->begin = superblock + 1;
|
||||
@ -792,9 +807,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
|
||||
* Now we need to update the newly created data structures with the
|
||||
* allocated blocks that they were built from.
|
||||
*/
|
||||
for (i = superblock; !r && i < smm->begin; i++)
|
||||
r = add_bop(smm, BOP_INC, i);
|
||||
|
||||
r = add_bop(smm, BOP_INC, superblock, smm->begin);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
|
@ -46,8 +46,8 @@ struct dm_space_map {
|
||||
|
||||
int (*commit)(struct dm_space_map *sm);
|
||||
|
||||
int (*inc_block)(struct dm_space_map *sm, dm_block_t b);
|
||||
int (*dec_block)(struct dm_space_map *sm, dm_block_t b);
|
||||
int (*inc_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e);
|
||||
int (*dec_blocks)(struct dm_space_map *sm, dm_block_t b, dm_block_t e);
|
||||
|
||||
/*
|
||||
* new_block will increment the returned block.
|
||||
@ -117,14 +117,24 @@ static inline int dm_sm_commit(struct dm_space_map *sm)
|
||||
return sm->commit(sm);
|
||||
}
|
||||
|
||||
/*
 * Increments the reference counts of the blocks in [b, e) by dispatching
 * to the space map's inc_blocks method.  Returns that method's result.
 */
static inline int dm_sm_inc_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
	int r = sm->inc_blocks(sm, b, e);

	return r;
}
|
||||
|
||||
/*
 * Increments the reference count of the single block b.
 *
 * Implemented as the one-block range [b, b + 1) on top of
 * dm_sm_inc_blocks().  The leftover call through the per-block inc_block
 * method has been dropped: it returned first, so the range-based call
 * below it was never reached.
 */
static inline int dm_sm_inc_block(struct dm_space_map *sm, dm_block_t b)
{
	return dm_sm_inc_blocks(sm, b, b + 1);
}
|
||||
|
||||
/*
 * Decrements the reference counts of the blocks in [b, e) by dispatching
 * to the space map's dec_blocks method.  Returns that method's result.
 */
static inline int dm_sm_dec_blocks(struct dm_space_map *sm, dm_block_t b, dm_block_t e)
{
	int r = sm->dec_blocks(sm, b, e);

	return r;
}
|
||||
|
||||
/*
 * Decrements the reference count of the single block b.
 *
 * Implemented as the one-block range [b, b + 1) on top of
 * dm_sm_dec_blocks().  The leftover call through the per-block dec_block
 * method has been dropped: it returned first, so the range-based call
 * below it was never reached.
 */
static inline int dm_sm_dec_block(struct dm_space_map *sm, dm_block_t b)
{
	return dm_sm_dec_blocks(sm, b, b + 1);
}
|
||||
|
||||
static inline int dm_sm_new_block(struct dm_space_map *sm, dm_block_t *b)
|
||||
|
@ -359,6 +359,17 @@ void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dm_tm_inc);
|
||||
|
||||
/*
 * Increments the reference counts of the blocks in [b, e) in the
 * transaction manager's space map.
 *
 * Must not be called on a non-blocking clone; doing so trips BUG_ON().
 * The result of dm_sm_inc_blocks() is discarded, so a failure is not
 * reported to the caller.
 */
void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e)
{
	struct dm_space_map *sm = tm->sm;

	/* Range updates are not available on the non-blocking clone. */
	BUG_ON(tm->is_clone);

	dm_sm_inc_blocks(sm, b, e);
}
|
||||
EXPORT_SYMBOL_GPL(dm_tm_inc_range);
|
||||
|
||||
void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
|
||||
{
|
||||
/*
|
||||
@ -370,6 +381,47 @@ void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dm_tm_dec);
|
||||
|
||||
/*
 * Decrements the reference counts of the blocks in [b, e) in the
 * transaction manager's space map.
 *
 * Must not be called on a non-blocking clone; doing so trips BUG_ON().
 * The result of dm_sm_dec_blocks() is discarded, so a failure is not
 * reported to the caller.
 */
void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e)
{
	struct dm_space_map *sm = tm->sm;

	/* Range updates are not available on the non-blocking clone. */
	BUG_ON(tm->is_clone);

	dm_sm_dec_blocks(sm, b, e);
}
|
||||
EXPORT_SYMBOL_GPL(dm_tm_dec_range);
|
||||
|
||||
void dm_tm_with_runs(struct dm_transaction_manager *tm,
|
||||
const __le64 *value_le, unsigned count, dm_tm_run_fn fn)
|
||||
{
|
||||
uint64_t b, begin, end;
|
||||
bool in_run = false;
|
||||
unsigned i;
|
||||
|
||||
for (i = 0; i < count; i++, value_le++) {
|
||||
b = le64_to_cpu(*value_le);
|
||||
|
||||
if (in_run) {
|
||||
if (b == end)
|
||||
end++;
|
||||
else {
|
||||
fn(tm, begin, end);
|
||||
begin = b;
|
||||
end = b + 1;
|
||||
}
|
||||
} else {
|
||||
in_run = true;
|
||||
begin = b;
|
||||
end = b + 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (in_run)
|
||||
fn(tm, begin, end);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(dm_tm_with_runs);
|
||||
|
||||
int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
|
||||
uint32_t *result)
|
||||
{
|
||||
@ -379,6 +431,15 @@ int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
|
||||
return dm_sm_get_count(tm->sm, b, result);
|
||||
}
|
||||
|
||||
/*
 * Reports through *result whether block b has a reference count greater
 * than one, as determined by dm_sm_count_is_more_than_one().
 *
 * Returns -EWOULDBLOCK without consulting the space map when tm is a
 * clone; otherwise returns the space map call's result.
 */
int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b,
			  int *result)
{
	if (!tm->is_clone)
		return dm_sm_count_is_more_than_one(tm->sm, b, result);

	return -EWOULDBLOCK;
}
|
||||
|
||||
struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm)
|
||||
{
|
||||
return tm->bm;
|
||||
|
@ -100,11 +100,27 @@ void dm_tm_unlock(struct dm_transaction_manager *tm, struct dm_block *b);
|
||||
* Functions for altering the reference count of a block directly.
|
||||
*/
|
||||
void dm_tm_inc(struct dm_transaction_manager *tm, dm_block_t b);
|
||||
|
||||
void dm_tm_inc_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e);
|
||||
void dm_tm_dec(struct dm_transaction_manager *tm, dm_block_t b);
|
||||
void dm_tm_dec_range(struct dm_transaction_manager *tm, dm_block_t b, dm_block_t e);
|
||||
|
||||
int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
|
||||
uint32_t *result);
|
||||
/*
|
||||
* Builds up runs of adjacent blocks, and then calls the given fn
|
||||
* (typically dm_tm_inc_range/dm_tm_dec_range). Very useful when you have to perform
|
||||
* the same tm operation on all values in a btree leaf.
|
||||
*/
|
||||
typedef void (*dm_tm_run_fn)(struct dm_transaction_manager *, dm_block_t, dm_block_t);
|
||||
void dm_tm_with_runs(struct dm_transaction_manager *tm,
|
||||
const __le64 *value_le, unsigned count, dm_tm_run_fn fn);
|
||||
|
||||
int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b, uint32_t *result);
|
||||
|
||||
/*
|
||||
* Finds out if a given block is shared (ie. has a reference count higher
|
||||
* than one).
|
||||
*/
|
||||
int dm_tm_block_is_shared(struct dm_transaction_manager *tm, dm_block_t b,
|
||||
int *result);
|
||||
|
||||
struct dm_block_manager *dm_tm_get_bm(struct dm_transaction_manager *tm);
|
||||
|
||||
|
@ -300,6 +300,7 @@ enum {
|
||||
BIO_CGROUP_ACCT, /* has been accounted to a cgroup */
|
||||
BIO_TRACKED, /* set if bio goes through the rq_qos path */
|
||||
BIO_REMAPPED,
|
||||
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
|
||||
BIO_FLAG_LAST
|
||||
};
|
||||
|
||||
|
@ -1012,6 +1012,18 @@ static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
|
||||
/* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */
|
||||
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond);
|
||||
|
||||
static inline unsigned int bio_zone_no(struct bio *bio)
|
||||
{
|
||||
return blk_queue_zone_no(bdev_get_queue(bio->bi_bdev),
|
||||
bio->bi_iter.bi_sector);
|
||||
}
|
||||
|
||||
static inline unsigned int bio_zone_is_seq(struct bio *bio)
|
||||
{
|
||||
return blk_queue_zone_is_seq(bdev_get_queue(bio->bi_bdev),
|
||||
bio->bi_iter.bi_sector);
|
||||
}
|
||||
|
||||
static inline unsigned int blk_rq_zone_no(struct request *rq)
|
||||
{
|
||||
return blk_queue_zone_no(rq->q, blk_rq_pos(rq));
|
||||
|
@ -361,6 +361,12 @@ struct dm_target {
|
||||
* Set if we need to limit the number of in-flight bios when swapping.
|
||||
*/
|
||||
bool limit_swap_bios:1;
|
||||
|
||||
/*
|
||||
* Set if this target implements a zoned device and needs emulation of
|
||||
* zone append operations using regular writes.
|
||||
*/
|
||||
bool emulate_zone_append:1;
|
||||
};
|
||||
|
||||
void *dm_per_bio_data(struct bio *bio, size_t data_size);
|
||||
@ -478,7 +484,8 @@ struct dm_report_zones_args {
|
||||
/* must be filled by ->report_zones before calling dm_report_zones_cb */
|
||||
sector_t start;
|
||||
};
|
||||
int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data);
|
||||
int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
|
||||
struct dm_report_zones_args *args, unsigned int nr_zones);
|
||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
|
||||
/*
|
||||
|
@ -51,6 +51,7 @@ MODULE_PARM_DESC(name, description)
|
||||
struct dm_kcopyd_client;
|
||||
struct dm_kcopyd_client *dm_kcopyd_client_create(struct dm_kcopyd_throttle *throttle);
|
||||
void dm_kcopyd_client_destroy(struct dm_kcopyd_client *kc);
|
||||
void dm_kcopyd_client_flush(struct dm_kcopyd_client *kc);
|
||||
|
||||
/*
|
||||
* Submit a copy job to kcopyd. This is built on top of the
|
||||
|
Loading…
Reference in New Issue
Block a user