From e0d6609a5fe34463ae2fd48d846931f70de8b37b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 14 Mar 2014 18:40:39 -0400 Subject: [PATCH 01/11] dm: change sector_count member in clone_info from sector_t to unsigned It is impossible to create bios with 2^23 or more sectors (the size is stored as a 32-bit byte count in the bio). So we convert some sector_t values to unsigned integers. This is needed for the next commit ("dm: introduce dm_accept_partial_bio") that replaces integer value arguments with pointers, so the size of the integer must match. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 455e64916498..368a20dd85c2 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1152,10 +1152,10 @@ struct clone_info { struct bio *bio; struct dm_io *io; sector_t sector; - sector_t sector_count; + unsigned sector_count; }; -static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) +static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) { bio->bi_iter.bi_sector = sector; bio->bi_iter.bi_size = to_bytes(len); @@ -1200,7 +1200,7 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, static void __clone_and_map_simple_bio(struct clone_info *ci, struct dm_target *ti, - unsigned target_bio_nr, sector_t len) + unsigned target_bio_nr, unsigned len) { struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); struct bio *clone = &tio->clone; @@ -1218,7 +1218,7 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, } static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, sector_t len) + unsigned num_bios, unsigned len) { unsigned target_bio_nr; @@ -1283,7 +1283,7 @@ static int __send_changing_extent_only(struct clone_info *ci, is_split_required_fn is_split_required) { struct dm_target *ti; - sector_t len; + unsigned len; unsigned num_bios; do { @@ -1302,9 +1302,9 @@ static int __send_changing_extent_only(struct clone_info *ci, return -EOPNOTSUPP; if (is_split_required && !is_split_required(ti)) - len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); else - len = min(ci->sector_count, max_io_len(ci->sector, ti)); + len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); __send_duplicate_bios(ci, ti, num_bios, len); From 1dd40c3ecd9b8a4ab91dbf2e6ce10b82a3b5ae63 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 14 Mar 2014 18:41:24 -0400 Subject: [PATCH 02/11] dm: introduce dm_accept_partial_bio The function dm_accept_partial_bio allows the target to specify how many sectors of the current bio it will process. If the target only wants to accept part of the bio, it calls dm_accept_partial_bio and the DM core sends the rest of the data in next bio. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 59 ++++++++++++++++++++++++++++++----- include/linux/device-mapper.h | 2 ++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 368a20dd85c2..97940fc8c302 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1110,6 +1110,46 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); +/* + * A target may call dm_accept_partial_bio only from the map routine. It is + * allowed for all bio types except REQ_FLUSH. + * + * dm_accept_partial_bio informs the dm that the target only wants to process + * additional n_sectors sectors of the bio and the rest of the data should be + * sent in a next bio. + * + * A diagram that explains the arithmetics: + * +--------------------+---------------+-------+ + * | 1 | 2 | 3 | + * +--------------------+---------------+-------+ + * + * <-------------- *tio->len_ptr ---------------> + * <------- bi_size -------> + * <-- n_sectors --> + * + * Region 1 was already iterated over with bio_advance or similar function. + * (it may be empty if the target doesn't use bio_advance) + * Region 2 is the remaining bio size that the target wants to process. + * (it may be empty if region 1 is non-empty, although there is no reason + * to make it empty) + * The target requires that region 3 is to be sent in the next bio. + * + * If the target wants to receive multiple copies of the bio (via num_*bios, etc), + * the partially processed part (the sum of regions 1+2) must be the same for all + * copies of the bio. + */ +void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + BUG_ON(bio->bi_rw & REQ_FLUSH); + BUG_ON(bi_size > *tio->len_ptr); + BUG_ON(n_sectors > bi_size); + *tio->len_ptr -= bi_size - n_sectors; + bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; +} +EXPORT_SYMBOL_GPL(dm_accept_partial_bio); + static void __map_bio(struct dm_target_io *tio) { int r; @@ -1200,11 +1240,13 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, static void __clone_and_map_simple_bio(struct clone_info *ci, struct dm_target *ti, - unsigned target_bio_nr, unsigned len) + unsigned target_bio_nr, unsigned *len) { struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); struct bio *clone = &tio->clone; + tio->len_ptr = len; + /* * Discard requests require the bio's inline iovecs be initialized. * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush @@ -1212,13 +1254,13 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, */ __bio_clone_fast(clone, ci->bio); if (len) - bio_setup_sector(clone, ci->sector, len); + bio_setup_sector(clone, ci->sector, *len); __map_bio(tio); } static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, unsigned len) + unsigned num_bios, unsigned *len) { unsigned target_bio_nr; @@ -1233,13 +1275,13 @@ static int __send_empty_flush(struct clone_info *ci) BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) - __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0); + __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); return 0; } static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, unsigned len) + sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; @@ -1254,7 +1296,8 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { tio = alloc_tio(ci, ti, 0, target_bio_nr); - clone_bio(tio, bio, sector, len); + tio->len_ptr = len; + clone_bio(tio, bio, sector, *len); __map_bio(tio); } } @@ -1306,7 +1349,7 @@ static int __send_changing_extent_only(struct clone_info *ci, else len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); - __send_duplicate_bios(ci, ti, num_bios, len); + __send_duplicate_bios(ci, ti, num_bios, &len); ci->sector += len; } while (ci->sector_count -= len); @@ -1345,7 +1388,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - __clone_and_map_data_bio(ci, ti, ci->sector, len); + __clone_and_map_data_bio(ci, ti, ci->sector, &len); ci->sector += len; ci->sector_count -= len; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 63da56ed9796..0adca299f238 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -291,6 +291,7 @@ struct dm_target_io { struct dm_io *io; struct dm_target *ti; unsigned target_bio_nr; + unsigned *len_ptr; struct bio clone; }; @@ -401,6 +402,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid); struct gendisk *dm_disk(struct mapped_device *md); int dm_suspended(struct dm_target *ti); int dm_noflush_suspending(struct dm_target *ti); +void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors); union map_info *dm_get_rq_mapinfo(struct request *rq); struct queue_limits *dm_get_queue_limits(struct mapped_device *md); From 599cdf3bfbe21fe94f4416c9e54363b285c9532a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 14 Mar 2014 18:42:12 -0400 Subject: [PATCH 03/11] dm snapshot: allocate a per-target structure for snapshot-origin target Allocate a per-target dm_origin structure. This is a prerequisite for the next commit ("dm snapshot: do not split read bios sent to snapshot-origin target") which adds a new member to this structure. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 57 ++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index ebddef5237e4..220a06bfe91b 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2141,6 +2141,10 @@ static int origin_write_extent(struct dm_snapshot *merging_snap, * Origin: maps a linear range of a device, with hooks for snapshotting. */ +struct dm_origin { + struct dm_dev *dev; +}; + /* * Construct an origin mapping: * The context for an origin is merely a 'struct dm_dev *' @@ -2149,41 +2153,54 @@ static int origin_write_extent(struct dm_snapshot *merging_snap, static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int r; - struct dm_dev *dev; + struct dm_origin *o; if (argc != 1) { ti->error = "origin: incorrect number of arguments"; return -EINVAL; } - r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); - if (r) { - ti->error = "Cannot get target device"; - return r; + o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL); + if (!o) { + ti->error = "Cannot allocate private origin structure"; + r = -ENOMEM; + goto bad_alloc; } - ti->private = dev; + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev); + if (r) { + ti->error = "Cannot get target device"; + goto bad_open; + } + + ti->private = o; ti->num_flush_bios = 1; return 0; + +bad_open: + kfree(o); +bad_alloc: + return r; } static void origin_dtr(struct dm_target *ti) { - struct dm_dev *dev = ti->private; - dm_put_device(ti, dev); + struct dm_origin *o = ti->private; + dm_put_device(ti, o->dev); + kfree(o); } static int origin_map(struct dm_target *ti, struct bio *bio) { - struct dm_dev *dev = ti->private; - bio->bi_bdev = dev->bdev; + struct dm_origin *o = ti->private; + bio->bi_bdev = o->dev->bdev; if (bio->bi_rw & REQ_FLUSH) return DM_MAPIO_REMAPPED; /* Only tell snapshots if this is a write */ - return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED; + return (bio_rw(bio) == WRITE) ? do_origin(o->dev, bio) : DM_MAPIO_REMAPPED; } /* @@ -2192,15 +2209,15 @@ static int origin_map(struct dm_target *ti, struct bio *bio) */ static void origin_resume(struct dm_target *ti) { - struct dm_dev *dev = ti->private; + struct dm_origin *o = ti->private; - ti->max_io_len = get_origin_minimum_chunksize(dev->bdev); + ti->max_io_len = get_origin_minimum_chunksize(o->dev->bdev); } static void origin_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { - struct dm_dev *dev = ti->private; + struct dm_origin *o = ti->private; switch (type) { case STATUSTYPE_INFO: @@ -2208,7 +2225,7 @@ static void origin_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s", dev->name); + snprintf(result, maxlen, "%s", o->dev->name); break; } } @@ -2216,13 +2233,13 @@ static void origin_status(struct dm_target *ti, status_type_t type, static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, struct bio_vec *biovec, int max_size) { - struct dm_dev *dev = ti->private; - struct request_queue *q = bdev_get_queue(dev->bdev); + struct dm_origin *o = ti->private; + struct request_queue *q = bdev_get_queue(o->dev->bdev); if (!q->merge_bvec_fn) return max_size; - bvm->bi_bdev = dev->bdev; + bvm->bi_bdev = o->dev->bdev; return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } @@ -2230,9 +2247,9 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, static int origin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { - struct dm_dev *dev = ti->private; + struct dm_origin *o = ti->private; - return fn(ti, dev, 0, ti->len, data); + return fn(ti, o->dev, 0, ti->len, data); } static struct target_type origin_target = { From 298eaa89b02e88dc9081f8761a957f7cd5e8b201 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 14 Mar 2014 18:43:07 -0400 Subject: [PATCH 04/11] dm snapshot: do not split read bios sent to snapshot-origin target Change the snapshot-origin target so that only write bios are split on chunk boundary. Read bios are passed unchanged to the underlying device, so they don't have to be split. Later, we could change the target so that it accepts a larger write bio if it spans an area that is completely covered by snapshot exceptions. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 220a06bfe91b..e5a84c890e8f 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2143,6 +2143,7 @@ static int origin_write_extent(struct dm_snapshot *merging_snap, struct dm_origin { struct dm_dev *dev; + unsigned split_boundary; }; /* @@ -2194,13 +2195,24 @@ static void origin_dtr(struct dm_target *ti) static int origin_map(struct dm_target *ti, struct bio *bio) { struct dm_origin *o = ti->private; + unsigned available_sectors; + bio->bi_bdev = o->dev->bdev; - if (bio->bi_rw & REQ_FLUSH) + if (unlikely(bio->bi_rw & REQ_FLUSH)) return DM_MAPIO_REMAPPED; + if (bio_rw(bio) != WRITE) + return DM_MAPIO_REMAPPED; + + available_sectors = o->split_boundary - + ((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1)); + + if (bio_sectors(bio) > available_sectors) + dm_accept_partial_bio(bio, available_sectors); + /* Only tell snapshots if this is a write */ - return (bio_rw(bio) == WRITE) ? do_origin(o->dev, bio) : DM_MAPIO_REMAPPED; + return do_origin(o->dev, bio); } /* @@ -2211,7 +2223,7 @@ static void origin_resume(struct dm_target *ti) { struct dm_origin *o = ti->private; - ti->max_io_len = get_origin_minimum_chunksize(o->dev->bdev); + o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev); } static void origin_status(struct dm_target *ti, status_type_t type, From e7a3e871d8954c636b6cd2db7c7ece7ffe405986 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 13 May 2014 16:14:14 -0400 Subject: [PATCH 05/11] dm thin: cleanup noflush_work to use a proper completion Factor out a pool_work interface that noflush_work makes use of to wait for and complete work items (in terms of a proper completion struct). Allows discontinuing the use of a custom completion in terms of atomic_t and wait_event. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 54 ++++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 242ac2ea5f29..7694988fb806 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1610,47 +1610,63 @@ static void do_no_space_timeout(struct work_struct *ws) /*----------------------------------------------------------------*/ -struct noflush_work { +struct pool_work { struct work_struct worker; - struct thin_c *tc; - - atomic_t complete; - wait_queue_head_t wait; + struct completion complete; }; -static void complete_noflush_work(struct noflush_work *w) +static struct pool_work *to_pool_work(struct work_struct *ws) { - atomic_set(&w->complete, 1); - wake_up(&w->wait); + return container_of(ws, struct pool_work, worker); +} + +static void pool_work_complete(struct pool_work *pw) +{ + complete(&pw->complete); +} + +static void pool_work_wait(struct pool_work *pw, struct pool *pool, + void (*fn)(struct work_struct *)) +{ + INIT_WORK_ONSTACK(&pw->worker, fn); + init_completion(&pw->complete); + queue_work(pool->wq, &pw->worker); + wait_for_completion(&pw->complete); +} + +/*----------------------------------------------------------------*/ + +struct noflush_work { + struct pool_work pw; + struct thin_c *tc; +}; + +static struct noflush_work *to_noflush(struct work_struct *ws) +{ + return container_of(to_pool_work(ws), struct noflush_work, pw); } static void do_noflush_start(struct work_struct *ws) { - struct noflush_work *w = container_of(ws, struct noflush_work, worker); + struct noflush_work *w = to_noflush(ws); w->tc->requeue_mode = true; requeue_io(w->tc); - complete_noflush_work(w); + pool_work_complete(&w->pw); } static void do_noflush_stop(struct work_struct *ws) { - struct noflush_work *w = container_of(ws, struct noflush_work, worker); + struct noflush_work *w = to_noflush(ws); w->tc->requeue_mode = false; - complete_noflush_work(w); + pool_work_complete(&w->pw); } static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) { struct noflush_work w; - INIT_WORK_ONSTACK(&w.worker, fn); w.tc = tc; - atomic_set(&w.complete, 0); - init_waitqueue_head(&w.wait); - - queue_work(tc->pool->wq, &w.worker); - - wait_event(w.wait, atomic_read(&w.complete)); + pool_work_wait(&w.pw, tc->pool, fn); } /*----------------------------------------------------------------*/ From af91805a497d3aa694704172b41ba953be3738ed Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 22 May 2014 14:32:51 -0400 Subject: [PATCH 06/11] dm thin: return ENOSPC instead of EIO when error_if_no_space enabled Update the DM thin provisioning target's allocation failure error to be consistent with commit a9d6ceb8 ("[SCSI] return ENOSPC on thin provisioning failure"). The DM thin target now returns -ENOSPC rather than -EIO when block allocation fails due to the pool being out of data space (and the 'error_if_no_space' thin-pool feature is enabled). Signed-off-by: Mike Snitzer Acked-By: Joe Thornber --- drivers/md/dm-bio-prison.c | 4 ++-- drivers/md/dm-bio-prison.h | 2 +- drivers/md/dm-thin.c | 38 ++++++++++++++++++++++++-------------- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index 85f0b7074257..71434382cb64 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -238,7 +238,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell) + struct dm_bio_prison_cell *cell, int error) { struct bio_list bios; struct bio *bio; @@ -251,7 +251,7 @@ void dm_cell_error(struct dm_bio_prison *prison, spin_unlock_irqrestore(&prison->lock, flags); while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); + bio_endio(bio, error); } EXPORT_SYMBOL_GPL(dm_cell_error); diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index 3f833190eadf..6805a142b750 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -85,7 +85,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *inmates); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell); + struct dm_bio_prison_cell *cell, int error); /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 7694988fb806..a0bdd562e026 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -310,13 +310,18 @@ static void cell_defer_no_holder_no_free(struct thin_c *tc, wake_worker(pool); } -static void cell_error(struct pool *pool, - struct dm_bio_prison_cell *cell) +static void cell_error_with_code(struct pool *pool, + struct dm_bio_prison_cell *cell, int error_code) { - dm_cell_error(pool->prison, cell); + dm_cell_error(pool->prison, cell, error_code); dm_bio_prison_free_cell(pool->prison, cell); } +static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, -EIO); +} + /*----------------------------------------------------------------*/ /* @@ -1027,7 +1032,7 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&tc->lock, flags); } -static bool should_error_unserviceable_bio(struct pool *pool) +static int should_error_unserviceable_bio(struct pool *pool) { enum pool_mode m = get_pool_mode(pool); @@ -1035,25 +1040,27 @@ static bool should_error_unserviceable_bio(struct pool *pool) case PM_WRITE: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); - return true; + return -EIO; case PM_OUT_OF_DATA_SPACE: - return pool->pf.error_if_no_space; + return pool->pf.error_if_no_space ? -ENOSPC : 0; case PM_READ_ONLY: case PM_FAIL: - return true; + return -EIO; default: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); - return true; + return -EIO; } } static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) { - if (should_error_unserviceable_bio(pool)) - bio_io_error(bio); + int error = should_error_unserviceable_bio(pool); + + if (error) + bio_endio(bio, error); else retry_on_resume(bio); } @@ -1062,18 +1069,21 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c { struct bio *bio; struct bio_list bios; + int error; - if (should_error_unserviceable_bio(pool)) { - cell_error(pool, cell); + error = should_error_unserviceable_bio(pool); + if (error) { + cell_error_with_code(pool, cell, error); return; } bio_list_init(&bios); cell_release(pool, cell, &bios); - if (should_error_unserviceable_bio(pool)) + error = should_error_unserviceable_bio(pool); + if (error) while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); + bio_endio(bio, error); else while ((bio = bio_list_pop(&bios))) retry_on_resume(bio); From 989f26f5ad308f40a95f280bf9cd75e558d4f18d Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Tue, 11 Mar 2014 16:46:25 +0000 Subject: [PATCH 07/11] dm era: check for a non-NULL metadata object before closing it era_ctr() may call era_destroy() before era->md is initialized so era_destory() must only close the metadata object if it is not NULL. Signed-off-by: Joe Thornber Signed-off-by: Naohiro Aota Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org # 3.15+ --- drivers/md/dm-era-target.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 414dad4cb49b..ad913cd4aded 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1391,7 +1391,8 @@ static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits) static void era_destroy(struct era *era) { - metadata_close(era->md); + if (era->md) + metadata_close(era->md); if (era->wq) destroy_workqueue(era->wq); From 7eee4ae2dbb2be0a15a4406718806e48b18ba831 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 2 Jun 2014 15:50:06 -0400 Subject: [PATCH 08/11] dm: disable WRITE SAME if it fails Add DM core support for disabling WRITE SAME on first failure to both request-based and bio-based targets. The need to disable WRITE SAME stems from SCSI enabling it by default but then disabling it when it fails. When SCSI does this it returns "permanent target failure, do not retry" using -EREMOTEIO. Update DM core to only disable WRITE SAME on failure if the returned error is -EREMOTEIO. Commit f84cb8a4 ("dm mpath: disable WRITE SAME if it fails") implemented multipath specific disabling of WRITE SAME if it fails. However, as that commit detailed, the multipath-only solution doesn't go far enough if bio-based DM targets are stacked ontop of the request-based dm-multipath target (as is commonly done using dm-linear to support partitions on multipath devices, via kpartx). Signed-off-by: Mike Snitzer Acked-by: Martin K. Petersen Tested-by: Alex Chen --- drivers/md/dm-mpath.c | 11 +---------- drivers/md/dm.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index ebfa411d1a7d..3f6fd9d33ba3 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1242,17 +1242,8 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!error && !clone->errors) return 0; /* I/O complete */ - if (noretry_error(error)) { - if ((clone->cmd_flags & REQ_WRITE_SAME) && - !clone->q->limits.max_write_same_sectors) { - struct queue_limits *limits; - - /* device doesn't really support WRITE SAME, disable it */ - limits = dm_get_queue_limits(dm_table_get_md(m->ti->table)); - limits->max_write_same_sectors = 0; - } + if (noretry_error(error)) return error; - } if (mpio->pgpath) fail_path(mpio->pgpath); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 97940fc8c302..3234a753a80d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -755,6 +755,14 @@ static void dec_pending(struct dm_io *io, int error) } } +static void disable_write_same(struct mapped_device *md) +{ + struct queue_limits *limits = dm_get_queue_limits(md); + + /* device doesn't really support WRITE SAME, disable it */ + limits->max_write_same_sectors = 0; +} + static void clone_endio(struct bio *bio, int error) { int r = 0; @@ -783,6 +791,10 @@ static void clone_endio(struct bio *bio, int error) } } + if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) + disable_write_same(md); + free_tio(md, tio); dec_pending(io, error); } @@ -977,6 +989,10 @@ static void dm_done(struct request *clone, int error, bool mapped) r = rq_end_io(tio->ti, clone, error, &tio->info); } + if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && + !clone->q->limits.max_write_same_sectors)) + disable_write_same(tio->md); + if (r <= 0) /* The target wants to complete the I/O */ dm_end_request(clone, r); From 11f0431be2f99c574a65c6dfc0ca205511500f29 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 3 Jun 2014 10:30:28 -0400 Subject: [PATCH 09/11] dm: remove symbol export for dm_set_device_limits There is no need for code other than DM core to use dm_set_device_limits so remove its EXPORT_SYMBOL_GPL. Also, cleanup a couple whitespace nits. Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 5 ++--- drivers/md/dm.c | 1 - include/linux/device-mapper.h | 8 +------- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 50601ec7017a..5f59f1e3e5b1 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -465,8 +465,8 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, } EXPORT_SYMBOL(dm_get_device); -int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct queue_limits *limits = data; struct block_device *bdev = dev->bdev; @@ -499,7 +499,6 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, (unsigned int) (PAGE_SIZE >> 9)); return 0; } -EXPORT_SYMBOL_GPL(dm_set_device_limits); /* * Decrement a device's use count and remove it if necessary. diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3234a753a80d..bf1a1eaad9a9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1498,7 +1498,6 @@ static int dm_merge_bvec(struct request_queue *q, * just one page. */ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) - max_size = 0; out: diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 0adca299f238..e1707de043ae 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -115,12 +115,6 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); void dm_error(const char *message); -/* - * Combine device limits. - */ -int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data); - struct dm_dev { struct block_device *bdev; fmode_t mode; @@ -132,7 +126,7 @@ struct dm_dev { * are opened/closed correctly. */ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, - struct dm_dev **result); + struct dm_dev **result); void dm_put_device(struct dm_target *ti, struct dm_dev *d); /* From adcc44472bacb227ebc0b1a8876efa5302474338 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Thu, 5 Jun 2014 15:23:09 +0100 Subject: [PATCH 10/11] dm bio prison: implement per bucket locking in the dm_bio_prison hash table Split the single per bio-prison lock by using per bucket locking. Per bucket locking benefits both dm-thin and dm-cache targets by reducing bio-prison lock contention. Signed-off-by: Heinz Mauelshagen Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-bio-prison.c | 66 ++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index 71434382cb64..f752d12081ff 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -14,13 +14,17 @@ /*----------------------------------------------------------------*/ -struct dm_bio_prison { +struct bucket { spinlock_t lock; + struct hlist_head cells; +}; + +struct dm_bio_prison { mempool_t *cell_pool; unsigned nr_buckets; unsigned hash_mask; - struct hlist_head *cells; + struct bucket *buckets; }; /*----------------------------------------------------------------*/ @@ -40,6 +44,12 @@ static uint32_t calc_nr_buckets(unsigned nr_cells) static struct kmem_cache *_cell_cache; +static void init_bucket(struct bucket *b) +{ + spin_lock_init(&b->lock); + INIT_HLIST_HEAD(&b->cells); +} + /* * @nr_cells should be the number of cells you want in use _concurrently_. * Don't confuse it with the number of distinct keys. @@ -49,13 +59,12 @@ struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) unsigned i; uint32_t nr_buckets = calc_nr_buckets(nr_cells); size_t len = sizeof(struct dm_bio_prison) + - (sizeof(struct hlist_head) * nr_buckets); + (sizeof(struct bucket) * nr_buckets); struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL); if (!prison) return NULL; - spin_lock_init(&prison->lock); prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); if (!prison->cell_pool) { kfree(prison); @@ -64,9 +73,9 @@ struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) prison->nr_buckets = nr_buckets; prison->hash_mask = nr_buckets - 1; - prison->cells = (struct hlist_head *) (prison + 1); + prison->buckets = (struct bucket *) (prison + 1); for (i = 0; i < nr_buckets; i++) - INIT_HLIST_HEAD(prison->cells + i); + init_bucket(prison->buckets + i); return prison; } @@ -107,40 +116,44 @@ static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) (lhs->block == rhs->block); } -static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket, +static struct bucket *get_bucket(struct dm_bio_prison *prison, + struct dm_cell_key *key) +{ + return prison->buckets + hash_key(prison, key); +} + +static struct dm_bio_prison_cell *__search_bucket(struct bucket *b, struct dm_cell_key *key) { struct dm_bio_prison_cell *cell; - hlist_for_each_entry(cell, bucket, list) + hlist_for_each_entry(cell, &b->cells, list) if (keys_equal(&cell->key, key)) return cell; return NULL; } -static void __setup_new_cell(struct dm_bio_prison *prison, +static void __setup_new_cell(struct bucket *b, struct dm_cell_key *key, struct bio *holder, - uint32_t hash, struct dm_bio_prison_cell *cell) { memcpy(&cell->key, key, sizeof(cell->key)); cell->holder = holder; bio_list_init(&cell->bios); - hlist_add_head(&cell->list, prison->cells + hash); + hlist_add_head(&cell->list, &b->cells); } -static int __bio_detain(struct dm_bio_prison *prison, +static int __bio_detain(struct bucket *b, struct dm_cell_key *key, struct bio *inmate, struct dm_bio_prison_cell *cell_prealloc, struct dm_bio_prison_cell **cell_result) { - uint32_t hash = hash_key(prison, key); struct dm_bio_prison_cell *cell; - cell = __search_bucket(prison->cells + hash, key); + cell = __search_bucket(b, key); if (cell) { if (inmate) bio_list_add(&cell->bios, inmate); @@ -148,7 +161,7 @@ static int __bio_detain(struct dm_bio_prison *prison, return 1; } - __setup_new_cell(prison, key, inmate, hash, cell_prealloc); + __setup_new_cell(b, key, inmate, cell_prealloc); *cell_result = cell_prealloc; return 0; } @@ -161,10 +174,11 @@ static int bio_detain(struct dm_bio_prison *prison, { int r; unsigned long flags; + struct bucket *b = get_bucket(prison, key); - spin_lock_irqsave(&prison->lock, flags); - r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_lock_irqsave(&b->lock, flags); + r = __bio_detain(b, key, inmate, cell_prealloc, cell_result); + spin_unlock_irqrestore(&b->lock, flags); return r; } @@ -208,10 +222,11 @@ void dm_cell_release(struct dm_bio_prison *prison, struct bio_list *bios) { unsigned long flags; + struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irqsave(&b->lock, flags); __cell_release(cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irqrestore(&b->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release); @@ -230,10 +245,11 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct bio_list *inmates) { unsigned long flags; + struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irqsave(&b->lock, flags); __cell_release_no_holder(cell, inmates); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irqrestore(&b->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); @@ -242,13 +258,9 @@ void dm_cell_error(struct dm_bio_prison *prison, { struct bio_list bios; struct bio *bio; - unsigned long flags; bio_list_init(&bios); - - spin_lock_irqsave(&prison->lock, flags); - __cell_release(cell, &bios); - spin_unlock_irqrestore(&prison->lock, flags); + dm_cell_release(prison, cell, &bios); while ((bio = bio_list_pop(&bios))) bio_endio(bio, error); From 09869de57ed2728ae3c619803932a86cb0e2c4f8 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Wed, 11 Jun 2014 12:28:43 -0400 Subject: [PATCH 11/11] dm thin: update discard_granularity to reflect the thin-pool blocksize DM thinp already checks whether the discard_granularity of the data device is a factor of the thin-pool block size. But when using the dm-thin-pool's discard passdown support, DM thinp was not selecting the max of the underlying data device's discard_granularity and the thin-pool's block size. Update set_discard_limits() to set discard_granularity to the max of these values. This enables blkdev_issue_discard() to properly align the discards that are sent to the DM thin device on a full block boundary. As such each discard will now cover an entire DM thin-pool block and the block will be reclaimed. Reported-by: Zdenek Kabelac Signed-off-by: Lukas Czerner Signed-off-by: Mike Snitzer Cc: stable@vger.kernel.org --- drivers/md/dm-thin.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a0bdd562e026..fc9c848a60c9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3094,7 +3094,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) */ if (pt->adjusted_pf.discard_passdown) { data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; - limits->discard_granularity = data_limits->discard_granularity; + limits->discard_granularity = max(data_limits->discard_granularity, + pool->sectors_per_block << SECTOR_SHIFT); } else limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; }