b4b610f684

The siblings "list" is used unsafely at the moment.

Firstly, only the element on the list being changed gets locked (via the
snapshot lock), not the next and previous elements, whose pointers are also
being changed.

Secondly, if you have two or more snapshots and write to the same chunk a
second time before every snapshot has finished making its private copy of
the data, then, if you're unlucky, _origin_write() could attempt its
list_merge() and dereference a 'last' pointer to a pending_exception
structure that has just been freed.

Analysis reveals that the list is actually only there for reference
counting.  If 5 pending_exceptions are needed in origin_write, the 5 are
joined together into a 5-element list - without a separate list head because
there's nowhere suitable to store it.  As the pending_exceptions complete,
they are removed from the list one by one, and any contents of origin_bios
get moved across to one of the remaining pending_exceptions on the list.
Whichever one is last is detected because list_empty() is then true, and the
origin_bios get submitted.

The fix proposed here uses an alternative reference-counting mechanism:
one of the pending_exceptions is chosen as primary and an atomic counter is
maintained there.

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
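The following is a minimal userspace sketch (not kernel code) of the
reference-counting scheme the message describes: one pending_exception is
designated primary and holds an atomic sibling_count; each sibling drops the
count as it completes, and whoever takes it to zero submits the queued
origin bios and frees the primary last.  The names (pe, origin_bios) mirror
the driver, but the types and the pe_complete() helper are simplified
stand-ins, not the driver's actual structures.

/* Illustrative model only; error handling and locking omitted. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct pe {
	struct pe *primary_pe;     /* primary for this origin write */
	atomic_int sibling_count;  /* meaningful only in the primary */
	int origin_bios;           /* stand-in for the queued bio list */
};

/* Called as each sibling finishes its copy. */
static void pe_complete(struct pe *pe)
{
	struct pe *primary = pe->primary_pe;

	/* The last sibling to drop the count submits the origin bios. */
	if (primary && atomic_fetch_sub(&primary->sibling_count, 1) == 1)
		printf("submitting %d origin bio(s)\n", primary->origin_bios);

	if (pe != primary)
		free(pe);                    /* non-primary pes go first */
	if (primary && atomic_load(&primary->sibling_count) == 0)
		free(primary);               /* primary is always freed last */
}

int main(void)
{
	/* Origin write touching three snapshots: one primary, two siblings. */
	struct pe *primary = calloc(1, sizeof(*primary));
	primary->primary_pe = primary;
	atomic_init(&primary->sibling_count, 1);  /* counts the primary itself */
	primary->origin_bios = 1;

	for (int i = 0; i < 2; i++) {
		struct pe *sib = calloc(1, sizeof(*sib));
		sib->primary_pe = primary;
		atomic_fetch_add(&primary->sibling_count, 1);
		pe_complete(sib);         /* siblings may complete in any order */
	}
	pe_complete(primary);
	return 0;
}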
/*
 * dm-snapshot.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include <linux/blkdev.h>
#include <linux/config.h>
#include <linux/ctype.h>
#include <linux/device-mapper.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/list.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include "dm-snap.h"
#include "dm-bio-list.h"
#include "kcopyd.h"

/*
 * The percentage increment we will wake up users at
 */
#define WAKE_UP_PERCENT 5

/*
 * kcopyd priority of snapshot operations
 */
#define SNAPSHOT_COPY_PRIORITY 2

/*
 * Each snapshot reserves this many pages for io
 */
#define SNAPSHOT_PAGES 256

struct pending_exception {
	struct exception e;

	/*
	 * Origin buffers waiting for this to complete are held
	 * in a bio list
	 */
	struct bio_list origin_bios;
	struct bio_list snapshot_bios;

	/*
	 * Short-term queue of pending exceptions prior to submission.
	 */
	struct list_head list;

	/*
	 * The primary pending_exception is the one that holds
	 * the sibling_count and the list of origin_bios for a
	 * group of pending_exceptions.  It is always last to get freed.
	 * These fields get set up when writing to the origin.
	 */
	struct pending_exception *primary_pe;

	/*
	 * Number of pending_exceptions processing this chunk.
	 * When this drops to zero we must complete the origin bios.
	 * If incrementing or decrementing this, hold pe->snap->lock for
	 * the sibling concerned and not pe->primary_pe->snap->lock unless
	 * they are the same.
	 */
	atomic_t sibling_count;

	/* Pointer back to snapshot context */
	struct dm_snapshot *snap;

	/*
	 * 1 indicates the exception has already been sent to
	 * kcopyd.
	 */
	int started;
};

/*
 * Hash table mapping origin volumes to lists of snapshots and
 * a lock to protect it
 */
static kmem_cache_t *exception_cache;
static kmem_cache_t *pending_cache;
static mempool_t *pending_pool;

/*
 * One of these per registered origin, held in the snapshot_origins hash
 */
struct origin {
	/* The origin device */
	struct block_device *bdev;

	struct list_head hash_list;

	/* List of snapshots for this origin */
	struct list_head snapshots;
};

/*
 * Size of the hash table for origin volumes. If we make this
 * the size of the minors list then it should be nearly perfect
 */
#define ORIGIN_HASH_SIZE 256
#define ORIGIN_MASK      0xFF
static struct list_head *_origins;
static struct rw_semaphore _origins_lock;

static int init_origin_hash(void)
{
	int i;

	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
			   GFP_KERNEL);
	if (!_origins) {
		DMERR("Device mapper: Snapshot: unable to allocate memory");
		return -ENOMEM;
	}

	for (i = 0; i < ORIGIN_HASH_SIZE; i++)
		INIT_LIST_HEAD(_origins + i);
	init_rwsem(&_origins_lock);

	return 0;
}

static void exit_origin_hash(void)
{
	kfree(_origins);
}

static inline unsigned int origin_hash(struct block_device *bdev)
{
	return bdev->bd_dev & ORIGIN_MASK;
}

static struct origin *__lookup_origin(struct block_device *origin)
{
	struct list_head *ol;
	struct origin *o;

	ol = &_origins[origin_hash(origin)];
	list_for_each_entry (o, ol, hash_list)
		if (bdev_equal(o->bdev, origin))
			return o;

	return NULL;
}

static void __insert_origin(struct origin *o)
{
	struct list_head *sl = &_origins[origin_hash(o->bdev)];
	list_add_tail(&o->hash_list, sl);
}

/*
 * Make a note of the snapshot and its origin so we can look it
 * up when the origin has a write on it.
 */
static int register_snapshot(struct dm_snapshot *snap)
{
	struct origin *o;
	struct block_device *bdev = snap->origin->bdev;

	down_write(&_origins_lock);
	o = __lookup_origin(bdev);

	if (!o) {
		/* New origin */
		o = kmalloc(sizeof(*o), GFP_KERNEL);
		if (!o) {
			up_write(&_origins_lock);
			return -ENOMEM;
		}

		/* Initialise the struct */
		INIT_LIST_HEAD(&o->snapshots);
		o->bdev = bdev;

		__insert_origin(o);
	}

	list_add_tail(&snap->list, &o->snapshots);

	up_write(&_origins_lock);
	return 0;
}

static void unregister_snapshot(struct dm_snapshot *s)
{
	struct origin *o;

	down_write(&_origins_lock);
	o = __lookup_origin(s->origin->bdev);

	list_del(&s->list);
	if (list_empty(&o->snapshots)) {
		list_del(&o->hash_list);
		kfree(o);
	}

	up_write(&_origins_lock);
}

/*
 * Implementation of the exception hash tables.
 */
static int init_exception_table(struct exception_table *et, uint32_t size)
{
	unsigned int i;

	et->hash_mask = size - 1;
	et->table = dm_vcalloc(size, sizeof(struct list_head));
	if (!et->table)
		return -ENOMEM;

	for (i = 0; i < size; i++)
		INIT_LIST_HEAD(et->table + i);

	return 0;
}

static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
{
	struct list_head *slot;
	struct exception *ex, *next;
	int i, size;

	size = et->hash_mask + 1;
	for (i = 0; i < size; i++) {
		slot = et->table + i;

		list_for_each_entry_safe (ex, next, slot, hash_list)
			kmem_cache_free(mem, ex);
	}

	vfree(et->table);
}

static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
{
	return chunk & et->hash_mask;
}

static void insert_exception(struct exception_table *eh, struct exception *e)
{
	struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
	list_add(&e->hash_list, l);
}

static inline void remove_exception(struct exception *e)
{
	list_del(&e->hash_list);
}

/*
 * Return the exception data for a sector, or NULL if not
 * remapped.
 */
static struct exception *lookup_exception(struct exception_table *et,
					  chunk_t chunk)
{
	struct list_head *slot;
	struct exception *e;

	slot = &et->table[exception_hash(et, chunk)];
	list_for_each_entry (e, slot, hash_list)
		if (e->old_chunk == chunk)
			return e;

	return NULL;
}

static inline struct exception *alloc_exception(void)
{
	struct exception *e;

	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
	if (!e)
		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);

	return e;
}

static inline void free_exception(struct exception *e)
{
	kmem_cache_free(exception_cache, e);
}

static inline struct pending_exception *alloc_pending_exception(void)
{
	return mempool_alloc(pending_pool, GFP_NOIO);
}

static inline void free_pending_exception(struct pending_exception *pe)
{
	mempool_free(pe, pending_pool);
}

int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
{
	struct exception *e;

	e = alloc_exception();
	if (!e)
		return -ENOMEM;

	e->old_chunk = old;
	e->new_chunk = new;
	insert_exception(&s->complete, e);
	return 0;
}

/*
 * Hard coded magic.
 */
static int calc_max_buckets(void)
{
	/* use a fixed size of 2MB */
	unsigned long mem = 2 * 1024 * 1024;
	mem /= sizeof(struct list_head);

	return mem;
}

/*
 * Rounds a number down to a power of 2.
 */
static inline uint32_t round_down(uint32_t n)
{
	while (n & (n - 1))
		n &= (n - 1);
	return n;
}

/*
 * Allocate room for a suitable hash table.
 */
static int init_hash_tables(struct dm_snapshot *s)
{
	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;

	/*
	 * Calculate based on the size of the original volume or
	 * the COW volume...
	 */
	cow_dev_size = get_dev_size(s->cow->bdev);
	origin_dev_size = get_dev_size(s->origin->bdev);
	max_buckets = calc_max_buckets();

	hash_size = min(origin_dev_size, cow_dev_size) >> s->chunk_shift;
	hash_size = min(hash_size, max_buckets);

	/* Round it down to a power of 2 */
	hash_size = round_down(hash_size);
	if (init_exception_table(&s->complete, hash_size))
		return -ENOMEM;

	/*
	 * Allocate hash table for in-flight exceptions
	 * Make this smaller than the real hash table
	 */
	hash_size >>= 3;
	if (hash_size < 64)
		hash_size = 64;

	if (init_exception_table(&s->pending, hash_size)) {
		exit_exception_table(&s->complete, exception_cache);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Round a number up to the nearest 'size' boundary.  size must
 * be a power of 2.
 */
static inline ulong round_up(ulong n, ulong size)
{
	size--;
	return (n + size) & ~size;
}

static void read_snapshot_metadata(struct dm_snapshot *s)
{
	if (s->store.read_metadata(&s->store)) {
		down_write(&s->lock);
		s->valid = 0;
		up_write(&s->lock);
	}
}

/*
 * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
 */
static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct dm_snapshot *s;
	unsigned long chunk_size;
	int r = -EINVAL;
	char persistent;
	char *origin_path;
	char *cow_path;
	char *value;
	int blocksize;

	if (argc < 4) {
		ti->error = "dm-snapshot: requires exactly 4 arguments";
		r = -EINVAL;
		goto bad1;
	}

	origin_path = argv[0];
	cow_path = argv[1];
	persistent = toupper(*argv[2]);

	if (persistent != 'P' && persistent != 'N') {
		ti->error = "Persistent flag is not P or N";
		r = -EINVAL;
		goto bad1;
	}

	chunk_size = simple_strtoul(argv[3], &value, 10);
	if (chunk_size == 0 || value == NULL) {
		ti->error = "Invalid chunk size";
		r = -EINVAL;
		goto bad1;
	}

	s = kmalloc(sizeof(*s), GFP_KERNEL);
	if (s == NULL) {
		ti->error = "Cannot allocate snapshot context private "
		    "structure";
		r = -ENOMEM;
		goto bad1;
	}

	r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
	if (r) {
		ti->error = "Cannot get origin device";
		goto bad2;
	}

	r = dm_get_device(ti, cow_path, 0, 0,
			  FMODE_READ | FMODE_WRITE, &s->cow);
	if (r) {
		dm_put_device(ti, s->origin);
		ti->error = "Cannot get COW device";
		goto bad2;
	}

	/*
	 * Chunk size must be multiple of page size.  Silently
	 * round up if it's not.
	 */
	chunk_size = round_up(chunk_size, PAGE_SIZE >> 9);

	/* Validate the chunk size against the device block size */
	blocksize = s->cow->bdev->bd_disk->queue->hardsect_size;
	if (chunk_size % (blocksize >> 9)) {
		ti->error = "Chunk size is not a multiple of device blocksize";
		r = -EINVAL;
		goto bad3;
	}

	/* Check chunk_size is a power of 2 */
	if (chunk_size & (chunk_size - 1)) {
		ti->error = "Chunk size is not a power of 2";
		r = -EINVAL;
		goto bad3;
	}

	s->chunk_size = chunk_size;
	s->chunk_mask = chunk_size - 1;
	s->type = persistent;
	s->chunk_shift = ffs(chunk_size) - 1;

	s->valid = 1;
	s->active = 0;
	s->last_percent = 0;
	init_rwsem(&s->lock);
	s->table = ti->table;

	/* Allocate hash table for COW data */
	if (init_hash_tables(s)) {
		ti->error = "Unable to allocate hash table space";
		r = -ENOMEM;
		goto bad3;
	}

	/*
	 * Check the persistent flag - done here because we need the iobuf
	 * to check the LV header
	 */
	s->store.snap = s;

	if (persistent == 'P')
		r = dm_create_persistent(&s->store, chunk_size);
	else
		r = dm_create_transient(&s->store, s, blocksize);

	if (r) {
		ti->error = "Couldn't create exception store";
		r = -EINVAL;
		goto bad4;
	}

	r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
	if (r) {
		ti->error = "Could not create kcopyd client";
		goto bad5;
	}

	/* Metadata must only be loaded into one table at once */
	read_snapshot_metadata(s);

	/* Add snapshot to the list of snapshots for this origin */
	/* Exceptions aren't triggered till snapshot_resume() is called */
	if (register_snapshot(s)) {
		r = -EINVAL;
		ti->error = "Cannot register snapshot origin";
		goto bad6;
	}

	ti->private = s;
	ti->split_io = chunk_size;

	return 0;

 bad6:
	kcopyd_client_destroy(s->kcopyd_client);

 bad5:
	s->store.destroy(&s->store);

 bad4:
	exit_exception_table(&s->pending, pending_cache);
	exit_exception_table(&s->complete, exception_cache);

 bad3:
	dm_put_device(ti, s->cow);
	dm_put_device(ti, s->origin);

 bad2:
	kfree(s);

 bad1:
	return r;
}

static void snapshot_dtr(struct dm_target *ti)
{
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;

	unregister_snapshot(s);

	exit_exception_table(&s->pending, pending_cache);
	exit_exception_table(&s->complete, exception_cache);

	/* Deallocate memory used */
	s->store.destroy(&s->store);

	dm_put_device(ti, s->origin);
	dm_put_device(ti, s->cow);
	kcopyd_client_destroy(s->kcopyd_client);
	kfree(s);
}

/*
 * Flush a list of buffers.
 */
static void flush_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		generic_make_request(bio);
		bio = n;
	}
}

/*
 * Error a list of buffers.
 */
static void error_bios(struct bio *bio)
{
	struct bio *n;

	while (bio) {
		n = bio->bi_next;
		bio->bi_next = NULL;
		bio_io_error(bio, bio->bi_size);
		bio = n;
	}
}

static struct bio *__flush_bios(struct pending_exception *pe)
{
	/*
	 * If this pe is involved in a write to the origin and
	 * it is the last sibling to complete then release
	 * the bios for the original write to the origin.
	 */

	if (pe->primary_pe &&
	    atomic_dec_and_test(&pe->primary_pe->sibling_count))
		return bio_list_get(&pe->primary_pe->origin_bios);

	return NULL;
}

static void pending_complete(struct pending_exception *pe, int success)
{
	struct exception *e;
	struct pending_exception *primary_pe;
	struct dm_snapshot *s = pe->snap;
	struct bio *flush = NULL;

	if (success) {
		e = alloc_exception();
		if (!e) {
			DMWARN("Unable to allocate exception.");
			down_write(&s->lock);
			s->store.drop_snapshot(&s->store);
			s->valid = 0;
			flush = __flush_bios(pe);
			up_write(&s->lock);

			error_bios(bio_list_get(&pe->snapshot_bios));
			goto out;
		}
		*e = pe->e;

		/*
		 * Add a proper exception, and remove the
		 * in-flight exception from the list.
		 */
		down_write(&s->lock);
		insert_exception(&s->complete, e);
		remove_exception(&pe->e);
		flush = __flush_bios(pe);

		/* Submit any pending write bios */
		up_write(&s->lock);

		flush_bios(bio_list_get(&pe->snapshot_bios));
	} else {
		/* Read/write error - snapshot is unusable */
		down_write(&s->lock);
		if (s->valid)
			DMERR("Error reading/writing snapshot");
		s->store.drop_snapshot(&s->store);
		s->valid = 0;
		remove_exception(&pe->e);
		flush = __flush_bios(pe);
		up_write(&s->lock);

		error_bios(bio_list_get(&pe->snapshot_bios));

		dm_table_event(s->table);
	}

 out:
	primary_pe = pe->primary_pe;

	/*
	 * Free the pe if it's not linked to an origin write or if
	 * it's not itself a primary pe.
	 */
	if (!primary_pe || primary_pe != pe)
		free_pending_exception(pe);

	/*
	 * Free the primary pe if nothing references it.
	 */
	if (primary_pe && !atomic_read(&primary_pe->sibling_count))
		free_pending_exception(primary_pe);

	if (flush)
		flush_bios(flush);
}

static void commit_callback(void *context, int success)
{
	struct pending_exception *pe = (struct pending_exception *) context;
	pending_complete(pe, success);
}

/*
 * Called when the copy I/O has finished.  kcopyd actually runs
 * this code so don't block.
 */
static void copy_callback(int read_err, unsigned int write_err, void *context)
{
	struct pending_exception *pe = (struct pending_exception *) context;
	struct dm_snapshot *s = pe->snap;

	if (read_err || write_err)
		pending_complete(pe, 0);

	else
		/* Update the metadata if we are persistent */
		s->store.commit_exception(&s->store, &pe->e, commit_callback,
					  pe);
}

/*
 * Dispatches the copy operation to kcopyd.
 */
static void start_copy(struct pending_exception *pe)
{
	struct dm_snapshot *s = pe->snap;
	struct io_region src, dest;
	struct block_device *bdev = s->origin->bdev;
	sector_t dev_size;

	dev_size = get_dev_size(bdev);

	src.bdev = bdev;
	src.sector = chunk_to_sector(s, pe->e.old_chunk);
	src.count = min(s->chunk_size, dev_size - src.sector);

	dest.bdev = s->cow->bdev;
	dest.sector = chunk_to_sector(s, pe->e.new_chunk);
	dest.count = src.count;

	/* Hand over to kcopyd */
	kcopyd_copy(s->kcopyd_client,
		    &src, 1, &dest, 0, copy_callback, pe);
}

/*
 * Looks to see if this snapshot already has a pending exception
 * for this chunk, otherwise it allocates a new one and inserts
 * it into the pending table.
 *
 * NOTE: a write lock must be held on snap->lock before calling
 * this.
 */
static struct pending_exception *
__find_pending_exception(struct dm_snapshot *s, struct bio *bio)
{
	struct exception *e;
	struct pending_exception *pe;
	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);

	/*
	 * Is there a pending exception for this already ?
	 */
	e = lookup_exception(&s->pending, chunk);
	if (e) {
		/* cast the exception to a pending exception */
		pe = container_of(e, struct pending_exception, e);

	} else {
		/*
		 * Create a new pending exception, we don't want
		 * to hold the lock while we do this.
		 */
		up_write(&s->lock);
		pe = alloc_pending_exception();
		down_write(&s->lock);

		e = lookup_exception(&s->pending, chunk);
		if (e) {
			free_pending_exception(pe);
			pe = container_of(e, struct pending_exception, e);
		} else {
			pe->e.old_chunk = chunk;
			bio_list_init(&pe->origin_bios);
			bio_list_init(&pe->snapshot_bios);
			pe->primary_pe = NULL;
			atomic_set(&pe->sibling_count, 1);
			pe->snap = s;
			pe->started = 0;

			if (s->store.prepare_exception(&s->store, &pe->e)) {
				free_pending_exception(pe);
				s->valid = 0;
				return NULL;
			}

			insert_exception(&s->pending, &pe->e);
		}
	}

	return pe;
}

static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
				   struct bio *bio)
{
	bio->bi_bdev = s->cow->bdev;
	bio->bi_sector = chunk_to_sector(s, e->new_chunk) +
		(bio->bi_sector & s->chunk_mask);
}

static int snapshot_map(struct dm_target *ti, struct bio *bio,
			union map_info *map_context)
{
	struct exception *e;
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
	int r = 1;
	chunk_t chunk;
	struct pending_exception *pe;

	chunk = sector_to_chunk(s, bio->bi_sector);

	/* Full snapshots are not usable */
	if (!s->valid)
		return -EIO;

	if (unlikely(bio_barrier(bio)))
		return -EOPNOTSUPP;

	/*
	 * Write to snapshot - higher level takes care of RW/RO
	 * flags so we should only get this if we are
	 * writeable.
	 */
	if (bio_rw(bio) == WRITE) {

		/* FIXME: should only take write lock if we need
		 * to copy an exception */
		down_write(&s->lock);

		/* If the block is already remapped - use that, else remap it */
		e = lookup_exception(&s->complete, chunk);
		if (e) {
			remap_exception(s, e, bio);
			up_write(&s->lock);

		} else {
			pe = __find_pending_exception(s, bio);

			if (!pe) {
				if (s->store.drop_snapshot)
					s->store.drop_snapshot(&s->store);
				s->valid = 0;
				r = -EIO;
				up_write(&s->lock);
			} else {
				remap_exception(s, &pe->e, bio);
				bio_list_add(&pe->snapshot_bios, bio);

				if (!pe->started) {
					/* this is protected by snap->lock */
					pe->started = 1;
					up_write(&s->lock);
					start_copy(pe);
				} else
					up_write(&s->lock);
				r = 0;
			}
		}

	} else {
		/*
		 * FIXME: this read path scares me because we
		 * always use the origin when we have a pending
		 * exception.  However I can't think of a
		 * situation where this is wrong - ejt.
		 */

		/* Do reads */
		down_read(&s->lock);

		/* See if it has been remapped */
		e = lookup_exception(&s->complete, chunk);
		if (e)
			remap_exception(s, e, bio);
		else
			bio->bi_bdev = s->origin->bdev;

		up_read(&s->lock);
	}

	return r;
}

static void snapshot_resume(struct dm_target *ti)
{
	struct dm_snapshot *s = (struct dm_snapshot *) ti->private;

	down_write(&s->lock);
	s->active = 1;
	up_write(&s->lock);
}

static int snapshot_status(struct dm_target *ti, status_type_t type,
			   char *result, unsigned int maxlen)
{
	struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		if (!snap->valid)
			snprintf(result, maxlen, "Invalid");
		else {
			if (snap->store.fraction_full) {
				sector_t numerator, denominator;
				snap->store.fraction_full(&snap->store,
							  &numerator,
							  &denominator);
				snprintf(result, maxlen,
					 SECTOR_FORMAT "/" SECTOR_FORMAT,
					 numerator, denominator);
			}
			else
				snprintf(result, maxlen, "Unknown");
		}
		break;

	case STATUSTYPE_TABLE:
		/*
		 * kdevname returns a static pointer so we need
		 * to make private copies if the output is to
		 * make sense.
		 */
		snprintf(result, maxlen, "%s %s %c " SECTOR_FORMAT,
			 snap->origin->name, snap->cow->name,
			 snap->type, snap->chunk_size);
		break;
	}

	return 0;
}

/*-----------------------------------------------------------------
 * Origin methods
 *---------------------------------------------------------------*/
static int __origin_write(struct list_head *snapshots, struct bio *bio)
{
	int r = 1, first = 0;
	struct dm_snapshot *snap;
	struct exception *e;
	struct pending_exception *pe, *next_pe, *primary_pe = NULL;
	chunk_t chunk;
	LIST_HEAD(pe_queue);

	/* Do all the snapshots on this origin */
	list_for_each_entry (snap, snapshots, list) {

		/* Only deal with valid and active snapshots */
		if (!snap->valid || !snap->active)
			continue;

		/* Nothing to do if writing beyond end of snapshot */
		if (bio->bi_sector >= dm_table_get_size(snap->table))
			continue;

		down_write(&snap->lock);

		/*
		 * Remember, different snapshots can have
		 * different chunk sizes.
		 */
		chunk = sector_to_chunk(snap, bio->bi_sector);

		/*
		 * Check exception table to see if block
		 * is already remapped in this snapshot
		 * and trigger an exception if not.
		 *
		 * sibling_count is initialised to 1 so pending_complete()
		 * won't destroy the primary_pe while we're inside this loop.
		 */
		e = lookup_exception(&snap->complete, chunk);
		if (!e) {
			pe = __find_pending_exception(snap, bio);
			if (!pe) {
				snap->store.drop_snapshot(&snap->store);
				snap->valid = 0;

			} else {
				if (!primary_pe) {
					/*
					 * Either every pe here has same
					 * primary_pe or none has one yet.
					 */
					if (pe->primary_pe)
						primary_pe = pe->primary_pe;
					else {
						primary_pe = pe;
						first = 1;
					}

					bio_list_add(&primary_pe->origin_bios,
						     bio);
					r = 0;
				}
				if (!pe->primary_pe) {
					atomic_inc(&primary_pe->sibling_count);
					pe->primary_pe = primary_pe;
				}
				if (!pe->started) {
					pe->started = 1;
					list_add_tail(&pe->list, &pe_queue);
				}
			}
		}

		up_write(&snap->lock);
	}

	if (!primary_pe)
		goto out;

	/*
	 * If this is the first time we're processing this chunk and
	 * sibling_count is now 1 it means all the pending exceptions
	 * got completed while we were in the loop above, so it falls to
	 * us here to remove the primary_pe and submit any origin_bios.
	 */

	if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
		flush_bios(bio_list_get(&primary_pe->origin_bios));
		free_pending_exception(primary_pe);
		/* If we got here, pe_queue is necessarily empty. */
		goto out;
	}

	/*
	 * Now that we have a complete pe list we can start the copying.
	 */
	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
		start_copy(pe);

 out:
	return r;
}

/*
 * Called on a write from the origin driver.
 */
static int do_origin(struct dm_dev *origin, struct bio *bio)
{
	struct origin *o;
	int r = 1;

	down_read(&_origins_lock);
	o = __lookup_origin(origin->bdev);
	if (o)
		r = __origin_write(&o->snapshots, bio);
	up_read(&_origins_lock);

	return r;
}

/*
 * Origin: maps a linear range of a device, with hooks for snapshotting.
 */

/*
 * Construct an origin mapping: <dev_path>
 * The context for an origin is merely a 'struct dm_dev *'
 * pointing to the real device.
 */
static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	int r;
	struct dm_dev *dev;

	if (argc != 1) {
		ti->error = "dm-origin: incorrect number of arguments";
		return -EINVAL;
	}

	r = dm_get_device(ti, argv[0], 0, ti->len,
			  dm_table_get_mode(ti->table), &dev);
	if (r) {
		ti->error = "Cannot get target device";
		return r;
	}

	ti->private = dev;
	return 0;
}

static void origin_dtr(struct dm_target *ti)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;
	dm_put_device(ti, dev);
}

static int origin_map(struct dm_target *ti, struct bio *bio,
		      union map_info *map_context)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;
	bio->bi_bdev = dev->bdev;

	if (unlikely(bio_barrier(bio)))
		return -EOPNOTSUPP;

	/* Only tell snapshots if this is a write */
	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : 1;
}

#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))

/*
 * Set the target "split_io" field to the minimum of all the snapshots'
 * chunk sizes.
 */
static void origin_resume(struct dm_target *ti)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;
	struct dm_snapshot *snap;
	struct origin *o;
	chunk_t chunk_size = 0;

	down_read(&_origins_lock);
	o = __lookup_origin(dev->bdev);
	if (o)
		list_for_each_entry (snap, &o->snapshots, list)
			chunk_size = min_not_zero(chunk_size, snap->chunk_size);
	up_read(&_origins_lock);

	ti->split_io = chunk_size;
}

static int origin_status(struct dm_target *ti, status_type_t type, char *result,
			 unsigned int maxlen)
{
	struct dm_dev *dev = (struct dm_dev *) ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		result[0] = '\0';
		break;

	case STATUSTYPE_TABLE:
		snprintf(result, maxlen, "%s", dev->name);
		break;
	}

	return 0;
}

static struct target_type origin_target = {
	.name = "snapshot-origin",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = origin_ctr,
	.dtr = origin_dtr,
	.map = origin_map,
	.resume = origin_resume,
	.status = origin_status,
};

static struct target_type snapshot_target = {
	.name = "snapshot",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = snapshot_ctr,
	.dtr = snapshot_dtr,
	.map = snapshot_map,
	.resume = snapshot_resume,
	.status = snapshot_status,
};

static int __init dm_snapshot_init(void)
{
	int r;

	r = dm_register_target(&snapshot_target);
	if (r) {
		DMERR("snapshot target register failed %d", r);
		return r;
	}

	r = dm_register_target(&origin_target);
	if (r < 0) {
		DMERR("Device mapper: Origin: register failed %d\n", r);
		goto bad1;
	}

	r = init_origin_hash();
	if (r) {
		DMERR("init_origin_hash failed.");
		goto bad2;
	}

	exception_cache = kmem_cache_create("dm-snapshot-ex",
					    sizeof(struct exception),
					    __alignof__(struct exception),
					    0, NULL, NULL);
	if (!exception_cache) {
		DMERR("Couldn't create exception cache.");
		r = -ENOMEM;
		goto bad3;
	}

	pending_cache =
	    kmem_cache_create("dm-snapshot-in",
			      sizeof(struct pending_exception),
			      __alignof__(struct pending_exception),
			      0, NULL, NULL);
	if (!pending_cache) {
		DMERR("Couldn't create pending cache.");
		r = -ENOMEM;
		goto bad4;
	}

	pending_pool = mempool_create_slab_pool(128, pending_cache);
	if (!pending_pool) {
		DMERR("Couldn't create pending pool.");
		r = -ENOMEM;
		goto bad5;
	}

	return 0;

 bad5:
	kmem_cache_destroy(pending_cache);
 bad4:
	kmem_cache_destroy(exception_cache);
 bad3:
	exit_origin_hash();
 bad2:
	dm_unregister_target(&origin_target);
 bad1:
	dm_unregister_target(&snapshot_target);
	return r;
}

static void __exit dm_snapshot_exit(void)
{
	int r;

	r = dm_unregister_target(&snapshot_target);
	if (r)
		DMERR("snapshot unregister failed %d", r);

	r = dm_unregister_target(&origin_target);
	if (r)
		DMERR("origin unregister failed %d", r);

	exit_origin_hash();
	mempool_destroy(pending_pool);
	kmem_cache_destroy(pending_cache);
	kmem_cache_destroy(exception_cache);
}

/* Module hooks */
module_init(dm_snapshot_init);
module_exit(dm_snapshot_exit);

MODULE_DESCRIPTION(DM_NAME " snapshot target");
MODULE_AUTHOR("Joe Thornber");
MODULE_LICENSE("GPL");