diff --git a/block/backup.c b/block/backup.c
index 582bd0f7ee..02dbe48035 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -372,14 +372,14 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
     int64_t end;
     int64_t last_cluster = -1;
     int64_t sectors_per_cluster = cluster_size_sectors(job);
-    HBitmapIter hbi;
+    BdrvDirtyBitmapIter *dbi;
 
     granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
     clusters_per_iter = MAX((granularity / job->cluster_size), 1);
-    bdrv_dirty_iter_init(job->sync_bitmap, &hbi);
+    dbi = bdrv_dirty_iter_new(job->sync_bitmap, 0);
 
     /* Find the next dirty sector(s) */
-    while ((sector = hbitmap_iter_next(&hbi)) != -1) {
+    while ((sector = bdrv_dirty_iter_next(dbi)) != -1) {
         cluster = sector / sectors_per_cluster;
 
         /* Fake progress updates for any clusters we skipped */
@@ -391,7 +391,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
         for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
             do {
                 if (yield_and_check(job)) {
-                    return ret;
+                    goto out;
                 }
                 ret = backup_do_cow(job, cluster * sectors_per_cluster,
                                     sectors_per_cluster, &error_is_read,
@@ -399,7 +399,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
                 if ((ret < 0) &&
                     backup_error_action(job, error_is_read, -ret) ==
                     BLOCK_ERROR_ACTION_REPORT) {
-                    return ret;
+                    goto out;
                 }
             } while (ret < 0);
         }
@@ -407,7 +407,7 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
         /* If the bitmap granularity is smaller than the backup granularity,
         * we need to advance the iterator pointer to the next cluster. */
         if (granularity < job->cluster_size) {
-            bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster);
+            bdrv_set_dirty_iter(dbi, cluster * sectors_per_cluster);
        }
 
        last_cluster = cluster - 1;
@@ -419,6 +419,8 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
         job->common.offset += ((end - last_cluster - 1) * job->cluster_size);
     }
 
+out:
+    bdrv_dirty_iter_free(dbi);
     return ret;
 }
 
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index f2bfdcfdea..519737c8d3 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -38,13 +38,20 @@
  */
 struct BdrvDirtyBitmap {
     HBitmap *bitmap;            /* Dirty sector bitmap implementation */
+    HBitmap *meta;              /* Meta dirty bitmap */
     BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */
     char *name;                 /* Optional non-empty unique ID */
     int64_t size;               /* Size of the bitmap (Number of sectors) */
     bool disabled;              /* Bitmap is read-only */
+    int active_iterators;       /* How many iterators are active */
     QLIST_ENTRY(BdrvDirtyBitmap) list;
 };
 
+struct BdrvDirtyBitmapIter {
+    HBitmapIter hbi;
+    BdrvDirtyBitmap *bitmap;
+};
+
 BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name)
 {
     BdrvDirtyBitmap *bm;
@@ -97,6 +104,66 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
     return bitmap;
 }
 
+/* bdrv_create_meta_dirty_bitmap
+ *
+ * Create a meta dirty bitmap that tracks the changes of bits in @bitmap. I.e.
+ * when a dirty status bit in @bitmap is changed (either from reset to set or
+ * the other way around), its respective meta dirty bitmap bit will be marked
+ * dirty as well.
+ *
+ * @bitmap: the block dirty bitmap for which to create a meta dirty bitmap.
+ * @chunk_size: how many bytes of bitmap data each bit in the meta bitmap
+ * tracks.
+ */
+void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+                                   int chunk_size)
+{
+    assert(!bitmap->meta);
+    bitmap->meta = hbitmap_create_meta(bitmap->bitmap,
+                                       chunk_size * BITS_PER_BYTE);
+}
+
+void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+{
+    assert(bitmap->meta);
+    hbitmap_free_meta(bitmap->bitmap);
+    bitmap->meta = NULL;
+}
+
+int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
+                               BdrvDirtyBitmap *bitmap, int64_t sector,
+                               int nb_sectors)
+{
+    uint64_t i;
+    int sectors_per_bit = 1 << hbitmap_granularity(bitmap->meta);
+
+    /* To optimize: we could make hbitmap check the range internally at a
+     * coarse level, or at least do it word by word. */
+    for (i = sector; i < sector + nb_sectors; i += sectors_per_bit) {
+        if (hbitmap_get(bitmap->meta, i)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
+                                  BdrvDirtyBitmap *bitmap, int64_t sector,
+                                  int nb_sectors)
+{
+    hbitmap_reset(bitmap->meta, sector, nb_sectors);
+}
+
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap)
+{
+    return bitmap->size;
+}
+
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap)
+{
+    return bitmap->name;
+}
+
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap)
 {
     return bitmap->successor;
@@ -212,6 +279,7 @@ void bdrv_dirty_bitmap_truncate(BlockDriverState *bs)
 
     QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) {
         assert(!bdrv_dirty_bitmap_frozen(bitmap));
+        assert(!bitmap->active_iterators);
         hbitmap_truncate(bitmap->bitmap, size);
         bitmap->size = size;
     }
@@ -224,7 +292,9 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
     BdrvDirtyBitmap *bm, *next;
     QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) {
         if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) {
+            assert(!bm->active_iterators);
             assert(!bdrv_dirty_bitmap_frozen(bm));
+            assert(!bm->meta);
             QLIST_REMOVE(bm, list);
             hbitmap_free(bm->bitmap);
             g_free(bm->name);
@@ -235,6 +305,9 @@ static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs,
             }
         }
     }
+    if (bitmap) {
+        abort();
+    }
 }
 
 void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap)
@@ -320,9 +393,43 @@ uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap)
     return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap);
 }
 
-void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi)
+uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap)
 {
-    hbitmap_iter_init(hbi, bitmap->bitmap, 0);
+    return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->meta);
+}
+
+BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
+                                         uint64_t first_sector)
+{
+    BdrvDirtyBitmapIter *iter = g_new(BdrvDirtyBitmapIter, 1);
+    hbitmap_iter_init(&iter->hbi, bitmap->bitmap, first_sector);
+    iter->bitmap = bitmap;
+    bitmap->active_iterators++;
+    return iter;
+}
+
+BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap)
+{
+    BdrvDirtyBitmapIter *iter = g_new(BdrvDirtyBitmapIter, 1);
+    hbitmap_iter_init(&iter->hbi, bitmap->meta, 0);
+    iter->bitmap = bitmap;
+    bitmap->active_iterators++;
+    return iter;
+}
+
+void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter)
+{
+    if (!iter) {
+        return;
+    }
+    assert(iter->bitmap->active_iterators > 0);
+    iter->bitmap->active_iterators--;
+    g_free(iter);
+}
+
+int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter)
+{
+    return hbitmap_iter_next(&iter->hbi);
 }
 
 void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
@@ -360,6 +467,43 @@ void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
     hbitmap_free(tmp);
 }
 
+uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
+                                              uint64_t start, uint64_t count)
+{
+    return hbitmap_serialization_size(bitmap->bitmap, start, count);
+}
+
+uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap)
+{
+    return hbitmap_serialization_granularity(bitmap->bitmap);
+}
+
+void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap,
+                                      uint8_t *buf, uint64_t start,
+                                      uint64_t count)
+{
+    hbitmap_serialize_part(bitmap->bitmap, buf, start, count);
+}
+
+void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap,
+                                        uint8_t *buf, uint64_t start,
+                                        uint64_t count, bool finish)
+{
+    hbitmap_deserialize_part(bitmap->bitmap, buf, start, count, finish);
+}
+
+void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap,
+                                          uint64_t start, uint64_t count,
+                                          bool finish)
+{
+    hbitmap_deserialize_zeroes(bitmap->bitmap, start, count, finish);
+}
+
+void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap)
+{
+    hbitmap_deserialize_finish(bitmap->bitmap);
+}
+
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
                     int64_t nr_sectors)
 {
@@ -373,15 +517,19 @@ void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
 }
 
 /**
- * Advance an HBitmapIter to an arbitrary offset.
+ * Advance a BdrvDirtyBitmapIter to an arbitrary offset.
  */
-void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset)
+void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *iter, int64_t sector_num)
 {
-    assert(hbi->hb);
-    hbitmap_iter_init(hbi, hbi->hb, offset);
+    hbitmap_iter_init(&iter->hbi, iter->hbi.hb, sector_num);
 }
 
 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap)
 {
     return hbitmap_count(bitmap->bitmap);
 }
+
+int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap)
+{
+    return hbitmap_count(bitmap->meta);
+}
diff --git a/block/mirror.c b/block/mirror.c
index f9d1fecaa0..a433e6848c 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -55,7 +55,7 @@ typedef struct MirrorBlockJob {
     int64_t bdev_length;
     unsigned long *cow_bitmap;
     BdrvDirtyBitmap *dirty_bitmap;
-    HBitmapIter hbi;
+    BdrvDirtyBitmapIter *dbi;
     uint8_t *buf;
     QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
     int buf_free_count;
@@ -330,10 +330,10 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     int max_io_sectors = MAX((s->buf_size >> BDRV_SECTOR_BITS) / MAX_IN_FLIGHT,
                              MAX_IO_SECTORS);
 
-    sector_num = hbitmap_iter_next(&s->hbi);
+    sector_num = bdrv_dirty_iter_next(s->dbi);
     if (sector_num < 0) {
-        bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
-        sector_num = hbitmap_iter_next(&s->hbi);
+        bdrv_set_dirty_iter(s->dbi, 0);
+        sector_num = bdrv_dirty_iter_next(s->dbi);
         trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap));
         assert(sector_num >= 0);
     }
@@ -349,7 +349,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
     /* Find the number of consective dirty chunks following the first dirty
      * one, and wait for in flight requests in them. */
     while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) {
-        int64_t hbitmap_next;
+        int64_t next_dirty;
         int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk;
         int64_t next_chunk = next_sector / sectors_per_chunk;
         if (next_sector >= end ||
@@ -360,13 +360,13 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s)
             break;
         }
 
-        hbitmap_next = hbitmap_iter_next(&s->hbi);
-        if (hbitmap_next > next_sector || hbitmap_next < 0) {
+        next_dirty = bdrv_dirty_iter_next(s->dbi);
+        if (next_dirty > next_sector || next_dirty < 0) {
             /* The bitmap iterator's cache is stale, refresh it */
-            bdrv_set_dirty_iter(&s->hbi, next_sector);
-            hbitmap_next = hbitmap_iter_next(&s->hbi);
+            bdrv_set_dirty_iter(s->dbi, next_sector);
+            next_dirty = bdrv_dirty_iter_next(s->dbi);
         }
-        assert(hbitmap_next == next_sector);
+        assert(next_dirty == next_sector);
         nb_chunks++;
     }
 
@@ -679,7 +679,8 @@ static void coroutine_fn mirror_run(void *opaque)
         }
     }
 
-    bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi);
+    assert(!s->dbi);
+    s->dbi = bdrv_dirty_iter_new(s->dirty_bitmap, 0);
     for (;;) {
         uint64_t delay_ns = 0;
         int64_t cnt, delta;
@@ -793,6 +794,7 @@ immediate_exit:
     qemu_vfree(s->buf);
     g_free(s->cow_bitmap);
     g_free(s->in_flight_bitmap);
+    bdrv_dirty_iter_free(s->dbi);
     bdrv_release_dirty_bitmap(bs, s->dirty_bitmap);
 
     data = g_malloc(sizeof(*data));
diff --git a/block/qapi.c b/block/qapi.c
index 6f947e3e66..50d30907a2 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -698,6 +698,7 @@ void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f,
     assert(qobject_type(obj) == QTYPE_QDICT);
     data = qdict_get(qobject_to_qdict(obj), "data");
     dump_qobject(func_fprintf, f, 1, data);
+    qobject_decref(obj);
     visit_free(v);
 }
 
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 61d1ffd223..928c1e298d 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1558,7 +1558,7 @@ fail:
  * clusters.
  */
 static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
-                          uint64_t nb_clusters)
+                          uint64_t nb_clusters, int flags)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t *l2_table;
@@ -1582,7 +1582,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
         /* Update L2 entries */
         qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
-        if (old_offset & QCOW_OFLAG_COMPRESSED) {
+        if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
             l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
             qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
         } else {
@@ -1595,7 +1595,8 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset,
     return nb_clusters;
 }
 
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
+                        int flags)
 {
     BDRVQcow2State *s = bs->opaque;
     uint64_t nb_clusters;
@@ -1612,7 +1613,7 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors)
     s->cache_discards = true;
 
     while (nb_clusters > 0) {
-        ret = zero_single_l2(bs, offset, nb_clusters);
+        ret = zero_single_l2(bs, offset, nb_clusters, flags);
         if (ret < 0) {
             goto fail;
         }
diff --git a/block/qcow2.c b/block/qcow2.c
index e11c7c9d16..6d5689a23c 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1155,6 +1155,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
 
     /* Initialise locks */
     qemu_co_mutex_init(&s->lock);
+    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP;
 
     /* Repair image if dirty */
     if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only &&
@@ -2477,7 +2478,7 @@ static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,
     trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, count);
 
     /* Whatever is left can use real zero clusters */
-    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS);
+    ret = qcow2_zero_clusters(bs, offset, count >> BDRV_SECTOR_BITS, flags);
     qemu_co_mutex_unlock(&s->lock);
 
     return ret;
diff --git a/block/qcow2.h b/block/qcow2.h
index 9ce5a37d3a..92203a8b8c 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -547,7 +547,8 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
 int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
                            int nb_sectors, enum qcow2_discard_type type, bool full_discard);
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors);
+int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
+                        int flags);
 
 int qcow2_expand_zero_clusters(BlockDriverState *bs,
                                BlockDriverAmendStatusCB *status_cb,
diff --git a/block/quorum.c b/block/quorum.c
index 9cf876fb34..d122299352 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -130,7 +130,7 @@ struct QuorumAIOCB {
     bool is_read;
     int vote_ret;
 
-    int child_iter;             /* which child to read in fifo pattern */
+    int children_read;          /* how many children have been read from */
 };
 
 static bool quorum_vote(QuorumAIOCB *acb);
@@ -156,22 +156,7 @@ static AIOCBInfo quorum_aiocb_info = {
 
 static void quorum_aio_finalize(QuorumAIOCB *acb)
 {
-    int i, ret = 0;
-
-    if (acb->vote_ret) {
-        ret = acb->vote_ret;
-    }
-
-    acb->common.cb(acb->common.opaque, ret);
-
-    if (acb->is_read) {
-        /* on the quorum case acb->child_iter == s->num_children - 1 */
-        for (i = 0; i <= acb->child_iter; i++) {
-            qemu_vfree(acb->qcrs[i].buf);
-            qemu_iovec_destroy(&acb->qcrs[i].qiov);
-        }
-    }
-
+    acb->common.cb(acb->common.opaque, acb->vote_ret);
     g_free(acb->qcrs);
     qemu_aio_unref(acb);
 }
@@ -283,39 +268,52 @@ static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
     }
 }
 
+static void quorum_report_bad_acb(QuorumChildRequest *sacb, int ret)
+{
+    QuorumAIOCB *acb = sacb->parent;
+    QuorumOpType type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
+    quorum_report_bad(type, acb->sector_num, acb->nb_sectors,
+                      sacb->aiocb->bs->node_name, ret);
+}
+
+static void quorum_fifo_aio_cb(void *opaque, int ret)
+{
+    QuorumChildRequest *sacb = opaque;
+    QuorumAIOCB *acb = sacb->parent;
+    BDRVQuorumState *s = acb->common.bs->opaque;
+
+    assert(acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO);
+
+    if (ret < 0) {
+        quorum_report_bad_acb(sacb, ret);
+
+        /* We try to read the next child in FIFO order if we fail to read */
+        if (acb->children_read < s->num_children) {
+            read_fifo_child(acb);
+            return;
+        }
+    }
+
+    acb->vote_ret = ret;
+
+    /* FIXME: rewrite failed children if acb->children_read > 1? */
+    quorum_aio_finalize(acb);
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
     QuorumChildRequest *sacb = opaque;
     QuorumAIOCB *acb = sacb->parent;
     BDRVQuorumState *s = acb->common.bs->opaque;
     bool rewrite = false;
+    int i;
 
+    sacb->ret = ret;
     if (ret == 0) {
         acb->success_count++;
     } else {
-        QuorumOpType type;
-        type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE;
-        quorum_report_bad(type, acb->sector_num, acb->nb_sectors,
-                          sacb->aiocb->bs->node_name, ret);
+        quorum_report_bad_acb(sacb, ret);
     }
-
-    if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
-        /* We try to read next child in FIFO order if we fail to read */
-        if (ret < 0 && (acb->child_iter + 1) < s->num_children) {
-            acb->child_iter++;
-            read_fifo_child(acb);
-            return;
-        }
-
-        if (ret == 0) {
-            quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov);
-        }
-        acb->vote_ret = ret;
-        quorum_aio_finalize(acb);
-        return;
-    }
-
-    sacb->ret = ret;
     acb->count++;
     assert(acb->count <= s->num_children);
     assert(acb->success_count <= s->num_children);
@@ -326,6 +324,10 @@ static void quorum_aio_cb(void *opaque, int ret)
     /* Do the vote on read */
     if (acb->is_read) {
         rewrite = quorum_vote(acb);
+        for (i = 0; i < s->num_children; i++) {
+            qemu_vfree(acb->qcrs[i].buf);
+            qemu_iovec_destroy(&acb->qcrs[i].qiov);
+        }
     } else {
         quorum_has_too_much_io_failed(acb);
     }
@@ -653,6 +655,7 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
     BDRVQuorumState *s = acb->common.bs->opaque;
     int i;
 
+    acb->children_read = s->num_children;
     for (i = 0; i < s->num_children; i++) {
         acb->qcrs[i].buf = qemu_blockalign(s->children[i]->bs, acb->qiov->size);
         qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
@@ -671,16 +674,11 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb)
 static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb)
 {
     BDRVQuorumState *s = acb->common.bs->opaque;
+    int n = acb->children_read++;
 
-    acb->qcrs[acb->child_iter].buf =
-        qemu_blockalign(s->children[acb->child_iter]->bs, acb->qiov->size);
-    qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov);
-    qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov,
-                     acb->qcrs[acb->child_iter].buf);
-    acb->qcrs[acb->child_iter].aiocb =
-        bdrv_aio_readv(s->children[acb->child_iter], acb->sector_num,
-                       &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors,
-                       quorum_aio_cb, &acb->qcrs[acb->child_iter]);
+    acb->qcrs[n].aiocb = bdrv_aio_readv(s->children[n], acb->sector_num,
+                                        acb->qiov, acb->nb_sectors,
+                                        quorum_fifo_aio_cb, &acb->qcrs[n]);
 
     return &acb->common;
 }
@@ -696,13 +694,12 @@ static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs,
     QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
                                       nb_sectors, cb, opaque);
     acb->is_read = true;
+    acb->children_read = 0;
 
     if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
-        acb->child_iter = s->num_children - 1;
         return read_quorum_children(acb);
     }
 
-    acb->child_iter = 0;
     return read_fifo_child(acb);
 }
 
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 166e9d1ad5..f481e57f78 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -443,6 +443,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
     fd = qemu_open(filename, s->open_flags, 0644);
     if (fd < 0) {
         ret = -errno;
+        error_setg_errno(errp, errno, "Could not open '%s'", filename);
         if (ret == -EROFS) {
             ret = -EACCES;
         }
diff --git a/block/raw-win32.c b/block/raw-win32.c
index 734bb105bd..800fabdd72 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -373,6 +373,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags,
     if (s->hfile == INVALID_HANDLE_VALUE) {
         int err = GetLastError();
 
+        error_setg_win32(errp, err, "Could not open '%s'", filename);
         if (err == ERROR_ACCESS_DENIED) {
             ret = -EACCES;
         } else {
diff --git a/block/replication.c b/block/replication.c
index 3bd1cf1809..8bbfc8f870 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -101,6 +101,11 @@ static int replication_open(BlockDriverState *bs, QDict *options,
 
     if (!strcmp(mode, "primary")) {
         s->mode = REPLICATION_MODE_PRIMARY;
+        top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
+        if (top_id) {
+            error_setg(&local_err, "The primary side does not support option top-id");
+            goto fail;
+        }
     } else if (!strcmp(mode, "secondary")) {
         s->mode = REPLICATION_MODE_SECONDARY;
         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index 59545e287e..17b2efb7c7 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -168,6 +168,22 @@ static BlockBackend *throttle_group_next_blk(BlockBackend *blk)
     return blk_by_public(next);
 }
 
+/*
+ * Return whether a BlockBackend has pending requests.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @blk: the BlockBackend
+ * @is_write: the type of operation (read/write)
+ * @ret: whether the BlockBackend has pending requests.
+ */
+static inline bool blk_has_pending_reqs(BlockBackend *blk,
+                                        bool is_write)
+{
+    const BlockBackendPublic *blkp = blk_get_public(blk);
+    return blkp->pending_reqs[is_write];
+}
+
 /* Return the next BlockBackend in the round-robin sequence with pending I/O
  * requests.
  *
@@ -188,7 +204,7 @@ static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)
 
     /* get next bs round in round robin style */
     token = throttle_group_next_blk(token);
-    while (token != start && !blkp->pending_reqs[is_write]) {
+    while (token != start && !blk_has_pending_reqs(token, is_write)) {
         token = throttle_group_next_blk(token);
     }
 
@@ -196,10 +212,13 @@ static BlockBackend *next_throttle_token(BlockBackend *blk, bool is_write)
      * then decide the token is the current bs because chances are
      * the current bs get the current request queued.
      */
-    if (token == start && !blkp->pending_reqs[is_write]) {
+    if (token == start && !blk_has_pending_reqs(token, is_write)) {
         token = blk;
     }
 
+    /* Either we return the original BB, or one with pending requests */
+    assert(token == blk || blk_has_pending_reqs(token, is_write));
+
     return token;
 }
 
@@ -257,7 +276,7 @@ static void schedule_next_request(BlockBackend *blk, bool is_write)
 
     /* Check if there's any pending request to schedule next */
     token = next_throttle_token(blk, is_write);
-    if (!blkp->pending_reqs[is_write]) {
+    if (!blk_has_pending_reqs(token, is_write)) {
         return;
     }
 
@@ -271,7 +290,7 @@ static void schedule_next_request(BlockBackend *blk, bool is_write)
             qemu_co_queue_next(&blkp->throttled_reqs[is_write])) {
         token = blk;
     } else {
-        ThrottleTimers *tt = &blkp->throttle_timers;
+        ThrottleTimers *tt = &blk_get_public(token)->throttle_timers;
         int64_t now = qemu_clock_get_ns(tt->clock_type);
         timer_mod(tt->timers[is_write], now + 1);
         tg->any_timer_armed[is_write] = true;
diff --git a/docs/qmp-commands.txt b/docs/qmp-commands.txt
index 3220fb1075..284576d795 100644
--- a/docs/qmp-commands.txt
+++ b/docs/qmp-commands.txt
@@ -1090,11 +1090,11 @@ Arguments:
 Example:
 
 -> { "execute": "blockdev-add",
-     "arguments": { "options": { "driver": "qcow2",
-                                 "node-name": "node1534",
-                                 "file": { "driver": "file",
-                                           "filename": "hd1.qcow2" },
-                                 "backing": "" } } }
+     "arguments": { "driver": "qcow2",
+                    "node-name": "node1534",
+                    "file": { "driver": "file",
+                              "filename": "hd1.qcow2" },
+                    "backing": "" } }
 
 <- { "return": {} }
 
@@ -3130,41 +3130,37 @@
 This command is still a work in progress. It doesn't support all
 block drivers among other things. Stay away from it unless you want
 to help with its development.
 
-Arguments:
-
-- "options": block driver options
+For the arguments, see the QAPI schema documentation of BlockdevOptions.
 
 Example (1):
 
 -> { "execute": "blockdev-add",
-     "arguments": { "options" : { "driver": "qcow2",
-                                  "file": { "driver": "file",
-                                            "filename": "test.qcow2" } } } }
+     "arguments": { "driver": "qcow2",
+                    "file": { "driver": "file",
+                              "filename": "test.qcow2" } } }
 
 <- { "return": {} }
 
 Example (2):
 
 -> { "execute": "blockdev-add",
      "arguments": {
-         "options": {
-             "driver": "qcow2",
-             "node-name": "my_disk",
-             "discard": "unmap",
-             "cache": {
-                 "direct": true,
-                 "writeback": true
-             },
-             "file": {
-                 "driver": "file",
-                 "filename": "/tmp/test.qcow2"
-             },
-             "backing": {
-                 "driver": "raw",
-                 "file": {
-                     "driver": "file",
-                     "filename": "/dev/fdset/4"
-                 }
-             }
+         "driver": "qcow2",
+         "node-name": "my_disk",
+         "discard": "unmap",
+         "cache": {
+             "direct": true,
+             "writeback": true
+         },
+         "file": {
+             "driver": "file",
+             "filename": "/tmp/test.qcow2"
+         },
+         "backing": {
+             "driver": "raw",
+             "file": {
+                 "driver": "file",
+                 "filename": "/dev/fdset/4"
+             }
         }
       }
    }
@@ -3191,13 +3187,11 @@ Example:
 
 -> { "execute": "blockdev-add",
      "arguments": {
-         "options": {
-             "driver": "qcow2",
-             "node-name": "node0",
-             "file": {
-                 "driver": "file",
-                 "filename": "test.qcow2"
-             }
+         "driver": "qcow2",
+         "node-name": "node0",
+         "file": {
+             "driver": "file",
+             "filename": "test.qcow2"
         }
       }
    }
@@ -3342,10 +3336,10 @@ Arguments:
 Example:
 
 -> { "execute": "blockdev-add",
-     "arguments": { "options": { "node-name": "node0",
-                                 "driver": "raw",
-                                 "file": { "driver": "file",
-                                           "filename": "fedora.iso" } } } }
+     "arguments": { "node-name": "node0",
+                    "driver": "raw",
+                    "file": { "driver": "file",
+                              "filename": "fedora.iso" } } }
 
 <- { "return": {} }
 
@@ -3383,10 +3377,10 @@ Example:
 Add a new node to a quorum
 -> { "execute": "blockdev-add",
-     "arguments": { "options": { "driver": "raw",
-                                 "node-name": "new_node",
-                                 "file": { "driver": "file",
-                                           "filename": "test.raw" } } } }
+     "arguments": { "driver": "raw",
+                    "node-name": "new_node",
+                    "file": { "driver": "file",
+                              "filename": "test.raw" } } }
 <- { "return": {} }
 
 -> { "execute": "x-blockdev-change",
      "arguments": { "parent": "disk1",
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index ee3388f90d..9dea14ba03 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -8,6 +8,9 @@ BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
                                           uint32_t granularity,
                                           const char *name,
                                           Error **errp);
+void bdrv_create_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+                                   int chunk_size);
+void bdrv_release_meta_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs,
                                        BdrvDirtyBitmap *bitmap,
                                        Error **errp);
@@ -27,8 +30,11 @@ void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs);
 uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs);
 uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap);
+uint32_t bdrv_dirty_bitmap_meta_granularity(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap);
 bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap);
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap);
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap);
 DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap);
 int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
                    int64_t sector);
@@ -36,9 +42,34 @@ void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
                            int64_t cur_sector, int64_t nr_sectors);
 void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, int64_t cur_sector,
                              int64_t nr_sectors);
-void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi);
-void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset);
+int bdrv_dirty_bitmap_get_meta(BlockDriverState *bs,
+                               BdrvDirtyBitmap *bitmap, int64_t sector,
+                               int nb_sectors);
+void bdrv_dirty_bitmap_reset_meta(BlockDriverState *bs,
+                                  BdrvDirtyBitmap *bitmap, int64_t sector,
+                                  int nb_sectors);
+BdrvDirtyBitmapIter *bdrv_dirty_meta_iter_new(BdrvDirtyBitmap *bitmap);
+BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap,
+                                         uint64_t first_sector);
+void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter);
+int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter);
+void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t sector_num);
 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
+int64_t bdrv_get_meta_dirty_count(BdrvDirtyBitmap *bitmap);
 void bdrv_dirty_bitmap_truncate(BlockDriverState *bs);
+uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
+                                              uint64_t start, uint64_t count);
+uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap);
+void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap,
+                                      uint8_t *buf, uint64_t start,
+                                      uint64_t count);
+void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap,
+                                        uint8_t *buf, uint64_t start,
+                                        uint64_t count, bool finish);
+void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap,
+                                          uint64_t start, uint64_t count,
+                                          bool finish);
+void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap);
+
 #endif
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index 8ab721e5aa..eb464759d5 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -145,6 +145,85 @@ void hbitmap_reset_all(HBitmap *hb);
  */
 bool hbitmap_get(const HBitmap *hb, uint64_t item);
 
+/**
+ * hbitmap_serialization_granularity:
+ * @hb: HBitmap to operate on.
+ *
+ * Granularity of serialization chunks, used by other serialization functions.
+ * For every chunk:
+ * 1. Chunk start should be aligned to this granularity.
+ * 2. Chunk size should be aligned too, except for last chunk (for which
+ *    start + count == hb->size)
+ */
+uint64_t hbitmap_serialization_granularity(const HBitmap *hb);
+
+/**
+ * hbitmap_serialization_size:
+ * @hb: HBitmap to operate on.
+ * @start: Starting bit
+ * @count: Number of bits
+ *
+ * Return the number of bytes hbitmap_(de)serialize_part needs
+ */
+uint64_t hbitmap_serialization_size(const HBitmap *hb,
+                                    uint64_t start, uint64_t count);
+
+/**
+ * hbitmap_serialize_part
+ * @hb: HBitmap to operate on.
+ * @buf: Buffer to store serialized bitmap.
+ * @start: First bit to store.
+ * @count: Number of bits to store.
+ *
+ * Stores HBitmap data corresponding to given region. The format of saved data
+ * is a linear sequence of bits, so it can be used by hbitmap_deserialize_part
+ * independently of endianness and size of HBitmap level array elements
+ */
+void hbitmap_serialize_part(const HBitmap *hb, uint8_t *buf,
+                            uint64_t start, uint64_t count);
+
+/**
+ * hbitmap_deserialize_part
+ * @hb: HBitmap to operate on.
+ * @buf: Buffer to restore bitmap data from.
+ * @start: First bit to restore.
+ * @count: Number of bits to restore.
+ * @finish: Whether to call hbitmap_deserialize_finish automatically.
+ *
+ * Restores HBitmap data corresponding to given region. The format is the same
+ * as for hbitmap_serialize_part.
+ *
+ * If @finish is false, caller must call hbitmap_deserialize_finish before using
+ * the bitmap.
+ */
+void hbitmap_deserialize_part(HBitmap *hb, uint8_t *buf,
+                              uint64_t start, uint64_t count,
+                              bool finish);
+
+/**
+ * hbitmap_deserialize_zeroes
+ * @hb: HBitmap to operate on.
+ * @start: First bit to restore.
+ * @count: Number of bits to restore.
+ * @finish: Whether to call hbitmap_deserialize_finish automatically.
+ *
+ * Fills the given range of the bitmap with zeroes.
+ *
+ * If @finish is false, caller must call hbitmap_deserialize_finish before using
+ * the bitmap.
+ */
+void hbitmap_deserialize_zeroes(HBitmap *hb, uint64_t start, uint64_t count,
+                                bool finish);
+
+/**
+ * hbitmap_deserialize_finish
+ * @hb: HBitmap to operate on.
+ *
+ * Repair HBitmap after calling hbitmap_deserialize_part. Actually, all HBitmap
+ * layers are restored here.
+ */
+void hbitmap_deserialize_finish(HBitmap *hb);
+
 /**
  * hbitmap_free:
  * @hb: HBitmap to operate on.
@@ -178,6 +257,27 @@ void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first);
  */
 unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi);
 
+/* hbitmap_create_meta:
+ * Create a "meta" hbitmap to track dirtiness of the bits in this HBitmap.
+ * The caller owns the created bitmap and must call hbitmap_free_meta(hb) to
+ * free it.
+ *
+ * Currently, we only guarantee that if a bit in the hbitmap is changed it
+ * will be reflected in the meta bitmap, but we do not yet guarantee the
+ * opposite.
+ *
+ * @hb: The HBitmap to operate on.
+ * @chunk_size: How many bits in @hb one bit in the meta bitmap tracks.
+ */
+HBitmap *hbitmap_create_meta(HBitmap *hb, int chunk_size);
+
+/* hbitmap_free_meta:
+ * Free the meta bitmap of @hb.
+ *
+ * @hb: The HBitmap whose meta bitmap should be freed.
+ */
+void hbitmap_free_meta(HBitmap *hb);
+
 /**
  * hbitmap_iter_next:
  * @hbi: HBitmapIter to operate on.
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index b113fcf156..1b8c30a7a0 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -11,6 +11,7 @@ typedef struct AioContext AioContext;
 typedef struct AllwinnerAHCIState AllwinnerAHCIState;
 typedef struct AudioState AudioState;
 typedef struct BdrvDirtyBitmap BdrvDirtyBitmap;
+typedef struct BdrvDirtyBitmapIter BdrvDirtyBitmapIter;
 typedef struct BlockBackend BlockBackend;
 typedef struct BlockBackendRootState BlockBackendRootState;
 typedef struct BlockDriverState BlockDriverState;
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 1b7aa1befd..97b120532a 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2197,7 +2197,8 @@
 # @mode: the replication mode
 #
 # @top-id: #optional In secondary mode, node name or device ID of the root
-#          node who owns the replication node chain. Ignored in primary mode.
+#          node who owns the replication node chain. Must not be given in
+#          primary mode.
 #
 # Since: 2.8
 ##
@@ -2312,11 +2313,11 @@
 # block drivers among other things. Stay away from it unless you want
 # to help with its development.
 #
-# @options: block device options for the new device
+# For the arguments, see the documentation of BlockdevOptions.
 #
 # Since: 1.7
 ##
-{ 'command': 'blockdev-add', 'data': { 'options': 'BlockdevOptions' } }
+{ 'command': 'blockdev-add', 'data': 'BlockdevOptions', 'boxed': true }
 
 ##
 # @x-blockdev-del:
diff --git a/qemu-img.c b/qemu-img.c
index 67e851248a..ab395a9b1a 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2956,6 +2956,7 @@ static int img_rebase(int argc, char **argv)
             error_reportf_err(local_err,
                               "Could not open old backing file '%s': ",
                               backing_name);
+            ret = -1;
             goto out;
         }
 
@@ -2973,6 +2974,7 @@ static int img_rebase(int argc, char **argv)
                 error_reportf_err(local_err,
                                   "Could not open new backing file '%s': ",
                                   out_baseimg);
+                ret = -1;
                 goto out;
             }
         }
diff --git a/qemu-nbd.c b/qemu-nbd.c
index cca4a983b7..b757dc7621 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -48,6 +48,7 @@
 #define QEMU_NBD_OPT_OBJECT       260
 #define QEMU_NBD_OPT_TLSCREDS     261
 #define QEMU_NBD_OPT_IMAGE_OPTS   262
+#define QEMU_NBD_OPT_FORK         263
 
 #define MBR_SIZE 512
 
@@ -92,6 +93,8 @@ static void usage(const char *name)
 "                            passwords and/or encryption keys\n"
 "  -T, --trace [[enable=]<pattern>][,events=<file>][,file=<file>]\n"
 "                            specify tracing options\n"
+"  --fork                    fork off the server process and exit the parent\n"
+"                            once the server is running\n"
 #ifdef __linux__
 "Kernel NBD client support:\n"
 "  -c, --connect=DEV         connect FILE to the local NBD device DEV\n"
@@ -503,6 +506,7 @@ int main(int argc, char **argv)
         { "tls-creds", required_argument, NULL, QEMU_NBD_OPT_TLSCREDS },
         { "image-opts", no_argument, NULL, QEMU_NBD_OPT_IMAGE_OPTS },
         { "trace", required_argument, NULL, 'T' },
+        { "fork", no_argument, NULL, QEMU_NBD_OPT_FORK },
         { NULL, 0, NULL, 0 }
     };
     int ch;
@@ -524,6 +528,8 @@ int main(int argc, char **argv)
     bool imageOpts = false;
     bool writethrough = true;
     char *trace_file = NULL;
+    bool fork_process = false;
+    int old_stderr = -1;
 
     /* The client thread uses SIGTERM to interrupt the server.  A signal
      * handler ensures that "qemu-nbd -v -c" exits with a nice status code.
@@ -715,6 +721,9 @@ int main(int argc, char **argv)
             g_free(trace_file);
             trace_file = trace_opt_parse(optarg);
             break;
+        case QEMU_NBD_OPT_FORK:
+            fork_process = true;
+            break;
         }
     }
 
@@ -774,7 +783,7 @@ int main(int argc, char **argv)
         return 0;
     }
 
-    if (device && !verbose) {
+    if ((device && !verbose) || fork_process) {
         int stderr_fd[2];
         pid_t pid;
         int ret;
@@ -797,6 +806,7 @@ int main(int argc, char **argv)
             ret = qemu_daemon(1, 0);
 
             /* Temporarily redirect stderr to the parent's pipe...  */
+            old_stderr = dup(STDERR_FILENO);
             dup2(stderr_fd[1], STDERR_FILENO);
             if (ret < 0) {
                 error_report("Failed to daemonize: %s", strerror(errno));
@@ -960,6 +970,11 @@ int main(int argc, char **argv)
         exit(EXIT_FAILURE);
     }
 
+    if (fork_process) {
+        dup2(old_stderr, STDERR_FILENO);
+        close(old_stderr);
+    }
+
     state = RUNNING;
     do {
         main_loop_wait(false);
diff --git a/qemu-nbd.texi b/qemu-nbd.texi
index 91ebf04b5b..b7a9c6d02f 100644
--- a/qemu-nbd.texi
+++ b/qemu-nbd.texi
@@ -86,6 +86,8 @@ the new style NBD protocol negotiation
 Enable mandatory TLS encryption for the server by setting the ID of the TLS
 credentials object previously created with the --object option.
+@item --fork
+Fork off the server process and exit the parent once the server is running.
 @item -v, --verbose
 Display extra debugging information
 @item -h, --help
diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041
index d1e1ad8bd2..30e628f0f7 100755
--- a/tests/qemu-iotests/041
+++ b/tests/qemu-iotests/041
@@ -194,10 +194,9 @@ class TestSingleBlockdev(TestSingleDrive):
     def setUp(self):
         TestSingleDrive.setUp(self)
         qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % backing_img, target_img)
-        args = {'options':
-                    {'driver': iotests.imgfmt,
-                     'node-name': self.qmp_target,
-                     'file': { 'filename': target_img, 'driver': 'file' } } }
+        args = {'driver': iotests.imgfmt,
+                'node-name': self.qmp_target,
+                'file': { 'filename': target_img, 'driver': 'file' } }
         result = self.vm.qmp("blockdev-add", **args)
         self.assert_qmp(result, 'return', {})
 
@@ -782,8 +781,8 @@ class TestRepairQuorum(iotests.QMPTestCase):
             self.vm.launch()
 
         #assemble the quorum block device from the individual files
-        args = { "options" : { "driver": "quorum", "node-name": "quorum0",
-                 "vote-threshold": 2, "children": [ "img0", "img1", "img2" ] } }
+        args = { "driver": "quorum", "node-name": "quorum0",
+                 "vote-threshold": 2, "children": [ "img0", "img1", "img2" ] }
         if self.has_quorum():
             result = self.vm.qmp("blockdev-add", **args)
             self.assert_qmp(result, 'return', {})
diff --git a/tests/qemu-iotests/067 b/tests/qemu-iotests/067
index a12125bd46..38d23fce6b 100755
--- a/tests/qemu-iotests/067
+++ b/tests/qemu-iotests/067
@@ -119,13 +119,11 @@ run_qemu <
/dev/null; then
+        break
+    fi
+done
+
+$QEMU_IMG info "json:{'driver': 'nbd', 'host': 'localhost', 'port': $port}" \
+    | grep '^image' | sed -e "s/$port/PORT/"
 
 # This is a test for NBD's bdrv_refresh_filename() implementation: It expects
 # either host or path to be set, but it must not assume that they are set to
 # strings in the options QDict
-$QEMU_NBD -k "$PWD/42" -f raw null-co:// &
-sleep 0.5
+$QEMU_NBD -k "$PWD/42" -f raw --fork null-co://
 
 $QEMU_IMG info 'json:{"driver": "nbd", "path": 42}' | grep '^image'
 rm -f 42
diff --git a/tests/qemu-iotests/162.out b/tests/qemu-iotests/162.out
index 9bba72353a..3c5be2c569 100644
--- a/tests/qemu-iotests/162.out
+++ b/tests/qemu-iotests/162.out
@@ -2,7 +2,7 @@ QA output created by 162
 
 === NBD ===
 qemu-img: Could not open 'json:{"driver": "nbd", "host": 42}': Failed to connect socket: Invalid argument
-qemu-img: Could not open 'json:{"driver": "nbd", "host": "does.not.exist.example.com", "port": 42}': address resolution failed for does.not.exist.example.com:42: Name or service not known
+image: nbd://localhost:PORT
 image: nbd+unix://?socket=42
 
 === SSH ===
diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
index c0e9895fb9..9b7495cc32 100644
--- a/tests/test-hbitmap.c
+++ b/tests/test-hbitmap.c
@@ -11,6 +11,8 @@
 
 #include "qemu/osdep.h"
 #include "qemu/hbitmap.h"
+#include "qemu/bitmap.h"
+#include "block/block.h"
 
 #define LOG_BITS_PER_LONG          (BITS_PER_LONG == 32 ? 5 : 6)
@@ -20,6 +22,7 @@
 
 typedef struct TestHBitmapData {
     HBitmap *hb;
+    HBitmap *meta;
     unsigned long *bits;
     size_t size;
     size_t old_size;
@@ -91,6 +94,14 @@ static void hbitmap_test_init(TestHBitmapData *data,
     }
 }
 
+static void hbitmap_test_init_meta(TestHBitmapData *data,
+                                   uint64_t size, int granularity,
+                                   int meta_chunk)
+{
+    hbitmap_test_init(data, size, granularity);
+    data->meta = hbitmap_create_meta(data->hb, meta_chunk);
+}
+
 static inline size_t hbitmap_test_array_size(size_t bits)
 {
     size_t n = DIV_ROUND_UP(bits, BITS_PER_LONG);
@@ -133,6 +144,9 @@ static void hbitmap_test_teardown(TestHBitmapData *data,
                                   const void *unused)
 {
     if (data->hb) {
+        if (data->meta) {
+            hbitmap_free_meta(data->hb);
+        }
         hbitmap_free(data->hb);
         data->hb = NULL;
     }
@@ -634,6 +648,249 @@ static void test_hbitmap_truncate_shrink_large(TestHBitmapData *data,
     hbitmap_test_truncate(data, size, -diff, 0);
 }
 
+static void hbitmap_check_meta(TestHBitmapData *data,
+                               int64_t start, int count)
+{
+    int64_t i;
+
+    for (i = 0; i < data->size; i++) {
+        if (i >= start && i < start + count) {
+            g_assert(hbitmap_get(data->meta, i));
+        } else {
+            g_assert(!hbitmap_get(data->meta, i));
+        }
+    }
+}
+
+static void hbitmap_test_meta(TestHBitmapData *data,
+                              int64_t start, int count,
+                              int64_t check_start, int check_count)
+{
+    hbitmap_reset_all(data->hb);
+    hbitmap_reset_all(data->meta);
+
+    /* Test "unset" -> "unset" will not update meta. */
+    hbitmap_reset(data->hb, start, count);
+    hbitmap_check_meta(data, 0, 0);
+
+    /* Test "unset" -> "set" will update meta */
+    hbitmap_set(data->hb, start, count);
+    hbitmap_check_meta(data, check_start, check_count);
+
+    /* Test "set" -> "set" will not update meta */
+    hbitmap_reset_all(data->meta);
+    hbitmap_set(data->hb, start, count);
+    hbitmap_check_meta(data, 0, 0);
+
+    /* Test "set" -> "unset" will update meta */
+    hbitmap_reset_all(data->meta);
+    hbitmap_reset(data->hb, start, count);
+    hbitmap_check_meta(data, check_start, check_count);
+}
+
+static void hbitmap_test_meta_do(TestHBitmapData *data, int chunk_size)
+{
+    uint64_t size = chunk_size * 100;
+    hbitmap_test_init_meta(data, size, 0, chunk_size);
+
+    hbitmap_test_meta(data, 0, 1, 0, chunk_size);
+    hbitmap_test_meta(data, 0, chunk_size, 0, chunk_size);
+    hbitmap_test_meta(data, chunk_size - 1, 1, 0, chunk_size);
+    hbitmap_test_meta(data, chunk_size - 1, 2, 0, chunk_size * 2);
+    hbitmap_test_meta(data, chunk_size - 1, chunk_size + 1, 0, chunk_size * 2);
+    hbitmap_test_meta(data, chunk_size - 1, chunk_size + 2, 0, chunk_size * 3);
+    hbitmap_test_meta(data, 7 * chunk_size - 1, chunk_size + 2,
+                      6 * chunk_size, chunk_size * 3);
+    hbitmap_test_meta(data, size - 1, 1, size - chunk_size, chunk_size);
+    hbitmap_test_meta(data, 0, size, 0, size);
+}
+
+static void test_hbitmap_meta_byte(TestHBitmapData *data, const void *unused)
+{
+    hbitmap_test_meta_do(data, BITS_PER_BYTE);
+}
+
+static void test_hbitmap_meta_word(TestHBitmapData *data, const void *unused)
+{
+    hbitmap_test_meta_do(data, BITS_PER_LONG);
+}
+
+static void test_hbitmap_meta_sector(TestHBitmapData *data, const void *unused)
+{
+    hbitmap_test_meta_do(data, BDRV_SECTOR_SIZE * BITS_PER_BYTE);
+}
+
+/**
+ * Create an HBitmap and test set/unset.
+ */
+static void test_hbitmap_meta_one(TestHBitmapData *data, const void *unused)
+{
+    int i;
+    int64_t offsets[] = {
+        0, 1, L1 - 1, L1, L1 + 1, L2 - 1, L2, L2 + 1, L3 - 1, L3, L3 + 1
+    };
+
+    hbitmap_test_init_meta(data, L3 * 2, 0, 1);
+    for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+        hbitmap_test_meta(data, offsets[i], 1, offsets[i], 1);
+        hbitmap_test_meta(data, offsets[i], L1, offsets[i], L1);
+        hbitmap_test_meta(data, offsets[i], L2, offsets[i], L2);
+    }
+}
+
+static void test_hbitmap_serialize_granularity(TestHBitmapData *data,
+                                               const void *unused)
+{
+    int r;
+
+    hbitmap_test_init(data, L3 * 2, 3);
+    r = hbitmap_serialization_granularity(data->hb);
+    g_assert_cmpint(r, ==, 64 << 3);
+}
+
+static void test_hbitmap_meta_zero(TestHBitmapData *data, const void *unused)
+{
+    hbitmap_test_init_meta(data, 0, 0, 1);
+
+    hbitmap_check_meta(data, 0, 0);
+}
+
+static void hbitmap_test_serialize_range(TestHBitmapData *data,
+                                         uint8_t *buf, size_t buf_size,
+                                         uint64_t pos, uint64_t count)
+{
+    size_t i;
+    unsigned long *el = (unsigned long *)buf;
+
+    assert(hbitmap_granularity(data->hb) == 0);
+    hbitmap_reset_all(data->hb);
+    memset(buf, 0, buf_size);
+    if (count) {
+        hbitmap_set(data->hb, pos, count);
+    }
+    hbitmap_serialize_part(data->hb, buf, 0, data->size);
+
+    /* Serialized buffer is inherently LE, convert it back manually to test */
+    for (i = 0; i < buf_size / sizeof(unsigned long); i++) {
+        el[i] = (BITS_PER_LONG == 32 ? le32_to_cpu(el[i]) : le64_to_cpu(el[i]));
+    }
+
+    for (i = 0; i < data->size; i++) {
+        int is_set = test_bit(i, (unsigned long *)buf);
+        if (i >= pos && i < pos + count) {
+            g_assert(is_set);
+        } else {
+            g_assert(!is_set);
+        }
+    }
+
+    /* Re-serialize for deserialization testing */
+    memset(buf, 0, buf_size);
+    hbitmap_serialize_part(data->hb, buf, 0, data->size);
+    hbitmap_reset_all(data->hb);
+    hbitmap_deserialize_part(data->hb, buf, 0, data->size, true);
+
+    for (i = 0; i < data->size; i++) {
+        int is_set = hbitmap_get(data->hb, i);
+        if (i >= pos && i < pos + count) {
+            g_assert(is_set);
+        } else {
+            g_assert(!is_set);
+        }
+    }
+}
+
+static void test_hbitmap_serialize_basic(TestHBitmapData *data,
+                                         const void *unused)
+{
+    int i, j;
+    size_t buf_size;
+    uint8_t *buf;
+    uint64_t positions[] = { 0, 1, L1 - 1, L1, L2 - 1, L2, L2 + 1, L3 - 1 };
+    int num_positions = sizeof(positions) / sizeof(positions[0]);
+
+    hbitmap_test_init(data, L3, 0);
+    buf_size = hbitmap_serialization_size(data->hb, 0, data->size);
+    buf = g_malloc0(buf_size);
+
+    for (i = 0; i < num_positions; i++) {
+        for (j = 0; j < num_positions; j++) {
+            hbitmap_test_serialize_range(data, buf, buf_size,
+                                         positions[i],
+                                         MIN(positions[j], L3 - positions[i]));
+        }
+    }
+
+    g_free(buf);
+}
+
+static void test_hbitmap_serialize_part(TestHBitmapData *data,
+                                        const void *unused)
+{
+    int i, j, k;
+    size_t buf_size;
+    uint8_t *buf;
+    uint64_t positions[] = { 0, 1, L1 - 1, L1, L2 - 1, L2, L2 + 1, L3 - 1 };
+    int num_positions = sizeof(positions) / sizeof(positions[0]);
+
+    hbitmap_test_init(data, L3, 0);
+    buf_size = L2;
+    buf = g_malloc0(buf_size);
+
+    for (i = 0; i < num_positions; i++) {
+        hbitmap_set(data->hb, positions[i], 1);
+    }
+
+    for (i = 0; i < data->size; i += buf_size) {
+        unsigned long *el = (unsigned long *)buf;
+        hbitmap_serialize_part(data->hb, buf, i, buf_size);
+        for (j = 0; j < buf_size / sizeof(unsigned long); j++) {
+            el[j] = (BITS_PER_LONG == 32 ? le32_to_cpu(el[j]) : le64_to_cpu(el[j]));
+        }
+
+        for (j = 0; j < buf_size; j++) {
+            bool should_set = false;
+            for (k = 0; k < num_positions; k++) {
+                if (positions[k] == j + i) {
+                    should_set = true;
+                    break;
+                }
+            }
+            g_assert_cmpint(should_set, ==, test_bit(j, (unsigned long *)buf));
+        }
+    }
+
+    g_free(buf);
+}
+
+static void test_hbitmap_serialize_zeroes(TestHBitmapData *data,
+                                          const void *unused)
+{
+    int i;
+    HBitmapIter iter;
+    int64_t next;
+    uint64_t min_l1 = MAX(L1, 64);
+    uint64_t positions[] = { 0, min_l1, L2, L3 - min_l1};
+    int num_positions = sizeof(positions) / sizeof(positions[0]);
+
+    hbitmap_test_init(data, L3, 0);
+
+    for (i = 0; i < num_positions; i++) {
+        hbitmap_set(data->hb, positions[i], L1);
+    }
+
+    for (i = 0; i < num_positions; i++) {
+        hbitmap_deserialize_zeroes(data->hb, positions[i], min_l1, true);
+        hbitmap_iter_init(&iter, data->hb, 0);
+        next = hbitmap_iter_next(&iter);
+        if (i == num_positions - 1) {
+            g_assert_cmpint(next, ==, -1);
+        } else {
+            g_assert_cmpint(next, ==, positions[i + 1]);
+        }
+    }
+}
+
 static void hbitmap_test_add(const char *testpath,
                              void (*test_func)(TestHBitmapData *data,
                                                const void *user_data))
{
@@ -683,6 +940,21 @@ int main(int argc, char **argv)
                      test_hbitmap_truncate_grow_large);
     hbitmap_test_add("/hbitmap/truncate/shrink/large",
                      test_hbitmap_truncate_shrink_large);
+
+    hbitmap_test_add("/hbitmap/meta/zero", test_hbitmap_meta_zero);
+    hbitmap_test_add("/hbitmap/meta/one", test_hbitmap_meta_one);
+    hbitmap_test_add("/hbitmap/meta/byte", test_hbitmap_meta_byte);
+    hbitmap_test_add("/hbitmap/meta/word", test_hbitmap_meta_word);
+    hbitmap_test_add("/hbitmap/meta/sector", test_hbitmap_meta_sector);
+
+    hbitmap_test_add("/hbitmap/serialize/granularity",
+                     test_hbitmap_serialize_granularity);
+    hbitmap_test_add("/hbitmap/serialize/basic",
+                     test_hbitmap_serialize_basic);
+    hbitmap_test_add("/hbitmap/serialize/part",
+                     test_hbitmap_serialize_part);
+    hbitmap_test_add("/hbitmap/serialize/zeroes",
+                     test_hbitmap_serialize_zeroes);
     g_test_run();
 
     return 0;
diff --git a/util/hbitmap.c b/util/hbitmap.c
index 99fd2ba37b..5d1a21ce91 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -78,6 +78,9 @@ struct HBitmap {
      */
     int granularity;
 
+    /* A meta dirty bitmap to track the dirtiness of bits in this HBitmap. */
+    HBitmap *meta;
+
     /* A number of progressively less coarse bitmaps (i.e. level 0 is the
      * coarsest).  Each bit in level N represents a word in level N+1 that
      * has a set bit, except the last level where each bit represents the
@@ -209,25 +212,27 @@ static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last)
 }
 
 /* Setting starts at the last layer and propagates up if an element
- * changes from zero to non-zero.
+ * changes.
  */
 static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last)
 {
     unsigned long mask;
-    bool changed;
+    unsigned long old;
 
     assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
     assert(start <= last);
 
     mask = 2UL << (last & (BITS_PER_LONG - 1));
     mask -= 1UL << (start & (BITS_PER_LONG - 1));
-    changed = (*elem == 0);
+    old = *elem;
     *elem |= mask;
-    return changed;
+    return old != *elem;
 }
 
-/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
-static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last)
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)...
+ * Returns true if at least one bit is changed. */
+static bool hb_set_between(HBitmap *hb, int level, uint64_t start,
+                           uint64_t last)
 {
     size_t pos = start >> BITS_PER_LEVEL;
     size_t lastpos = last >> BITS_PER_LEVEL;
@@ -256,23 +261,28 @@ static void hb_set_between(HBitmap *hb, int level, uint64_t start, uint64_t last
     if (level > 0 && changed) {
         hb_set_between(hb, level - 1, pos, lastpos);
     }
+    return changed;
 }
 
 void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
 {
     /* Compute range in the last layer.  */
+    uint64_t first, n;
     uint64_t last = start + count - 1;
 
     trace_hbitmap_set(hb, start, count,
                       start >> hb->granularity, last >> hb->granularity);
 
-    start >>= hb->granularity;
+    first = start >> hb->granularity;
     last >>= hb->granularity;
-    count = last - start + 1;
     assert(last < hb->size);
+    n = last - first + 1;
 
-    hb->count += count - hb_count_between(hb, start, last);
-    hb_set_between(hb, HBITMAP_LEVELS - 1, start, last);
+    hb->count += n - hb_count_between(hb, first, last);
+    if (hb_set_between(hb, HBITMAP_LEVELS - 1, first, last) &&
+        hb->meta) {
+        hbitmap_set(hb->meta, start, count);
+    }
 }
 
 /* Resetting works the other way round: propagate up if the new
@@ -293,8 +303,10 @@ static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t last)
     return blanked;
 }
 
-/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... */
-static void hb_reset_between(HBitmap *hb, int level, uint64_t start, uint64_t last)
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)...
+ * Returns true if at least one bit is changed. */
+static bool hb_reset_between(HBitmap *hb, int level, uint64_t start,
+                             uint64_t last)
 {
     size_t pos = start >> BITS_PER_LEVEL;
     size_t lastpos = last >> BITS_PER_LEVEL;
@@ -337,22 +349,29 @@ static void hb_reset_between(HBitmap *hb, int level, uint64_t start, uint64_t la
     if (level > 0 && changed) {
         hb_reset_between(hb, level - 1, pos, lastpos);
     }
+
+    return changed;
+
 }
 
 void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count)
 {
     /* Compute range in the last layer.  */
+    uint64_t first;
     uint64_t last = start + count - 1;
 
     trace_hbitmap_reset(hb, start, count,
                        start >> hb->granularity, last >> hb->granularity);
 
-    start >>= hb->granularity;
+    first = start >> hb->granularity;
     last >>= hb->granularity;
     assert(last < hb->size);
 
-    hb->count -= hb_count_between(hb, start, last);
-    hb_reset_between(hb, HBITMAP_LEVELS - 1, start, last);
+    hb->count -= hb_count_between(hb, first, last);
+    if (hb_reset_between(hb, HBITMAP_LEVELS - 1, first, last) &&
+        hb->meta) {
+        hbitmap_set(hb->meta, start, count);
+    }
 }
 
 void hbitmap_reset_all(HBitmap *hb)
@@ -378,9 +397,147 @@ bool hbitmap_get(const HBitmap *hb, uint64_t item)
     return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0;
 }
 
+uint64_t hbitmap_serialization_granularity(const HBitmap *hb)
+{
+    /* Require at least 64 bit granularity to be safe on both 64 bit and 32 bit
+     * hosts. */
+    return 64 << hb->granularity;
+}
+
+/* Start should be aligned to serialization granularity, chunk size should be
+ * aligned to serialization granularity too, except for last chunk.
+ */
+static void serialization_chunk(const HBitmap *hb,
+                                uint64_t start, uint64_t count,
+                                unsigned long **first_el, uint64_t *el_count)
+{
+    uint64_t last = start + count - 1;
+    uint64_t gran = hbitmap_serialization_granularity(hb);
+
+    assert((start & (gran - 1)) == 0);
+    assert((last >> hb->granularity) < hb->size);
+    if ((last >> hb->granularity) != hb->size - 1) {
+        assert((count & (gran - 1)) == 0);
+    }
+
+    start = (start >> hb->granularity) >> BITS_PER_LEVEL;
+    last = (last >> hb->granularity) >> BITS_PER_LEVEL;
+
+    *first_el = &hb->levels[HBITMAP_LEVELS - 1][start];
+    *el_count = last - start + 1;
+}
+
+uint64_t hbitmap_serialization_size(const HBitmap *hb,
+                                    uint64_t start, uint64_t count)
+{
+    uint64_t el_count;
+    unsigned long *cur;
+
+    if (!count) {
+        return 0;
+    }
+    serialization_chunk(hb, start, count, &cur, &el_count);
+
+    return el_count * sizeof(unsigned long);
+}
+
+void hbitmap_serialize_part(const HBitmap *hb, uint8_t *buf,
+                            uint64_t start, uint64_t count)
+{
+    uint64_t el_count;
+    unsigned long *cur, *end;
+
+    if (!count) {
+        return;
+    }
+    serialization_chunk(hb, start, count, &cur, &el_count);
+    end = cur + el_count;
+
+    while (cur != end) {
+        unsigned long el =
+            (BITS_PER_LONG == 32 ? cpu_to_le32(*cur) : cpu_to_le64(*cur));
+
+        memcpy(buf, &el, sizeof(el));
+        buf += sizeof(el);
+        cur++;
+    }
+}
+
+void hbitmap_deserialize_part(HBitmap *hb, uint8_t *buf,
+                              uint64_t start, uint64_t count,
+                              bool finish)
+{
+    uint64_t el_count;
+    unsigned long *cur, *end;
+
+    if (!count) {
+        return;
+    }
+    serialization_chunk(hb, start, count, &cur, &el_count);
+    end = cur + el_count;
+
+    while (cur != end) {
+        memcpy(cur, buf, sizeof(*cur));
+
+        if (BITS_PER_LONG == 32) {
+            le32_to_cpus((uint32_t *)cur);
+        } else {
+            le64_to_cpus((uint64_t *)cur);
+        }
+
+        buf += sizeof(unsigned long);
+        cur++;
+    }
+    if (finish) {
+        hbitmap_deserialize_finish(hb);
+    }
+}
+
+void hbitmap_deserialize_zeroes(HBitmap *hb, uint64_t start, uint64_t count,
+                                bool finish)
+{
+    uint64_t el_count;
+    unsigned long *first;
+
+    if (!count) {
+        return;
+    }
+    serialization_chunk(hb, start, count, &first, &el_count);
+
+    memset(first, 0, el_count * sizeof(unsigned long));
+    if (finish) {
+        hbitmap_deserialize_finish(hb);
+    }
+}
+
+void hbitmap_deserialize_finish(HBitmap *bitmap)
+{
+    int64_t i, size, prev_size;
+    int lev;
+
+    /* restore levels starting from penultimate to zero level, assuming
+     * that the last level is ok */
+    size = MAX((bitmap->size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+    for (lev = HBITMAP_LEVELS - 1; lev-- > 0; ) {
+        prev_size = size;
+        size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+        memset(bitmap->levels[lev], 0, size * sizeof(unsigned long));
+
+        for (i = 0; i < prev_size; ++i) {
+            if (bitmap->levels[lev + 1][i]) {
+                bitmap->levels[lev][i >> BITS_PER_LEVEL] |=
+                    1UL << (i & (BITS_PER_LONG - 1));
+            }
+        }
+    }
+
+    bitmap->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);
+}
+
 void hbitmap_free(HBitmap *hb)
 {
     unsigned i;
+    assert(!hb->meta);
     for (i = HBITMAP_LEVELS; i-- > 0; ) {
         g_free(hb->levels[i]);
     }
@@ -458,6 +615,9 @@ void hbitmap_truncate(HBitmap *hb, uint64_t size)
                    (size - old) * sizeof(*hb->levels[i]));
         }
     }
+    if (hb->meta) {
+        hbitmap_truncate(hb->meta, hb->size << hb->granularity);
+    }
 }
 
@@ -493,3 +653,19 @@ bool hbitmap_merge(HBitmap *a, const HBitmap *b)
 
     return true;
 }
+
+HBitmap *hbitmap_create_meta(HBitmap *hb, int chunk_size)
+{
+    assert(!(chunk_size & (chunk_size - 1)));
+    assert(!hb->meta);
+    hb->meta = hbitmap_alloc(hb->size << hb->granularity,
+                             hb->granularity + ctz32(chunk_size));
+    return hb->meta;
+}
+
+void hbitmap_free_meta(HBitmap *hb)
+{
+    assert(hb->meta);
+    hbitmap_free(hb->meta);
+    hb->meta = NULL;
+}
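
A few usage notes on the APIs introduced above. These are illustrative sketches added for explanation, not part of the diff.

The backup.c and mirror.c hunks converge on a single iterator lifecycle. A minimal sketch, assuming "bitmap" came from bdrv_create_dirty_bitmap() and omitting error handling:

    /* Heap-allocated iterator; replaces the caller-owned HBitmapIter. */
    BdrvDirtyBitmapIter *iter = bdrv_dirty_iter_new(bitmap, 0);
    int64_t sector;

    while ((sector = bdrv_dirty_iter_next(iter)) != -1) {
        /* process the dirty sector(s) starting at "sector" */
    }
    bdrv_dirty_iter_free(iter);    /* drops the active_iterators reference */

While any iterator is alive, bitmap->active_iterators is non-zero, which is exactly what the new asserts in bdrv_dirty_bitmap_truncate() and bdrv_do_release_matching_dirty_bitmap() check: a bitmap must not be resized or released out from under a live iterator.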
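The @chunk_size argument of bdrv_create_meta_dirty_bitmap() is easiest to see with worked numbers. One meta bit tracks chunk_size bytes of dirty-bitmap data, i.e. chunk_size * BITS_PER_BYTE dirty bits (that is the hbitmap_create_meta(bitmap->bitmap, chunk_size * BITS_PER_BYTE) call above). Taking an illustrative dirty-bitmap granularity of 64 KiB and chunk_size = 4096: one meta bit then covers 4096 * 8 = 32768 dirty bits, i.e. 32768 * 64 KiB = 2 GiB of guest data, so a write anywhere in that window flips at most a handful of meta bits. The 64 KiB and 4096 figures are assumptions for the example, not values mandated by the patch.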
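The serialization functions are designed for a chunked round trip. A sketch under the patch's alignment rules (the start, and the count of every chunk but the last, must be aligned to bdrv_dirty_bitmap_serialization_align(); names are illustrative and error handling is omitted):

    uint64_t len = bdrv_dirty_bitmap_serialization_size(bitmap, start, count);
    uint8_t *buf = g_malloc(len);

    bdrv_dirty_bitmap_serialize_part(bitmap, buf, start, count);
    /* ... store or transmit buf ... */

    bdrv_dirty_bitmap_deserialize_part(bitmap, buf, start, count, false);
    /* further _part/_zeroes chunks may follow; finish exactly once: */
    bdrv_dirty_bitmap_deserialize_finish(bitmap);
    g_free(buf);

The finish step is what rebuilds levels 0 through HBITMAP_LEVELS - 2 from the last level (see hbitmap_deserialize_finish() above), which is why a bitmap deserialized with finish=false must not be used until it has run.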
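Finally, the new qemu-nbd --fork option can be exercised the way the updated iotest 162 does, e.g. qemu-nbd -k "$PWD/42" -f raw --fork null-co://. The intent is that the parent only exits once the server is running, with early error messages from the child relayed through the temporary stderr pipe (the dup()/dup2() handling in main() above), so scripts no longer need a "sleep" to wait for the socket.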