Merge tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu into staging

Block patches

- Fix padding of unaligned vectored requests to match the host alignment
  for vectors with 1023 or 1024 buffers (see the sketch after this list)
- Refactor and fix bugs in the parallels driver's image check functionality
- Add an option to the qcow2 driver to retain (qcow2-level) allocations
  on discard requests from the guest (while still forwarding the discard
  to the lower level and marking the range as zero)
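
To make the first item above concrete, here is a minimal sketch (an editor's illustration, not part of the pull request) of the arithmetic: padding an unaligned request adds up to one vector element at the head and one at the tail, so guest vectors that already use 1023 or 1024 of the 1024 allowed elements (IOV_MAX on Linux) cannot be padded by simple concatenation.

#include <assert.h>

int main(void)
{
    /* IOV_MAX is 1024 on Linux; QEMU takes the real value from the system headers */
    enum { IOV_MAX = 1024 };

    int head = 1, tail = 1;   /* one padding element on each unaligned end */

    /* 1023 guest elements plus head and tail padding: 1025 > IOV_MAX */
    assert(head + 1023 + tail > IOV_MAX);

    /* 1024 guest elements plus even a single padding element: 1025 > IOV_MAX */
    assert(1024 + head > IOV_MAX);

    return 0;
}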

# -----BEGIN PGP SIGNATURE-----
#
# iQJGBAABCAAwFiEEy2LXoO44KeRfAE00ofpA0JgBnN8FAmR+AT4SHGhyZWl0ekBy
# ZWRoYXQuY29tAAoJEKH6QNCYAZzfnboQAKD6YrreZLoseomRfqOAoApSf6yOdcHk
# 6kfsvzwzjosomsF1Pkzm4851vX5PyDqTdeu0iViM+pxanVO1b494q1P4VcAERqMB
# iZVs68R6M0l6HV9btWFGm+ibHJf4FapdntkIdwog1ka5TIhw5oDWCVNLigjhIoRv
# sM37Bgf14kC3sFTR++0HESsyU1eUP5gJjwJbPZ2IgJBmzYay0is1z5nHA/3VUswu
# 8dKnGQDsv62EtlK7PK8cU2BhLOeNi6Wr3bAb6Wf2QLB5e0qRb7oAkqNx5/UcTznk
# a3XMC1aiWhYvM/+DaYIpQUcIPgA8xQ1KHKeD6WjbGfLgZBqseX0aGWMByUsiY8Bo
# +BPIBnUDrbiPnAKB/XLQfnzlE+s7121/JpEbB7AkZqVFRGuw8Wur4tbc2fzvy8Pw
# x/uQfv3ZPi/2Lf6u7hv/TVHubXi8jucVgx3Ubu5Jeo3901S4/KOQBQ4BQ/GYIGQX
# 38ijSROcEd0eQJ1mTKPEctouxjSZCghNSbrn9DfsL1V3VWqWNKKGCU3hM+RQ1SJT
# 688qvnyYt8QZfTsiDSHR/GfKsufG0DkoqE7c9IhSEPohecAH8Rrc3HcLut7fuwD2
# gCFQhm68CPwwRmBjPCY6Zi1RDzeOyFBSWN31T6t0yTb4OHJ/3/cSZVBJtwwkOVbx
# zwabHDNdY5Kw
# =GuoL
# -----END PGP SIGNATURE-----
# gpg: Signature made Mon 05 Jun 2023 08:37:34 AM PDT
# gpg:                using RSA key CB62D7A0EE3829E45F004D34A1FA40D098019CDF
# gpg:                issuer "hreitz@redhat.com"
# gpg: Good signature from "Hanna Reitz <hreitz@redhat.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: CB62 D7A0 EE38 29E4 5F00  4D34 A1FA 40D0 9801 9CDF

* tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu:
  qcow2: add discard-no-unref option
  parallels: Incorrect condition in out-of-image check
  parallels: Replace qemu_co_mutex_lock by WITH_QEMU_LOCK_GUARD
  parallels: Move statistic collection to a separate function
  parallels: Move check of leaks to a separate function
  parallels: Fix statistics calculation
  parallels: Move check of cluster outside image to a separate function
  parallels: Move check of unclean image to a separate function
  parallels: Use generic infrastructure for BAT writing in parallels_co_check()
  parallels: create parallels_set_bat_entry_helper() to assign BAT value
  parallels: Fix image_end_offset and data_end after out-of-image check
  parallels: Fix high_off calculation in parallels_co_check()
  parallels: Out of image offset in BAT leads to image inflation
  iotests/iov-padding: New test
  util/iov: Remove qemu_iovec_init_extended()
  block: Collapse padded I/O vecs exceeding IOV_MAX
  util/iov: Make qiov_slice() public

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Committed by Richard Henderson, 2023-06-05 10:27:31 -07:00
commit b52daaf2c8
11 changed files with 526 additions and 154 deletions

@ -1441,6 +1441,14 @@ out:
* @merge_reads is true for small requests,
* if @buf_len == @head + bytes + @tail. In this case it is possible that both
* head and tail exist but @buf_len == align and @tail_buf == @buf.
*
* @write is true for write requests, false for read requests.
*
* If padding makes the vector too long (exceeding IOV_MAX), then we need to
* merge existing vector elements into a single one. @collapse_bounce_buf acts
* as the bounce buffer in such cases. @pre_collapse_qiov has the pre-collapse
* I/O vector elements, so that for read requests the data can be copied back
* after the read is done.
*/
typedef struct BdrvRequestPadding {
uint8_t *buf;
@ -1449,11 +1457,17 @@ typedef struct BdrvRequestPadding {
size_t head;
size_t tail;
bool merge_reads;
bool write;
QEMUIOVector local_qiov;
uint8_t *collapse_bounce_buf;
size_t collapse_len;
QEMUIOVector pre_collapse_qiov;
} BdrvRequestPadding;
static bool bdrv_init_padding(BlockDriverState *bs,
int64_t offset, int64_t bytes,
bool write,
BdrvRequestPadding *pad)
{
int64_t align = bs->bl.request_alignment;
@ -1485,6 +1499,8 @@ static bool bdrv_init_padding(BlockDriverState *bs,
pad->tail_buf = pad->buf + pad->buf_len - align;
}
pad->write = write;
return true;
}
@ -1549,8 +1565,23 @@ zero_mem:
return 0;
}
static void bdrv_padding_destroy(BdrvRequestPadding *pad)
/**
* Free *pad's associated buffers, and perform any necessary finalization steps.
*/
static void bdrv_padding_finalize(BdrvRequestPadding *pad)
{
if (pad->collapse_bounce_buf) {
if (!pad->write) {
/*
* If padding required elements in the vector to be collapsed into a
* bounce buffer, copy the bounce buffer content back
*/
qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0,
pad->collapse_bounce_buf, pad->collapse_len);
}
qemu_vfree(pad->collapse_bounce_buf);
qemu_iovec_destroy(&pad->pre_collapse_qiov);
}
if (pad->buf) {
qemu_vfree(pad->buf);
qemu_iovec_destroy(&pad->local_qiov);
@ -1558,6 +1589,101 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
memset(pad, 0, sizeof(*pad));
}
/*
* Create pad->local_qiov by wrapping @iov in the padding head and tail, while
* ensuring that the resulting vector will not exceed IOV_MAX elements.
*
* To ensure this, when necessary, the first two or three elements of @iov are
* merged into pad->collapse_bounce_buf and replaced by a reference to that
* bounce buffer in pad->local_qiov.
*
* After performing a read request, the data from the bounce buffer must be
* copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()).
*/
static int bdrv_create_padded_qiov(BlockDriverState *bs,
BdrvRequestPadding *pad,
struct iovec *iov, int niov,
size_t iov_offset, size_t bytes)
{
int padded_niov, surplus_count, collapse_count;
/* Assert this invariant */
assert(niov <= IOV_MAX);
/*
* Cannot pad if resulting length would exceed SIZE_MAX. Returning an error
* to the guest is not ideal, but there is little else we can do. At least
* this will practically never happen on 64-bit systems.
*/
if (SIZE_MAX - pad->head < bytes ||
SIZE_MAX - pad->head - bytes < pad->tail)
{
return -EINVAL;
}
/* Length of the resulting IOV if we just concatenated everything */
padded_niov = !!pad->head + niov + !!pad->tail;
qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX));
if (pad->head) {
qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head);
}
/*
* If padded_niov > IOV_MAX, we cannot just concatenate everything.
* Instead, merge the first two or three elements of @iov to reduce the
* number of vector elements as necessary.
*/
if (padded_niov > IOV_MAX) {
/*
* Only the head and tail can have caused the number of entries to exceed
* IOV_MAX, so we can exceed it by the head and tail at most. We need
* to reduce the number of elements by `surplus_count`, so we merge that
* many elements plus one into one element.
*/
surplus_count = padded_niov - IOV_MAX;
assert(surplus_count <= !!pad->head + !!pad->tail);
collapse_count = surplus_count + 1;
/*
* Move the elements to collapse into `pad->pre_collapse_qiov`, then
* advance `iov` (and associated variables) by those elements.
*/
qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count);
qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov,
collapse_count, iov_offset, SIZE_MAX);
iov += collapse_count;
iov_offset = 0;
niov -= collapse_count;
bytes -= pad->pre_collapse_qiov.size;
/*
* Construct the bounce buffer to match the length of the to-collapse
* vector elements, and for write requests, initialize it with the data
* from those elements. Then add it to `pad->local_qiov`.
*/
pad->collapse_len = pad->pre_collapse_qiov.size;
pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len);
if (pad->write) {
qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0,
pad->collapse_bounce_buf, pad->collapse_len);
}
qemu_iovec_add(&pad->local_qiov,
pad->collapse_bounce_buf, pad->collapse_len);
}
qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes);
if (pad->tail) {
qemu_iovec_add(&pad->local_qiov,
pad->buf + pad->buf_len - pad->tail, pad->tail);
}
assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX));
return 0;
}
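/*
 * Editor's note, not part of the patch: a worked example of the collapse
 * arithmetic above, assuming a guest vector with niov == IOV_MAX == 1024
 * elements that needs padding on both ends:
 *
 *   padded_niov    = 1 + 1024 + 1      = 1026
 *   surplus_count  = 1026 - IOV_MAX    = 2
 *   collapse_count = surplus_count + 1 = 3
 *
 * The first three guest elements are moved into pad->pre_collapse_qiov and
 * replaced by a single bounce-buffer element, so pad->local_qiov ends up with
 * 1 (head) + 1 (bounce buffer) + 1021 (remaining guest elements) + 1 (tail)
 * = 1024 == IOV_MAX entries, satisfying the assertion at the end of the
 * function.
 */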
/*
* bdrv_pad_request
*
@ -1565,6 +1691,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
* read of padding, bdrv_padding_rmw_read() should be called separately if
* needed.
*
* @write is true for write requests, false for read requests.
*
* Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
* - on function start they represent original request
* - on failure or when padding is not needed they are unchanged
@ -1573,26 +1701,34 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
static int bdrv_pad_request(BlockDriverState *bs,
QEMUIOVector **qiov, size_t *qiov_offset,
int64_t *offset, int64_t *bytes,
bool write,
BdrvRequestPadding *pad, bool *padded,
BdrvRequestFlags *flags)
{
int ret;
struct iovec *sliced_iov;
int sliced_niov;
size_t sliced_head, sliced_tail;
bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
if (padded) {
*padded = false;
}
return 0;
}
ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
*qiov, *qiov_offset, *bytes,
pad->buf + pad->buf_len - pad->tail,
pad->tail);
sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
&sliced_head, &sliced_tail,
&sliced_niov);
/* Guaranteed by bdrv_check_qiov_request() */
assert(*bytes <= SIZE_MAX);
ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
sliced_head, *bytes);
if (ret < 0) {
bdrv_padding_destroy(pad);
bdrv_padding_finalize(pad);
return ret;
}
*bytes += pad->head + pad->tail;
@ -1659,8 +1795,8 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
flags |= BDRV_REQ_COPY_ON_READ;
}
ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
NULL, &flags);
ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false,
&pad, NULL, &flags);
if (ret < 0) {
goto fail;
}
@ -1670,7 +1806,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
bs->bl.request_alignment,
qiov, qiov_offset, flags);
tracked_request_end(&req);
bdrv_padding_destroy(&pad);
bdrv_padding_finalize(&pad);
fail:
bdrv_dec_in_flight(bs);
@ -2002,7 +2138,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
/* This flag doesn't make sense for padding or zero writes */
flags &= ~BDRV_REQ_REGISTERED_BUF;
padding = bdrv_init_padding(bs, offset, bytes, &pad);
padding = bdrv_init_padding(bs, offset, bytes, true, &pad);
if (padding) {
assert(!(flags & BDRV_REQ_NO_WAIT));
bdrv_make_request_serialising(req, align);
@ -2050,7 +2186,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
}
out:
bdrv_padding_destroy(&pad);
bdrv_padding_finalize(&pad);
return ret;
}
@ -2118,8 +2254,8 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
* bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
* alignment only if there is no ZERO flag.
*/
ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
&padded, &flags);
ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true,
&pad, &padded, &flags);
if (ret < 0) {
return ret;
}
@ -2149,7 +2285,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
qiov, qiov_offset, flags);
bdrv_padding_destroy(&pad);
bdrv_padding_finalize(&pad);
out:
tracked_request_end(&req);


@ -165,6 +165,13 @@ static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
return start_off;
}
static void parallels_set_bat_entry(BDRVParallelsState *s,
uint32_t index, uint32_t offset)
{
s->bat_bitmap[index] = cpu_to_le32(offset);
bitmap_set(s->bat_dirty_bmap, bat_entry_off(index) / s->bat_dirty_block, 1);
}
static int64_t coroutine_fn GRAPH_RDLOCK
allocate_clusters(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, int *pnum)
@ -251,10 +258,8 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
}
for (i = 0; i < to_allocate; i++) {
s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
parallels_set_bat_entry(s, idx + i, s->data_end / s->off_multiplier);
s->data_end += s->tracks;
bitmap_set(s->bat_dirty_bmap,
bat_entry_off(idx + i) / s->bat_dirty_block, 1);
}
return bat2sect(s, idx) + sector_num % s->tracks;
@ -415,16 +420,33 @@ parallels_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
return ret;
}
static int coroutine_fn GRAPH_RDLOCK
parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix)
static void parallels_check_unclean(BlockDriverState *bs,
BdrvCheckResult *res,
BdrvCheckMode fix)
{
BDRVParallelsState *s = bs->opaque;
if (!s->header_unclean) {
return;
}
fprintf(stderr, "%s image was not closed correctly\n",
fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
res->corruptions++;
if (fix & BDRV_FIX_ERRORS) {
/* parallels_close will do the job right */
res->corruptions_fixed++;
s->header_unclean = false;
}
}
static int coroutine_fn GRAPH_RDLOCK
parallels_check_outside_image(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix)
{
BDRVParallelsState *s = bs->opaque;
int64_t size, prev_off, high_off;
int ret;
uint32_t i;
bool flush_bat = false;
int64_t off, high_off, size;
size = bdrv_getlength(bs->file->bs);
if (size < 0) {
@ -432,65 +454,48 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
return size;
}
qemu_co_mutex_lock(&s->lock);
if (s->header_unclean) {
fprintf(stderr, "%s image was not closed correctly\n",
fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
res->corruptions++;
if (fix & BDRV_FIX_ERRORS) {
/* parallels_close will do the job right */
res->corruptions_fixed++;
s->header_unclean = false;
}
}
res->bfi.total_clusters = s->bat_size;
res->bfi.compressed_clusters = 0; /* compression is not supported */
high_off = 0;
prev_off = 0;
for (i = 0; i < s->bat_size; i++) {
int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS;
if (off == 0) {
prev_off = 0;
continue;
}
/* cluster outside the image */
if (off > size) {
off = bat2sect(s, i) << BDRV_SECTOR_BITS;
if (off + s->cluster_size > size) {
fprintf(stderr, "%s cluster %u is outside image\n",
fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
res->corruptions++;
if (fix & BDRV_FIX_ERRORS) {
prev_off = 0;
s->bat_bitmap[i] = 0;
parallels_set_bat_entry(s, i, 0);
res->corruptions_fixed++;
flush_bat = true;
continue;
}
continue;
}
res->bfi.allocated_clusters++;
if (off > high_off) {
if (high_off < off) {
high_off = off;
}
if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
res->bfi.fragmented_clusters++;
}
prev_off = off;
}
ret = 0;
if (flush_bat) {
ret = bdrv_co_pwrite_sync(bs->file, 0, s->header_size, s->header, 0);
if (ret < 0) {
res->check_errors++;
goto out;
}
if (high_off == 0) {
res->image_end_offset = s->data_end << BDRV_SECTOR_BITS;
} else {
res->image_end_offset = high_off + s->cluster_size;
s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS;
}
return 0;
}
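/*
 * Editor's note, not part of the patch: high_off == 0 here means that no BAT
 * entry points inside the image (every entry was either unallocated or has
 * just been cleared as out-of-image), so image_end_offset falls back to the
 * current data_end; otherwise it is the end of the highest in-image cluster,
 * and data_end is updated to match, so that later allocations start from the
 * corrected end of data.
 */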
static int coroutine_fn GRAPH_RDLOCK
parallels_check_leak(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix)
{
BDRVParallelsState *s = bs->opaque;
int64_t size;
int ret;
size = bdrv_getlength(bs->file->bs);
if (size < 0) {
res->check_errors++;
return size;
}
res->image_end_offset = high_off + s->cluster_size;
if (size > res->image_end_offset) {
int64_t count;
count = DIV_ROUND_UP(size - res->image_end_offset, s->cluster_size);
@ -510,14 +515,74 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
if (ret < 0) {
error_report_err(local_err);
res->check_errors++;
goto out;
return ret;
}
res->leaks_fixed += count;
}
}
out:
qemu_co_mutex_unlock(&s->lock);
return 0;
}
static void parallels_collect_statistics(BlockDriverState *bs,
BdrvCheckResult *res,
BdrvCheckMode fix)
{
BDRVParallelsState *s = bs->opaque;
int64_t off, prev_off;
uint32_t i;
res->bfi.total_clusters = s->bat_size;
res->bfi.compressed_clusters = 0; /* compression is not supported */
prev_off = 0;
for (i = 0; i < s->bat_size; i++) {
off = bat2sect(s, i) << BDRV_SECTOR_BITS;
/*
* If BDRV_FIX_ERRORS is not set, out-of-image BAT entries were not
* fixed. Skip unallocated and out-of-image BAT entries.
*/
if (off == 0 || off + s->cluster_size > res->image_end_offset) {
prev_off = 0;
continue;
}
if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
res->bfi.fragmented_clusters++;
}
prev_off = off;
res->bfi.allocated_clusters++;
}
}
static int coroutine_fn GRAPH_RDLOCK
parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
BdrvCheckMode fix)
{
BDRVParallelsState *s = bs->opaque;
int ret;
WITH_QEMU_LOCK_GUARD(&s->lock) {
parallels_check_unclean(bs, res, fix);
ret = parallels_check_outside_image(bs, res, fix);
if (ret < 0) {
return ret;
}
ret = parallels_check_leak(bs, res, fix);
if (ret < 0) {
return ret;
}
parallels_collect_statistics(bs, res, fix);
}
ret = bdrv_co_flush(bs);
if (ret < 0) {
res->check_errors++;
}
return ret;
}
@ -733,6 +798,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
BDRVParallelsState *s = bs->opaque;
ParallelsHeader ph;
int ret, size, i;
int64_t file_nb_sectors;
QemuOpts *opts = NULL;
Error *local_err = NULL;
char *buf;
@ -742,6 +808,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
return ret;
}
file_nb_sectors = bdrv_nb_sectors(bs->file->bs);
if (file_nb_sectors < 0) {
return -EINVAL;
}
ret = bdrv_pread(bs->file, 0, sizeof(ph), &ph, 0);
if (ret < 0) {
goto fail;
@ -806,6 +877,17 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
for (i = 0; i < s->bat_size; i++) {
int64_t off = bat2sect(s, i);
if (off >= file_nb_sectors) {
if (flags & BDRV_O_CHECK) {
continue;
}
error_setg(errp, "parallels: Offset %" PRIi64 " in BAT[%d] entry "
"is larger than file size (%" PRIi64 ")",
off << BDRV_SECTOR_BITS, i,
file_nb_sectors << BDRV_SECTOR_BITS);
ret = -EINVAL;
goto fail;
}
if (off >= s->data_end) {
s->data_end = off + s->tracks;
}


@ -1925,6 +1925,10 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
uint64_t new_l2_bitmap = old_l2_bitmap;
QCow2ClusterType cluster_type =
qcow2_get_cluster_type(bs, old_l2_entry);
bool keep_reference = (cluster_type != QCOW2_CLUSTER_COMPRESSED) &&
!full_discard &&
(s->discard_no_unref &&
type == QCOW2_DISCARD_REQUEST);
/*
* If full_discard is true, the cluster should not read back as zeroes,
@ -1943,10 +1947,22 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
new_l2_entry = new_l2_bitmap = 0;
} else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
if (has_subclusters(s)) {
new_l2_entry = 0;
if (keep_reference) {
new_l2_entry = old_l2_entry;
} else {
new_l2_entry = 0;
}
new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
} else {
new_l2_entry = s->qcow_version >= 3 ? QCOW_OFLAG_ZERO : 0;
if (s->qcow_version >= 3) {
if (keep_reference) {
new_l2_entry |= QCOW_OFLAG_ZERO;
} else {
new_l2_entry = QCOW_OFLAG_ZERO;
}
} else {
new_l2_entry = 0;
}
}
}
@ -1960,8 +1976,16 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
if (has_subclusters(s)) {
set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
}
/* Then decrease the refcount */
qcow2_free_any_cluster(bs, old_l2_entry, type);
if (!keep_reference) {
/* Then decrease the refcount */
qcow2_free_any_cluster(bs, old_l2_entry, type);
} else if (s->discard_passthrough[type] &&
(cluster_type == QCOW2_CLUSTER_NORMAL ||
cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
/* If we keep the reference, still pass the discard on to the data file */
bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
s->cluster_size);
}
}
qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
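/*
 * Editor's summary, not part of the patch: with discard-no-unref enabled, a
 * guest discard request on an allocated, non-compressed cluster of a v3 image
 * keeps the qcow2-level allocation instead of freeing it:
 *
 *   - with subclusters:    the L2 entry is kept as-is and the L2 bitmap is set
 *                          to QCOW_L2_BITMAP_ALL_ZEROES
 *   - without subclusters: QCOW_OFLAG_ZERO is OR-ed into the existing L2 entry
 *
 * The refcount is not decreased; if pass-discard-request is also set, the
 * discard is still forwarded to the data file via bdrv_pdiscard(), so the
 * range reads back as zeroes while the host clusters stay allocated at the
 * qcow2 level.
 */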


@ -682,6 +682,7 @@ static const char *const mutable_opts[] = {
QCOW2_OPT_DISCARD_REQUEST,
QCOW2_OPT_DISCARD_SNAPSHOT,
QCOW2_OPT_DISCARD_OTHER,
QCOW2_OPT_DISCARD_NO_UNREF,
QCOW2_OPT_OVERLAP,
QCOW2_OPT_OVERLAP_TEMPLATE,
QCOW2_OPT_OVERLAP_MAIN_HEADER,
@ -726,6 +727,11 @@ static QemuOptsList qcow2_runtime_opts = {
.type = QEMU_OPT_BOOL,
.help = "Generate discard requests when other clusters are freed",
},
{
.name = QCOW2_OPT_DISCARD_NO_UNREF,
.type = QEMU_OPT_BOOL,
.help = "Do not unreference discarded clusters",
},
{
.name = QCOW2_OPT_OVERLAP,
.type = QEMU_OPT_STRING,
@ -969,6 +975,7 @@ typedef struct Qcow2ReopenState {
bool use_lazy_refcounts;
int overlap_check;
bool discard_passthrough[QCOW2_DISCARD_MAX];
bool discard_no_unref;
uint64_t cache_clean_interval;
QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
} Qcow2ReopenState;
@ -1140,6 +1147,15 @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
r->discard_passthrough[QCOW2_DISCARD_OTHER] =
qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
r->discard_no_unref = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_NO_UNREF,
false);
if (r->discard_no_unref && s->qcow_version < 3) {
error_setg(errp,
"discard-no-unref is only supported since qcow2 version 3");
ret = -EINVAL;
goto fail;
}
switch (s->crypt_method_header) {
case QCOW_CRYPT_NONE:
if (encryptfmt) {
@ -1220,6 +1236,8 @@ static void qcow2_update_options_commit(BlockDriverState *bs,
s->discard_passthrough[i] = r->discard_passthrough[i];
}
s->discard_no_unref = r->discard_no_unref;
if (s->cache_clean_interval != r->cache_clean_interval) {
cache_clean_timer_del(bs);
s->cache_clean_interval = r->cache_clean_interval;


@ -133,6 +133,7 @@
#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
#define QCOW2_OPT_DISCARD_NO_UNREF "discard-no-unref"
#define QCOW2_OPT_OVERLAP "overlap-check"
#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template"
#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header"
@ -385,6 +386,8 @@ typedef struct BDRVQcow2State {
bool discard_passthrough[QCOW2_DISCARD_MAX];
bool discard_no_unref;
int overlap_check; /* bitmask of Qcow2MetadataOverlap values */
bool signaled_corruption;


@ -222,13 +222,11 @@ static inline void *qemu_iovec_buf(QEMUIOVector *qiov)
void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint);
void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
int qemu_iovec_init_extended(
QEMUIOVector *qiov,
void *head_buf, size_t head_len,
QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
void *tail_buf, size_t tail_len);
void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
size_t offset, size_t len);
struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
size_t offset, size_t len,
size_t *head, size_t *tail, int *niov);
int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len);
void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
void qemu_iovec_concat(QEMUIOVector *dst,


@ -3478,6 +3478,17 @@
# @pass-discard-other: whether discard requests for the data source
# should be issued on other occasions where a cluster gets freed
#
# @discard-no-unref: when enabled, discards from the guest will not cause
# cluster allocations to be relinquished. This prevents qcow2 fragmentation
# that would be caused by such discards. Besides potential
# performance degradation, such fragmentation can lead to increased
# allocation of clusters past the end of the image file,
# resulting in image files whose file length can grow much larger
# than their guest disk size would suggest.
# If image file length is of concern (e.g. when storing qcow2
# images directly on block devices), you should consider enabling
# this option. (since 8.1)
#
# @overlap-check: which overlap checks to perform for writes to the
# image, defaults to 'cached' (since 2.2)
#
@ -3516,6 +3527,7 @@
'*pass-discard-request': 'bool',
'*pass-discard-snapshot': 'bool',
'*pass-discard-other': 'bool',
'*discard-no-unref': 'bool',
'*overlap-check': 'Qcow2OverlapChecks',
'*cache-size': 'int',
'*l2-cache-size': 'int',


@ -1431,6 +1431,18 @@ SRST
issued on other occasions where a cluster gets freed
(on/off; default: off)
``discard-no-unref``
When enabled, discards from the guest will not cause cluster
allocations to be relinquished. This prevents qcow2 fragmentation
that would be caused by such discards. Besides potential
performance degradation, such fragmentation can lead to increased
allocation of clusters past the end of the image file,
resulting in image files whose file length can grow much larger
than their guest disk size would suggest.
If image file length is of concern (e.g. when storing qcow2
images directly on block devices), you should consider enabling
this option.
``overlap-check``
Which overlap checks to perform for writes to the image
(none/constant/cached/all; default: cached). For details or


@ -0,0 +1,85 @@
#!/usr/bin/env bash
# group: rw quick
#
# Check the interaction of request padding (to fit alignment restrictions) with
# vectored I/O from the guest
#
# Copyright Red Hat
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
seq=$(basename $0)
echo "QA output created by $seq"
status=1 # failure is the default!
_cleanup()
{
_cleanup_test_img
}
trap "_cleanup; exit \$status" 0 1 2 3 15
# get standard environment, filters and checks
cd ..
. ./common.rc
. ./common.filter
_supported_fmt raw
_supported_proto file
_make_test_img 1M
IMGSPEC="driver=blkdebug,align=4096,image.driver=file,image.filename=$TEST_IMG"
# Four combinations:
# - Offset 4096, length 1023 * 512 + 512: Fully aligned to 4k
# - Offset 4096, length 1023 * 512 + 4096: Head is aligned, tail is not
# - Offset 512, length 1023 * 512 + 512: Neither head nor tail are aligned
# - Offset 512, length 1023 * 512 + 4096: Tail is aligned, head is not
for start_offset in 4096 512; do
for last_element_length in 512 4096; do
length=$((1023 * 512 + $last_element_length))
echo
echo "== performing 1024-element vectored requests to image (offset: $start_offset; length: $length) =="
# Fill with data for testing
$QEMU_IO -c 'write -P 1 0 1M' "$TEST_IMG" | _filter_qemu_io
# 1023 512-byte buffers, and then one with length $last_element_length
cmd_params="-P 2 $start_offset $(yes 512 | head -n 1023 | tr '\n' ' ') $last_element_length"
QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
-c "writev $cmd_params" \
--image-opts \
"$IMGSPEC" \
| _filter_qemu_io
# Read all patterns -- read the part we just wrote with writev twice,
# once "normally", and once with a readv, so we see that that works, too
QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
-c "read -P 1 0 $start_offset" \
-c "read -P 2 $start_offset $length" \
-c "readv $cmd_params" \
-c "read -P 1 $((start_offset + length)) $((1024 * 1024 - length - start_offset))" \
--image-opts \
"$IMGSPEC" \
| _filter_qemu_io
done
done
# success, all done
echo "*** done"
rm -f $seq.full
status=0


@ -0,0 +1,59 @@
QA output created by iov-padding
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576
== performing 1024-element vectored requests to image (offset: 4096; length: 524288) ==
wrote 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 524288/524288 bytes at offset 4096
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 4096/4096 bytes at offset 0
4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 524288/524288 bytes at offset 4096
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 524288/524288 bytes at offset 4096
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 520192/520192 bytes at offset 528384
508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
== performing 1024-element vectored requests to image (offset: 4096; length: 527872) ==
wrote 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 527872/527872 bytes at offset 4096
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 4096/4096 bytes at offset 0
4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 527872/527872 bytes at offset 4096
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 527872/527872 bytes at offset 4096
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 516608/516608 bytes at offset 531968
504.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
== performing 1024-element vectored requests to image (offset: 512; length: 524288) ==
wrote 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 524288/524288 bytes at offset 512
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 512/512 bytes at offset 0
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 524288/524288 bytes at offset 512
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 524288/524288 bytes at offset 512
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 523776/523776 bytes at offset 524800
511.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
== performing 1024-element vectored requests to image (offset: 512; length: 527872) ==
wrote 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 527872/527872 bytes at offset 512
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 512/512 bytes at offset 0
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 527872/527872 bytes at offset 512
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 527872/527872 bytes at offset 512
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 520192/520192 bytes at offset 528384
508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
*** done


@ -378,15 +378,15 @@ static struct iovec *iov_skip_offset(struct iovec *iov, size_t offset,
}
/*
* qiov_slice
* qemu_iovec_slice
*
* Find subarray of iovec's, containing requested range. @head would
* be offset in first iov (returned by the function), @tail would be
* count of extra bytes in last iovec (returned iov + @niov - 1).
*/
static struct iovec *qiov_slice(QEMUIOVector *qiov,
size_t offset, size_t len,
size_t *head, size_t *tail, int *niov)
struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
size_t offset, size_t len,
size_t *head, size_t *tail, int *niov)
{
struct iovec *iov, *end_iov;
@ -411,75 +411,11 @@ int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len)
size_t head, tail;
int niov;
qiov_slice(qiov, offset, len, &head, &tail, &niov);
qemu_iovec_slice(qiov, offset, len, &head, &tail, &niov);
return niov;
}
/*
* Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov,
* and @tail_buf buffer into new qiov.
*/
int qemu_iovec_init_extended(
QEMUIOVector *qiov,
void *head_buf, size_t head_len,
QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
void *tail_buf, size_t tail_len)
{
size_t mid_head, mid_tail;
int total_niov, mid_niov = 0;
struct iovec *p, *mid_iov = NULL;
assert(mid_qiov->niov <= IOV_MAX);
if (SIZE_MAX - head_len < mid_len ||
SIZE_MAX - head_len - mid_len < tail_len)
{
return -EINVAL;
}
if (mid_len) {
mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len,
&mid_head, &mid_tail, &mid_niov);
}
total_niov = !!head_len + mid_niov + !!tail_len;
if (total_niov > IOV_MAX) {
return -EINVAL;
}
if (total_niov == 1) {
qemu_iovec_init_buf(qiov, NULL, 0);
p = &qiov->local_iov;
} else {
qiov->niov = qiov->nalloc = total_niov;
qiov->size = head_len + mid_len + tail_len;
p = qiov->iov = g_new(struct iovec, qiov->niov);
}
if (head_len) {
p->iov_base = head_buf;
p->iov_len = head_len;
p++;
}
assert(!mid_niov == !mid_len);
if (mid_niov) {
memcpy(p, mid_iov, mid_niov * sizeof(*p));
p[0].iov_base = (uint8_t *)p[0].iov_base + mid_head;
p[0].iov_len -= mid_head;
p[mid_niov - 1].iov_len -= mid_tail;
p += mid_niov;
}
if (tail_len) {
p->iov_base = tail_buf;
p->iov_len = tail_len;
}
return 0;
}
/*
* Check if the contents of subrange of qiov data is all zeroes.
*/
@ -511,14 +447,21 @@ bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
size_t offset, size_t len)
{
int ret;
struct iovec *slice_iov;
int slice_niov;
size_t slice_head, slice_tail;
assert(source->size >= len);
assert(source->size - len >= offset);
/* We shrink the request, so we can't overflow neither size_t nor MAX_IOV */
ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0);
assert(ret == 0);
slice_iov = qemu_iovec_slice(source, offset, len,
&slice_head, &slice_tail, &slice_niov);
if (slice_niov == 1) {
qemu_iovec_init_buf(qiov, slice_iov[0].iov_base + slice_head, len);
} else {
qemu_iovec_init(qiov, slice_niov);
qemu_iovec_concat_iov(qiov, slice_iov, slice_niov, slice_head, len);
}
}
void qemu_iovec_destroy(QEMUIOVector *qiov)