Merge tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu into staging

Block patches:

- Fix padding of unaligned vectored requests to match the host alignment
  for vectors with 1023 or 1024 buffers
- Refactor and fix bugs in parallels's image check functionality
- Add an option to the qcow2 driver to retain (qcow2-level) allocations on
  discard requests from the guest (while still forwarding the discard to
  the lower level and marking the range as zero)

# gpg: Signature made Mon 05 Jun 2023 08:37:34 AM PDT
# gpg:                using RSA key CB62D7A0EE3829E45F004D34A1FA40D098019CDF
# gpg:                issuer "hreitz@redhat.com"
# gpg: Good signature from "Hanna Reitz <hreitz@redhat.com>" [unknown]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: CB62 D7A0 EE38 29E4 5F00 4D34 A1FA 40D0 9801 9CDF

* tag 'pull-block-2023-06-05' of https://gitlab.com/hreitz/qemu:
  qcow2: add discard-no-unref option
  parallels: Incorrect condition in out-of-image check
  parallels: Replace qemu_co_mutex_lock by WITH_QEMU_LOCK_GUARD
  parallels: Move statistic collection to a separate function
  parallels: Move check of leaks to a separate function
  parallels: Fix statistics calculation
  parallels: Move check of cluster outside image to a separate function
  parallels: Move check of unclean image to a separate function
  parallels: Use generic infrastructure for BAT writing in parallels_co_check()
  parallels: create parallels_set_bat_entry_helper() to assign BAT value
  parallels: Fix image_end_offset and data_end after out-of-image check
  parallels: Fix high_off calculation in parallels_co_check()
  parallels: Out of image offset in BAT leads to image inflation
  iotests/iov-padding: New test
  util/iov: Remove qemu_iovec_init_extended()
  block: Collapse padded I/O vecs exceeding IOV_MAX
  util/iov: Make qiov_slice() public

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
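The arithmetic behind the IOV_MAX fix can be checked by hand. Below is a minimal standalone sketch (plain C, not QEMU code; all names are illustrative) of how a 1024-element guest vector that needs both head and tail padding is brought back under the limit by collapsing surplus + 1 leading elements into one bounce buffer:

    #include <stdio.h>

    #define IOV_MAX 1024   /* POSIX limit on iovec elements per request */

    int main(void)
    {
        int niov = 1024;           /* guest-supplied buffers */
        int head = 1, tail = 1;    /* both ends need alignment padding */

        int padded_niov = head + niov + tail;   /* 1026 > IOV_MAX */
        int surplus = padded_niov - IOV_MAX;    /* 2 elements too many */
        int collapse = surplus + 1;             /* merge 3 elements into 1 */

        printf("padded=%d surplus=%d final=%d\n", padded_niov, surplus,
               padded_niov - (collapse - 1));   /* final = 1024 == IOV_MAX */
        return 0;
    }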
commit b52daaf2c8

block/io.c (166 lines changed):
@@ -1441,6 +1441,14 @@ out:
  * @merge_reads is true for small requests,
  * if @buf_len == @head + bytes + @tail. In this case it is possible that both
  * head and tail exist but @buf_len == align and @tail_buf == @buf.
+ *
+ * @write is true for write requests, false for read requests.
+ *
+ * If padding makes the vector too long (exceeding IOV_MAX), then we need to
+ * merge existing vector elements into a single one. @collapse_bounce_buf acts
+ * as the bounce buffer in such cases. @pre_collapse_qiov has the pre-collapse
+ * I/O vector elements, so for read requests, the data can be copied back after
+ * the read is done.
  */
 typedef struct BdrvRequestPadding {
     uint8_t *buf;
@@ -1449,11 +1457,17 @@ typedef struct BdrvRequestPadding {
     size_t head;
     size_t tail;
     bool merge_reads;
+    bool write;
     QEMUIOVector local_qiov;
+
+    uint8_t *collapse_bounce_buf;
+    size_t collapse_len;
+    QEMUIOVector pre_collapse_qiov;
 } BdrvRequestPadding;
 
 static bool bdrv_init_padding(BlockDriverState *bs,
                               int64_t offset, int64_t bytes,
+                              bool write,
                               BdrvRequestPadding *pad)
 {
     int64_t align = bs->bl.request_alignment;
@@ -1485,6 +1499,8 @@ static bool bdrv_init_padding(BlockDriverState *bs,
         pad->tail_buf = pad->buf + pad->buf_len - align;
     }
 
+    pad->write = write;
+
     return true;
 }
@@ -1549,8 +1565,23 @@ zero_mem:
     return 0;
 }
 
-static void bdrv_padding_destroy(BdrvRequestPadding *pad)
+/**
+ * Free *pad's associated buffers, and perform any necessary finalization steps.
+ */
+static void bdrv_padding_finalize(BdrvRequestPadding *pad)
 {
+    if (pad->collapse_bounce_buf) {
+        if (!pad->write) {
+            /*
+             * If padding required elements in the vector to be collapsed into a
+             * bounce buffer, copy the bounce buffer content back
+             */
+            qemu_iovec_from_buf(&pad->pre_collapse_qiov, 0,
+                                pad->collapse_bounce_buf, pad->collapse_len);
+        }
+        qemu_vfree(pad->collapse_bounce_buf);
+        qemu_iovec_destroy(&pad->pre_collapse_qiov);
+    }
     if (pad->buf) {
         qemu_vfree(pad->buf);
         qemu_iovec_destroy(&pad->local_qiov);
@@ -1558,6 +1589,101 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
     memset(pad, 0, sizeof(*pad));
 }
 
+/*
+ * Create pad->local_qiov by wrapping @iov in the padding head and tail, while
+ * ensuring that the resulting vector will not exceed IOV_MAX elements.
+ *
+ * To ensure this, when necessary, the first two or three elements of @iov are
+ * merged into pad->collapse_bounce_buf and replaced by a reference to that
+ * bounce buffer in pad->local_qiov.
+ *
+ * After performing a read request, the data from the bounce buffer must be
+ * copied back into pad->pre_collapse_qiov (e.g. by bdrv_padding_finalize()).
+ */
+static int bdrv_create_padded_qiov(BlockDriverState *bs,
+                                   BdrvRequestPadding *pad,
+                                   struct iovec *iov, int niov,
+                                   size_t iov_offset, size_t bytes)
+{
+    int padded_niov, surplus_count, collapse_count;
+
+    /* Assert this invariant */
+    assert(niov <= IOV_MAX);
+
+    /*
+     * Cannot pad if resulting length would exceed SIZE_MAX. Returning an error
+     * to the guest is not ideal, but there is little else we can do. At least
+     * this will practically never happen on 64-bit systems.
+     */
+    if (SIZE_MAX - pad->head < bytes ||
+        SIZE_MAX - pad->head - bytes < pad->tail)
+    {
+        return -EINVAL;
+    }
+
+    /* Length of the resulting IOV if we just concatenated everything */
+    padded_niov = !!pad->head + niov + !!pad->tail;
+
+    qemu_iovec_init(&pad->local_qiov, MIN(padded_niov, IOV_MAX));
+
+    if (pad->head) {
+        qemu_iovec_add(&pad->local_qiov, pad->buf, pad->head);
+    }
+
+    /*
+     * If padded_niov > IOV_MAX, we cannot just concatenate everything.
+     * Instead, merge the first two or three elements of @iov to reduce the
+     * number of vector elements as necessary.
+     */
+    if (padded_niov > IOV_MAX) {
+        /*
+         * Only head and tail can lead to the number of entries exceeding
+         * IOV_MAX, so we can exceed it by the head and tail at most. We need
+         * to reduce the number of elements by `surplus_count`, so we merge that
+         * many elements plus one into one element.
+         */
+        surplus_count = padded_niov - IOV_MAX;
+        assert(surplus_count <= !!pad->head + !!pad->tail);
+        collapse_count = surplus_count + 1;
+
+        /*
+         * Move the elements to collapse into `pad->pre_collapse_qiov`, then
+         * advance `iov` (and associated variables) by those elements.
+         */
+        qemu_iovec_init(&pad->pre_collapse_qiov, collapse_count);
+        qemu_iovec_concat_iov(&pad->pre_collapse_qiov, iov,
+                              collapse_count, iov_offset, SIZE_MAX);
+        iov += collapse_count;
+        iov_offset = 0;
+        niov -= collapse_count;
+        bytes -= pad->pre_collapse_qiov.size;
+
+        /*
+         * Construct the bounce buffer to match the length of the to-collapse
+         * vector elements, and for write requests, initialize it with the data
+         * from those elements. Then add it to `pad->local_qiov`.
+         */
+        pad->collapse_len = pad->pre_collapse_qiov.size;
+        pad->collapse_bounce_buf = qemu_blockalign(bs, pad->collapse_len);
+        if (pad->write) {
+            qemu_iovec_to_buf(&pad->pre_collapse_qiov, 0,
+                              pad->collapse_bounce_buf, pad->collapse_len);
+        }
+        qemu_iovec_add(&pad->local_qiov,
+                       pad->collapse_bounce_buf, pad->collapse_len);
+    }
+
+    qemu_iovec_concat_iov(&pad->local_qiov, iov, niov, iov_offset, bytes);
+
+    if (pad->tail) {
+        qemu_iovec_add(&pad->local_qiov,
+                       pad->buf + pad->buf_len - pad->tail, pad->tail);
+    }
+
+    assert(pad->local_qiov.niov == MIN(padded_niov, IOV_MAX));
+    return 0;
+}
+
 /*
  * bdrv_pad_request
  *
@@ -1565,6 +1691,8 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
  * read of padding, bdrv_padding_rmw_read() should be called separately if
  * needed.
  *
+ * @write is true for write requests, false for read requests.
+ *
 * Request parameters (@qiov, &qiov_offset, &offset, &bytes) are in-out:
 * - on function start they represent original request
 * - on failure or when padding is not needed they are unchanged
@@ -1573,26 +1701,34 @@ static void bdrv_padding_destroy(BdrvRequestPadding *pad)
 static int bdrv_pad_request(BlockDriverState *bs,
                             QEMUIOVector **qiov, size_t *qiov_offset,
                             int64_t *offset, int64_t *bytes,
+                            bool write,
                             BdrvRequestPadding *pad, bool *padded,
                             BdrvRequestFlags *flags)
 {
     int ret;
+    struct iovec *sliced_iov;
+    int sliced_niov;
+    size_t sliced_head, sliced_tail;
 
     bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
 
-    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
+    if (!bdrv_init_padding(bs, *offset, *bytes, write, pad)) {
         if (padded) {
             *padded = false;
         }
         return 0;
     }
 
-    ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
-                                   *qiov, *qiov_offset, *bytes,
-                                   pad->buf + pad->buf_len - pad->tail,
-                                   pad->tail);
+    sliced_iov = qemu_iovec_slice(*qiov, *qiov_offset, *bytes,
+                                  &sliced_head, &sliced_tail,
+                                  &sliced_niov);
+
+    /* Guaranteed by bdrv_check_qiov_request() */
+    assert(*bytes <= SIZE_MAX);
+    ret = bdrv_create_padded_qiov(bs, pad, sliced_iov, sliced_niov,
+                                  sliced_head, *bytes);
     if (ret < 0) {
-        bdrv_padding_destroy(pad);
+        bdrv_padding_finalize(pad);
         return ret;
     }
     *bytes += pad->head + pad->tail;
@@ -1659,8 +1795,8 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
         flags |= BDRV_REQ_COPY_ON_READ;
     }
 
-    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
-                           NULL, &flags);
+    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, false,
+                           &pad, NULL, &flags);
     if (ret < 0) {
         goto fail;
     }
@@ -1670,7 +1806,7 @@ int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
                               bs->bl.request_alignment,
                               qiov, qiov_offset, flags);
     tracked_request_end(&req);
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
 
 fail:
     bdrv_dec_in_flight(bs);
@@ -2002,7 +2138,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
     /* This flag doesn't make sense for padding or zero writes */
     flags &= ~BDRV_REQ_REGISTERED_BUF;
 
-    padding = bdrv_init_padding(bs, offset, bytes, &pad);
+    padding = bdrv_init_padding(bs, offset, bytes, true, &pad);
     if (padding) {
         assert(!(flags & BDRV_REQ_NO_WAIT));
         bdrv_make_request_serialising(req, align);
@@ -2050,7 +2186,7 @@ bdrv_co_do_zero_pwritev(BdrvChild *child, int64_t offset, int64_t bytes,
     }
 
 out:
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
 
     return ret;
 }
@@ -2118,8 +2254,8 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
          * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
          * alignment only if there is no ZERO flag.
          */
-        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
-                               &padded, &flags);
+        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, true,
+                               &pad, &padded, &flags);
         if (ret < 0) {
             return ret;
         }
@@ -2149,7 +2285,7 @@ int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
                                qiov, qiov_offset, flags);
 
-    bdrv_padding_destroy(&pad);
+    bdrv_padding_finalize(&pad);
 
 out:
     tracked_request_end(&req);
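To make the collapse step concrete, here is a self-contained sketch (plain POSIX iovec plus malloc/memcpy; the helper name and signature are invented for illustration and are not the QEMU API) of merging the first few vector elements into a single bounce buffer, which is what bdrv_create_padded_qiov() does through pad->pre_collapse_qiov and pad->collapse_bounce_buf:

    #include <stdlib.h>
    #include <string.h>
    #include <sys/uio.h>

    /* Merge iov[0..collapse-1] into one malloc'ed bounce buffer; the caller
     * replaces those elements with *out in the final vector. */
    static void *collapse_head(const struct iovec *iov, int collapse,
                               struct iovec *out)
    {
        size_t total = 0;
        for (int i = 0; i < collapse; i++) {
            total += iov[i].iov_len;
        }

        char *bounce = malloc(total);
        if (!bounce) {
            return NULL;
        }

        /* Write path: gather the data up front. On the read path, the request
         * fills the bounce buffer and the data is scattered back afterwards,
         * which is what bdrv_padding_finalize() does for reads. */
        size_t off = 0;
        for (int i = 0; i < collapse; i++) {
            memcpy(bounce + off, iov[i].iov_base, iov[i].iov_len);
            off += iov[i].iov_len;
        }

        out->iov_base = bounce;
        out->iov_len = total;
        return bounce;
    }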
block/parallels.c:

@@ -165,6 +165,13 @@ static int64_t block_status(BDRVParallelsState *s, int64_t sector_num,
     return start_off;
 }
 
+static void parallels_set_bat_entry(BDRVParallelsState *s,
+                                    uint32_t index, uint32_t offset)
+{
+    s->bat_bitmap[index] = cpu_to_le32(offset);
+    bitmap_set(s->bat_dirty_bmap, bat_entry_off(index) / s->bat_dirty_block, 1);
+}
+
 static int64_t coroutine_fn GRAPH_RDLOCK
 allocate_clusters(BlockDriverState *bs, int64_t sector_num,
                   int nb_sectors, int *pnum)
@@ -251,10 +258,8 @@ allocate_clusters(BlockDriverState *bs, int64_t sector_num,
     }
 
     for (i = 0; i < to_allocate; i++) {
-        s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier);
+        parallels_set_bat_entry(s, idx + i, s->data_end / s->off_multiplier);
         s->data_end += s->tracks;
-        bitmap_set(s->bat_dirty_bmap,
-                   bat_entry_off(idx + i) / s->bat_dirty_block, 1);
     }
 
     return bat2sect(s, idx) + sector_num % s->tracks;
@@ -415,16 +420,33 @@ parallels_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
     return ret;
 }
 
-
-static int coroutine_fn GRAPH_RDLOCK
-parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
-                   BdrvCheckMode fix)
+static void parallels_check_unclean(BlockDriverState *bs,
+                                    BdrvCheckResult *res,
+                                    BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+
+    if (!s->header_unclean) {
+        return;
+    }
+
+    fprintf(stderr, "%s image was not closed correctly\n",
+            fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
+    res->corruptions++;
+    if (fix & BDRV_FIX_ERRORS) {
+        /* parallels_close will do the job right */
+        res->corruptions_fixed++;
+        s->header_unclean = false;
+    }
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+parallels_check_outside_image(BlockDriverState *bs, BdrvCheckResult *res,
+                              BdrvCheckMode fix)
 {
     BDRVParallelsState *s = bs->opaque;
-    int64_t size, prev_off, high_off;
-    int ret;
     uint32_t i;
-    bool flush_bat = false;
+    int64_t off, high_off, size;
 
     size = bdrv_getlength(bs->file->bs);
     if (size < 0) {
@@ -432,65 +454,48 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
         return size;
     }
 
-    qemu_co_mutex_lock(&s->lock);
-    if (s->header_unclean) {
-        fprintf(stderr, "%s image was not closed correctly\n",
-                fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR");
-        res->corruptions++;
-        if (fix & BDRV_FIX_ERRORS) {
-            /* parallels_close will do the job right */
-            res->corruptions_fixed++;
-            s->header_unclean = false;
-        }
-    }
-
-    res->bfi.total_clusters = s->bat_size;
-    res->bfi.compressed_clusters = 0; /* compression is not supported */
-
     high_off = 0;
-    prev_off = 0;
     for (i = 0; i < s->bat_size; i++) {
-        int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS;
-        if (off == 0) {
-            prev_off = 0;
-            continue;
-        }
-
-        /* cluster outside the image */
-        if (off > size) {
+        off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+        if (off + s->cluster_size > size) {
             fprintf(stderr, "%s cluster %u is outside image\n",
                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
             res->corruptions++;
             if (fix & BDRV_FIX_ERRORS) {
-                prev_off = 0;
-                s->bat_bitmap[i] = 0;
+                parallels_set_bat_entry(s, i, 0);
                 res->corruptions_fixed++;
-                flush_bat = true;
-                continue;
             }
+            continue;
         }
 
-        res->bfi.allocated_clusters++;
-        if (off > high_off) {
+        if (high_off < off) {
             high_off = off;
         }
-
-        if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
-            res->bfi.fragmented_clusters++;
-        }
-        prev_off = off;
     }
 
-    ret = 0;
-    if (flush_bat) {
-        ret = bdrv_co_pwrite_sync(bs->file, 0, s->header_size, s->header, 0);
-        if (ret < 0) {
-            res->check_errors++;
-            goto out;
-        }
+    if (high_off == 0) {
+        res->image_end_offset = s->data_end << BDRV_SECTOR_BITS;
+    } else {
+        res->image_end_offset = high_off + s->cluster_size;
+        s->data_end = res->image_end_offset >> BDRV_SECTOR_BITS;
     }
 
+    return 0;
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+parallels_check_leak(BlockDriverState *bs, BdrvCheckResult *res,
+                     BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int64_t size;
+    int ret;
+
+    size = bdrv_getlength(bs->file->bs);
+    if (size < 0) {
+        res->check_errors++;
+        return size;
+    }
+
-    res->image_end_offset = high_off + s->cluster_size;
     if (size > res->image_end_offset) {
         int64_t count;
         count = DIV_ROUND_UP(size - res->image_end_offset, s->cluster_size);
@@ -510,14 +515,74 @@ parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
             if (ret < 0) {
                 error_report_err(local_err);
                 res->check_errors++;
-                goto out;
+                return ret;
             }
             res->leaks_fixed += count;
         }
     }
 
-out:
-    qemu_co_mutex_unlock(&s->lock);
     return 0;
 }
 
+static void parallels_collect_statistics(BlockDriverState *bs,
+                                         BdrvCheckResult *res,
+                                         BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int64_t off, prev_off;
+    uint32_t i;
+
+    res->bfi.total_clusters = s->bat_size;
+    res->bfi.compressed_clusters = 0; /* compression is not supported */
+
+    prev_off = 0;
+    for (i = 0; i < s->bat_size; i++) {
+        off = bat2sect(s, i) << BDRV_SECTOR_BITS;
+        /*
+         * If BDRV_FIX_ERRORS is not set, out-of-image BAT entries were not
+         * fixed. Skip not allocated and out-of-image BAT entries.
+         */
+        if (off == 0 || off + s->cluster_size > res->image_end_offset) {
+            prev_off = 0;
+            continue;
+        }
+
+        if (prev_off != 0 && (prev_off + s->cluster_size) != off) {
+            res->bfi.fragmented_clusters++;
+        }
+        prev_off = off;
+        res->bfi.allocated_clusters++;
+    }
+}
+
+static int coroutine_fn GRAPH_RDLOCK
+parallels_co_check(BlockDriverState *bs, BdrvCheckResult *res,
+                   BdrvCheckMode fix)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int ret;
+
+    WITH_QEMU_LOCK_GUARD(&s->lock) {
+        parallels_check_unclean(bs, res, fix);
+
+        ret = parallels_check_outside_image(bs, res, fix);
+        if (ret < 0) {
+            return ret;
+        }
+
+        ret = parallels_check_leak(bs, res, fix);
+        if (ret < 0) {
+            return ret;
+        }
+
+        parallels_collect_statistics(bs, res, fix);
+    }
+
+    ret = bdrv_co_flush(bs);
+    if (ret < 0) {
+        res->check_errors++;
+    }
+
+    return ret;
+}
@@ -733,6 +798,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
     BDRVParallelsState *s = bs->opaque;
     ParallelsHeader ph;
     int ret, size, i;
+    int64_t file_nb_sectors;
     QemuOpts *opts = NULL;
     Error *local_err = NULL;
     char *buf;
@@ -742,6 +808,11 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
         return ret;
     }
 
+    file_nb_sectors = bdrv_nb_sectors(bs->file->bs);
+    if (file_nb_sectors < 0) {
+        return -EINVAL;
+    }
+
     ret = bdrv_pread(bs->file, 0, sizeof(ph), &ph, 0);
     if (ret < 0) {
         goto fail;
@@ -806,6 +877,17 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags,
 
     for (i = 0; i < s->bat_size; i++) {
         int64_t off = bat2sect(s, i);
+        if (off >= file_nb_sectors) {
+            if (flags & BDRV_O_CHECK) {
+                continue;
+            }
+            error_setg(errp, "parallels: Offset %" PRIi64 " in BAT[%d] entry "
+                       "is larger than file size (%" PRIi64 ")",
+                       off << BDRV_SECTOR_BITS, i,
+                       file_nb_sectors << BDRV_SECTOR_BITS);
+            ret = -EINVAL;
+            goto fail;
+        }
         if (off >= s->data_end) {
             s->data_end = off + s->tracks;
         }
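The hunks above also show the heart of the "Incorrect condition in out-of-image check" fix: the old test compared only the cluster's start offset against the file size. A tiny sketch (hypothetical helper, not the driver code) of why the cluster's end offset has to be checked instead:

    #include <stdbool.h>
    #include <stdint.h>

    static bool cluster_outside_image(uint64_t off, uint64_t cluster_size,
                                      uint64_t file_size)
    {
        /* Old, buggy form: `off > file_size`. It accepted clusters that
         * start inside the file but extend past its end. */
        return off + cluster_size > file_size;
    }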
block/qcow2-cluster.c:

@@ -1925,6 +1925,10 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
         uint64_t new_l2_bitmap = old_l2_bitmap;
         QCow2ClusterType cluster_type =
             qcow2_get_cluster_type(bs, old_l2_entry);
+        bool keep_reference = (cluster_type != QCOW2_CLUSTER_COMPRESSED) &&
+                              !full_discard &&
+                              (s->discard_no_unref &&
+                               type == QCOW2_DISCARD_REQUEST);
 
         /*
          * If full_discard is true, the cluster should not read back as zeroes,
@@ -1943,10 +1947,22 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
             new_l2_entry = new_l2_bitmap = 0;
         } else if (bs->backing || qcow2_cluster_is_allocated(cluster_type)) {
             if (has_subclusters(s)) {
-                new_l2_entry = 0;
+                if (keep_reference) {
+                    new_l2_entry = old_l2_entry;
+                } else {
+                    new_l2_entry = 0;
+                }
                 new_l2_bitmap = QCOW_L2_BITMAP_ALL_ZEROES;
             } else {
-                new_l2_entry = s->qcow_version >= 3 ? QCOW_OFLAG_ZERO : 0;
+                if (s->qcow_version >= 3) {
+                    if (keep_reference) {
+                        new_l2_entry |= QCOW_OFLAG_ZERO;
+                    } else {
+                        new_l2_entry = QCOW_OFLAG_ZERO;
+                    }
+                } else {
+                    new_l2_entry = 0;
+                }
             }
         }
@@ -1960,8 +1976,16 @@ static int discard_in_l2_slice(BlockDriverState *bs, uint64_t offset,
         if (has_subclusters(s)) {
             set_l2_bitmap(s, l2_slice, l2_index + i, new_l2_bitmap);
         }
-        /* Then decrease the refcount */
-        qcow2_free_any_cluster(bs, old_l2_entry, type);
+        if (!keep_reference) {
+            /* Then decrease the refcount */
+            qcow2_free_any_cluster(bs, old_l2_entry, type);
+        } else if (s->discard_passthrough[type] &&
+                   (cluster_type == QCOW2_CLUSTER_NORMAL ||
+                    cluster_type == QCOW2_CLUSTER_ZERO_ALLOC)) {
+            /* If we keep the reference, pass on the discard still */
+            bdrv_pdiscard(s->data_file, old_l2_entry & L2E_OFFSET_MASK,
+                          s->cluster_size);
+        }
     }
 
     qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
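The retention rule introduced above reads naturally as a pure predicate. A sketch (assumed signature, not the QEMU function) of when discard_in_l2_slice() now keeps the qcow2-level allocation:

    #include <stdbool.h>

    /* Keep the cluster's refcount only for guest-initiated discard requests,
     * only when discard-no-unref is enabled, never on a full discard, and
     * never for compressed clusters. */
    static bool keep_reference(bool discard_no_unref, bool full_discard,
                               bool is_compressed, bool is_guest_request)
    {
        return !is_compressed && !full_discard &&
               discard_no_unref && is_guest_request;
    }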
block/qcow2.c:

@@ -682,6 +682,7 @@ static const char *const mutable_opts[] = {
     QCOW2_OPT_DISCARD_REQUEST,
     QCOW2_OPT_DISCARD_SNAPSHOT,
     QCOW2_OPT_DISCARD_OTHER,
+    QCOW2_OPT_DISCARD_NO_UNREF,
     QCOW2_OPT_OVERLAP,
     QCOW2_OPT_OVERLAP_TEMPLATE,
     QCOW2_OPT_OVERLAP_MAIN_HEADER,
@@ -726,6 +727,11 @@ static QemuOptsList qcow2_runtime_opts = {
         .type = QEMU_OPT_BOOL,
         .help = "Generate discard requests when other clusters are freed",
     },
+    {
+        .name = QCOW2_OPT_DISCARD_NO_UNREF,
+        .type = QEMU_OPT_BOOL,
+        .help = "Do not unreference discarded clusters",
+    },
     {
         .name = QCOW2_OPT_OVERLAP,
         .type = QEMU_OPT_STRING,
@@ -969,6 +975,7 @@ typedef struct Qcow2ReopenState {
     bool use_lazy_refcounts;
     int overlap_check;
     bool discard_passthrough[QCOW2_DISCARD_MAX];
+    bool discard_no_unref;
     uint64_t cache_clean_interval;
     QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
 } Qcow2ReopenState;
@@ -1140,6 +1147,15 @@ static int qcow2_update_options_prepare(BlockDriverState *bs,
     r->discard_passthrough[QCOW2_DISCARD_OTHER] =
         qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
 
+    r->discard_no_unref = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_NO_UNREF,
+                                            false);
+    if (r->discard_no_unref && s->qcow_version < 3) {
+        error_setg(errp,
+                   "discard-no-unref is only supported since qcow2 version 3");
+        ret = -EINVAL;
+        goto fail;
+    }
+
     switch (s->crypt_method_header) {
     case QCOW_CRYPT_NONE:
         if (encryptfmt) {
@@ -1220,6 +1236,8 @@ static void qcow2_update_options_commit(BlockDriverState *bs,
         s->discard_passthrough[i] = r->discard_passthrough[i];
     }
 
+    s->discard_no_unref = r->discard_no_unref;
+
     if (s->cache_clean_interval != r->cache_clean_interval) {
         cache_clean_timer_del(bs);
         s->cache_clean_interval = r->cache_clean_interval;
block/qcow2.h:

@@ -133,6 +133,7 @@
 #define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
 #define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot"
 #define QCOW2_OPT_DISCARD_OTHER "pass-discard-other"
+#define QCOW2_OPT_DISCARD_NO_UNREF "discard-no-unref"
 #define QCOW2_OPT_OVERLAP "overlap-check"
 #define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template"
 #define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header"
@@ -385,6 +386,8 @@ typedef struct BDRVQcow2State {
 
     bool discard_passthrough[QCOW2_DISCARD_MAX];
 
+    bool discard_no_unref;
+
     int overlap_check; /* bitmask of Qcow2MetadataOverlap values */
     bool signaled_corruption;
 
include/qemu/iov.h:

@@ -222,13 +222,11 @@ static inline void *qemu_iovec_buf(QEMUIOVector *qiov)
 
 void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint);
 void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
-int qemu_iovec_init_extended(
-        QEMUIOVector *qiov,
-        void *head_buf, size_t head_len,
-        QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
-        void *tail_buf, size_t tail_len);
 void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
                            size_t offset, size_t len);
+struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
+                               size_t offset, size_t len,
+                               size_t *head, size_t *tail, int *niov);
 int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len);
 void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
 void qemu_iovec_concat(QEMUIOVector *dst,
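A short usage sketch of the newly exported qemu_iovec_slice() (QEMU-internal API; the wrapper function and the include set here are assumptions for illustration). Unlike the removed qemu_iovec_init_extended(), it copies nothing: it returns a window into the existing element array plus the byte counts to trim at each end:

    #include "qemu/osdep.h"
    #include "qemu/iov.h"

    static void example_slice(QEMUIOVector *qiov)
    {
        size_t head, tail;
        int niov;

        /* Elements covering bytes [512, 512 + 4096) of the vector: */
        struct iovec *first = qemu_iovec_slice(qiov, 512, 4096,
                                               &head, &tail, &niov);

        /* first[0].iov_base + head is the first byte in range; the last
         * element, first[niov - 1], has tail surplus bytes past the range. */
        (void)first;
    }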
qapi/block-core.json:

@@ -3478,6 +3478,17 @@
 # @pass-discard-other: whether discard requests for the data source
 #     should be issued on other occasions where a cluster gets freed
 #
+# @discard-no-unref: when enabled, discards from the guest will not
+#     cause cluster allocations to be relinquished.  This prevents
+#     qcow2 fragmentation that would be caused by such discards.
+#     Besides potential performance degradation, such fragmentation
+#     can lead to increased allocation of clusters past the end of the
+#     image file, resulting in image files whose file length can grow
+#     much larger than their guest disk size would suggest.  If image
+#     file length is of concern (e.g. when storing qcow2 images
+#     directly on block devices), you should consider enabling this
+#     option.  (since 8.1)
+#
 # @overlap-check: which overlap checks to perform for writes to the
 #     image, defaults to 'cached' (since 2.2)
 #
@@ -3516,6 +3527,7 @@
             '*pass-discard-request': 'bool',
             '*pass-discard-snapshot': 'bool',
             '*pass-discard-other': 'bool',
+            '*discard-no-unref': 'bool',
             '*overlap-check': 'Qcow2OverlapChecks',
             '*cache-size': 'int',
             '*l2-cache-size': 'int',
qemu-options.hx:

@@ -1431,6 +1431,18 @@ SRST
         issued on other occasions where a cluster gets freed
         (on/off; default: off)
 
+    ``discard-no-unref``
+        When enabled, discards from the guest will not cause cluster
+        allocations to be relinquished.  This prevents qcow2
+        fragmentation that would be caused by such discards.  Besides
+        potential performance degradation, such fragmentation can lead
+        to increased allocation of clusters past the end of the image
+        file, resulting in image files whose file length can grow much
+        larger than their guest disk size would suggest.
+        If image file length is of concern (e.g. when storing qcow2
+        images directly on block devices), you should consider enabling
+        this option.
+
     ``overlap-check``
         Which overlap checks to perform for writes to the image
         (none/constant/cached/all; default: cached). For details or
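As a usage note: with this option merged, a hypothetical invocation enabling it on a qcow2 node could look like ``-blockdev driver=qcow2,node-name=disk0,file.driver=file,file.filename=disk.qcow2,discard-no-unref=on`` (the exact command-line spelling follows the QAPI option name above; treat this example as illustrative, not authoritative).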
tests/qemu-iotests/tests/iov-padding (new file, executable, 85 lines):

@@ -0,0 +1,85 @@
#!/usr/bin/env bash
# group: rw quick
#
# Check the interaction of request padding (to fit alignment restrictions) with
# vectored I/O from the guest
#
# Copyright Red Hat
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

seq=$(basename $0)
echo "QA output created by $seq"

status=1  # failure is the default!

_cleanup()
{
    _cleanup_test_img
}
trap "_cleanup; exit \$status" 0 1 2 3 15

# get standard environment, filters and checks
cd ..
. ./common.rc
. ./common.filter

_supported_fmt raw
_supported_proto file

_make_test_img 1M

IMGSPEC="driver=blkdebug,align=4096,image.driver=file,image.filename=$TEST_IMG"

# Four combinations:
# - Offset 4096, length 1023 * 512 + 512: Fully aligned to 4k
# - Offset 4096, length 1023 * 512 + 4096: Head is aligned, tail is not
# - Offset 512, length 1023 * 512 + 512: Neither head nor tail are aligned
# - Offset 512, length 1023 * 512 + 4096: Tail is aligned, head is not
for start_offset in 4096 512; do
    for last_element_length in 512 4096; do
        length=$((1023 * 512 + $last_element_length))

        echo
        echo "== performing 1024-element vectored requests to image (offset: $start_offset; length: $length) =="

        # Fill with data for testing
        $QEMU_IO -c 'write -P 1 0 1M' "$TEST_IMG" | _filter_qemu_io

        # 1023 512-byte buffers, and then one with length $last_element_length
        cmd_params="-P 2 $start_offset $(yes 512 | head -n 1023 | tr '\n' ' ') $last_element_length"
        QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
            -c "writev $cmd_params" \
            --image-opts \
            "$IMGSPEC" \
            | _filter_qemu_io

        # Read all patterns -- read the part we just wrote with writev twice,
        # once "normally", and once with a readv, so we see that that works, too
        QEMU_IO_OPTIONS="$QEMU_IO_OPTIONS_NO_FMT" $QEMU_IO \
            -c "read -P 1 0 $start_offset" \
            -c "read -P 2 $start_offset $length" \
            -c "readv $cmd_params" \
            -c "read -P 1 $((start_offset + length)) $((1024 * 1024 - length - start_offset))" \
            --image-opts \
            "$IMGSPEC" \
            | _filter_qemu_io
    done
done

# success, all done
echo "*** done"
rm -f $seq.full
status=0
tests/qemu-iotests/tests/iov-padding.out (new file, 59 lines):

@@ -0,0 +1,59 @@
QA output created by iov-padding
Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576

== performing 1024-element vectored requests to image (offset: 4096; length: 524288) ==
wrote 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 524288/524288 bytes at offset 4096
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 4096/4096 bytes at offset 0
4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 524288/524288 bytes at offset 4096
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 524288/524288 bytes at offset 4096
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 520192/520192 bytes at offset 528384
508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

== performing 1024-element vectored requests to image (offset: 4096; length: 527872) ==
wrote 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 527872/527872 bytes at offset 4096
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 4096/4096 bytes at offset 0
4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 527872/527872 bytes at offset 4096
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 527872/527872 bytes at offset 4096
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 516608/516608 bytes at offset 531968
504.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

== performing 1024-element vectored requests to image (offset: 512; length: 524288) ==
wrote 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 524288/524288 bytes at offset 512
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 512/512 bytes at offset 0
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 524288/524288 bytes at offset 512
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 524288/524288 bytes at offset 512
512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 523776/523776 bytes at offset 524800
511.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)

== performing 1024-element vectored requests to image (offset: 512; length: 527872) ==
wrote 1048576/1048576 bytes at offset 0
1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
wrote 527872/527872 bytes at offset 512
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 512/512 bytes at offset 0
512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 527872/527872 bytes at offset 512
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 527872/527872 bytes at offset 512
515.500 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
read 520192/520192 bytes at offset 528384
508 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
*** done
util/iov.c (89 lines changed):

@@ -378,15 +378,15 @@ static struct iovec *iov_skip_offset(struct iovec *iov, size_t offset,
 }
 
 /*
- * qiov_slice
+ * qemu_iovec_slice
  *
  * Find subarray of iovec's, containing requested range. @head would
 * be offset in first iov (returned by the function), @tail would be
 * count of extra bytes in last iovec (returned iov + @niov - 1).
 */
-static struct iovec *qiov_slice(QEMUIOVector *qiov,
-                                size_t offset, size_t len,
-                                size_t *head, size_t *tail, int *niov)
+struct iovec *qemu_iovec_slice(QEMUIOVector *qiov,
+                               size_t offset, size_t len,
+                               size_t *head, size_t *tail, int *niov)
 {
     struct iovec *iov, *end_iov;
 
@@ -411,75 +411,11 @@ int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len)
     size_t head, tail;
     int niov;
 
-    qiov_slice(qiov, offset, len, &head, &tail, &niov);
+    qemu_iovec_slice(qiov, offset, len, &head, &tail, &niov);
 
     return niov;
 }
 
-/*
- * Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov,
- * and @tail_buf buffer into new qiov.
- */
-int qemu_iovec_init_extended(
-        QEMUIOVector *qiov,
-        void *head_buf, size_t head_len,
-        QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
-        void *tail_buf, size_t tail_len)
-{
-    size_t mid_head, mid_tail;
-    int total_niov, mid_niov = 0;
-    struct iovec *p, *mid_iov = NULL;
-
-    assert(mid_qiov->niov <= IOV_MAX);
-
-    if (SIZE_MAX - head_len < mid_len ||
-        SIZE_MAX - head_len - mid_len < tail_len)
-    {
-        return -EINVAL;
-    }
-
-    if (mid_len) {
-        mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len,
-                             &mid_head, &mid_tail, &mid_niov);
-    }
-
-    total_niov = !!head_len + mid_niov + !!tail_len;
-    if (total_niov > IOV_MAX) {
-        return -EINVAL;
-    }
-
-    if (total_niov == 1) {
-        qemu_iovec_init_buf(qiov, NULL, 0);
-        p = &qiov->local_iov;
-    } else {
-        qiov->niov = qiov->nalloc = total_niov;
-        qiov->size = head_len + mid_len + tail_len;
-        p = qiov->iov = g_new(struct iovec, qiov->niov);
-    }
-
-    if (head_len) {
-        p->iov_base = head_buf;
-        p->iov_len = head_len;
-        p++;
-    }
-
-    assert(!mid_niov == !mid_len);
-    if (mid_niov) {
-        memcpy(p, mid_iov, mid_niov * sizeof(*p));
-        p[0].iov_base = (uint8_t *)p[0].iov_base + mid_head;
-        p[0].iov_len -= mid_head;
-        p[mid_niov - 1].iov_len -= mid_tail;
-        p += mid_niov;
-    }
-
-    if (tail_len) {
-        p->iov_base = tail_buf;
-        p->iov_len = tail_len;
-    }
-
-    return 0;
-}
-
 /*
  * Check if the contents of subrange of qiov data is all zeroes.
  */
@@ -511,14 +447,21 @@ bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
 void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
                            size_t offset, size_t len)
 {
-    int ret;
+    struct iovec *slice_iov;
+    int slice_niov;
+    size_t slice_head, slice_tail;
 
     assert(source->size >= len);
     assert(source->size - len >= offset);
 
-    /* We shrink the request, so we can't overflow neither size_t nor MAX_IOV */
-    ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0);
-    assert(ret == 0);
+    slice_iov = qemu_iovec_slice(source, offset, len,
+                                 &slice_head, &slice_tail, &slice_niov);
+    if (slice_niov == 1) {
+        qemu_iovec_init_buf(qiov, slice_iov[0].iov_base + slice_head, len);
+    } else {
+        qemu_iovec_init(qiov, slice_niov);
+        qemu_iovec_concat_iov(qiov, slice_iov, slice_niov, slice_head, len);
+    }
 }
 
 void qemu_iovec_destroy(QEMUIOVector *qiov)