[pve-devel] [PATCH qemu] add revert to work around performance regression when backing up large RBD disk

Fabian Ebner f.ebner at proxmox.com
Tue May 17 10:29:49 CEST 2022


resulting in QMP timeouts and very slow backups. The plan is to figure
out (ideally together with upstream) a way to make the implementation
of bdrv_co_block_status for RBD more efficient. But for now, revert
the problematic change as a stop-gap measure.

Upstream bug report:
https://gitlab.com/qemu-project/qemu/-/issues/1026

Forum threads:
https://forum.proxmox.com/threads/109272/
https://forum.proxmox.com/threads/109448/
https://forum.proxmox.com/threads/101334/ (partially)

Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
---

I simply removed the other two patches already fixing the offending
commit from the "series" file, to make it less likely that we forget
to re-add them once we revert the revert and to reduce the git diff.
Hope that is ok.

 ...k-rbd-implement-bdrv_co_block_status.patch | 161 ++++++++++++++++++
 debian/patches/series                         |   3 +-
 2 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 debian/patches/pve/0053-Revert-block-rbd-implement-bdrv_co_block_status.patch

diff --git a/debian/patches/pve/0053-Revert-block-rbd-implement-bdrv_co_block_status.patch b/debian/patches/pve/0053-Revert-block-rbd-implement-bdrv_co_block_status.patch
new file mode 100644
index 0000000..d3c6a71
--- /dev/null
+++ b/debian/patches/pve/0053-Revert-block-rbd-implement-bdrv_co_block_status.patch
@@ -0,0 +1,161 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Fabian Ebner <f.ebner at proxmox.com>
+Date: Tue, 17 May 2022 09:46:02 +0200
+Subject: [PATCH] Revert "block/rbd: implement bdrv_co_block_status"
+
+During backup, bdrv_co_block_status is called for each block copy
+chunk. When RBD is used, the current implementation with
+rbd_diff_iterate2() using whole_object=true takes about linearly more
+time, depending on the image size. Since there are linearly more
+chunks, the slowdown is quadratic, becoming unacceptable for large
+images (starting somewhere between 500-1000 GiB in my testing).
+
+This reverts commit 0347a8fd4c3faaedf119be04c197804be40a384b as a
+stop-gap measure, until it's clear how to make the implemenation
+more efficient.
+
+Upstream bug report:
+https://gitlab.com/qemu-project/qemu/-/issues/1026
+
+Signed-off-by: Fabian Ebner <f.ebner at proxmox.com>
+---
+ block/rbd.c | 112 ----------------------------------------------------
+ 1 file changed, 112 deletions(-)
+
+diff --git a/block/rbd.c b/block/rbd.c
+index a4b8fb482c..3393b06a4e 100644
+--- a/block/rbd.c
++++ b/block/rbd.c
+@@ -97,12 +97,6 @@ typedef struct RBDTask {
+     int64_t ret;
+ } RBDTask;
+ 
+-typedef struct RBDDiffIterateReq {
+-    uint64_t offs;
+-    uint64_t bytes;
+-    bool exists;
+-} RBDDiffIterateReq;
+-
+ static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
+                             BlockdevOptionsRbd *opts, bool cache,
+                             const char *keypairs, const char *secretid,
+@@ -1267,111 +1261,6 @@ static ImageInfoSpecific *qemu_rbd_get_specific_info(BlockDriverState *bs,
+     return spec_info;
+ }
+ 
+-/*
+- * rbd_diff_iterate2 allows to interrupt the exection by returning a negative
+- * value in the callback routine. Choose a value that does not conflict with
+- * an existing exitcode and return it if we want to prematurely stop the
+- * execution because we detected a change in the allocation status.
+- */
+-#define QEMU_RBD_EXIT_DIFF_ITERATE2 -9000
+-
+-static int qemu_rbd_diff_iterate_cb(uint64_t offs, size_t len,
+-                                    int exists, void *opaque)
+-{
+-    RBDDiffIterateReq *req = opaque;
+-
+-    assert(req->offs + req->bytes <= offs);
+-    /*
+-     * we do not diff against a snapshot so we should never receive a callback
+-     * for a hole.
+-     */
+-    assert(exists);
+-
+-    if (!req->exists && offs > req->offs) {
+-        /*
+-         * we started in an unallocated area and hit the first allocated
+-         * block. req->bytes must be set to the length of the unallocated area
+-         * before the allocated area. stop further processing.
+-         */
+-        req->bytes = offs - req->offs;
+-        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+-    }
+-
+-    if (req->exists && offs > req->offs + req->bytes) {
+-        /*
+-         * we started in an allocated area and jumped over an unallocated area,
+-         * req->bytes contains the length of the allocated area before the
+-         * unallocated area. stop further processing.
+-         */
+-        return QEMU_RBD_EXIT_DIFF_ITERATE2;
+-    }
+-
+-    req->bytes += len;
+-    req->exists = true;
+-
+-    return 0;
+-}
+-
+-static int coroutine_fn qemu_rbd_co_block_status(BlockDriverState *bs,
+-                                                 bool want_zero, int64_t offset,
+-                                                 int64_t bytes, int64_t *pnum,
+-                                                 int64_t *map,
+-                                                 BlockDriverState **file)
+-{
+-    BDRVRBDState *s = bs->opaque;
+-    int status, r;
+-    RBDDiffIterateReq req = { .offs = offset };
+-    uint64_t features, flags;
+-
+-    assert(offset + bytes <= s->image_size);
+-
+-    /* default to all sectors allocated */
+-    status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+-    *map = offset;
+-    *file = bs;
+-    *pnum = bytes;
+-
+-    /* check if RBD image supports fast-diff */
+-    r = rbd_get_features(s->image, &features);
+-    if (r < 0) {
+-        return status;
+-    }
+-    if (!(features & RBD_FEATURE_FAST_DIFF)) {
+-        return status;
+-    }
+-
+-    /* check if RBD fast-diff result is valid */
+-    r = rbd_get_flags(s->image, &flags);
+-    if (r < 0) {
+-        return status;
+-    }
+-    if (flags & RBD_FLAG_FAST_DIFF_INVALID) {
+-        return status;
+-    }
+-
+-    r = rbd_diff_iterate2(s->image, NULL, offset, bytes, true, true,
+-                          qemu_rbd_diff_iterate_cb, &req);
+-    if (r < 0 && r != QEMU_RBD_EXIT_DIFF_ITERATE2) {
+-        return status;
+-    }
+-    assert(req.bytes <= bytes);
+-    if (!req.exists) {
+-        if (r == 0) {
+-            /*
+-             * rbd_diff_iterate2 does not invoke callbacks for unallocated
+-             * areas. This here catches the case where no callback was
+-             * invoked at all (req.bytes == 0).
+-             */
+-            assert(req.bytes == 0);
+-            req.bytes = bytes;
+-        }
+-        status = BDRV_BLOCK_ZERO | BDRV_BLOCK_OFFSET_VALID;
+-    }
+-
+-    *pnum = req.bytes;
+-    return status;
+-}
+-
+ static int64_t qemu_rbd_getlength(BlockDriverState *bs)
+ {
+     BDRVRBDState *s = bs->opaque;
+@@ -1607,7 +1496,6 @@ static BlockDriver bdrv_rbd = {
+ #ifdef LIBRBD_SUPPORTS_WRITE_ZEROES
+     .bdrv_co_pwrite_zeroes  = qemu_rbd_co_pwrite_zeroes,
+ #endif
+-    .bdrv_co_block_status   = qemu_rbd_co_block_status,
+ 
+     .bdrv_snapshot_create   = qemu_rbd_snap_create,
+     .bdrv_snapshot_delete   = qemu_rbd_snap_remove,
diff --git a/debian/patches/series b/debian/patches/series
index 96a525a..821d63d 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,8 +1,6 @@
 extra/0001-monitor-qmp-fix-race-with-clients-disconnecting-earl.patch
 extra/0002-monitor-hmp-add-support-for-flag-argument-with-value.patch
 extra/0003-monitor-refactor-set-expire_password-and-allow-VNC-d.patch
-extra/0004-block-rbd-fix-handling-of-holes-in-.bdrv_co_block_st.patch
-extra/0005-block-rbd-workaround-for-ceph-issue-53784.patch
 extra/0006-block-io-Update-BSC-only-if-want_zero-is-true.patch
 extra/0007-block-nbd-Delete-reconnect-delay-timer-when-done.patch
 extra/0008-block-nbd-Assert-there-are-no-timers-when-closed.patch
@@ -77,3 +75,4 @@ pve/0049-PVE-savevm-async-register-yank-before-migration_inco.patch
 pve/0050-qemu-img-dd-add-l-option-for-loading-a-snapshot.patch
 pve/0051-vma-allow-partial-restore.patch
 pve/0052-pbs-namespace-support.patch
+pve/0053-Revert-block-rbd-implement-bdrv_co_block_status.patch
-- 
2.30.2






More information about the pve-devel mailing list