[pve-devel] [PATCH qemu v5 03/32] PVE backup: implement backup access setup and teardown API for external providers

Wolfgang Bumiller w.bumiller at proxmox.com
Mon Mar 24 14:02:13 CET 2025


On Fri, Mar 21, 2025 at 02:48:23PM +0100, Fiona Ebner wrote:
> For external backup providers, the state of the VM's disk images at
> the time the backup is started is preserved via a snapshot-access
> block node. Old data is moved to the fleecing image when new guest
> writes come in. The snapshot-access block node, as well as the
> associated bitmap in case of incremental backup, will be exported via
> NBD to the external provider. The NBD export will be done by the
> management layer, the missing functionality is setting up and tearing
> down the snapshot-access block nodes, which this patch adds.
> 
> It is necessary to also set up fleecing for EFI and TPM disks, so that
> old data can be moved out of the way when a new guest write comes in.
> 
> There can only be one regular backup or one active backup access at
> a time, because both require replacing the original block node of the
> drive. Thus the backup state is re-used, and checks are added to
> prohibit regular backup while snapshot access is active and vice
> versa.
> 
> The block nodes added by the backup-access-setup QMP call are not
> tracked anywhere else (there is no job they are associated to like for
> regular backup). This requires adding a callback for teardown when
> QEMU exits, i.e. in qemu_cleanup(). Otherwise, there will be an
> assertion failure that the block graph is not empty when QEMU exits
> before the backup-access-teardown QMP command is called.
> 
> The code for the qmp_backup_access_setup() was based on the existing
> qmp_backup() routine.
> 
> The return value for the setup QMP command contains information about
> the snapshot-access block nodes that can be used by the management
> layer to set up the NBD exports.
> 
> Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
> ---
> 
> Changes in v5:
> * Set finishing state and end time in backup state in QEMU to avoid
>   wrongly detecting previous backup as still active leading to a
>   warning.
> * Expose support for backup-access API as a feature with
>   query-proxmox-support.
> 
>  pve-backup.c         | 283 +++++++++++++++++++++++++++++++++++++++++++
>  pve-backup.h         |  16 +++
>  qapi/block-core.json |  49 ++++++++
>  system/runstate.c    |   6 +
>  4 files changed, 354 insertions(+)
>  create mode 100644 pve-backup.h
> 
> diff --git a/pve-backup.c b/pve-backup.c
> index b22064a64e..8f0f9921ab 100644
> --- a/pve-backup.c
> +++ b/pve-backup.c
> @@ -1,4 +1,5 @@
>  #include "proxmox-backup-client.h"
> +#include "pve-backup.h"
>  #include "vma.h"
>  
>  #include "qemu/osdep.h"
> @@ -585,6 +586,37 @@ static int setup_snapshot_access(PVEBackupDevInfo *di, Error **errp)
>      return 0;
>  }
>  
> +static void setup_all_snapshot_access_bh(void *opaque)
> +{
> +    assert(!qemu_in_coroutine());
> +
> +    CoCtxData *data = (CoCtxData*)opaque;
> +    Error **errp = (Error**)data->data;
> +
> +    Error *local_err = NULL;
> +
> +    GList *l =  backup_state.di_list;
> +    while (l) {
> +        PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
> +        l = g_list_next(l);
> +
> +        bdrv_drained_begin(di->bs);
> +
> +        if (setup_snapshot_access(di, &local_err) < 0) {
> +            cleanup_snapshot_access(di);

(not part of this patch, but why does our `setup_snapshot_access()` not
call `cleanup_snapshot_access()` on failure?)

> +            bdrv_drained_end(di->bs);
> +            error_setg(errp, "%s - setting up snapshot access failed: %s", di->device_name,
> +                       local_err ? error_get_pretty(local_err) : "unknown error");
> +            break;
> +        }
> +
> +        bdrv_drained_end(di->bs);
> +    }
> +
> +    /* return */
> +    aio_co_enter(data->ctx, data->co);
> +}
> +
>  /*
>   * backup_job_create can *not* be run from a coroutine, so this can't either.
>   * The caller is responsible that backup_mutex is held nonetheless.
> @@ -722,6 +754,11 @@ static bool fleecing_no_efi_tpm(const char *device_id)
>      return strncmp(device_id, "drive-efidisk", 13) && strncmp(device_id, "drive-tpmstate", 14);
>  }
>  
> +static bool fleecing_all(const char *device_id)
> +{
> +    return true;
> +}
> +
>  /*
>   * Returns a list of device infos, which needs to be freed by the caller. In
>   * case of an error, errp will be set, but the returned value might still be a
> @@ -810,6 +847,251 @@ err:
>      return di_list;
>  }
>  
> +BackupAccessInfoList *coroutine_fn qmp_backup_access_setup(
> +    const char *target_id,
> +    const char *devlist,
> +    Error **errp)
> +{
> +    assert(qemu_in_coroutine());
> +
> +    qemu_co_mutex_lock(&backup_state.backup_mutex);
> +
> +    Error *local_err = NULL;
> +    GList *di_list = NULL;
> +    GList *l;
> +
> +    if (backup_state.di_list) {
> +        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
> +                  "previous backup for target '%s' not finished", backup_state.target_id);
> +        qemu_co_mutex_unlock(&backup_state.backup_mutex);
> +        return NULL;
> +    }
> +
> +    bdrv_graph_co_rdlock();
> +    di_list = get_device_info(devlist, fleecing_all, &local_err);
> +    bdrv_graph_co_rdunlock();
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        goto err;
> +    }
> +    assert(di_list);
> +
> +    size_t total = 0;
> +
> +    l = di_list;
> +    while (l) {
> +        PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
> +        l = g_list_next(l);
> +
> +        ssize_t size = bdrv_getlength(di->bs);
> +        if (size < 0) {
> +            error_setg_errno(errp, -size, "bdrv_getlength failed");
> +            goto err;
> +        }
> +        di->size = size;
> +        total += size;
> +
> +        di->completed_ret = INT_MAX;
> +    }
> +
> +    qemu_mutex_lock(&backup_state.stat.lock);
> +    backup_state.stat.reused = 0;
> +
> +    /* clear previous backup's bitmap_list */
> +    if (backup_state.stat.bitmap_list) {
> +        GList *bl = backup_state.stat.bitmap_list;
> +        while (bl) {
> +            g_free(((PBSBitmapInfo *)bl->data)->drive);
> +            g_free(bl->data);
> +            bl = g_list_next(bl);
> +        }
> +        g_list_free(backup_state.stat.bitmap_list);
> +        backup_state.stat.bitmap_list = NULL;
> +    }

^ should be factored into a function instead of copied

The code below can also be factored out as AFAICT it, too, is a copy, if
`backup_file` is a parameter (with which `NULL` may be passed, as
`g_strdup()` explicitly maps NULL to NULL), and `uuid` as well I guess
(here it's not touched but may as well be cleared (or the function would
ignore it when NULL is passed)).

> +
> +    /* initialize global backup_state now */
> +
> +    if (backup_state.stat.error) {
> +        error_free(backup_state.stat.error);
> +        backup_state.stat.error = NULL;
> +    }
> +
> +    backup_state.stat.start_time = time(NULL);
> +    backup_state.stat.end_time = 0;
> +
> +    if (backup_state.stat.backup_file) {
> +        g_free(backup_state.stat.backup_file);
> +    }
> +    backup_state.stat.backup_file = NULL;
> +
> +    if (backup_state.target_id) {
> +        g_free(backup_state.target_id);
> +    }
> +    backup_state.target_id = g_strdup(target_id);

(^ Not sure if `qmp_backup` should include the above)

> +
> +    /*
> +     * The stats will never update, because there is no internal backup job. Initialize them anyway
> +     * for completeness.
> +     */
> +    backup_state.stat.total = total;
> +    backup_state.stat.dirty = total - backup_state.stat.reused;
> +    backup_state.stat.transferred = 0;
> +    backup_state.stat.zero_bytes = 0;
> +    backup_state.stat.finishing = false;
> +    backup_state.stat.starting = false; // there's no associated QEMU job

^ up to this point - requiring stat.lock to be held I suppose (unless
qmp_backup allows moving the stuff in between to above the lock()
call...)

> +
> +    qemu_mutex_unlock(&backup_state.stat.lock);
> +
> +    backup_state.vmaw = NULL;
> +    backup_state.pbs = NULL;
> +
> +    backup_state.di_list = di_list;
> +
> +    /* Run setup_all_snapshot_access_bh outside of coroutine (in BH) but keep
> +    * backup_mutex locked. This is fine, a CoMutex can be held across yield
> +    * points, and we'll release it as soon as the BH reschedules us.
> +    */
> +    CoCtxData waker = {
> +        .co = qemu_coroutine_self(),
> +        .ctx = qemu_get_current_aio_context(),
> +        .data = &local_err,
> +    };
> +    aio_bh_schedule_oneshot(waker.ctx, setup_all_snapshot_access_bh, &waker);
> +    qemu_coroutine_yield();
> +
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        goto err;
> +    }
> +
> +    qemu_co_mutex_unlock(&backup_state.backup_mutex);
> +
> +    BackupAccessInfoList *bai_head = NULL, **p_bai_next = &bai_head;
> +
> +    l = di_list;
> +    while (l) {
> +        PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
> +        l = g_list_next(l);
> +
> +        BackupAccessInfoList *info = g_malloc0(sizeof(*info));
> +        info->value = g_malloc0(sizeof(*info->value));
> +        info->value->node_name = g_strdup(bdrv_get_node_name(di->fleecing.snapshot_access));
> +        info->value->device = g_strdup(di->device_name);
> +        info->value->size = di->size;
> +
> +        *p_bai_next = info;
> +        p_bai_next = &info->next;
> +    }
> +
> +    return bai_head;
> +
> +err:
> +
> +    l = di_list;
> +    while (l) {
> +        PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
> +        l = g_list_next(l);
> +
> +        g_free(di->device_name);
> +        di->device_name = NULL;
> +
> +        g_free(di);
> +    }
> +    g_list_free(di_list);
> +    backup_state.di_list = NULL;
> +
> +    qemu_co_mutex_unlock(&backup_state.backup_mutex);
> +    return NULL;
> +}
> +
> +/*
> + * Caller needs to hold the backup mutex or the BQL.
> + */
> +void backup_access_teardown(void)
> +{
> +    GList *l = backup_state.di_list;
> +
> +    qemu_mutex_lock(&backup_state.stat.lock);
> +    backup_state.stat.finishing = true;
> +    qemu_mutex_unlock(&backup_state.stat.lock);
> +
> +    while (l) {
> +        PVEBackupDevInfo *di = (PVEBackupDevInfo *)l->data;
> +        l = g_list_next(l);
> +
> +        if (di->fleecing.snapshot_access) {
> +            bdrv_unref(di->fleecing.snapshot_access);
> +            di->fleecing.snapshot_access = NULL;
> +        }
> +        if (di->fleecing.cbw) {
> +            bdrv_cbw_drop(di->fleecing.cbw);
> +            di->fleecing.cbw = NULL;
> +        }
> +
> +        g_free(di->device_name);
> +        di->device_name = NULL;
> +
> +        g_free(di);
> +    }
> +    g_list_free(backup_state.di_list);
> +    backup_state.di_list = NULL;
> +
> +    qemu_mutex_lock(&backup_state.stat.lock);
> +    backup_state.stat.end_time = time(NULL);
> +    backup_state.stat.finishing = false;
> +    qemu_mutex_unlock(&backup_state.stat.lock);
> +}
> +
> +// Not done in a coroutine, because bdrv_co_unref() and cbw_drop() would just spawn BHs anyways.
> +// Caller needs to hold the backup_state.backup_mutex lock
> +static void backup_access_teardown_bh(void *opaque)
> +{
> +    CoCtxData *data = (CoCtxData*)opaque;
> +
> +    backup_access_teardown();
> +
> +    /* return */
> +    aio_co_enter(data->ctx, data->co);
> +}
> +
> +void coroutine_fn qmp_backup_access_teardown(const char *target_id, Error **errp)
> +{
> +    assert(qemu_in_coroutine());
> +
> +    qemu_co_mutex_lock(&backup_state.backup_mutex);
> +
> +    if (!backup_state.target_id) { // nothing to do
> +        qemu_co_mutex_unlock(&backup_state.backup_mutex);
> +        return;
> +    }
> +
> +    /*
> +     * Continue with target_id == NULL, used by the callback registered for qemu_cleanup()
> +     */
> +    if (target_id && strcmp(backup_state.target_id, target_id)) {
> +        error_setg(errp, "cannot teardown backup access - got target %s instead of %s",
> +                   target_id, backup_state.target_id);
> +        qemu_co_mutex_unlock(&backup_state.backup_mutex);
> +        return;
> +    }
> +
> +    if (!strcmp(backup_state.target_id, "Proxmox VE")) {
> +        error_setg(errp, "cannot teardown backup access for PVE - use backup-cancel instead");
> +        qemu_co_mutex_unlock(&backup_state.backup_mutex);
> +        return;
> +    }
> +
> +    CoCtxData waker = {
> +        .co = qemu_coroutine_self(),
> +        .ctx = qemu_get_current_aio_context(),
> +    };
> +    aio_bh_schedule_oneshot(waker.ctx, backup_access_teardown_bh, &waker);
> +    qemu_coroutine_yield();
> +
> +    qemu_co_mutex_unlock(&backup_state.backup_mutex);
> +    return;
> +}
> +
>  UuidInfo coroutine_fn *qmp_backup(
>      const char *backup_file,
>      const char *password,
> @@ -1267,5 +1549,6 @@ ProxmoxSupportStatus *qmp_query_proxmox_support(Error **errp)
>      ret->pbs_masterkey = true;
>      ret->backup_max_workers = true;
>      ret->backup_fleecing = true;
> +    ret->backup_access_api = true;
>      return ret;
>  }
> diff --git a/pve-backup.h b/pve-backup.h
> new file mode 100644
> index 0000000000..4033bc848f
> --- /dev/null
> +++ b/pve-backup.h
> @@ -0,0 +1,16 @@
> +/*
> + * Backup code used by Proxmox VE
> + *
> + * Copyright (C) Proxmox Server Solutions
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef PVE_BACKUP_H
> +#define PVE_BACKUP_H
> +
> +void backup_access_teardown(void);
> +
> +#endif /* PVE_BACKUP_H */
> diff --git a/qapi/block-core.json b/qapi/block-core.json
> index c581f1f238..3f092221ce 100644
> --- a/qapi/block-core.json
> +++ b/qapi/block-core.json
> @@ -1019,6 +1019,9 @@
>  #
>  # @pbs-library-version: Running version of libproxmox-backup-qemu0 library.
>  #
> +# @backup-access-api: Whether backup access API for external providers is
> +#     supported or not.
> +#
>  # @backup-fleecing: Whether backup fleecing is supported or not.
>  #
>  # @backup-max-workers: Whether the 'max-workers' @BackupPerf setting is
> @@ -1032,6 +1035,7 @@
>              'pbs-dirty-bitmap-migration': 'bool',
>              'pbs-masterkey': 'bool',
>              'pbs-library-version': 'str',
> +            'backup-access-api': 'bool',
>              'backup-fleecing': 'bool',
>              'backup-max-workers': 'bool' } }
>  
> @@ -1098,6 +1102,51 @@
>  ##
>  { 'command': 'query-pbs-bitmap-info', 'returns': ['PBSBitmapInfo'] }
>  
> +##
> +# @BackupAccessInfo:
> +#
> +# Info associated to a snapshot access for backup.  For more information about
> +# the bitmap see @BackupAccessBitmapMode.
> +#
> +# @node-name: the block node name of the snapshot-access node.
> +#
> +# @device: the device on top of which the snapshot access was created.
> +#
> +# @size: the size of the block device in bytes.
> +#
> +##
> +{ 'struct': 'BackupAccessInfo',
> +  'data': { 'node-name': 'str', 'device': 'str', 'size': 'size' } }
> +
> +##
> +# @backup-access-setup:
> +#
> +# Set up snapshot access to VM drives for an external backup provider.  No other
> +# backup or backup access can be done before tearing down the backup access.
> +#
> +# @target-id: the unique ID of the backup target.
> +#
> +# @devlist: list of block device names (separated by ',', ';' or ':'). By
> +#     default the backup includes all writable block devices.
> +#
> +# Returns: a list of @BackupAccessInfo, one for each device.
> +#
> +##
> +{ 'command': 'backup-access-setup',
> +  'data': { 'target-id': 'str', '*devlist': 'str' },
> +  'returns': [ 'BackupAccessInfo' ], 'coroutine': true }
> +
> +##
> +# @backup-access-teardown:
> +#
> +# Tear down previously setup snapshot access for the same target.
> +#
> +# @target-id: the ID of the backup target.
> +#
> +##
> +{ 'command': 'backup-access-teardown', 'data': { 'target-id': 'str' },
> +  'coroutine': true }
> +
>  ##
>  # @BlockDeviceTimedStats:
>  #
> diff --git a/system/runstate.c b/system/runstate.c
> index c2c9afa905..6f93d7c2fb 100644
> --- a/system/runstate.c
> +++ b/system/runstate.c
> @@ -60,6 +60,7 @@
>  #include "sysemu/sysemu.h"
>  #include "sysemu/tpm.h"
>  #include "trace.h"
> +#include "pve-backup.h"
>  
>  static NotifierList exit_notifiers =
>      NOTIFIER_LIST_INITIALIZER(exit_notifiers);
> @@ -920,6 +921,11 @@ void qemu_cleanup(int status)
>       * requests happening from here on anyway.
>       */
>      bdrv_drain_all_begin();
> +    /*
> +     * The backup access is set up by a QMP command, but is neither owned by a monitor nor
> +     * associated to a BlockBackend. Need to tear it down manually here.
> +     */
> +    backup_access_teardown();
>      job_cancel_sync_all();
>      bdrv_close_all();
>  
> -- 
> 2.39.5




More information about the pve-devel mailing list