[pve-devel] [RFC qemu-server v3 21/34] backup: implement backup for external providers
Fabian Grünbichler
f.gruenbichler at proxmox.com
Tue Nov 12 13:27:34 CET 2024
some nits/comments/questions below, but the general direction/structure
already looks quite good I think!
On November 7, 2024 5:51 pm, Fiona Ebner wrote:
> The state of the VM's disk images at the time the backup is started is
> preserved via a snapshot-access block node. Old data is moved to the
> fleecing image when new guest writes come in. The snapshot-access
> block node, as well as the associated bitmap in case of incremental
> backup, will be made available to the external provider. They are
> exported via NBD and for 'nbd' mechanism, the NBD socket path is
> passed to the provider, while for 'block-device' mechanism, the NBD
> export is made accessible as a regular block device first and the
> bitmap information is made available via a $next_dirty_region->()
> function. For 'block-device', the 'nbdinfo' binary is required.
>
> The provider can indicate that it wants to do an incremental backup by
> returning the bitmap ID that was used for a previous backup and it
> will then be told if the bitmap was newly created (either first backup
> or old bitmap was invalid) or if the bitmap can be reused.
>
> The provider then reads the parts of the NBD or block device it needs,
> either the full disk for full backup, or the dirty parts according to
> the bitmap for incremental backup. The bitmap has to be respected,
> reads to other parts of the image will return an error. After backing
> up each part of the disk, it should be discarded in the export to
> avoid unnecessary space usage in the fleecing image (requires the
> storage underlying the fleecing image to support discard too).
>
> Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
> ---
>
> Changes in v3:
> * adapt to API changes, config files are now passed as raw
>
> PVE/VZDump/QemuServer.pm | 309 ++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 308 insertions(+), 1 deletion(-)
>
> diff --git a/PVE/VZDump/QemuServer.pm b/PVE/VZDump/QemuServer.pm
> index b6dcd6cc..d0218c9b 100644
> --- a/PVE/VZDump/QemuServer.pm
> +++ b/PVE/VZDump/QemuServer.pm
> @@ -20,7 +20,7 @@ use PVE::QMPClient;
> use PVE::Storage::Plugin;
> use PVE::Storage::PBSPlugin;
> use PVE::Storage;
> -use PVE::Tools;
> +use PVE::Tools qw(run_command);
> use PVE::VZDump;
> use PVE::Format qw(render_duration render_bytes);
>
> @@ -277,6 +277,8 @@ sub archive {
>
> if ($self->{vzdump}->{opts}->{pbs}) {
> $self->archive_pbs($task, $vmid);
> + } elsif ($self->{vzdump}->{'backup-provider'}) {
> + $self->archive_external($task, $vmid);
> } else {
> $self->archive_vma($task, $vmid, $filename, $comp);
> }
> @@ -1149,6 +1151,23 @@ sub cleanup {
>
> # If VM was started only for backup, it is already stopped now.
> if (PVE::QemuServer::Helpers::vm_running_locally($vmid)) {
> + if ($task->{cleanup}->{'nbd-stop'}) {
> + eval { PVE::QemuServer::QMPHelpers::nbd_stop($vmid); };
> + $self->logerr($@) if $@;
> + }
> +
> + if (my $info = $task->{cleanup}->{'backup-access-teardown'}) {
> + my $params = {
> + 'target-id' => $info->{'target-id'},
> + timeout => 60,
> + success => $info->{success} ? JSON::true : JSON::false,
> + };
> +
> + $self->loginfo("tearing down backup-access");
> + eval { mon_cmd($vmid, "backup-access-teardown", $params->%*) };
> + $self->logerr($@) if $@;
> + }
> +
> $detach_tpmstate_drive->($task, $vmid);
> detach_fleecing_images($task->{disks}, $vmid) if $task->{'use-fleecing'};
> }
> @@ -1160,4 +1179,292 @@ sub cleanup {
> }
> }
>
> +my sub block_device_backup_cleanup {
> + my ($self, $paths, $cpids) = @_;
> +
> + for my $path ($paths->@*) {
> + eval { run_command(["qemu-nbd", "-d", $path ]); };
> + $self->log('warn', "unable to disconnect NBD backup source '$path' - $@") if $@;
> + }
> +
> + my $waited;
> + my $wait_limit = 5;
> + for ($waited = 0; $waited < $wait_limit && scalar(keys $cpids->%*); $waited++) {
> + while ((my $cpid = waitpid(-1, POSIX::WNOHANG)) > 0) {
> + delete($cpids->{$cpid});
> + }
> + if ($waited == 0) {
> + kill 15, $_ for keys $cpids->%*;
> + }
> + sleep 1;
> + }
> + if ($waited == $wait_limit && scalar(keys $cpids->%*)) {
> + kill 9, $_ for keys $cpids->%*;
> + sleep 1;
> + while ((my $cpid = waitpid(-1, POSIX::WNOHANG)) > 0) {
this could be a bit dangerous. since we have an explicit list of cpids
we want to wait for, couldn't we just waitpid for them explicitly?
just wary of potential side-effects on things like hookscripts or future
features that also require forking ;)
> + delete($cpids->{$cpid});
> + }
> + $self->log('warn', "unable to collect nbdinfo child process '$_'") for keys $cpids->%*;
> + }
> +}
> +
> +my sub block_device_backup_prepare {
> + my ($self, $devicename, $size, $nbd_path, $bitmap_name, $count) = @_;
nit: $device_name for consistency's sake?
> +
> + my $nbd_info_uri = "nbd+unix:///${devicename}?socket=${nbd_path}";
> + my $qemu_nbd_uri = "nbd:unix:${nbd_path}:exportname=${devicename}";
> +
> + my $cpid;
> + my $error_fh;
> + my $next_dirty_region;
> +
> + # If there is no dirty bitmap, it can be treated as if there's a full dirty one. The output of
> + # nbdinfo is a list of tuples with offset, length, type, description. The first bit of 'type' is
> + # set when the bitmap is dirty, see QEMU's docs/interop/nbd.txt
> + my $dirty_bitmap = [];
> + if ($bitmap_name) {
> + my $input = IO::File->new();
> + my $info = IO::File->new();
> + $error_fh = IO::File->new();
> + my $nbdinfo_cmd = ["nbdinfo", $nbd_info_uri, "--map=qemu:dirty-bitmap:${bitmap_name}"];
> + $cpid = open3($input, $info, $error_fh, $nbdinfo_cmd->@*)
> + or die "failed to spawn nbdinfo child - $!\n";
> +
> + $next_dirty_region = sub {
> + my ($offset, $length, $type);
> + do {
> + my $line = <$info>;
> + return if !$line;
> + die "unexpected output from nbdinfo - $line\n"
> + if $line !~ m/^\s*(\d+)\s*(\d+)\s*(\d+)/; # also untaints
> + ($offset, $length, $type) = ($1, $2, $3);
> + } while (($type & 0x1) == 0); # not dirty
> + return ($offset, $length);
> + };
> + } else {
> + my $done = 0;
> + $next_dirty_region = sub {
> + return if $done;
> + $done = 1;
> + return (0, $size);
> + };
> + }
> +
> + my $blockdev = "/dev/nbd${count}";
what if that is already used/taken by somebody? I think we'd need logic
to find a free slot here..
> +
> + eval {
> + run_command(["qemu-nbd", "-c", $blockdev, $qemu_nbd_uri, "--format=raw", "--discard=on"]);
> + };
> + if (my $err = $@) {
> + my $cpids = {};
> + $cpids->{$cpid} = 1 if $cpid;
> + block_device_backup_cleanup($self, [$blockdev], $cpids);
> + die $err;
> + }
> +
> + return ($blockdev, $next_dirty_region, $cpid);
> +}
> +
> +my sub backup_access_to_volume_info {
> + my ($self, $backup_access_info, $mechanism, $nbd_path) = @_;
> +
> + my $child_pids = {}; # used for nbdinfo calls
> + my $count = 0; # counter for block devices, i.e. /dev/nbd${count}
> + my $volumes = {};
> +
> + for my $info ($backup_access_info->@*) {
> + my $bitmap_status = 'none';
> + my $bitmap_name;
> + if (my $bitmap_action = $info->{'bitmap-action'}) {
> + my $bitmap_action_to_status = {
> + 'not-used' => 'none',
> + 'not-used-removed' => 'none',
> + 'new' => 'new',
> + 'used' => 'reuse',
> + 'invalid' => 'new',
> + };
nit: should we move this outside of the loop? it's a static map after
all.. (or maybe the perl interpreter is smart enough anyway ;))
> +
> + $bitmap_status = $bitmap_action_to_status->{$bitmap_action}
> + or die "got unexpected bitmap action '$bitmap_action'\n";
> +
> + $bitmap_name = $info->{'bitmap-name'} or die "bitmap-name is not present\n";
> + }
> +
> + my ($device, $size) = $info->@{qw(device size)};
> +
> + $volumes->{$device}->{'bitmap-mode'} = $bitmap_status;
> + $volumes->{$device}->{size} = $size;
> +
> + if ($mechanism eq 'block-device') {
> + my ($blockdev, $next_dirty_region, $child_pid) = block_device_backup_prepare(
> + $self, $device, $size, $nbd_path, $bitmap_name, $count);
> + $count++;
> + $child_pids->{$child_pid} = 1 if $child_pid;
> + $volumes->{$device}->{path} = $blockdev;
> + $volumes->{$device}->{'next-dirty-region'} = $next_dirty_region;
> + } elsif ($mechanism eq 'nbd') {
> + $volumes->{$device}->{'nbd-path'} = $nbd_path;
> + $volumes->{$device}->{'bitmap-name'} = $bitmap_name;
> + } else {
> + die "internal error - unkown mechanism '$mechanism'";
> + }
> + }
> +
> + return ($volumes, $child_pids);
> +}
> +
> +sub archive_external {
> + my ($self, $task, $vmid) = @_;
> +
> + my $guest_config = PVE::Tools::file_get_contents("$task->{tmpdir}/qemu-server.conf");
> + my $firewall_file = "$task->{tmpdir}/qemu-server.fw";
> +
> + my $opts = $self->{vzdump}->{opts};
> +
> + my $backup_provider = $self->{vzdump}->{'backup-provider'};
> +
> + $self->loginfo("starting external backup via " . $backup_provider->provider_name());
> +
> + my $starttime = time();
> +
> + # get list early so we die on unkown drive types before doing anything
> + my $devlist = _get_task_devlist($task);
> +
> + $self->enforce_vm_running_for_backup($vmid);
> + $self->{qmeventd_fh} = PVE::QemuServer::register_qmeventd_handle($vmid);
> +
> + eval {
> + $SIG{INT} = $SIG{TERM} = $SIG{QUIT} = $SIG{HUP} = $SIG{PIPE} = sub {
> + die "interrupted by signal\n";
> + };
> +
> + my $qemu_support = mon_cmd($vmid, "query-proxmox-support");
> +
> + $attach_tpmstate_drive->($self, $task, $vmid);
> +
> + my $is_template = PVE::QemuConfig->is_template($self->{vmlist}->{$vmid});
> +
> + my $fleecing = check_and_prepare_fleecing(
> + $self, $task, $vmid, $opts->{fleecing}, $task->{disks}, $is_template, $qemu_support, 1);
> + die "cannot setup backup access without fleecing\n" if !$fleecing;
> +
> + $task->{'use-fleecing'} = 1;
> +
> + my $fs_frozen = $self->qga_fs_freeze($task, $vmid);
should we move this (A)
> +
> + my $target_id = $opts->{storage};
> +
> + my $params = {
> + 'target-id' => $target_id,
> + devlist => $devlist,
> + timeout => 60,
> + };
and this (B)
> +
> + my ($mechanism, $bitmap_name) = $backup_provider->backup_get_mechanism($vmid, 'qemu');
> + die "mechanism '$mechanism' requested by backup provider is not supported for VMs\n"
> + if $mechanism ne 'block-device' && $mechanism ne 'nbd';
> +
> + if ($mechanism eq 'block-device') {
> + # For mechanism 'block-device' the bitmap needs to be passed to the provider. The bitmap
> + # cannot be dumped via QMP and doing it via qemu-img is experimental, so use nbdinfo.
> + die "need 'nbdinfo' binary from package libnbd-bin\n" if !-e "/usr/bin/nbdinfo";
> +
> + # NOTE nbds_max won't change if module is already loaded
> + run_command(["modprobe", "nbd", "nbds_max=128"]);
should this maybe be put into a modprobe snippet somewhere, and we just
verify here that nbd is available? not that we can currently reach 128
guest disks ;)
> + }
down here (B)
> +
> + if ($bitmap_name) {
> + # prepend storage ID so different providers can never cause clashes
> + $bitmap_name = "$opts->{storage}-" . $bitmap_name;
> + $params->{'bitmap-name'} = $bitmap_name;
not related to this patch directly - if we do this for external
providers, do we also want to do it for different PBS targets maybe? :)
> + }
> +
> + $self->loginfo("setting up snapshot-access for backup");
> +
and down here (A)?
> + my $backup_access_info = eval { mon_cmd($vmid, "backup-access-setup", $params->%*) };
> + my $qmperr = $@;
> +
> + $task->{cleanup}->{'backup-access-teardown'} = { 'target-id' => $target_id, success => 0 };
should we differentiate here between setup success and failure? if not,
should we move it directly before the setup call?
> +
> + if ($fs_frozen) {
> + $self->qga_fs_thaw($vmid);
> + }
> +
> + die $qmperr if $qmperr;
> +
> + $self->resume_vm_after_job_start($task, $vmid);
> +
> + my $bitmap_info = mon_cmd($vmid, 'query-pbs-bitmap-info');
> + for my $info (sort { $a->{drive} cmp $b->{drive} } $bitmap_info->@*) {
> + my $text = $bitmap_action_to_human->($self, $info);
> + my $drive = $info->{drive};
> + $drive =~ s/^drive-//; # for consistency
> + $self->loginfo("$drive: dirty-bitmap status: $text");
> + }
> +
> + $self->loginfo("starting NBD server");
> +
> + my $nbd_path = "/run/qemu-server/$vmid\_nbd.backup_access";
> + mon_cmd(
> + $vmid, "nbd-server-start", addr => { type => 'unix', data => { path => $nbd_path } } );
> + $task->{cleanup}->{'nbd-stop'} = 1;
> +
> + for my $info ($backup_access_info->@*) {
> + $self->loginfo("adding NBD export for $info->{device}");
> +
> + my $export_params = {
> + id => $info->{device},
> + 'node-name' => $info->{'node-name'},
> + writable => JSON::true, # for discard
> + type => "nbd",
> + name => $info->{device}, # NBD export name
> + };
> +
> + if ($info->{'bitmap-name'}) {
> + $export_params->{bitmaps} = [{
> + node => $info->{'bitmap-node-name'},
> + name => $info->{'bitmap-name'},
> + }],
> + }
> +
> + mon_cmd($vmid, "block-export-add", $export_params->%*);
> + }
> +
> + my $child_pids = {}; # used for nbdinfo calls
> + my $volumes = {};
> +
> + eval {
> + ($volumes, $child_pids) =
> + backup_access_to_volume_info($self, $backup_access_info, $mechanism, $nbd_path);
so this here forks child processes (via block_device_backup_prepare),
but it might fail halfway through after having forked X/N children; in
that case we don't have any information about the forked processes here (C)
> +
> + my $param = {};
> + $param->{'bandwidth-limit'} = $opts->{bwlimit} * 1024 if $opts->{bwlimit};
> + $param->{'firewall-config'} = PVE::Tools::file_get_contents($firewall_file)
> + if -e $firewall_file;
> +
> + $backup_provider->backup_vm($vmid, $guest_config, $volumes, $param);
> + };
> + my $err = $@;
> +
> + if ($mechanism eq 'block-device') {
> + my $cleanup_paths = [map { $volumes->{$_}->{path} } keys $volumes->%*];
> + block_device_backup_cleanup($self, $cleanup_paths, $child_pids)
C: to do this cleanup here.. should we maybe record both cpids and
volumes as part of $self->{cleanup}, instead of returning them, so that
we can handle that case as well?
> + }
> +
> + die $err if $err;
> + };
> + my $err = $@;
> +
> + if ($err) {
> + $self->logerr($err);
> + $self->resume_vm_after_job_start($task, $vmid);
> + } else {
> + $task->{size} = $backup_provider->backup_get_task_size($vmid);
> + $task->{cleanup}->{'backup-access-teardown'}->{success} = 1;
> + }
> + $self->restore_vm_power_state($vmid);
> +
> + die $err if $err;
> +}
> +
> 1;
> --
> 2.39.5
>
>
>
> _______________________________________________
> pve-devel mailing list
> pve-devel at lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
>
>
>
More information about the pve-devel
mailing list