[pve-devel] [PATCH v4 pve-storage 1/5] qcow2: add external snapshot support
Fabian Grünbichler
f.gruenbichler at proxmox.com
Tue Apr 1 15:50:37 CEST 2025
> Alexandre Derumier via pve-devel <pve-devel at lists.proxmox.com> hat am 11.03.2025 11:28 CET geschrieben:
some sort of description here would be great ;)
> ---
> src/PVE/Storage.pm | 4 +-
> src/PVE/Storage/DirPlugin.pm | 1 +
> src/PVE/Storage/Plugin.pm | 232 +++++++++++++++++++++++++++++------
> 3 files changed, 196 insertions(+), 41 deletions(-)
>
> diff --git a/src/PVE/Storage.pm b/src/PVE/Storage.pm
> index 3b4f041..79e5c3a 100755
> --- a/src/PVE/Storage.pm
> +++ b/src/PVE/Storage.pm
> @@ -1002,7 +1002,7 @@ sub unmap_volume {
> }
>
> sub vdisk_alloc {
> - my ($cfg, $storeid, $vmid, $fmt, $name, $size) = @_;
> + my ($cfg, $storeid, $vmid, $fmt, $name, $size, $backing) = @_;
>
> die "no storage ID specified\n" if !$storeid;
>
> @@ -1025,7 +1025,7 @@ sub vdisk_alloc {
> # lock shared storage
> return $plugin->cluster_lock_storage($storeid, $scfg->{shared}, undef, sub {
> my $old_umask = umask(umask|0037);
> - my $volname = eval { $plugin->alloc_image($storeid, $scfg, $vmid, $fmt, $name, $size) };
> + my $volname = eval { $plugin->alloc_image($storeid, $scfg, $vmid, $fmt, $name, $size, $backing) };
> my $err = $@;
> umask $old_umask;
> die $err if $err;
> diff --git a/src/PVE/Storage/DirPlugin.pm b/src/PVE/Storage/DirPlugin.pm
> index fb23e0a..1cd7ac3 100644
> --- a/src/PVE/Storage/DirPlugin.pm
> +++ b/src/PVE/Storage/DirPlugin.pm
> @@ -81,6 +81,7 @@ sub options {
> is_mountpoint => { optional => 1 },
> bwlimit => { optional => 1 },
> preallocation => { optional => 1 },
> + snapext => { optional => 1 },
> };
> }
>
> diff --git a/src/PVE/Storage/Plugin.pm b/src/PVE/Storage/Plugin.pm
> index 65cf43f..d7f485f 100644
> --- a/src/PVE/Storage/Plugin.pm
> +++ b/src/PVE/Storage/Plugin.pm
> @@ -216,6 +216,11 @@ my $defaultData = {
> maximum => 65535,
> optional => 1,
> },
> + 'snapext' => {
> + type => 'boolean',
> + description => 'enable external snapshot.',
> + optional => 1,
> + },
> },
> };
>
> @@ -716,7 +721,11 @@ sub filesystem_path {
>
> my $dir = $class->get_subdir($scfg, $vtype);
>
> - $dir .= "/$vmid" if $vtype eq 'images';
> + if ($scfg->{snapext} && $snapname) {
> + $name = $class->get_snap_volname($volname, $snapname);
> + } else {
> + $dir .= "/$vmid" if $vtype eq 'images';
> + }
this is a bit weird, as it mixes volnames (with the `$vmid/` prefix) and names (without), it's only called twice in this patch, and this here already has $volname parsed, so could we maybe let get_snap_volname take and return the $name part without the dir?
>
> my $path = "$dir/$name";
>
> @@ -873,7 +882,7 @@ sub clone_image {
> }
>
> sub alloc_image {
> - my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size) = @_;
> + my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size, $backing) = @_;
this extends the storage API, so it should actually do that.. and probably $backing should not be an arbitrary path, but something that is resolved locally?
>
> my $imagedir = $class->get_subdir($scfg, 'images');
> $imagedir .= "/$vmid";
> @@ -901,17 +910,11 @@ sub alloc_image {
> umask $old_umask;
> die $err if $err;
> } else {
> - my $cmd = ['/usr/bin/qemu-img', 'create'];
> -
> - my $prealloc_opt = preallocation_cmd_option($scfg, $fmt);
> - push @$cmd, '-o', $prealloc_opt if defined($prealloc_opt);
>
> - push @$cmd, '-f', $fmt, $path, "${size}K";
> -
> - eval { run_command($cmd, errmsg => "unable to create image"); };
> + eval { qemu_img_create($scfg, $fmt, $size, $path, $backing) };
> if ($@) {
> unlink $path;
> - rmdir $imagedir;
> + rmdir $imagedir if !$backing;
don't think this is needed, rmdir will fail if the dir isn't empty anyway..
> die "$@";
> }
> }
> @@ -955,6 +958,50 @@ sub free_image {
> # TODO taken from PVE/QemuServer/Drive.pm, avoiding duplication would be nice
> my @checked_qemu_img_formats = qw(raw cow qcow qcow2 qed vmdk cloop);
>
> +sub qemu_img_create {
> + my ($scfg, $fmt, $size, $path, $backing) = @_;
> +
> + my $cmd = ['/usr/bin/qemu-img', 'create'];
> +
> + my $options = [];
> +
> + if($backing) {
> + push @$cmd, '-b', $backing, '-F', 'qcow2';
> + push @$options, 'extended_l2=on','cluster_size=128k';
> + };
> + push @$options, preallocation_cmd_option($scfg, $fmt);
> + push @$cmd, '-o', join(',', @$options) if @$options > 0;
> + push @$cmd, '-f', $fmt, $path;
> + push @$cmd, "${size}K" if !$backing;
is this because it will automatically take the size from the backing image?
> +
> + run_command($cmd, errmsg => "unable to create image");
> +}
> +
> +sub qemu_img_info {
> + my ($filename, $file_format, $timeout, $follow_backing_files) = @_;
> +
> + my $cmd = ['/usr/bin/qemu-img', 'info', '--output=json', $filename];
> + push $cmd->@*, '-f', $file_format if $file_format;
> + push $cmd->@*, '--backing-chain' if $follow_backing_files;
> +
> + my $json = '';
> + my $err_output = '';
> + eval {
> + run_command($cmd,
> + timeout => $timeout,
> + outfunc => sub { $json .= shift },
> + errfunc => sub { $err_output .= shift . "\n"},
> + );
> + };
> + warn $@ if $@;
> + if ($err_output) {
> + # if qemu did not output anything to stdout we die with stderr as an error
> + die $err_output if !$json;
> + # otherwise we warn about it and try to parse the json
> + warn $err_output;
> + }
> + return $json;
> +}
> # set $untrusted if the file in question might be malicious since it isn't
> # created by our stack
> # this makes certain checks fatal, and adds extra checks for known problems like
> @@ -1018,25 +1065,9 @@ sub file_size_info {
> warn "file_size_info: '$filename': falling back to 'raw' from unknown format '$file_format'\n";
> $file_format = 'raw';
> }
> - my $cmd = ['/usr/bin/qemu-img', 'info', '--output=json', $filename];
> - push $cmd->@*, '-f', $file_format if $file_format;
>
> - my $json = '';
> - my $err_output = '';
> - eval {
> - run_command($cmd,
> - timeout => $timeout,
> - outfunc => sub { $json .= shift },
> - errfunc => sub { $err_output .= shift . "\n"},
> - );
> - };
> - warn $@ if $@;
> - if ($err_output) {
> - # if qemu did not output anything to stdout we die with stderr as an error
> - die $err_output if !$json;
> - # otherwise we warn about it and try to parse the json
> - warn $err_output;
> - }
> + my $json = qemu_img_info($filename, $file_format, $timeout);
> +
> if (!$json) {
> die "failed to query file information with qemu-img\n" if $untrusted;
> # skip decoding if there was no output, e.g. if there was a timeout.
> @@ -1162,11 +1193,29 @@ sub volume_snapshot {
>
> die "can't snapshot this image format\n" if $volname !~ m/\.(qcow2|qed)$/;
>
> - my $path = $class->filesystem_path($scfg, $volname);
> + if($scfg->{snapext}) {
> +
> + my $path = $class->path($scfg, $volname, $storeid);
> + my $snappath = $class->path($scfg, $volname, $storeid, $snap);
> + #rename current volume to snap volume
> + die "snapshot volume $snappath already exist\n" if -e $snappath;
> + rename($path, $snappath) if -e $path;
this is still looking weird.. I don't think it makes sense interface wise to allow snapshotting a volume that doesn't even exist..
> +
> + my ($vtype, $name, $vmid, undef, undef, $isBase, $format) =
> + $class->parse_volname($volname);
> +
> + $class->alloc_image($storeid, $scfg, $vmid, 'qcow2', $name, undef, $snappath);
> + if ($@) {
> + eval { $class->free_image($storeid, $scfg, $volname, 0) };
> + warn $@ if $@;
missing cleanup - this should undo the rename from above
> + }
>
> - my $cmd = ['/usr/bin/qemu-img', 'snapshot','-c', $snap, $path];
> + } else {
>
> - run_command($cmd);
> + my $path = $class->filesystem_path($scfg, $volname);
> + my $cmd = ['/usr/bin/qemu-img', 'snapshot','-c', $snap, $path];
> + run_command($cmd);
> + }
>
> return undef;
> }
> @@ -1177,6 +1226,21 @@ sub volume_snapshot {
> sub volume_rollback_is_possible {
> my ($class, $scfg, $storeid, $volname, $snap, $blockers) = @_;
>
> + if ($scfg->{snapext}) {
> + #technically, we could manage multibranch, we it need lot more work for snapshot delete
> + #we need to implemente block-stream from deleted snapshot to all others child branchs
> + #when online, we need to do a transaction for multiple disk when delete the last snapshot
> + #and need to merge in current running file
> +
> + my $snappath = $class->path($scfg, $volname, $storeid, $snap);
> + my $snapshots = $class->volume_snapshot_info($scfg, $storeid, $volname);
> + my $parentsnap = $snapshots->{current}->{parent};
wouldn't it be enough to check that this equals $snap?
> +
> + return 1 if $snapshots->{$parentsnap}->{file} eq $snappath;
> +
> + die "can't rollback, '$snap' is not most recent snapshot on '$volname'\n";
> + }
> +
> return 1;
> }
>
> @@ -1187,9 +1251,15 @@ sub volume_snapshot_rollback {
>
> my $path = $class->filesystem_path($scfg, $volname);
>
> - my $cmd = ['/usr/bin/qemu-img', 'snapshot','-a', $snap, $path];
> -
> - run_command($cmd);
> + if ($scfg->{snapext}) {
> + #simply delete the current snapshot and recreate it
> + my $path = $class->filesystem_path($scfg, $volname);
> + unlink($path);
> + $class->volume_snapshot($scfg, $storeid, $volname, $snap);
instead of volume_snapshot, this could simply call alloc_image with the backing file? then volume_snapshot could always rename and always cleanup properly..
> + } else {
> + my $cmd = ['/usr/bin/qemu-img', 'snapshot','-a', $snap, $path];
> + run_command($cmd);
> + }
>
> return undef;
> }
> @@ -1201,13 +1271,49 @@ sub volume_snapshot_delete {
>
> return 1 if $running;
>
> + my $cmd = "";
> my $path = $class->filesystem_path($scfg, $volname);
>
> - $class->deactivate_volume($storeid, $scfg, $volname, $snap, {});
> + if ($scfg->{snapext}) {
> +
> + my $snapshots = $class->volume_snapshot_info($scfg, $storeid, $volname);
> + my $snappath = $snapshots->{$snap}->{file};
> + die "volume $snappath is missing" if !-e $snappath;
>
> - my $cmd = ['/usr/bin/qemu-img', 'snapshot','-d', $snap, $path];
> + my $parentsnap = $snapshots->{$snap}->{parent};
> + my $childsnap = $snapshots->{$snap}->{child};
>
> - run_command($cmd);
> + my $parentpath = $snapshots->{$parentsnap}->{file} if $parentsnap;
> + my $childpath = $snapshots->{$childsnap}->{file} if $childsnap;
my $foo = .. if ...;
is forbidden in our code ;) but I think we always need to have a childsnap anyway, right?
so we could simply check for that, and then switch around the two branches below so that one of them can do
if (my $parentsnap = ...) {
...
} else {
...
}
> +
> + #if first snapshot,as it should be bigger, we merge child, and rename the snapshot to child
> + if(!$parentsnap) {
> + print"commit $childpath\n";
> + $cmd = ['/usr/bin/qemu-img', 'commit', $childpath];
we could provide `-d` here to skip emptying $childpath since we renamed over it anyway below..
> + eval { run_command($cmd) };
> + if ($@) {
> + die "error commiting $childpath to $parentpath; $@\n";
this is wrong, there is no $parentpath.. we are committing into $snappath
> + }
> + print"rename $snappath to $childpath\n";
> + rename($snappath, $childpath);
what if this fails?
> + } else {
> + #we rebase the child image on the parent as new backing image
should we extend this to make it clear what this means? it means copying any parts of $snap that are not in $parent and not yet overwritten by $child into $child, right?
so how expensive this is depends on:
- how many changes are between $parent and $snap (increases cost)
- how many of those are overwritten by changes between $snap and $child (decreases cost)
> + die "missing parentsnap snapshot to rebase child $childpath\n" if !$parentpath;
how can this happen? if there is a parentsnap there must be a parentpath as well?
> + $cmd = ['/usr/bin/qemu-img', 'rebase', '-b', $parentpath, '-F', 'qcow2', '-f', 'qcow2', $childpath];
> + eval { run_command($cmd) };
> + if ($@) {
> + die "error rebase $childpath from $parentpath; $@\n";
> + }
> + #delete the snapshot
> + unlink($snappath);
> + }
> +
> + } else {
> + $class->deactivate_volume($storeid, $scfg, $volname, $snap, {});
> +
> + $cmd = ['/usr/bin/qemu-img', 'snapshot','-d', $snap, $path];
> + run_command($cmd);
> + }
>
> return undef;
> }
> @@ -1246,7 +1352,7 @@ sub volume_has_feature {
> current => { qcow2 => 1, raw => 1, vmdk => 1 },
> },
> rename => {
> - current => {qcow2 => 1, raw => 1, vmdk => 1},
> + current => { qcow2 => 1, raw => 1, vmdk => 1},
> },
> };
>
> @@ -1481,7 +1587,37 @@ sub status {
> sub volume_snapshot_info {
> my ($class, $scfg, $storeid, $volname) = @_;
>
> - die "volume_snapshot_info is not implemented for $class";
> + my $path = $class->filesystem_path($scfg, $volname);
> +
> + my $backing_chain = 1;
> + my $json = qemu_img_info($path, undef, 10, $backing_chain);
> + die "failed to query file information with qemu-img\n" if !$json;
> + my $snapshots = eval { decode_json($json) };
missing error handlign for json decoding..
> +
> + my $info = {};
> + my $order = 0;
> + for my $snap (@$snapshots) {
> +
> + my $snapfile = $snap->{filename};
> + my $snapname = parse_snapname($snapfile);
> + $snapname = 'current' if !$snapname;
> + my $snapvolname = $class->get_snap_volname($volname, $snapname);
> +
> + $info->{$snapname}->{order} = $order;
> + $info->{$snapname}->{file}= $snapfile;
> + $info->{$snapname}->{volname} = $snapvolname;
> + $info->{$snapname}->{volid} = "$storeid:$snapvolname";
> + $info->{$snapname}->{ext} = 1;
> +
> + my $parentfile = $snap->{'backing-filename'};
> + if ($parentfile) {
> + my $parentname = parse_snapname($parentfile);
> + $info->{$snapname}->{parent} = $parentname;
> + $info->{$parentname}->{child} = $snapname;
> + }
> + $order++;
> + }
> + return $info;
> }
>
> sub activate_storage {
> @@ -1867,4 +2003,22 @@ sub config_aware_base_mkdir {
> }
> }
>
> +sub get_snap_volname {
> + my ($class, $volname, $snapname) = @_;
> +
> + my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) = $class->parse_volname($volname);
> + $name = !$snapname || $snapname eq 'current' ? $volname : "$vmid/snap-$snapname-$name";
other way round would be better to group by volume first IMHO ($vmid/snap-$name-$snapname), as this is similar to how we encode snapshots often on the storage level (volume at snap). we also need to have some delimiter between snapshot and volume name that is not allowed in either (hard for volname since basically everything but '/' goes, but snapshots have a restricted character set (configid, which means alphanumeric, hyphen and underscore), so we could use something like '.' as delimiter? or we switch to directories and do $vmid/snap/$snap/$name?)
> + return $name;
> +}
> +
> +sub parse_snapname {
> + my ($name) = @_;
> +
> + my $basename = basename($name);
> + if ($basename =~ m/^snap-(.*)-vm(.*)$/) {
this is not strict enough, see above
> + return $1;
> + }
> + return undef;
> +}
> +
> 1;
> --
> 2.39.5
More information about the pve-devel
mailing list