[pve-devel] [PATCH pve-storage 3/5] lvmplugin: add qcow2 snapshot
Fabian Grünbichler
f.gruenbichler at proxmox.com
Fri May 9 12:30:02 CEST 2025
> Alexandre Derumier via pve-devel <pve-devel at lists.proxmox.com> hat am 22.04.2025 13:51 CEST geschrieben:
> we format lvm logical volume with qcow2 to handle snapshot chain.
>
> like for qcow2 file, when a snapshot is taken, the current lvm volume
> is renamed to snap volname, and a new current lvm volume is created
> with the snap volname as backing file
>
> Signed-off-by: Alexandre Derumier <alexandre.derumier at groupe-cyllene.com>
> ---
> src/PVE/Storage/LVMPlugin.pm | 301 ++++++++++++++++++++++++++++++++---
> 1 file changed, 278 insertions(+), 23 deletions(-)
>
> diff --git a/src/PVE/Storage/LVMPlugin.pm b/src/PVE/Storage/LVMPlugin.pm
> index c4648ec..8ee337a 100644
> --- a/src/PVE/Storage/LVMPlugin.pm
> +++ b/src/PVE/Storage/LVMPlugin.pm
> @@ -4,6 +4,7 @@ use strict;
> use warnings;
>
> use IO::File;
> +use POSIX qw/ceil/;
>
> use PVE::Tools qw(run_command trim);
> use PVE::Storage::Plugin;
> @@ -218,6 +219,7 @@ sub type {
> sub plugindata {
> return {
> content => [ {images => 1, rootdir => 1}, { images => 1 }],
> + format => [ { raw => 1, qcow2 => 1 } , 'raw' ],
> 'sensitive-properties' => {},
> };
> }
> @@ -294,7 +296,10 @@ sub parse_volname {
> PVE::Storage::Plugin::parse_lvm_name($volname);
>
> if ($volname =~ m/^(vm-(\d+)-\S+)$/) {
> - return ('images', $1, $2, undef, undef, undef, 'raw');
> + my $name = $1;
> + my $vmid = $2;
> + my $format = $volname =~ m/\.qcow2$/ ? 'qcow2' : 'raw';
> + return ('images', $name, $vmid, undef, undef, undef, $format);
> }
>
> die "unable to parse lvm volume name '$volname'\n";
> @@ -303,11 +308,13 @@ sub parse_volname {
> sub filesystem_path {
> my ($class, $scfg, $volname, $snapname) = @_;
>
> - die "lvm snapshot is not implemented"if defined($snapname);
> + my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) =
> + $class->parse_volname($volname);
>
> - my ($vtype, $name, $vmid) = $class->parse_volname($volname);
> + die "snapshot is working with qcow2 format only" if defined($snapname) && $format ne 'qcow2';
>
> my $vg = $scfg->{vgname};
> + $name = $class->get_snap_name($volname, $snapname) if $snapname;
>
> my $path = "/dev/$vg/$name";
>
> @@ -335,7 +342,9 @@ sub find_free_diskname {
>
> my $disk_list = [ keys %{$lvs->{$vg}} ];
>
> - return PVE::Storage::Plugin::get_next_vm_diskname($disk_list, $storeid, $vmid, undef, $scfg);
> + $add_fmt_suffix = $fmt eq 'qcow2' ? 1 : undef;
> +
> + return PVE::Storage::Plugin::get_next_vm_diskname($disk_list, $storeid, $vmid, $fmt, $scfg, $add_fmt_suffix);
> }
>
> sub lvcreate {
> @@ -363,13 +372,43 @@ sub lvrename {
> );
> }
>
> -sub alloc_image {
> - my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size) = @_;
> +my sub lvm_qcow2_format {
> + my ($class, $storeid, $scfg, $name, $fmt, $backing_snap, $size) = @_;
> +
> + return if $fmt ne 'qcow2';
> +
> + $class->activate_volume($storeid, $scfg, $name);
> + my $path = $class->path($scfg, $name, $storeid);
> + my $backing_path = $class->path($scfg, $name, $storeid, $backing_snap) if $backing_snap;
> + PVE::Storage::Plugin::qemu_img_create($scfg, 'qcow2', $size, $path, $backing_path);
>
> - die "unsupported format '$fmt'" if $fmt ne 'raw';
> +}
> +
> +my sub lvm_size {
that's a bit of a misnomer as well, maybe 'calculate_lv_size' ?
> + my ($size, $fmt, $backing_snap) = @_;
> +
> + #add extra space for qcow2 metadatas for initial image
> + #if backing_snap exist, the parent lvm volume already have the overhead
are you sure about that? the 'current' volume also has to have the same overhead if you write
the full logical size to it, or am I missing something?
e.g., while doing the same with files:
1. dd 1G of random data into raw file
2. create 1G qcow2 file
3. qemu-img dd 1G of random data into qcow2 file
4. observe that raw file is 1073741824 bytes big, qcow2 file is 1074266112 bytes big (512K difference)
5. create second 1G qcow2 file backed by first
6. qemu-img dd 1G of random data into second qcow2 file
7. observe second qcow2 file is also bigger than 1G (by 384K)
and this is without extended l2 and big clusters; with those, the qcow2 file is slightly bigger
> + return $size if $fmt ne 'qcow2' || $backing_snap;
> +
> + #without sub-allocated clusters : l2_size = disk_size × 8 / cluster_size
> + #with sub-allocated clusters : l2_size = disk_size × 8 / cluster_size / 16
> + #ex: 4MB overhead for 1TB with extented l2 clustersize=128k
> + #can't use qemu-img measure, because it's not possible to define options like clustersize && extended_l2
> + #verification has been done with : qemu-img create -f qcow2 -o extended_l2=on,cluster_size=128k test.img 1G
> +
> + my $qcow2_overhead = ceil($size/1024/1024/1024) * 4096;
> + $size += $qcow2_overhead;
> + return $size;
> +}
> +
> +my sub alloc_lvm_image {
here as well - this could be first extracted without semantic changes, and
then have the semantic changes in this commit..
> + my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size, $backing_snap) = @_;
> +
> + die "unsupported format '$fmt'" if $fmt !~ m/(raw|qcow2)/;
>
> die "illegal name '$name' - should be 'vm-$vmid-*'\n"
> - if $name && $name !~ m/^vm-$vmid-/;
> + if $name !~ m/^vm-$vmid-/;
>
> my $vgs = lvm_vgs();
>
> @@ -378,17 +417,51 @@ sub alloc_image {
> die "no such volume group '$vg'\n" if !defined ($vgs->{$vg});
>
> my $free = int($vgs->{$vg}->{free});
> + my $lvmsize = lvm_size($size, $fmt, $backing_snap);
>
> die "not enough free space ($free < $size)\n" if $free < $size;
>
> - $name = $class->find_free_diskname($storeid, $scfg, $vmid)
> + my $tags = ["pve-vm-$vmid"];
> + #tags all snapshots volumes with the main volume tag for easier activation of the whole group
> + push @$tags, "\@pve-$name" if $fmt eq 'qcow2';
> + lvcreate($vg, $name, $lvmsize, $tags);
> +
> + #format the lvm volume with qcow2 format
> + eval { lvm_qcow2_format($class, $storeid, $scfg, $name, $fmt, $backing_snap, $size) };
> + if ($@) {
> + my $err = $@;
> + #no need to safe cleanup as the volume is still empty
> + eval {
> + my $cmd = ['/sbin/lvremove', '-f', "$vg/$name"];
> + run_command($cmd, errmsg => "lvremove '$vg/$name' error");
> + };
should still log any errors encountered by the cleanup - the inner eval currently swallows them silently..
> + die $err;
> + }
> +
> +}
> +
> +sub alloc_image {
> + my ($class, $storeid, $scfg, $vmid, $fmt, $name, $size) = @_;
> +
> + $name = $class->find_free_diskname($storeid, $scfg, $vmid, $fmt)
> if !$name;
>
> - lvcreate($vg, $name, $size, ["pve-vm-$vmid"]);
> + alloc_lvm_image($class, $storeid, $scfg, $vmid, $fmt, $name, $size);
>
> return $name;
> }
>
> +sub alloc_snap_image {
should be private
> + my ($class, $storeid, $scfg, $volname, $backing_snap) = @_;
> +
> + my $size = $class->volume_size_info($scfg, $storeid, $volname, 5, $backing_snap);
> + $size = $size / 1024; #we use kb in lvcreate
what if something regarding the size calculation changed in the meantime,
shouldn't this always start from the "logical" size or else we risk
accidentally creating a too small volume?
> +
> + my ($vmid, $format) = ($class->parse_volname($volname))[2,6];
we've already done this at both call sites, so we could just pass those in
and/or inline this helper?
> +
> + alloc_lvm_image($class, $storeid, $scfg, $vmid, $format, $volname, $size, $backing_snap);
> +}
> +
> sub free_image {
> my ($class, $storeid, $scfg, $volname, $isBase) = @_;
>
> @@ -539,6 +612,12 @@ sub activate_volume {
>
> my $lvm_activate_mode = 'ey';
>
> + #activate volume && all snapshots volumes by tag
> + my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) =
> + $class->parse_volname($volname);
> +
> + $path = "\@pve-$name" if $format eq 'qcow2';
> +
> my $cmd = ['/sbin/lvchange', "-a$lvm_activate_mode", $path];
> run_command($cmd, errmsg => "can't activate LV '$path'");
> $cmd = ['/sbin/lvchange', '--refresh', $path];
> @@ -551,6 +630,10 @@ sub deactivate_volume {
> my $path = $class->path($scfg, $volname, $storeid, $snapname);
> return if ! -b $path;
>
> + my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) =
> + $class->parse_volname($volname);
> + $path = "\@pve-$name" if $format eq 'qcow2';
> +
> my $cmd = ['/sbin/lvchange', '-aln', $path];
> run_command($cmd, errmsg => "can't deactivate LV '$path'");
> }
> @@ -558,21 +641,31 @@ sub deactivate_volume {
> sub volume_resize {
> my ($class, $scfg, $storeid, $volname, $size, $running) = @_;
>
> - $size = ($size/1024/1024) . "M";
> + my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) =
> + $class->parse_volname($volname);
> +
> + my $lvmsize = lvm_size($size/1024, $format);
same question here - what if the overhead calculation changes at some point?
smaller to bigger doesn't matter, that's okay, but what if the existing
volume had a higher overhead than we now calculate? then resizing would
actually end up with a too small target LV size?
are we not allowed to ever do that? then we need a big warning up top
at the size calculation helper so that we don't forget about this invariant..
> + $lvmsize = "${lvmsize}k";
>
> my $path = $class->path($scfg, $volname);
> - my $cmd = ['/sbin/lvextend', '-L', $size, $path];
> + my $cmd = ['/sbin/lvextend', '-L', $lvmsize, $path];
>
> $class->cluster_lock_storage($storeid, $scfg->{shared}, undef, sub {
> run_command($cmd, errmsg => "error resizing volume '$path'");
> });
>
> + if(!$running && $format eq 'qcow2') {
> + my $prealloc_opt = PVE::Storage::Plugin::preallocation_cmd_option($scfg, $format);
> + my $cmd = ['/usr/bin/qemu-img', 'resize', "--$prealloc_opt", '-f', $format, $path , $size];
the regular qemu-img resize in Plugin.pm doesn't pass the prealloc options..
also, if no preallocation option is set, wouldn't "--$prealloc_opt" end up passing a bare `--` to qemu-img?
> + run_command($cmd, timeout => 10);
> + }
> +
> return 1;
> }
>
> sub volume_size_info {
> - my ($class, $scfg, $storeid, $volname, $timeout) = @_;
> - my $path = $class->filesystem_path($scfg, $volname);
> + my ($class, $scfg, $storeid, $volname, $timeout, $snap) = @_;
> + my $path = $class->filesystem_path($scfg, $volname, $snap);
>
> my $cmd = ['/sbin/lvs', '--separator', ':', '--noheadings', '--units', 'b',
> '--unbuffered', '--nosuffix', '--options', 'lv_size', $path];
> @@ -586,32 +679,180 @@ sub volume_size_info {
> }
>
> sub volume_snapshot {
> - my ($class, $scfg, $storeid, $volname, $snap) = @_;
> + my ($class, $scfg, $storeid, $volname, $snap, $running) = @_;
> +
> + my ($vmid, $format) = ($class->parse_volname($volname))[2,6];
> +
> + die "can't snapshot this image format\n" if $format ne 'qcow2';
> +
> + if ($running) {
> + #rename with blockdev-reopen is done at qemu level when running
> + $class->alloc_snap_image($storeid, $scfg, $volname, $snap);
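missing eval? the call above is not wrapped in `eval { ... }`, so if it dies the `$@` check below is never reached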
> + if ($@) {
> + die "can't allocate new volume $volname: $@\n";
> + }
> + return;
> + }
> +
> + #rename current volume to snap volume
> + eval { $class->rename_volume($scfg, $storeid, $volname, $vmid, undef, 'current', $snap) };
> + die "error rename $volname to $snap\n" if $@;
> +
> + eval { $class->alloc_snap_image($storeid, $scfg, $volname, $snap) };
> + if ($@) {
> + my $err = $@;
> + eval { $class->rename_volume($scfg, $storeid, $volname, $vmid, undef, $snap, 'current') };
should log if renaming back failed - the eval around the rollback rename discards the error..
> + die $err;
> + }
> +}
> +
> +sub volume_rollback_is_possible {
> + my ($class, $scfg, $storeid, $volname, $snap, $blockers) = @_;
> +
> + my $snap_path = $class->path($scfg, $volname, $storeid, $snap);
>
> - die "lvm snapshot is not implemented";
> + $class->activate_volume($storeid, $scfg, $volname, undef, {});
> + my $snapshots = $class->volume_snapshot_info($scfg, $storeid, $volname);
> + my $parent_snap = $snapshots->{current}->{parent};
> +
> + return 1 if $parent_snap eq $snap;
> + die "can't rollback, '$snap' is not most recent snapshot on '$volname'\n";
> +
> + return 1;
this return is dead code, the die right above it is unconditional..
> }
>
> +
> sub volume_snapshot_rollback {
> my ($class, $scfg, $storeid, $volname, $snap) = @_;
>
> - die "lvm snapshot rollback is not implemented";
> + my $format = ($class->parse_volname($volname))[6];
> +
> + die "can't rollback snapshot for this image format\n" if $format ne 'qcow2';
> +
> + $class->activate_volume($storeid, $scfg, $volname, undef, {});
> +
> + # we can simply reformat the current lvm volume to avoid
> + # a long safe remove.(not needed here, as the allocated space
> + # is still the same owner)
> + eval { lvm_qcow2_format($class, $storeid, $scfg, $volname, $format, $snap) };
should we also follow this approach for qcow2 files?
> + if($@) {
> + die "can't rollback. Error reformating current $volname\n";
> + }
> + return undef;
> }
>
> sub volume_snapshot_delete {
> - my ($class, $scfg, $storeid, $volname, $snap) = @_;
> + my ($class, $scfg, $storeid, $volname, $snap, $running) = @_;
> +
> + my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) = $class->parse_volname($volname);
> +
> + die "can't delete snapshot for this image format\n" if $format ne 'qcow2';
> +
> + if ($running) {
> + $volname = $class->get_snap_volname($volname, $snap);
> + my $cleanup_worker = eval { $class->free_image($storeid, $scfg, $volname, $isBase, $format) };
> + die "error deleting snapshot $snap\n" if $@;
> +
> + if ($cleanup_worker) {
> + my $rpcenv = PVE::RPCEnvironment::get();
> + my $authuser = $rpcenv->get_user();
> + $rpcenv->fork_worker('imgdel', undef, $authuser, $cleanup_worker);
> + }
> + return;
> + }
>
> - die "lvm snapshot delete is not implemented";
> + my $cmd = "";
> + my $path = $class->filesystem_path($scfg, $volname);
> +
> + my $snapshots = $class->volume_snapshot_info($scfg, $storeid, $volname);
> + my $snappath = $snapshots->{$snap}->{file};
> + my $snapvolname = $snapshots->{$snap}->{volname};
> + die "volume $snappath is missing" if !-e $snappath;
> +
> + my $parentsnap = $snapshots->{$snap}->{parent};
> +
> + my $childsnap = $snapshots->{$snap}->{child};
> + my $childpath = $snapshots->{$childsnap}->{file};
> + my $childvolname = $snapshots->{$childsnap}->{volname};
> +
> + my $cleanup_worker = undef;
> + my $err = undef;
> + #if first snapshot,as it should be bigger, we merge child, and rename the snapshot to child
> + if(!$parentsnap) {
> + print "commit: merge content of $childpath into $snappath\n";
> + #can't use -d here, as it's an lvm volume
> + $cmd = ['/usr/bin/qemu-img', 'commit', $childpath];
> + eval { run_command($cmd) };
> + if ($@) {
> + die "error commiting $childpath to $snappath; $@\n";
> + }
> + print"delete $childvolname\n";
> +
> + $cleanup_worker = eval { $class->free_image($storeid, $scfg, $childvolname, 0) };
> + if ($@) {
> + die "error delete old snapshot volume $childvolname: $@\n";
> + }
> +
> + print"rename $snapvolname to $childvolname\n";
> + my $vg = $scfg->{vgname};
> + eval { lvrename($vg, $snapvolname, $childvolname) };
> + if ($@) {
> + warn $@;
> + $err = "error renaming snapshot: $@\n";
> + }
> +
> + } else {
> + #we rebase the child image on the parent as new backing image
> + my $parentpath = $snapshots->{$parentsnap}->{file};
> + print "rebase: merge diff content between $parentpath and $childpath into $childpath\n";
> + $cmd = ['/usr/bin/qemu-img', 'rebase', '-b', $parentpath, '-F', 'qcow2', '-f', 'qcow2', $childpath];
> + eval { run_command($cmd) };
> + if ($@) {
> + die "error rebase $childpath from $parentpath; $@\n";
> + }
> + #delete the snapshot
> + eval { $cleanup_worker = $class->free_image($storeid, $scfg, $snapvolname, 0); };
> + if ($@) {
> + die "error delete old snapshot volume $snapvolname\n";
> + }
> + }
> +
> + if ($cleanup_worker) {
> + my $rpcenv = PVE::RPCEnvironment::get();
> + my $authuser = $rpcenv->get_user();
> + $rpcenv->fork_worker('imgdel', undef, $authuser, $cleanup_worker);
> + }
> +
> + die $err if $err;
> }
>
> sub volume_has_feature {
> my ($class, $scfg, $feature, $storeid, $volname, $snapname, $running) = @_;
>
> my $features = {
> - copy => { base => 1, current => 1},
> - rename => {current => 1},
> + copy => {
> + base => { qcow2 => 1, raw => 1 },
> + current => { qcow2 => 1, raw => 1},
> + snap => { qcow2 => 1 },
> + },
> + 'rename' => {
> + current => { qcow2 => 1, raw => 1},
> + },
> + snapshot => {
> + current => { qcow2 => 1 },
> + snap => { qcow2 => 1 },
> + },
> +# fixme: add later ? (we need to handle basepath, volume activation,...)
> +# template => {
> +# current => { raw => 1, qcow2 => 1},
> +# },
> +# clone => {
> +# base => { qcow2 => 1 },
> +# },
> };
>
> - my ($vtype, $name, $vmid, $basename, $basevmid, $isBase) =
> +
> + my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) =
> $class->parse_volname($volname);
>
> my $key = undef;
> @@ -620,7 +861,7 @@ sub volume_has_feature {
> }else{
> $key = $isBase ? 'base' : 'current';
> }
> - return 1 if $features->{$feature}->{$key};
> + return 1 if defined($features->{$feature}->{$key}->{$format});
>
> return undef;
> }
> @@ -745,4 +986,18 @@ sub rename_volume {
> return "${storeid}:${target_volname}";
> }
>
> +sub get_snap_name {
> + my ($class, $volname, $snapname) = @_;
> +
> + my ($vtype, $name, $vmid, $basename, $basevmid, $isBase, $format) = $class->parse_volname($volname);
> + $name = !$snapname || $snapname eq 'current' ? $name : "snap-$snapname-$name";
> + return $name;
> +}
> +
> +sub get_snap_volname {
> + my ($class, $volname, $snapname) = @_;
> +
> + return $class->get_snap_name($volname, $snapname);
> +}
> +
> 1;
> --
> 2.39.5
More information about the pve-devel mailing list