[pve-devel] [PATCH v3 qemu-server 01/11] blockdev: cmdline: convert drive to blockdev syntax
Fabian Grünbichler
f.gruenbichler at proxmox.com
Wed Jan 8 15:17:23 CET 2025
> Alexandre Derumier via pve-devel <pve-devel at lists.proxmox.com> hat am 16.12.2024 10:12 CET geschrieben:
> The blockdev chain is:
> -throttle-group-node (drive-(ide|scsi|virtio)x)
> - format-node (fmt-drive-x)
> - file-node (file-drive -x)
>
> fixme: implement iscsi:// path
> Signed-off-by: Alexandre Derumier <alexandre.derumier at groupe-cyllene.com>
> ---
> PVE/QemuServer.pm | 351 +++++++++++++++++++++++++++++++---------------
> 1 file changed, 237 insertions(+), 114 deletions(-)
>
> diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
> index 8192599a..2832ed09 100644
> --- a/PVE/QemuServer.pm
> +++ b/PVE/QemuServer.pm
> @@ -1464,7 +1464,8 @@ sub print_drivedevice_full {
> } else {
> $device .= ",bus=ahci$controller.$unit";
> }
> - $device .= ",drive=drive-$drive_id,id=$drive_id";
> + $device .= ",id=$drive_id";
> + $device .= ",drive=drive-$drive_id" if $device_type ne 'cd' || $drive->{file} ne 'none';
is this just because you remove the whole drive when ejecting? not sure whether that is really needed..
>
> if ($device_type eq 'hd') {
> if (my $model = $drive->{model}) {
> @@ -1490,6 +1491,13 @@ sub print_drivedevice_full {
> $device .= ",serial=$serial";
> }
>
> + my $writecache = $drive->{cache} && $drive->{cache} =~ /^(?:none|writeback|unsafe)$/ ? "on" : "off";
> + $device .= ",write-cache=$writecache" if $drive->{media} && $drive->{media} ne 'cdrom';
> +
> + my @qemu_drive_options = qw(heads secs cyls trans rerror werror);
> + foreach my $o (@qemu_drive_options) {
> + $device .= ",$o=$drive->{$o}" if defined($drive->{$o});
> + }
>
> return $device;
> }
> @@ -1539,145 +1547,256 @@ my sub drive_uses_cache_direct {
> return $cache_direct;
> }
>
> -sub print_drive_commandline_full {
> - my ($storecfg, $vmid, $drive, $live_restore_name, $io_uring) = @_;
> +sub print_drive_throttle_group {
> + my ($drive) = @_;
> + #command line can't use the structured json limits option,
> + #so limit params need to use with x- as it's unstable api
this comment should be below the early return, or above the whole sub.
> + return if drive_is_cdrom($drive) && $drive->{file} eq 'none';
is this needed if we keep empty cdrom drives around like before? I know throttling practically makes no sense in that case, but it might make the code in general more simple?
>
> - my $path;
> - my $volid = $drive->{file};
> my $drive_id = get_drive_id($drive);
>
> + my $throttle_group = "throttle-group,id=throttle-drive-$drive_id";
> + foreach my $type (['', '-total'], [_rd => '-read'], [_wr => '-write']) {
> + my ($dir, $qmpname) = @$type;
> +
> + if (my $v = $drive->{"mbps$dir"}) {
> + $throttle_group .= ",x-bps$qmpname=".int($v*1024*1024);
> + }
> + if (my $v = $drive->{"mbps${dir}_max"}) {
> + $throttle_group .= ",x-bps$qmpname-max=".int($v*1024*1024);
> + }
> + if (my $v = $drive->{"bps${dir}_max_length"}) {
> + $throttle_group .= ",x-bps$qmpname-max-length=$v";
> + }
> + if (my $v = $drive->{"iops${dir}"}) {
> + $throttle_group .= ",x-iops$qmpname=$v";
> + }
> + if (my $v = $drive->{"iops${dir}_max"}) {
> + $throttle_group .= ",x-iops$qmpname-max=$v";
> + }
> + if (my $v = $drive->{"iops${dir}_max_length"}) {
> + $throttle_group .= ",x-iops$qmpname-max-length=$v";
> + }
> + }
> +
> + return $throttle_group;
> +}
> +
> +sub generate_file_blockdev {
> + my ($storecfg, $drive, $nodename) = @_;
> +
> + my $volid = $drive->{file};
> my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid, 1);
> - my $scfg = $storeid ? PVE::Storage::storage_config($storecfg, $storeid) : undef;
>
> - if (drive_is_cdrom($drive)) {
> - $path = get_iso_path($storecfg, $vmid, $volid);
> - die "$drive_id: cannot back cdrom drive with a live restore image\n" if $live_restore_name;
> + my $scfg = undef;
> + my $path = $volid;
I think this should only happen if the parse_volume_id above told us this is an absolute path and not a PVE-managed volume..
> + if($storeid && $storeid ne 'nbd') {
this is wrong.. I guess it's also somewhat wrong in the old qemu_drive_mirror code.. we should probably check using a more specific RE that the "volid" is an NBD URI, and not attempt to parse it as a regular volid in that case..
> + $scfg = PVE::Storage::storage_config($storecfg, $storeid);
> + $path = PVE::Storage::path($storecfg, $volid);
> + }
> +
> + my $blockdev = {};
> +
> + if ($path =~ m/^rbd:(\S+)$/) {
> +
> + $blockdev->{driver} = 'rbd';
> +
> + my @rbd_options = split(/:/, $1);
> + my $keyring = undef;
> + for my $option (@rbd_options) {
> + if ($option =~ m/^(\S+)=(\S+)$/) {
> + my $key = $1;
> + my $value = $2;
> + $blockdev->{'auth-client-required'} = [$value] if $key eq 'auth_supported';
> + $blockdev->{'conf'} = $value if $key eq 'conf';
> + $blockdev->{'user'} = $value if $key eq 'id';
> + $keyring = $value if $key eq 'keyring';
> + if ($key eq 'mon_host') {
> + my $server = [];
> + my @mons = split(';', $value);
> + for my $mon (@mons) {
> + my ($host, $port) = PVE::Tools::parse_host_and_port($mon);
> + $port = '3300' if !$port;
> + push @$server, { host => $host, port => $port };
> + }
> + $blockdev->{server} = $server;
> + }
> + } elsif ($option =~ m|^(\S+)/(\S+)$|){
> + $blockdev->{pool} = $1;
> + my $image = $2;
> +
> + if($image =~ m|^(\S+)/(\S+)$|) {
> + $blockdev->{namespace} = $1;
> + $blockdev->{image} = $2;
> + } else {
> + $blockdev->{image} = $image;
> + }
> + }
> + }
> +
> + if($keyring && $blockdev->{server}) {
> + #qemu devs are removed passing arbitrary values to blockdev object, and don't have added
> + #keyring to the list of allowed keys. It need to be defined in the store ceph.conf.
> + #https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg02676.html
> + #another way could be to simply patch qemu to allow the key
I think we either want to allow the keys we need in Qemu (and upstream that), or we want to write the config out to a temporary config and clean that up after Qemu has read its contents..
> + my $ceph_conf = "/etc/pve/priv/ceph/${storeid}.conf";
this file is already taken for external Ceph clusters, we can't just re-use for this purpose without a lot of side effects I think..
> + $blockdev->{conf} = $ceph_conf;
> + if (!-e $ceph_conf) {
> + my $content = "[global]\nkeyring = $keyring\n";
> + PVE::Tools::file_set_contents($ceph_conf, $content, 0400);
> + }
> + }
> + } elsif ($path =~ m/^nbd:(\S+):(\d+):exportname=(\S+)$/) {
> + my $server = { type => 'inet', host => $1, port => $2 };
> + $blockdev = { driver => 'nbd', server => $server, export => $3 };
> + } elsif ($path =~ m/^nbd:unix:(\S+):exportname=(\S+)$/) {
> + my $server = { type => 'unix', path => $1 };
> + $blockdev = { driver => 'nbd', server => $server, export => $2 };
> + } elsif ($path =~ m|^gluster(\+(tcp\|unix\|rdma))?://(.*)/(.*)/(images/(\S+)/(\S+))$|) {
> + my $protocol = $2 ? $2 : 'inet';
> + $protocol = 'inet' if $protocol eq 'tcp';
> + my $server = [{ type => $protocol, host => $3, port => '24007' }];
> + $blockdev = { driver => 'gluster', server => $server, volume => $4, path => $5 };
> + } elsif ($path =~ m/^\/dev/) {
> + my $driver = drive_is_cdrom($drive) ? 'host_cdrom' : 'host_device';
> + $blockdev = { driver => $driver, filename => $path };
> + } elsif ($path =~ m/^\//) {
> + $blockdev = { driver => 'file', filename => $path};
> } else {
> - if ($storeid) {
> - $path = PVE::Storage::path($storecfg, $volid);
> - } else {
> - $path = $volid;
> + die "unsupported path: $path\n";
> + #fixme
> + #'{"driver":"iscsi","portal":"iscsi.example.com:3260","target":"demo-target","lun":3,"transport":"tcp"}'
> + }
> +
> + my $cache_direct = drive_uses_cache_direct($drive, $scfg);
> + my $cache = {};
> + $cache->{direct} = $cache_direct ? JSON::true : JSON::false;
> + $cache->{'no-flush'} = $drive->{cache} && $drive->{cache} eq 'unsafe' ? JSON::true : JSON::false;
> + $blockdev->{cache} = $cache;
> +
> + ##aio
> + if($blockdev->{filename}) {
> + $drive->{aio} = 'threads' if drive_is_cdrom($drive);
> + my $aio = $drive->{aio};
> + if (!$aio) {
> + if (storage_allows_io_uring_default($scfg, $cache_direct)) {
> + # io_uring supports all cache modes
> + $aio = "io_uring";
> + } else {
> + # aio native works only with O_DIRECT
> + if($cache_direct) {
> + $aio = "native";
> + } else {
> + $aio = "threads";
> + }
> + }
> }
> + $blockdev->{aio} = $aio;
> }
>
> - # For PVE-managed volumes, use the format from the storage layer and prevent overrides via the
> - # drive's 'format' option. For unmanaged volumes, fallback to 'raw' to avoid auto-detection by
> - # QEMU. For the special case 'none' (get_iso_path() returns an empty $path), there should be no
> - # format or QEMU won't start.
> - my $format;
> - if (drive_is_cdrom($drive) && !$path) {
> - # no format
> - } elsif ($storeid) {
> - $format = checked_volume_format($storecfg, $volid);
> + ##discard && detect-zeroes
> + my $discard = 'ignore';
> + if($drive->{discard}) {
> + $discard = $drive->{discard};
> + $discard = 'unmap' if $discard eq 'on';
> + }
> + $blockdev->{discard} = $discard if !drive_is_cdrom($drive);
>
> - if ($drive->{format} && $drive->{format} ne $format) {
> - die "drive '$drive->{interface}$drive->{index}' - volume '$volid'"
> - ." - 'format=$drive->{format}' option different from storage format '$format'\n";
> - }
> + my $detectzeroes;
nit: detect_zeroes
> + if (defined($drive->{detect_zeroes}) && !$drive->{detect_zeroes}) {
> + $detectzeroes = 'off';
> + } elsif ($drive->{discard}) {
> + $detectzeroes = $drive->{discard} eq 'on' ? 'unmap' : 'on';
> } else {
> - $format = $drive->{format} // 'raw';
> + # This used to be our default with discard not being specified:
> + $detectzeroes = 'on';
> }
> + $blockdev->{'detect-zeroes'} = $detectzeroes if !drive_is_cdrom($drive);
> + $blockdev->{'node-name'} = $nodename if $nodename;
this last line could be a lot higher up?
>
> - my $is_rbd = $path =~ m/^rbd:/;
> + return $blockdev;
> +}
>
> - my $opts = '';
> - my @qemu_drive_options = qw(heads secs cyls trans media cache rerror werror aio discard);
> - foreach my $o (@qemu_drive_options) {
> - $opts .= ",$o=$drive->{$o}" if defined($drive->{$o});
> - }
> +sub generate_format_blockdev {
> + my ($storecfg, $drive, $nodename, $file, $force_readonly) = @_;
>
> - # snapshot only accepts on|off
> - if (defined($drive->{snapshot})) {
> - my $v = $drive->{snapshot} ? 'on' : 'off';
> - $opts .= ",snapshot=$v";
> - }
> + my $volid = $drive->{file};
> + my $scfg = undef;
> + my $path = $volid;
path is not used at all, other than being conditionally overwritten below..
> + my $format = $drive->{format};
> + $format //= "raw";
the format handling here is very sensitive, and I think this broke it. see the big comment this patch removed ;)
short summary: for PVE-managed volumes we want the format from the storage layer (via checked_volume_format). if the drive has a format set that disagrees, that is a hard error. for absolute paths we us the format from the drive with a fallback to raw.
>
> - if (defined($drive->{ro})) { # ro maps to QEMUs `readonly`, which accepts `on` or `off` only
> - $opts .= ",readonly=" . ($drive->{ro} ? 'on' : 'off');
> - }
> + my $drive_id = get_drive_id($drive);
>
> - foreach my $type (['', '-total'], [_rd => '-read'], [_wr => '-write']) {
> - my ($dir, $qmpname) = @$type;
> - if (my $v = $drive->{"mbps$dir"}) {
> - $opts .= ",throttling.bps$qmpname=".int($v*1024*1024);
> - }
> - if (my $v = $drive->{"mbps${dir}_max"}) {
> - $opts .= ",throttling.bps$qmpname-max=".int($v*1024*1024);
> - }
> - if (my $v = $drive->{"bps${dir}_max_length"}) {
> - $opts .= ",throttling.bps$qmpname-max-length=$v";
> - }
> - if (my $v = $drive->{"iops${dir}"}) {
> - $opts .= ",throttling.iops$qmpname=$v";
> - }
> - if (my $v = $drive->{"iops${dir}_max"}) {
> - $opts .= ",throttling.iops$qmpname-max=$v";
> - }
> - if (my $v = $drive->{"iops${dir}_max_length"}) {
> - $opts .= ",throttling.iops$qmpname-max-length=$v";
> - }
> + if ($drive->{zeroinit}) {
> + #fixme how to handle zeroinit ? insert special blockdev filter ?
> }
>
> - if ($live_restore_name) {
> - $format = "rbd" if $is_rbd;
> - die "$drive_id: Proxmox Backup Server backed drive cannot auto-detect the format\n"
> - if !$format;
> - $opts .= ",format=alloc-track,file.driver=$format";
> - } elsif ($format) {
> - $opts .= ",format=$format";
> + my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid, 1);
so I guess this should never be called with nbd-URI-volids?
nit: $volname is not used anywhere, so can be removed..
> +
> + if($storeid) {
> + $scfg = PVE::Storage::storage_config($storecfg, $storeid);
> + $format = checked_volume_format($storecfg, $volid);
this is missing the comparison against $drive->{format}
> + $path = PVE::Storage::path($storecfg, $volid);
this is not used anywhere..
> }
>
> + my $readonly = defined($drive->{ro}) || $force_readonly ? JSON::true : JSON::false;
> +
> + #libvirt define cache option on both format && file
> my $cache_direct = drive_uses_cache_direct($drive, $scfg);
> + my $cache = {};
> + $cache->{direct} = $cache_direct ? JSON::true : JSON::false;
> + $cache->{'no-flush'} = $drive->{cache} && $drive->{cache} eq 'unsafe' ? JSON::true : JSON::false;
so we have the same code in two places? should probably be a helper then to not have them go out of sync..
>
> - $opts .= ",cache=none" if !$drive->{cache} && $cache_direct;
> + my $blockdev = { driver => $format, file => $file, cache => $cache, 'read-only' => $readonly };
> + $blockdev->{'node-name'} = $nodename if $nodename;
>
> - if (!$drive->{aio}) {
> - if ($io_uring && storage_allows_io_uring_default($scfg, $cache_direct)) {
> - # io_uring supports all cache modes
> - $opts .= ",aio=io_uring";
> - } else {
> - # aio native works only with O_DIRECT
> - if($cache_direct) {
> - $opts .= ",aio=native";
> - } else {
> - $opts .= ",aio=threads";
> - }
> - }
> - }
> + return $blockdev;
>
> - if (!drive_is_cdrom($drive)) {
> - my $detectzeroes;
> - if (defined($drive->{detect_zeroes}) && !$drive->{detect_zeroes}) {
> - $detectzeroes = 'off';
> - } elsif ($drive->{discard}) {
> - $detectzeroes = $drive->{discard} eq 'on' ? 'unmap' : 'on';
> - } else {
> - # This used to be our default with discard not being specified:
> - $detectzeroes = 'on';
> - }
> +}
>
> - # note: 'detect-zeroes' works per blockdev and we want it to persist
> - # after the alloc-track is removed, so put it on 'file' directly
> - my $dz_param = $live_restore_name ? "file.detect-zeroes" : "detect-zeroes";
> - $opts .= ",$dz_param=$detectzeroes" if $detectzeroes;
> - }
> +sub generate_drive_blockdev {
> + my ($storecfg, $vmid, $drive, $force_readonly, $live_restore_name) = @_;
>
> - if ($live_restore_name) {
> - $opts .= ",backing=$live_restore_name";
> - $opts .= ",auto-remove=on";
> + my $path;
> + my $volid = $drive->{file};
> + my $format = $drive->{format};
this is only used once below
> + my $drive_id = get_drive_id($drive);
> +
> + my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid, 1);
> + my $scfg = $storeid ? PVE::Storage::storage_config($storecfg, $storeid) : undef;
> +
> + my $blockdevs = [];
> +
> + if (drive_is_cdrom($drive)) {
> + die "$drive_id: cannot back cdrom drive with a live restore image\n" if $live_restore_name;
> +
> + $path = get_iso_path($storecfg, $vmid, $volid);
> + return if !$path;
> + $force_readonly = 1;
> }
>
> - # my $file_param = $live_restore_name ? "file.file.filename" : "file";
> - my $file_param = "file";
> + my $file_nodename = "file-drive-$drive_id";
> + my $blockdev_file = generate_file_blockdev($storecfg, $drive, $file_nodename);
> + my $fmt_nodename = "fmt-drive-$drive_id";
> + my $blockdev_format = generate_format_blockdev($storecfg, $drive, $fmt_nodename, $blockdev_file, $force_readonly);
> +
> + my $blockdev_live_restore = undef;
> if ($live_restore_name) {
> - # non-rbd drivers require the underlying file to be a separate block
> - # node, so add a second .file indirection
> - $file_param .= ".file" if !$is_rbd;
> - $file_param .= ".filename";
> + die "$drive_id: Proxmox Backup Server backed drive cannot auto-detect the format\n"
> + if !$format;
for this check, but it is not actually set anywhere here.. so is something missing or can the check go?
> +
> + $blockdev_live_restore = { 'node-name' => "liverestore-drive-$drive_id",
> + backing => $live_restore_name,
> + 'auto-remove' => 'on', format => "alloc-track",
> + file => $blockdev_format };
> }
> - my $pathinfo = $path ? "$file_param=$path," : '';
>
> - return "${pathinfo}if=none,id=drive-$drive->{interface}$drive->{index}$opts";
> + #this is the topfilter entry point, use $drive-drive_id as nodename
> + my $blockdev_throttle = { driver => "throttle", 'node-name' => "drive-$drive_id", 'throttle-group' => "throttle-drive-$drive_id" };
> + #put liverestore filter between throttle && format filter
> + $blockdev_throttle->{file} = $live_restore_name ? $blockdev_live_restore : $blockdev_format;
> + return $blockdev_throttle,
> }
>
> sub print_pbs_blockdev {
> @@ -4091,13 +4210,13 @@ sub config_to_command {
> push @$devices, '-blockdev', $live_restore->{blockdev};
> }
>
> - my $drive_cmd = print_drive_commandline_full(
> - $storecfg, $vmid, $drive, $live_blockdev_name, min_version($kvmver, 6, 0));
> -
> - # extra protection for templates, but SATA and IDE don't support it..
> - $drive_cmd .= ',readonly=on' if drive_is_read_only($conf, $drive);
> + my $throttle_group = print_drive_throttle_group($drive);
> + push @$devices, '-object', $throttle_group if $throttle_group;
>
> - push @$devices, '-drive',$drive_cmd;
> +# # extra protection for templates, but SATA and IDE don't support it..
> + my $force_readonly = drive_is_read_only($conf, $drive);
> + my $blockdev = generate_drive_blockdev($storecfg, $vmid, $drive, $force_readonly, $live_blockdev_name);
> + push @$devices, '-blockdev', encode_json_ordered($blockdev) if $blockdev;
> push @$devices, '-device', print_drivedevice_full(
> $storecfg, $conf, $vmid, $drive, $bridges, $arch, $machine_type);
> });
> @@ -8986,4 +9105,8 @@ sub delete_ifaces_ipams_ips {
> }
> }
>
> +sub encode_json_ordered {
> + return JSON->new->canonical->allow_nonref->encode( $_[0] );
> +}
this is only used in a single place..
> +
> 1;
> --
> 2.39.5
More information about the pve-devel
mailing list