[pve-devel] [PATCH v3 qemu-server 01/11] blockdev: cmdline: convert drive to blockdev syntax

Fabian Grünbichler f.gruenbichler at proxmox.com
Wed Jan 8 15:17:23 CET 2025


> Alexandre Derumier via pve-devel <pve-devel at lists.proxmox.com> hat am 16.12.2024 10:12 CET geschrieben:
> The blockdev chain is:
> -throttle-group-node (drive-(ide|scsi|virtio)x)
>     - format-node (fmt-drive-x)
>          - file-node (file-drive -x)
> 
> fixme: implement iscsi:// path
> Signed-off-by: Alexandre Derumier <alexandre.derumier at groupe-cyllene.com>
> ---
>  PVE/QemuServer.pm | 351 +++++++++++++++++++++++++++++++---------------
>  1 file changed, 237 insertions(+), 114 deletions(-)
> 
> diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
> index 8192599a..2832ed09 100644
> --- a/PVE/QemuServer.pm
> +++ b/PVE/QemuServer.pm
> @@ -1464,7 +1464,8 @@ sub print_drivedevice_full {
>  	} else {
>  	    $device .= ",bus=ahci$controller.$unit";
>  	}
> -	$device .= ",drive=drive-$drive_id,id=$drive_id";
> +	$device .= ",id=$drive_id";
> +	$device .= ",drive=drive-$drive_id" if $device_type ne 'cd' || $drive->{file} ne 'none';

is this just because you remove the whole drive when ejecting? not sure whether that is really needed..

>  
>  	if ($device_type eq 'hd') {
>  	    if (my $model = $drive->{model}) {
> @@ -1490,6 +1491,13 @@ sub print_drivedevice_full {
>  	$device .= ",serial=$serial";
>      }
>  
> +    my $writecache = $drive->{cache} && $drive->{cache} =~ /^(?:none|writeback|unsafe)$/  ? "on" : "off";
> +    $device .= ",write-cache=$writecache" if $drive->{media} && $drive->{media} ne 'cdrom';
> +
> +    my @qemu_drive_options = qw(heads secs cyls trans rerror werror);
> +    foreach my $o (@qemu_drive_options) {
> +       $device .= ",$o=$drive->{$o}" if defined($drive->{$o});
> +    }
>  
>      return $device;
>  }
> @@ -1539,145 +1547,256 @@ my sub drive_uses_cache_direct {
>      return $cache_direct;
>  }
>  
> -sub print_drive_commandline_full {
> -    my ($storecfg, $vmid, $drive, $live_restore_name, $io_uring) = @_;
> +sub print_drive_throttle_group {
> +    my ($drive) = @_;
> +    #command line can't use the structured json limits option,
> +    #so limit params need to use with x- as it's unstable api

this comment should be below the early return, or above the whole sub.

> +    return if drive_is_cdrom($drive) && $drive->{file} eq 'none';

is this needed if we keep empty cdrom drives around like before? I know throttling practically makes no sense in that case, but it might make the code in general more simple?
>  
> -    my $path;
> -    my $volid = $drive->{file};
>      my $drive_id = get_drive_id($drive);
>  
> +    my $throttle_group = "throttle-group,id=throttle-drive-$drive_id";
> +    foreach my $type (['', '-total'], [_rd => '-read'], [_wr => '-write']) {
> +        my ($dir, $qmpname) = @$type;
> +
> +        if (my $v = $drive->{"mbps$dir"}) {
> +            $throttle_group .= ",x-bps$qmpname=".int($v*1024*1024);
> +        }
> +        if (my $v = $drive->{"mbps${dir}_max"}) {
> +            $throttle_group .= ",x-bps$qmpname-max=".int($v*1024*1024);
> +        }
> +        if (my $v = $drive->{"bps${dir}_max_length"}) {
> +            $throttle_group .= ",x-bps$qmpname-max-length=$v";
> +        }
> +        if (my $v = $drive->{"iops${dir}"}) {
> +            $throttle_group .= ",x-iops$qmpname=$v";
> +        }
> +        if (my $v = $drive->{"iops${dir}_max"}) {
> +            $throttle_group .= ",x-iops$qmpname-max=$v";
> +        }
> +        if (my $v = $drive->{"iops${dir}_max_length"}) {
> +            $throttle_group .= ",x-iops$qmpname-max-length=$v";
> +        }
> +   }
> +
> +   return $throttle_group;
> +}
> +
> +sub generate_file_blockdev {
> +    my ($storecfg, $drive, $nodename) = @_;
> +
> +    my $volid = $drive->{file};
>      my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid, 1);
> -    my $scfg = $storeid ? PVE::Storage::storage_config($storecfg, $storeid) : undef;
>  
> -    if (drive_is_cdrom($drive)) {
> -	$path = get_iso_path($storecfg, $vmid, $volid);
> -	die "$drive_id: cannot back cdrom drive with a live restore image\n" if $live_restore_name;
> +    my $scfg = undef;
> +    my $path = $volid;

I think this should only happen if the parse_volume_id above told us this is an absolute path and not a PVE-managed volume..

> +    if($storeid && $storeid ne 'nbd') {

this is wrong.. I guess it's also somewhat wrong in the old qemu_drive_mirror code.. we should probably check using a more specific RE that the "volid" is an NBD URI, and not attempt to parse it as a regular volid in that case..

> +	$scfg = PVE::Storage::storage_config($storecfg, $storeid);
> +	$path = PVE::Storage::path($storecfg, $volid);
> +    }
> +
> +    my $blockdev = {};
> +
> +    if ($path =~ m/^rbd:(\S+)$/) {
> +
> +        $blockdev->{driver} = 'rbd';
> +
> +	my @rbd_options = split(/:/, $1);
> +	my $keyring = undef;
> +	for my $option (@rbd_options) {
> +	    if ($option =~ m/^(\S+)=(\S+)$/) {
> +		my $key = $1;
> +		my $value = $2;
> +		$blockdev->{'auth-client-required'} = [$value] if $key eq 'auth_supported';
> +		$blockdev->{'conf'} = $value if $key eq 'conf';
> +		$blockdev->{'user'} = $value if $key eq 'id';
> +		$keyring = $value if $key eq 'keyring';
> +	        if ($key eq 'mon_host') {
> +		    my $server = [];
> +		    my @mons = split(';', $value);
> +		    for my $mon (@mons) {
> +			my ($host, $port) = PVE::Tools::parse_host_and_port($mon);
> +			$port = '3300' if !$port;
> +			push @$server, { host => $host, port => $port };
> +		    }
> +		    $blockdev->{server} = $server;
> +		}
> +	    } elsif ($option =~ m|^(\S+)/(\S+)$|){
> +                $blockdev->{pool} = $1;
> +		my $image = $2;
> +
> +                if($image =~ m|^(\S+)/(\S+)$|) {
> +                    $blockdev->{namespace} = $1;
> +                    $blockdev->{image} = $2;
> +                } else {
> +                    $blockdev->{image} = $image;
> +                }
> +            }
> +	}
> +
> +	if($keyring && $blockdev->{server}) {
> +	    #qemu devs are removed passing arbitrary values to blockdev object, and don't have added
> +	    #keyring to the list of allowed keys. It need to be defined in the store ceph.conf.
> +	    #https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg02676.html
> +	    #another way could be to simply patch qemu to allow the key

I think we either want to allow the keys we need in Qemu (and upstream that), or we want to write the config out to a temporary config and clean that up after Qemu has read its contents..

> +	    my $ceph_conf = "/etc/pve/priv/ceph/${storeid}.conf";

this file is already taken for external Ceph clusters, we can't just re-use for this purpose without a lot of side effects I think..

> +	    $blockdev->{conf} = $ceph_conf;
> +	    if (!-e $ceph_conf) {
> +		my $content = "[global]\nkeyring = $keyring\n";
> +		PVE::Tools::file_set_contents($ceph_conf, $content, 0400);
> +	    }
> +	}
> +    } elsif ($path =~ m/^nbd:(\S+):(\d+):exportname=(\S+)$/) {
> +	my $server = { type => 'inet', host => $1, port => $2 };
> +	$blockdev = { driver => 'nbd', server => $server, export => $3 };
> +    } elsif ($path =~ m/^nbd:unix:(\S+):exportname=(\S+)$/) {
> +	my $server = { type => 'unix', path => $1 };
> +	$blockdev = { driver => 'nbd', server => $server, export => $2 };
> +    } elsif ($path =~ m|^gluster(\+(tcp\|unix\|rdma))?://(.*)/(.*)/(images/(\S+)/(\S+))$|) {
> +	my $protocol = $2 ? $2 : 'inet';
> +	$protocol = 'inet' if $protocol eq 'tcp';
> +	my $server = [{ type => $protocol, host => $3, port => '24007' }];
> +	$blockdev = { driver => 'gluster', server => $server, volume => $4, path => $5 };
> +    } elsif ($path =~ m/^\/dev/) {
> +	my $driver = drive_is_cdrom($drive) ? 'host_cdrom' : 'host_device';
> +	$blockdev = { driver => $driver, filename => $path };
> +    } elsif ($path =~ m/^\//) {
> +	$blockdev = { driver => 'file', filename => $path};
>      } else {
> -	if ($storeid) {
> -	    $path = PVE::Storage::path($storecfg, $volid);
> -	} else {
> -	    $path = $volid;
> +	die "unsupported path: $path\n";
> +	#fixme
> +	#'{"driver":"iscsi","portal":"iscsi.example.com:3260","target":"demo-target","lun":3,"transport":"tcp"}'
> +    }
> +
> +    my $cache_direct = drive_uses_cache_direct($drive, $scfg);
> +    my $cache = {};
> +    $cache->{direct} = $cache_direct ? JSON::true : JSON::false;
> +    $cache->{'no-flush'} = $drive->{cache} && $drive->{cache} eq 'unsafe' ? JSON::true : JSON::false;
> +    $blockdev->{cache} = $cache;
> +
> +    ##aio
> +    if($blockdev->{filename}) {
> +	$drive->{aio} = 'threads' if drive_is_cdrom($drive);
> +	my $aio = $drive->{aio};
> +	if (!$aio) {
> +	    if (storage_allows_io_uring_default($scfg, $cache_direct)) {
> +		# io_uring supports all cache modes
> +		$aio = "io_uring";
> +	    } else {
> +		# aio native works only with O_DIRECT
> +		if($cache_direct) {
> +		    $aio = "native";
> +		} else {
> +		    $aio = "threads";
> +		}
> +	    }
>  	}
> +	$blockdev->{aio} = $aio;
>      }
>  
> -    # For PVE-managed volumes, use the format from the storage layer and prevent overrides via the
> -    # drive's 'format' option. For unmanaged volumes, fallback to 'raw' to avoid auto-detection by
> -    # QEMU. For the special case 'none' (get_iso_path() returns an empty $path), there should be no
> -    # format or QEMU won't start.
> -    my $format;
> -    if (drive_is_cdrom($drive) && !$path) {
> -	# no format
> -    } elsif ($storeid) {
> -	$format = checked_volume_format($storecfg, $volid);
> +    ##discard && detect-zeroes
> +    my $discard = 'ignore';
> +    if($drive->{discard}) {
> +	$discard = $drive->{discard};
> +	$discard = 'unmap' if $discard eq 'on';
> +    }
> +    $blockdev->{discard} = $discard if !drive_is_cdrom($drive);
>  
> -	if ($drive->{format} && $drive->{format} ne $format) {
> -	    die "drive '$drive->{interface}$drive->{index}' - volume '$volid'"
> -		." - 'format=$drive->{format}' option different from storage format '$format'\n";
> -	}
> +    my $detectzeroes;

nit: detect_zeroes

> +    if (defined($drive->{detect_zeroes}) && !$drive->{detect_zeroes}) {
> +	$detectzeroes = 'off';
> +    } elsif ($drive->{discard}) {
> +	$detectzeroes = $drive->{discard} eq 'on' ? 'unmap' : 'on';
>      } else {
> -	$format = $drive->{format} // 'raw';
> +	# This used to be our default with discard not being specified:
> +	$detectzeroes = 'on';
>      }
> +    $blockdev->{'detect-zeroes'} = $detectzeroes if !drive_is_cdrom($drive);
> +    $blockdev->{'node-name'} = $nodename if $nodename;

this last line could be a lot higher up?

>  
> -    my $is_rbd = $path =~ m/^rbd:/;
> +    return $blockdev;
> +}
>  
> -    my $opts = '';
> -    my @qemu_drive_options = qw(heads secs cyls trans media cache rerror werror aio discard);
> -    foreach my $o (@qemu_drive_options) {
> -	$opts .= ",$o=$drive->{$o}" if defined($drive->{$o});
> -    }
> +sub generate_format_blockdev {
> +    my ($storecfg, $drive, $nodename, $file, $force_readonly) = @_;
>  
> -    # snapshot only accepts on|off
> -    if (defined($drive->{snapshot})) {
> -	my $v = $drive->{snapshot} ? 'on' : 'off';
> -	$opts .= ",snapshot=$v";
> -    }
> +    my $volid = $drive->{file};
> +    my $scfg = undef;
> +    my $path = $volid;

path is not used at all, other than being conditionally overwritten below..

> +    my $format = $drive->{format};
> +    $format //= "raw";

the format handling here is very sensitive, and I think this broke it. see the big comment this patch removed ;)

short summary: for PVE-managed volumes we want the format from the storage layer (via checked_volume_format). if the drive has a format set that disagrees, that is a hard error. for absolute paths we us the format from the drive with a fallback to raw.

>  
> -    if (defined($drive->{ro})) { # ro maps to QEMUs `readonly`, which accepts `on` or `off` only
> -	$opts .= ",readonly=" . ($drive->{ro} ? 'on' : 'off');
> -    }
> +    my $drive_id = get_drive_id($drive);
>  
> -    foreach my $type (['', '-total'], [_rd => '-read'], [_wr => '-write']) {
> -	my ($dir, $qmpname) = @$type;
> -	if (my $v = $drive->{"mbps$dir"}) {
> -	    $opts .= ",throttling.bps$qmpname=".int($v*1024*1024);
> -	}
> -	if (my $v = $drive->{"mbps${dir}_max"}) {
> -	    $opts .= ",throttling.bps$qmpname-max=".int($v*1024*1024);
> -	}
> -	if (my $v = $drive->{"bps${dir}_max_length"}) {
> -	    $opts .= ",throttling.bps$qmpname-max-length=$v";
> -	}
> -	if (my $v = $drive->{"iops${dir}"}) {
> -	    $opts .= ",throttling.iops$qmpname=$v";
> -	}
> -	if (my $v = $drive->{"iops${dir}_max"}) {
> -	    $opts .= ",throttling.iops$qmpname-max=$v";
> -	}
> -	if (my $v = $drive->{"iops${dir}_max_length"}) {
> -	    $opts .= ",throttling.iops$qmpname-max-length=$v";
> -	}
> +    if ($drive->{zeroinit}) {
> +	#fixme how to handle zeroinit ? insert special blockdev filter ?
>      }
>  
> -    if ($live_restore_name) {
> -	$format = "rbd" if $is_rbd;
> -	die "$drive_id: Proxmox Backup Server backed drive cannot auto-detect the format\n"
> -	    if !$format;
> -	$opts .= ",format=alloc-track,file.driver=$format";
> -    } elsif ($format) {
> -	$opts .= ",format=$format";
> +    my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid, 1);

so I guess this should never be called with nbd-URI-volids?

nit: $volname is not used anywhere, so can be removed..

> +
> +    if($storeid) {
> +	$scfg = PVE::Storage::storage_config($storecfg, $storeid);
> +        $format = checked_volume_format($storecfg, $volid);

this is missing the comparison against $drive->{format}

> +	$path = PVE::Storage::path($storecfg, $volid);

this is not used anywhere..

>      }
>  
> +    my $readonly = defined($drive->{ro}) || $force_readonly ? JSON::true : JSON::false;
> +
> +    #libvirt define cache option on both format && file
>      my $cache_direct = drive_uses_cache_direct($drive, $scfg);
> +    my $cache = {};
> +    $cache->{direct} = $cache_direct ? JSON::true : JSON::false;
> +    $cache->{'no-flush'} = $drive->{cache} && $drive->{cache} eq 'unsafe' ? JSON::true : JSON::false;

so we have the same code in two places? should probably be a helper then to not have them go out of sync..

>  
> -    $opts .= ",cache=none" if !$drive->{cache} && $cache_direct;
> +    my $blockdev = { driver => $format, file => $file, cache => $cache, 'read-only' => $readonly };
> +    $blockdev->{'node-name'} = $nodename if $nodename;
>  
> -    if (!$drive->{aio}) {
> -	if ($io_uring && storage_allows_io_uring_default($scfg, $cache_direct)) {
> -	    # io_uring supports all cache modes
> -	    $opts .= ",aio=io_uring";
> -	} else {
> -	    # aio native works only with O_DIRECT
> -	    if($cache_direct) {
> -		$opts .= ",aio=native";
> -	    } else {
> -		$opts .= ",aio=threads";
> -	    }
> -	}
> -    }
> +    return $blockdev;
>  
> -    if (!drive_is_cdrom($drive)) {
> -	my $detectzeroes;
> -	if (defined($drive->{detect_zeroes}) && !$drive->{detect_zeroes}) {
> -	    $detectzeroes = 'off';
> -	} elsif ($drive->{discard}) {
> -	    $detectzeroes = $drive->{discard} eq 'on' ? 'unmap' : 'on';
> -	} else {
> -	    # This used to be our default with discard not being specified:
> -	    $detectzeroes = 'on';
> -	}
> +}
>  
> -	# note: 'detect-zeroes' works per blockdev and we want it to persist
> -	# after the alloc-track is removed, so put it on 'file' directly
> -	my $dz_param = $live_restore_name ? "file.detect-zeroes" : "detect-zeroes";
> -	$opts .= ",$dz_param=$detectzeroes" if $detectzeroes;
> -    }
> +sub generate_drive_blockdev {
> +    my ($storecfg, $vmid, $drive, $force_readonly, $live_restore_name) = @_;
>  
> -    if ($live_restore_name) {
> -	$opts .= ",backing=$live_restore_name";
> -	$opts .= ",auto-remove=on";
> +    my $path;
> +    my $volid = $drive->{file};
> +    my $format = $drive->{format};

this is only used once below

> +    my $drive_id = get_drive_id($drive);
> +
> +    my ($storeid, $volname) = PVE::Storage::parse_volume_id($volid, 1);
> +    my $scfg = $storeid ? PVE::Storage::storage_config($storecfg, $storeid) : undef;
> +
> +    my $blockdevs = [];
> +
> +    if (drive_is_cdrom($drive)) {
> +        die "$drive_id: cannot back cdrom drive with a live restore image\n" if $live_restore_name;
> +
> +	$path = get_iso_path($storecfg, $vmid, $volid);
> +	return if !$path;
> +	$force_readonly = 1;
>      }
>  
> -    # my $file_param = $live_restore_name ? "file.file.filename" : "file";
> -    my $file_param = "file";
> +    my $file_nodename = "file-drive-$drive_id";
> +    my $blockdev_file = generate_file_blockdev($storecfg, $drive, $file_nodename);
> +    my $fmt_nodename = "fmt-drive-$drive_id";
> +    my $blockdev_format = generate_format_blockdev($storecfg, $drive, $fmt_nodename, $blockdev_file, $force_readonly);
> +
> +    my $blockdev_live_restore = undef;
>      if ($live_restore_name) {
> -	# non-rbd drivers require the underlying file to be a separate block
> -	# node, so add a second .file indirection
> -	$file_param .= ".file" if !$is_rbd;
> -	$file_param .= ".filename";
> +        die "$drive_id: Proxmox Backup Server backed drive cannot auto-detect the format\n"
> +            if !$format;

for this check, but it is not actually set anywhere here.. so is something missing or can the check go?

> +
> +        $blockdev_live_restore = { 'node-name' => "liverestore-drive-$drive_id",
> +				    backing => $live_restore_name,
> +				    'auto-remove' => 'on', format => "alloc-track",
> +				    file => $blockdev_format };
>      }
> -    my $pathinfo = $path ? "$file_param=$path," : '';
>  
> -    return "${pathinfo}if=none,id=drive-$drive->{interface}$drive->{index}$opts";
> +    #this is the topfilter entry point, use $drive-drive_id as nodename
> +    my $blockdev_throttle = { driver => "throttle", 'node-name' => "drive-$drive_id", 'throttle-group' => "throttle-drive-$drive_id" };
> +    #put liverestore filter between throttle && format filter
> +    $blockdev_throttle->{file} = $live_restore_name ? $blockdev_live_restore : $blockdev_format;
> +    return $blockdev_throttle,
>  }
>  
>  sub print_pbs_blockdev {
> @@ -4091,13 +4210,13 @@ sub config_to_command {
>  	    push @$devices, '-blockdev', $live_restore->{blockdev};
>  	}
>  
> -	my $drive_cmd = print_drive_commandline_full(
> -	    $storecfg, $vmid, $drive, $live_blockdev_name, min_version($kvmver, 6, 0));
> -
> -	# extra protection for templates, but SATA and IDE don't support it..
> -	$drive_cmd .= ',readonly=on' if drive_is_read_only($conf, $drive);
> +	my $throttle_group = print_drive_throttle_group($drive);
> +	push @$devices, '-object', $throttle_group if $throttle_group;
>  
> -	push @$devices, '-drive',$drive_cmd;
> +#	# extra protection for templates, but SATA and IDE don't support it..
> +	my $force_readonly = drive_is_read_only($conf, $drive);
> +	my $blockdev = generate_drive_blockdev($storecfg, $vmid, $drive, $force_readonly, $live_blockdev_name);
> +	push @$devices, '-blockdev', encode_json_ordered($blockdev) if $blockdev;
>  	push @$devices, '-device', print_drivedevice_full(
>  	    $storecfg, $conf, $vmid, $drive, $bridges, $arch, $machine_type);
>      });
> @@ -8986,4 +9105,8 @@ sub delete_ifaces_ipams_ips {
>      }
>  }
>  
> +sub encode_json_ordered {
> +   return JSON->new->canonical->allow_nonref->encode( $_[0] );
> +}

this is only used in a single place..

> +
>  1;
> -- 
> 2.39.5




More information about the pve-devel mailing list