[pve-devel] [PATCH manager 11/20] ceph: osd: rework creation with ceph-volume

Thomas Lamprecht t.lamprecht at proxmox.com
Wed Jun 5 10:59:08 CEST 2019


On 6/4/19 2:47 PM, Dominik Csapak wrote:
> this completely rewrites the ceph osd creation api call using ceph-volume
> since ceph-disk is not available anymore
> 
> breaking changes:
> no filestore anymore, journal_dev -> db_dev
> 
> it is now possible to give a specific size for db/wal, default
> is to read from ceph db/config and fallback is
> 10% of osd for block.db and 1% of osd for block.wal
> 
> the reason is that ceph-volume does not autocreate those itself
> (like ceph-disk) but you have to create it yourself
> 
> if the db/wal device has an lvm on it with naming scheme 'ceph-UUID'
> it uses that and creates a new lv
> 
> if we detect partitions, we create a new partition at the end
> 
> if the disk is not used at all, we create a pv/vg/lv for it
> 
> it is not possible to create osds on luminous with this api call anymore,
> anyone needing this has to use ceph-disk directly

applied, with a few followups (see inline) and a request for you to fix
the thing(s) we discussed off-list (see also inline).

> 
> Signed-off-by: Dominik Csapak <d.csapak at proxmox.com>
> ---
>  PVE/API2/Ceph/OSD.pm | 186 +++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 130 insertions(+), 56 deletions(-)
> 
> diff --git a/PVE/API2/Ceph/OSD.pm b/PVE/API2/Ceph/OSD.pm
> index adb0025c..ae938016 100644
> --- a/PVE/API2/Ceph/OSD.pm
> +++ b/PVE/API2/Ceph/OSD.pm
> @@ -5,12 +5,14 @@ use warnings;
>  
>  use Cwd qw(abs_path);
>  use IO::File;
> +use UUID;
>  
>  use PVE::Ceph::Tools;
>  use PVE::Ceph::Services;
>  use PVE::CephConfig;
>  use PVE::Cluster qw(cfs_read_file cfs_write_file);
>  use PVE::Diskmanage;
> +use PVE::Storage::LVMPlugin;
>  use PVE::Exception qw(raise_param_exc);
>  use PVE::JSONSchema qw(get_standard_option);
>  use PVE::RADOS;
> @@ -198,28 +200,39 @@ __PACKAGE__->register_method ({
>  		description => "Block device name.",
>  		type => 'string',
>  	    },
> -	    journal_dev => {
> -		description => "Block device name for journal (filestore) or block.db (bluestore).",
> +	    db_dev => {
> +		description => "Block device name for block.db.",
>  		optional => 1,
>  		type => 'string',
>  	    },
> -	    wal_dev => {
> -		description => "Block device name for block.wal (bluestore only).",
> +	    db_size => {
> +		description => "Size in GiB for block.db. ".

moved below in "verbose_description" and added a "default" property describing
the behaviour in a short manner

> +		    "If a block.db is requested but the size is not given, will be ".
> +		    "automatically selected by: bluestore_block_db_size from the ".
> +		    "ceph database (osd or global section) or config (osd or global section) ".
> +		    "in that order. If this is not available, it will be sized 10% of the size ".
> +		    "of the OSD device. Fails if the available size is not enough.",
>  		optional => 1,
> -		type => 'string',
> +		type => 'number',
> +		requires => 'db_dev',
> +		minimum => 1.0,
>  	    },
> -	    fstype => {
> -		description => "File system type (filestore only).",
> -		type => 'string',
> -		enum => ['xfs', 'ext4'],
> -		default => 'xfs',
> +	    wal_dev => {
> +		description => "Block device name for block.wal.",
>  		optional => 1,
> +		type => 'string',
>  	    },
> -	    bluestore => {
> -		description => "Use bluestore instead of filestore. This is the default.",
> -		type => 'boolean',
> -		default => 1,
> +	    wal_size => {
> +		description => "Size in GiB for block.wal. ".

followed-up with same as above

> +		    "If a block.wal is requested but the size is not given, will be ".
> +		    "automatically selected by: bluestore_block_wal_size from the ".
> +		    "ceph database (osd or global section) or config (osd or global section) ".
> +		    "in that order. If this is not available, it will be sized 1% of the size ".
> +		    "of the OSD device. Fails if the available size is not enough.",
>  		optional => 1,
> +		minimum => 0.5,
> +		requires => 'wal_dev',
> +		type => 'number',
>  	    },
>  	},
>      },
> @@ -231,44 +244,53 @@ __PACKAGE__->register_method ({
>  
>  	my $authuser = $rpcenv->get_user();
>  
> -	raise_param_exc({ 'bluestore' => "conflicts with parameter 'fstype'" })
> -	    if (defined($param->{fstype}) && defined($param->{bluestore}) && $param->{bluestore});
> -
>  	PVE::Ceph::Tools::check_ceph_inited();
>  
>  	PVE::Ceph::Tools::setup_pve_symlinks();
>  
>  	PVE::Ceph::Tools::check_ceph_installed('ceph_osd');
> +	PVE::Ceph::Tools::check_ceph_installed('ceph_volume');
>  
> -	my $bluestore = $param->{bluestore} // 1;
> -
> -	my $journal_dev;
> +	my $dev;
> +	my $db_dev;
> +	my $db_devname;
> +	my $db_size;
>  	my $wal_dev;
> +	my $wal_devname;
> +	my $wal_size;
>  
> -	if ($param->{journal_dev} && ($param->{journal_dev} ne $param->{dev})) {
> -            $journal_dev = PVE::Diskmanage::verify_blockdev_path($param->{journal_dev});
> +	if ($param->{db_dev} && ($param->{db_dev} ne $param->{dev})) {
> +            $db_dev = PVE::Diskmanage::verify_blockdev_path($param->{db_dev});
> +	    if (defined($param->{db_size})) {
> +		$db_size = PVE::Tools::convert_size($param->{db_size}, 'gb' => 'b') ;
> +	    }
> +	    ($db_devname = $db_dev) =~ s|/dev/||;
>  	}
>  
>  	if ($param->{wal_dev} &&
>  	    ($param->{wal_dev} ne $param->{dev}) &&
> -	    (!$param->{journal_dev} || $param->{wal_dev} ne $param->{journal_dev})) {
> -	    raise_param_exc({ 'wal_dev' => "can only be set with paramater 'bluestore'"})
> -		if !$bluestore;
> +	    (!$param->{db_dev} || $param->{wal_dev} ne $param->{db_dev})) {
>              $wal_dev = PVE::Diskmanage::verify_blockdev_path($param->{wal_dev});
> +	    if (defined($param->{wal_size})) {
> +		$wal_size = PVE::Tools::convert_size($param->{wal_size}, 'gb' => 'b') ;
> +	    }
> +	    ($wal_devname = $wal_dev) =~ s|/dev/||;
>  	}

I do not like that the logic of "merging wal and db if they point to the
same devices" is 1) just silent (wal_size gets ignored in this case) and
2) is a bit disconnected from the things happening in the worker, i.e.,
"far away" from it while those lay heavy assumptions on that functionality
without noting it anywhere, which cries for regression potential if one touches
this without having it in mind, IMO...

>  
> -        $param->{dev} = PVE::Diskmanage::verify_blockdev_path($param->{dev});
> +        $dev = PVE::Diskmanage::verify_blockdev_path($param->{dev});
>  
> -	my $devname = $param->{dev};
> -	$devname =~ s|/dev/||;
> +	(my $devname = $dev) =~ s|/dev/||;
> +	my $devs = [$devname];
> +	push @$devs, $db_devname if $db_devname;
> +	push @$devs, $wal_devname if $wal_devname;
>  
> -	my $disklist = PVE::Diskmanage::get_disks($devname, 1);
> +	my $disklist = PVE::Diskmanage::get_disks($devs, 1);
>  
>  	my $diskinfo = $disklist->{$devname};
>  	die "unable to get device info for '$devname'\n"
>  	    if !$diskinfo;
>  
> -	die "device '$param->{dev}' is in use\n"
> +	die "device '$dev' is in use\n"
>  	    if $diskinfo->{used};
>  
>  	my $devpath = $diskinfo->{devpath};
> @@ -286,46 +308,98 @@ __PACKAGE__->register_method ({
>  	    file_set_contents($ceph_bootstrap_osd_keyring, $bindata);
>  	};
>  
> -	my $worker = sub {
> -	    my $upid = shift;
> +	my $create_part_or_lv = sub {
> +	    my ($dev, $size, $type) = @_;
> +
> +	    if ($size =~ m/^(\d+)$/) {
> +		$size = $1;
> +	    } else {
> +		die "invalid size '$size'\n";
> +	    }
> +
> +	    die "'$dev->{devpath}' is smaller than requested size '$size' bytes\n"
> +		if $dev->{size} < $size;
> +
> +	    if (!$dev->{used}) {
> +		# create pv,vg,lv
> +
> +		my $vg = "ceph-" . UUID::uuid();
> +		my $lv = $type . "-" . UUID::uuid();
> +
> +		PVE::Storage::LVMPlugin::lvm_create_volume_group($dev->{devpath}, $vg);
> +		PVE::Storage::LVMPlugin::lvcreate($vg, $lv, "${size}b");
> +
> +		return "$vg/$lv";
>  
> -	    my $fstype = $param->{fstype} || 'xfs';
> +	    } elsif ($dev->{used} eq 'LVM') {
> +		# check pv/vg and create lv
>  
> +		my $vgs = PVE::Storage::LVMPlugin::lvm_vgs(1);
> +		my $vg;
> +		for my $vgname ( sort keys %$vgs ) {
> +		    next if $vgname !~ /^ceph-/;
>  
> -	    my $ccname = PVE::Ceph::Tools::get_config('ccname');
> +		    for my $pv ( @{$vgs->{$vgname}->{pvs}} ) {
> +			next if $pv->{name} ne $dev->{devpath};
> +			$vg = $vgname;
> +			last;
> +		    }
> +		    last if $vg;
> +		}
> +
> +		die "no ceph vg found on '$dev->{devpath}'\n" if !$vg;
> +		die "vg '$vg' has not enough free space\n" if $vgs->{$vg}->{free} < $size;
> +
> +		my $lv = $type . "-" . "012345";

followed above up to using UUID::uuid() like done in the other branch
above, as 012345 is a bit of a boring UUID ;-)

> +
> +		PVE::Storage::LVMPlugin::lvcreate($vg, $lv, "${size}b");
> +
> +		return "$vg/$lv";
> +
> +	    } elsif ($dev->{used} eq 'partitions') {
> +		# create new partition at the end
> +
> +		return PVE::Diskmanage::append_partition($dev->{devpath}, $size);
> +	    }
> +
> +	    die "cannot use '$dev->{devpath}' for '$type'\n";
> +	};
> +
> +	my $worker = sub {
> +	    my $upid = shift;
> +
> +	    PVE::Diskmanage::locked_disk_action(sub {

diskscan/get disk should only be done here, and previously only the OSD device
should be checked in advance for "early abort", or?

> +		# get db/wal size
> +		if (($db_dev && !defined($db_size)) || ($wal_dev && !defined($wal_size))) {
> +		    my $sizes = PVE::Ceph::Tools::get_db_wal_sizes();
> +		    $db_size = $sizes->{db} // int($disklist->{$devname}->{size} / 10); # 10% of OSD
> +		    $wal_size = $sizes->{wal} // int($disklist->{$devname}->{size} / 100); # 1% of OSD
> +		}
>  
> -	    my $cmd = ['ceph-disk', 'prepare', '--zap-disk',
> -		       '--cluster', $ccname, '--cluster-uuid', $fsid ];
> +		my $cmd = ['ceph-volume', 'lvm', 'create', '--cluster-fsid', $fsid ];
>  
> -	    if ($bluestore) {
>  		print "create OSD on $devpath (bluestore)\n";
> -		push @$cmd, '--bluestore';
>  
> -		if ($journal_dev) {
> -		    print "using device '$journal_dev' for block.db\n";
> -		    push @$cmd, '--block.db', $journal_dev;
> +		if ($db_dev) {
> +		    print "creating block.db on '$db_dev'\n";
> +		    my $part_or_lv = $create_part_or_lv->($disklist->{$db_devname}, $db_size, 'osd-db');
> +		    print "using '$part_or_lv' for block.db\n";
> +		    push @$cmd, '--block.db', $part_or_lv;
>  		}
>  
>  		if ($wal_dev) {
> -		    print "using device '$wal_dev' for block.wal\n";
> -		    push @$cmd, '--block.wal', $wal_dev;
> +		    print "creating block.wal on '$wal_dev'\n";
> +		    my $part_or_lv = $create_part_or_lv->($disklist->{$wal_devname}, $wal_size, 'osd-wal');
> +		    print "using '$part_or_lv' for block.wal\n";
> +		    push @$cmd, '--block.wal', $part_or_lv;
>  		}
>  
> -		push @$cmd, $devpath;
> -	    } else {
> -		print "create OSD on $devpath ($fstype)\n";
> -		push @$cmd, '--filestore', '--fs-type', $fstype;
> -		if ($journal_dev) {
> -		    print "using device '$journal_dev' for journal\n";
> -		    push @$cmd, '--journal-dev', $devpath, $journal_dev;
> -		} else {
> -		    push @$cmd, $devpath;
> -		}
> -	    }
> +		push @$cmd, '--data', $devpath;
>  
> -	    PVE::Ceph::Tools::wipe_disks($devpath);
> +		PVE::Ceph::Tools::wipe_disks($devpath);
>  
> -	    run_command($cmd);
> +		run_command($cmd);
> +	    });
>  	};
>  
>  	return $rpcenv->fork_worker('cephcreateosd', $devname,  $authuser, $worker);
> 





More information about the pve-devel mailing list