[pve-devel] [PATCH cluster 1/5] corosync: add verify_conf

Fri Nov 22 10:54:20 CET 2019

On November 20, 2019 5:43 pm, Stefan Reiter wrote:
> It does some basic sanity checking, warns the user about encryption
> settings and unresolved hostnames, and finally makes sure that all nodes
> have the same links configured (as well as comparing the configured
> links to specified interfaces, if there are any).
> 
> A corosync.conf that has been created and modified strictly through our
> API should *always* be valid.
> 
> verify_conf is called in 'addnode', warnings and errors are returned via
> the API to be displayed in the task log of the node asking to join. If a
> verification error occurs, it is handled specially via a "raise" outside
> of any lock code that strips extra information from an Exception
> instance. This ensures that multi-line formatted errors can be returned.
> Warnings are always returned as array, to be printed on the caller.
> 
> Includes testing.
> 
> Signed-off-by: Stefan Reiter <s.reiter at proxmox.com>

this would be great to re-use for pve5to6 (and more importantly, a 
future pve6to7 ;))

> ---
>  data/PVE/API2/ClusterConfig.pm    |  27 +++++++-
>  data/PVE/Cluster/Setup.pm         |  20 +++++-
>  data/PVE/Corosync.pm              | 103 +++++++++++++++++++++++++++++-
>  data/test/corosync_parser_test.pl |  28 ++++++++
>  4 files changed, 175 insertions(+), 3 deletions(-)
> 
> diff --git a/data/PVE/API2/ClusterConfig.pm b/data/PVE/API2/ClusterConfig.pm
> index c426a30..3d778b8 100644
> --- a/data/PVE/API2/ClusterConfig.pm
> +++ b/data/PVE/API2/ClusterConfig.pm
> @@ -3,6 +3,7 @@ package PVE::API2::ClusterConfig;
>  use strict;
>  use warnings;
>  
> +use PVE::Exception;
>  use PVE::Tools;
>  use PVE::SafeSyslog;
>  use PVE::RESTHandler;
> @@ -219,7 +220,13 @@ __PACKAGE__->register_method ({
>  	    },
>  	    corosync_conf => {
>  		type => 'string',
> -	    }
> +	    },
> +	    warnings => {
> +		type => 'array',
> +		items => {
> +		    type => 'string',
> +		},
> +	    },
>  	},
>      },
>      code => sub {
> @@ -227,11 +234,17 @@ __PACKAGE__->register_method ({
>  
>  	PVE::Cluster::check_cfs_quorum();
>  
> +	my $vc_errors;
> +	my $vc_warnings;
> +
>  	my $code = sub {
>  	    my $conf = PVE::Cluster::cfs_read_file("corosync.conf");
>  	    my $nodelist = PVE::Corosync::nodelist($conf);
>  	    my $totem_cfg = PVE::Corosync::totem_config($conf);
>  
> +	    ($vc_errors, $vc_warnings) = PVE::Corosync::verify_conf($conf);
> +	    die if scalar(@$vc_errors);
> +
>  	    my $name = $param->{node};
>  
>  	    # ensure we do not reuse an address, that can crash the whole cluster!
> @@ -317,11 +330,23 @@ __PACKAGE__->register_method ({
>  	};
>  
>  	$config_change_lock->($code);
> +
> +	# If vc_errors is set, we died because of verify_conf.
> +	# Raise this error, since it contains more information than just a
> +	# single-line string.
> +	if (defined($vc_errors) && scalar(@$vc_errors)) {
> +	    my @combined = ();
> +	    push @combined, @$vc_errors, @$vc_warnings;
> +	    my %err_hash = map { $_ => $combined[$_] } 0..$#combined;
> +	    PVE::Exception::raise("invalid corosync.conf\n", errors => \%err_hash);

it might make sense to somehow differentiate between errors and warnings 
here? so that a caller/user knows which parts are actually causing this 
to fail..

> +	}
> +
>  	die $@ if $@;
>  
>  	my $res = {
>  	    corosync_authkey => PVE::Tools::file_get_contents($authfile),
>  	    corosync_conf => PVE::Tools::file_get_contents($clusterconf),
> +	    warnings => $vc_warnings,
>  	};
>  
>  	return $res;
> diff --git a/data/PVE/Cluster/Setup.pm b/data/PVE/Cluster/Setup.pm
> index 81e3ef8..5569d6c 100644
> --- a/data/PVE/Cluster/Setup.pm
> +++ b/data/PVE/Cluster/Setup.pm
> @@ -667,7 +667,25 @@ sub join {
>      $args->{link1} = $param->{link1} if defined($param->{link1});
>  
>      print "Request addition of this node\n";
> -    my $res = $conn->post("/cluster/config/nodes/$nodename", $args);
> +    my $res = eval { $conn->post("/cluster/config/nodes/$nodename", $args); };
> +    if ($@) {

if (my $err = $@) {

the block below is already big, who knows what happens in the future 
that might (re)set $@ half-way through ;)

> +	if ($@->{msg} && $@->{errors}) {

would it make sense to check whether $@ is a PVE::Exception here?

> +	    # we received additional info about the error, show the user
> +	    $@->{msg} =~ s/^(\n|\s)*|(\n|\s)*$//g;

PVE::Tools::trim ? do we actually care about newlines here (or why do we 
include one below, just to remove it again here)?

> +	    warn "An error occured on the cluster node: $@->{msg}\n";
> +	    foreach my $err (keys %{$@->{errors}}) {
> +		warn "* $@->{errors}->{$err}\n"
> +	    }
> +	    die "Cluster join aborted!\n";
> +	}
> +	die $@;
> +    }
> +
> +    if (defined($res->{warnings})) {
> +	foreach my $warn (@{$res->{warnings}}) {
> +	    warn "cluster: $warn\n";
> +	}
> +    }
>  
>      print "Join request OK, finishing setup locally\n";
>  
> diff --git a/data/PVE/Corosync.pm b/data/PVE/Corosync.pm
> index f52547b..5973aaf 100644
> --- a/data/PVE/Corosync.pm
> +++ b/data/PVE/Corosync.pm
> @@ -15,6 +15,7 @@ use PVE::Tools;
>  use PVE::Tools qw($IPV4RE $IPV6RE);
>  
>  my $basedir = "/etc/pve";
> +our $link_addr_re = qw/^(ring|link)(\d+)_addr$/;
>  
>  my $conf_array_sections = {
>      node => 1,
> @@ -294,6 +295,106 @@ sub create_conf {
>      return { main => $conf };
>  }
>  
> +# returns (\@errors, \@warnings) to the caller, does *not* 'die' or 'warn'
> +# verification was successful if \@errors is empty
> +sub verify_conf {
> +    my ($conf) = @_;
> +
> +    my @errors = ();
> +    my @warnings = ();
> +
> +    my $nodelist = nodelist($conf);
> +    if (!$nodelist) {
> +	push @errors, "no nodes found";
> +	return (\@errors, \@warnings);
> +    }
> +
> +    my $totem = $conf->{main}->{totem};
> +    if (!$totem) {
> +	push @errors, "no totem found";
> +	return (\@errors, \@warnings);
> +    }
> +
> +    push @warnings, "warning: authentication/encryption is not explicitly enabled"
> +	. " (secauth / crypto_cipher / crypto_hash)"
> +	if (!defined($totem->{secauth}) || $totem->{secauth} ne 'on') &&
> +	   (!defined($totem->{crypto_cipher}) || $totem->{crypto_cipher} eq 'none');
> +
> +    my $interfaces = $totem->{interface};
> +
> +    my $verify_link_ip = sub {
> +	my ($key, $link, $node) = @_;
> +	my ($resolved_ip, undef) = resolve_hostname_like_corosync($link, $conf);
> +	if (defined($resolved_ip)) {
> +	    push @warnings, "warning: $key '$link' for node '$node' resolves to"
> +		. " '$resolved_ip' - consider replacing it with the currently"
> +		. " resolved IP address for stability"
> +		if $resolved_ip ne $link;
> +	} else {
> +	    push @warnings, "warning: unable to resolve $key '$link' for node '$node'"
> +	      . " to an IP address according to Corosync's resolve strategy -"
> +	      . " cluster could fail on restart!";
> +	}
> +    };
> +
> +    my $find_option = sub {

could get a better name? it's not about finding an(y) option, it's about
- determining ring* vs link*
- getting the value of ring/linkX_addr of node Y

at least 'find_addr_option' or 'get_addr_option' would clarify a bit 
what it's about. alternatively, could also do what happens in the API, 
and first collect all the link/ring options of all nodes into a hash, 
and then use that directly for checking in the loops below. would also 
avoid iterating over each nodes config 8 times instead of just once ;)

> +	my ($node, $num) = @_;
> +
> +	foreach my $opt (keys %$node) {
> +	    next if $opt !~ $link_addr_re;
> +	    return ("${1}${2}_addr", $node->{$opt}) if int($2) == $num;
> +	}
> +
> +	return undef;
> +    };
> +
> +    # sort for output order stability
> +    my @node_names = sort keys %$nodelist;
> +
> +    if (%$interfaces) {
> +	# if interfaces are defined, *all* links must have a matching interface
> +	# definition, and vice versa
> +	for my $link (0..1) {
> +	    my $have_interface = defined($interfaces->{$link});
> +	    foreach my $node (@node_names) {
> +		my ($optname, $addr) = $find_option->($nodelist->{$node}, $link);
> +		if (defined($optname)) {
> +		    $verify_link_ip->($optname, $addr, $node);
> +		    push @errors, "node '$node' has '$optname', but there is no"
> +				. " interface number $link configured"
> +			if !$have_interface;
> +		} else {
> +		    push @errors, "node '$node' is missing address for interface"
> +				. " number $link"
> +			if $have_interface;
> +		}
> +	    }
> +	}
> +    } else {
> +	# without interfaces, only check that links are consistent among nodes
> +	for my $link (0..1) {
> +	    my $nodes_with_link = {};
> +	    foreach my $node (@node_names) {
> +		my ($optname, $addr) = $find_option->($nodelist->{$node}, $link);
> +		if (defined($optname)) {
> +		    $verify_link_ip->($optname, $addr, $node);
> +		    $nodes_with_link->{$node} = 1;
> +		}
> +	    }
> +
> +	    if (%$nodes_with_link) {
> +		foreach my $node (@node_names) {
> +		    push @errors, "node '$node' is missing link $link, which"
> +				. " is configured on other nodes"
> +			if !defined($nodes_with_link->{$node});
> +		}
> +	    }
> +	}
> +    }
> +
> +    return (\@errors, \@warnings);
> +}
> +
>  sub for_all_corosync_addresses {
>      my ($corosync_conf, $ip_version, $func) = @_;
>  
> @@ -304,7 +405,7 @@ sub for_all_corosync_addresses {
>      foreach my $node_name (sort keys %$nodelist) {
>  	my $node_config = $nodelist->{$node_name};
>  	foreach my $node_key (sort keys %$node_config) {
> -	    if ($node_key =~ /^(ring|link)\d+_addr$/) {
> +	    if ($node_key =~ $link_addr_re) {
>  		my $node_address = $node_config->{$node_key};
>  
>  		my($ip, $version) = resolve_hostname_like_corosync($node_address, $corosync_conf);
> diff --git a/data/test/corosync_parser_test.pl b/data/test/corosync_parser_test.pl
> index 18ee4f7..42c4853 100755
> --- a/data/test/corosync_parser_test.pl
> +++ b/data/test/corosync_parser_test.pl
> @@ -5,10 +5,35 @@ use lib '..';
>  use strict;
>  use warnings;
>  
> +use Test::MockModule;
>  use Test::More;
>  
>  use PVE::Corosync;
>  
> +my $known_hosts = {
> +    "prox1" => "127.0.1.1",
> +    "prox1-ring1" => "127.0.2.1",
> +    "prox2" => "127.0.1.2",
> +    "prox2-ring1" => "127.0.2.2",
> +    "prox3" => "127.0.1.3",
> +    "prox3-ring1" => "127.0.2.3",
> +    "prox4" => "127.0.1.4",
> +    "prox4-ring1" => "127.0.2.4",
> +};
> +
> +sub mocked_resolve {
> +    my ($hostname) = @_;
> +
> +    foreach my $host (keys %$known_hosts) {
> +	return $known_hosts->{$host} if $hostname eq $host;
> +    }
> +
> +    die "got unknown hostname '$hostname' during mocked resolve_hostname_like_corosync";
> +}
> +
> +my $mocked_corosync_module = new Test::MockModule('PVE::Corosync');
> +$mocked_corosync_module->mock('resolve_hostname_like_corosync', \&mocked_resolve);
> +
>  sub parser_self_check {
>      my $cfg_fn = shift;
>  
> @@ -20,6 +45,9 @@ sub parser_self_check {
>  	$raw1 = PVE::Tools::file_get_contents($cfg_fn);
>  	$config1 = PVE::Corosync::parse_conf($cfg_fn, $raw1);
>  
> +	# test verify_config
> +	PVE::Corosync::verify_conf($config1);
> +
>  	# write config
>  	$raw2 = PVE::Corosync::write_conf(undef, $config1);
>  	# do not actually write cfg, but you can outcomment to do so, e.g. if
> -- 
> 2.20.1
> 
> 
> _______________________________________________
> pve-devel mailing list
> pve-devel at pve.proxmox.com
> https://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
> 
>