[pve-devel] [PATCH cluster 1/5] corosync: add verify_conf
Fabian Grünbichler
f.gruenbichler at proxmox.com
Fri Nov 22 10:54:20 CET 2019
On November 20, 2019 5:43 pm, Stefan Reiter wrote:
> It does some basic sanity checking, warns the user about encryption
> settings and unresolved hostnames, and finally makes sure that all nodes
> have the same links configured (as well as comparing the configured
> links to specified interfaces, if there are any).
>
> A corosync.conf that has been created and modified strictly through our
> API should *always* be valid.
>
> verify_conf is called in 'addnode', warnings and errors are returned via
> the API to be displayed in the task log of the node asking to join. If a
> verification error occurs, it is handled specially via a "raise" outside
> of any lock code that strips extra information from an Exception
> instance. This ensures that multi-line formatted errors can be returned.
> Warnings are always returned as array, to be printed on the caller.
>
> Includes testing.
>
> Signed-off-by: Stefan Reiter <s.reiter at proxmox.com>
this would be great to re-use for pve5to6 (and more importantly, a
future pve6to7 ;))
> ---
> data/PVE/API2/ClusterConfig.pm | 27 +++++++-
> data/PVE/Cluster/Setup.pm | 20 +++++-
> data/PVE/Corosync.pm | 103 +++++++++++++++++++++++++++++-
> data/test/corosync_parser_test.pl | 28 ++++++++
> 4 files changed, 175 insertions(+), 3 deletions(-)
>
> diff --git a/data/PVE/API2/ClusterConfig.pm b/data/PVE/API2/ClusterConfig.pm
> index c426a30..3d778b8 100644
> --- a/data/PVE/API2/ClusterConfig.pm
> +++ b/data/PVE/API2/ClusterConfig.pm
> @@ -3,6 +3,7 @@ package PVE::API2::ClusterConfig;
> use strict;
> use warnings;
>
> +use PVE::Exception;
> use PVE::Tools;
> use PVE::SafeSyslog;
> use PVE::RESTHandler;
> @@ -219,7 +220,13 @@ __PACKAGE__->register_method ({
> },
> corosync_conf => {
> type => 'string',
> - }
> + },
> + warnings => {
> + type => 'array',
> + items => {
> + type => 'string',
> + },
> + },
> },
> },
> code => sub {
> @@ -227,11 +234,17 @@ __PACKAGE__->register_method ({
>
> PVE::Cluster::check_cfs_quorum();
>
> + my $vc_errors;
> + my $vc_warnings;
> +
> my $code = sub {
> my $conf = PVE::Cluster::cfs_read_file("corosync.conf");
> my $nodelist = PVE::Corosync::nodelist($conf);
> my $totem_cfg = PVE::Corosync::totem_config($conf);
>
> + ($vc_errors, $vc_warnings) = PVE::Corosync::verify_conf($conf);
> + die if scalar(@$vc_errors);
> +
> my $name = $param->{node};
>
> # ensure we do not reuse an address, that can crash the whole cluster!
> @@ -317,11 +330,23 @@ __PACKAGE__->register_method ({
> };
>
> $config_change_lock->($code);
> +
> + # If vc_errors is set, we died because of verify_conf.
> + # Raise this error, since it contains more information than just a
> + # single-line string.
> + if (defined($vc_errors) && scalar(@$vc_errors)) {
> + my @combined = ();
> + push @combined, @$vc_errors, @$vc_warnings;
> + my %err_hash = map { $_ => $combined[$_] } 0..$#combined;
> + PVE::Exception::raise("invalid corosync.conf\n", errors => \%err_hash);
it might make sense to somehow differentiate between errors and warnings
here? so that a caller/user knows which parts are actually causing this
to fail..
> + }
> +
> die $@ if $@;
>
> my $res = {
> corosync_authkey => PVE::Tools::file_get_contents($authfile),
> corosync_conf => PVE::Tools::file_get_contents($clusterconf),
> + warnings => $vc_warnings,
> };
>
> return $res;
> diff --git a/data/PVE/Cluster/Setup.pm b/data/PVE/Cluster/Setup.pm
> index 81e3ef8..5569d6c 100644
> --- a/data/PVE/Cluster/Setup.pm
> +++ b/data/PVE/Cluster/Setup.pm
> @@ -667,7 +667,25 @@ sub join {
> $args->{link1} = $param->{link1} if defined($param->{link1});
>
> print "Request addition of this node\n";
> - my $res = $conn->post("/cluster/config/nodes/$nodename", $args);
> + my $res = eval { $conn->post("/cluster/config/nodes/$nodename", $args); };
> + if ($@) {
if (my $err = $@) {
the block below is already big, who knows what happens in the future
that might (re)set $@ half-way through ;)
> + if ($@->{msg} && $@->{errors}) {
would it make sense to check whether $@ is a PVE::Exception here?
> + # we received additional info about the error, show the user
> + $@->{msg} =~ s/^(\n|\s)*|(\n|\s)*$//g;
PVE::Tools::trim ? do we actually care about newlines here (or why do we
include one below, just to remove it again here)?
> + warn "An error occured on the cluster node: $@->{msg}\n";
> + foreach my $err (keys %{$@->{errors}}) {
> + warn "* $@->{errors}->{$err}\n"
> + }
> + die "Cluster join aborted!\n";
> + }
> + die $@;
> + }
> +
> + if (defined($res->{warnings})) {
> + foreach my $warn (@{$res->{warnings}}) {
> + warn "cluster: $warn\n";
> + }
> + }
>
> print "Join request OK, finishing setup locally\n";
>
> diff --git a/data/PVE/Corosync.pm b/data/PVE/Corosync.pm
> index f52547b..5973aaf 100644
> --- a/data/PVE/Corosync.pm
> +++ b/data/PVE/Corosync.pm
> @@ -15,6 +15,7 @@ use PVE::Tools;
> use PVE::Tools qw($IPV4RE $IPV6RE);
>
> my $basedir = "/etc/pve";
> +our $link_addr_re = qw/^(ring|link)(\d+)_addr$/;
>
> my $conf_array_sections = {
> node => 1,
> @@ -294,6 +295,106 @@ sub create_conf {
> return { main => $conf };
> }
>
> +# returns (\@errors, \@warnings) to the caller, does *not* 'die' or 'warn'
> +# verification was successful if \@errors is empty
> +sub verify_conf {
> + my ($conf) = @_;
> +
> + my @errors = ();
> + my @warnings = ();
> +
> + my $nodelist = nodelist($conf);
> + if (!$nodelist) {
> + push @errors, "no nodes found";
> + return (\@errors, \@warnings);
> + }
> +
> + my $totem = $conf->{main}->{totem};
> + if (!$totem) {
> + push @errors, "no totem found";
> + return (\@errors, \@warnings);
> + }
> +
> + push @warnings, "warning: authentication/encryption is not explicitly enabled"
> + . " (secauth / crypto_cipher / crypto_hash)"
> + if (!defined($totem->{secauth}) || $totem->{secauth} ne 'on') &&
> + (!defined($totem->{crypto_cipher}) || $totem->{crypto_cipher} eq 'none');
> +
> + my $interfaces = $totem->{interface};
> +
> + my $verify_link_ip = sub {
> + my ($key, $link, $node) = @_;
> + my ($resolved_ip, undef) = resolve_hostname_like_corosync($link, $conf);
> + if (defined($resolved_ip)) {
> + push @warnings, "warning: $key '$link' for node '$node' resolves to"
> + . " '$resolved_ip' - consider replacing it with the currently"
> + . " resolved IP address for stability"
> + if $resolved_ip ne $link;
> + } else {
> + push @warnings, "warning: unable to resolve $key '$link' for node '$node'"
> + . " to an IP address according to Corosync's resolve strategy -"
> + . " cluster could fail on restart!";
> + }
> + };
> +
> + my $find_option = sub {
could get a better name? it's not about finding an(y) option, it's about
- determining ring* vs link*
- getting the value of ring/linkX_addr of node Y
at least 'find_addr_option' or 'get_addr_option' would clarify a bit
what it's about. alternatively, could also do what happens in the API,
and first collect all the link/ring options of all nodes into a hash,
and then use that directly for checking in the loops below. would also
avoid iterating over each nodes config 8 times instead of just once ;)
> + my ($node, $num) = @_;
> +
> + foreach my $opt (keys %$node) {
> + next if $opt !~ $link_addr_re;
> + return ("${1}${2}_addr", $node->{$opt}) if int($2) == $num;
> + }
> +
> + return undef;
> + };
> +
> + # sort for output order stability
> + my @node_names = sort keys %$nodelist;
> +
> + if (%$interfaces) {
> + # if interfaces are defined, *all* links must have a matching interface
> + # definition, and vice versa
> + for my $link (0..1) {
> + my $have_interface = defined($interfaces->{$link});
> + foreach my $node (@node_names) {
> + my ($optname, $addr) = $find_option->($nodelist->{$node}, $link);
> + if (defined($optname)) {
> + $verify_link_ip->($optname, $addr, $node);
> + push @errors, "node '$node' has '$optname', but there is no"
> + . " interface number $link configured"
> + if !$have_interface;
> + } else {
> + push @errors, "node '$node' is missing address for interface"
> + . " number $link"
> + if $have_interface;
> + }
> + }
> + }
> + } else {
> + # without interfaces, only check that links are consistent among nodes
> + for my $link (0..1) {
> + my $nodes_with_link = {};
> + foreach my $node (@node_names) {
> + my ($optname, $addr) = $find_option->($nodelist->{$node}, $link);
> + if (defined($optname)) {
> + $verify_link_ip->($optname, $addr, $node);
> + $nodes_with_link->{$node} = 1;
> + }
> + }
> +
> + if (%$nodes_with_link) {
> + foreach my $node (@node_names) {
> + push @errors, "node '$node' is missing link $link, which"
> + . " is configured on other nodes"
> + if !defined($nodes_with_link->{$node});
> + }
> + }
> + }
> + }
> +
> + return (\@errors, \@warnings);
> +}
> +
> sub for_all_corosync_addresses {
> my ($corosync_conf, $ip_version, $func) = @_;
>
> @@ -304,7 +405,7 @@ sub for_all_corosync_addresses {
> foreach my $node_name (sort keys %$nodelist) {
> my $node_config = $nodelist->{$node_name};
> foreach my $node_key (sort keys %$node_config) {
> - if ($node_key =~ /^(ring|link)\d+_addr$/) {
> + if ($node_key =~ $link_addr_re) {
> my $node_address = $node_config->{$node_key};
>
> my($ip, $version) = resolve_hostname_like_corosync($node_address, $corosync_conf);
> diff --git a/data/test/corosync_parser_test.pl b/data/test/corosync_parser_test.pl
> index 18ee4f7..42c4853 100755
> --- a/data/test/corosync_parser_test.pl
> +++ b/data/test/corosync_parser_test.pl
> @@ -5,10 +5,35 @@ use lib '..';
> use strict;
> use warnings;
>
> +use Test::MockModule;
> use Test::More;
>
> use PVE::Corosync;
>
> +my $known_hosts = {
> + "prox1" => "127.0.1.1",
> + "prox1-ring1" => "127.0.2.1",
> + "prox2" => "127.0.1.2",
> + "prox2-ring1" => "127.0.2.2",
> + "prox3" => "127.0.1.3",
> + "prox3-ring1" => "127.0.2.3",
> + "prox4" => "127.0.1.4",
> + "prox4-ring1" => "127.0.2.4",
> +};
> +
> +sub mocked_resolve {
> + my ($hostname) = @_;
> +
> + foreach my $host (keys %$known_hosts) {
> + return $known_hosts->{$host} if $hostname eq $host;
> + }
> +
> + die "got unknown hostname '$hostname' during mocked resolve_hostname_like_corosync";
> +}
> +
> +my $mocked_corosync_module = new Test::MockModule('PVE::Corosync');
> +$mocked_corosync_module->mock('resolve_hostname_like_corosync', \&mocked_resolve);
> +
> sub parser_self_check {
> my $cfg_fn = shift;
>
> @@ -20,6 +45,9 @@ sub parser_self_check {
> $raw1 = PVE::Tools::file_get_contents($cfg_fn);
> $config1 = PVE::Corosync::parse_conf($cfg_fn, $raw1);
>
> + # test verify_config
> + PVE::Corosync::verify_conf($config1);
> +
> # write config
> $raw2 = PVE::Corosync::write_conf(undef, $config1);
> # do not actually write cfg, but you can outcomment to do so, e.g. if
> --
> 2.20.1
>
>
> _______________________________________________
> pve-devel mailing list
> pve-devel at pve.proxmox.com
> https://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
>
>
More information about the pve-devel
mailing list