[pve-devel] [PATCH ha-manager 1/5] Fence: cleanup class
Thomas Lamprecht
t.lamprecht at proxmox.com
Fri Apr 8 17:07:23 CEST 2016
On 04/08/2016 04:51 PM, Thomas Lamprecht wrote:
> A rather big cleanup I already had in a separate branch.
>
> Changes:
> * make process fencing private, it will be called by run_fence_jobs
> (was start_fencing) if the node has already a fence job deployed.
> * fix bugs in process_fencing where if multiple parallel device
> succeeded at once only one was counted, the same for failed devices
> * restrict waitpid to fence_job worker pids (instead of -1) else
> we could get some pretty nasty side effects
> * remove 'resed_hard', 'reset' and 'bailout', they are replaced by
> kill_and_cleanup_jobs
> * some comment and formatting changes
>
> Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
> ---
> src/PVE/HA/Fence.pm | 303 +++++++++++++++++++++++++++-------------------------
> 1 file changed, 158 insertions(+), 145 deletions(-)
>
> diff --git a/src/PVE/HA/Fence.pm b/src/PVE/HA/Fence.pm
> index 9a6c1fb..3376cb1 100644
> --- a/src/PVE/HA/Fence.pm
> +++ b/src/PVE/HA/Fence.pm
> @@ -6,83 +6,13 @@ use POSIX qw( WNOHANG );
> use PVE::HA::FenceConfig;
> use Data::Dumper;
>
> -
> - # pid's and additional info of fence processes
> +# pid's and additional info of fence processes
> my $fence_jobs = {};
>
> # fence state of a node
> my $fenced_nodes = {};
>
> -sub has_fencing_job { # update for parallel fencing
> - my ($node) = @_;
> -
> - foreach my $job (values %$fence_jobs) {
> - return 1 if ($job->{node} eq $node);
> - }
> - return undef;
> -}
> -
> -my $virtual_pid = 0; # hack for test framework
> -
> -sub start_fencing {
> - my ($haenv, $node, $try) = @_;
> -
> - $try = 0 if !defined($try) || $try<0;
> -
> - my $fence_cfg = $haenv->read_fence_config();
> - my $commands = PVE::HA::FenceConfig::get_commands($node, $try, $fence_cfg);
> -
> - if (!$commands) {
> - $haenv->log('err', "no commands for node '$node'");
> - $fenced_nodes->{$node}->{failure} = 1;
> - return 0;
> - }
> -
> - $haenv->log('notice', "Start fencing of node '$node'");
> -
> - my $can_fork = ($haenv->get_max_workers() > 0) ? 1 : 0;
> -
> - $fenced_nodes->{$node}->{needed} = scalar @$commands;
> - $fenced_nodes->{$node}->{triggered} = 0;
> -
> - for my $cmd (@$commands)
> - {
> - my $cmd_str = "$cmd->{agent} " .
> - PVE::HA::FenceConfig::gen_arg_str(@{$cmd->{param}});
> - $haenv->log('notice', "[fence '$node'] execute fence command: $cmd_str");
> -
> - if ($can_fork) {
> - my $pid = fork();
> - if (!defined($pid)) {
> - $haenv->log('err', "forking fence job failed");
> - return 0;
> - } elsif ($pid==0) { # child
> - $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
> - exit(-1);
> - } else {
> - $fence_jobs->{$pid} = {cmd=>$cmd_str, node=>$node, try=>$try};
> - }
> - } else {
> - my $res = -1;
> - eval {
> - $res = $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
> - $res = $res << 8 if $res > 0;
> - };
> - if (my $err = $@) {
> - $haenv->log('err', $err);
> - }
> -
> - $virtual_pid++;
> - $fence_jobs->{$virtual_pid} = {cmd => $cmd_str, node => $node,
> - try => $try, ec => $res};
> - }
> - }
> -
> - return 1;
> -}
> -
> -
> -# check childs and process exit status
> +# picks up/checks children and processes exit status
> my $check_jobs = sub {
> my ($haenv) = @_;
>
> @@ -91,65 +21,53 @@ my $check_jobs = sub {
>
> my @finished = ();
>
> - # pick up all finsihed childs if we can fork
> if ($haenv->get_max_workers() > 0) {
> - while((my $res = waitpid(-1, WNOHANG))>0) {
> - $fence_jobs->{$res}->{ec} = $? if $fence_jobs->{$res};
> - push @finished, $res;
> + # pick up all finished children if we can fork
> + foreach my $pid (keys %$fence_jobs) {
> +
> + my $waitpid = waitpid($pid, WNOHANG);
> + if (defined($waitpid) && ($waitpid == $pid)) {
> + $fence_jobs->{$waitpid}->{ec} = $? if $fence_jobs->{$waitpid};
> + push @finished, $waitpid;
> + }
> +
> }
> } else {
> - @finished = keys %{$fence_jobs};
> + # all jobs are already finished when not forking
> + @finished = keys %$fence_jobs;
> }
>
> - # while((my $res = waitpid(-1, WNOHANG))>0) {
> foreach my $res (@finished) {
> - if (my $job = $fence_jobs->{$res}) {
> - my $ec = $job->{ec};
> -
> - my $status = {
> - exit_code => $ec,
> - cmd => $job->{cmd},
> - try => $job->{try}
> - };
> -
> - if ($ec == 0) {
> - $succeeded->{$job->{node}} = $status;
> + my $job = $fence_jobs->{$res};
> + my $node = $job->{node};
> +
> + my $status = {
> + exit_code => $job->{ec},
> + cmd => $job->{cmd},
> + };
> +
> + if ($job->{ec} == 0) {
> + # succeeded jobs doesn't need the status for now
> + $succeeded->{$node} = $succeeded->{$node} || 0;
> + $succeeded->{$node} ++;
> + } else {
> + # failed jobs per node have the same try value, store only one
> + if (defined($failed->{$node})) {
> + push @{$failed->{$node}->{jobs}}, $status;
> } else {
> - $failed->{$job->{node}} = $status;
> + $failed->{$node}->{try} = $job->{try};
> + $failed->{$node}->{jobs} = [ $status ];
> }
> -
> - delete $fence_jobs->{$res};
> -
> - } else {
> - warn "exit from unknown child (PID=$res)";
> }
>
> + delete $fence_jobs->{$res};
> }
>
> return ($succeeded, $failed);
> };
>
> -
> -my $reset_hard = sub {
> - my ($haenv, $node) = @_;
> -
> - while (my ($pid, $job) = each %$fence_jobs) {
> - next if $job->{node} ne $node;
> -
> - if ($haenv->max_workers() > 0) {
> - kill KILL => $pid;
> - # fixme maybe use an timeout even if kill should not hang?
> - waitpid($pid, 0); # pick it up directly
> - }
> - delete $fence_jobs->{$pid};
> - }
> -
> - delete $fenced_nodes->{$node} if $fenced_nodes->{$node};
> -};
> -
> -
> # pick up jobs and process them
> -sub process_fencing {
> +my $process_fencing = sub {
> my ($haenv) = @_;
>
> my $fence_cfg = $haenv->read_fence_config();
> @@ -158,62 +76,157 @@ sub process_fencing {
>
> foreach my $node (keys %$succeeded) {
> # count how many fence devices succeeded
> - # this is needed for parallel devices
> - $fenced_nodes->{$node}->{triggered}++;
> + $fenced_nodes->{$node}->{triggered} += $succeeded->{$node};
> }
>
> # try next device for failed jobs
> - while(my ($node, $job) = each %$failed) {
> - $haenv->log('err', "fence job failed: '$job->{cmd}' returned '$job->{exit_code}'");
> + foreach my $node (keys %$failed) {
> + my @failed_jobs = @{$failed->{$node}->{jobs}};
> + my $try = $failed->{$node}->{try};
>
> - while($job->{try} < PVE::HA::FenceConfig::count_devices($node, $fence_cfg) )
> - {
> - &$reset_hard($haenv, $node);
> - $job->{try}++;
> + foreach my $job (@failed_jobs) {
> + $haenv->log('err', "fence job failed: '$job->{cmd}' returned " .
> + "'$job->{exit_code}'");
> + }
>
> - return if start_fencing($node, $job->{try});
> + # check if any devices are left to try
> + while ($try < PVE::HA::FenceConfig::count_devices($node, $fence_cfg)) {
> + # clean up the other parallel jobs, if any, as at least one failed
> + kill_and_cleanup_jobs($haenv, $node);
>
> - $haenv->log('warn', "Couldn't start fence try '$job->{try}'");
> + $try++; # try next available device
> + return if start_fencing($node, $try);
> +
> + $haenv->log('warn', "couldn't start fence try '$try'");
> }
>
> - $haenv->log('err', "Tried all fence devices\n");
> - # fixme: returnproper exit code so CRM waits for the agent lock
> + $fenced_nodes->{$node}->{failure} = 1;
> + $haenv->log('err', "tried all fence devices for node '$node'");
> + }
> +};
> +
> +sub has_fencing_job {
> + my ($node) = @_;
> +
> + foreach my $job (values %$fence_jobs) {
> + return 1 if ($job->{node} eq $node);
> }
> + return undef;
> }
>
> +my $virtual_pid = 0; # hack for test framework
>
> -sub is_node_fenced {
> - my ($node) = @_;
> +sub run_fence_jobs {
> + my ($haenv, $node, $try) = @_;
>
> - my $state = $fenced_nodes->{$node};
> - return 0 if !$state;
> + if (defined($node) && !has_fencing_job($node)) {
Ugh, the defined part is wrong and not needed, I swear I corrected that
on the rebase.
I can resend it or a cleanup afterwards, sorry for any inconvenience...
> + # start new fencing job(s)
> + $try = 0 if !defined($try) || ($try < 0);
>
> - return -1 if $state->{failure} && $state->{failure} == 1;
> + my $fence_cfg = $haenv->read_fence_config();
> + my $commands = PVE::HA::FenceConfig::get_commands($node, $try, $fence_cfg);
>
> - return ($state->{needed} && $state->{triggered} &&
> - $state->{triggered} >= $state->{needed}) ? 1 : 0;
> -}
> + if (!$commands) {
> + $haenv->log('err', "no fence commands for node '$node'");
> + $fenced_nodes->{$node}->{failure} = 1;
> + return 0;
> + }
>
> + $haenv->log('notice', "Start fencing node '$node'");
> +
> + my $can_fork = ($haenv->get_max_workers() > 0) ? 1 : 0;
> +
> + # when parallel devices are configured all must succeed
> + $fenced_nodes->{$node}->{needed} = scalar(@$commands);
> + $fenced_nodes->{$node}->{triggered} = 0;
> +
> + for my $cmd (@$commands) {
> + my $cmd_str = "$cmd->{agent} " .
> + PVE::HA::FenceConfig::gen_arg_str(@{$cmd->{param}});
> + $haenv->log('notice', "[fence '$node'] execute cmd: $cmd_str");
> +
> + if ($can_fork) {
> + my $pid = fork();
> + if (!defined($pid)) {
> + $haenv->log('err', "forking fence job failed");
> + return 0;
> + } elsif ($pid == 0) {
> + $haenv->after_fork(); # cleanup child
> +
> + $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
> + exit(-1);
> + } else {
> +
> + $fence_jobs->{$pid} = {
> + cmd => $cmd_str,
> + node => $node,
> + try => $try
> + };
> +
> + }
> + } else { # for test framework
> + my $res = -1;
> + eval {
> + $res = $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
> + $res = $res << 8 if $res > 0;
> + };
> + if (my $err = $@) {
> + $haenv->log('err', $err);
> + }
> +
> + $virtual_pid++;
> + $fence_jobs->{$virtual_pid} = {
> + cmd => $cmd_str,
> + node => $node,
> + try => $try,
> + ec => $res,
> + };
> + }
> + }
>
> -sub reset {
> - my ($node, $noerr) = @_;
> + return 1;
>
> - delete $fenced_nodes->{$node} if $fenced_nodes->{$node};
> + } else {
> + # node has already fence jobs deployed, collect finished jobs
> + # and check their result
> + &$process_fencing($haenv);
> +
> + }
> }
>
> +# if $node is undef we kill and cleanup *all* jobs from all nodes
> +sub kill_and_cleanup_jobs {
> + my ($haenv, $node) = @_;
>
> -sub bail_out {
> - my ($haenv) = @_;
> + while (my ($pid, $job) = each %$fence_jobs) {
> + next if defined($node) && $job->{node} ne $node;
>
> - if ($haenv->max_workers() > 0) {
> - foreach my $pid (keys %$fence_jobs) {
> + if ($haenv->max_workers() > 0) {
> kill KILL => $pid;
> - waitpid($pid, 0); # has to come back directly
> + # fixme maybe use an timeout even if kill should not hang?
> + waitpid($pid, 0);
> }
> + delete $fence_jobs->{$pid};
> }
>
> - $fenced_nodes = {};
> - $fence_jobs = {};
> + if (defined($node) && $fenced_nodes->{$node}) {
> + delete $fenced_nodes->{$node};
> + } else {
> + $fenced_nodes = {};
> + $fence_jobs = {};
> + }
> +};
> +
> +sub is_node_fenced {
> + my ($node) = @_;
> +
> + my $state = $fenced_nodes->{$node};
> + return 0 if !$state;
> +
> + return -1 if $state->{failure} && $state->{failure} == 1;
> +
> + return ($state->{needed} && $state->{triggered} &&
> + $state->{triggered} >= $state->{needed}) ? 1 : 0;
> }
>
> 1;
More information about the pve-devel
mailing list