[pve-devel] [PATCH ha-manager 1/5] Fence: cleanup class
Thomas Lamprecht
t.lamprecht at proxmox.com
Mon Apr 11 17:05:59 CEST 2016
A rather big cleanup I already had in a separate branch.
Changes:
* make process fencing private, it will be called by run_fence_jobs
(was start_fencing) if the node has already a fence job deployed.
* fix bugs in process_fencing where if multiple parallel device
succeeded at once only one was counted, the same for failed devices
* restrict waitpid to fence_job worker pids (instead of -1) else
we could get some pretty nasty side effects
* remove 'resed_hard', 'reset' and 'bailout', they are replaced by
kill_and_cleanup_jobs
* some comment and formatting changes
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
src/PVE/HA/Fence.pm | 307 +++++++++++++++++++++++++++-------------------------
1 file changed, 161 insertions(+), 146 deletions(-)
diff --git a/src/PVE/HA/Fence.pm b/src/PVE/HA/Fence.pm
index 9a6c1fb..ea0646d 100644
--- a/src/PVE/HA/Fence.pm
+++ b/src/PVE/HA/Fence.pm
@@ -2,87 +2,18 @@ package PVE::HA::Fence;
use strict;
use warnings;
+
use POSIX qw( WNOHANG );
-use PVE::HA::FenceConfig;
-use Data::Dumper;
+use PVE::HA::FenceConfig;
- # pid's and additional info of fence processes
+# pid's and additional info of fence processes
my $fence_jobs = {};
# fence state of a node
my $fenced_nodes = {};
-sub has_fencing_job { # update for parallel fencing
- my ($node) = @_;
-
- foreach my $job (values %$fence_jobs) {
- return 1 if ($job->{node} eq $node);
- }
- return undef;
-}
-
-my $virtual_pid = 0; # hack for test framework
-
-sub start_fencing {
- my ($haenv, $node, $try) = @_;
-
- $try = 0 if !defined($try) || $try<0;
-
- my $fence_cfg = $haenv->read_fence_config();
- my $commands = PVE::HA::FenceConfig::get_commands($node, $try, $fence_cfg);
-
- if (!$commands) {
- $haenv->log('err', "no commands for node '$node'");
- $fenced_nodes->{$node}->{failure} = 1;
- return 0;
- }
-
- $haenv->log('notice', "Start fencing of node '$node'");
-
- my $can_fork = ($haenv->get_max_workers() > 0) ? 1 : 0;
-
- $fenced_nodes->{$node}->{needed} = scalar @$commands;
- $fenced_nodes->{$node}->{triggered} = 0;
-
- for my $cmd (@$commands)
- {
- my $cmd_str = "$cmd->{agent} " .
- PVE::HA::FenceConfig::gen_arg_str(@{$cmd->{param}});
- $haenv->log('notice', "[fence '$node'] execute fence command: $cmd_str");
-
- if ($can_fork) {
- my $pid = fork();
- if (!defined($pid)) {
- $haenv->log('err', "forking fence job failed");
- return 0;
- } elsif ($pid==0) { # child
- $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
- exit(-1);
- } else {
- $fence_jobs->{$pid} = {cmd=>$cmd_str, node=>$node, try=>$try};
- }
- } else {
- my $res = -1;
- eval {
- $res = $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
- $res = $res << 8 if $res > 0;
- };
- if (my $err = $@) {
- $haenv->log('err', $err);
- }
-
- $virtual_pid++;
- $fence_jobs->{$virtual_pid} = {cmd => $cmd_str, node => $node,
- try => $try, ec => $res};
- }
- }
-
- return 1;
-}
-
-
-# check childs and process exit status
+# picks up/checks children and processes exit status
my $check_jobs = sub {
my ($haenv) = @_;
@@ -91,65 +22,53 @@ my $check_jobs = sub {
my @finished = ();
- # pick up all finsihed childs if we can fork
if ($haenv->get_max_workers() > 0) {
- while((my $res = waitpid(-1, WNOHANG))>0) {
- $fence_jobs->{$res}->{ec} = $? if $fence_jobs->{$res};
- push @finished, $res;
+ # pick up all finished children if we can fork
+ foreach my $pid (keys %$fence_jobs) {
+
+ my $waitpid = waitpid($pid, WNOHANG);
+ if (defined($waitpid) && ($waitpid == $pid)) {
+ $fence_jobs->{$waitpid}->{ec} = $? if $fence_jobs->{$waitpid};
+ push @finished, $waitpid;
+ }
+
}
} else {
- @finished = keys %{$fence_jobs};
+ # all jobs are already finished when not forking
+ @finished = keys %$fence_jobs;
}
- # while((my $res = waitpid(-1, WNOHANG))>0) {
foreach my $res (@finished) {
- if (my $job = $fence_jobs->{$res}) {
- my $ec = $job->{ec};
-
- my $status = {
- exit_code => $ec,
- cmd => $job->{cmd},
- try => $job->{try}
- };
-
- if ($ec == 0) {
- $succeeded->{$job->{node}} = $status;
+ my $job = $fence_jobs->{$res};
+ my $node = $job->{node};
+
+ my $status = {
+ exit_code => $job->{ec},
+ cmd => $job->{cmd},
+ };
+
+ if ($job->{ec} == 0) {
+ # succeeded jobs doesn't need the status for now
+ $succeeded->{$node} = $succeeded->{$node} || 0;
+ $succeeded->{$node} ++;
+ } else {
+ # with parallel device multiple may fail at once, store all
+ if (defined($failed->{$node})) {
+ push @{$failed->{$node}->{jobs}}, $status;
} else {
- $failed->{$job->{node}} = $status;
+ $failed->{$node}->{try} = $job->{try};
+ $failed->{$node}->{jobs} = [ $status ];
}
-
- delete $fence_jobs->{$res};
-
- } else {
- warn "exit from unknown child (PID=$res)";
}
+ delete $fence_jobs->{$res};
}
return ($succeeded, $failed);
};
-
-my $reset_hard = sub {
- my ($haenv, $node) = @_;
-
- while (my ($pid, $job) = each %$fence_jobs) {
- next if $job->{node} ne $node;
-
- if ($haenv->max_workers() > 0) {
- kill KILL => $pid;
- # fixme maybe use an timeout even if kill should not hang?
- waitpid($pid, 0); # pick it up directly
- }
- delete $fence_jobs->{$pid};
- }
-
- delete $fenced_nodes->{$node} if $fenced_nodes->{$node};
-};
-
-
# pick up jobs and process them
-sub process_fencing {
+my $process_fencing = sub {
my ($haenv) = @_;
my $fence_cfg = $haenv->read_fence_config();
@@ -158,62 +77,158 @@ sub process_fencing {
foreach my $node (keys %$succeeded) {
# count how many fence devices succeeded
- # this is needed for parallel devices
- $fenced_nodes->{$node}->{triggered}++;
+ $fenced_nodes->{$node}->{triggered} += $succeeded->{$node};
}
# try next device for failed jobs
- while(my ($node, $job) = each %$failed) {
- $haenv->log('err', "fence job failed: '$job->{cmd}' returned '$job->{exit_code}'");
+ foreach my $node (keys %$failed) {
+ my @failed_jobs = @{$failed->{$node}->{jobs}};
+ my $try = $failed->{$node}->{try};
+
+ foreach my $job (@failed_jobs) {
+ $haenv->log('err', "fence job failed: '$job->{cmd}' returned " .
+ "'$job->{exit_code}'");
+ }
- while($job->{try} < PVE::HA::FenceConfig::count_devices($node, $fence_cfg) )
- {
- &$reset_hard($haenv, $node);
- $job->{try}++;
+ # check if any devices are left to try
+ while ($try < PVE::HA::FenceConfig::count_devices($node, $fence_cfg)) {
+ # clean up the other parallel jobs, if any, as at least one failed
+ kill_and_cleanup_jobs($haenv, $node);
- return if start_fencing($node, $job->{try});
+ $try++; # try next available device
+ return if start_fencing($node, $try);
- $haenv->log('warn', "Couldn't start fence try '$job->{try}'");
+ $haenv->log('warn', "couldn't start fence try '$try'");
}
- $haenv->log('err', "Tried all fence devices\n");
- # fixme: returnproper exit code so CRM waits for the agent lock
+ $fenced_nodes->{$node}->{failure} = 1;
+ $haenv->log('err', "tried all fence devices for node '$node'");
}
+};
+
+sub has_fencing_job {
+ my ($node) = @_;
+
+ foreach my $job (values %$fence_jobs) {
+ return 1 if ($job->{node} eq $node);
+ }
+ return undef;
}
+my $virtual_pid = 0; # hack for test framework
-sub is_node_fenced {
- my ($node) = @_;
+sub run_fence_jobs {
+ my ($haenv, $node, $try) = @_;
- my $state = $fenced_nodes->{$node};
- return 0 if !$state;
+ if (!has_fencing_job($node)) {
+ # start new fencing job(s)
+ $try = 0 if !defined($try) || ($try < 0);
- return -1 if $state->{failure} && $state->{failure} == 1;
+ my $fence_cfg = $haenv->read_fence_config();
+ my $commands = PVE::HA::FenceConfig::get_commands($node, $try, $fence_cfg);
- return ($state->{needed} && $state->{triggered} &&
- $state->{triggered} >= $state->{needed}) ? 1 : 0;
-}
+ if (!$commands) {
+ $haenv->log('err', "no fence commands for node '$node'");
+ $fenced_nodes->{$node}->{failure} = 1;
+ return 0;
+ }
+
+ $haenv->log('notice', "Start fencing node '$node'");
+
+ my $can_fork = ($haenv->get_max_workers() > 0) ? 1 : 0;
+
+ # when parallel devices are configured all must succeed
+ $fenced_nodes->{$node}->{needed} = scalar(@$commands);
+ $fenced_nodes->{$node}->{triggered} = 0;
+
+ for my $cmd (@$commands) {
+ my $cmd_str = "$cmd->{agent} " .
+ PVE::HA::FenceConfig::gen_arg_str(@{$cmd->{param}});
+ $haenv->log('notice', "[fence '$node'] execute cmd: $cmd_str");
+
+ if ($can_fork) {
+
+ my $pid = fork();
+ if (!defined($pid)) {
+ $haenv->log('err', "forking fence job failed");
+ return 0;
+ } elsif ($pid == 0) {
+ $haenv->after_fork(); # cleanup child
+ $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
+ exit(-1);
+ } else {
-sub reset {
- my ($node, $noerr) = @_;
+ $fence_jobs->{$pid} = {
+ cmd => $cmd_str,
+ node => $node,
+ try => $try
+ };
- delete $fenced_nodes->{$node} if $fenced_nodes->{$node};
+ }
+
+ } else {
+ # for test framework
+ my $res = -1;
+ eval {
+ $res = $haenv->exec_fence_agent($cmd->{agent}, $node, @{$cmd->{param}});
+ $res = $res << 8 if $res > 0;
+ };
+ if (my $err = $@) {
+ $haenv->log('err', $err);
+ }
+
+ $virtual_pid++;
+ $fence_jobs->{$virtual_pid} = {
+ cmd => $cmd_str,
+ node => $node,
+ try => $try,
+ ec => $res,
+ };
+
+ }
+ }
+
+ return 1;
+
+ } else {
+ # check already deployed fence jobs
+ &$process_fencing($haenv);
+ }
}
+# if $node is undef we kill and cleanup *all* jobs from all nodes
+sub kill_and_cleanup_jobs {
+ my ($haenv, $node) = @_;
-sub bail_out {
- my ($haenv) = @_;
+ while (my ($pid, $job) = each %$fence_jobs) {
+ next if defined($node) && $job->{node} ne $node;
- if ($haenv->max_workers() > 0) {
- foreach my $pid (keys %$fence_jobs) {
+ if ($haenv->max_workers() > 0) {
kill KILL => $pid;
- waitpid($pid, 0); # has to come back directly
+ waitpid($pid, 0);
}
+ delete $fence_jobs->{$pid};
+ }
+
+ if (defined($node) && $fenced_nodes->{$node}) {
+ delete $fenced_nodes->{$node};
+ } else {
+ $fenced_nodes = {};
+ $fence_jobs = {};
}
+};
+
+sub is_node_fenced {
+ my ($node) = @_;
+
+ my $state = $fenced_nodes->{$node};
+ return 0 if !$state;
- $fenced_nodes = {};
- $fence_jobs = {};
+ return -1 if $state->{failure} && $state->{failure} == 1;
+
+ return ($state->{needed} && $state->{triggered} &&
+ $state->{triggered} >= $state->{needed}) ? 1 : 0;
}
1;
--
2.1.4
More information about the pve-devel
mailing list