[pve-devel] [PATCH ha-manager 2/4] cleanup service from old locks after recovery (fixes #1100)

Thomas Lamprecht t.lamprecht at proxmox.com
Fri Sep 9 16:15:35 CEST 2016


This cleans up all locks after a service was recovered from a failed
node. Otherwise the service may not start if its old node failed while
the service was locked, e.g. by a backup or snapshot: as the action
which the lock protects was killed together with the node, the purpose
of the lock is gone.

Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
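A simplified sketch of the resulting steal_service path in
PVE/HA/Env/PVE2.pm, for review context only - the plugin lookup and
error handling around the shown hunk are abbreviated, this is an
illustration, not the applied code:

    my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);
    if (my $plugin = PVE::HA::Resources->lookup($type)) {
        # move the service configuration file over to the new node
        my $old = $plugin->config_file($name, $current_node);
        my $new = $plugin->config_file($name, $new_node);
        rename($old, $new) ||
            die "rename '$old' to '$new' failed - $!\n";

        # then drop any leftover config lock (e.g. from a backup); the
        # action the lock protected died together with the fenced node
        $plugin->after_recovery_cleanup($self, $name);
    } else {
        die "implement me";
    }
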
 src/PVE/HA/Env/PVE2.pm                   |  2 ++
 src/PVE/HA/Resources.pm                  |  9 +++++++
 src/PVE/HA/Resources/PVECT.pm            | 13 ++++++++++
 src/PVE/HA/Resources/PVEVM.pm            | 13 ++++++++++
 src/PVE/HA/Sim/Env.pm                    |  9 ++++++-
 src/PVE/HA/Sim/Resources.pm              | 16 ++++++++++++
 src/test/test-locked-service1/log.expect | 44 ++++++++++++++++++++++++++++++++
 7 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 src/test/test-locked-service1/log.expect
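
For reference, the expected log of the new test is driven by a command
list along these lines - reconstructed here from the 'cmdlist: execute
...' lines in log.expect below, the actual
src/test/test-locked-service1/cmdlist may group the steps differently:

    [
        [ "power node1 on", "power node2 on", "power node3 on" ],
        [ "service vm:103 lock" ],
        [ "network node3 off" ]
    ]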

diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index ef6485d..6ed7c29 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -184,6 +184,8 @@ sub steal_service {
 	my $new = $plugin->config_file($name, $new_node);
 	rename($old, $new) ||
 	    die "rename '$old' to '$new' failed - $!\n";
+
+	$plugin->after_recovery_cleanup($self, $name);
     } else {
 	die "implement me";
     }
diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm
index 3836fc8..365d31e 100644
--- a/src/PVE/HA/Resources.pm
+++ b/src/PVE/HA/Resources.pm
@@ -124,6 +124,15 @@ sub check_running {
     die "implement in subclass";
 }
 
+# for cleaning up wrong states left over from unfinished actions on the
+# failed node, e.g. the lock from a service backup
+sub after_recovery_cleanup {
+    my ($self, $haenv, $id) = @_;
+
+    die "implement in subclass";
+}
+
+
 
 # package PVE::HA::Resources::IPAddr;
 
diff --git a/src/PVE/HA/Resources/PVECT.pm b/src/PVE/HA/Resources/PVECT.pm
index 0b44c70..fe9e522 100644
--- a/src/PVE/HA/Resources/PVECT.pm
+++ b/src/PVE/HA/Resources/PVECT.pm
@@ -113,4 +113,17 @@ sub check_running {
     return PVE::LXC::check_running($vmid);
 }
 
+sub after_recovery_cleanup {
+    my ($self, $haenv, $id) = @_;
+
+	# remove all config locks, as any leftovers were rendered unnecessary when
+	# the old node of the service was fenced
+    if (my $removed_lock = PVE::LXC::Config->remove_lock($id)) {
+	$haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
+	            "service 'ct:$id' to allow its start.");
+    }
+
+    # TODO: anything else?
+}
+
 1;
diff --git a/src/PVE/HA/Resources/PVEVM.pm b/src/PVE/HA/Resources/PVEVM.pm
index 4c06df9..d996940 100644
--- a/src/PVE/HA/Resources/PVEVM.pm
+++ b/src/PVE/HA/Resources/PVEVM.pm
@@ -116,4 +116,17 @@ sub check_running {
     return PVE::QemuServer::check_running($vmid, 1, $nodename);
 }
 
+sub after_recovery_cleanup {
+    my ($self, $haenv, $id) = @_;
+
+	# remove all config locks, as any leftovers were rendered unnecessary when
+	# the old node of the service was fenced
+    if (my $removed_lock = PVE::QemuConfig->remove_lock($id)) {
+	$haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
+	            "service 'vm:$id' to allow its start.");
+    }
+
+    # TODO: anything else?
+}
+
 1;
diff --git a/src/PVE/HA/Sim/Env.pm b/src/PVE/HA/Sim/Env.pm
index cd1574c..c91f67f 100644
--- a/src/PVE/HA/Sim/Env.pm
+++ b/src/PVE/HA/Sim/Env.pm
@@ -199,7 +199,14 @@ sub read_group_config {
 sub steal_service {
     my ($self, $sid, $current_node, $new_node) = @_;
 
-    return $self->{hardware}->change_service_location($sid, $current_node, $new_node);
+    $self->{hardware}->change_service_location($sid, $current_node, $new_node);
+
+    my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);
+    if (my $plugin = PVE::HA::Sim::Resources->lookup($type)) {
+	$plugin->after_recovery_cleanup($self, $name);
+    } else {
+	die "implement me";
+    }
 }
 
 sub queue_crm_commands {
diff --git a/src/PVE/HA/Sim/Resources.pm b/src/PVE/HA/Sim/Resources.pm
index ec3d775..e664821 100644
--- a/src/PVE/HA/Sim/Resources.pm
+++ b/src/PVE/HA/Sim/Resources.pm
@@ -113,5 +113,21 @@ sub migrate {
     return defined($ss->{$sid}) ? 0 : 1;
 }
 
+sub after_recovery_cleanup {
+    my ($self, $haenv, $id) = @_;
+
+    my $sid = $self->type() . ":$id";
+    my $hardware = $haenv->hardware();
+
+	# remove all config locks, as any leftovers were rendered unnecessary when
+	# the old node of the service was fenced
+    if (my $removed_lock = $hardware->unlock_service($sid)) {
+	$haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
+	            "service '$sid' to allow its start.");
+    }
+
+    # TODO: anything else?
+}
+
 
 1;
diff --git a/src/test/test-locked-service1/log.expect b/src/test/test-locked-service1/log.expect
new file mode 100644
index 0000000..42eed65
--- /dev/null
+++ b/src/test/test-locked-service1/log.expect
@@ -0,0 +1,44 @@
+info      0     hardware: starting simulation
+info     20      cmdlist: execute power node1 on
+info     20    node1/crm: status change startup => wait_for_quorum
+info     20    node1/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node2 on
+info     20    node2/crm: status change startup => wait_for_quorum
+info     20    node2/lrm: status change startup => wait_for_agent_lock
+info     20      cmdlist: execute power node3 on
+info     20    node3/crm: status change startup => wait_for_quorum
+info     20    node3/lrm: status change startup => wait_for_agent_lock
+info     20    node1/crm: got lock 'ha_manager_lock'
+info     20    node1/crm: status change wait_for_quorum => master
+info     20    node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info     20    node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info     20    node1/crm: adding new service 'vm:103' on node 'node3'
+info     22    node2/crm: status change wait_for_quorum => slave
+info     24    node3/crm: status change wait_for_quorum => slave
+info     25    node3/lrm: got lock 'ha_agent_node3_lock'
+info     25    node3/lrm: status change wait_for_agent_lock => active
+info     25    node3/lrm: starting service vm:103
+info     25    node3/lrm: service status vm:103 started
+info    120      cmdlist: execute service vm:103 lock
+info    220      cmdlist: execute network node3 off
+info    220    node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info    224    node3/crm: status change slave => wait_for_quorum
+info    225    node3/lrm: status change active => lost_agent_lock
+info    260    node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info    260    node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info    266     watchdog: execute power node3 off
+info    265    node3/crm: killed by poweroff
+info    266    node3/lrm: killed by poweroff
+info    266     hardware: server 'node3' stopped by poweroff (watchdog)
+info    340    node1/crm: got lock 'ha_agent_node3_lock'
+info    340    node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info    340    node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info    340    node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
+warn    340    node1/crm: removed leftover lock 'virtual' from recovered service 'vm:103' to allow its start.
+info    340    node1/crm: service 'vm:103': state changed from 'fence' to 'started'  (node = node1)
+info    341    node1/lrm: got lock 'ha_agent_node1_lock'
+info    341    node1/lrm: status change wait_for_agent_lock => active
+info    341    node1/lrm: starting service vm:103
+info    341    node1/lrm: service status vm:103 started
+info    820     hardware: exit simulation - done
-- 
2.1.4