[pve-devel] [PATCH ha-manager 2/4] clean up a service's old locks after recovery (fixes #1100)
Thomas Lamprecht
t.lamprecht at proxmox.com
Fri Sep 9 16:15:35 CEST 2016
This cleans up all locks after a service could be recovered from a
failed node, else it may not start if its old node failed while the
service was locked, e.g. by a backup or snapshot. As the action which
the lock protects was killed together with the node, the purpose of
the lock is gone.
Signed-off-by: Thomas Lamprecht <t.lamprecht at proxmox.com>
---
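To make the failure mode concrete: during a backup the guest config
carries a lock marker which normally vanishes once the backup finishes.
If the node dies mid-backup, the marker survives the recovery and then
blocks the start on the new node. A rough illustration (the config key
is real, the error wording is only an approximation):

    # /etc/pve/qemu-server/103.conf, frozen mid-backup on the fenced node:
    lock: backup

    # without this patch, starting the recovered service fails roughly as:
    #   VM is locked (backup)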
src/PVE/HA/Env/PVE2.pm | 2 ++
src/PVE/HA/Resources.pm | 9 +++++++
src/PVE/HA/Resources/PVECT.pm | 13 ++++++++++
src/PVE/HA/Resources/PVEVM.pm | 13 ++++++++++
src/PVE/HA/Sim/Env.pm | 9 ++++++-
src/PVE/HA/Sim/Resources.pm | 16 ++++++++++++
src/test/test-locked-service1/log.expect | 44 ++++++++++++++++++++++++++++++++
7 files changed, 105 insertions(+), 1 deletion(-)
create mode 100644 src/test/test-locked-service1/log.expect
diff --git a/src/PVE/HA/Env/PVE2.pm b/src/PVE/HA/Env/PVE2.pm
index ef6485d..6ed7c29 100644
--- a/src/PVE/HA/Env/PVE2.pm
+++ b/src/PVE/HA/Env/PVE2.pm
@@ -184,6 +184,8 @@ sub steal_service {
my $new = $plugin->config_file($name, $new_node);
rename($old, $new) ||
die "rename '$old' to '$new' failed - $!\n";
+
+        $plugin->after_recovery_cleanup($self, $name);
} else {
die "implement me";
}
diff --git a/src/PVE/HA/Resources.pm b/src/PVE/HA/Resources.pm
index 3836fc8..365d31e 100644
--- a/src/PVE/HA/Resources.pm
+++ b/src/PVE/HA/Resources.pm
@@ -124,6 +124,15 @@ sub check_running {
die "implement in subclass";
}
+# for cleaning up wrong states left over from unfinished actions on the
+# failed node, e.g. the lock from a service backup
+sub after_recovery_cleanup {
+    my ($self, $haenv, $id) = @_;
+
+    die "implement in subclass";
+}
+
+
# package PVE::HA::Resources::IPAddr;
diff --git a/src/PVE/HA/Resources/PVECT.pm b/src/PVE/HA/Resources/PVECT.pm
index 0b44c70..fe9e522 100644
--- a/src/PVE/HA/Resources/PVECT.pm
+++ b/src/PVE/HA/Resources/PVECT.pm
@@ -113,4 +113,17 @@ sub check_running {
return PVE::LXC::check_running($vmid);
}
+sub after_recovery_cleanup {
+    my ($self, $haenv, $id) = @_;
+
+    # remove all config locks, as any leftovers were rendered unnecessary
+    # when the old node of the service was fenced
+    if (my $removed_lock = PVE::LXC::Config->remove_lock($id)) {
+        $haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
+            "service 'ct:$id' to allow its start.");
+    }
+
+    # TODO: anything else?
+}
+
1;
diff --git a/src/PVE/HA/Resources/PVEVM.pm b/src/PVE/HA/Resources/PVEVM.pm
index 4c06df9..d996940 100644
--- a/src/PVE/HA/Resources/PVEVM.pm
+++ b/src/PVE/HA/Resources/PVEVM.pm
@@ -116,4 +116,17 @@ sub check_running {
return PVE::QemuServer::check_running($vmid, 1, $nodename);
}
+sub after_recovery_cleanup {
+    my ($self, $haenv, $id) = @_;
+
+    # remove all config locks, as any leftovers were rendered unnecessary
+    # when the old node of the service was fenced
+    if (my $removed_lock = PVE::QemuConfig->remove_lock($id)) {
+        $haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
+            "service 'vm:$id' to allow its start.");
+    }
+
+    # TODO: anything else?
+}
+
1;
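Both plugin implementations rely on remove_lock() returning the name of
the removed lock (or undef when the config held none); that return
contract is assumed here, presumably provided by an earlier patch of
this series. A small call-site sketch under that assumption:

    # assumption: remove_lock() hands back the removed lock's name
    my $removed_lock = PVE::QemuConfig->remove_lock($id);  # e.g. 'backup'
    warn "cleared leftover lock '$removed_lock'\n" if defined($removed_lock);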
diff --git a/src/PVE/HA/Sim/Env.pm b/src/PVE/HA/Sim/Env.pm
index cd1574c..c91f67f 100644
--- a/src/PVE/HA/Sim/Env.pm
+++ b/src/PVE/HA/Sim/Env.pm
@@ -199,7 +199,14 @@ sub read_group_config {
sub steal_service {
my ($self, $sid, $current_node, $new_node) = @_;
-    return $self->{hardware}->change_service_location($sid, $current_node, $new_node);
+    $self->{hardware}->change_service_location($sid, $current_node, $new_node);
+
+    my (undef, $type, $name) = PVE::HA::Tools::parse_sid($sid);
+    if (my $plugin = PVE::HA::Sim::Resources->lookup($type)) {
+        $plugin->after_recovery_cleanup($self, $name);
+    } else {
+        die "implement me";
+    }
}
sub queue_crm_commands {
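For reference, a usage sketch of the sid parsing used above; the list
return of parse_sid() is (sid, type, name), as the call pattern implies:

    # 'vm:103' -> type 'vm', name '103'; the full sid is not needed here
    my (undef, $type, $name) = PVE::HA::Tools::parse_sid('vm:103');
    # now $type eq 'vm' and $name eq '103'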
diff --git a/src/PVE/HA/Sim/Resources.pm b/src/PVE/HA/Sim/Resources.pm
index ec3d775..e664821 100644
--- a/src/PVE/HA/Sim/Resources.pm
+++ b/src/PVE/HA/Sim/Resources.pm
@@ -113,5 +113,21 @@ sub migrate {
return defined($ss->{$sid}) ? 0 : 1;
}
+sub after_recovery_cleanup {
+    my ($self, $haenv, $id) = @_;
+
+    my $sid = $self->type() . ":$id";
+    my $hardware = $haenv->hardware();
+
+    # remove all config locks, as any leftovers were rendered unnecessary
+    # when the old node of the service was fenced
+    if (my $removed_lock = $hardware->unlock_service($sid)) {
+        $haenv->log('warning', "removed leftover lock '$removed_lock' from recovered " .
+            "service '$sid' to allow its start.");
+    }
+
+    # TODO: anything else?
+}
+
1;
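The simulator's unlock_service() is not part of this diff (it
presumably comes with the test infrastructure changes of this series);
the code above only relies on it returning the name of the removed
lock, or undef when the service held none. A minimal sketch of those
assumed semantics (internals are hypothetical, only the return contract
matters):

    # hypothetical sketch - the real method belongs to PVE::HA::Sim::Hardware,
    # the storage layout here is made up
    sub unlock_service {
        my ($self, $sid) = @_;

        my $conf = $self->{service_status}->{$sid};  # made-up field name
        my $lock = delete $conf->{lock};             # clear the lock, if any

        return $lock;  # name of the removed lock, or undef if none was set
    }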
diff --git a/src/test/test-locked-service1/log.expect b/src/test/test-locked-service1/log.expect
new file mode 100644
index 0000000..42eed65
--- /dev/null
+++ b/src/test/test-locked-service1/log.expect
@@ -0,0 +1,44 @@
+info 0 hardware: starting simulation
+info 20 cmdlist: execute power node1 on
+info 20 node1/crm: status change startup => wait_for_quorum
+info 20 node1/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node2 on
+info 20 node2/crm: status change startup => wait_for_quorum
+info 20 node2/lrm: status change startup => wait_for_agent_lock
+info 20 cmdlist: execute power node3 on
+info 20 node3/crm: status change startup => wait_for_quorum
+info 20 node3/lrm: status change startup => wait_for_agent_lock
+info 20 node1/crm: got lock 'ha_manager_lock'
+info 20 node1/crm: status change wait_for_quorum => master
+info 20 node1/crm: node 'node1': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node2': state changed from 'unknown' => 'online'
+info 20 node1/crm: node 'node3': state changed from 'unknown' => 'online'
+info 20 node1/crm: adding new service 'vm:103' on node 'node3'
+info 22 node2/crm: status change wait_for_quorum => slave
+info 24 node3/crm: status change wait_for_quorum => slave
+info 25 node3/lrm: got lock 'ha_agent_node3_lock'
+info 25 node3/lrm: status change wait_for_agent_lock => active
+info 25 node3/lrm: starting service vm:103
+info 25 node3/lrm: service status vm:103 started
+info 120 cmdlist: execute service vm:103 lock
+info 220 cmdlist: execute network node3 off
+info 220 node1/crm: node 'node3': state changed from 'online' => 'unknown'
+info 224 node3/crm: status change slave => wait_for_quorum
+info 225 node3/lrm: status change active => lost_agent_lock
+info 260 node1/crm: service 'vm:103': state changed from 'started' to 'fence'
+info 260 node1/crm: node 'node3': state changed from 'unknown' => 'fence'
+info 266 watchdog: execute power node3 off
+info 265 node3/crm: killed by poweroff
+info 266 node3/lrm: killed by poweroff
+info 266 hardware: server 'node3' stopped by poweroff (watchdog)
+info 340 node1/crm: got lock 'ha_agent_node3_lock'
+info 340 node1/crm: fencing: acknowleged - got agent lock for node 'node3'
+info 340 node1/crm: node 'node3': state changed from 'fence' => 'unknown'
+info 340 node1/crm: recover service 'vm:103' from fenced node 'node3' to node 'node1'
+warn 340 node1/crm: removed leftover lock 'virtual' from recovered service 'vm:103' to allow its start.
+info 340 node1/crm: service 'vm:103': state changed from 'fence' to 'started' (node = node1)
+info 341 node1/lrm: got lock 'ha_agent_node1_lock'
+info 341 node1/lrm: status change wait_for_agent_lock => active
+info 341 node1/lrm: starting service vm:103
+info 341 node1/lrm: service status vm:103 started
+info 820 hardware: exit simulation - done
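The cmdlist driving this test is not included in the diff; inferred
from the log above (commands executed at times 20, 120 and 220) it
would look roughly like:

    [
        [ "power node1 on", "power node2 on", "power node3 on" ],
        [ "service vm:103 lock" ],
        [ "network node3 off" ]
    ]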
--
2.1.4