[pbs-devel] [PATCH proxmox-backup 16/22] file-restore-daemon: add watchdog module
Stefan Reiter
s.reiter at proxmox.com
Tue Feb 16 18:07:04 CET 2021
Add a watchdog that will automatically shut down the VM after 10
minutes, if no API call is received.
This is handled using the unix 'alarm' syscall.
Signed-off-by: Stefan Reiter <s.reiter at proxmox.com>
---
src/api2/types/file_restore.rs | 3 ++
src/bin/proxmox-restore-daemon.rs | 5 ++
src/bin/proxmox_restore_daemon/api.rs | 22 ++++++--
src/bin/proxmox_restore_daemon/mod.rs | 3 ++
src/bin/proxmox_restore_daemon/watchdog.rs | 63 ++++++++++++++++++++++
5 files changed, 91 insertions(+), 5 deletions(-)
create mode 100644 src/bin/proxmox_restore_daemon/watchdog.rs
diff --git a/src/api2/types/file_restore.rs b/src/api2/types/file_restore.rs
index cd8df16a..710c6d83 100644
--- a/src/api2/types/file_restore.rs
+++ b/src/api2/types/file_restore.rs
@@ -8,5 +8,8 @@ use proxmox::api::api;
pub struct RestoreDaemonStatus {
/// VM uptime in seconds
pub uptime: i64,
+ /// time left until auto-shutdown in seconds; keep in mind that this is inaccurate when
+ /// 'keep-timeout' is not set, as the status call itself will then have reset the timer
+ pub timeout: i64,
}
diff --git a/src/bin/proxmox-restore-daemon.rs b/src/bin/proxmox-restore-daemon.rs
index 1ec90794..d30da563 100644
--- a/src/bin/proxmox-restore-daemon.rs
+++ b/src/bin/proxmox-restore-daemon.rs
@@ -40,6 +40,9 @@ fn main() -> Result<(), Error> {
.write_style(env_logger::WriteStyle::Never)
.init();
+ // start watchdog, failure is a critical error as it leads to a scenario where we never exit
+ watchdog_init()?;
+
proxmox_backup::tools::runtime::main(run())
}
@@ -77,6 +80,8 @@ fn accept_vsock_connections(
Ok(stream) => {
if sender.send(Ok(stream)).await.is_err() {
error!("connection accept channel was closed");
+ } else {
+ watchdog_ping();
}
}
Err(err) => {
diff --git a/src/bin/proxmox_restore_daemon/api.rs b/src/bin/proxmox_restore_daemon/api.rs
index 3c642aaf..8eb727df 100644
--- a/src/bin/proxmox_restore_daemon/api.rs
+++ b/src/bin/proxmox_restore_daemon/api.rs
@@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method;
use proxmox_backup::api2::types::*;
+use super::{watchdog_remaining, watchdog_undo_ping};
+
// NOTE: All API endpoints must have Permission::World, as the configs for authentication do not
// exist within the restore VM. Safety is guaranteed since we use a low port, so only root on the
// host can contact us - and there the proxmox-backup-client validates permissions already.
@@ -25,6 +27,16 @@ fn read_uptime() -> Result<f32, Error> {
}
#[api(
+ input: {
+ properties: {
+ "keep-timeout": {
+ type: bool,
+ description: "If true, do not reset the watchdog timer on this API call.",
+ default: false,
+ optional: true,
+ },
+ },
+ },
access: {
description: "Permissions are handled outside restore VM.",
permission: &Permission::World,
@@ -34,12 +46,12 @@ fn read_uptime() -> Result<f32, Error> {
}
)]
/// General status information
-fn status(
- _param: Value,
- _info: &ApiMethod,
- _rpcenv: &mut dyn RpcEnvironment,
-) -> Result<RestoreDaemonStatus, Error> {
+fn status(keep_timeout: bool) -> Result<RestoreDaemonStatus, Error> {
+ if keep_timeout {
+ watchdog_undo_ping();
+ }
Ok(RestoreDaemonStatus {
uptime: read_uptime()? as i64,
+ timeout: watchdog_remaining(false),
})
}
diff --git a/src/bin/proxmox_restore_daemon/mod.rs b/src/bin/proxmox_restore_daemon/mod.rs
index d938a5bb..6802d31c 100644
--- a/src/bin/proxmox_restore_daemon/mod.rs
+++ b/src/bin/proxmox_restore_daemon/mod.rs
@@ -1,3 +1,6 @@
///! File restore VM related functionality
mod api;
pub use api::*;
+
+mod watchdog;
+pub use watchdog::*;
diff --git a/src/bin/proxmox_restore_daemon/watchdog.rs b/src/bin/proxmox_restore_daemon/watchdog.rs
new file mode 100644
index 00000000..f722be0b
--- /dev/null
+++ b/src/bin/proxmox_restore_daemon/watchdog.rs
@@ -0,0 +1,63 @@
+//! SIGALRM/alarm(2) based watchdog that shuts down the VM if not pinged for TIMEOUT
+use anyhow::Error;
+use std::sync::atomic::{AtomicI64, Ordering};
+
+use nix::sys::{reboot, signal::*};
+use nix::unistd::alarm;
+
+const TIMEOUT: u32 = 600; // seconds
+static TRIGGERED: AtomicI64 = AtomicI64::new(0);
+static LAST_TRIGGERED: AtomicI64 = AtomicI64::new(0);
+
+/// Handler is called when alarm-watchdog expires, immediately shuts down VM when triggered
+extern "C" fn alarm_handler(_signal: nix::libc::c_int) {
+ // use println! instead of log, since log might buffer and not print before shut down
+ println!("Watchdog expired, shutting down VM...");
+ let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err();
+ println!("'reboot' syscall failed: {}", err);
+ std::process::exit(1);
+}
+
+/// Initialize alarm() based watchdog
+pub fn watchdog_init() -> Result<(), Error> {
+ unsafe {
+ sigaction(
+ Signal::SIGALRM,
+ &SigAction::new(
+ SigHandler::Handler(alarm_handler),
+ SaFlags::empty(),
+ SigSet::empty(),
+ ),
+ )?;
+ }
+
+ watchdog_ping();
+
+ Ok(())
+}
+
+/// Trigger watchdog keepalive
+pub fn watchdog_ping() {
+ alarm::set(TIMEOUT);
+ let cur_time = proxmox::tools::time::epoch_i64();
+ let last = TRIGGERED.swap(cur_time, Ordering::SeqCst);
+ LAST_TRIGGERED.store(last, Ordering::SeqCst);
+}
+
+/// Returns the remaining time before watchdog expiry in seconds if 'current' is true, otherwise it
+/// returns the remaining time as if the most recent ping had not happened (which is probably what
+/// you want in the API, as from an API call 'current'=true will *always* return TIMEOUT)
+pub fn watchdog_remaining(current: bool) -> i64 {
+ let cur_time = proxmox::tools::time::epoch_i64();
+ let last_time = (if current { &TRIGGERED } else { &LAST_TRIGGERED }).load(Ordering::SeqCst);
+ TIMEOUT as i64 - (cur_time - last_time)
+}
+
+/// Undo the last watchdog ping and set timer back to previous state, call this in the API to fake
+/// a non-resetting call
+pub fn watchdog_undo_ping() {
+ let set = watchdog_remaining(false);
+ TRIGGERED.store(LAST_TRIGGERED.load(Ordering::SeqCst), Ordering::SeqCst);
+ // make sure argument cannot be 0, as that would cancel any alarm
+ alarm::set(1.max(set) as u32);
+}
--
2.20.1
More information about the pbs-devel
mailing list