[pbs-devel] [PATCH v3 proxmox-backup 11/20] file-restore-daemon: add watchdog module
Stefan Reiter
s.reiter at proxmox.com
Wed Mar 31 12:21:53 CEST 2021
Add a watchdog that automatically shuts down the VM if no API call is
received for 10 minutes.
Signed-off-by: Stefan Reiter <s.reiter at proxmox.com>
---
v3:
* use fetch_max and better Ordering
v2:
* use tokio instead of alarm()
src/api2/types/file_restore.rs | 3 ++
src/bin/proxmox-restore-daemon.rs | 2 ++
src/bin/proxmox_restore_daemon/api.rs | 26 ++++++++++----
src/bin/proxmox_restore_daemon/mod.rs | 3 ++
src/bin/proxmox_restore_daemon/watchdog.rs | 41 ++++++++++++++++++++++
5 files changed, 68 insertions(+), 7 deletions(-)
create mode 100644 src/bin/proxmox_restore_daemon/watchdog.rs
diff --git a/src/api2/types/file_restore.rs b/src/api2/types/file_restore.rs
index cd8df16a..29085c31 100644
--- a/src/api2/types/file_restore.rs
+++ b/src/api2/types/file_restore.rs
@@ -8,5 +8,8 @@ use proxmox::api::api;
pub struct RestoreDaemonStatus {
/// VM uptime in seconds
pub uptime: i64,
+ /// time left until auto-shutdown in seconds; only meaningful when 'keep-timeout' is set, as
+ /// otherwise an authenticated status call will have reset the timer before returning the value
+ pub timeout: i64,
}
diff --git a/src/bin/proxmox-restore-daemon.rs b/src/bin/proxmox-restore-daemon.rs
index e803238a..6b453ad3 100644
--- a/src/bin/proxmox-restore-daemon.rs
+++ b/src/bin/proxmox-restore-daemon.rs
@@ -45,6 +45,8 @@ fn main() -> Result<(), Error> {
}
async fn run() -> Result<(), Error> {
+ watchdog_init();
+
let auth_config = Arc::new(
auth::ticket_auth().map_err(|err| format_err!("reading ticket file failed: {}", err))?,
);
diff --git a/src/bin/proxmox_restore_daemon/api.rs b/src/bin/proxmox_restore_daemon/api.rs
index 2dec11fe..4c78a0e8 100644
--- a/src/bin/proxmox_restore_daemon/api.rs
+++ b/src/bin/proxmox_restore_daemon/api.rs
@@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method;
use proxmox_backup::api2::types::*;
+use super::{watchdog_remaining, watchdog_ping};
+
// NOTE: All API endpoints must have Permission::Superuser, as the configs for authentication do
// not exist within the restore VM. Safety is guaranteed by checking a ticket via a custom ApiAuth.
@@ -27,22 +29,32 @@ fn read_uptime() -> Result<f32, Error> {
}
#[api(
+ input: {
+ properties: {
+ "keep-timeout": {
+ type: bool,
+ description: "If true, do not reset the watchdog timer on this API call.",
+ default: false,
+ optional: true,
+ },
+ },
+ },
access: {
- description: "Permissions are handled outside restore VM.",
- permission: &Permission::Superuser,
+ description: "Permissions are handled outside restore VM. This call can be made without a ticket, but keep-timeout is always assumed 'true' then.",
+ permission: &Permission::World,
},
returns: {
type: RestoreDaemonStatus,
}
)]
/// General status information
-fn status(
- _param: Value,
- _info: &ApiMethod,
- _rpcenv: &mut dyn RpcEnvironment,
-) -> Result<RestoreDaemonStatus, Error> {
+fn status(rpcenv: &mut dyn RpcEnvironment, keep_timeout: bool) -> Result<RestoreDaemonStatus, Error> {
+ if !keep_timeout && rpcenv.get_auth_id().is_some() { // per the access description, ticket-less calls must not reset the timer
+ watchdog_ping();
+ }
Ok(RestoreDaemonStatus {
uptime: read_uptime()? as i64,
+ timeout: watchdog_remaining(), // read after the optional ping above
})
}
diff --git a/src/bin/proxmox_restore_daemon/mod.rs b/src/bin/proxmox_restore_daemon/mod.rs
index 8396ebc5..3b52cf06 100644
--- a/src/bin/proxmox_restore_daemon/mod.rs
+++ b/src/bin/proxmox_restore_daemon/mod.rs
@@ -3,3 +3,6 @@ mod api;
pub use api::*;
pub mod auth;
+
+mod watchdog;
+pub use watchdog::*;
diff --git a/src/bin/proxmox_restore_daemon/watchdog.rs b/src/bin/proxmox_restore_daemon/watchdog.rs
new file mode 100644
index 00000000..399f99a7
--- /dev/null
+++ b/src/bin/proxmox_restore_daemon/watchdog.rs
@@ -0,0 +1,41 @@
+//! Tokio-based watchdog that shuts down the VM if not pinged for TIMEOUT
+use std::sync::atomic::{AtomicI64, Ordering};
+use proxmox::tools::time::epoch_i64;
+
+const TIMEOUT: i64 = 600; // seconds without a ping before the watchdog expires
+static TRIGGERED: AtomicI64 = AtomicI64::new(0); // `epoch_i64()` value of the latest ping, 0 until the first one
+
+/// Power off the VM once the watchdog has run out; never returns.
+///
+/// `reboot` is only expected to come back on failure (hence `unwrap_err`), in which case the
+/// error is printed and the process exits with status 1.
+fn handle_expired() -> ! {
+    use nix::sys::reboot::{reboot, RebootMode};
+
+    println!("watchdog expired, shutting down");
+    let failure = reboot(RebootMode::RB_POWER_OFF).unwrap_err();
+    println!("'reboot' syscall failed: {}", failure);
+    std::process::exit(1)
+}
+
+/// Background task: sleep until the watchdog is due, then check it again.
+///
+/// The remaining time is recomputed after every wake-up, so a ping that arrived during the sleep
+/// is taken into account; once the remaining time is zero or negative the VM is shut down.
+async fn watchdog_loop() {
+    use tokio::time;
+
+    loop {
+        match watchdog_remaining() {
+            secs_left if secs_left > 0 => {
+                // strictly positive thanks to the guard, so the cast to u64 is lossless
+                time::sleep(time::Duration::from_secs(secs_left as u64)).await
+            }
+            _ => handle_expired(),
+        }
+    }
+}
+
+/// Start the watchdog: arm the timer and launch the background expiry task.
+pub fn watchdog_init() {
+    // count "now" as the first ping so the full TIMEOUT is available from start-up
+    watchdog_ping();
+
+    // the task is detached: the handle returned by `spawn` is dropped right away
+    let expiry_task = watchdog_loop();
+    tokio::spawn(expiry_task);
+}
+
+/// Keepalive: record the current time as the latest watchdog ping.
+///
+/// `fetch_max` keeps the stored timestamp from ever moving backwards.
+pub fn watchdog_ping() {
+    let now = epoch_i64();
+    TRIGGERED.fetch_max(now, Ordering::AcqRel);
+}
+
+/// Seconds left until the watchdog expires; zero or negative once it is overdue.
+pub fn watchdog_remaining() -> i64 {
+    let now = epoch_i64();
+    let last_ping = TRIGGERED.load(Ordering::Acquire);
+    let elapsed = now - last_ping;
+    TIMEOUT - elapsed
+}
--
2.20.1
More information about the pbs-devel
mailing list