[pbs-devel] [PATCH v2 proxmox-backup 11/20] file-restore-daemon: add watchdog module

Stefan Reiter s.reiter at proxmox.com
Wed Mar 24 16:18:18 CET 2021


Add a watchdog that will automatically shut down the VM after 10
minutes, if no API call is received.

Signed-off-by: Stefan Reiter <s.reiter at proxmox.com>
---

v2:
* use tokio instead of alarm()

 src/api2/types/file_restore.rs             |  3 ++
 src/bin/proxmox-restore-daemon.rs          |  2 ++
 src/bin/proxmox_restore_daemon/api.rs      | 26 ++++++++++----
 src/bin/proxmox_restore_daemon/mod.rs      |  3 ++
 src/bin/proxmox_restore_daemon/watchdog.rs | 41 ++++++++++++++++++++++
 5 files changed, 68 insertions(+), 7 deletions(-)
 create mode 100644 src/bin/proxmox_restore_daemon/watchdog.rs

diff --git a/src/api2/types/file_restore.rs b/src/api2/types/file_restore.rs
index cd8df16a..29085c31 100644
--- a/src/api2/types/file_restore.rs
+++ b/src/api2/types/file_restore.rs
@@ -8,5 +8,8 @@ use proxmox::api::api;
 pub struct RestoreDaemonStatus {
     /// VM uptime in seconds
     pub uptime: i64,
+    /// time left until auto-shutdown, keep in mind that this is useless when 'keep-timeout' is
+    /// not set, as then the status call will have reset the timer before returning the value
+    pub timeout: i64,
 }
 
diff --git a/src/bin/proxmox-restore-daemon.rs b/src/bin/proxmox-restore-daemon.rs
index e803238a..6b453ad3 100644
--- a/src/bin/proxmox-restore-daemon.rs
+++ b/src/bin/proxmox-restore-daemon.rs
@@ -45,6 +45,8 @@ fn main() -> Result<(), Error> {
 }
 
 async fn run() -> Result<(), Error> {
+    watchdog_init();
+
     let auth_config = Arc::new(
         auth::ticket_auth().map_err(|err| format_err!("reading ticket file failed: {}", err))?,
     );
diff --git a/src/bin/proxmox_restore_daemon/api.rs b/src/bin/proxmox_restore_daemon/api.rs
index 2dec11fe..4c78a0e8 100644
--- a/src/bin/proxmox_restore_daemon/api.rs
+++ b/src/bin/proxmox_restore_daemon/api.rs
@@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method;
 
 use proxmox_backup::api2::types::*;
 
+use super::{watchdog_remaining, watchdog_ping};
+
 // NOTE: All API endpoints must have Permission::Superuser, as the configs for authentication do
 // not exist within the restore VM. Safety is guaranteed by checking a ticket via a custom ApiAuth.
 
@@ -27,22 +29,32 @@ fn read_uptime() -> Result<f32, Error> {
 }
 
 #[api(
+    input: {
+        properties: {
+            "keep-timeout": {
+                type: bool,
+                description: "If true, do not reset the watchdog timer on this API call.",
+                default: false,
+                optional: true,
+            },
+        },
+    },
     access: {
-        description: "Permissions are handled outside restore VM.",
-        permission: &Permission::Superuser,
+        description: "Permissions are handled outside restore VM. This call can be made without a ticket, but keep-timeout is always assumed 'true' then.",
+        permission: &Permission::World,
     },
     returns: {
         type: RestoreDaemonStatus,
     }
 )]
 /// General status information
-fn status(
-    _param: Value,
-    _info: &ApiMethod,
-    _rpcenv: &mut dyn RpcEnvironment,
-) -> Result<RestoreDaemonStatus, Error> {
+fn status(rpcenv: &mut dyn RpcEnvironment, keep_timeout: bool) -> Result<RestoreDaemonStatus, Error> {
+    if !keep_timeout && rpcenv.get_auth_id().is_some() {
+        watchdog_ping();
+    }
     Ok(RestoreDaemonStatus {
         uptime: read_uptime()? as i64,
+        timeout: watchdog_remaining(),
     })
 }
 
diff --git a/src/bin/proxmox_restore_daemon/mod.rs b/src/bin/proxmox_restore_daemon/mod.rs
index 8396ebc5..3b52cf06 100644
--- a/src/bin/proxmox_restore_daemon/mod.rs
+++ b/src/bin/proxmox_restore_daemon/mod.rs
@@ -3,3 +3,6 @@ mod api;
 pub use api::*;
 
 pub mod auth;
+
+mod watchdog;
+pub use watchdog::*;
diff --git a/src/bin/proxmox_restore_daemon/watchdog.rs b/src/bin/proxmox_restore_daemon/watchdog.rs
new file mode 100644
index 00000000..776d66e3
--- /dev/null
+++ b/src/bin/proxmox_restore_daemon/watchdog.rs
@@ -0,0 +1,41 @@
+//! Tokio-based watchdog that shuts down the VM if not pinged for TIMEOUT
+use std::sync::atomic::{AtomicI64, Ordering};
+use proxmox::tools::time::epoch_i64;
+
+const TIMEOUT: i64 = 600; // seconds
+static TRIGGERED: AtomicI64 = AtomicI64::new(0);
+
+fn handle_expired() -> ! {
+    use nix::sys::reboot;
+    println!("watchdog expired, shutting down");
+    let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err();
+    println!("'reboot' syscall failed: {}", err);
+    std::process::exit(1);
+}
+
+async fn watchdog_loop() {
+    use tokio::time::{sleep, Duration};
+    loop {
+        let remaining = watchdog_remaining();
+        if remaining <= 0 {
+            handle_expired();
+        }
+        sleep(Duration::from_secs(remaining as u64)).await;
+    }
+}
+
+/// Initialize watchdog
+pub fn watchdog_init() {
+    watchdog_ping();
+    tokio::spawn(watchdog_loop());
+}
+
+/// Trigger watchdog keepalive
+pub fn watchdog_ping() {
+    TRIGGERED.store(epoch_i64(), Ordering::SeqCst);
+}
+
+/// Returns the remaining time before watchdog expiry in seconds
+pub fn watchdog_remaining() -> i64 {
+    TIMEOUT - (epoch_i64() - TRIGGERED.load(Ordering::SeqCst))
+}
-- 
2.20.1






More information about the pbs-devel mailing list