[pbs-devel] [PATCH proxmox-backup 16/22] file-restore-daemon: add watchdog module
Wolfgang Bumiller
w.bumiller at proxmox.com
Wed Feb 17 11:52:53 CET 2021
On Tue, Feb 16, 2021 at 06:07:04PM +0100, Stefan Reiter wrote:
> Add a watchdog that will automatically shut down the VM after 10
> minutes, if no API call is received.
>
> This is handled using the unix 'alarm' syscall.
>
> Signed-off-by: Stefan Reiter <s.reiter at proxmox.com>
> ---
> src/api2/types/file_restore.rs | 3 ++
> src/bin/proxmox-restore-daemon.rs | 5 ++
> src/bin/proxmox_restore_daemon/api.rs | 22 ++++++--
> src/bin/proxmox_restore_daemon/mod.rs | 3 ++
> src/bin/proxmox_restore_daemon/watchdog.rs | 63 ++++++++++++++++++++++
> 5 files changed, 91 insertions(+), 5 deletions(-)
> create mode 100644 src/bin/proxmox_restore_daemon/watchdog.rs
>
> diff --git a/src/api2/types/file_restore.rs b/src/api2/types/file_restore.rs
> index cd8df16a..710c6d83 100644
> --- a/src/api2/types/file_restore.rs
> +++ b/src/api2/types/file_restore.rs
> @@ -8,5 +8,8 @@ use proxmox::api::api;
> pub struct RestoreDaemonStatus {
> /// VM uptime in seconds
> pub uptime: i64,
> + /// time left until auto-shutdown, keep in mind that this is inaccurate when 'keep-timeout' is
> + /// not set, as then after the status call the timer will have reset
> + pub timeout: i64,
> }
>
> diff --git a/src/bin/proxmox-restore-daemon.rs b/src/bin/proxmox-restore-daemon.rs
> index 1ec90794..d30da563 100644
> --- a/src/bin/proxmox-restore-daemon.rs
> +++ b/src/bin/proxmox-restore-daemon.rs
> @@ -40,6 +40,9 @@ fn main() -> Result<(), Error> {
> .write_style(env_logger::WriteStyle::Never)
> .init();
>
> + // start watchdog, failure is a critical error as it leads to a scenario where we never exit
> + watchdog_init()?;
> +
> proxmox_backup::tools::runtime::main(run())
> }
>
> @@ -77,6 +80,8 @@ fn accept_vsock_connections(
> Ok(stream) => {
> if sender.send(Ok(stream)).await.is_err() {
> error!("connection accept channel was closed");
> + } else {
> + watchdog_ping();
Should the ping not also happen at every API call, in case connections
get reused?
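I.e. something like this in the handlers (just a sketch, reusing the status
call from the api.rs hunk below; a common dispatch point would be even better):

    /// General status information
    fn status() -> Result<RestoreDaemonStatus, Error> {
        // count every API request as activity, not only freshly accepted connections
        watchdog_ping();
        Ok(RestoreDaemonStatus {
            uptime: read_uptime()? as i64,
            timeout: watchdog_remaining(false),
        })
    }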
> }
> }
> Err(err) => {
> diff --git a/src/bin/proxmox_restore_daemon/api.rs b/src/bin/proxmox_restore_daemon/api.rs
> index 3c642aaf..8eb727df 100644
> --- a/src/bin/proxmox_restore_daemon/api.rs
> +++ b/src/bin/proxmox_restore_daemon/api.rs
> @@ -8,6 +8,8 @@ use proxmox::list_subdirs_api_method;
>
> use proxmox_backup::api2::types::*;
>
> +use super::{watchdog_remaining, watchdog_undo_ping};
> +
> // NOTE: All API endpoints must have Permission::World, as the configs for authentication do not
> // exist within the restore VM. Safety is guaranteed since we use a low port, so only root on the
> // host can contact us - and there the proxmox-backup-client validates permissions already.
> @@ -25,6 +27,16 @@ fn read_uptime() -> Result<f32, Error> {
> }
>
> #[api(
> + input: {
> + properties: {
> + "keep-timeout": {
> + type: bool,
> + description: "If true, do not reset the watchdog timer on this API call.",
> + default: false,
> + optional: true,
> + },
> + },
> + },
> access: {
> description: "Permissions are handled outside restore VM.",
> permission: &Permission::World,
> @@ -34,12 +46,12 @@ fn read_uptime() -> Result<f32, Error> {
> }
> )]
> /// General status information
> -fn status(
> - _param: Value,
> - _info: &ApiMethod,
> - _rpcenv: &mut dyn RpcEnvironment,
> -) -> Result<RestoreDaemonStatus, Error> {
> +fn status(keep_timeout: bool) -> Result<RestoreDaemonStatus, Error> {
> + if keep_timeout {
This seems just weird. Do we really need this?
> + watchdog_undo_ping();
> + }
> Ok(RestoreDaemonStatus {
> uptime: read_uptime()? as i64,
> + timeout: watchdog_remaining(false),
> })
> }
> diff --git a/src/bin/proxmox_restore_daemon/mod.rs b/src/bin/proxmox_restore_daemon/mod.rs
> index d938a5bb..6802d31c 100644
> --- a/src/bin/proxmox_restore_daemon/mod.rs
> +++ b/src/bin/proxmox_restore_daemon/mod.rs
> @@ -1,3 +1,6 @@
> ///! File restore VM related functionality
> mod api;
> pub use api::*;
> +
> +mod watchdog;
> +pub use watchdog::*;
> diff --git a/src/bin/proxmox_restore_daemon/watchdog.rs b/src/bin/proxmox_restore_daemon/watchdog.rs
> new file mode 100644
> index 00000000..f722be0b
> --- /dev/null
> +++ b/src/bin/proxmox_restore_daemon/watchdog.rs
> @@ -0,0 +1,63 @@
> +//! SIGALRM/alarm(1) based watchdog that shuts down the VM if not pinged for TIMEOUT
> +use anyhow::Error;
> +use std::sync::atomic::{AtomicI64, Ordering};
> +
> +use nix::sys::{reboot, signal::*};
> +use nix::unistd::alarm;
> +
> +const TIMEOUT: u32 = 600; // seconds
> +static TRIGGERED: AtomicI64 = AtomicI64::new(0);
> +static LAST_TRIGGERED: AtomicI64 = AtomicI64::new(0);
> +
> +/// Handler is called when alarm-watchdog expires, immediately shuts down VM when triggered
> +extern "C" fn alarm_handler(_signal: nix::libc::c_int) {
> + // use println! instead of log, since log might buffer and not print before shut down
> + println!("Watchdog expired, shutting down VM...");
> + let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err();
> + println!("'reboot' syscall failed: {}", err);
> + std::process::exit(1);
> +}
> +
> +/// Initialize alarm() based watchdog
> +pub fn watchdog_init() -> Result<(), Error> {
> + unsafe {
> + sigaction(
> + Signal::SIGALRM,
Please don't use this with async code. Threads, signal handlers and async
are really annoying to keep track of together; this is a perlism we really
shouldn't continue to use.

The only semi-acceptable excuse we currently have for timing signals is file
locks with timeouts, as those have no alternative (yet) - and even those use
the timer_create(2) API, btw., and will hopefully be replaced by io-uring at
some point...

Please just spawn a future that runs
`tokio::time::sleep(watchdog_remaining()).await` in a loop (and don't forget
to initialize `TRIGGERED` to the current time before spawning it, of
course ;-) ).
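Rough sketch of what I mean (untested; it keeps your TRIGGERED/TIMEOUT statics
and watchdog_ping(), just drops the alarm/signal parts, and assumes
watchdog_init() is then called from inside the tokio runtime, e.g. at the
start of run()):

    use std::sync::atomic::{AtomicI64, Ordering};
    use std::time::Duration;

    use nix::sys::reboot;

    const TIMEOUT: i64 = 600; // seconds
    static TRIGGERED: AtomicI64 = AtomicI64::new(0);

    /// Seconds left until expiry, relative to the last ping.
    pub fn watchdog_remaining() -> i64 {
        TIMEOUT - (proxmox::tools::time::epoch_i64() - TRIGGERED.load(Ordering::SeqCst))
    }

    /// Reset the watchdog timer to the full TIMEOUT.
    pub fn watchdog_ping() {
        TRIGGERED.store(proxmox::tools::time::epoch_i64(), Ordering::SeqCst);
    }

    /// Record the current time and spawn the expiry loop.
    pub fn watchdog_init() {
        watchdog_ping(); // initialize TRIGGERED before the loop starts

        tokio::spawn(async move {
            loop {
                let remaining = watchdog_remaining();
                if remaining <= 0 {
                    // same shutdown path as the signal handler, just without the signal
                    println!("Watchdog expired, shutting down VM...");
                    let err = reboot::reboot(reboot::RebootMode::RB_POWER_OFF).unwrap_err();
                    println!("'reboot' syscall failed: {}", err);
                    std::process::exit(1);
                }
                tokio::time::sleep(Duration::from_secs(remaining as u64)).await;
            }
        });
    }

No signal handler, no alarm(), and watchdog_remaining() keeps a single,
obvious meaning.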
> + &SigAction::new(
> + SigHandler::Handler(alarm_handler),
> + SaFlags::empty(),
> + SigSet::empty(),
> + ),
> + )?;
> + }
> +
> + watchdog_ping();
> +
> + Ok(())
> +}
> +
> +/// Trigger watchdog keepalive
> +pub fn watchdog_ping() {
> + alarm::set(TIMEOUT);
^ then this can just go
> + let cur_time = proxmox::tools::time::epoch_i64();
> + let last = TRIGGERED.swap(cur_time, Ordering::SeqCst);
> + LAST_TRIGGERED.store(last, Ordering::SeqCst);
> +}
> +
> +/// Returns the remaining time before watchdog expiry in seconds if 'current' is true, otherwise it
> +/// returns the remaining time before the last ping (which is probably what you want in the API, as
> +/// from an API call 'current'=true will *always* return TIMEOUT)
> +pub fn watchdog_remaining(current: bool) -> i64 {
> + let cur_time = proxmox::tools::time::epoch_i64();
> + let last_time = (if current { &TRIGGERED } else { &LAST_TRIGGERED }).load(Ordering::SeqCst);
> + TIMEOUT as i64 - (cur_time - last_time)
> +}
> +
> +/// Undo the last watchdog ping and set timer back to previous state, call this in the API to fake
> +/// a non-resetting call
> +pub fn watchdog_undo_ping() {
This still makes me cringe :-P
> + let set = watchdog_remaining(false);
> + TRIGGERED.store(LAST_TRIGGERED.load(Ordering::SeqCst), Ordering::SeqCst);
> + // make sure argument cannot be 0, as that would cancel any alarm
> + alarm::set(1.max(set) as u32);
> +}
> --
> 2.20.1