[pve-devel] [PATCH qemu-server 07/11] agent: implement fsfreeze helper to better handle lost commands
Mira Limbeck
m.limbeck at proxmox.com
Mon May 5 15:57:49 CEST 2025
Thank you for tackling this!
On 5/5/25 14:57, Fiona Ebner wrote:
> As reported in the enterprise support, it can happen that a guest
> agent command is read, but then the guest agent never sends an answer,
> because the service in the guest is stopped/killed. For example, if a
> guest reboot happens before the command can be successfully executed.
> This is usually not problematic, but the fsfreeze-freeze command has a
> timeout of 1 hour, so the guest agent socket would be blocked for that
> amount of time, waiting on a command that is not being executed
> anymore.
>
> Use a lower timeout for the fsfreeze-freeze command, and issue an
> fsfreeze-status command afterwards, which will return immediately if
> the fsfreeze-freeze command already finished, and which will be queued
> if not. This is used as a proxy to determine whether the
> fsfreeze-freeze command is still running and to check whether it was
> successful. Like this, the time the socket is blocked after a "lost
> command" is at most 10 minutes.
>
> Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
> ---
> PVE/QMPClient.pm | 4 ++++
> PVE/QemuConfig.pm | 5 ++--
> PVE/QemuServer.pm | 3 ++-
> PVE/QemuServer/Agent.pm | 51 ++++++++++++++++++++++++++++++++++++++++
> PVE/VZDump/QemuServer.pm | 5 ++--
> 5 files changed, 63 insertions(+), 5 deletions(-)
>
> diff --git a/PVE/QMPClient.pm b/PVE/QMPClient.pm
> index 0f08e678..b3340885 100644
> --- a/PVE/QMPClient.pm
> +++ b/PVE/QMPClient.pm
> @@ -110,6 +110,8 @@ sub cmd {
> } elsif ($cmd->{execute} =~ m/^(eject|change)/) {
> $timeout = 60; # note: cdrom mount command is slow
> } elsif ($cmd->{execute} eq 'guest-fsfreeze-freeze') {
> + # consider using the guest_fsfreeze() helper in Agent.pm
> + #
> # freeze syncs all guest FS, if we kill it it stays in an unfreezable
> # locked state with high probability, so use an generous timeout
> $timeout = 60*60; # 1 hour
> @@ -146,6 +148,7 @@ sub cmd {
> if (defined($queue_info->{error})) {
> die "VM $vmid qmp command '$cmd->{execute}' failed - $queue_info->{error}" if !$noerr;
> $result = { error => $queue_info->{error} };
> + $result->{'error-is-timeout'} = 1 if $queue_info->{'error-is-timeout'};
> }
>
> return $result;
> @@ -467,6 +470,7 @@ sub mux_timeout {
>
> if (my $queue_info = &$lookup_queue_info($self, $fh)) {
> $queue_info->{error} = "got timeout\n";
> + $queue_info->{'error-is-timeout'} = 1;
> $self->{mux}->inbuffer($fh, ''); # clear to avoid warnings
> }
>
> diff --git a/PVE/QemuConfig.pm b/PVE/QemuConfig.pm
> index 2609542c..e941f093 100644
> --- a/PVE/QemuConfig.pm
> +++ b/PVE/QemuConfig.pm
> @@ -8,6 +8,7 @@ use Scalar::Util qw(blessed);
> use PVE::AbstractConfig;
> use PVE::INotify;
> use PVE::JSONSchema;
> +use PVE::QemuServer::Agent;
> use PVE::QemuServer::CPUConfig;
> use PVE::QemuServer::Drive;
> use PVE::QemuServer::Helpers;
> @@ -291,8 +292,8 @@ sub __snapshot_freeze {
> eval { mon_cmd($vmid, "guest-fsfreeze-thaw"); };
> warn "guest-fsfreeze-thaw problems - $@" if $@;
> } else {
> - eval { mon_cmd($vmid, "guest-fsfreeze-freeze"); };
> - warn "guest-fsfreeze-freeze problems - $@" if $@;
> + eval { PVE::QemuServer::Agent::guest_fsfreeze($vmid); };
> + warn $@ if $@;
> }
> }
>
> diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
> index 577959a4..317c09f2 100644
> --- a/PVE/QemuServer.pm
> +++ b/PVE/QemuServer.pm
> @@ -51,6 +51,7 @@ use PVE::Tools qw(run_command file_read_firstline file_get_contents dir_glob_for
> use PVE::QMPClient;
> use PVE::QemuConfig;
> use PVE::QemuConfig::NoWrite;
> +use PVE::QemuServer::Agent;
> use PVE::QemuServer::Helpers qw(config_aware_timeout min_version kvm_user_version windows_version);
> use PVE::QemuServer::Cloudinit;
> use PVE::QemuServer::CGroup;
> @@ -8242,7 +8243,7 @@ sub qemu_drive_mirror_monitor {
> my $agent_running = $qga && qga_check_running($vmid);
> if ($agent_running) {
> print "freeze filesystem\n";
> - eval { mon_cmd($vmid, "guest-fsfreeze-freeze"); };
> + eval { PVE::QemuServer::Agent::guest_fsfreeze($vmid); };
> warn $@ if $@;
> } else {
> print "suspend vm\n";
> diff --git a/PVE/QemuServer/Agent.pm b/PVE/QemuServer/Agent.pm
> index 41e615aa..ef36a6a8 100644
> --- a/PVE/QemuServer/Agent.pm
> +++ b/PVE/QemuServer/Agent.pm
> @@ -119,4 +119,55 @@ sub qemu_exec_status {
> return $res;
> }
>
> +# It can happen that a guest agent command is read, but then the guest agent never sends an answer,
> +# because the service in the guest is stopped/killed. For example, if a guest reboot happens before
> +# the command can be successfully executed. This is usually not problematic, but the fsfreeze-freeze
> +# command has a timeout of 1 hour, so the guest agent socket would be blocked for that amount of
> +# time, waiting on a command that is not being executed anymore.
> +#
> +# Use a lower timeout for the fsfreeze-freeze command, and issue an fsfreeze-status command
> +# afterwards, which will return immediately if the fsfreeze-freeze command already finished, and
> +# which will be queued if not. This is used as a proxy to determine whether the fsfreeze-freeze
> +# command is still running and to check whether it was successful. Like this, the time the socket is
> +# blocked after a "lost command" is at most 10 minutes.
With the changed logic it's not so clear at first how long it would wait
in the worst case. It's still the same 60 minutes as before, just spread
over the initial fsfreeze command, and then up to 5 iterations of
fsfreese-status, each with a 10 minute timeout.
Maybe that could be documented in the comment and/or the commit message?
> +sub guest_fsfreeze {
> + my ($vmid) = @_;
> +
> + my $timeout = 10 * 60;
> +
> + my $result = eval {
> + PVE::QemuServer::Monitor::mon_cmd($vmid, 'guest-fsfreeze-freeze', timeout => $timeout);
> + };
> + if ($result && ref($result) eq 'HASH' && $result->{error}) {
> + my $error = $result->{error}->{desc} // 'unknown';
> + die "unable to freeze guest fs - $error\n";
> + } elsif (defined($result)) {
> + return; # command successful
> + }
> +
> + my $status;
> + eval {
> + my ($i, $last_iteration) = (0, 5);
> + while ($i < $last_iteration && !defined($status)) {
> + print "still waiting on guest fs freeze\n";
> + $i++;
> +
> + $status = PVE::QemuServer::Monitor::mon_cmd(
> + $vmid, 'guest-fsfreeze-status', timeout => $timeout, noerr => 1);
> +
> + if ($status && ref($status) eq 'HASH' && $status->{'error-is-timeout'}) {
> + $status = undef;
> + } else {
> + check_agent_error($status, 'unknown error');
> + }
> + }
> + if (!defined($status)) {
> + die "timeout after " . ($timeout * ($last_iteration + 1) / 60) . " minutes\n";
> + }
> + };
> + die "querying status after freezing guest fs failed - $@" if $@;
> +
> + die "unable to freeze guest fs - unexpected status '$status'\n" if $status ne 'frozen';
> +}
> +
> 1;
> diff --git a/PVE/VZDump/QemuServer.pm b/PVE/VZDump/QemuServer.pm
> index 10514f75..b686da84 100644
> --- a/PVE/VZDump/QemuServer.pm
> +++ b/PVE/VZDump/QemuServer.pm
> @@ -29,6 +29,7 @@ use PVE::Format qw(render_duration render_bytes);
>
> use PVE::QemuConfig;
> use PVE::QemuServer;
> +use PVE::QemuServer::Agent;
> use PVE::QemuServer::Drive qw(checked_volume_format);
> use PVE::QemuServer::Helpers;
> use PVE::QemuServer::Machine;
> @@ -1069,10 +1070,10 @@ sub qga_fs_freeze {
> }
>
> $self->loginfo("issuing guest-agent 'fs-freeze' command");
> - eval { mon_cmd($vmid, "guest-fsfreeze-freeze") };
> + eval { PVE::QemuServer::Agent::guest_fsfreeze($vmid); };
> $self->logerr($@) if $@;
>
> - return 1; # even on mon command error, ensure we always thaw again
> + return 1; # even on error, ensure we always thaw again
> }
>
> # only call if fs_freeze return 1
More information about the pve-devel
mailing list