[pve-devel] Script for bug #2874

Fiona Ebner f.ebner at proxmox.com
Fri Jan 27 14:57:38 CET 2023


Am 27.01.23 um 14:49 schrieb Fiona Ebner:
> The attached script allows monitoring the first sector of the bootdisk
> for running VMs (all or a selection of IDs) for people affected by bug
> #2874 [0]. The hope is to pinpoint when the sector gets corrupted to be
> able to correlate the timing with operations that might cause it. The
> script also dumps the contents, because it might help to see how the
> sector gets corrupted.
> 
> Note that the script needs to be executed on each node and that you can
> specify IDs for VMs not currently on that node, which is useful to catch
> migrating VMs (or don't specify any IDs to monitor all running VMs).
> 
> The script parses the VM config to determine the boot disk, looks up the
> path and uses qemu-img dd and base64 to save the contents of the first
> 512 bytes in a non-binary format and will dump the contents whenever
> they change.
> 
> Example invocations:
> # monitor all running VMs, check every 5 minutes
> perl monitor-sector-zero.pl --interval 300
> # only monitor 166 and 167, check every minute, log to file
> perl monitor-sector-zero.pl 166 167 &> /path/to/file
> 
> Feedback from users and other developers is highly appreciated!
> 
> [0]: https://bugzilla.proxmox.com/show_bug.cgi?id=2874
> _______________________________________________
> pve-devel mailing list
> pve-devel at lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
> 
> 

Well, apparently the attachment got removed. So here it is:

#!/bin/perl

use strict;
use warnings;

use Getopt::Long qw(GetOptions);
use POSIX qw(strftime);

use PVE::Cluster;
use PVE::QemuConfig;
use PVE::QemuServer::Drive qw(drive_is_cdrom is_valid_drivename parse_drive);
use PVE::QemuServer::Helpers;
use PVE::Storage;

# START OF HELPER FUNCTIONS

# Print one log line to the currently selected output handle in the form
# "<timestamp> - [<vmid> - ][<volid> - ]<message>".
#
# $msg   - message text; a trailing newline is stripped and exactly one is appended again
# $vmid  - optional VM ID, used as a prefix when true
# $volid - optional volume ID, used as a prefix when true
sub pprint {
     my ($msg, $vmid, $volid) = @_;

     chomp($msg);

     # Collect all parts of the line first and glue them together with the separator afterwards.
     my @parts = (strftime("%F %H:%M:%S", localtime));
     push @parts, $vmid if $vmid;
     push @parts, $volid if $volid;
     push @parts, $msg;

     print join(' - ', @parts) . "\n";
}

# Explicit list of VMIDs to monitor; stays undefined unless the main script assigns one.
my $fixed_vmlist;

# Returns an array reference with the VMIDs to check.
#
# If a fixed list was set, that list is returned as is. Otherwise, the list is built from the
# result of PVE::Cluster::get_vmlist(), keeping only entries whose type is 'qemu'.
sub get_vmids {
     return $fixed_vmlist if $fixed_vmlist;

     my $cluster_vms = PVE::Cluster::get_vmlist();

     my @qemu_ids = grep {
	$cluster_vms->{ids}->{$_}->{type} eq 'qemu'
     } keys $cluster_vms->{ids}->%*;

     return \@qemu_ids;
}

# Last known running state per VMID, as determined by update_running().
my $running = {};

# Refreshes the running state of the given VM and logs state transitions.
#
# The state is queried via PVE::QemuServer::Helpers::vm_running_locally(). If that call dies, the
# error is logged and the VM is treated as not running (the eval yields undef).
#
# Returns the new running state.
sub update_running {
     my ($vmid) = @_;

     my $was_running = $running->{$vmid};

     my $is_running = eval { PVE::QemuServer::Helpers::vm_running_locally($vmid); };
     $running->{$vmid} = $is_running;
     pprint("could not check if VM is running - $@", $vmid) if $@;

     # Only log when the state actually flipped since the previous check.
     if ($was_running && !$is_running) {
	pprint("stop monitoring - not running", $vmid);
     } elsif ($is_running && !$was_running) {
	pprint("start monitoring - now running", $vmid);
     }

     return $is_running;
}

# Determines the volume ID of the boot disk for the given VM.
#
# Loads the VM configuration and walks the keys returned by
# PVE::QemuServer::Drive::get_bootdisks() in order. The first entry that is a valid drive name,
# is present in the configuration, can be parsed, is not a CD-ROM and has a 'file' property wins.
#
# Returns the value of that drive's 'file' property.
# Dies with "no bootdisk found in config" if no entry qualifies.
sub get_bootdisk_volid {
     my ($vmid) = @_;

     my $conf = PVE::QemuConfig->load_config($vmid);

     for my $key (PVE::QemuServer::Drive::get_bootdisks($conf)->@*) {
	next unless is_valid_drivename($key) && $conf->{$key};

	my $drive = parse_drive($key, $conf->{$key});
	next if !defined($drive) || drive_is_cdrom($drive);

	return $drive->{file} if $drive->{file};
     }

     die "no bootdisk found in config\n";
}

# Number of consecutive failed checks per VMID. The main loop increments the counter when a check
# fails and resets it to 0 after a successful one.
my $errors = {};

# Returns true if the given VM accumulated 3 or more consecutive errors and should therefore not
# be checked anymore.
#
# $vmid - ID of the VM to look up
sub should_skip {
     my ($vmid) = @_;

     # A VMID without a counter yet simply has no errors. Defaulting to 0 avoids an
     # "uninitialized value" warning when the caller did not initialize the counter beforehand.
     return ($errors->{$vmid} // 0) >= 3;
}

# END OF HELPER FUNCTIONS

# Flush STDOUT after every print. Without this, output redirected to a file is block-buffered, so
# log lines show up late and buffered lines are lost when the script is terminated by a signal.
$| = 1;

# Seconds to sleep between two rounds of checks, can be overridden with --interval.
my $interval = 60;
GetOptions('interval=i' => \$interval)
    or die "usage: $0 [--interval <seconds>] [<vmid> ...]\n";

# A non-positive interval would turn the loop below into a busy loop.
die "interval must be at least 1 second\n" if $interval < 1;

if (scalar(@ARGV)) {
     # Reject anything that is not a plain number early. Such an argument would otherwise trigger
     # warnings in the numeric sort below and then be silently carried along in the list.
     for my $vmid (@ARGV) {
	die "invalid VMID '$vmid'\n" if $vmid !~ m/\A[0-9]+\z/;
     }

     $fixed_vmlist = [@ARGV];
     pprint("monitoring VMs " . join(',', sort {$a <=> $b} $fixed_vmlist->@*));
} else {
     pprint("no list of VMIDs provided - monitoring all VMs");
}

# Last seen base64-encoded content of the first sector, per VMID.
my $contents = {};

while (1) {
     PVE::Cluster::cfs_update();

     my $storecfg = PVE::Storage::config();

     my $vmids = get_vmids();
     for my $vmid ($vmids->@*) {
	$errors->{$vmid} //= 0;
	next if should_skip($vmid);

	next if !update_running($vmid);

	eval {
	    my $volid = get_bootdisk_volid($vmid);
	    my $path = PVE::Storage::path($storecfg, $volid);

	    # Read the first 512 bytes of the image and pipe them through base64 to get a printable
	    # representation on a single line.
	    my $cmd = [
		['qemu-img', 'dd', 'bs=512', 'count=1', "if=$path"],
		['base64', '--wrap', '0'],
	    ];

	    my $content;
	    PVE::Tools::run_command($cmd, outfunc => sub { $content = shift });
	    die "no output\n" if !$content;

	    if (!defined($contents->{$vmid})) {
		# First successful read for this VM - remember it as the baseline.
		pprint("registered content for first sector", $vmid, $volid);
		print "$content\n";
		$contents->{$vmid} = $content;
	    } elsif ($content ne $contents->{$vmid}) {
		pprint("detected changed content for first sector!", $vmid, $volid);
		print "$content\n";
		$contents->{$vmid} = $content;
	    }
	};
	if (my $err = $@) {
	    pprint("can't determine content for first sector - $err", $vmid);
	    $errors->{$vmid}++;
	    pprint("too many errors - skipping from now on", $vmid) if should_skip($vmid);
	} else {
	    # Only consecutive errors count, so a successful check resets the counter.
	    $errors->{$vmid} = 0;
	}
     }

     sleep $interval;
}





More information about the pve-devel mailing list