[pve-devel] Script for bug #2874
Fiona Ebner
f.ebner at proxmox.com
Fri Jan 27 14:57:38 CET 2023
Am 27.01.23 um 14:49 schrieb Fiona Ebner:
> The attached script allows monitoring the first sector of the bootdisk
> for running VMs (all or a selection of IDs) for people affected by bug
> #2874 [0]. The hope is to pinpoint when the sector gets corrupted to be
> able to correlate the timing with operations that might cause it. The
> script also dumps the contents, because it might help to see how the
> sector gets corrupted.
>
> Note that the script needs to be executed on each node and that you can
> specify IDs for VMs not currently on that node, which is useful to catch
> migrating VMs (or don't specify any IDs to monitor all running VMs).
>
> The script parses the VM config to determine the boot disk, looks up the
> path and uses qemu-img dd and base64 to save the contents of the first
> 512 bytes in a non-binary format and will dump the contents whenever
> they change.
>
> Example invocations:
> # monitor all running VMs, check every 5 minutes
> perl monitor-sector-zero.pl --interval 300
> # only monitor 166 and 167, check every minute, log to file
> perl monitor-sector-zero.pl 166 167 &> /path/to/file
>
> Feedback from users and other developers is highly appreciated!
>
> [0]: https://bugzilla.proxmox.com/show_bug.cgi?id=2874
> _______________________________________________
> pve-devel mailing list
> pve-devel at lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
>
>
Well, apparently the attachment got removed. So here it is:
#!/bin/perl
use strict;
use warnings;
use Getopt::Long qw(GetOptions);
use POSIX qw(strftime);
use PVE::Cluster;
use PVE::QemuConfig;
use PVE::QemuServer::Drive qw(drive_is_cdrom is_valid_drivename parse_drive);
use PVE::QemuServer::Helpers;
use PVE::Storage;
# START OF HELPER FUNCTIONS
# Print a single log line to the currently selected output handle.
#
# The line has the form "<timestamp> - [<vmid> - ][<volid> - ]<msg>". The
# timestamp is the local time formatted as "%F %H:%M:%S". The VMID and volume
# ID parts are only included when the respective argument is true. A trailing
# newline on $msg is stripped, so every log entry ends in exactly one newline.
sub pprint {
    my ($msg, $vmid, $volid) = @_;

    chomp($msg);

    my @parts = (strftime("%F %H:%M:%S", localtime));
    push @parts, $vmid if $vmid;
    push @parts, $volid if $volid;
    push @parts, $msg;

    print join(' - ', @parts) . "\n";
}
# Optional fixed list of VMIDs (array reference). When set, get_vmids() returns
# it as-is instead of querying the cluster's VM list.
my $fixed_vmlist;

# Return an array reference with the VMIDs that should be considered.
#
# If a fixed list was configured, that list is returned. Otherwise, the result
# of PVE::Cluster::get_vmlist() is filtered down to the entries whose type is
# 'qemu'.
sub get_vmids {
    return $fixed_vmlist if $fixed_vmlist;

    my $vmlist = PVE::Cluster::get_vmlist();
    my $ids = $vmlist->{ids};

    my @qemu_vmids = grep { $ids->{$_}->{type} eq 'qemu' } keys $ids->%*;

    return \@qemu_vmids;
}
# Last known result of the "running locally" check, keyed by VMID.
my $running = {};

# Refresh the running state for $vmid and log state transitions.
#
# Calls PVE::QemuServer::Helpers::vm_running_locally() inside an eval. If the
# check dies, the error is logged and the VM is treated as not running. A
# message is logged when the state changes from running to not running or vice
# versa. Returns the new state (the value returned by the check, or undef when
# the check failed).
sub update_running {
    my ($vmid) = @_;

    my $was_running = $running->{$vmid};

    my $is_running = eval { PVE::QemuServer::Helpers::vm_running_locally($vmid); };
    pprint("could not check if VM is running - $@", $vmid) if $@;

    $running->{$vmid} = $is_running;

    if ($was_running && !$is_running) {
        pprint("stop monitoring - not running", $vmid);
    } elsif ($is_running && !$was_running) {
        pprint("start monitoring - now running", $vmid);
    }

    return $is_running;
}
# Determine the volume ID of the boot disk for the VM with the given $vmid.
#
# Loads the VM configuration and walks the keys returned by
# PVE::QemuServer::Drive::get_bootdisks() in order. The first key that is a
# valid drive name, is present in the configuration, can be parsed as a drive,
# is not a CD-ROM and has a non-empty 'file' property wins; its 'file' value is
# returned.
#
# Dies with "no bootdisk found in config\n" if no key qualifies. Errors from
# loading or parsing the configuration are propagated to the caller.
sub get_bootdisk_volid {
    my ($vmid) = @_;

    my $conf = PVE::QemuConfig->load_config($vmid);

    for my $key (PVE::QemuServer::Drive::get_bootdisks($conf)->@*) {
        next if !(is_valid_drivename($key) && $conf->{$key});

        my $drive = parse_drive($key, $conf->{$key});
        next if !defined($drive) || drive_is_cdrom($drive);

        return $drive->{file} if $drive->{file};
    }

    die "no bootdisk found in config\n";
}
# Number of consecutive failed checks per VMID. The main loop resets an entry
# to 0 after a successful check and increments it after a failed one.
my $errors = {};

# Number of consecutive failures after which a VM is not checked anymore.
my $max_errors = 3;

# Return true if the VM with the given $vmid has failed too often in a row and
# should not be checked anymore.
#
# A VMID without a recorded error count is treated as having 0 errors, so the
# function can be called safely (and without an "uninitialized value" warning)
# before the counter for that VMID has been initialized.
sub should_skip {
    my ($vmid) = @_;

    return ($errors->{$vmid} // 0) >= $max_errors;
}
# END OF HELPER FUNCTIONS
# MAIN SCRIPT
#
# Usage: monitor-sector-zero.pl [--interval <seconds>] [<vmid> ...]
#
# Periodically reads the first 512 bytes of the boot disk of each monitored VM
# that is running locally and logs the base64-encoded content the first time it
# is seen and whenever it differs from the previously seen content.

# Flush STDOUT after every print. The output is typically redirected to a log
# file, in which case it would otherwise be block-buffered: entries would show
# up late and buffered entries would be lost if the script is killed by a
# signal.
$| = 1;

my $interval = 60;
GetOptions('interval=i' => \$interval)
    or die "usage: $0 [--interval <seconds>] [<vmid> ...]\n";
# An interval of 0 would turn the loop below into a busy loop.
die "interval must be a positive number of seconds\n" if $interval < 1;

if (scalar(@ARGV)) {
    # The IDs are sorted numerically below and used as VMIDs later on, so
    # reject anything that is not a positive integer right away.
    for my $vmid (@ARGV) {
        die "invalid VMID '$vmid'\n" if $vmid !~ m/\A[1-9][0-9]*\z/;
    }
    $fixed_vmlist = [@ARGV];
    pprint("monitoring VMs " . join(',', sort {$a <=> $b} $fixed_vmlist->@*));
} else {
    pprint("no list of VMIDs provided - monitoring all VMs");
}

# Last seen base64-encoded content of the first sector, keyed by VMID.
my $contents = {};

while (1) {
    PVE::Cluster::cfs_update();
    my $storecfg = PVE::Storage::config();
    my $vmids = get_vmids();

    for my $vmid ($vmids->@*) {
        $errors->{$vmid} //= 0;
        next if should_skip($vmid);
        next if !update_running($vmid);

        eval {
            my $volid = get_bootdisk_volid($vmid);
            my $path = PVE::Storage::path($storecfg, $volid);

            # Read the first 512 bytes of the image and encode them as a
            # single base64 line, so the content can be logged as text.
            my $cmd = [
                ['qemu-img', 'dd', 'bs=512', 'count=1', "if=$path"],
                ['base64', '--wrap', '0'],
            ];
            my $content;
            PVE::Tools::run_command($cmd, outfunc => sub { $content = shift });
            die "no output\n" if !$content;

            if (!defined($contents->{$vmid})) {
                pprint("registered content for first sector", $vmid, $volid);
                print "$content\n";
                $contents->{$vmid} //= $content;
            }

            if ($content ne $contents->{$vmid}) {
                pprint("detected changed content for first sector!", $vmid, $volid);
                print "$content\n";
                $contents->{$vmid} = $content;
            }
        };
        if (my $err = $@) {
            pprint("can't determine content for first sector - $err", $vmid);
            $errors->{$vmid}++;
            pprint("too many errors - skipping from now on", $vmid) if should_skip($vmid);
        } else {
            # Only consecutive failures count towards skipping a VM.
            $errors->{$vmid} = 0;
        }
    }

    sleep $interval;
}
More information about the pve-devel
mailing list