[pve-devel] [Patch V2 guest-common] fix #1694: Replication risks permanently losing sync in high loads due to timeout bug

Thu Apr 12 09:46:03 CEST 2018

If the pool is under heavy load ZFS will low prioritized deletion jobs.
This ends in a timeout and the program logic will delete the current sync snapshot.
On the next run the former sync snapshots will also removed because they are not in the state file.
In this state it is no more possible to sync and a full sync has to be performed.

We ignore if the deletion of the former snapshot failed on the end of the replication run,
because prepare_local_job will delete it anyway and
when a timeout happens in this state we can ignore it and start the replication.

The snapshot deletion error will be logged in the replication log.
---
 PVE/Replication.pm | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

Patch V2:
As discuss with Dietmar we will keep remove the former snapshot at the end of the run,
but ignore if it fails. This is to prevent we have 2 replication snapshots at time.

diff --git a/PVE/Replication.pm b/PVE/Replication.pm
index 9bc4e61..d8ccfaf 100644
--- a/PVE/Replication.pm
+++ b/PVE/Replication.pm
@@ -136,8 +136,18 @@ sub prepare {
 		$last_snapshots->{$volid}->{$snap} = 1;
 	    } elsif ($snap =~ m/^\Q$prefix\E/) {
 		$logfunc->("delete stale replication snapshot '$snap' on $volid");
-		PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
-		$cleaned_replicated_volumes->{$volid} = 1;
+
+		eval {
+		    PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
+		    $cleaned_replicated_volumes->{$volid} = 1;
+		};
+
+		# If deleting the snapshot fails, we can not be sure if it was due to an error or a timeout.
+		# The likelihood that the delete has worked out is high at a timeout.
+		# If it really fails, it will try to remove on the next run.
+		warn $@ if $@;
+
+		$logfunc->("delete stale replication snapshot error: $@") if $@;
 	    }
 	}
     }
@@ -293,12 +303,16 @@ sub replicate {
 	die $err;
     }
 
-    # remove old snapshots because they are no longer needed
-    $cleanup_local_snapshots->($last_snapshots, $last_sync_snapname);
+    eval {
+	# remove old snapshots because they are no longer needed
+	$cleanup_local_snapshots->($last_snapshots, $last_sync_snapname);
 
-    remote_finalize_local_job($ssh_info, $jobid, $vmid, $sorted_volids, $start_time, $logfunc);
+	remote_finalize_local_job($ssh_info, $jobid, $vmid, $sorted_volids, $start_time, $logfunc);
+    };
 
-    die $err if $err;
+    # old snapshots will removed by next run from prepare_local_job.
+    warn $@ if $@;
+    $logfunc->("delete stale replication snapshot error: $@") if $@;
 
     return $volumes;
 }
-- 
2.11.0