[pve-devel] [guest-common] fix #1694: Replication risks permanently losing sync in high loads due to timeout bug

Fri Mar 23 12:15:01 CET 2018

If the pool is under heavy load ZFS will low prioritized deletion jobs.
This ends in a timeout and the program logic will delete the current sync snapshot.
On the next run the former sync snapshots will also removed because they are not in the state file.
In this state it is no more possible to sync and a full sync has to be performed.

We do not delete the former snapshot on the end of the replication run,
because prepare_local_job will delete it anyway and
when a timeout happens in this state we can ignore it and start the replication.

The snapshot deletion error will be logged in the replication log.
---
 PVE/Replication.pm | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/PVE/Replication.pm b/PVE/Replication.pm
index 9bc4e61..1eb853d 100644
--- a/PVE/Replication.pm
+++ b/PVE/Replication.pm
@@ -136,8 +136,18 @@ sub prepare {
 		$last_snapshots->{$volid}->{$snap} = 1;
 	    } elsif ($snap =~ m/^\Q$prefix\E/) {
 		$logfunc->("delete stale replication snapshot '$snap' on $volid");
-		PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
-		$cleaned_replicated_volumes->{$volid} = 1;
+
+		eval {
+		    PVE::Storage::volume_snapshot_delete($storecfg, $volid, $snap);
+		    $cleaned_replicated_volumes->{$volid} = 1;
+		};
+
+		# If deleting the snapshot fails, we can not be sure if it was due to an error or a timeout.
+		# The likelihood that the delete has worked out is high at a timeout.
+		# If it really fails, it will try to remove on the next run.
+		warn $@ if $@;
+
+		$logfunc->("delete stale replication snapshot error: $@") if $@;
 	    }
 	}
     }
@@ -282,24 +292,16 @@ sub replicate {
 	    }
 
 	    replicate_volume($ssh_info, $storecfg, $volid, $base_snapname, $sync_snapname, $rate, $insecure, $logfunc);
+	    # old snapshots will removed by next run from prepare_local_job.
 	}
     };
-    $err = $@;
-
-    if ($err) {
+    if ($@) {
 	$cleanup_local_snapshots->($replicate_snapshots, $sync_snapname); # try to cleanup
 	# we do not cleanup the remote side here - this is done in
 	# next run of prepare_local_job
-	die $err;
+	die $@;
     }
 
-    # remove old snapshots because they are no longer needed
-    $cleanup_local_snapshots->($last_snapshots, $last_sync_snapname);
-
-    remote_finalize_local_job($ssh_info, $jobid, $vmid, $sorted_volids, $start_time, $logfunc);
-
-    die $err if $err;
-
     return $volumes;
 }
 
-- 
2.11.0