[pbs-devel] [PATCH proxmox-backup 1/4] api: verify: move chunk loading into parallel handler
Christian Ebner
c.ebner at proxmox.com
Thu Nov 6 09:54:45 CET 2025
On 11/5/25 4:51 PM, Nicolas Frey wrote:
> This way, the chunks will be loaded in parallel in addition to being
> checked in parallel.
>
> Depending on the underlying storage, this can speed up reading chunks
> from disk, especially when the underlying storage is IO depth
> dependent, and the CPU is faster than the storage.
>
> Originally-by: Dominik Csapak <d.csapak at proxmox.com>
> Signed-off-by: Nicolas Frey <n.frey at proxmox.com>
> ---
> src/backup/verify.rs | 120 +++++++++++++++++++++++++++----------------
> 1 file changed, 75 insertions(+), 45 deletions(-)
>
> diff --git a/src/backup/verify.rs b/src/backup/verify.rs
> index e33fdf50..7f91f38c 100644
> --- a/src/backup/verify.rs
> +++ b/src/backup/verify.rs
> @@ -1,6 +1,6 @@
> use pbs_config::BackupLockGuard;
> use std::collections::HashSet;
> -use std::sync::atomic::{AtomicUsize, Ordering};
> +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
> use std::sync::{Arc, Mutex};
> use std::time::Instant;
>
> @@ -20,7 +20,7 @@ use pbs_datastore::index::{ChunkReadInfo, IndexFile};
> use pbs_datastore::manifest::{BackupManifest, FileInfo};
> use pbs_datastore::{DataBlob, DataStore, DatastoreBackend, StoreProgress};
>
> -use crate::tools::parallel_handler::ParallelHandler;
> +use crate::tools::parallel_handler::{ParallelHandler, SendHandle};
>
> use crate::backup::hierarchy::ListAccessibleBackupGroups;
>
> @@ -156,23 +156,20 @@ impl VerifyWorker {
>
> let start_time = Instant::now();
>
> - let mut read_bytes = 0;
> - let mut decoded_bytes = 0;
> + let read_bytes = Arc::new(AtomicU64::new(0));
> + let decoded_bytes = Arc::new(AtomicU64::new(0));
>
> - let datastore2 = Arc::clone(&self.datastore);
> - let corrupt_chunks2 = Arc::clone(&self.corrupt_chunks);
> - let verified_chunks2 = Arc::clone(&self.verified_chunks);
> - let errors2 = Arc::clone(&errors);
> -
> - let decoder_pool = ParallelHandler::new(
> - "verify chunk decoder",
> - 4,
> + let decoder_pool = ParallelHandler::new("verify chunk decoder", 4, {
> + let datastore = Arc::clone(&self.datastore);
> + let corrupt_chunks = Arc::clone(&self.corrupt_chunks);
> + let verified_chunks = Arc::clone(&self.verified_chunks);
> + let errors = Arc::clone(&errors);
Hmm, this really warrants an internal state struct that bundles these
values, so that only the single `Arc` wrapping that struct needs to be cloned.
E.g. I would propose to introduce a struct along the lines of:
struct IndexVerifyState {
read_bytes: AtomicU64,
decoded_bytes: AtomicU64,
errors: AtomicUsize,
datastore: Arc<DataStore>,
corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
start_time: Instant,
}
This struct can then be wrapped in an `Arc` and also passed along as a
parameter to the inner methods where required.
See the (untested) diff below, which could serve as a starting point for
refining this further.
> move |(chunk, digest, size): (DataBlob, [u8; 32], u64)| {
> let chunk_crypt_mode = match chunk.crypt_mode() {
> Err(err) => {
> - corrupt_chunks2.lock().unwrap().insert(digest);
> + corrupt_chunks.lock().unwrap().insert(digest);
> info!("can't verify chunk, unknown CryptMode - {err}");
> - errors2.fetch_add(1, Ordering::SeqCst);
> + errors.fetch_add(1, Ordering::SeqCst);
> return Ok(());
> }
> Ok(mode) => mode,
> @@ -182,21 +179,21 @@ impl VerifyWorker {
> info!(
> "chunk CryptMode {chunk_crypt_mode:?} does not match index CryptMode {crypt_mode:?}"
> );
> - errors2.fetch_add(1, Ordering::SeqCst);
> + errors.fetch_add(1, Ordering::SeqCst);
> }
>
> if let Err(err) = chunk.verify_unencrypted(size as usize, &digest) {
> - corrupt_chunks2.lock().unwrap().insert(digest);
> + corrupt_chunks.lock().unwrap().insert(digest);
> info!("{err}");
> - errors2.fetch_add(1, Ordering::SeqCst);
> - Self::rename_corrupted_chunk(datastore2.clone(), &digest);
> + errors.fetch_add(1, Ordering::SeqCst);
> + Self::rename_corrupted_chunk(datastore.clone(), &digest);
> } else {
> - verified_chunks2.lock().unwrap().insert(digest);
> + verified_chunks.lock().unwrap().insert(digest);
> }
>
> Ok(())
> - },
> - );
> + }
> + });
>
> let skip_chunk = |digest: &[u8; 32]| -> bool {
> if self.verified_chunks.lock().unwrap().contains(digest) {
> @@ -223,6 +220,29 @@ impl VerifyWorker {
> .datastore
> .get_chunks_in_order(&*index, skip_chunk, check_abort)?;
>
> + let reader_pool = ParallelHandler::new("read chunks", 4, {
> + let decoder_pool = decoder_pool.channel();
> + let datastore = Arc::clone(&self.datastore);
> + let corrupt_chunks = Arc::clone(&self.corrupt_chunks);
> + let read_bytes = Arc::clone(&read_bytes);
> + let decoded_bytes = Arc::clone(&decoded_bytes);
> + let errors = Arc::clone(&errors);
> + let backend = self.backend.clone();
> +
> + move |info: ChunkReadInfo| {
> + Self::verify_chunk_by_backend(
> + &backend,
> + Arc::clone(&datastore),
> + Arc::clone(&corrupt_chunks),
> + Arc::clone(&read_bytes),
> + Arc::clone(&decoded_bytes),
> + Arc::clone(&errors),
> + &decoder_pool,
> + &info,
> + )
> + }
> + });
> +
> for (pos, _) in chunk_list {
> self.worker.check_abort()?;
> self.worker.fail_on_shutdown()?;
> @@ -234,19 +254,16 @@ impl VerifyWorker {
> continue; // already verified or marked corrupt
> }
>
> - self.verify_chunk_by_backend(
> - &info,
> - &mut read_bytes,
> - &mut decoded_bytes,
> - Arc::clone(&errors),
> - &decoder_pool,
> - )?;
> + reader_pool.send(info)?;
> }
>
> - decoder_pool.complete()?;
> + reader_pool.complete()?;
>
> let elapsed = start_time.elapsed().as_secs_f64();
>
> + let read_bytes = read_bytes.load(Ordering::SeqCst);
> + let decoded_bytes = decoded_bytes.load(Ordering::SeqCst);
> +
> let read_bytes_mib = (read_bytes as f64) / (1024.0 * 1024.0);
> let decoded_bytes_mib = (decoded_bytes as f64) / (1024.0 * 1024.0);
>
> @@ -266,26 +283,31 @@ impl VerifyWorker {
> Ok(())
> }
>
> + #[allow(clippy::too_many_arguments)]
> fn verify_chunk_by_backend(
> - &self,
> - info: &ChunkReadInfo,
> - read_bytes: &mut u64,
> - decoded_bytes: &mut u64,
> + backend: &DatastoreBackend,
> + datastore: Arc<DataStore>,
> + corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
> + read_bytes: Arc<AtomicU64>,
> + decoded_bytes: Arc<AtomicU64>,
> errors: Arc<AtomicUsize>,
> - decoder_pool: &ParallelHandler<(DataBlob, [u8; 32], u64)>,
> + decoder_pool: &SendHandle<(DataBlob, [u8; 32], u64)>,
> + info: &ChunkReadInfo,
> ) -> Result<(), Error> {
> - match &self.backend {
> - DatastoreBackend::Filesystem => match self.datastore.load_chunk(&info.digest) {
> - Err(err) => self.add_corrupt_chunk(
> + match backend {
> + DatastoreBackend::Filesystem => match datastore.load_chunk(&info.digest) {
> + Err(err) => Self::add_corrupt_chunk(
> + datastore,
> + corrupt_chunks,
> info.digest,
> errors,
> &format!("can't verify chunk, load failed - {err}"),
> ),
> Ok(chunk) => {
> let size = info.size();
> - *read_bytes += chunk.raw_size();
> + read_bytes.fetch_add(chunk.raw_size(), Ordering::SeqCst);
> decoder_pool.send((chunk, info.digest, size))?;
> - *decoded_bytes += size;
> + decoded_bytes.fetch_add(size, Ordering::SeqCst);
> }
> },
> DatastoreBackend::S3(s3_client) => {
> @@ -302,9 +324,9 @@ impl VerifyWorker {
> match chunk_result {
> Ok(chunk) => {
> let size = info.size();
> - *read_bytes += chunk.raw_size();
> + read_bytes.fetch_add(chunk.raw_size(), Ordering::SeqCst);
> decoder_pool.send((chunk, info.digest, size))?;
> - *decoded_bytes += size;
> + decoded_bytes.fetch_add(size, Ordering::SeqCst);
> }
> Err(err) => {
> errors.fetch_add(1, Ordering::SeqCst);
> @@ -312,7 +334,9 @@ impl VerifyWorker {
> }
> }
> }
> - Ok(None) => self.add_corrupt_chunk(
> + Ok(None) => Self::add_corrupt_chunk(
> + datastore,
> + corrupt_chunks,
> info.digest,
> errors,
> &format!(
> @@ -330,13 +354,19 @@ impl VerifyWorker {
> Ok(())
> }
>
> - fn add_corrupt_chunk(&self, digest: [u8; 32], errors: Arc<AtomicUsize>, message: &str) {
> + fn add_corrupt_chunk(
> + datastore: Arc<DataStore>,
> + corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
> + digest: [u8; 32],
> + errors: Arc<AtomicUsize>,
> + message: &str,
> + ) {
> // Panic on poisoned mutex
> - let mut corrupt_chunks = self.corrupt_chunks.lock().unwrap();
> + let mut corrupt_chunks = corrupt_chunks.lock().unwrap();
> corrupt_chunks.insert(digest);
> error!(message);
> errors.fetch_add(1, Ordering::SeqCst);
> - Self::rename_corrupted_chunk(self.datastore.clone(), &digest);
> + Self::rename_corrupted_chunk(datastore.clone(), &digest);
> }
>
> fn verify_fixed_index(&self, backup_dir: &BackupDir, info: &FileInfo) -> Result<(), Error> {
diff --git a/src/backup/verify.rs b/src/backup/verify.rs
index 7f91f38c9..e01405750 100644
--- a/src/backup/verify.rs
+++ b/src/backup/verify.rs
@@ -152,24 +152,24 @@ impl VerifyWorker {
index: Box<dyn IndexFile + Send>,
crypt_mode: CryptMode,
) -> Result<(), Error> {
- let errors = Arc::new(AtomicUsize::new(0));
-
- let start_time = Instant::now();
-
- let read_bytes = Arc::new(AtomicU64::new(0));
- let decoded_bytes = Arc::new(AtomicU64::new(0));
+ let index_verify_state = Arc::new(IndexVerifyState {
+ read_bytes: AtomicU64::new(0),
+ decoded_bytes: AtomicU64::new(0),
+ errors: AtomicUsize::new(0),
+ datastore: Arc::clone(&self.datastore),
+ corrupt_chunks: Arc::clone(&self.corrupt_chunks),
+ start_time: Instant::now(),
+ });
let decoder_pool = ParallelHandler::new("verify chunk
decoder", 4, {
- let datastore = Arc::clone(&self.datastore);
- let corrupt_chunks = Arc::clone(&self.corrupt_chunks);
+ let state = Arc::clone(&index_verify_state);
let verified_chunks = Arc::clone(&self.verified_chunks);
- let errors = Arc::clone(&errors);
move |(chunk, digest, size): (DataBlob, [u8; 32], u64)| {
let chunk_crypt_mode = match chunk.crypt_mode() {
Err(err) => {
- corrupt_chunks.lock().unwrap().insert(digest);
+
state.corrupt_chunks.lock().unwrap().insert(digest);
info!("can't verify chunk, unknown CryptMode -
{err}");
- errors.fetch_add(1, Ordering::SeqCst);
+ state.errors.fetch_add(1, Ordering::SeqCst);
return Ok(());
}
Ok(mode) => mode,
@@ -179,14 +179,14 @@ impl VerifyWorker {
info!(
"chunk CryptMode {chunk_crypt_mode:?} does not
match index CryptMode {crypt_mode:?}"
);
- errors.fetch_add(1, Ordering::SeqCst);
+ state.errors.fetch_add(1, Ordering::SeqCst);
}
if let Err(err) = chunk.verify_unencrypted(size as
usize, &digest) {
- corrupt_chunks.lock().unwrap().insert(digest);
+ state.corrupt_chunks.lock().unwrap().insert(digest);
info!("{err}");
- errors.fetch_add(1, Ordering::SeqCst);
- Self::rename_corrupted_chunk(datastore.clone(),
&digest);
+ state.errors.fetch_add(1, Ordering::SeqCst);
+
Self::rename_corrupted_chunk(state.datastore.clone(), &digest);
} else {
verified_chunks.lock().unwrap().insert(digest);
}
@@ -201,7 +201,7 @@ impl VerifyWorker {
} else if
self.corrupt_chunks.lock().unwrap().contains(digest) {
let digest_str = hex::encode(digest);
info!("chunk {digest_str} was marked as corrupt");
- errors.fetch_add(1, Ordering::SeqCst);
+ index_verify_state.errors.fetch_add(1, Ordering::SeqCst);
true
} else {
false
@@ -222,24 +222,11 @@ impl VerifyWorker {
let reader_pool = ParallelHandler::new("read chunks", 4, {
let decoder_pool = decoder_pool.channel();
- let datastore = Arc::clone(&self.datastore);
- let corrupt_chunks = Arc::clone(&self.corrupt_chunks);
- let read_bytes = Arc::clone(&read_bytes);
- let decoded_bytes = Arc::clone(&decoded_bytes);
- let errors = Arc::clone(&errors);
+ let state = Arc::clone(&index_verify_state);
let backend = self.backend.clone();
move |info: ChunkReadInfo| {
- Self::verify_chunk_by_backend(
- &backend,
- Arc::clone(&datastore),
- Arc::clone(&corrupt_chunks),
- Arc::clone(&read_bytes),
- Arc::clone(&decoded_bytes),
- Arc::clone(&errors),
- &decoder_pool,
- &info,
- )
+ Self::verify_chunk_by_backend(&backend,
Arc::clone(&state), &decoder_pool, &info)
}
});
@@ -259,10 +246,10 @@ impl VerifyWorker {
reader_pool.complete()?;
- let elapsed = start_time.elapsed().as_secs_f64();
+ let elapsed =
index_verify_state.start_time.elapsed().as_secs_f64();
- let read_bytes = read_bytes.load(Ordering::SeqCst);
- let decoded_bytes = decoded_bytes.load(Ordering::SeqCst);
+ let read_bytes =
index_verify_state.read_bytes.load(Ordering::SeqCst);
+ let decoded_bytes =
index_verify_state.decoded_bytes.load(Ordering::SeqCst);
let read_bytes_mib = (read_bytes as f64) / (1024.0 * 1024.0);
let decoded_bytes_mib = (decoded_bytes as f64) / (1024.0 *
1024.0);
@@ -270,13 +257,13 @@ impl VerifyWorker {
let read_speed = read_bytes_mib / elapsed;
let decode_speed = decoded_bytes_mib / elapsed;
- let error_count = errors.load(Ordering::SeqCst);
+ let error_count = index_verify_state.errors.load(Ordering::SeqCst);
info!(
" verified {read_bytes_mib:.2}/{decoded_bytes_mib:.2} MiB
in {elapsed:.2} seconds, speed {read_speed:.2}/{decode_speed:.2} MiB/s
({error_count} errors)"
);
- if errors.load(Ordering::SeqCst) > 0 {
+ if index_verify_state.errors.load(Ordering::SeqCst) > 0 {
bail!("chunks could not be verified");
}
@@ -286,28 +273,26 @@ impl VerifyWorker {
#[allow(clippy::too_many_arguments)]
fn verify_chunk_by_backend(
backend: &DatastoreBackend,
- datastore: Arc<DataStore>,
- corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
- read_bytes: Arc<AtomicU64>,
- decoded_bytes: Arc<AtomicU64>,
- errors: Arc<AtomicUsize>,
+ state: Arc<IndexVerifyState>,
decoder_pool: &SendHandle<(DataBlob, [u8; 32], u64)>,
info: &ChunkReadInfo,
) -> Result<(), Error> {
match backend {
- DatastoreBackend::Filesystem => match
datastore.load_chunk(&info.digest) {
+ DatastoreBackend::Filesystem => match
state.datastore.load_chunk(&info.digest) {
Err(err) => Self::add_corrupt_chunk(
- datastore,
- corrupt_chunks,
+ Arc::clone(&state.datastore),
+ Arc::clone(&state.corrupt_chunks),
info.digest,
- errors,
+ &state.errors,
&format!("can't verify chunk, load failed - {err}"),
),
Ok(chunk) => {
let size = info.size();
- read_bytes.fetch_add(chunk.raw_size(),
Ordering::SeqCst);
+ state
+ .read_bytes
+ .fetch_add(chunk.raw_size(), Ordering::SeqCst);
decoder_pool.send((chunk, info.digest, size))?;
- decoded_bytes.fetch_add(size, Ordering::SeqCst);
+ state.decoded_bytes.fetch_add(size, Ordering::SeqCst);
}
},
DatastoreBackend::S3(s3_client) => {
@@ -324,28 +309,30 @@ impl VerifyWorker {
match chunk_result {
Ok(chunk) => {
let size = info.size();
- read_bytes.fetch_add(chunk.raw_size(),
Ordering::SeqCst);
+ state
+ .read_bytes
+ .fetch_add(chunk.raw_size(),
Ordering::SeqCst);
decoder_pool.send((chunk, info.digest,
size))?;
- decoded_bytes.fetch_add(size,
Ordering::SeqCst);
+ state.decoded_bytes.fetch_add(size,
Ordering::SeqCst);
}
Err(err) => {
- errors.fetch_add(1, Ordering::SeqCst);
+ state.errors.fetch_add(1,
Ordering::SeqCst);
error!("can't verify chunk, load
failed - {err}");
}
}
}
Ok(None) => Self::add_corrupt_chunk(
- datastore,
- corrupt_chunks,
+ Arc::clone(&state.datastore),
+ Arc::clone(&state.corrupt_chunks),
info.digest,
- errors,
+ &state.errors,
&format!(
"can't verify missing chunk with digest {}",
hex::encode(info.digest)
),
),
Err(err) => {
- errors.fetch_add(1, Ordering::SeqCst);
+ state.errors.fetch_add(1, Ordering::SeqCst);
error!("can't verify chunk, load failed - {err}");
}
}
@@ -358,7 +345,7 @@ impl VerifyWorker {
datastore: Arc<DataStore>,
corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
digest: [u8; 32],
- errors: Arc<AtomicUsize>,
+ errors: &AtomicUsize,
message: &str,
) {
// Panic on poisoned mutex
@@ -681,3 +668,12 @@ impl VerifyWorker {
}
}
}
+
+struct IndexVerifyState {
+ read_bytes: AtomicU64,
+ decoded_bytes: AtomicU64,
+ errors: AtomicUsize,
+ datastore: Arc<DataStore>,
+ corrupt_chunks: Arc<Mutex<HashSet<[u8; 32]>>>,
+ start_time: Instant,
+}
More information about the pbs-devel
mailing list