[pbs-devel] [PATCH proxmox-backup 1/3] partial fix #6049: datastore: impl ConfigVersionCache fast path for lookups
Samuel Rufinatscha
s.rufinatscha at proxmox.com
Tue Nov 11 13:29:39 CET 2025
Repeated /status requests caused lookup_datastore() to re-read and
parse datastore.cfg on every call, as reported in #6049 [1].
Profiling with cargo-flamegraph [2] confirmed that the hot path is
dominated by pbs_config::datastore::config(), i.e. config parsing.
This patch adds a fast path to lookup_datastore() using the shared-
memory ConfigVersionCache generation counter for datastore.cfg. The
last seen generation number is stored alongside the cached
DataStoreImpl; when a subsequent lookup sees the same generation, the
cached instance is reused without re-reading the on-disk config. If
the generation differs, or the version cache is unavailable, the
lookup falls back to the existing slow path with no behavioral
changes.
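In essence, the fast path boils down to the following (simplified
sketch of the new lookup logic; the full version in the diff below
also re-checks the cached maintenance mode and updates the active
operation count):

    // Current datastore.cfg generation from shared memory; None means
    // the version cache is unavailable -> always take the slow path.
    let gen_num = ConfigVersionCache::new()
        .ok()
        .map(|cache| cache.datastore_generation());

    if let (Some(gen_num), Some(ds)) =
        (gen_num, DATASTORE_MAP.lock().unwrap().get(name).cloned())
    {
        if let Some(tag) = &ds.cached_config_tag {
            if tag.last_generation == Some(gen_num) {
                // Generation unchanged: reuse the cached instance and
                // skip re-reading/parsing datastore.cfg entirely.
                return Ok(Arc::new(Self { inner: ds, operation }));
            }
        }
    }
    // Fall through to the existing slow path (config read + parse).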
Behavioral notes
- The generation is bumped via the existing save_config() path, so
API-driven config changes are detected immediately (see the sketch
after this list).
- Manual edits to datastore.cfg are not detected by this commit; a TTL
guard is introduced in a later patch to cover that case.
- DataStore::drop still performs a config read on the common path; this
is addressed in the next patch.
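For context, the assumed shape of that existing save path in
pbs_config::datastore (not part of this patch; shown only to
illustrate how the counter is bumped, helper names are assumed from
the existing crate) is roughly:

    // Sketch: writing datastore.cfg also increments the shared-memory
    // generation counter, so the next lookup_datastore() in any
    // process sees a new generation and takes the slow path once.
    pub fn save_config(config: &SectionConfigData) -> Result<(), Error> {
        let raw = CONFIG.write(DATASTORE_CFG_FILENAME, config)?;
        replace_backup_config(DATASTORE_CFG_FILENAME, raw.as_bytes())?;
        let _ = ConfigVersionCache::new()
            .map(|cache| cache.increase_datastore_generation());
        Ok(())
    }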
Testing
cargo-flamegraph confirms that the config-parse hotspot in
lookup_datastore() no longer shows up on the hot path.
Additionally, an end-to-end benchmark was performed using the
`/status?verbose=0` API with 1000 datastores, 5 requests per store,
and 16-way parallelism:
| Metric                          | Before | After  | Δ (abs) | Δ (%)   |
|---------------------------------|:------:|:------:|:-------:|:-------:|
| Total time                      | 13s    | 11s    | −2s     | −15.38% |
| Throughput, all rounds (req/s)  | 384.62 | 454.55 | +69.93  | +18.18% |
| Cold RPS (round #1)             | 76.92  | 90.91  | +13.99  | +18.19% |
| Warm RPS (rounds 2..N)          | 307.69 | 363.64 | +55.95  | +18.18% |
Throughput improved by ~18% overall, with total time reduced by ~15%.
Warm lookups now reuse cached datastore instances and skip redundant
config parsing entirely. The first-access round also shows a similar
improvement, likely due to reduced contention and I/O when many stores
are accessed in parallel.
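For illustration, each benchmark request ultimately hits a call like
the following (hypothetical caller; the Operation passed by the real
/status handler is not shown in this patch):

    // Repeated lookups for the same store under an unchanged
    // datastore.cfg generation now return the cached instance.
    let store = DataStore::lookup_datastore("store1", Some(Operation::Lookup))?;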
Note: A second hotspot remains in Drop where a config read occurs; that
is addressed in the next commit.
Links
[1] Bugzilla: https://bugzilla.proxmox.com/show_bug.cgi?id=6049
[2] cargo-flamegraph: https://github.com/flamegraph-rs/flamegraph
Fixes: #6049
Signed-off-by: Samuel Rufinatscha <s.rufinatscha at proxmox.com>
---
pbs-config/src/config_version_cache.rs | 10 +++-
pbs-datastore/src/datastore.rs | 77 +++++++++++++++++---------
2 files changed, 59 insertions(+), 28 deletions(-)
diff --git a/pbs-config/src/config_version_cache.rs b/pbs-config/src/config_version_cache.rs
index e8fb994f..b875f7e0 100644
--- a/pbs-config/src/config_version_cache.rs
+++ b/pbs-config/src/config_version_cache.rs
@@ -26,7 +26,6 @@ struct ConfigVersionCacheDataInner {
// Traffic control (traffic-control.cfg) generation/version.
traffic_control_generation: AtomicUsize,
// datastore (datastore.cfg) generation/version
- // FIXME: remove with PBS 3.0
datastore_generation: AtomicUsize,
// Add further atomics here
}
@@ -145,8 +144,15 @@ impl ConfigVersionCache {
.fetch_add(1, Ordering::AcqRel);
}
+ /// Returns the datastore generation number.
+ pub fn datastore_generation(&self) -> usize {
+ self.shmem
+ .data()
+ .datastore_generation
+ .load(Ordering::Acquire)
+ }
+
/// Increase the datastore generation number.
- // FIXME: remove with PBS 3.0 or make actually useful again in datastore lookup
pub fn increase_datastore_generation(&self) -> usize {
self.shmem
.data()
diff --git a/pbs-datastore/src/datastore.rs b/pbs-datastore/src/datastore.rs
index 70af94d8..18eebb58 100644
--- a/pbs-datastore/src/datastore.rs
+++ b/pbs-datastore/src/datastore.rs
@@ -32,7 +32,7 @@ use pbs_api_types::{
MaintenanceType, Operation, UPID,
};
use pbs_config::s3::S3_CFG_TYPE_ID;
-use pbs_config::BackupLockGuard;
+use pbs_config::{BackupLockGuard, ConfigVersionCache};
use crate::backup_info::{
BackupDir, BackupGroup, BackupInfo, OLD_LOCKING, PROTECTED_MARKER_FILENAME,
@@ -140,10 +140,10 @@ pub struct DataStoreImpl {
last_gc_status: Mutex<GarbageCollectionStatus>,
verify_new: bool,
chunk_order: ChunkOrder,
- last_digest: Option<[u8; 32]>,
sync_level: DatastoreFSyncLevel,
backend_config: DatastoreBackendConfig,
lru_store_caching: Option<LocalDatastoreLruCache>,
+ cached_config_tag: Option<CachedDatastoreConfigTag>,
}
impl DataStoreImpl {
@@ -156,10 +156,10 @@ impl DataStoreImpl {
last_gc_status: Mutex::new(GarbageCollectionStatus::default()),
verify_new: false,
chunk_order: Default::default(),
- last_digest: None,
sync_level: Default::default(),
backend_config: Default::default(),
lru_store_caching: None,
+ cached_config_tag: None,
})
}
}
@@ -224,6 +224,15 @@ pub enum DatastoreBackend {
S3(Arc<S3Client>),
}
+/// Used to determine whether a cached datastore instance is still valid
+/// or needs to be reloaded after a config change.
+struct CachedDatastoreConfigTag {
+ /// Maintenance mode at the time the lookup hint was captured, if any.
+ last_maintenance_mode: Option<MaintenanceMode>,
+ /// Datastore generation number from `ConfigVersionCache`; `None` when the cache wasn't available.
+ last_generation: Option<usize>,
+}
+
impl DataStore {
// This one just panics on everything
#[doc(hidden)]
@@ -299,13 +308,40 @@ impl DataStore {
// we use it to decide whether it is okay to delete the datastore.
let _config_lock = pbs_config::datastore::lock_config()?;
- // we could use the ConfigVersionCache's generation for staleness detection, but we load
- // the config anyway -> just use digest, additional benefit: manual changes get detected
- let (config, digest) = pbs_config::datastore::config()?;
+ // Get the current datastore.cfg generation number
+ let gen_num = ConfigVersionCache::new()
+ .ok()
+ .map(|c| c.datastore_generation());
+
+ // Fast-path: if we have a cached entry created under the same datastore.cfg generation number, reuse it.
+ if let (Some(gen_num), Some(ds)) =
+ (gen_num, DATASTORE_MAP.lock().unwrap().get(name).cloned())
+ {
+ if let Some(cached_tag) = &ds.cached_config_tag {
+ if cached_tag.last_generation == Some(gen_num) {
+ if let Some(mm) = &cached_tag.last_maintenance_mode {
+ if let Err(error) = mm.check(operation) {
+ bail!("datastore '{name}' is unavailable: {error}");
+ }
+ }
+ if let Some(operation) = operation {
+ update_active_operations(name, operation, 1)?;
+ }
+ return Ok(Arc::new(Self {
+ inner: ds,
+ operation,
+ }));
+ }
+ }
+ }
+
+ // Slow path: (re)load config
+ let (config, _digest) = pbs_config::datastore::config()?;
let config: DataStoreConfig = config.lookup("datastore", name)?;
- if let Some(maintenance_mode) = config.get_maintenance_mode() {
- if let Err(error) = maintenance_mode.check(operation) {
+ let maintenance_mode = config.get_maintenance_mode();
+ if let Some(mm) = &maintenance_mode {
+ if let Err(error) = mm.check(operation) {
bail!("datastore '{name}' is unavailable: {error}");
}
}
@@ -321,16 +357,6 @@ impl DataStore {
// reuse chunk store so that we keep using the same process locker instance!
let chunk_store = if let Some(datastore) = &entry {
- let last_digest = datastore.last_digest.as_ref();
- if let Some(true) = last_digest.map(|last_digest| last_digest == &digest) {
- if let Some(operation) = operation {
- update_active_operations(name, operation, 1)?;
- }
- return Ok(Arc::new(Self {
- inner: Arc::clone(datastore),
- operation,
- }));
- }
Arc::clone(&datastore.chunk_store)
} else {
let tuning: DatastoreTuning = serde_json::from_value(
@@ -344,7 +370,11 @@ impl DataStore {
)?)
};
- let datastore = DataStore::with_store_and_config(chunk_store, config, Some(digest))?;
+ let mut datastore = DataStore::with_store_and_config(chunk_store, config)?;
+ datastore.cached_config_tag = Some(CachedDatastoreConfigTag {
+ last_maintenance_mode: maintenance_mode,
+ last_generation: gen_num,
+ });
let datastore = Arc::new(datastore);
datastore_cache.insert(name.to_string(), datastore.clone());
@@ -430,11 +460,7 @@ impl DataStore {
config.absolute_path(),
tuning.sync_level.unwrap_or_default(),
)?;
- let inner = Arc::new(Self::with_store_and_config(
- Arc::new(chunk_store),
- config,
- None,
- )?);
+ let inner = Arc::new(Self::with_store_and_config(Arc::new(chunk_store), config)?);
if let Some(operation) = operation {
update_active_operations(&name, operation, 1)?;
@@ -446,7 +472,6 @@ impl DataStore {
fn with_store_and_config(
chunk_store: Arc<ChunkStore>,
config: DataStoreConfig,
- last_digest: Option<[u8; 32]>,
) -> Result<DataStoreImpl, Error> {
let mut gc_status_path = chunk_store.base_path();
gc_status_path.push(".gc-status");
@@ -506,10 +531,10 @@ impl DataStore {
last_gc_status: Mutex::new(gc_status),
verify_new: config.verify_new.unwrap_or(false),
chunk_order: tuning.chunk_order.unwrap_or_default(),
- last_digest,
sync_level: tuning.sync_level.unwrap_or_default(),
backend_config,
lru_store_caching,
+ cached_config_tag: None,
})
}
--
2.47.3