[pbs-devel] [PATCH v8 proxmox-backup 48/69] pxar: caching: add look-ahead cache

Fabian Grünbichler f.gruenbichler at proxmox.com
Tue Jun 4 11:35:15 CEST 2024


On May 28, 2024 11:42 am, Christian Ebner wrote:
> Add a lookahead cache and the necessary types to store the required
> data and keep track of directory boundaries while traversing the
> filesystem tree, in order to postpone the decision whether to reuse or
> re-encode a given regular file with unchanged metadata.
> 
> Signed-off-by: Christian Ebner <c.ebner at proxmox.com>
> ---
> changes since version 7:
> - no changes
> 
> changes since version 6:
> - add PxarLookaheadCache and refactor some of the logic to be contained
>   within this patch
> 
>  pbs-client/src/pxar/create.rs           |   2 +-
>  pbs-client/src/pxar/look_ahead_cache.rs | 165 ++++++++++++++++++++++++
>  pbs-client/src/pxar/mod.rs              |   1 +
>  3 files changed, 167 insertions(+), 1 deletion(-)
>  create mode 100644 pbs-client/src/pxar/look_ahead_cache.rs
> 
> diff --git a/pbs-client/src/pxar/create.rs b/pbs-client/src/pxar/create.rs
> index ac8827bb2..6127aa88f 100644
> --- a/pbs-client/src/pxar/create.rs
> +++ b/pbs-client/src/pxar/create.rs
> @@ -131,7 +131,7 @@ impl fmt::Display for ArchiveError {
>  }
>  
>  #[derive(Eq, PartialEq, Hash)]
> -struct HardLinkInfo {
> +pub(crate) struct HardLinkInfo {
>      st_dev: u64,
>      st_ino: u64,
>  }
> diff --git a/pbs-client/src/pxar/look_ahead_cache.rs b/pbs-client/src/pxar/look_ahead_cache.rs
> new file mode 100644
> index 000000000..539586271
> --- /dev/null
> +++ b/pbs-client/src/pxar/look_ahead_cache.rs
> @@ -0,0 +1,165 @@
> +use std::collections::HashSet;
> +use std::ffi::CString;
> +use std::ops::Range;
> +use std::os::unix::io::OwnedFd;
> +use std::path::PathBuf;
> +
> +use nix::sys::stat::FileStat;
> +
> +use pxar::encoder::PayloadOffset;
> +use pxar::Metadata;
> +
> +use super::create::*;
> +
> +const DEFAULT_CACHE_SIZE: usize = 512;
> +
> +pub(crate) struct CacheEntryData {
> +    pub(crate) fd: OwnedFd,
> +    pub(crate) c_file_name: CString,
> +    pub(crate) stat: FileStat,
> +    pub(crate) metadata: Metadata,
> +    pub(crate) payload_offset: PayloadOffset,
> +}
> +
> +pub(crate) enum CacheEntry {
> +    RegEntry(CacheEntryData),
> +    DirEntry(CacheEntryData),
> +    DirEnd,
> +}
> +
> +pub(crate) struct PxarLookaheadCache {
> +    // Current state of the cache
> +    enabled: bool,
> +    // Cached entries
> +    entries: Vec<CacheEntry>,
> +    // Entries encountered having more than one link given by stat
> +    hardlinks: HashSet<HardLinkInfo>,
> +    // Payload range covered by the currently cached entries
> +    range: Range<u64>,
> +    // Possible held back last chunk from last flush, used for possible chunk continuation
> +    last_chunk: Option<ReusableDynamicEntry>,
> +    // Path when started caching
> +    start_path: PathBuf,
> +    // Number of entries with file descriptors
> +    fd_entries: usize,
> +    // Max number of entries with file descriptors
> +    cache_size: usize,
> +}
> +
> +impl PxarLookaheadCache {
> +    pub(crate) fn new(size: Option<usize>) -> Self {
> +        Self {
> +            enabled: false,
> +            entries: Vec::new(),
> +            hardlinks: HashSet::new(),
> +            range: 0..0,
> +            last_chunk: None,
> +            start_path: PathBuf::new(),
> +            fd_entries: 0,
> +            cache_size: size.unwrap_or(DEFAULT_CACHE_SIZE),
> +        }
> +    }
> +
> +    pub(crate) fn is_full(&self) -> bool {
> +        self.fd_entries >= self.cache_size
> +    }
> +
> +    pub(crate) fn caching_enabled(&self) -> bool {
> +        self.enabled
> +    }
> +
> +    pub(crate) fn insert(

Two out of the three calls to this are preceded by the same call to
update_start_path. We could just add the path as a parameter here,
inline that call, and drop update_start_path altogether, AFAICT?

> +        &mut self,
> +        fd: OwnedFd,
> +        c_file_name: CString,
> +        stat: FileStat,
> +        metadata: Metadata,
> +        payload_offset: PayloadOffset,
> +    ) {
> +        self.enabled = true;
> +        self.fd_entries += 1;
> +        if metadata.is_dir() {
> +            self.entries.push(CacheEntry::DirEntry(CacheEntryData {
> +                fd,
> +                c_file_name,
> +                stat,
> +                metadata,
> +                payload_offset,
> +            }))
> +        } else {
> +            self.entries.push(CacheEntry::RegEntry(CacheEntryData {
> +                fd,
> +                c_file_name,
> +                stat,
> +                metadata,
> +                payload_offset,
> +            }))
> +        }
> +    }
> +
> +    pub(crate) fn insert_dir_end(&mut self) {
> +        self.entries.push(CacheEntry::DirEnd);
> +    }
> +
> +    pub(crate) fn take_and_reset(&mut self) -> Vec<CacheEntry> {
> +        self.fd_entries = 0;
> +        self.enabled = false;
> +        self.start_path.clear();

start_path is cleared here, and take_and_reset is called ...

> +        self.clear_range();
> +        std::mem::take(&mut self.entries)
> +    }
> +
> +    pub(crate) fn update_start_path(&mut self, path: PathBuf) {
> +        self.start_path = path;
> +    }
> +
> +    pub(crate) fn start_path(&self) -> &PathBuf {
> +        &self.start_path

... right after the only call to this getter (start_path).

So take_and_reset could just take the path out of the cache as well and
return it together with the entries, and we can drop this getter?

> +    }
> +
> +    pub(crate) fn contains_hardlink(&self, info: &HardLinkInfo) -> bool {
> +        self.hardlinks.contains(info)
> +    }
> +
> +    pub(crate) fn insert_hardlink(&mut self, info: HardLinkInfo) -> bool {
> +        self.hardlinks.insert(info)
> +    }
> +
> +    pub(crate) fn range(&self) -> &Range<u64> {
> +        &self.range
> +    }
> +
> +    pub(crate) fn update_range(&mut self, range: Range<u64>) {
> +        self.range = range;
> +    }
> +
> +    pub(crate) fn clear_range(&mut self) {
> +        // keep end for possible continuation if cache has been cleared because
> +        // it was full, but further caching would be fine
> +        self.range = self.range.end..self.range.end
> +    }

Dangerous name: "clear" to me always implies removing everything. That
is especially risky since there is no doc comment on it that would give
me such important information (the end of the range is kept) at the
call site.

But thankfully this is only called once, and that call is a few lines
above in take_and_reset, so maybe we can just inline it for now and not
expose this to accidents?

> +
> +    pub(crate) fn try_extend_range(&mut self, range: Range<u64>) -> bool {
> +        if self.range.end == 0 {
> +            // initialize first range to start and end with start of new range
> +            self.range.start = range.start;
> +            self.range.end = range.start;
> +        }
> +
> +        // range continued, update end
> +        if self.range.end == range.start {
> +            self.range.end = range.end;
> +            return true;
> +        }
> +
> +        false
> +    }
> +
> +    pub(crate) fn take_last_chunk(&mut self) -> Option<ReusableDynamicEntry> {
> +        self.last_chunk.take()
> +    }
> +
> +    pub(crate) fn update_last_chunk(&mut self, chunk: Option<ReusableDynamicEntry>) {
> +        self.last_chunk = chunk;
> +    }
> +}
> diff --git a/pbs-client/src/pxar/mod.rs b/pbs-client/src/pxar/mod.rs
> index 5248a1956..334759df6 100644
> --- a/pbs-client/src/pxar/mod.rs
> +++ b/pbs-client/src/pxar/mod.rs
> @@ -50,6 +50,7 @@
>  pub(crate) mod create;
>  pub(crate) mod dir_stack;
>  pub(crate) mod extract;
> +pub(crate) mod look_ahead_cache;
>  pub(crate) mod metadata;
>  pub(crate) mod tools;
>  
> -- 
> 2.39.2
> 
> 
> 
> _______________________________________________
> pbs-devel mailing list
> pbs-devel at lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pbs-devel
> 
> 
> 




More information about the pbs-devel mailing list