[pbs-devel] [PATCH v2 proxmox-backup 2/3] debug cli: add 'compare-content' flag to `diff archive` command

Wolfgang Bumiller w.bumiller at proxmox.com
Tue Dec 6 12:39:41 CET 2022


On Mon, Dec 05, 2022 at 03:55:13PM +0100, Lukas Wagner wrote:
> When --compare-content is set, the command will compare the
> file content instead on relying on mtime to detect modified files.
> 
> Signed-off-by: Lukas Wagner <l.wagner at proxmox.com>
> ---
> Changes from v1:
>   - Made `changed indicator for file content` a bit less confusing.
>     For regular files, it is now only displayed if
>       - the --comapare-content flag is set and
>       - the file contents *actually* differ
> 
>   - Removed unnecessesary namespace prefix for std::task::Pin in unit tests
> 
>  src/bin/proxmox_backup_debug/diff.rs | 157 +++++++++++++++++++++++++--
>  1 file changed, 145 insertions(+), 12 deletions(-)
> 
> diff --git a/src/bin/proxmox_backup_debug/diff.rs b/src/bin/proxmox_backup_debug/diff.rs
> index d5a4a1fe..b5ecd721 100644
> --- a/src/bin/proxmox_backup_debug/diff.rs
> +++ b/src/bin/proxmox_backup_debug/diff.rs
> @@ -28,12 +28,15 @@ use pbs_tools::json::required_string_param;
>  use pxar::accessor::ReadAt;
>  use pxar::EntryKind;
>  use serde_json::Value;
> +use tokio::io::AsyncReadExt;
>  
>  type ChunkDigest = [u8; 32];
>  type FileEntry = pxar::accessor::aio::FileEntry<Arc<dyn ReadAt + Send + Sync>>;
>  type Accessor = pxar::accessor::aio::Accessor<Arc<dyn ReadAt + Send + Sync>>;
>  type Directory = pxar::accessor::aio::Directory<Arc<dyn ReadAt + Send + Sync>>;
>  
> +const BUFFERSIZE: usize = 1024;

Please bump this ito 4k. Full page sizes have the potential to perform
better (even though in this case we'll rarely ever be page aligned, but it
won't hurt...)

> +
>  pub fn diff_commands() -> CommandLineInterface {
>      let cmd_def = CliCommandMap::new().insert(
>          "archive",
> @@ -79,18 +82,30 @@ pub fn diff_commands() -> CommandLineInterface {
>                  schema: KEYFD_SCHEMA,
>                  optional: true,
>              },
> +            "compare-content": {
> +                optional: true,
> +                type: bool,
> +                description: "Compare file content rather than solely relying on mtime for detecting modified files.",
> +            },
>          }
>      }
>  )]
>  /// Diff an archive in two snapshots. The command will output a list of added, modified and deleted files.
> -/// For modified files, only the file metadata (e.g. mtime, size, etc.) will be considered. The actual
> -/// file contents will not be compared.
> +/// For modified files, the file metadata (e.g. mode, uid, gid, size, etc.) will be considered. For detecting
> +/// modification of file content, only mtime will be used by default. If the --compare-content flag is provided,
> +/// mtime is ignored and file content will be compared.
>  async fn diff_archive_cmd(param: Value) -> Result<(), Error> {
>      let repo = extract_repository_from_value(&param)?;
>      let snapshot_a = required_string_param(&param, "prev-snapshot")?;
>      let snapshot_b = required_string_param(&param, "snapshot")?;
>      let archive_name = required_string_param(&param, "archive-name")?;
>  
> +    let compare_contents = match param.get("compare-content") {
> +        Some(Value::Bool(value)) => *value,
> +        Some(_) => bail!("invalid flag for compare-content"),
> +        None => false,
> +    };

^ Let's start to move parameters into the function signature here.
Should work fine for most of them.

The `#[api]` macro does support mixed parameters as well so you can
still use the `extract_repository_from_value` helper. Eg. you could do:

    async fn diff_archive_cmd(
        prev_snapshot: String,
        snapshot: String,
        archive_name: String,
        param: Value,
    ) -> Result<(), Error> {

Though you'll still need to have `let` bindings for the "a" and "b"
renaming, but that'll be easy & clear to read.

> +
>      let namespace = match param.get("ns") {
>          Some(Value::String(ns)) => ns.parse()?,
>          Some(_) => bail!("invalid namespace parameter"),
> @@ -120,7 +135,14 @@ async fn diff_archive_cmd(param: Value) -> Result<(), Error> {
>  
>      if archive_name.ends_with(".pxar") {
>          let file_name = format!("{}.didx", archive_name);
> -        diff_archive(snapshot_a, snapshot_b, &file_name, &repo_params).await?;
> +        diff_archive(
> +            snapshot_a,
> +            snapshot_b,
> +            &file_name,
> +            &repo_params,
> +            compare_contents,
> +        )
> +        .await?;
>      } else {
>          bail!("Only .pxar files are supported");
>      }
> @@ -133,6 +155,7 @@ async fn diff_archive(
>      snapshot_b: &str,
>      file_name: &str,
>      repo_params: &RepoParams,
> +    compare_contents: bool,
>  ) -> Result<(), Error> {
>      let (index_a, accessor_a) = open_dynamic_index(snapshot_a, file_name, repo_params).await?;
>      let (index_b, accessor_b) = open_dynamic_index(snapshot_b, file_name, repo_params).await?;
> @@ -184,7 +207,7 @@ async fn diff_archive(
>  
>      // ... so we compare the file metadata/contents to narrow the selection down to files
>      // which where *really* modified.
> -    let modified_files = compare_files(potentially_modified).await?;
> +    let modified_files = compare_files(potentially_modified, compare_contents).await?;
>  
>      show_file_list(&added_files, &deleted_files, &modified_files);
>  
> @@ -352,11 +375,12 @@ fn visit_directory<'f, 'c>(
>  /// Check if files were actually modified
>  async fn compare_files<'a>(
>      files: HashMap<&'a OsStr, (&'a FileEntry, &'a FileEntry)>,
> +    compare_contents: bool,
>  ) -> Result<HashMap<&'a OsStr, (&'a FileEntry, ChangedProperties)>, Error> {
>      let mut modified_files = HashMap::new();
>  
>      for (path, (entry_a, entry_b)) in files {
> -        if let Some(changed) = compare_file(entry_a, entry_b).await? {
> +        if let Some(changed) = compare_file(entry_a, entry_b, compare_contents).await? {
>              modified_files.insert(path, (entry_b, changed));
>          }
>      }
> @@ -367,6 +391,7 @@ async fn compare_files<'a>(
>  async fn compare_file(
>      file_a: &FileEntry,
>      file_b: &FileEntry,
> +    compare_contents: bool,
>  ) -> Result<Option<ChangedProperties>, Error> {
>      let mut changed = ChangedProperties::default();
>  
> @@ -385,10 +410,22 @@ async fn compare_file(
>              changed.content = a.major != b.major || a.minor != b.minor
>          }
>          (EntryKind::File { size: size_a, .. }, EntryKind::File { size: size_b, .. }) => {
> -            if size_a != size_b {
> -                changed.size = true;
> -                changed.content = true;
> -            };
> +            changed.size = size_a != size_b;
> +
> +            if compare_contents {
> +                if changed.size {
> +                    changed.content = true;
> +                } else {
> +                    let content_identical = compare_file_contents(file_a, file_b).await?;
> +                    if content_identical && !changed.any_without_mtime() {
> +                        // If the content is identical and nothing, exluding mtime,
> +                        // has changed, we don't consider the entry as modified.
> +                        changed.mtime = false;
> +                    }
> +
> +                    changed.content = !content_identical;
> +                }
> +            }
>          }
>          (EntryKind::Directory, EntryKind::Directory) => {}
>          (EntryKind::Socket, EntryKind::Socket) => {}
> @@ -405,6 +442,44 @@ async fn compare_file(
>      }
>  }
>  
> +async fn compare_file_contents(file_a: &FileEntry, file_b: &FileEntry) -> Result<bool, Error> {
> +    let mut contents_a = file_a.contents().await?;
> +    let mut contents_b = file_b.contents().await?;
> +
> +    compare_readers(&mut contents_a, &mut contents_b).await
> +}
> +
> +async fn compare_readers<T>(reader_a: &mut T, reader_b: &mut T) -> Result<bool, Error>
> +where
> +    T: AsyncReadExt + Unpin,
> +{
> +    let mut buf_a = vec![0u8; BUFFERSIZE];
> +    let mut buf_b = vec![0u8; BUFFERSIZE];

Maybe use a Box instead of a Vec, we don't ever resize this atm.

> +
> +    loop {
> +        // normally, a join! would be nice here, but this leads to
> +        // some 'higher-order lifetime error' for which I'm not smart enough to solve.
> +        // let (bytes_read_a, bytes_read_b) =
> +        //     tokio::try_join!(reader_a.read(&mut buf_a), reader_b.read(&mut buf_b))?;

The compile error also isn't useful at all.

Given that all it takes to make it work is to wrap the futures in an
async {} block, I'd call this a compiler bug. If you manage to find a
minimal reproducer w/o requiring pbs code, you could report it.

This one works:

    let (bytes_read_a, bytes_read_b) =
        tokio::try_join!(
            async { reader_a.read(&mut buf_a).await },
            async { reader_b.read(&mut buf_b).await },
        )?;

Note that your commented-out code itself does in fact compile fine, and
can even be used "normally", only when reached *some* way through an
#[api] fn does it error, which probably comes from the use of the
`ApiHandler` enum...

> +        let bytes_read_a = reader_a.read(&mut buf_a).await?;
> +        let bytes_read_b = reader_b.read(&mut buf_b).await?;
> +
> +        if bytes_read_a != bytes_read_b {
> +            return Ok(false);
> +        }
> +
> +        if bytes_read_a == 0 {
> +            break;
> +        }
> +
> +        if buf_a[..bytes_read_a] != buf_b[..bytes_read_b] {
> +            return Ok(false);
> +        }
> +    }
> +
> +    Ok(true)
> +}
> +
>  #[derive(Copy, Clone, Default)]
>  struct ChangedProperties {
>      entry_type: bool,
> @@ -438,8 +513,11 @@ impl ChangedProperties {
>      }
>  
>      fn any(&self) -> bool {
> +        self.any_without_mtime() || self.mtime
> +    }
> +
> +    fn any_without_mtime(&self) -> bool {
>          self.entry_type
> -            || self.mtime
>              || self.acl
>              || self.xattrs
>              || self.fcaps
> @@ -475,9 +553,10 @@ fn format_filesize(entry: &FileEntry, changed: bool) -> String {
>  fn format_mtime(entry: &FileEntry, changed: bool) -> String {
>      let mtime = &entry.metadata().stat.mtime;
>  
> -    let format = if changed { "*%F %T" } else { " %F %T" };
> +    let mut format = change_indicator(changed).to_owned();
> +    format.push_str("%F %T");
>  
> -    proxmox_time::strftime_local(format, mtime.secs).unwrap_or_default()
> +    proxmox_time::strftime_local(&format, mtime.secs).unwrap_or_default()
>  }
>  
>  fn format_mode(entry: &FileEntry, changed: bool) -> String {
> @@ -553,3 +632,57 @@ fn show_file_list(
>          println!("{op} {entry_type:>2} {mode:>5} {uid:>6} {gid:>6} {size:>10} {mtime:11} {name}");
>      }
>  }
> +
> +#[cfg(test)]
> +mod tests {
> +    use super::*;
> +
> +    use std::{
> +        io::Cursor,
> +        pin::Pin,
> +        task::{Context, Poll},
> +    };
> +    use tokio::io::{AsyncRead, ReadBuf};
> +
> +    struct MockedAsyncReader(Cursor<Vec<u8>>);
> +
> +    impl AsyncRead for MockedAsyncReader {
> +        fn poll_read(
> +            self: Pin<&mut Self>,
> +            _cx: &mut Context<'_>,
> +            read_buf: &mut ReadBuf<'_>,
> +        ) -> Poll<std::io::Result<()>> {
> +            let mut buf = vec![0u8; 100];
> +
> +            let res = std::io::Read::read(&mut self.get_mut().0, &mut buf);
> +
> +            if let Ok(bytes) = res {
> +                read_buf.put_slice(&buf[..bytes]);
> +            }
> +
> +            Poll::Ready(res.map(|_| ()))
> +        }
> +    }
> +
> +    #[test]
> +    fn test_do_compare_file_contents() {
> +        fn compare(a: Vec<u8>, b: Vec<u8>) -> Result<bool, Error> {
> +            let mut mock_a = MockedAsyncReader(Cursor::new(a));
> +            let mut mock_b = MockedAsyncReader(Cursor::new(b));
> +
> +            proxmox_async::runtime::block_on(compare_readers(&mut mock_a, &mut mock_b))
> +        }
> +
> +        assert!(matches!(compare(vec![0; 15], vec![0; 15]), Ok(true)));
> +        assert!(matches!(compare(vec![0; 15], vec![0; 14]), Ok(false)));
> +        assert!(matches!(compare(vec![0; 15], vec![1; 15]), Ok(false)));
> +
> +        let mut buf = vec![1u8; 2 * BUFFERSIZE];
> +        buf[BUFFERSIZE] = 0;
> +        assert!(matches!(compare(vec![1u8; 2 * BUFFERSIZE], buf), Ok(false)));
> +
> +        let mut buf = vec![1u8; 2 * BUFFERSIZE];
> +        buf[2 * BUFFERSIZE - 1] = 0;
> +        assert!(matches!(compare(vec![1u8; 2 * BUFFERSIZE], buf), Ok(false)));
> +    }
> +}
> -- 
> 2.30.2





More information about the pbs-devel mailing list