[pbs-devel] [RFC backup] Bloom filter for chunks

Dietmar Maurer dietmar at proxmox.com
Wed Jul 29 09:58:55 CEST 2020


---

 I just want to share this idea, but I have no real use case for now...

 Cargo.toml                      |  1 +
 src/tools.rs                    |  1 +
 src/tools/chunk_bloom_filter.rs | 66 +++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+)
 create mode 100644 src/tools/chunk_bloom_filter.rs

diff --git a/Cargo.toml b/Cargo.toml
index 2306395..5268ec1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,6 +62,7 @@ walkdir = "2"
 xdg = "2.2"
 zstd = { version = "0.4", features = [ "bindgen" ] }
 nom = "5.1"
+bit-vec = "0.5"
 
 [features]
 default = []
diff --git a/src/tools.rs b/src/tools.rs
index 44db796..f6656f9 100644
--- a/src/tools.rs
+++ b/src/tools.rs
@@ -35,6 +35,7 @@ pub mod timer;
 pub mod statistics;
 pub mod systemd;
 pub mod nom;
+pub mod chunk_bloom_filter;
 
 mod wrapped_reader_stream;
 pub use wrapped_reader_stream::*;
diff --git a/src/tools/chunk_bloom_filter.rs b/src/tools/chunk_bloom_filter.rs
new file mode 100644
index 0000000..32a43c0
--- /dev/null
+++ b/src/tools/chunk_bloom_filter.rs
@@ -0,0 +1,66 @@
+use bit_vec::BitVec;
+
+/// Bloom filter for chunks
+///
+/// This Bloom filter directly uses the chunk digest as hash, split
+/// into 8 parts.
+///
+/// formulas from https://en.wikipedia.org/wiki/Bloom_filter:
+///
+/// k = (m/n) * ln_2 ==> m = (k*n)/ln_2
+///
+/// We use k=8, so false positives rate should be around 2%
+///
+/// Memory usage in bytes = expected_entries*1.44
+pub struct ChunkBloomFilter {
+    bits: BitVec,
+}
+
+impl ChunkBloomFilter {
+
+    fn needed_bits(expected_entries: usize) -> usize {
+        const K: usize = 8;
+
+        (((K * expected_entries) as f64)/core::f64::consts::LN_2) as usize
+    }
+
+    pub fn new(expected_entries: usize) -> Self {
+        let n = Self::needed_bits(expected_entries);
+        Self {
+            bits: BitVec::with_capacity(n)
+        }
+    }
+
+    pub fn insert(&mut self, digest: &[u8; 32]) -> bool {
+
+        let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) };
+
+        let mut contained = true;
+
+        for i in 0..8 {
+            let index = (hashes[i] as usize) % self.bits.len();
+            if !self.bits.get(index).unwrap() {
+                self.bits.set(index, true);
+                contained = false;
+            }
+        }
+
+        contained
+    }
+
+    pub fn contains(&mut self, digest: &[u8; 32]) -> bool {
+
+        let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) };
+
+        let mut contained = true;
+
+        for i in 0..8 {
+            let index = (hashes[i] as usize) % self.bits.len();
+            if !self.bits.get(index).unwrap() {
+                contained = false;
+            }
+        }
+
+        contained
+    }
+}
-- 
2.20.1





More information about the pbs-devel mailing list