[pbs-devel] [RFC backup] Bloom filter for chunks
Dietmar Maurer
dietmar at proxmox.com
Wed Jul 29 09:58:55 CEST 2020
---
I just want to share this idea, but I have no real use case for now...
Cargo.toml | 1 +
src/tools.rs | 1 +
src/tools/chunk_bloom_filter.rs | 66 +++++++++++++++++++++++++++++++++
3 files changed, 68 insertions(+)
create mode 100644 src/tools/chunk_bloom_filter.rs
diff --git a/Cargo.toml b/Cargo.toml
index 2306395..5268ec1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,6 +62,7 @@ walkdir = "2"
xdg = "2.2"
zstd = { version = "0.4", features = [ "bindgen" ] }
nom = "5.1"
+bit-vec = "0.5"
[features]
default = []
diff --git a/src/tools.rs b/src/tools.rs
index 44db796..f6656f9 100644
--- a/src/tools.rs
+++ b/src/tools.rs
@@ -35,6 +35,7 @@ pub mod timer;
pub mod statistics;
pub mod systemd;
pub mod nom;
+pub mod chunk_bloom_filter;
mod wrapped_reader_stream;
pub use wrapped_reader_stream::*;
diff --git a/src/tools/chunk_bloom_filter.rs b/src/tools/chunk_bloom_filter.rs
new file mode 100644
index 0000000..32a43c0
--- /dev/null
+++ b/src/tools/chunk_bloom_filter.rs
@@ -0,0 +1,66 @@
+use bit_vec::BitVec;
+
+/// Bloom filter for chunks
+///
+/// This Bloom filter directly uses the chunk digest as hash, split
+/// into 8 parts.
+///
+/// formulas from https://en.wikipedia.org/wiki/Bloom_filter:
+///
+/// k = (m/n) * ln_2 ==> m = (k*n)/ln_2
+///
+/// We use k=8, so false positives rate should be around 2%
+///
+/// Memory usage in bytes = expected_entries*1.44
+pub struct ChunkBloomFilter {
+ bits: BitVec,
+}
+
+impl ChunkBloomFilter {
+
+ fn needed_bits(expected_entries: usize) -> usize {
+ const K: usize = 8;
+
+ (((K * expected_entries) as f64)/core::f64::consts::LN_2) as usize
+ }
+
+ pub fn new(expected_entries: usize) -> Self {
+ let n = Self::needed_bits(expected_entries);
+ Self {
+ bits: BitVec::with_capacity(n)
+ }
+ }
+
+ pub fn insert(&mut self, digest: &[u8; 32]) -> bool {
+
+ let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) };
+
+ let mut contained = true;
+
+ for i in 0..8 {
+ let index = (hashes[i] as usize) % self.bits.len();
+ if !self.bits.get(index).unwrap() {
+ self.bits.set(index, true);
+ contained = false;
+ }
+ }
+
+ contained
+ }
+
+ pub fn contains(&mut self, digest: &[u8; 32]) -> bool {
+
+ let hashes: &[u32; 8] = unsafe { std::mem::transmute(digest) };
+
+ let mut contained = true;
+
+ for i in 0..8 {
+ let index = (hashes[i] as usize) % self.bits.len();
+ if !self.bits.get(index).unwrap() {
+ contained = false;
+ }
+ }
+
+ contained
+ }
+}
--
2.20.1
More information about the pbs-devel
mailing list