[pbs-devel] [PATCH proxmox 1/2] add tools/zero: add fast zero comparison code

Dominik Csapak d.csapak at proxmox.com
Fri Dec 11 13:08:57 CET 2020


that can make use of see/avx instructions where available

this is mostly a direct translation of qemu's util/bufferiszero.c

this is originally from Wolfgang Bumiller

Signed-off-by: Dominik Csapak <d.csapak at proxmox.com>
---
 proxmox/src/tools/mod.rs  |   1 +
 proxmox/src/tools/zero.rs | 233 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 234 insertions(+)
 create mode 100644 proxmox/src/tools/zero.rs

diff --git a/proxmox/src/tools/mod.rs b/proxmox/src/tools/mod.rs
index ff3a720..49418a4 100644
--- a/proxmox/src/tools/mod.rs
+++ b/proxmox/src/tools/mod.rs
@@ -20,6 +20,7 @@ pub mod serde;
 pub mod time;
 pub mod uuid;
 pub mod vec;
+pub mod zero;
 
 #[cfg(feature = "websocket")]
 pub mod websocket;
diff --git a/proxmox/src/tools/zero.rs b/proxmox/src/tools/zero.rs
new file mode 100644
index 0000000..7493262
--- /dev/null
+++ b/proxmox/src/tools/zero.rs
@@ -0,0 +1,233 @@
+#[cfg(test)]
+mod test {
+    use std::mem;
+
+    pub(super) fn do_zero_test(func: fn(&[u8]) -> bool, short: bool) {
+        let mut buf: [u8; 512] = unsafe { mem::zeroed() };
+        assert_eq!(func(&buf), true);
+        for i in 0..buf.len() {
+            buf[i] = 1;
+            assert_eq!(func(&buf), false);
+            buf[i] = 0;
+        }
+        if short {
+            for i in 0..8 {
+                assert_eq!(func(&buf[0..i+1]), true);
+                buf[i] = 1;
+                assert_eq!(func(&buf[0..i+1]), false);
+                buf[i] = 0;
+            }
+        }
+    }
+}
+
+//#[cfg(all(target_arch = "x86_64", target_feature = "sse4"))]
+#[cfg(target_arch = "x86_64")]
+mod x86_64 {
+    use std::arch::x86_64::*;
+
+    const BIT_OSXSAVE: u32 = 1<<27;
+    const BIT_SSE2:    u32 = 1<<26;
+    const BIT_SSE4_1:  u32 = 1<<19;
+    const BIT_AVX:     u32 = 1<<28;
+    const BIT_AVX2:    u32 = 1<< 5;
+
+    // Direct translation of buffer_zero_sse2() of qemu's util/bufferiszero.c
+    fn buffer_is_zero_sse2(buf_slice: &[u8]) -> bool {
+        unsafe {
+            let len = buf_slice.len();
+            let buf = buf_slice.as_ptr() as *const u8;
+            let mut t = _mm_loadu_si128(buf as *const __m128i);
+            let mut p = ((buf as usize + 5*16) & !0xf) as *const __m128i;
+            let e = ((buf as usize + len) & !0xf) as *const __m128i;
+            let zero: __m128i = _mm_setzero_si128();
+            while p <= e {
+                _mm_prefetch(p as *const i8, _MM_HINT_T0);
+                t = _mm_cmpeq_epi8(t, zero);
+                if _mm_movemask_epi8(t) != 0xFFFF {
+                    return false;
+                }
+                t = *p.offset(-4);
+                t = _mm_or_si128(t, *p.offset(-3));
+                t = _mm_or_si128(t, *p.offset(-2));
+                t = _mm_or_si128(t, *p.offset(-1));
+                p = p.offset(4);
+            }
+            t = _mm_or_si128(t, *e.offset(-3));
+            t = _mm_or_si128(t, *e.offset(-2));
+            t = _mm_or_si128(t, *e.offset(-1));
+            t = _mm_or_si128(t, _mm_loadu_si128(
+                buf.add(len-16) as *const __m128i));
+            return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+        }
+    }
+    #[test]
+    fn test_sse2() {
+        super::test::do_zero_test(buffer_is_zero_sse2, false);
+    }
+
+    // Direct translation of buffer_zero_sse4() of qemu's util/bufferiszero.c
+    fn buffer_is_zero_sse4_1(buf_slice: &[u8]) -> bool {
+        unsafe {
+            let len = buf_slice.len();
+            let buf = buf_slice.as_ptr() as *const u8;
+            let mut t = _mm_loadu_si128(buf as *const __m128i);
+            let mut p = ((buf as usize + 5*16) & !0xf) as *const __m128i;
+            let e = ((buf as usize + len) & !0xf) as *const __m128i;
+            while p <= e {
+                _mm_prefetch(p as *const i8, _MM_HINT_T0);
+                if _mm_testz_si128(t, t) == 0 {
+                    return false;
+                }
+                t = *p.offset(-4);
+                t = _mm_or_si128(t, *p.offset(-3));
+                t = _mm_or_si128(t, *p.offset(-2));
+                t = _mm_or_si128(t, *p.offset(-1));
+                p = p.offset(4);
+            }
+            t = _mm_or_si128(t, *e.offset(-3));
+            t = _mm_or_si128(t, *e.offset(-2));
+            t = _mm_or_si128(t, *e.offset(-1));
+            t = _mm_or_si128(t, _mm_loadu_si128(
+                buf.add(len-16) as *const __m128i));
+            return _mm_testz_si128(t, t) != 0;
+        }
+    }
+    #[test]
+    fn test_sse4_1() {
+        super::test::do_zero_test(buffer_is_zero_sse4_1, false);
+    }
+
+    // Direct translation of buffer_zero_avx2() of qemu's util/bufferiszero.c
+    fn buffer_is_zero_avx2(buf_slice: &[u8]) -> bool {
+        unsafe {
+            let len = buf_slice.len();
+            let buf = buf_slice.as_ptr() as *const u8;
+            let mut t = _mm256_loadu_si256(buf as *const __m256i);
+            let mut p = ((buf as usize + 5*32) & !0x1f) as *const __m256i;
+            let e = ((buf as usize + len) & !0x1f) as *const __m256i;
+            if p <= e {
+                // loop over 32 byte aligned blocks of 128
+                while p <= e {
+                    _mm_prefetch(p as *const i8, _MM_HINT_T0);
+                    if _mm256_testz_si256(t, t) == 0 {
+                        return false;
+                    }
+                    t = *p.offset(-4);
+                    t = _mm256_or_si256(t, *p.offset(-3));
+                    t = _mm256_or_si256(t, *p.offset(-2));
+                    t = _mm256_or_si256(t, *p.offset(-1));
+                    p = p.offset(4);
+                }
+                t = _mm256_or_si256(t, _mm256_loadu_si256(buf.add(len - 4*32) as *const __m256i));
+                t = _mm256_or_si256(t, _mm256_loadu_si256(buf.add(len - 3*32) as *const __m256i));
+            } else {
+                t = _mm256_or_si256(t, _mm256_loadu_si256(
+                    buf.add(32) as *const __m256i));
+                if len > 128 {
+                    t = _mm256_or_si256(t, _mm256_loadu_si256(buf.add(len - 4*32) as *const __m256i));
+                    t = _mm256_or_si256(t, _mm256_loadu_si256(buf.add(len - 3*32) as *const __m256i));
+                }
+            }
+            t = _mm256_or_si256(t, _mm256_loadu_si256(buf.add(len - 2*32) as *const __m256i));
+            t = _mm256_or_si256(t, _mm256_loadu_si256(buf.add(len - 1*32) as *const __m256i));
+            return _mm256_testz_si256(t, t) != 0;
+        }
+    }
+    #[test]
+    fn test_avx2() {
+        super::test::do_zero_test(buffer_is_zero_avx2, false);
+    }
+
+    // From qemu's (util/bufferiszero.c) init_cpuid_cache() + init_accel()
+    pub(super) fn init() {
+        unsafe {
+            let (max, _) = __get_cpuid_max(0);
+            if max >= 1 {
+                let id = __cpuid(1);
+                let avx_bits = BIT_OSXSAVE | BIT_AVX;
+                if (id.ecx & avx_bits) == avx_bits && max >= 7 {
+                    let bv = _xgetbv(0);
+                    let id70 = __cpuid_count(7, 0);
+                    if (bv & 6) == 6 && (id70.ebx & BIT_AVX2) == BIT_AVX2 {
+                        super::BUFFER_IS_ZERO_FUNC = buffer_is_zero_avx2;
+                        return;
+                    }
+                }
+
+                if (id.ecx & BIT_SSE4_1) == BIT_SSE4_1 {
+                    super::BUFFER_IS_ZERO_FUNC = buffer_is_zero_sse4_1;
+                    return;
+                }
+                if (id.edx & BIT_SSE2) == BIT_SSE2 {
+                    super::BUFFER_IS_ZERO_FUNC = buffer_is_zero_sse2;
+                    return
+                }
+            }
+            super::BUFFER_IS_ZERO_FUNC = super::buffer_is_zero_compat;
+        }
+    }
+}
+
+fn buffer_is_zero_compat(buf: &[u8]) -> bool{
+    let len = buf.len();
+    if len < 8 {
+        return buf.iter().fold(0, |a, x| a|x) == 0;
+    }
+
+    unsafe {
+        let mut ptr = buf.as_ptr() as *const u64;
+        let end = ((buf.as_ptr() as usize + len) & !7) as *const u64;
+        let mut t = ptr.read_unaligned();
+        ptr = ((ptr as usize + 8) & !7) as *const u64;
+        while ptr.add(8) <= end {
+            // XXX: add a prefetch_read_data() once it's stable...
+            if t != 0 {
+                return false;
+            }
+
+            t = *ptr | *ptr.add(1) | *ptr.add(2) | *ptr.add(3)
+              | *ptr.add(4) | *ptr.add(5) | *ptr.add(6) | *ptr.add(7);
+            ptr = ptr.add(8);
+        }
+        while ptr < end {
+            t |= *ptr;
+            ptr = ptr.add(1);
+        }
+        t |= end.offset(-1).read_unaligned();
+        return t == 0;
+    }
+}
+
+#[test]
+fn test_zero_compat() {
+    test::do_zero_test(buffer_is_zero_compat, true);
+}
+
+static BUF_IS_ZERO_GUARD: ::std::sync::Once = ::std::sync::Once::new();
+pub(self) static mut BUFFER_IS_ZERO_FUNC: fn(&[u8]) -> bool
+    = first_buffer_is_zero;
+
+fn first_buffer_is_zero(buf: &[u8]) -> bool {
+    BUF_IS_ZERO_GUARD.call_once(|| {
+        if cfg!(target_arch = "x86_64") {
+            x86_64::init();
+        }
+    });
+    return unsafe { BUFFER_IS_ZERO_FUNC(buf) };
+}
+
+pub fn buffer_is_zero(buf: &[u8]) -> bool {
+    if buf.len() == 0 {
+        return false;
+    }
+    if buf.len() < 64 {
+        return buffer_is_zero_compat(buf);
+    }
+    return unsafe { BUFFER_IS_ZERO_FUNC(buf) };
+}
+
+#[test]
+fn test_initialization() {
+    test::do_zero_test(buffer_is_zero, false);
+}
-- 
2.20.1






More information about the pbs-devel mailing list