Optimize SIMD impl of lines_fwd and lines_bwd

Benchmark results on AMD Zen4: ``` simd/lines_fwd/1 time: [1.4801 ns 1.4803 ns 1.4804 ns] thrpt: [644.19 MiB/s 644.26 MiB/s 644.34 MiB/s] change: time: [−38.958% −38.933% −38.898%] (p = 0.00 < 0.05) thrpt: [+63.661% +63.755% +63.821%] Performance has improved. simd/lines_fwd/8 time: [3.8571 ns 3.8607 ns 3.8641 ns] thrpt: [1.9282 GiB/s 1.9299 GiB/s 1.9316 GiB/s] change: time: [−19.458% −19.423% −19.394%] (p = 0.00 < 0.05) thrpt: [+24.060% +24.105% +24.159%] Performance has improved. simd/lines_fwd/128 time: [14.064 ns 14.092 ns 14.120 ns] thrpt: [8.4426 GiB/s 8.4592 GiB/s 8.4764 GiB/s] change: time: [−6.0955% −5.8911% −5.6706%] (p = 0.00 < 0.05) thrpt: [+6.0115% +6.2599% +6.4912%] Performance has improved. simd/lines_fwd/1024 time: [18.160 ns 18.178 ns 18.195 ns] thrpt: [52.415 GiB/s 52.462 GiB/s 52.516 GiB/s] change: time: [−4.1174% −3.9973% −3.8859%] (p = 0.00 < 0.05) thrpt: [+4.0430% +4.1637% +4.2942%] Performance has improved. simd/lines_fwd/131072 time: [871.09 ns 871.14 ns 871.21 ns] thrpt: [140.12 GiB/s 140.13 GiB/s 140.14 GiB/s] change: time: [−10.451% −10.405% −10.358%] (p = 0.00 < 0.05) thrpt: [+11.555% +11.613% +11.670%] Performance has improved. simd/lines_fwd/134217728 time: [1.7326 ms 1.7332 ms 1.7338 ms] thrpt: [72.094 GiB/s 72.120 GiB/s 72.146 GiB/s] change: time: [−0.4091% −0.2348% −0.0670%] (p = 0.00 < 0.05) thrpt: [+0.0671% +0.2353% +0.4108%] Change within noise threshold. ```
2025-08-04 10:59:57 +00:00 · 2025-06-26 21:45:45 +08:00 · 2025-06-26 21:45:45 +08:00 · e20e0061dc
commit e20e0061dc
parent 7ece8c5a38
2 changed files with 24 additions and 20 deletions
--- a/src/simd/lines_bwd.rs
+++ b/src/simd/lines_bwd.rs
@ -110,9 +110,12 @@ unsafe fn lines_bwd_avx2(

        let lf = _mm256_set1_epi8(b'\n' as i8);
        let line_stop = line_stop.min(line);
-        let mut remaining = end.offset_from_unsigned(beg);
+        let off = end.addr() & 31;
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
+        }

-        while remaining >= 128 {
+        while end.offset_from_unsigned(beg) >= 128 {
            let chunk_start = end.sub(128);

            let v1 = _mm256_loadu_si256(chunk_start.add(0) as *const _);
@ -135,11 +138,10 @@ unsafe fn lines_bwd_avx2(
            }

            end = chunk_start;
-            remaining -= 128;
            line = line_next;
        }

-        while remaining >= 32 {
+        while end.offset_from_unsigned(beg) >= 32 {
            let chunk_start = end.sub(32);
            let v = _mm256_loadu_si256(chunk_start as *const _);
            let c = _mm256_cmpeq_epi8(v, lf);
@ -154,7 +156,6 @@ unsafe fn lines_bwd_avx2(
            }

            end = chunk_start;
-            remaining -= 32;
            line = line_next;
        }

@ -174,9 +175,12 @@ unsafe fn lines_bwd_neon(

        let lf = vdupq_n_u8(b'\n');
        let line_stop = line_stop.min(line);
-        let mut remaining = end.offset_from_unsigned(beg);
+        let off = end.addr() & 15;
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
+        }

-        while remaining >= 64 {
+        while end.offset_from_unsigned(beg) >= 64 {
            let chunk_start = end.sub(64);

            let v1 = vld1q_u8(chunk_start.add(0));
@ -198,11 +202,10 @@ unsafe fn lines_bwd_neon(
            }

            end = chunk_start;
-            remaining -= 64;
            line = line_next;
        }

-        while remaining >= 16 {
+        while end.offset_from_unsigned(beg) >= 16 {
            let chunk_start = end.sub(16);
            let v = vld1q_u8(chunk_start);
            let c = vceqq_u8(v, lf);
@ -215,7 +218,6 @@ unsafe fn lines_bwd_neon(
            }

            end = chunk_start;
-            remaining -= 16;
            line = line_next;
        }

--- a/src/simd/lines_fwd.rs
+++ b/src/simd/lines_fwd.rs
@ -109,12 +109,15 @@ unsafe fn lines_fwd_avx2(
        }

        let lf = _mm256_set1_epi8(b'\n' as i8);
-        let mut remaining = end.offset_from_unsigned(beg);
+        let off = beg.align_offset(32);
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop);
+        }

        if line < line_stop {
            // Unrolling the loop by 4x speeds things up by >3x.
            // It allows us to accumulate matches before doing a single `vpsadbw`.
-            while remaining >= 128 {
+            while end.offset_from_unsigned(beg) >= 128 {
                let v1 = _mm256_loadu_si256(beg.add(0) as *const _);
                let v2 = _mm256_loadu_si256(beg.add(32) as *const _);
                let v3 = _mm256_loadu_si256(beg.add(64) as *const _);
@ -138,11 +141,10 @@ unsafe fn lines_fwd_avx2(
                }

                beg = beg.add(128);
-                remaining -= 128;
                line = line_next;
            }

-            while remaining >= 32 {
+            while end.offset_from_unsigned(beg) >= 32 {
                let v = _mm256_loadu_si256(beg as *const _);
                let c = _mm256_cmpeq_epi8(v, lf);

@ -159,7 +161,6 @@ unsafe fn lines_fwd_avx2(
                }

                beg = beg.add(32);
-                remaining -= 32;
                line = line_next;
            }
        }
@ -179,10 +180,13 @@ unsafe fn lines_fwd_neon(
        use std::arch::aarch64::*;

        let lf = vdupq_n_u8(b'\n');
-        let mut remaining = end.offset_from_unsigned(beg);
+        let off = beg.align_offset(16);
+        if off != 0 && off < end.offset_from_unsigned(beg) {
+            (beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop);
+        }

        if line < line_stop {
-            while remaining >= 64 {
+            while end.offset_from_unsigned(beg) >= 64 {
                let v1 = vld1q_u8(beg.add(0));
                let v2 = vld1q_u8(beg.add(16));
                let v3 = vld1q_u8(beg.add(32));
@ -204,11 +208,10 @@ unsafe fn lines_fwd_neon(
                }

                beg = beg.add(64);
-                remaining -= 64;
                line = line_next;
            }

-            while remaining >= 16 {
+            while end.offset_from_unsigned(beg) >= 16 {
                let v = vld1q_u8(beg);
                let c = vceqq_u8(v, lf);
                let c = vandq_u8(c, vdupq_n_u8(0x01));
@ -220,7 +223,6 @@ unsafe fn lines_fwd_neon(
                }

                beg = beg.add(16);
-                remaining -= 16;
                line = line_next;
            }
        }