Optimize SIMD impl of lines_fwd and lines_bwd

Benchmark results on AMD Zen4:

```
 simd/lines_fwd/1        time:   [1.4801 ns 1.4803 ns 1.4804 ns]
                         thrpt:  [644.19 MiB/s 644.26 MiB/s 644.34 MiB/s]
                  change:
                         time:   [−38.958% −38.933% −38.898%] (p = 0.00 < 0.05)
                         thrpt:  [+63.661% +63.755% +63.821%]
                         Performance has improved.

 simd/lines_fwd/8        time:   [3.8571 ns 3.8607 ns 3.8641 ns]
                         thrpt:  [1.9282 GiB/s 1.9299 GiB/s 1.9316 GiB/s]
                  change:
                         time:   [−19.458% −19.423% −19.394%] (p = 0.00 < 0.05)
                         thrpt:  [+24.060% +24.105% +24.159%]
                         Performance has improved.

 simd/lines_fwd/128      time:   [14.064 ns 14.092 ns 14.120 ns]
                         thrpt:  [8.4426 GiB/s 8.4592 GiB/s 8.4764 GiB/s]
                  change:
                         time:   [−6.0955% −5.8911% −5.6706%] (p = 0.00 < 0.05)
                         thrpt:  [+6.0115% +6.2599% +6.4912%]
                         Performance has improved.

 simd/lines_fwd/1024     time:   [18.160 ns 18.178 ns 18.195 ns]
                         thrpt:  [52.415 GiB/s 52.462 GiB/s 52.516 GiB/s]
                  change:
                         time:   [−4.1174% −3.9973% −3.8859%] (p = 0.00 < 0.05)
                         thrpt:  [+4.0430% +4.1637% +4.2942%]
                         Performance has improved.

 simd/lines_fwd/131072   time:   [871.09 ns 871.14 ns 871.21 ns]
                         thrpt:  [140.12 GiB/s 140.13 GiB/s 140.14 GiB/s]
                  change:
                         time:   [−10.451% −10.405% −10.358%] (p = 0.00 < 0.05)
                         thrpt:  [+11.555% +11.613% +11.670%]
                         Performance has improved.

 simd/lines_fwd/134217728
                         time:   [1.7326 ms 1.7332 ms 1.7338 ms]
                         thrpt:  [72.094 GiB/s 72.120 GiB/s 72.146 GiB/s]
                  change:
                         time:   [−0.4091% −0.2348% −0.0670%] (p = 0.00 < 0.05)
                         thrpt:  [+0.0671% +0.2353% +0.4108%]
                         Change within noise threshold.
```
This commit is contained in:
WANG Rui 2025-06-26 21:45:45 +08:00
parent 7ece8c5a38
commit e20e0061dc
2 changed files with 24 additions and 20 deletions

View file

@ -110,9 +110,12 @@ unsafe fn lines_bwd_avx2(
let lf = _mm256_set1_epi8(b'\n' as i8);
let line_stop = line_stop.min(line);
let mut remaining = end.offset_from_unsigned(beg);
let off = end.addr() & 31;
if off != 0 && off < end.offset_from_unsigned(beg) {
(end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
}
while remaining >= 128 {
while end.offset_from_unsigned(beg) >= 128 {
let chunk_start = end.sub(128);
let v1 = _mm256_loadu_si256(chunk_start.add(0) as *const _);
@ -135,11 +138,10 @@ unsafe fn lines_bwd_avx2(
}
end = chunk_start;
remaining -= 128;
line = line_next;
}
while remaining >= 32 {
while end.offset_from_unsigned(beg) >= 32 {
let chunk_start = end.sub(32);
let v = _mm256_loadu_si256(chunk_start as *const _);
let c = _mm256_cmpeq_epi8(v, lf);
@ -154,7 +156,6 @@ unsafe fn lines_bwd_avx2(
}
end = chunk_start;
remaining -= 32;
line = line_next;
}
@ -174,9 +175,12 @@ unsafe fn lines_bwd_neon(
let lf = vdupq_n_u8(b'\n');
let line_stop = line_stop.min(line);
let mut remaining = end.offset_from_unsigned(beg);
let off = end.addr() & 15;
if off != 0 && off < end.offset_from_unsigned(beg) {
(end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
}
while remaining >= 64 {
while end.offset_from_unsigned(beg) >= 64 {
let chunk_start = end.sub(64);
let v1 = vld1q_u8(chunk_start.add(0));
@ -198,11 +202,10 @@ unsafe fn lines_bwd_neon(
}
end = chunk_start;
remaining -= 64;
line = line_next;
}
while remaining >= 16 {
while end.offset_from_unsigned(beg) >= 16 {
let chunk_start = end.sub(16);
let v = vld1q_u8(chunk_start);
let c = vceqq_u8(v, lf);
@ -215,7 +218,6 @@ unsafe fn lines_bwd_neon(
}
end = chunk_start;
remaining -= 16;
line = line_next;
}

View file

@ -109,12 +109,15 @@ unsafe fn lines_fwd_avx2(
}
let lf = _mm256_set1_epi8(b'\n' as i8);
let mut remaining = end.offset_from_unsigned(beg);
let off = beg.align_offset(32);
if off != 0 && off < end.offset_from_unsigned(beg) {
(beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop);
}
if line < line_stop {
// Unrolling the loop by 4x speeds things up by >3x.
// It allows us to accumulate matches before doing a single `vpsadbw`.
while remaining >= 128 {
while end.offset_from_unsigned(beg) >= 128 {
let v1 = _mm256_loadu_si256(beg.add(0) as *const _);
let v2 = _mm256_loadu_si256(beg.add(32) as *const _);
let v3 = _mm256_loadu_si256(beg.add(64) as *const _);
@ -138,11 +141,10 @@ unsafe fn lines_fwd_avx2(
}
beg = beg.add(128);
remaining -= 128;
line = line_next;
}
while remaining >= 32 {
while end.offset_from_unsigned(beg) >= 32 {
let v = _mm256_loadu_si256(beg as *const _);
let c = _mm256_cmpeq_epi8(v, lf);
@ -159,7 +161,6 @@ unsafe fn lines_fwd_avx2(
}
beg = beg.add(32);
remaining -= 32;
line = line_next;
}
}
@ -179,10 +180,13 @@ unsafe fn lines_fwd_neon(
use std::arch::aarch64::*;
let lf = vdupq_n_u8(b'\n');
let mut remaining = end.offset_from_unsigned(beg);
let off = beg.align_offset(16);
if off != 0 && off < end.offset_from_unsigned(beg) {
(beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop);
}
if line < line_stop {
while remaining >= 64 {
while end.offset_from_unsigned(beg) >= 64 {
let v1 = vld1q_u8(beg.add(0));
let v2 = vld1q_u8(beg.add(16));
let v3 = vld1q_u8(beg.add(32));
@ -204,11 +208,10 @@ unsafe fn lines_fwd_neon(
}
beg = beg.add(64);
remaining -= 64;
line = line_next;
}
while remaining >= 16 {
while end.offset_from_unsigned(beg) >= 16 {
let v = vld1q_u8(beg);
let c = vceqq_u8(v, lf);
let c = vandq_u8(c, vdupq_n_u8(0x01));
@ -220,7 +223,6 @@ unsafe fn lines_fwd_neon(
}
beg = beg.add(16);
remaining -= 16;
line = line_next;
}
}