mirror of
https://github.com/microsoft/edit.git
synced 2025-08-04 10:59:57 +00:00
Optimize SIMD impl of lines_fwd
and lines_bwd
Benchmark results on AMD Zen4: ``` simd/lines_fwd/1 time: [1.4801 ns 1.4803 ns 1.4804 ns] thrpt: [644.19 MiB/s 644.26 MiB/s 644.34 MiB/s] change: time: [−38.958% −38.933% −38.898%] (p = 0.00 < 0.05) thrpt: [+63.661% +63.755% +63.821%] Performance has improved. simd/lines_fwd/8 time: [3.8571 ns 3.8607 ns 3.8641 ns] thrpt: [1.9282 GiB/s 1.9299 GiB/s 1.9316 GiB/s] change: time: [−19.458% −19.423% −19.394%] (p = 0.00 < 0.05) thrpt: [+24.060% +24.105% +24.159%] Performance has improved. simd/lines_fwd/128 time: [14.064 ns 14.092 ns 14.120 ns] thrpt: [8.4426 GiB/s 8.4592 GiB/s 8.4764 GiB/s] change: time: [−6.0955% −5.8911% −5.6706%] (p = 0.00 < 0.05) thrpt: [+6.0115% +6.2599% +6.4912%] Performance has improved. simd/lines_fwd/1024 time: [18.160 ns 18.178 ns 18.195 ns] thrpt: [52.415 GiB/s 52.462 GiB/s 52.516 GiB/s] change: time: [−4.1174% −3.9973% −3.8859%] (p = 0.00 < 0.05) thrpt: [+4.0430% +4.1637% +4.2942%] Performance has improved. simd/lines_fwd/131072 time: [871.09 ns 871.14 ns 871.21 ns] thrpt: [140.12 GiB/s 140.13 GiB/s 140.14 GiB/s] change: time: [−10.451% −10.405% −10.358%] (p = 0.00 < 0.05) thrpt: [+11.555% +11.613% +11.670%] Performance has improved. simd/lines_fwd/134217728 time: [1.7326 ms 1.7332 ms 1.7338 ms] thrpt: [72.094 GiB/s 72.120 GiB/s 72.146 GiB/s] change: time: [−0.4091% −0.2348% −0.0670%] (p = 0.00 < 0.05) thrpt: [+0.0671% +0.2353% +0.4108%] Change within noise threshold. ```
This commit is contained in:
parent
7ece8c5a38
commit
e20e0061dc
2 changed files with 24 additions and 20 deletions
|
@ -110,9 +110,12 @@ unsafe fn lines_bwd_avx2(
|
|||
|
||||
let lf = _mm256_set1_epi8(b'\n' as i8);
|
||||
let line_stop = line_stop.min(line);
|
||||
let mut remaining = end.offset_from_unsigned(beg);
|
||||
let off = end.addr() & 31;
|
||||
if off != 0 && off < end.offset_from_unsigned(beg) {
|
||||
(end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
|
||||
}
|
||||
|
||||
while remaining >= 128 {
|
||||
while end.offset_from_unsigned(beg) >= 128 {
|
||||
let chunk_start = end.sub(128);
|
||||
|
||||
let v1 = _mm256_loadu_si256(chunk_start.add(0) as *const _);
|
||||
|
@ -135,11 +138,10 @@ unsafe fn lines_bwd_avx2(
|
|||
}
|
||||
|
||||
end = chunk_start;
|
||||
remaining -= 128;
|
||||
line = line_next;
|
||||
}
|
||||
|
||||
while remaining >= 32 {
|
||||
while end.offset_from_unsigned(beg) >= 32 {
|
||||
let chunk_start = end.sub(32);
|
||||
let v = _mm256_loadu_si256(chunk_start as *const _);
|
||||
let c = _mm256_cmpeq_epi8(v, lf);
|
||||
|
@ -154,7 +156,6 @@ unsafe fn lines_bwd_avx2(
|
|||
}
|
||||
|
||||
end = chunk_start;
|
||||
remaining -= 32;
|
||||
line = line_next;
|
||||
}
|
||||
|
||||
|
@ -174,9 +175,12 @@ unsafe fn lines_bwd_neon(
|
|||
|
||||
let lf = vdupq_n_u8(b'\n');
|
||||
let line_stop = line_stop.min(line);
|
||||
let mut remaining = end.offset_from_unsigned(beg);
|
||||
let off = end.addr() & 15;
|
||||
if off != 0 && off < end.offset_from_unsigned(beg) {
|
||||
(end, line) = lines_bwd_fallback(end.sub(off), end, line, line_stop);
|
||||
}
|
||||
|
||||
while remaining >= 64 {
|
||||
while end.offset_from_unsigned(beg) >= 64 {
|
||||
let chunk_start = end.sub(64);
|
||||
|
||||
let v1 = vld1q_u8(chunk_start.add(0));
|
||||
|
@ -198,11 +202,10 @@ unsafe fn lines_bwd_neon(
|
|||
}
|
||||
|
||||
end = chunk_start;
|
||||
remaining -= 64;
|
||||
line = line_next;
|
||||
}
|
||||
|
||||
while remaining >= 16 {
|
||||
while end.offset_from_unsigned(beg) >= 16 {
|
||||
let chunk_start = end.sub(16);
|
||||
let v = vld1q_u8(chunk_start);
|
||||
let c = vceqq_u8(v, lf);
|
||||
|
@ -215,7 +218,6 @@ unsafe fn lines_bwd_neon(
|
|||
}
|
||||
|
||||
end = chunk_start;
|
||||
remaining -= 16;
|
||||
line = line_next;
|
||||
}
|
||||
|
||||
|
|
|
@ -109,12 +109,15 @@ unsafe fn lines_fwd_avx2(
|
|||
}
|
||||
|
||||
let lf = _mm256_set1_epi8(b'\n' as i8);
|
||||
let mut remaining = end.offset_from_unsigned(beg);
|
||||
let off = beg.align_offset(32);
|
||||
if off != 0 && off < end.offset_from_unsigned(beg) {
|
||||
(beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop);
|
||||
}
|
||||
|
||||
if line < line_stop {
|
||||
// Unrolling the loop by 4x speeds things up by >3x.
|
||||
// It allows us to accumulate matches before doing a single `vpsadbw`.
|
||||
while remaining >= 128 {
|
||||
while end.offset_from_unsigned(beg) >= 128 {
|
||||
let v1 = _mm256_loadu_si256(beg.add(0) as *const _);
|
||||
let v2 = _mm256_loadu_si256(beg.add(32) as *const _);
|
||||
let v3 = _mm256_loadu_si256(beg.add(64) as *const _);
|
||||
|
@ -138,11 +141,10 @@ unsafe fn lines_fwd_avx2(
|
|||
}
|
||||
|
||||
beg = beg.add(128);
|
||||
remaining -= 128;
|
||||
line = line_next;
|
||||
}
|
||||
|
||||
while remaining >= 32 {
|
||||
while end.offset_from_unsigned(beg) >= 32 {
|
||||
let v = _mm256_loadu_si256(beg as *const _);
|
||||
let c = _mm256_cmpeq_epi8(v, lf);
|
||||
|
||||
|
@ -159,7 +161,6 @@ unsafe fn lines_fwd_avx2(
|
|||
}
|
||||
|
||||
beg = beg.add(32);
|
||||
remaining -= 32;
|
||||
line = line_next;
|
||||
}
|
||||
}
|
||||
|
@ -179,10 +180,13 @@ unsafe fn lines_fwd_neon(
|
|||
use std::arch::aarch64::*;
|
||||
|
||||
let lf = vdupq_n_u8(b'\n');
|
||||
let mut remaining = end.offset_from_unsigned(beg);
|
||||
let off = beg.align_offset(16);
|
||||
if off != 0 && off < end.offset_from_unsigned(beg) {
|
||||
(beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop);
|
||||
}
|
||||
|
||||
if line < line_stop {
|
||||
while remaining >= 64 {
|
||||
while end.offset_from_unsigned(beg) >= 64 {
|
||||
let v1 = vld1q_u8(beg.add(0));
|
||||
let v2 = vld1q_u8(beg.add(16));
|
||||
let v3 = vld1q_u8(beg.add(32));
|
||||
|
@ -204,11 +208,10 @@ unsafe fn lines_fwd_neon(
|
|||
}
|
||||
|
||||
beg = beg.add(64);
|
||||
remaining -= 64;
|
||||
line = line_next;
|
||||
}
|
||||
|
||||
while remaining >= 16 {
|
||||
while end.offset_from_unsigned(beg) >= 16 {
|
||||
let v = vld1q_u8(beg);
|
||||
let c = vceqq_u8(v, lf);
|
||||
let c = vandq_u8(c, vdupq_n_u8(0x01));
|
||||
|
@ -220,7 +223,6 @@ unsafe fn lines_fwd_neon(
|
|||
}
|
||||
|
||||
beg = beg.add(16);
|
||||
remaining -= 16;
|
||||
line = line_next;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue