Re-improve perf of skipping spaces and comments

On my M1 mac this shows as ~25% faster at parsing Num.roc than the old implementation, probably because nobody wrote any NEON code.

Even on my x86_64 linux box (Ryzen 2700x), this shows as 10% faster than the current SSE implementation (running with RUSTFLAGS="-C target-cpu=native").
This commit is contained in:
Joshua Warner 2022-12-08 17:17:42 -08:00
parent 5f29402297
commit e83cd8f191
No known key found for this signature in database
GPG key ID: 89AD497003F93FDD
4 changed files with 195 additions and 6 deletions

View file

@ -161,7 +161,7 @@ where
}
}
fn eat_whitespace(bytes: &[u8]) -> usize {
pub fn simple_eat_whitespace(bytes: &[u8]) -> usize {
let mut i = 0;
while i < bytes.len() {
match bytes[i] {
@ -172,7 +172,54 @@ fn eat_whitespace(bytes: &[u8]) -> usize {
i
}
fn eat_until_newline(bytes: &[u8]) -> usize {
pub fn fast_eat_whitespace(bytes: &[u8]) -> usize {
// Load 8 bytes at a time, keeping in mind that the initial offset may not be aligned
let mut i = 0;
while i + 8 <= bytes.len() {
let chunk = unsafe {
// Safe because we know the pointer is in bounds
(bytes.as_ptr().add(i) as *const u64)
.read_unaligned()
.to_le()
};
// Space character is 0x20, which has a single bit set
// We can check for any space character by checking if any other bit is set
let spaces = 0x2020_2020_2020_2020;
// First, generate a mask where each byte is 0xff if the byte is a space,
// and some other bit sequence otherwise
let mask = !(chunk ^ spaces);
// Now mask off the high bit, so there's some place to carry into without
// overflowing into the next byte.
let mask = mask & !0x8080_8080_8080_8080;
// Now add 0x0101_0101_0101_0101 to each byte, which will carry into the high bit
// if and only if the byte is a space.
let mask = mask + 0x0101_0101_0101_0101;
// Now mask off areas where the original bytes had the high bit set, so that
// 0x80|0x20 = 0xa0 will not be considered a space.
let mask = mask & !(chunk & 0x8080_8080_8080_8080);
// Make sure all the _other_ bits aside from the high bit are set,
// and count the number of trailing one bits, dividing by 8 to get the number of
// bytes that are spaces.
let count = ((mask | !0x8080_8080_8080_8080).trailing_ones() as usize) / 8;
if count == 8 {
i += 8;
} else {
return i + count;
}
}
// Check the remaining bytes
simple_eat_whitespace(&bytes[i..]) + i
}
pub fn simple_eat_until_control_character(bytes: &[u8]) -> usize {
let mut i = 0;
while i < bytes.len() {
if bytes[i] < b' ' {
@ -184,6 +231,85 @@ fn eat_until_newline(bytes: &[u8]) -> usize {
i
}
pub fn fast_eat_until_control_character(bytes: &[u8]) -> usize {
// Load 8 bytes at a time, keeping in mind that the initial offset may not be aligned
let mut i = 0;
while i + 8 <= bytes.len() {
let chunk = unsafe {
// Safe because we know the pointer is in bounds
(bytes.as_ptr().add(i) as *const u64)
.read_unaligned()
.to_le()
};
// Control characters are 0x00-0x1F, and don't have any high bits set.
// They only have bits set that fall under the 0x1F mask.
let control = 0x1F1F_1F1F_1F1F_1F1F;
// First we set up a value where, if a given byte is a control character,
// it'll have a all the non-control bits set to 1. All control bits are set to zero.
let mask = !(chunk & !control) & !control;
// Now, down shift by one bit. This will leave room for the following add to
// carry, without impacting the next byte.
let mask = mask >> 1;
// Add one (shifted by the right amount), causing all the one bits in the control
// characters to cascade, and put a one in the high bit.
let mask = mask.wrapping_add(0x1010_1010_1010_1010);
// Now, we can count the number of trailing zero bits, dividing by 8 to get the
// number of bytes before the first control character.
let count = (mask & 0x8080_8080_8080_8080).trailing_zeros() as usize / 8;
if count == 8 {
i += 8;
} else {
return i + count;
}
}
// Check the remaining bytes
simple_eat_until_control_character(&bytes[i..]) + i
}
#[cfg(test)]
mod tests {
use super::*;
use proptest::prelude::*;
#[test]
fn test_eat_whitespace_simple() {
let bytes = &[0, 0, 0, 0, 0, 0, 0, 0];
assert_eq!(simple_eat_whitespace(bytes), fast_eat_whitespace(bytes));
}
proptest! {
#[test]
fn test_eat_whitespace(bytes in proptest::collection::vec(any::<u8>(), 0..100)) {
prop_assert_eq!(simple_eat_whitespace(&bytes), fast_eat_whitespace(&bytes));
}
}
#[test]
fn test_eat_until_control_character_simple() {
let bytes = &[32, 0, 0, 0, 0, 0, 0, 0];
assert_eq!(
simple_eat_until_control_character(bytes),
fast_eat_until_control_character(bytes)
);
}
proptest! {
#[test]
fn test_eat_until_control_character(bytes in proptest::collection::vec(any::<u8>(), 0..100)) {
prop_assert_eq!(
simple_eat_until_control_character(&bytes),
fast_eat_until_control_character(&bytes));
}
}
}
pub fn space0_e<'a, E>(
indent_problem: fn(Position) -> E,
) -> impl Parser<'a, &'a [CommentOrNewline<'a>], E>
@ -213,7 +339,7 @@ where
let mut newlines = Vec::new_in(arena);
let mut progress = NoProgress;
loop {
let whitespace = eat_whitespace(state.bytes());
let whitespace = fast_eat_whitespace(state.bytes());
if whitespace > 0 {
state.advance_mut(whitespace);
progress = MadeProgress;
@ -235,7 +361,7 @@ where
}
}
let len = eat_until_newline(state.bytes());
let len = fast_eat_until_control_character(state.bytes());
// We already checked that the string is valid UTF-8
debug_assert!(std::str::from_utf8(&state.bytes()[..len]).is_ok());