From e83cd8f19110317f7b55c8aff9fb5a6e9255489c Mon Sep 17 00:00:00 2001 From: Joshua Warner Date: Thu, 8 Dec 2022 17:17:42 -0800 Subject: [PATCH] Re-improve perf of skipping spaces and comments On my M1 mac this shows as ~25% faster at parsing Num.roc than the old implementation, probably because nobody wrote any NEON code. Even on my x86_64 linux box (Ryzen 2700x), this shows as 10% faster than the current SSE implementation (running with RUSTFLAGS="-C target-cpu=native"). --- Cargo.lock | 63 +++++++++ crates/compiler/parse/Cargo.toml | 1 + crates/compiler/parse/benches/bench_parse.rs | 3 +- crates/compiler/parse/src/blankspace.rs | 134 ++++++++++++++++++- 4 files changed, 195 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c674a4e98d..17344e1f7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3172,6 +3172,26 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2f61dcf0b917cd75d4521d7343d1ffff3d1583054133c9b5cbea3375c703c40d" +[[package]] +name = "proptest" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e0d9cc07f18492d879586c92b485def06bc850da3118075cd45d50e9c95b0e5" +dependencies = [ + "bit-set", + "bitflags", + "byteorder", + "lazy_static", + "num-traits", + "quick-error 2.0.1", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax", + "rusty-fork", + "tempfile", +] + [[package]] name = "ptr_meta" version = "0.1.4" @@ -3203,6 +3223,18 @@ dependencies = [ "unicase", ] +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + [[package]] name = "quick-xml" version = "0.22.0" @@ -3295,6 +3327,15 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] + [[package]] name = "rand_xoshiro" version = "0.6.0" @@ -4178,6 +4219,7 @@ dependencies = [ "encode_unicode 1.0.0", "indoc", "pretty_assertions", + "proptest", "quickcheck", "quickcheck_macros", "roc_collections", @@ -4575,6 +4617,18 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf" +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error 1.2.3", + "tempfile", + "wait-timeout", +] + [[package]] name = "rustyline" version = "9.1.1" @@ -5751,6 +5805,15 @@ dependencies = [ "quote", ] +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +dependencies = [ + "libc", +] + [[package]] name = "walkdir" version = "2.3.2" diff --git a/crates/compiler/parse/Cargo.toml b/crates/compiler/parse/Cargo.toml index e90042c303..82b666b846 100644 --- a/crates/compiler/parse/Cargo.toml +++ b/crates/compiler/parse/Cargo.toml @@ -19,6 +19,7 @@ encode_unicode.workspace = true [dev-dependencies] roc_test_utils = { path = "../../test_utils" } +proptest = "1.0.0" criterion.workspace = true pretty_assertions.workspace = true diff --git a/crates/compiler/parse/benches/bench_parse.rs b/crates/compiler/parse/benches/bench_parse.rs index 68676cbfd1..77cca882ce 100644 --- a/crates/compiler/parse/benches/bench_parse.rs +++ b/crates/compiler/parse/benches/bench_parse.rs @@ -1,8 +1,7 @@ -use std::path::PathBuf; - use bumpalo::Bump; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use roc_parse::{module, module::module_defs, parser::Parser, state::State}; +use std::path::PathBuf; pub fn parse_benchmark(c: &mut Criterion) { c.bench_function("parse false-interpreter", |b| { diff --git a/crates/compiler/parse/src/blankspace.rs b/crates/compiler/parse/src/blankspace.rs index 8f489335fe..da6d617c62 100644 --- a/crates/compiler/parse/src/blankspace.rs +++ b/crates/compiler/parse/src/blankspace.rs @@ -161,7 +161,7 @@ where } } -fn eat_whitespace(bytes: &[u8]) -> usize { +pub fn simple_eat_whitespace(bytes: &[u8]) -> usize { let mut i = 0; while i < bytes.len() { match bytes[i] { @@ -172,7 +172,54 @@ fn eat_whitespace(bytes: &[u8]) -> usize { i } -fn eat_until_newline(bytes: &[u8]) -> usize { +pub fn fast_eat_whitespace(bytes: &[u8]) -> usize { + // Load 8 bytes at a time, keeping in mind that the initial offset may not be aligned + let mut i = 0; + while i + 8 <= bytes.len() { + let chunk = unsafe { + // Safe because we know the pointer is in bounds + (bytes.as_ptr().add(i) as *const u64) + .read_unaligned() + .to_le() + }; + + // Space character is 0x20, which has a single bit set + // We can check for any space character by checking if any other bit is set + let spaces = 0x2020_2020_2020_2020; + + // First, generate a mask where each byte is 0xff if the byte is a space, + // and some other bit sequence otherwise + let mask = !(chunk ^ spaces); + + // Now mask off the high bit, so there's some place to carry into without + // overflowing into the next byte. + let mask = mask & !0x8080_8080_8080_8080; + + // Now add 0x0101_0101_0101_0101 to each byte, which will carry into the high bit + // if and only if the byte is a space. + let mask = mask + 0x0101_0101_0101_0101; + + // Now mask off areas where the original bytes had the high bit set, so that + // 0x80|0x20 = 0xa0 will not be considered a space. + let mask = mask & !(chunk & 0x8080_8080_8080_8080); + + // Make sure all the _other_ bits aside from the high bit are set, + // and count the number of trailing one bits, dividing by 8 to get the number of + // bytes that are spaces. + let count = ((mask | !0x8080_8080_8080_8080).trailing_ones() as usize) / 8; + + if count == 8 { + i += 8; + } else { + return i + count; + } + } + + // Check the remaining bytes + simple_eat_whitespace(&bytes[i..]) + i +} + +pub fn simple_eat_until_control_character(bytes: &[u8]) -> usize { let mut i = 0; while i < bytes.len() { if bytes[i] < b' ' { @@ -184,6 +231,85 @@ fn eat_until_newline(bytes: &[u8]) -> usize { i } +pub fn fast_eat_until_control_character(bytes: &[u8]) -> usize { + // Load 8 bytes at a time, keeping in mind that the initial offset may not be aligned + let mut i = 0; + while i + 8 <= bytes.len() { + let chunk = unsafe { + // Safe because we know the pointer is in bounds + (bytes.as_ptr().add(i) as *const u64) + .read_unaligned() + .to_le() + }; + + // Control characters are 0x00-0x1F, and don't have any high bits set. + // They only have bits set that fall under the 0x1F mask. + let control = 0x1F1F_1F1F_1F1F_1F1F; + + // First we set up a value where, if a given byte is a control character, + // it'll have a all the non-control bits set to 1. All control bits are set to zero. + let mask = !(chunk & !control) & !control; + + // Now, down shift by one bit. This will leave room for the following add to + // carry, without impacting the next byte. + let mask = mask >> 1; + + // Add one (shifted by the right amount), causing all the one bits in the control + // characters to cascade, and put a one in the high bit. + let mask = mask.wrapping_add(0x1010_1010_1010_1010); + + // Now, we can count the number of trailing zero bits, dividing by 8 to get the + // number of bytes before the first control character. + let count = (mask & 0x8080_8080_8080_8080).trailing_zeros() as usize / 8; + + if count == 8 { + i += 8; + } else { + return i + count; + } + } + + // Check the remaining bytes + simple_eat_until_control_character(&bytes[i..]) + i +} + +#[cfg(test)] +mod tests { + use super::*; + use proptest::prelude::*; + + #[test] + fn test_eat_whitespace_simple() { + let bytes = &[0, 0, 0, 0, 0, 0, 0, 0]; + assert_eq!(simple_eat_whitespace(bytes), fast_eat_whitespace(bytes)); + } + + proptest! { + #[test] + fn test_eat_whitespace(bytes in proptest::collection::vec(any::(), 0..100)) { + prop_assert_eq!(simple_eat_whitespace(&bytes), fast_eat_whitespace(&bytes)); + } + } + + #[test] + fn test_eat_until_control_character_simple() { + let bytes = &[32, 0, 0, 0, 0, 0, 0, 0]; + assert_eq!( + simple_eat_until_control_character(bytes), + fast_eat_until_control_character(bytes) + ); + } + + proptest! { + #[test] + fn test_eat_until_control_character(bytes in proptest::collection::vec(any::(), 0..100)) { + prop_assert_eq!( + simple_eat_until_control_character(&bytes), + fast_eat_until_control_character(&bytes)); + } + } +} + pub fn space0_e<'a, E>( indent_problem: fn(Position) -> E, ) -> impl Parser<'a, &'a [CommentOrNewline<'a>], E> @@ -213,7 +339,7 @@ where let mut newlines = Vec::new_in(arena); let mut progress = NoProgress; loop { - let whitespace = eat_whitespace(state.bytes()); + let whitespace = fast_eat_whitespace(state.bytes()); if whitespace > 0 { state.advance_mut(whitespace); progress = MadeProgress; @@ -235,7 +361,7 @@ where } } - let len = eat_until_newline(state.bytes()); + let len = fast_eat_until_control_character(state.bytes()); // We already checked that the string is valid UTF-8 debug_assert!(std::str::from_utf8(&state.bytes()[..len]).is_ok());