From e83cd8f19110317f7b55c8aff9fb5a6e9255489c Mon Sep 17 00:00:00 2001
From: Joshua Warner <joshuawarner32@gmail.com>
Date: Thu, 8 Dec 2022 17:17:42 -0800
Subject: [PATCH] Re-improve perf of skipping spaces and comments

On my M1 mac this shows as ~25% faster at parsing Num.roc than the old implementation, probably because nobody wrote any NEON code.

Even on my x86_64 linux box (Ryzen 2700x), this shows as 10% faster than the current SSE implementation (running with RUSTFLAGS="-C target-cpu=native").
---
 Cargo.lock                                   |  63 +++++++++
 crates/compiler/parse/Cargo.toml             |   1 +
 crates/compiler/parse/benches/bench_parse.rs |   3 +-
 crates/compiler/parse/src/blankspace.rs      | 134 ++++++++++++++++++-
 4 files changed, 195 insertions(+), 6 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c674a4e98d..17344e1f7e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3172,6 +3172,26 @@ version = "1.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2f61dcf0b917cd75d4521d7343d1ffff3d1583054133c9b5cbea3375c703c40d"
 
+[[package]]
+name = "proptest"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e0d9cc07f18492d879586c92b485def06bc850da3118075cd45d50e9c95b0e5"
+dependencies = [
+ "bit-set",
+ "bitflags",
+ "byteorder",
+ "lazy_static",
+ "num-traits",
+ "quick-error 2.0.1",
+ "rand",
+ "rand_chacha",
+ "rand_xorshift",
+ "regex-syntax",
+ "rusty-fork",
+ "tempfile",
+]
+
 [[package]]
 name = "ptr_meta"
 version = "0.1.4"
@@ -3203,6 +3223,18 @@ dependencies = [
  "unicase",
 ]
 
+[[package]]
+name = "quick-error"
+version = "1.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"
+
+[[package]]
+name = "quick-error"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
+
 [[package]]
 name = "quick-xml"
 version = "0.22.0"
@@ -3295,6 +3327,15 @@ dependencies = [
  "getrandom",
 ]
 
+[[package]]
+name = "rand_xorshift"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f"
+dependencies = [
+ "rand_core",
+]
+
 [[package]]
 name = "rand_xoshiro"
 version = "0.6.0"
@@ -4178,6 +4219,7 @@ dependencies = [
  "encode_unicode 1.0.0",
  "indoc",
  "pretty_assertions",
+ "proptest",
  "quickcheck",
  "quickcheck_macros",
  "roc_collections",
@@ -4575,6 +4617,18 @@ version = "1.0.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a0a5f7c728f5d284929a1cccb5bc19884422bfe6ef4d6c409da2c41838983fcf"
 
+[[package]]
+name = "rusty-fork"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f"
+dependencies = [
+ "fnv",
+ "quick-error 1.2.3",
+ "tempfile",
+ "wait-timeout",
+]
+
 [[package]]
 name = "rustyline"
 version = "9.1.1"
@@ -5751,6 +5805,15 @@ dependencies = [
  "quote",
 ]
 
+[[package]]
+name = "wait-timeout"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6"
+dependencies = [
+ "libc",
+]
+
 [[package]]
 name = "walkdir"
 version = "2.3.2"
diff --git a/crates/compiler/parse/Cargo.toml b/crates/compiler/parse/Cargo.toml
index e90042c303..82b666b846 100644
--- a/crates/compiler/parse/Cargo.toml
+++ b/crates/compiler/parse/Cargo.toml
@@ -19,6 +19,7 @@ encode_unicode.workspace = true
 
 [dev-dependencies]
 roc_test_utils = { path = "../../test_utils" }
+proptest = "1.0.0"
 
 criterion.workspace = true
 pretty_assertions.workspace = true
diff --git a/crates/compiler/parse/benches/bench_parse.rs b/crates/compiler/parse/benches/bench_parse.rs
index 68676cbfd1..77cca882ce 100644
--- a/crates/compiler/parse/benches/bench_parse.rs
+++ b/crates/compiler/parse/benches/bench_parse.rs
@@ -1,8 +1,7 @@
-use std::path::PathBuf;
-
 use bumpalo::Bump;
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use roc_parse::{module, module::module_defs, parser::Parser, state::State};
+use std::path::PathBuf;
 
 pub fn parse_benchmark(c: &mut Criterion) {
     c.bench_function("parse false-interpreter", |b| {
diff --git a/crates/compiler/parse/src/blankspace.rs b/crates/compiler/parse/src/blankspace.rs
index 8f489335fe..da6d617c62 100644
--- a/crates/compiler/parse/src/blankspace.rs
+++ b/crates/compiler/parse/src/blankspace.rs
@@ -161,7 +161,7 @@ where
     }
 }
 
-fn eat_whitespace(bytes: &[u8]) -> usize {
+pub fn simple_eat_whitespace(bytes: &[u8]) -> usize {
     let mut i = 0;
     while i < bytes.len() {
         match bytes[i] {
@@ -172,7 +172,54 @@ fn eat_whitespace(bytes: &[u8]) -> usize {
     i
 }
 
-fn eat_until_newline(bytes: &[u8]) -> usize {
+pub fn fast_eat_whitespace(bytes: &[u8]) -> usize {
+    // Load 8 bytes at a time, keeping in mind that the initial offset may not be aligned
+    let mut i = 0;
+    while i + 8 <= bytes.len() {
+        let chunk = unsafe {
+            // Safe because we know the pointer is in bounds
+            (bytes.as_ptr().add(i) as *const u64)
+                .read_unaligned()
+                .to_le()
+        };
+
+        // Space character is 0x20, which has a single bit set
+        // We can check for any space character by checking if any other bit is set
+        let spaces = 0x2020_2020_2020_2020;
+
+        // First, generate a mask where each byte is 0xff if the byte is a space,
+        // and some other bit sequence otherwise
+        let mask = !(chunk ^ spaces);
+
+        // Now mask off the high bit, so there's some place to carry into without
+        // overflowing into the next byte.
+        let mask = mask & !0x8080_8080_8080_8080;
+
+        // Now add 0x0101_0101_0101_0101 to each byte, which will carry into the high bit
+        // if and only if the byte is a space.
+        let mask = mask + 0x0101_0101_0101_0101;
+
+        // Now mask off areas where the original bytes had the high bit set, so that
+        // 0x80|0x20 = 0xa0 will not be considered a space.
+        let mask = mask & !(chunk & 0x8080_8080_8080_8080);
+
+        // Make sure all the _other_ bits aside from the high bit are set,
+        // and count the number of trailing one bits, dividing by 8 to get the number of
+        // bytes that are spaces.
+        let count = ((mask | !0x8080_8080_8080_8080).trailing_ones() as usize) / 8;
+
+        if count == 8 {
+            i += 8;
+        } else {
+            return i + count;
+        }
+    }
+
+    // Check the remaining bytes
+    simple_eat_whitespace(&bytes[i..]) + i
+}
+
+pub fn simple_eat_until_control_character(bytes: &[u8]) -> usize {
     let mut i = 0;
     while i < bytes.len() {
         if bytes[i] < b' ' {
@@ -184,6 +231,85 @@ fn eat_until_newline(bytes: &[u8]) -> usize {
     i
 }
 
+pub fn fast_eat_until_control_character(bytes: &[u8]) -> usize {
+    // Load 8 bytes at a time, keeping in mind that the initial offset may not be aligned
+    let mut i = 0;
+    while i + 8 <= bytes.len() {
+        let chunk = unsafe {
+            // Safe because we know the pointer is in bounds
+            (bytes.as_ptr().add(i) as *const u64)
+                .read_unaligned()
+                .to_le()
+        };
+
+        // Control characters are 0x00-0x1F, and don't have any high bits set.
+        // They only have bits set that fall under the 0x1F mask.
+        let control = 0x1F1F_1F1F_1F1F_1F1F;
+
+        // First we set up a value where, if a given byte is a control character,
+        // it'll have a all the non-control bits set to 1. All control bits are set to zero.
+        let mask = !(chunk & !control) & !control;
+
+        // Now, down shift by one bit. This will leave room for the following add to
+        // carry, without impacting the next byte.
+        let mask = mask >> 1;
+
+        // Add one (shifted by the right amount), causing all the one bits in the control
+        // characters to cascade, and put a one in the high bit.
+        let mask = mask.wrapping_add(0x1010_1010_1010_1010);
+
+        // Now, we can count the number of trailing zero bits, dividing by 8 to get the
+        // number of bytes before the first control character.
+        let count = (mask & 0x8080_8080_8080_8080).trailing_zeros() as usize / 8;
+
+        if count == 8 {
+            i += 8;
+        } else {
+            return i + count;
+        }
+    }
+
+    // Check the remaining bytes
+    simple_eat_until_control_character(&bytes[i..]) + i
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use proptest::prelude::*;
+
+    #[test]
+    fn test_eat_whitespace_simple() {
+        let bytes = &[0, 0, 0, 0, 0, 0, 0, 0];
+        assert_eq!(simple_eat_whitespace(bytes), fast_eat_whitespace(bytes));
+    }
+
+    proptest! {
+        #[test]
+        fn test_eat_whitespace(bytes in proptest::collection::vec(any::<u8>(), 0..100)) {
+            prop_assert_eq!(simple_eat_whitespace(&bytes), fast_eat_whitespace(&bytes));
+        }
+    }
+
+    #[test]
+    fn test_eat_until_control_character_simple() {
+        let bytes = &[32, 0, 0, 0, 0, 0, 0, 0];
+        assert_eq!(
+            simple_eat_until_control_character(bytes),
+            fast_eat_until_control_character(bytes)
+        );
+    }
+
+    proptest! {
+        #[test]
+        fn test_eat_until_control_character(bytes in proptest::collection::vec(any::<u8>(), 0..100)) {
+            prop_assert_eq!(
+                simple_eat_until_control_character(&bytes),
+                fast_eat_until_control_character(&bytes));
+        }
+    }
+}
+
 pub fn space0_e<'a, E>(
     indent_problem: fn(Position) -> E,
 ) -> impl Parser<'a, &'a [CommentOrNewline<'a>], E>
@@ -213,7 +339,7 @@ where
         let mut newlines = Vec::new_in(arena);
         let mut progress = NoProgress;
         loop {
-            let whitespace = eat_whitespace(state.bytes());
+            let whitespace = fast_eat_whitespace(state.bytes());
             if whitespace > 0 {
                 state.advance_mut(whitespace);
                 progress = MadeProgress;
@@ -235,7 +361,7 @@ where
                         }
                     }
 
-                    let len = eat_until_newline(state.bytes());
+                    let len = fast_eat_until_control_character(state.bytes());
 
                     // We already checked that the string is valid UTF-8
                     debug_assert!(std::str::from_utf8(&state.bytes()[..len]).is_ok());