From c3c1b8d0838dd4b1b57d75340d0bf26c22f035d6 Mon Sep 17 00:00:00 2001
From: Richard Feldman
Date: Wed, 29 Mar 2023 15:52:08 -0400
Subject: [PATCH] Add Str.walkUtf8

---
 crates/compiler/builtins/roc/Str.roc    | 28 +++++++++++++++++++++++++
 crates/compiler/module/src/symbol.rs    |  1 +
 crates/compiler/test_gen/src/gen_str.rs | 27 ++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

diff --git a/crates/compiler/builtins/roc/Str.roc b/crates/compiler/builtins/roc/Str.roc
index 6a670d71a8..f98d4360f0 100644
--- a/crates/compiler/builtins/roc/Str.roc
+++ b/crates/compiler/builtins/roc/Str.roc
@@ -107,6 +107,7 @@ interface Str
         replaceLast,
         splitFirst,
         splitLast,
+        walkUtf8,
         walkUtf8WithIndex,
         reserve,
         releaseExcessCapacity,
@@ -841,6 +842,33 @@ walkUtf8WithIndexHelp = \string, state, step, index, length ->
     else
         state
 
+## Walks over the `UTF-8` bytes of the given [Str] and calls a function to update
+## state for each byte.
+##
+## ```
+## result = walkUtf8 "ABC" [] List.append
+## expect result == [65, 66, 67]
+## ```
+walkUtf8 : Str, state, (state, U8 -> state) -> state
+walkUtf8 = \str, initial, step ->
+    walkUtf8Help str initial step 0 (Str.countUtf8Bytes str)
+
+walkUtf8Help : Str, state, (state, U8 -> state), Nat, Nat -> state
+walkUtf8Help = \str, state, step, index, length ->
+    if index < length then
+        byte = Str.getUnsafe str index
+        newState = step state byte
+
+        walkUtf8Help str newState step (index + 1) length
+    else
+        state
+
+# Test walkUtf8 with a simple ASCII string
+expect (walkUtf8 "ABC" [] List.append) == [65, 66, 67]
+
+# Test walkUtf8 with a multi-byte string
+expect (walkUtf8 "鹏" [] List.append) == [233, 185, 143]
+
 ## Shrink the memory footprint of a str such that it's capacity and length are equal.
 ## Note: This will also convert seamless slices to regular lists.
 releaseExcessCapacity : Str -> Str
diff --git a/crates/compiler/module/src/symbol.rs b/crates/compiler/module/src/symbol.rs
index 4edc38e50d..7ceaf869e9 100644
--- a/crates/compiler/module/src/symbol.rs
+++ b/crates/compiler/module/src/symbol.rs
@@ -1328,6 +1328,7 @@ define_builtins! {
         55 STR_GRAPHEMES: "graphemes"
         56 STR_IS_VALID_SCALAR: "isValidScalar"
         57 STR_RELEASE_EXCESS_CAPACITY: "releaseExcessCapacity"
+        58 STR_WALK_UTF8: "walkUtf8"
     }
     6 LIST: "List" => {
         0 LIST_LIST: "List" exposed_apply_type=true // the List.List type alias
diff --git a/crates/compiler/test_gen/src/gen_str.rs b/crates/compiler/test_gen/src/gen_str.rs
index 19b8a89c79..dacf7feea6 100644
--- a/crates/compiler/test_gen/src/gen_str.rs
+++ b/crates/compiler/test_gen/src/gen_str.rs
@@ -1822,6 +1822,33 @@ fn str_split_overlapping_substring_2() {
     );
 }
 
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_walk_utf8() {
+    #[cfg(not(feature = "gen-llvm-wasm"))]
+    assert_evals_to!(
+        // Prepending each byte yields the string's UTF-8 bytes in reverse order
+        indoc!(
+            r#"
+            Str.walkUtf8 "abcd" [] (\list, byte -> List.prepend list byte)
+            "#
+        ),
+        RocList::from_slice(&[b'd', b'c', b'b', b'a']),
+        RocList<u8>
+    );
+
+    #[cfg(feature = "gen-llvm-wasm")]
+    assert_evals_to!(
+        indoc!(
+            r#"
+            Str.walkUtf8 "abcd" [] (\list, byte -> List.prepend list byte)
+            "#
+        ),
+        RocList::from_slice(&[b'd', b'c', b'b', b'a']),
+        RocList<u8>
+    );
+}
+
 #[test]
 #[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
 fn str_walk_utf8_with_index() {