diff --git a/crates/compiler/builtins/bitcode/src/main.zig b/crates/compiler/builtins/bitcode/src/main.zig index 2fd6cd2d11..e1527fb8a5 100644 --- a/crates/compiler/builtins/bitcode/src/main.zig +++ b/crates/compiler/builtins/bitcode/src/main.zig @@ -155,6 +155,7 @@ comptime { const str = @import("str.zig"); comptime { exportStrFn(str.init, "init"); + exportStrFn(str.strToScalarsC, "to_scalars"); exportStrFn(str.strSplitInPlaceC, "str_split_in_place"); exportStrFn(str.countSegments, "count_segments"); exportStrFn(str.countGraphemeClusters, "count_grapheme_clusters"); diff --git a/crates/compiler/builtins/bitcode/src/str.zig b/crates/compiler/builtins/bitcode/src/str.zig index a9da9da724..672137f862 100644 --- a/crates/compiler/builtins/bitcode/src/str.zig +++ b/crates/compiler/builtins/bitcode/src/str.zig @@ -470,7 +470,7 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize { // Str.toScalars pub fn strToScalarsC(str: RocStr) callconv(.C) RocList { - return @call(.{ .modifier = always_inline }, strToScalars, .{ RocStr, str }); + return @call(.{ .modifier = always_inline }, strToScalars, .{ str }); // strToScalars takes a single RocStr parameter, so the args tuple holds only `str` } fn strToScalars(string: RocStr) callconv(.C) RocList { diff --git a/crates/compiler/builtins/roc/Str.roc b/crates/compiler/builtins/roc/Str.roc index 0aa86b6eb9..1b90cb928c 100644 --- a/crates/compiler/builtins/roc/Str.roc +++ b/crates/compiler/builtins/roc/Str.roc @@ -1,9 +1,9 @@ interface Str exposes [ - concat, Utf8Problem, Utf8ByteProblem, + concat, isEmpty, joinWith, split, @@ -32,6 +32,7 @@ interface Str toI16, toU8, toI8, + toScalars, ] imports [Bool.{ Bool }, Result.{ Result }] @@ -172,6 +173,31 @@ countGraphemes : Str -> Nat ## single [U32]. You'd need to use `Str.startsWithCodePt "🕊"` instead. startsWithCodePt : Str, U32 -> Bool +toScalars : Str -> List U32 + +# walkScalars : Str, state, (state, U32, Str -> state) -> state +# walkScalars = \inputStr, init, update -> +# # TODO rewrite this in Zig to speed it up a ton! 
+# answer = +# List.walk +# (toUtf8 inputStr) +# { index: 0, answer: init } +# \{ index, state }, byte -> +# { codePt, codePtStr } = +# if byte <= 127 then +# # This can never fail. Also, this list means one allocation per step! 😱 +# str = Str.fromUtf8 [byte] |> Result.withDefault "" + +# { codePt: Num.toU32 byte, codePtStr: str } +# else +# # TODO handle multibyte UTF-8 string by looking ahead in the list as needed +# # https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences + +# { index: index + 1, state: update state codePt codePtStr } + +# answer.state + + ## Return a [List] of the string's [U8] UTF-8 [code units](https://unicode.org/glossary/#code_unit). ## (To split the string into a [List] of smaller [Str] values instead of [U8] values, ## see [Str.split].) diff --git a/crates/compiler/builtins/src/bitcode.rs b/crates/compiler/builtins/src/bitcode.rs index 37c0f53899..c65752d4d3 100644 --- a/crates/compiler/builtins/src/bitcode.rs +++ b/crates/compiler/builtins/src/bitcode.rs @@ -311,6 +311,7 @@ pub const STR_COUNT_SEGMENTS: &str = "roc_builtins.str.count_segments"; pub const STR_CONCAT: &str = "roc_builtins.str.concat"; pub const STR_JOIN_WITH: &str = "roc_builtins.str.joinWith"; pub const STR_STR_SPLIT_IN_PLACE: &str = "roc_builtins.str.str_split_in_place"; +pub const STR_TO_SCALARS: &str = "roc_builtins.str.to_scalars"; pub const STR_COUNT_GRAPEHEME_CLUSTERS: &str = "roc_builtins.str.count_grapheme_clusters"; pub const STR_STARTS_WITH: &str = "roc_builtins.str.starts_with"; pub const STR_STARTS_WITH_CODE_PT: &str = "roc_builtins.str.starts_with_code_point"; diff --git a/crates/compiler/builtins/src/std.rs b/crates/compiler/builtins/src/std.rs index d179f91745..ebcd22b6fe 100644 --- a/crates/compiler/builtins/src/std.rs +++ b/crates/compiler/builtins/src/std.rs @@ -873,6 +873,13 @@ pub fn types() -> MutMap { Box::new(str_type()), ); + 
// Str.toScalars : Str -> List U32 + add_top_level_function_type!( + Symbol::STR_TO_SCALARS, + vec![str_type()], + Box::new(list_type(u32_type())), + ); + // isEmpty : Str -> Bool add_top_level_function_type!( Symbol::STR_IS_EMPTY, diff --git a/crates/compiler/can/src/builtins.rs b/crates/compiler/can/src/builtins.rs index 049965c690..d4bc993be4 100644 --- a/crates/compiler/can/src/builtins.rs +++ b/crates/compiler/can/src/builtins.rs @@ -73,6 +73,7 @@ pub fn builtin_defs_map(symbol: Symbol, var_store: &mut VarStore) -> Option BOOL_NOT => bool_not, STR_CONCAT => str_concat, STR_JOIN_WITH => str_join_with, + STR_TO_SCALARS => str_to_scalars, STR_SPLIT => str_split, STR_IS_EMPTY => str_is_empty, STR_STARTS_WITH => str_starts_with, @@ -1677,6 +1678,26 @@ fn str_concat(symbol: Symbol, var_store: &mut VarStore) -> Def { ) } +/// Str.toScalars : Str -> List U32 +fn str_to_scalars(symbol: Symbol, var_store: &mut VarStore) -> Def { + let str_var = var_store.fresh(); + let list_u32_var = var_store.fresh(); + + let body = RunLowLevel { + op: LowLevel::StrToScalars, + args: vec![(str_var, Var(Symbol::ARG_1))], + ret_var: list_u32_var, // the low-level op yields the List U32 result, so it must not reuse the Str argument's variable + }; + + defn( + symbol, + vec![(str_var, Symbol::ARG_1)], + var_store, + body, + list_u32_var, + ) +} + /// Str.joinWith : List Str, Str -> Str fn str_join_with(symbol: Symbol, var_store: &mut VarStore) -> Def { let list_str_var = var_store.fresh(); diff --git a/crates/compiler/gen_llvm/src/llvm/build.rs b/crates/compiler/gen_llvm/src/llvm/build.rs index 897df6065a..d463157594 100644 --- a/crates/compiler/gen_llvm/src/llvm/build.rs +++ b/crates/compiler/gen_llvm/src/llvm/build.rs @@ -5415,6 +5415,14 @@ fn run_low_level<'a, 'ctx, 'env>( call_str_bitcode_fn(env, &[list.into(), string], bitcode::STR_JOIN_WITH) } + StrToScalars => { + // Str.toScalars : Str -> List U32 + debug_assert_eq!(args.len(), 1); + + let string = load_symbol(scope, &args[0]); + + call_str_bitcode_fn(env, &[string], bitcode::STR_TO_SCALARS) + } StrStartsWith => { // Str.startsWith 
: Str, Str -> Bool debug_assert_eq!(args.len(), 2); diff --git a/crates/compiler/gen_wasm/src/low_level.rs b/crates/compiler/gen_wasm/src/low_level.rs index 122b6393bd..64e914d50c 100644 --- a/crates/compiler/gen_wasm/src/low_level.rs +++ b/crates/compiler/gen_wasm/src/low_level.rs @@ -217,6 +217,7 @@ impl<'a> LowLevelCall<'a> { match self.lowlevel { // Str StrConcat => self.load_args_and_call_zig(backend, bitcode::STR_CONCAT), + StrToScalars => self.load_args_and_call_zig(backend, bitcode::STR_TO_SCALARS), StrJoinWith => self.load_args_and_call_zig(backend, bitcode::STR_JOIN_WITH), StrIsEmpty => match backend.storage.get(&self.arguments[0]) { StoredValue::StackMemory { location, .. } => { diff --git a/crates/compiler/module/src/low_level.rs b/crates/compiler/module/src/low_level.rs index 5c3b0f8fa2..0955e62e74 100644 --- a/crates/compiler/module/src/low_level.rs +++ b/crates/compiler/module/src/low_level.rs @@ -23,6 +23,7 @@ pub enum LowLevel { StrTrimLeft, StrTrimRight, StrToNum, + StrToScalars, ListLen, ListWithCapacity, ListGetUnsafe, @@ -193,6 +194,7 @@ impl LowLevelWrapperType { match symbol { Symbol::STR_CONCAT => CanBeReplacedBy(StrConcat), + Symbol::STR_TO_SCALARS => CanBeReplacedBy(StrToScalars), Symbol::STR_JOIN_WITH => CanBeReplacedBy(StrJoinWith), Symbol::STR_IS_EMPTY => CanBeReplacedBy(StrIsEmpty), Symbol::STR_STARTS_WITH => CanBeReplacedBy(StrStartsWith), diff --git a/crates/compiler/module/src/symbol.rs b/crates/compiler/module/src/symbol.rs index 9ee23ea8fe..04c2d395de 100644 --- a/crates/compiler/module/src/symbol.rs +++ b/crates/compiler/module/src/symbol.rs @@ -1189,6 +1189,7 @@ define_builtins! 
{ 31 STR_TO_I16: "toI16" 32 STR_TO_U8: "toU8" 33 STR_TO_I8: "toI8" + 34 STR_TO_SCALARS: "toScalars" } 5 LIST: "List" => { 0 LIST_LIST: "List" imported // the List.List type alias diff --git a/crates/compiler/mono/src/borrow.rs b/crates/compiler/mono/src/borrow.rs index 76f690c34c..12ab8a6da1 100644 --- a/crates/compiler/mono/src/borrow.rs +++ b/crates/compiler/mono/src/borrow.rs @@ -896,7 +896,9 @@ pub fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[bool] { // - arguments that we may want to update destructively must be Owned // - other refcounted arguments are Borrowed match op { - ListLen | StrIsEmpty | StrCountGraphemes => arena.alloc_slice_copy(&[borrowed]), + ListLen | StrIsEmpty | StrToScalars | StrCountGraphemes => { + arena.alloc_slice_copy(&[borrowed]) + } ListWithCapacity => arena.alloc_slice_copy(&[irrelevant]), ListReplaceUnsafe => arena.alloc_slice_copy(&[owned, irrelevant, irrelevant]), ListGetUnsafe => arena.alloc_slice_copy(&[borrowed, irrelevant]),