Add Str.toScalars builtin

This commit is contained in:
Richard Feldman 2022-07-01 22:34:17 -04:00
parent b1fe76adbf
commit acb7cf99e1
No known key found for this signature in database
GPG key ID: 7E4127D1E4241798
11 changed files with 73 additions and 3 deletions

View file

@ -155,6 +155,7 @@ comptime {
const str = @import("str.zig");
comptime {
exportStrFn(str.init, "init");
exportStrFn(str.strToScalarsC, "to_scalars");
exportStrFn(str.strSplitInPlaceC, "str_split_in_place");
exportStrFn(str.countSegments, "count_segments");
exportStrFn(str.countGraphemeClusters, "count_grapheme_clusters");

View file

@ -470,7 +470,7 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
// Str.toScalars
/// C calling-convention entry point for `Str.toScalars`.
/// Forwards the string to `strToScalars` through an always-inline call.
pub fn strToScalarsC(str: RocStr) callconv(.C) RocList {
    // `strToScalars` takes a single `RocStr` parameter, so the argument tuple
    // must contain only the string itself (not the `RocStr` type as well).
    return @call(.{ .modifier = always_inline }, strToScalars, .{ str });
}
fn strToScalars(string: RocStr) callconv(.C) RocList {

View file

@ -1,9 +1,9 @@
interface Str
exposes
[
concat,
Utf8Problem,
Utf8ByteProblem,
concat,
isEmpty,
joinWith,
split,
@ -32,6 +32,7 @@ interface Str
toI16,
toU8,
toI8,
toScalars,
]
imports [Bool.{ Bool }, Result.{ Result }]
@ -172,6 +173,31 @@ countGraphemes : Str -> Nat
## single [U32]. You'd need to use `Str.startsWithCodePt "🕊"` instead.
startsWithCodePt : Str, U32 -> Bool
toScalars : Str -> List U32
# walkScalars : Str, state, (state, U32, Str -> state) -> state
# walkScalars = \inputStr, init, update ->
# # TODO rewrite this in Zig to speed it up a ton!
# answer =
# List.walk
# (toUtf8 inputStr)
# { index: 0, answer: init }
# \{ index, state }, byte ->
# { codePt, codePtStr } =
# if byte <= 127 then
# # This can never fail. Also, this list means one allocation per step! 😱
# str = Str.fromUtf8 [byte] |> Result.withDefault ""
# { codePt: Num.toU32 byte, codePtStr: str }
# else
# # TODO handle multibyte UTF-8 string by looking ahead in the list as needed
# # https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
# { index: index + 1, state: update state codePt codePtStr }
# answer.state
## Return a [List] of the string's [U8] UTF-8 [code units](https://unicode.org/glossary/#code_unit).
## (To split the string into a [List] of smaller [Str] values instead of [U8] values,
## see [Str.split].)

View file

@ -311,6 +311,7 @@ pub const STR_COUNT_SEGMENTS: &str = "roc_builtins.str.count_segments";
pub const STR_CONCAT: &str = "roc_builtins.str.concat";
pub const STR_JOIN_WITH: &str = "roc_builtins.str.joinWith";
pub const STR_STR_SPLIT_IN_PLACE: &str = "roc_builtins.str.str_split_in_place";
pub const STR_TO_SCALARS: &str = "roc_builtins.str.to_scalars";
pub const STR_COUNT_GRAPEHEME_CLUSTERS: &str = "roc_builtins.str.count_grapheme_clusters";
pub const STR_STARTS_WITH: &str = "roc_builtins.str.starts_with";
pub const STR_STARTS_WITH_CODE_PT: &str = "roc_builtins.str.starts_with_code_point";

View file

@ -873,6 +873,13 @@ pub fn types() -> MutMap<Symbol, (SolvedType, Region)> {
Box::new(str_type()),
);
// Str.toScalars : Str -> List U32
add_top_level_function_type!(
Symbol::STR_TO_SCALARS,
vec![str_type()],
Box::new(list_type(u32_type())),
);
// isEmpty : Str -> Bool
add_top_level_function_type!(
Symbol::STR_IS_EMPTY,

View file

@ -73,6 +73,7 @@ pub fn builtin_defs_map(symbol: Symbol, var_store: &mut VarStore) -> Option<Def>
BOOL_NOT => bool_not,
STR_CONCAT => str_concat,
STR_JOIN_WITH => str_join_with,
STR_TO_SCALARS => str_to_scalars,
STR_SPLIT => str_split,
STR_IS_EMPTY => str_is_empty,
STR_STARTS_WITH => str_starts_with,
@ -1677,6 +1678,26 @@ fn str_concat(symbol: Symbol, var_store: &mut VarStore) -> Def {
)
}
/// Str.toScalars : Str -> List U32
///
/// Builds the definition of `Str.toScalars` as a one-argument function whose
/// body is a `RunLowLevel` call to `LowLevel::StrToScalars`.
fn str_to_scalars(symbol: Symbol, var_store: &mut VarStore) -> Def {
    // One fresh variable for the `Str` argument and one for the `List U32` result.
    let str_var = var_store.fresh();
    let list_u32_var = var_store.fresh();

    let body = RunLowLevel {
        op: LowLevel::StrToScalars,
        args: vec![(str_var, Var(Symbol::ARG_1))],
        // The low-level op produces the list, so its return variable must be
        // the list variable (the same one handed to `defn` below), not the
        // variable used for the `Str` argument.
        ret_var: list_u32_var,
    };

    defn(
        symbol,
        vec![(str_var, Symbol::ARG_1)],
        var_store,
        body,
        list_u32_var,
    )
}
/// Str.joinWith : List Str, Str -> Str
fn str_join_with(symbol: Symbol, var_store: &mut VarStore) -> Def {
let list_str_var = var_store.fresh();

View file

@ -5415,6 +5415,14 @@ fn run_low_level<'a, 'ctx, 'env>(
call_str_bitcode_fn(env, &[list.into(), string], bitcode::STR_JOIN_WITH)
}
StrToScalars => {
// Str.toScalars : Str -> List U32
debug_assert_eq!(args.len(), 1);
let string = load_symbol(scope, &args[0]);
call_str_bitcode_fn(env, &[string], bitcode::STR_TO_SCALARS)
}
StrStartsWith => {
// Str.startsWith : Str, Str -> Bool
debug_assert_eq!(args.len(), 2);

View file

@ -217,6 +217,7 @@ impl<'a> LowLevelCall<'a> {
match self.lowlevel {
// Str
StrConcat => self.load_args_and_call_zig(backend, bitcode::STR_CONCAT),
StrToScalars => self.load_args_and_call_zig(backend, bitcode::STR_TO_SCALARS),
StrJoinWith => self.load_args_and_call_zig(backend, bitcode::STR_JOIN_WITH),
StrIsEmpty => match backend.storage.get(&self.arguments[0]) {
StoredValue::StackMemory { location, .. } => {

View file

@ -23,6 +23,7 @@ pub enum LowLevel {
StrTrimLeft,
StrTrimRight,
StrToNum,
StrToScalars,
ListLen,
ListWithCapacity,
ListGetUnsafe,
@ -193,6 +194,7 @@ impl LowLevelWrapperType {
match symbol {
Symbol::STR_CONCAT => CanBeReplacedBy(StrConcat),
Symbol::STR_TO_SCALARS => CanBeReplacedBy(StrToScalars),
Symbol::STR_JOIN_WITH => CanBeReplacedBy(StrJoinWith),
Symbol::STR_IS_EMPTY => CanBeReplacedBy(StrIsEmpty),
Symbol::STR_STARTS_WITH => CanBeReplacedBy(StrStartsWith),

View file

@ -1189,6 +1189,7 @@ define_builtins! {
31 STR_TO_I16: "toI16"
32 STR_TO_U8: "toU8"
33 STR_TO_I8: "toI8"
34 STR_TO_SCALARS: "toScalars"
}
5 LIST: "List" => {
0 LIST_LIST: "List" imported // the List.List type alias

View file

@ -896,7 +896,9 @@ pub fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[bool] {
// - arguments that we may want to update destructively must be Owned
// - other refcounted arguments are Borrowed
match op {
ListLen | StrIsEmpty | StrCountGraphemes => arena.alloc_slice_copy(&[borrowed]),
ListLen | StrIsEmpty | StrToScalars | StrCountGraphemes => {
arena.alloc_slice_copy(&[borrowed])
}
ListWithCapacity => arena.alloc_slice_copy(&[irrelevant]),
ListReplaceUnsafe => arena.alloc_slice_copy(&[owned, irrelevant, irrelevant]),
ListGetUnsafe => arena.alloc_slice_copy(&[borrowed, irrelevant]),