From 3b07bd35a487a3e3edd916721265cb8d9b47883f Mon Sep 17 00:00:00 2001 From: Luke Boswell Date: Wed, 26 Nov 2025 12:06:48 +1100 Subject: [PATCH 01/10] add Str.count_utf8_bytes builtin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/build/builtin_compiler/main.zig | 3 +++ src/build/roc/Builtin.roc | 1 + src/canonicalize/Expression.zig | 1 + src/eval/interpreter.zig | 17 ++++++++++++++ test/snapshots/repl/str_count_utf8_bytes.md | 25 +++++++++++++++++++++ 5 files changed, 47 insertions(+) create mode 100644 test/snapshots/repl/str_count_utf8_bytes.md diff --git a/src/build/builtin_compiler/main.zig b/src/build/builtin_compiler/main.zig index 5a5f74cb82..e885737761 100644 --- a/src/build/builtin_compiler/main.zig +++ b/src/build/builtin_compiler/main.zig @@ -140,6 +140,9 @@ fn replaceStrIsEmptyWithLowLevel(env: *ModuleEnv) !std.ArrayList(CIR.Def.Idx) { if (env.common.findIdent("Builtin.Str.drop_suffix")) |str_drop_suffix_ident| { try low_level_map.put(str_drop_suffix_ident, .str_drop_suffix); } + if (env.common.findIdent("Builtin.Str.count_utf8_bytes")) |str_count_utf8_bytes_ident| { + try low_level_map.put(str_count_utf8_bytes_ident, .str_count_utf8_bytes); + } if (env.common.findIdent("Builtin.List.len")) |list_len_ident| { try low_level_map.put(list_len_ident, .list_len); } diff --git a/src/build/roc/Builtin.roc b/src/build/roc/Builtin.roc index c2d7498635..634f376173 100644 --- a/src/build/roc/Builtin.roc +++ b/src/build/roc/Builtin.roc @@ -15,6 +15,7 @@ Builtin :: [].{ with_prefix : Str, Str -> Str drop_prefix : Str, Str -> Str drop_suffix : Str, Str -> Str + count_utf8_bytes : Str -> U64 } List(_item) :: [ProvidedByCompiler].{ diff --git a/src/canonicalize/Expression.zig b/src/canonicalize/Expression.zig index b4e1f813f8..82cd181929 100644 --- a/src/canonicalize/Expression.zig +++ b/src/canonicalize/Expression.zig @@ -415,6 
+415,7 @@ pub const Expr = union(enum) { str_with_prefix, str_drop_prefix, str_drop_suffix, + str_count_utf8_bytes, // Numeric to_str operations u8_to_str, diff --git a/src/eval/interpreter.zig b/src/eval/interpreter.zig index ee10ee22b3..431a1c6c1e 100644 --- a/src/eval/interpreter.zig +++ b/src/eval/interpreter.zig @@ -3091,6 +3091,23 @@ pub const Interpreter = struct { out.is_initialized = true; return out; }, + .str_count_utf8_bytes => { + // Str.count_utf8_bytes : Str -> U64 + std.debug.assert(args.len == 1); + + const string_arg = args[0]; + std.debug.assert(string_arg.ptr != null); + + const string: *const RocStr = @ptrCast(@alignCast(string_arg.ptr.?)); + const byte_count = builtins.str.countUtf8Bytes(string.*); + + const result_layout = layout.Layout.int(.u64); + var out = try self.pushRaw(result_layout, 0); + out.is_initialized = false; + try out.setInt(@intCast(byte_count)); + out.is_initialized = true; + return out; + }, .list_len => { // List.len : List(a) -> U64 // Note: listLen returns usize, but List.len always returns U64. 
diff --git a/test/snapshots/repl/str_count_utf8_bytes.md b/test/snapshots/repl/str_count_utf8_bytes.md new file mode 100644 index 0000000000..ba8272e34a --- /dev/null +++ b/test/snapshots/repl/str_count_utf8_bytes.md @@ -0,0 +1,25 @@ +# META +~~~ini +description=Str.count_utf8_bytes should return the number of bytes in the string +type=repl +~~~ +# SOURCE +~~~roc +» Str.count_utf8_bytes("") +» Str.count_utf8_bytes("hello") +» Str.count_utf8_bytes("hello world") +» Str.count_utf8_bytes("é") +» Str.count_utf8_bytes("🎉") +~~~ +# OUTPUT +0 +--- +5 +--- +11 +--- +2 +--- +4 +# PROBLEMS +NIL From 72e97ea431d1a2d0f954eb34ac844f3c2bcf6282 Mon Sep 17 00:00:00 2001 From: Luke Boswell Date: Wed, 26 Nov 2025 12:09:59 +1100 Subject: [PATCH 02/10] add Str.with_capacity builtin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- old-string-implementations.md | 890 +++++++++++++++++++++++ src/build/builtin_compiler/main.zig | 3 + src/build/roc/Builtin.roc | 1 + src/canonicalize/Expression.zig | 1 + src/eval/interpreter.zig | 20 + test/fx/Parser.roc | 19 + test/snapshots/repl/str_with_capacity.md | 19 + 7 files changed, 953 insertions(+) create mode 100644 old-string-implementations.md create mode 100644 test/fx/Parser.roc create mode 100644 test/snapshots/repl/str_with_capacity.md diff --git a/old-string-implementations.md b/old-string-implementations.md new file mode 100644 index 0000000000..6470c027d0 --- /dev/null +++ b/old-string-implementations.md @@ -0,0 +1,890 @@ + +## [Str](/) + +Strings represent text. For example, `"Hi!"` is a string. + +This guide starts at a high level and works down to the in-memory representation of strings and their [performance characteristics](#performance). 
For reasons that will be explained later in this guide, some string operations are in the `Str` module while others (notably [capitalization](#capitalization), [code points](#code-points), [graphemes](#graphemes), and sorting) are in separate packages. There's also a list of recommendations for [when to use code points, graphemes, and UTF-8](#when-to-use). + +## Syntax + +The most common way to represent strings is using quotation marks: + +Copy + +"Hello, World!" + +Using this syntax, the whole string must go on one line. You can write multiline strings using triple quotes: + +Copy + +text = + """ + In memory, this string will not have any spaces + at its start. That's because the first line + starts at the same indentation level as the + opening quotation mark. Actually, none of these + lines will be indented. + + However, this line will be indented! + """ + +In triple-quoted strings, both the opening and closing `"""` must be at the same indentation level. Lines in the string begin at that indentation level; the spaces that indent the multiline string itself are not considered content. + +### Interpolation + +_String interpolation_ is syntax for inserting a string into another string. + +Copy + +name = "Sam" + +"Hi, my name is ${name}!" + +This will evaluate to the string `"Hi, my name is Sam!"` + +You can put any expression you like inside the parentheses, as long as it's all on one line: + +Copy + +colors = \["red", "green", "blue"\] + +"The colors are ${colors |> Str.join\_with(", ")}!" + +Interpolation can be used in multiline strings, but the part inside the parentheses must still be on one line. 
+ +### Escapes + +There are a few special escape sequences in strings: + +* `\n` becomes a [newline](https://en.wikipedia.org/wiki/Newline) +* `\r` becomes a [carriage return](https://en.wikipedia.org/wiki/Carriage_return#Computers) +* `\t` becomes a [tab](https://en.wikipedia.org/wiki/Tab_key#Tab_characters) +* `\"` becomes a normal `"` (this lets you write `"` inside a single-line string) +* `\\` becomes a normal `\` (this lets you write `\` without it being treated as an escape) +* `\$` becomes a normal `$` (this lets you write `$` followed by `(` without it being treated as [interpolation](#interpolation)) + +These work in both single-line and multiline strings. We'll also discuss another escape later, for inserting [Unicode code points](#code-points) into a string. + +### Single quote syntax + +Try putting `'👩'` into `roc repl`. You should see this: + +Copy + +» '👩' + +128105 : Int \* + +The single-quote `'` syntax lets you represent a Unicode code point (discussed in the next section) in source code, in a way that renders as the actual text it represents rather than as a number literal. This lets you see what it looks like in the source code rather than looking at a number. + +At runtime, the single-quoted value will be treated the same as an ordinary number literal—in other words, `'👩'` is syntax sugar for writing `128105`. You can verify this in `roc repl`: + +Copy + +» '👩' == 128105 + +Bool.true : Bool + +Double quotes (`"`), on the other hand, are not type-compatible with integers—not only because strings can be empty (`""` is valid, but `''` is not) but also because there may be more than one code point involved in any given string! 
+ +There are also some special escape sequences in single-quote strings: + +* `\n` becomes a [newline](https://en.wikipedia.org/wiki/Newline) +* `\r` becomes a [carriage return](https://en.wikipedia.org/wiki/Carriage_return#Computers) +* `\t` becomes a [tab](https://en.wikipedia.org/wiki/Tab_key#Tab_characters) +* `\'` becomes a normal `'` (this lets you write `'` inside a single-quote string) +* `\\` becomes a normal `\` (this lets you write `\` without it being treated as an escape) + +Most often this single-quote syntax is used when writing parsers; most Roc programs never use it at all. + +## Unicode + +Roc strings represent text using [Unicode](https://unicode.org) This guide will provide only a basic overview of Unicode (the [Unicode glossary](http://www.unicode.org/glossary/) has over 500 entries in it), but it will include the most relevant differences between these concepts: + +* Code points +* Graphemes +* UTF-8 + +It will also explain why some operations are included in Roc's builtin [Str](https://www.roc-lang.org/builtins/Str) module, and why others are in separate packages like [roc-lang/unicode](https://github.com/roc-lang/unicode). + +### Graphemes + +Let's start with the following string: + +`"👩‍👩‍👦‍👦"` + +Some might call this a "character." After all, in a monospace font, it looks to be about the same width as the letter "A" or the punctuation mark "!"—both of which are commonly called "characters." Unfortunately, the term "character" in programming has changed meanings many times across the years and across programming languages, and today it's become a major source of confusion. + +Unicode uses the less ambiguous term [_grapheme_](https://www.unicode.org/glossary/#grapheme), which it defines as a "user-perceived character" (as opposed to one of the several historical ways the term "character" has been used in programming) or, alternatively, "A minimally distinctive unit of writing in the context of a particular writing system." 
+ +By Unicode's definition, each of the following is an individual grapheme: + +* `a` +* `鹏` +* `👩‍👩‍👦‍👦` + +Note that although _grapheme_ is less ambiguous than _character_, its definition is still open to interpretation. To address this, Unicode has formally specified [text segmentation rules](https://www.unicode.org/reports/tr29/) which define grapheme boundaries in precise technical terms. We won't get into those rules here, but since they can change with new Unicode releases, functions for working with graphemes are in the [roc-lang/unicode](https://github.com/roc-lang/unicode) package rather than in the builtin [`Str`](https://www.roc-lang.org/builtins/Str) module. This allows them to be updated without being blocked on a new release of the Roc language. + +### Code Points + +Every Unicode text value can be broken down into [Unicode code points](http://www.unicode.org/glossary/#code_point), which are integers between `0` and `1_114_111` that describe components of the text. In memory, every Roc string is a sequence of these integers stored in a format called UTF-8, which will be discussed [later](#utf8). + +The string `"👩‍👩‍👦‍👦"` happens to be made up of these code points: + +Copy + +\[128105, 8205, 128105, 8205, 128102, 8205, 128102\] + +From this we can see that: + +* One grapheme can be made up of multiple code points. In fact, there is no upper limit on how many code points can go into a single grapheme! (Some programming languages use the term "character" to refer to individual code points; this can be confusing for graphemes like 👩‍👩‍👦‍👦 because it visually looks like "one character" but no single code point can represent it.) +* Sometimes code points repeat within an individual grapheme. Here, 128105 repeats twice, as does 128102, and there's an 8205 in between each of the other code points. + +### Combining Code Points + +The reason every other code point in 👩‍👩‍👦‍👦 is 8205 is that code point 8205 joins together other code points.
This emoji, known as ["Family: Woman, Woman, Boy, Boy"](https://emojipedia.org/family-woman-woman-boy-boy), is made by combining several emoji using [zero-width joiners](https://emojipedia.org/zero-width-joiner)—which are represented by code point 8205 in memory, and which have no visual representation on their own. + +Here are those code points again, this time with comments about what they represent: + +Copy + +\[128105\] # "👩" +\[8205\] # (joiner) +\[128105\] # "👩" +\[8205\] # (joiner) +\[128102\] # "👦" +\[8205\] # (joiner) +\[128102\] # "👦" + +One way to read this is "woman emoji joined to woman emoji joined to boy emoji joined to boy emoji." Without the joins, it would be: + +Copy + +"👩👩👦👦" + +With the joins, however, it is instead: + +Copy + +"👩‍👩‍👦‍👦" + +Even though 👩‍👩‍👦‍👦 is visually smaller when rendered, it takes up almost twice as much memory as 👩👩👦👦 does! That's because it has all the same code points, plus the zero-width joiners in between them. + +### String equality and normalization + +Besides emoji like 👩‍👩‍👦‍👦, another classic example of multiple code points being combined to render as one grapheme has to do with accent marks. Try putting these two strings into `roc repl`: + +Copy + +"caf\\u(e9)" +"cafe\\u(301)" + +The `\u(e9)` syntax is a way of inserting code points into string literals. In this case, it's the same as inserting the hexadecimal number `0xe9` as a code point onto the end of the string `"caf"`. Since Unicode code point `0xe9` happens to be `é`, the string `"caf\u(e9)"` ends up being identical in memory to the string `"café"`. + +We can verify this too: + +Copy + +» "caf\\u(e9)" == "café" + +Bool.true : Bool + +As it turns out, `"cafe\u(301)"` is another way to represent the same word. The Unicode code point 0x301 represents a ["combining acute accent"](https://unicodeplus.com/U+0301)—which essentially means that it will add an accent mark to whatever came before it.
In this case, since `"cafe\u(301)"` has an `e` before the `"\u(301)"`, that `e` ends up with an accent mark on it and becomes `é`. + +Although these two strings get rendered identically to one another, they are different in memory because their code points are different! We can also confirm this in `roc repl`: + +Copy + +» "caf\\u(e9)" == "cafe\\u(301)" + +Bool.false : Bool + +As you can imagine, this can be a source of bugs. Not only are they considered unequal, they also hash differently, meaning `"caf\u(e9)"` and `"cafe\u(301)"` can both be separate entries in the same [`Set`](https://www.roc-lang.org/builtins/Set). + +One way to prevent problems like these is to perform [Unicode normalization](https://www.unicode.org/reports/tr15/), a process which converts conceptually equivalent strings (like `"caf\u(e9)"` and `"cafe\u(301)"`) into one canonical in-memory representation. This makes equality checks on them pass, among other benefits. + +It would be technically possible for Roc to perform string normalization automatically on every equality check. Unfortunately, although some programs might want to treat `"caf\u(e9)"` and `"cafe\u(301)"` as equivalent, for other programs it might actually be important to be able to tell them apart. If these equality checks always passed, then there would be no way to tell them apart! + +As such, normalization must be performed explicitly when desired. Like graphemes, Unicode normalization rules can change with new releases of Unicode. As such, these functions are in separate packages instead of builtins (normalization is planned to be in [roc-lang/unicode](https://github.com/roc-lang/unicode) in the future, but it has not yet been implemented) so that updates to these functions based on new Unicode releases can happen without waiting on new releases of the Roc language. + +### Capitalization + +We've already seen two examples of Unicode definitions that can change with new Unicode releases: graphemes and normalization. 
Another is capitalization; these rules can change with new Unicode releases (most often in the form of additions of new languages, but breaking changes to capitalization rules for existing languages are also possible), and so they are not included in builtin [`Str`](https://www.roc-lang.org/builtins/Str). + +This might seem particularly surprising, since capitalization functions are commonly included in standard libraries. However, it turns out that "capitalizing an arbitrary string" is impossible to do correctly without additional information. + +For example, what is the capitalized version of this string? + +Copy + +"i" + +* In English, the correct answer is `"I"`. +* In Turkish, the correct answer is `"İ"`. + +Similarly, the correct lowercased version of the string `"I"` is `"i"` in English and `"ı"` in Turkish. + +Turkish is not the only language to use this [dotless i](https://en.wikipedia.org/wiki/Dotless_I), and it's an example of how a function which capitalizes strings cannot give correct answers without the additional information of which language's capitalization rules should be used. + +Many languages defer to the operating system's [localization](https://en.wikipedia.org/wiki/Internationalization_and_localization) settings for this information. In that design, calling a program's capitalization function with an input string of `"i"` might give an answer of `"I"` on one machine and `"İ"` on a different machine, even though it was the same program running on both systems. Naturally, this can cause bugs—but more than that, writing tests to prevent bugs like this usually requires extra complexity compared to writing ordinary tests. + +In general, Roc programs should give the same answers for the same inputs even when run on different machines. There are exceptions to this (e.g. 
a program running out of system resources on one machine, while being able to make more progress on a machine that has more resources), but operating system's language localization is not among them. + +For these reasons, capitalization functions are not in [`Str`](https://www.roc-lang.org/builtins/Str). There is a planned `roc-lang` package to handle use cases like capitalization and sorting—sorting can also vary by language as well as by things like country—but implementation work has not yet started on this package. + +### UTF-8 + +Earlier, we discussed how Unicode code points can be described as [`U32`](https://www.roc-lang.org/builtins/Num#U32) integers. However, many common code points are very low integers, and can fit into a `U8` instead of needing an entire `U32` to represent them in memory. UTF-8 takes advantage of this, using a variable-width encoding to represent code points in 1-4 bytes, which saves a lot of memory in the typical case—especially compared to [UTF-16](https://en.wikipedia.org/wiki/UTF-16), which always uses at least 2 bytes to represent each code point, or [UTF-32](https://en.wikipedia.org/wiki/UTF-32), which always uses the maximum 4 bytes. + +This guide won't cover all the details of UTF-8, but the basic idea is this: + +* If a code point is 127 or lower, UTF-8 stores it in 1 byte. +* If it's between 128 and 2047, UTF-8 stores it in 2 bytes. +* If it's between 2048 and 65535, UTF-8 stores it in 3 bytes. +* If it's higher than that, UTF-8 stores it in 4 bytes. + +The specific [UTF-8 encoding](https://en.wikipedia.org/wiki/UTF-8#Encoding) of these bytes involves using 1 to 5 bits of each byte for metadata about multi-byte sequences. + +A valuable feature of UTF-8 is that it is backwards-compatible with the [ASCII](https://en.wikipedia.org/wiki/ASCII) encoding that was widely used for many years. ASCII existed before Unicode did, and only used the integers 0 to 127 to represent its equivalent of code points. 
The Unicode code points 0 to 127 represent the same semantic information as ASCII, (e.g. the number 65 represents the letter "A" in both ASCII and in Unicode), and since UTF-8 represents code points 0 to 127 using one byte, all valid ASCII strings can be successfully parsed as UTF-8 without any need for conversion. + +Since many textual computer encodings—including [CSV](https://en.wikipedia.org/wiki/CSV), [XML](https://en.wikipedia.org/wiki/XML), and [JSON](https://en.wikipedia.org/wiki/JSON)—do not use any code points above 127 for their delimiters, it is often possible to write parsers for these formats using only `Str` functions which present UTF-8 as raw `U8` sequences, such as [`Str.walk_utf8`](https://www.roc-lang.org/builtins/Str#walk_utf8) and [`Str.to_utf8`](https://www.roc-lang.org/builtins/Str#to_utf8). In the typical case where they do not need to parse out individual Unicode code points, they can get everything they need from `Str` UTF-8 functions without needing to depend on other packages. + +### When to use code points, graphemes, and UTF-8 + +Deciding when to use code points, graphemes, and UTF-8 can be nonobvious to say the least! + +The way Roc organizes the `Str` module and supporting packages is designed to help answer this question. Every situation is different, but the following rules of thumb are typical: + +* Most often, using `Str` values along with helper functions like [`split_on`](https://www.roc-lang.org/builtins/Str#split_on), [`join_with`](https://www.roc-lang.org/builtins/Str#join_with), and so on, is the best option. +* If you are specifically implementing a parser, working in UTF-8 bytes is usually the best option. So functions like [`walk_utf8`](https://www.roc-lang.org/builtins/Str#walk_utf8), [to\_utf8](https://www.roc-lang.org/builtins/Str#to_utf8), and so on. (Note that single-quote literals produce number literals, so ASCII-range literals like `'a'` give an integer literal that works with a UTF-8 `U8`.)
+* If you are implementing a Unicode library like [roc-lang/unicode](https://github.com/roc-lang/unicode), working in terms of code points will be unavoidable. Aside from basic readability considerations like `\u(...)` in string literals, if you have the option to avoid working in terms of code points, it is almost always correct to avoid them. +* If it seems like a good idea to split a string into "characters" (graphemes), you should definitely stop and reconsider whether this is really the best design. Almost always, doing this is some combination of more error-prone or slower (usually both) than doing something else that does not require taking graphemes into consideration. + +For this reason (among others), grapheme functions live in [roc-lang/unicode](https://github.com/roc-lang/unicode) rather than in [`Str`](https://www.roc-lang.org/builtins/Str). They are more niche than they seem, so they should not be reached for all the time! + +## Performance + +This section deals with how Roc strings are represented in memory, and their performance characteristics. + +A normal heap-allocated Roc `Str` is represented on the stack as: + +* A "capacity" unsigned integer, which represents how many bytes are allocated on the heap to hold the string's contents. +* A "length" unsigned integer, which represents how many of the "capacity" bytes are actually in use. (A `Str` can have more bytes allocated on the heap than are actually in use.) +* The memory address of the first byte in the string's actual contents. + +Each of these three fields is the same size: 64 bits on a 64-bit system, and 32 bits on a 32-bit system. The actual contents of the string are stored in one contiguous sequence of bytes, encoded as UTF-8, often on the heap but sometimes elsewhere—more on this later. Empty strings do not have heap allocations, so an empty `Str` on a 64-bit system still takes up 24 bytes on the stack (due to its three 64-bit fields).
+ +### Reference counting and opportunistic mutation + +Like lists, dictionaries, and sets, Roc strings are automatically reference-counted and can benefit from opportunistic in-place mutation. The reference count is stored on the heap immediately before the first byte of the string's contents, and it has the same size as a memory address. This means it can count so high that it's impossible to write a Roc program which overflows a reference count, because having that many simultaneous references (each of which is a memory address) would have exhausted the operating system's address space first. + +When the string's reference count is 1, functions like [`Str.concat`](https://www.roc-lang.org/builtins/Str#concat) and [`Str.replace_each`](https://www.roc-lang.org/builtins/Str#replace_each) mutate the string in-place rather than allocating a new string. This preserves semantic immutability because it is unobservable in terms of the operation's output; if the reference count is 1, it means that memory would have otherwise been deallocated immediately anyway, and it's more efficient to reuse it instead of deallocating it and then immediately making a new allocation. + +The contents of statically-known strings (today that means string literals) are stored in the readonly section of the binary, so they do not need heap allocations or reference counts. They are not eligible for in-place mutation, since mutating the readonly section of the binary would cause an operating system [access violation](https://en.wikipedia.org/wiki/Segmentation_fault). + +### Small String Optimization + +Roc uses a "small string optimization" when representing certain strings in memory. + +If you have a sufficiently long string, then on a 64-bit system it will be represented on the stack using 24 bytes, and on a 32-bit system it will take 12 bytes—plus however many bytes are in the string itself—on the heap. 
However, if there is a string shorter than either of these stack sizes (so, a string of up to 23 bytes on a 64-bit system, and up to 11 bytes on a 32-bit system), then that string will be stored entirely on the stack rather than having a separate heap allocation at all. + +This can be much more memory-efficient! However, `List` does not have this optimization (it has some runtime cost, and in the case of `List` it's not anticipated to come up nearly as often), which means when converting a small string to `List U8` it can result in a heap allocation. + +Note that this optimization is based entirely on how many UTF-8 bytes the string takes up in memory. It doesn't matter how many [graphemes](#graphemes), [code points](#code-points) or anything else it has; the only factor that determines whether a particular string is eligible for the small string optimization is the number of UTF-8 bytes it takes up in memory! + +### Seamless Slices + +Try putting this into `roc repl`: + +Copy + +» "foo/bar/baz" |> Str.split\_on("/") + +\["foo", "bar", "baz"\] : List Str + +All of these strings are small enough that the [small string optimization](#small) will apply, so none of them will be allocated on the heap. + +Now let's suppose they were long enough that this optimization no longer applied: + +Copy + +» "a much, much, much, much/longer/string compared to the last one!" |> Str.split\_on "/" + +\["a much, much, much, much", "longer", "string compared to the last one!"\] : List Str + +Here, the only strings small enough for the small string optimization are `"/"` and `"longer"`. They will be allocated on the stack. + +The first and last strings in the returned list `"a much, much, much, much"` and `"string compared to the last one!"` will not be allocated on the heap either. Instead, they will be _seamless slices_, which means they will share memory with the original input string. + +* `"a much, much, much, much"` will share the first 24 bytes of the original string. 
+* `"string compared to the last one!"` will share the last 32 bytes of the original string. + +All of these strings are semantically immutable, so sharing these bytes is an implementation detail that should only affect performance. By design, there is no way at either compile time or runtime to tell whether a string is a seamless slice. This allows the optimization's behavior to change in the future without affecting Roc programs' semantic behavior. + +Seamless slices create additional references to the original string, which make it ineligible for opportunistic mutation (along with the slices themselves; slices are never eligible for mutation), and which also make it take longer before the original string can be deallocated. A case where this might be noticeable in terms of performance would be: + +1. A function takes a very large string as an argument and returns a much smaller slice into that string. +2. The smaller slice is used for a long time in the program, whereas the much larger original string stops being used. +3. In this situation, it might have been better for total program memory usage (although not necessarily overall performance) if the original large string could have been deallocated sooner, even at the expense of having to copy the smaller string into a new allocation instead of reusing the bytes with a seamless slice. + +If a situation like this comes up, a slice can be turned into a separate string by using [`Str.concat`](https://www.roc-lang.org/builtins/Str#concat) to concatenate the slice onto an empty string (or one created with [`Str.with_capacity`](https://www.roc-lang.org/builtins/Str#with_capacity)). + +Currently, the only way to get seamless slices of strings is by calling certain `Str` functions which return them. In general, `Str` functions which accept a string and return a subset of that string tend to do this. [`Str.trim`](https://www.roc-lang.org/builtins/Str#trim) is another example of a function which returns a seamless slice. 
+ +### [](Str#Utf8Problem)`Utf8Problem : [ InvalidStartByte, UnexpectedEndOfSequence, ExpectedContinuation, OverlongEncoding, CodepointTooLarge, EncodesSurrogateHalf ]` + +### [](Str#is_empty)`is_empty : Str -> Bool` + +Returns [`Bool.true`](/builtins/alpha4/Bool#true "Docs for Bool.true") if the string is empty, and [`Bool.false`](/builtins/alpha4/Bool#false "Docs for Bool.false") otherwise. + +Copy + +expect Str.is\_empty("hi!") == Bool.false +expect Str.is\_empty("") == Bool.true + +### [](Str#concat)`concat : Str, Str -> Str` + +Concatenates two strings together. + +Copy + +expect Str.concat("ab", "cd") == "abcd" +expect Str.concat("hello", "") == "hello" +expect Str.concat("", "") == "" + +### [](Str#with_capacity)`with_capacity : U64 -> Str` + +Returns a string of the specified capacity without any content. + +This is a performance optimization tool that's like calling [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") on an empty string. It's useful when you plan to build up a string incrementally, for example by calling [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") on it: + +Copy + +greeting = "Hello and welcome to Roc" +subject = "Awesome Programmer" + +# Evaluates to "Hello and welcome to Roc, Awesome Programmer!" +hello\_world = + Str.with\_capacity(45) + |> Str.concat(greeting) + |> Str.concat(", ") + |> Str.concat(subject) + |> Str.concat("!") + +In general, if you plan to use [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") on an empty string, it will be faster to start with [`Str.with_capacity`](/builtins/alpha4/Str#with_capacity "Docs for Str.with_capacity") than with `""`. Even if you don't know the exact capacity of the string, giving [`with_capacity`](/builtins/alpha4/Str#with_capacity "Docs for Str.with_capacity") a higher value than ends up being necessary can help prevent reallocation and copying—at the cost of using more memory than is necessary. 
+ +For more details on how the performance optimization works, see [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve"). + +### [](Str#reserve)`reserve : Str, U64 -> Str` + +Increase a string's capacity by at least the given number of additional bytes. + +This can improve the performance of string concatenation operations like [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") by allocating extra capacity up front, which can prevent the need for reallocations and copies. Consider the following example which does not use [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve"): + +Copy + +greeting = "Hello and welcome to Roc" +subject = "Awesome Programmer" + +# Evaluates to "Hello and welcome to Roc, Awesome Programmer!" +hello\_world = + greeting + |> Str.concat(", ") + |> Str.concat(subject) + |> Str.concat("!") + +In this example: + +1. We start with `greeting`, which has both a length and capacity of 24 (bytes). +2. `|> Str.concat ", "` will see that there isn't enough capacity to add 2 more bytes for the `", "`, so it will create a new heap allocation with enough bytes to hold both. (This probably will be more than 7 bytes, because when [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") functions reallocate, they apply a multiplier to the exact capacity required. This makes it less likely that future reallocations will be needed. The multiplier amount is not specified, because it may change in future releases of Roc, but it will likely be around 1.5 to 2 times the exact capacity required.) Then it will copy the current bytes (`"Hello"`) into the new allocation, and finally concatenate the `", "` into the new allocation. The old allocation will then be deallocated because it's no longer referenced anywhere in the program. +3. `|> Str.concat subject` will again check if there is enough capacity in the string.
If it doesn't find enough capacity once again, it will make a third allocation, copy the existing bytes (`"Hello and welcome to Roc, "`) into that third allocation, and then deallocate the second allocation because it's already no longer being referenced anywhere else in the program. (It may find enough capacity in this particular case, because the previous [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") allocated something like 1.5 to 2 times the necessary capacity in order to anticipate future concatenations like this...but if something longer than `"Awesome Programmer"` were being concatenated here, it might still require further reallocation and copying.) +4. `|> Str.concat "!"` will repeat this process once more. + +This process can have significant performance costs due to multiple reallocation of new strings, copying between old strings and new strings, and deallocation of immediately obsolete strings. + +Here's a modified example which uses [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") to eliminate the need for all that reallocation, copying, and deallocation. + +Copy + +hello\_world = + greeting + |> Str.reserve(21) + |> Str.concat(", ") + |> Str.concat(subject) + |> Str.concat("!") + +In this example: + +1. We again start with `greeting`, which has both a length and capacity of 24 bytes. +2. `|> Str.reserve(21)` will ensure that there is enough capacity in the string for an additional 21 bytes (to make room for `", "`, `"Awesome Programmer"`, and `"!"`). Since the current capacity is only 24, it will create a new 45-byte (24 + 21) heap allocation and copy the contents of the existing allocation (`greeting`) into it. +3. `|> Str.concat(", ")` will concatenate `", "` to the string. No reallocation, copying, or deallocation will be necessary, because the string already has a capacity of 45 bytes, and `greeting` will only use 24 of them. +4. `|> Str.concat(subject)` will concatenate `subject` (`"Awesome Programmer"`) to the string. 
Again, no reallocation, copying, or deallocation will be necessary. +5. `|> Str.concat "!\n"` will concatenate `"!\n"` to the string, still without any reallocation, copying, or deallocation. + +Here, [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") prevented multiple reallocations, copies, and deallocations during the [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") calls. Notice that it did perform a heap allocation before any [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") calls were made, which means that using [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") is not free! You should only use it if you actually expect to make use of the extra capacity. + +Ideally, you'd be able to predict exactly how many extra bytes of capacity will be needed, but this may not always be knowable. When you don't know exactly how many bytes to reserve, you can often get better performance by choosing a number of bytes that's too high, because a number that's too low could lead to reallocations. There's a limit to this, of course; if you always give it ten times what it turns out to need, that could prevent reallocations but will also waste a lot of memory! + +If you plan to use [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") on an empty string, it's generally better to use [`Str.with_capacity`](/builtins/alpha4/Str#with_capacity "Docs for Str.with_capacity") instead. + +### [](Str#join_with)`join_with : List Str, Str -> Str` + +Combines a [`List`](/builtins/alpha4/List#List "Docs for List.List") of strings into a single string, with a separator string in between each. + +Copy + +expect Str.join\_with(\["one", "two", "three"\], ", ") == "one, two, three" +expect Str.join\_with(\["1", "2", "3", "4"\], ".") == "1.2.3.4" + +### [](Str#split_on)`split_on : Str, Str -> List Str` + +Split a string around a separator. 
+ +Passing `""` for the separator is not useful; it returns the original string wrapped in a [`List`](/builtins/alpha4/List#List "Docs for List.List"). + +Copy + +expect Str.split\_on("1,2,3", ",") == \["1","2","3"\] +expect Str.split\_on("1,2,3", "") == \["1,2,3"\] + +### [](Str#repeat)`repeat : Str, U64 -> Str` + +Repeats a string the given number of times. + +Copy + +expect Str.repeat("z", 3) == "zzz" +expect Str.repeat("na", 8) == "nananananananana" + +Returns `""` when given `""` for the string or `0` for the count. + +Copy + +expect Str.repeat("", 10) == "" +expect Str.repeat("anything", 0) == "" + +### [](Str#len)`len : Str -> [LearnAboutStringsInRoc Str]` + +A stub function to help people discover [how they should handle this in Roc](https://www.roc-lang.org/faq.html#strings-in-roc). + +### [](Str#to_utf8)`to_utf8 : Str -> List U8` + +Returns a [`List`](/builtins/alpha4/List#List "Docs for List.List") of the string's [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") UTF-8 [code units](https://unicode.org/glossary/#code_unit). (To split the string into a [`List`](/builtins/alpha4/List#List "Docs for List.List") of smaller [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") values instead of [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") values, see [`Str.split_on`](/builtins/alpha4/Str#split_on "Docs for Str.split_on").) + +Copy + +expect Str.to\_utf8("Roc") == \[82, 111, 99\] +expect Str.to\_utf8("鹏") == \[233, 185, 143\] +expect Str.to\_utf8("சி") == \[224, 174, 154, 224, 174, 191\] +expect Str.to\_utf8("🐦") == \[240, 159, 144, 166\] + +### [](Str#from_utf8)`from_utf8 : List U8 -> Result Str [ BadUtf8 { problem : Utf8Problem, index : U64 } ]` + +Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") UTF-8 [code units](https://unicode.org/glossary/#code_unit) to a string. + +Returns `Err` if the given bytes are invalid UTF-8, and returns `Ok ""` when given `[]`. 
+ +Copy + +expect Str.from\_utf8(\[82, 111, 99\]) == Ok("Roc") +expect Str.from\_utf8(\[233, 185, 143\]) == Ok("鹏") +expect Str.from\_utf8(\[224, 174, 154, 224, 174, 191\]) == Ok("சி") +expect Str.from\_utf8(\[240, 159, 144, 166\]) == Ok("🐦") +expect Str.from\_utf8(\[\]) == Ok("") +expect Str.from\_utf8(\[255\]) |> Result.is\_err + +### [](Str#from_utf8_lossy)`from_utf8_lossy : List U8 -> Str` + +Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") UTF-8 [code units](https://unicode.org/glossary/#code_unit) to a string. Any grouping of invalid byte sequences are replaced with a single unicode replacement character '�'. + +An invalid byte sequence is defined as + +* a 2-byte-sequence starting byte, followed by less than 1 continuation byte +* a 3-byte-sequence starting byte, followed by less than 2 continuation bytes +* a 4-byte-sequence starting byte, followed by less than 3 continuation bytes +* an invalid codepoint from the surrogate pair block +* an invalid codepoint greater than 0x110000 encoded as a 4-byte sequence +* any valid codepoint encoded as an incorrect sequence, for instance a codepoint that should be a 2-byte sequence encoded as a 3- or 4-byte sequence + +Copy + +expect (Str.from\_utf8\_lossy \[82, 111, 99, 240, 159, 144, 166\]) == "Roc🐦" +expect (Str.from\_utf8\_lossy \[82, 255, 99\]) == "R�c" +expect (Str.from\_utf8\_lossy \[82, 0xED, 0xA0, 0xBD, 99\]) == "R�c" + +### [](Str#from_utf16)`from_utf16 : List U16 -> Result Str [ BadUtf16 { problem : Utf8Problem, index : U64 } ]` + +Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U16`](/builtins/alpha4/Num#U16 "Docs for Num.U16") UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string. 
+ +Copy + +expect Str.from\_utf16(\[82, 111, 99\]) == Ok("Roc") +expect Str.from\_utf16(\[0xb9a, 0xbbf\]) == Ok("சி") +expect Str.from\_utf16(\[0xd83d, 0xdc26\]) == Ok("🐦") +expect Str.from\_utf16(\[\]) == Ok("") +# unpaired surrogates, first and second halves +expect Str.from\_utf16(\[82, 0xd83d, 99\]) |> Result.is\_err +expect Str.from\_utf16(\[82, 0xdc96, 99\]) |> Result.is\_err + +### [](Str#from_utf16_lossy)`from_utf16_lossy : List U16 -> Str` + +Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U16`](/builtins/alpha4/Num#U16 "Docs for Num.U16") UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string. Any unpaired surrogate code unit is replaced with a single unicode replacement character '�'. + +Copy + +expect Str.from\_utf16\_lossy(\[82, 111, 99, 0xd83d, 0xdc26\]) == "Roc🐦" +expect Str.from\_utf16\_lossy(\[82, 0xdc96, 99\]) == "R�c" + +### [](Str#from_utf32)`from_utf32 : List U32 -> Result Str [ BadUtf32 { problem : Utf8Problem, index : U64 } ]` + +### [](Str#from_utf32_lossy)`from_utf32_lossy : List U32 -> Str` + +Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U32`](/builtins/alpha4/Num#U32 "Docs for Num.U32") UTF-32 [code units](https://unicode.org/glossary/#code_unit) to a string. Any invalid code points are replaced with a single unicode replacement character '�'. + +Copy + +expect Str.from\_utf32\_lossy(\[82, 111, 99, 0x1f426\]) == "Roc🐦" +expect Str.from\_utf32\_lossy(\[82, 0x110000, 99\]) == "R�c" + +### [](Str#starts_with)`starts_with : Str, Str -> Bool` + +Check if the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") starts with a value. + +Copy + +expect Str.starts\_with("ABC", "A") == Bool.true +expect Str.starts\_with("ABC", "X") == Bool.false + +### [](Str#ends_with)`ends_with : Str, Str -> Bool` + +Check if the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") ends with a value. 
+ +Copy + +expect Str.ends\_with("ABC", "C") == Bool.true +expect Str.ends\_with("ABC", "X") == Bool.false + +### [](Str#trim)`trim : Str -> Str` + +Return the [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with all whitespace removed from both the beginning as well as the end. + +Copy + +expect Str.trim(" Hello \\n\\n") == "Hello" + +### [](Str#trim_start)`trim_start : Str -> Str` + +Return the [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with all whitespace removed from the beginning. + +Copy + +expect Str.trim\_start(" Hello \\n\\n") == "Hello \\n\\n" + +### [](Str#trim_end)`trim_end : Str -> Str` + +Return the [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with all whitespace removed from the end. + +Copy + +expect Str.trim\_end(" Hello \\n\\n") == " Hello" + +### [](Str#to_dec)`to_dec : Str -> Result Dec [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a [`Dec`](/builtins/alpha4/Num#Dec "Docs for Num.Dec"). A [`Dec`](/builtins/alpha4/Num#Dec "Docs for Num.Dec") value is a 128-bit decimal [fixed-point number](https://en.wikipedia.org/wiki/Fixed-point_arithmetic). + +Copy + +expect Str.to\_dec("10") == Ok(10dec) +expect Str.to\_dec("-0.25") == Ok(\-0.25dec) +expect Str.to\_dec("not a number") == Err(InvalidNumStr) + +### [](Str#to_f64)`to_f64 : Str -> Result F64 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a [`F64`](/builtins/alpha4/Num#F64 "Docs for Num.F64"). A [`F64`](/builtins/alpha4/Num#F64 "Docs for Num.F64") value is a 64-bit [floating-point number](https://en.wikipedia.org/wiki/IEEE_754) and can be specified with a `f64` suffix. 
+ +Copy + +expect Str.to\_f64("0.10") == Ok(0.10f64) +expect Str.to\_f64("not a number") == Err(InvalidNumStr) + +### [](Str#to_f32)`to_f32 : Str -> Result F32 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a [`F32`](/builtins/alpha4/Num#F32 "Docs for Num.F32"). A [`F32`](/builtins/alpha4/Num#F32 "Docs for Num.F32") value is a 32-bit [floating-point number](https://en.wikipedia.org/wiki/IEEE_754) and can be specified with a `f32` suffix. + +Copy + +expect Str.to\_f32("0.10") == Ok(0.10f32) +expect Str.to\_f32("not a number") == Err(InvalidNumStr) + +### [](Str#to_u128)`to_u128 : Str -> Result U128 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U128`](/builtins/alpha4/Num#U128 "Docs for Num.U128") integer. A [`U128`](/builtins/alpha4/Num#U128 "Docs for Num.U128") value can hold numbers from `0` to `340_282_366_920_938_463_463_374_607_431_768_211_455` (over 340 undecillion). It can be specified with a u128 suffix. + +Copy + +expect Str.to\_u128("1500") == Ok(1500u128) +expect Str.to\_u128("0.1") == Err(InvalidNumStr) +expect Str.to\_u128("-1") == Err(InvalidNumStr) +expect Str.to\_u128("not a number") == Err(InvalidNumStr) + +### [](Str#to_i128)`to_i128 : Str -> Result I128 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I128`](/builtins/alpha4/Num#I128 "Docs for Num.I128") integer. A [`I128`](/builtins/alpha4/Num#I128 "Docs for Num.I128") value can hold numbers from `-170_141_183_460_469_231_731_687_303_715_884_105_728` to `170_141_183_460_469_231_731_687_303_715_884_105_727`. It can be specified with a i128 suffix. 
+ +Copy + +expect Str.to\_i128("1500") == Ok(1500i128) +expect Str.to\_i128("-1") == Ok(\-1i128) +expect Str.to\_i128("0.1") == Err(InvalidNumStr) +expect Str.to\_i128("not a number") == Err(InvalidNumStr) + +### [](Str#to_u64)`to_u64 : Str -> Result U64 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U64`](/builtins/alpha4/Num#U64 "Docs for Num.U64") integer. A [`U64`](/builtins/alpha4/Num#U64 "Docs for Num.U64") value can hold numbers from `0` to `18_446_744_073_709_551_615` (over 18 quintillion). It can be specified with a u64 suffix. + +Copy + +expect Str.to\_u64("1500") == Ok(1500u64) +expect Str.to\_u64("0.1") == Err(InvalidNumStr) +expect Str.to\_u64("-1") == Err(InvalidNumStr) +expect Str.to\_u64("not a number") == Err(InvalidNumStr) + +### [](Str#to_i64)`to_i64 : Str -> Result I64 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I64`](/builtins/alpha4/Num#I64 "Docs for Num.I64") integer. A [`I64`](/builtins/alpha4/Num#I64 "Docs for Num.I64") value can hold numbers from `-9_223_372_036_854_775_808` to `9_223_372_036_854_775_807`. It can be specified with a i64 suffix. + +Copy + +expect Str.to\_i64("1500") == Ok(1500i64) +expect Str.to\_i64("-1") == Ok(\-1i64) +expect Str.to\_i64("0.1") == Err(InvalidNumStr) +expect Str.to\_i64("not a number") == Err(InvalidNumStr) + +### [](Str#to_u32)`to_u32 : Str -> Result U32 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U32`](/builtins/alpha4/Num#U32 "Docs for Num.U32") integer. A [`U32`](/builtins/alpha4/Num#U32 "Docs for Num.U32") value can hold numbers from `0` to `4_294_967_295` (over 4 billion). It can be specified with a u32 suffix. 
+ +Copy + +expect Str.to\_u32("1500") == Ok(1500u32) +expect Str.to\_u32("0.1") == Err(InvalidNumStr) +expect Str.to\_u32("-1") == Err(InvalidNumStr) +expect Str.to\_u32("not a number") == Err(InvalidNumStr) + +### [](Str#to_i32)`to_i32 : Str -> Result I32 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I32`](/builtins/alpha4/Num#I32 "Docs for Num.I32") integer. A [`I32`](/builtins/alpha4/Num#I32 "Docs for Num.I32") value can hold numbers from `-2_147_483_648` to `2_147_483_647`. It can be specified with a i32 suffix. + +Copy + +expect Str.to\_i32("1500") == Ok(1500i32) +expect Str.to\_i32("-1") == Ok(\-1i32) +expect Str.to\_i32("0.1") == Err(InvalidNumStr) +expect Str.to\_i32("not a number") == Err(InvalidNumStr) + +### [](Str#to_u16)`to_u16 : Str -> Result U16 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U16`](/builtins/alpha4/Num#U16 "Docs for Num.U16") integer. A [`U16`](/builtins/alpha4/Num#U16 "Docs for Num.U16") value can hold numbers from `0` to `65_535`. It can be specified with a u16 suffix. + +Copy + +expect Str.to\_u16("1500") == Ok(1500u16) +expect Str.to\_u16("0.1") == Err(InvalidNumStr) +expect Str.to\_u16("-1") == Err(InvalidNumStr) +expect Str.to\_u16("not a number") == Err(InvalidNumStr) + +### [](Str#to_i16)`to_i16 : Str -> Result I16 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I16`](/builtins/alpha4/Num#I16 "Docs for Num.I16") integer. A [`I16`](/builtins/alpha4/Num#I16 "Docs for Num.I16") value can hold numbers from `-32_768` to `32_767`. It can be specified with a i16 suffix. 
+ +Copy + +expect Str.to\_i16("1500") == Ok(1500i16) +expect Str.to\_i16("-1") == Ok(\-1i16) +expect Str.to\_i16("0.1") == Err(InvalidNumStr) +expect Str.to\_i16("not a number") == Err(InvalidNumStr) + +### [](Str#to_u8)`to_u8 : Str -> Result U8 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") integer. A [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") value can hold numbers from `0` to `255`. It can be specified with a u8 suffix. + +Copy + +expect Str.to\_u8("250") == Ok(250u8) +expect Str.to\_u8("-0.1") == Err(InvalidNumStr) +expect Str.to\_u8("not a number") == Err(InvalidNumStr) +expect Str.to\_u8("1500") == Err(InvalidNumStr) + +### [](Str#to_i8)`to_i8 : Str -> Result I8 [InvalidNumStr]` + +Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I8`](/builtins/alpha4/Num#I8 "Docs for Num.I8") integer. A [`I8`](/builtins/alpha4/Num#I8 "Docs for Num.I8") value can hold numbers from `-128` to `127`. It can be specified with a i8 suffix. + +Copy + +expect Str.to\_i8("-15") == Ok(\-15i8) +expect Str.to\_i8("150.00") == Err(InvalidNumStr) +expect Str.to\_i8("not a number") == Err(InvalidNumStr) + +### [](Str#count_utf8_bytes)`count_utf8_bytes : Str -> U64` + +Gives the number of bytes in a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") value. + +Copy + +expect Str.count\_utf8\_bytes("Hello World") == 11 + +### [](Str#replace_each)`replace_each : Str, Str, Str -> Str` + +Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with each occurrence of a substring replaced. If the substring is not found, returns the original string. 
+ +Copy + +expect Str.replace\_each("foo/bar/baz", "/", "\_") == "foo\_bar\_baz" +expect Str.replace\_each("not here", "/", "\_") == "not here" + +### [](Str#replace_first)`replace_first : Str, Str, Str -> Str` + +Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with the first occurrence of a substring replaced. If the substring is not found, returns the original string. + +Copy + +expect Str.replace\_first("foo/bar/baz", "/", "\_") == "foo\_bar/baz" +expect Str.replace\_first("no slashes here", "/", "\_") == "no slashes here" + +### [](Str#replace_last)`replace_last : Str, Str, Str -> Str` + +Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with the last occurrence of a substring replaced. If the substring is not found, returns the original string. + +Copy + +expect Str.replace\_last("foo/bar/baz", "/", "\_") == "foo/bar\_baz" +expect Str.replace\_last("no slashes here", "/", "\_") == "no slashes here" + +### [](Str#split_first)`split_first : Str, Str -> Result { before : Str, after : Str } [NotFound]` + +Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") before the first occurrence of a [delimiter](https://www.computerhope.com/jargon/d/delimite.htm), as well as the rest of the string after that occurrence. Returns \[Err NotFound\] if the delimiter is not found. + +Copy + +expect Str.split\_first("foo/bar/baz", "/") == Ok({ before: "foo", after: "bar/baz" }) +expect Str.split\_first("no slashes here", "/") == Err(NotFound) + +### [](Str#split_last)`split_last : Str, Str -> Result { before : Str, after : Str } [NotFound]` + +Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") before the last occurrence of a delimiter, as well as the rest of the string after that occurrence. Returns \[Err NotFound\] if the delimiter is not found. 
+ +Copy + +expect Str.split\_last("foo/bar/baz", "/") == Ok({ before: "foo/bar", after: "baz" }) +expect Str.split\_last("no slashes here", "/") == Err(NotFound) + +### [](Str#walk_utf8_with_index)`walk_utf8_with_index : Str, state, (state, U8, U64 -> state) -> state` + +Walks over the `UTF-8` bytes of the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") and calls a function to update state for each byte. The index for that byte in the string is provided to the update function. + +Copy + +f : List U8, U8, U64 -> List U8 +f = \\state, byte, \_ -> List.append(state, byte) +expect Str.walk\_utf8\_with\_index("ABC", \[\], f) == \[65, 66, 67\] + +### [](Str#walk_utf8)`walk_utf8 : Str, state, (state, U8 -> state) -> state` + +Walks over the `UTF-8` bytes of the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") and calls a function to update state for each byte. + +Copy + +sum\_of\_utf8\_bytes = + Str.walk\_utf8("Hello, World!", 0, (\\total, byte -> + total + byte + )) + +expect sum\_of\_utf8\_bytes == 1129 + +### [](Str#release_excess_capacity)`release_excess_capacity : Str -> Str` + +Shrink the memory footprint of a str such that its capacity and length are equal. Note: This will also convert seamless slices to regular lists. + +### [](Str#with_prefix)`with_prefix : Str, Str -> Str` + +Adds a prefix to the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str"). + +Copy + +expect Str.with\_prefix("Awesome", "Roc") == "RocAwesome" + +### [](Str#contains)`contains : Str, Str -> Bool` + +Determines whether or not the first Str contains the second. + +Copy + +expect Str.contains("foobarbaz", "bar") +expect !Str.contains("apple", "orange") +expect Str.contains("anything", "") + +### [](Str#drop_prefix)`drop_prefix : Str, Str -> Str` + +Drops the given prefix [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") from the start of a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str"). If the prefix is not found, returns the original string. 
+ +Copy + +expect Str.drop\_prefix("bar", "foo") == "bar" +expect Str.drop\_prefix("foobar", "foo") == "bar" + +### [](Str#drop_suffix)`drop_suffix : Str, Str -> Str` + +Drops the given suffix [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") from the end of a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str"). If the suffix is not found, returns the original string. + +Copy + +expect Str.drop\_suffix("bar", "foo") == "bar" +expect Str.drop\_suffix("barfoo", "foo") == "bar" + +### [](Str#with_ascii_lowercased)`with_ascii_lowercased : Str -> Str` + +Returns a version of the string with all [ASCII characters](https://en.wikipedia.org/wiki/ASCII) lowercased. Non-ASCII characters are left unmodified. For example: + +Copy + +expect Str.with\_ascii\_lowercased("CAFÉ") == "cafÉ" + +This function is useful for things like [command-line flags](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option) and [environment variable names](https://en.wikipedia.org/wiki/Environment_variable) where you know in advance that you're dealing with a string containing only ASCII characters. It has better performance than lowercasing operations which take Unicode into account. + +That said, strings received from user input can always contain non-ASCII Unicode characters, and lowercasing [Unicode](https://unicode.org) works differently in different languages. For example, the string `"I"` lowercases to `"i"` in English and to `"ı"` (a [dotless i](https://en.wikipedia.org/wiki/Dotless_I)) in Turkish. These rules can also change in each [Unicode release](https://www.unicode.org/releases/), so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization that can be upgraded independently from the language's builtins. + +To do a case-insensitive comparison of the ASCII characters in a string, you can use [`Str.caseless_ascii_equals`](/builtins/alpha4/Str#caseless_ascii_equals "Docs for Str.caseless_ascii_equals"). 
+ +### [](Str#with_ascii_uppercased)`with_ascii_uppercased : Str -> Str` + +Returns a version of the string with all [ASCII characters](https://en.wikipedia.org/wiki/ASCII) uppercased. Non-ASCII characters are left unmodified. For example: + +Copy + + expect Str.with\_ascii\_uppercased("café") == "CAFé" + +This function is useful for things like [command-line flags](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option) and [environment variable names](https://en.wikipedia.org/wiki/Environment_variable) where you know in advance that you're dealing with a string containing only ASCII characters. It has better performance than lowercasing operations which take Unicode into account. + +That said, strings received from user input can always contain non-ASCII Unicode characters, and uppercasing [Unicode](https://unicode.org) works differently in different languages. For example, the string `"i"` uppercases to `"I"` in English and to `"İ"` (a [dotted I](https://en.wikipedia.org/wiki/%C4%B0)) in Turkish. These rules can also change in each Unicode release, so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization that can be upgraded independently from the language's builtins. + +To do a case-insensitive comparison of the ASCII characters in a string, you can use [`Str.caseless_ascii_equals`](/builtins/alpha4/Str#caseless_ascii_equals "Docs for Str.caseless_ascii_equals"). + +### [](Str#caseless_ascii_equals)`caseless_ascii_equals : Str, Str -> Bool` + +Returns `True` if all the [ASCII characters](https://en.wikipedia.org/wiki/ASCII) in the string are the same when ignoring differences in capitalization. Non-ASCII characters must all be exactly the same, including capitalization. 
For example: + +Copy + + expect Str.caseless\_ascii\_equals("café", "CAFé") + + expect !Str.caseless\_ascii\_equals("café", "CAFÉ") + +The first call returns `True` because all the ASCII characters are the same when ignoring differences in capitalization, and the only non-ASCII character (`é`) is the same in both strings. The second call returns `False` because `é` and `É` are not ASCII characters, and they are different. + +This function is useful for things like [command-line flags](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option) and [environment variable names](https://en.wikipedia.org/wiki/Environment_variable) where you know in advance that you're dealing with a string containing only ASCII characters. It has better performance than lowercasing operations which take Unicode into account. + +That said, strings received from user input can always contain non-ASCII Unicode characters, and lowercasing [Unicode](https://unicode.org) works differently in different languages. For example, the string `"I"` lowercases to `"i"` in English and to `"ı"` (a [dotless i](https://en.wikipedia.org/wiki/Dotless_I)) in Turkish. These rules can also change in each [Unicode release](https://www.unicode.org/releases/), so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization that can be upgraded independently from the language's builtins. + +To convert a string's ASCII characters to uppercase or lowercase, you can use [`Str.with_ascii_uppercased`](/builtins/alpha4/Str#with_ascii_uppercased "Docs for Str.with_ascii_uppercased") or [`Str.with_ascii_lowercased`](/builtins/alpha4/Str#with_ascii_lowercased "Docs for Str.with_ascii_lowercased"). + +Made by people who like to make nice things. 
diff --git a/src/build/builtin_compiler/main.zig b/src/build/builtin_compiler/main.zig index e885737761..6b126776d0 100644 --- a/src/build/builtin_compiler/main.zig +++ b/src/build/builtin_compiler/main.zig @@ -143,6 +143,9 @@ fn replaceStrIsEmptyWithLowLevel(env: *ModuleEnv) !std.ArrayList(CIR.Def.Idx) { if (env.common.findIdent("Builtin.Str.count_utf8_bytes")) |str_count_utf8_bytes_ident| { try low_level_map.put(str_count_utf8_bytes_ident, .str_count_utf8_bytes); } + if (env.common.findIdent("Builtin.Str.with_capacity")) |str_with_capacity_ident| { + try low_level_map.put(str_with_capacity_ident, .str_with_capacity); + } if (env.common.findIdent("Builtin.List.len")) |list_len_ident| { try low_level_map.put(list_len_ident, .list_len); } diff --git a/src/build/roc/Builtin.roc b/src/build/roc/Builtin.roc index 634f376173..2fd93fec62 100644 --- a/src/build/roc/Builtin.roc +++ b/src/build/roc/Builtin.roc @@ -16,6 +16,7 @@ Builtin :: [].{ drop_prefix : Str, Str -> Str drop_suffix : Str, Str -> Str count_utf8_bytes : Str -> U64 + with_capacity : U64 -> Str } List(_item) :: [ProvidedByCompiler].{ diff --git a/src/canonicalize/Expression.zig b/src/canonicalize/Expression.zig index 82cd181929..6b48ee7708 100644 --- a/src/canonicalize/Expression.zig +++ b/src/canonicalize/Expression.zig @@ -416,6 +416,7 @@ pub const Expr = union(enum) { str_drop_prefix, str_drop_suffix, str_count_utf8_bytes, + str_with_capacity, // Numeric to_str operations u8_to_str, diff --git a/src/eval/interpreter.zig b/src/eval/interpreter.zig index 431a1c6c1e..3a969168a5 100644 --- a/src/eval/interpreter.zig +++ b/src/eval/interpreter.zig @@ -3108,6 +3108,26 @@ pub const Interpreter = struct { out.is_initialized = true; return out; }, + .str_with_capacity => { + // Str.with_capacity : U64 -> Str + std.debug.assert(args.len == 1); + + const capacity_arg = args[0]; + const capacity_value = try self.extractNumericValue(capacity_arg); + const capacity: u64 = @intCast(capacity_value.int); + + const 
result_str = builtins.str.withCapacityC(capacity, roc_ops); + + const result_layout = layout.Layout.str(); + var out = try self.pushRaw(result_layout, 0); + out.is_initialized = false; + + const result_ptr: *RocStr = @ptrCast(@alignCast(out.ptr.?)); + result_ptr.* = result_str; + + out.is_initialized = true; + return out; + }, .list_len => { // List.len : List(a) -> U64 // Note: listLen returns usize, but List.len always returns U64. diff --git a/test/fx/Parser.roc b/test/fx/Parser.roc new file mode 100644 index 0000000000..bf5ceec8c6 --- /dev/null +++ b/test/fx/Parser.roc @@ -0,0 +1,19 @@ +Parser := [].{ + Result(input, a) : Try({ val : a, rem : input }, [ParsingFailure(Str)]) + Fn(input, a) : input -> Result(input, a) + + parse_partial : Fn(input, a), input -> Result(input, a) + parse_partial = |fn, input| fn(input) +} + +test_fn : Parser.Fn(List(U8), Bool) +test_fn = |bytes| if (bytes.len() > 0) Ok({ val : True, rem : [] }) else Err(ParsingFailure("no input")) + +test_input : List(U8) +test_input = [1,2,3] + +test_empty : List(U8) +test_empty = [] + +expect Parser.parse_partial(test_fn, test_empty) == Err(ParsingFailure("no input")) +expect Parser.parse_partial(test_fn, test_input) == Ok({ val : True, rem : [1] }) diff --git a/test/snapshots/repl/str_with_capacity.md b/test/snapshots/repl/str_with_capacity.md new file mode 100644 index 0000000000..cb9d6019b1 --- /dev/null +++ b/test/snapshots/repl/str_with_capacity.md @@ -0,0 +1,19 @@ +# META +~~~ini +description=Str.with_capacity should create an empty string with preallocated capacity +type=repl +~~~ +# SOURCE +~~~roc +» Str.with_capacity(0) +» Str.with_capacity(10) +» Str.with_capacity(100) +~~~ +# OUTPUT +"" +--- +"" +--- +"" +# PROBLEMS +NIL From 92a848e2668b3c6bbb59cd62753f1f9fbf24b2f8 Mon Sep 17 00:00:00 2001 From: Luke Boswell Date: Wed, 26 Nov 2025 12:10:58 +1100 Subject: [PATCH 03/10] remove accidental files --- old-string-implementations.md | 890 ---------------------------------- test/fx/Parser.roc | 
19 - 2 files changed, 909 deletions(-) delete mode 100644 old-string-implementations.md delete mode 100644 test/fx/Parser.roc diff --git a/old-string-implementations.md b/old-string-implementations.md deleted file mode 100644 index 6470c027d0..0000000000 --- a/old-string-implementations.md +++ /dev/null @@ -1,890 +0,0 @@ - -## [Str](/) - -Strings represent text. For example, `"Hi!"` is a string. - -This guide starts at a high level and works down to the in-memory representation of strings and their [performance characteristics](#performance). For reasons that will be explained later in this guide, some string operations are in the `Str` module while others (notably [capitalization](#capitalization), [code points](#code-points), [graphemes](#graphemes), and sorting) are in separate packages. There's also a list of recommendations for [when to use code points, graphemes, and UTF-8](#when-to-use). - -## Syntax - -The most common way to represent strings is using quotation marks: - -Copy - -"Hello, World!" - -Using this syntax, the whole string must go on one line. You can write multiline strings using triple quotes: - -Copy - -text = - """ - In memory, this string will not have any spaces - at its start. That's because the first line - starts at the same indentation level as the - opening quotation mark. Actually, none of these - lines will be indented. - - However, this line will be indented! - """ - -In triple-quoted strings, both the opening and closing `"""` must be at the same indentation level. Lines in the string begin at that indentation level; the spaces that indent the multiline string itself are not considered content. - -### Interpolation - -_String interpolation_ is syntax for inserting a string into another string. - -Copy - -name = "Sam" - -"Hi, my name is ${name}!" 
- -This will evaluate to the string `"Hi, my name is Sam!"` - -You can put any expression you like inside the parentheses, as long as it's all on one line: - -Copy - -colors = \["red", "green", "blue"\] - -"The colors are ${colors |> Str.join\_with(", ")}!" - -Interpolation can be used in multiline strings, but the part inside the parentheses must still be on one line. - -### Escapes - -There are a few special escape sequences in strings: - -* `\n` becomes a [newline](https://en.wikipedia.org/wiki/Newline) -* `\r` becomes a [carriage return](https://en.wikipedia.org/wiki/Carriage_return#Computers) -* `\t` becomes a [tab](https://en.wikipedia.org/wiki/Tab_key#Tab_characters) -* `\"` becomes a normal `"` (this lets you write `"` inside a single-line string) -* `\\` becomes a normal `\` (this lets you write `\` without it being treated as an escape) -* `\$` becomes a normal `$` (this lets you write `$` followed by `(` without it being treated as [interpolation](#interpolation)) - -These work in both single-line and multiline strings. We'll also discuss another escape later, for inserting [Unicode code points](#code-points) into a string. - -### Single quote syntax - -Try putting `'👩'` into `roc repl`. You should see this: - -Copy - -» '👩' - -128105 : Int \* - -The single-quote `'` syntax lets you represent a Unicode code point (discussed in the next section) in source code, in a way that renders as the actual text it represents rather than as a number literal. This lets you see what it looks like in the source code rather than looking at a number. - -At runtime, the single-quoted value will be treated the same as an ordinary number literal—in other words, `'👩'` is syntax sugar for writing `128105`. 
You can verify this in `roc repl`: - -Copy - -» '👩' == 128105 - -Bool.true : Bool - -Double quotes (`"`), on the other hand, are not type-compatible with integers—not only because strings can be empty (`""` is valid, but `''` is not) but also because there may be more than one code point involved in any given string! - -There are also some special escape sequences in single-quote strings: - -* `\n` becomes a [newline](https://en.wikipedia.org/wiki/Newline) -* `\r` becomes a [carriage return](https://en.wikipedia.org/wiki/Carriage_return#Computers) -* `\t` becomes a [tab](https://en.wikipedia.org/wiki/Tab_key#Tab_characters) -* `\'` becomes a normal `'` (this lets you write `'` inside a single-quote string) -* `\\` becomes a normal `\` (this lets you write `\` without it being treated as an escape) - -Most often this single-quote syntax is used when writing parsers; most Roc programs never use it at all. - -## Unicode - -Roc strings represent text using [Unicode](https://unicode.org) This guide will provide only a basic overview of Unicode (the [Unicode glossary](http://www.unicode.org/glossary/) has over 500 entries in it), but it will include the most relevant differences between these concepts: - -* Code points -* Graphemes -* UTF-8 - -It will also explain why some operations are included in Roc's builtin [Str](https://www.roc-lang.org/builtins/Str) module, and why others are in separate packages like [roc-lang/unicode](https://github.com/roc-lang/unicode). - -### Graphemes - -Let's start with the following string: - -`"👩‍👩‍👦‍👦"` - -Some might call this a "character." After all, in a monospace font, it looks to be about the same width as the letter "A" or the punctuation mark "!"—both of which are commonly called "characters." Unfortunately, the term "character" in programming has changed meanings many times across the years and across programming languages, and today it's become a major source of confusion. 
- -Unicode uses the less ambiguous term [_grapheme_](https://www.unicode.org/glossary/#grapheme), which it defines as a "user-perceived character" (as opposed to one of the several historical ways the term "character" has been used in programming) or, alternatively, "A minimally distinctive unit of writing in the context of a particular writing system." - -By Unicode's definition, each of the following is an individual grapheme: - -* `a` -* `鹏` -* `👩‍👩‍👦‍👦` - -Note that although _grapheme_ is less ambiguous than _character_, its definition is still open to interpretation. To address this, Unicode has formally specified [text segmentation rules](https://www.unicode.org/reports/tr29/) which define grapheme boundaries in precise technical terms. We won't get into those rules here, but since they can change with new Unicode releases, functions for working with graphemes are in the [roc-lang/unicode](https://github.com/roc-lang/unicode) package rather than in the builtin [`Str`](https://www.roc-lang.org/builtins/Str) module. This allows them to be updated without being blocked on a new release of the Roc language. - -### Code Points - -Every Unicode text value can be broken down into [Unicode code points](http://www.unicode.org/glossary/#code_point), which are integers between `0` and `1_114_111` that describe components of the text. In memory, every Roc string is a sequence of these integers stored in a format called UTF-8, which will be discussed [later](#utf8). - -The string `"👩‍👩‍👦‍👦"` happens to be made up of these code points: - -Copy - -\[128105, 8205, 128105, 8205, 128102, 8205, 128102\] - -From this we can see that: - -* One grapheme can be made up of multiple code points. In fact, there is no upper limit on how many code points can go into a single grapheme! 
(Some programming languages use the term "character" to refer to individual code points; this can be confusing for graphemes like 👩‍👩‍👦‍👦 because it visually looks like "one character" but no single code point can represent it.) -* Sometimes code points repeat within an individual grapheme. Here, 128105 repeats twice, as does 128102, and there's an 8205 in between each of the other code points. - -### Combining Code Points - -The reason every other code point in 👩‍👩‍👦‍👦 is 8205 is that code point 8205 joins together other code points. This emoji, known as ["Family: Woman, Woman, Boy, Boy"](https://emojipedia.org/family-woman-woman-boy-boy), is made by combining several emoji using [zero-width joiners](https://emojipedia.org/zero-width-joiner)—which are represented by code point 8205 in memory, and which have no visual representation on their own. - -Here are those code points again, this time with comments about what they represent: - -Copy - -\[128105\] # "👩" -\[8205\] # (joiner) -\[128105\] # "👩" -\[8205\] # (joiner) -\[128102\] # "👦" -\[8205\] # (joiner) -\[128102\] # "👦" - -One way to read this is "woman emoji joined to woman emoji joined to boy emoji joined to boy emoji." Without the joins, it would be: - -Copy - -"👩👩👦👦" - -With the joins, however, it is instead: - -Copy - -"👩‍👩‍👦‍👦" - -Even though 👩‍👩‍👦‍👦 is visually smaller when rendered, it takes up almost twice as much memory as 👩👩👦👦 does! That's because it has all the same code points, plus the zero-width joiners in between them. - -### String equality and normalization - -Besides emoji like 👩‍👩‍👦‍👦, another classic example of multiple code points being combined to render as one grapheme has to do with accent marks. Try putting these two strings into `roc repl`: - -Copy - -"caf\\u(e9)" -"cafe\\u(301)" - -The `\u(e9)` syntax is a way of inserting code points into string literals. In this case, it's the same as inserting the hexadecimal number `0xe9` as a code point onto the end of the string `"caf"`. 
Since Unicode code point `0xe9` happens to be `é`, the string `"caf\u(e9)"` ends up being identical in memory to the string `"café"`. - -We can verify this too: - -Copy - -» "caf\\u(e9)" == "café" - -Bool.true : Bool - -As it turns out, `"cafe\u(301)"` is another way to represent the same word. The Unicode code point 0x301 represents a ["combining acute accent"](https://unicodeplus.com/U+0301)—which essentially means that it will add an accent mark to whatever came before it. In this case, since `"cafe\u(301)"` has an `e` before the `"\u(301)"`, that `e` ends up with an accent mark on it and becomes `é`. - -Although these two strings get rendered identically to one another, they are different in memory because their code points are different! We can also confirm this in `roc repl`: - -Copy - -» "caf\\u(e9)" == "cafe\\u(301)" - -Bool.false : Bool - -As you can imagine, this can be a source of bugs. Not only are they considered unequal, they also hash differently, meaning `"caf\u(e9)"` and `"cafe\u(301)"` can both be separate entries in the same [`Set`](https://www.roc-lang.org/builtins/Set). - -One way to prevent problems like these is to perform [Unicode normalization](https://www.unicode.org/reports/tr15/), a process which converts conceptually equivalent strings (like `"caf\u(e9)"` and `"cafe\u(301)"`) into one canonical in-memory representation. This makes equality checks on them pass, among other benefits. - -It would be technically possible for Roc to perform string normalization automatically on every equality check. Unfortunately, although some programs might want to treat `"caf\u(e9)"` and `"cafe\u(301)"` as equivalent, for other programs it might actually be important to be able to tell them apart. If these equality checks always passed, then there would be no way to tell them apart! - -As such, normalization must be performed explicitly when desired. Like graphemes, Unicode normalization rules can change with new releases of Unicode. 
As such, these functions are in separate packages instead of builtins (normalization is planned to be in [roc-lang/unicode](https://github.com/roc-lang/unicode) in the future, but it has not yet been implemented) so that updates to these functions based on new Unicode releases can happen without waiting on new releases of the Roc language. - -### Capitalization - -We've already seen two examples of Unicode definitions that can change with new Unicode releases: graphemes and normalization. Another is capitalization; these rules can change with new Unicode releases (most often in the form of additions of new languages, but breaking changes to capitalization rules for existing languages are also possible), and so they are not included in builtin [`Str`](https://www.roc-lang.org/builtins/Str). - -This might seem particularly surprising, since capitalization functions are commonly included in standard libraries. However, it turns out that "capitalizing an arbitrary string" is impossible to do correctly without additional information. - -For example, what is the capitalized version of this string? - -Copy - -"i" - -* In English, the correct answer is `"I"`. -* In Turkish, the correct answer is `"İ"`. - -Similarly, the correct lowercased version of the string `"I"` is `"i"` in English and `"ı"` in Turkish. - -Turkish is not the only language to use this [dotless i](https://en.wikipedia.org/wiki/Dotless_I), and it's an example of how a function which capitalizes strings cannot give correct answers without the additional information of which language's capitalization rules should be used. - -Many languages defer to the operating system's [localization](https://en.wikipedia.org/wiki/Internationalization_and_localization) settings for this information. 
In that design, calling a program's capitalization function with an input string of `"i"` might give an answer of `"I"` on one machine and `"İ"` on a different machine, even though it was the same program running on both systems. Naturally, this can cause bugs—but more than that, writing tests to prevent bugs like this usually requires extra complexity compared to writing ordinary tests. - -In general, Roc programs should give the same answers for the same inputs even when run on different machines. There are exceptions to this (e.g. a program running out of system resources on one machine, while being able to make more progress on a machine that has more resources), but operating system's language localization is not among them. - -For these reasons, capitalization functions are not in [`Str`](https://www.roc-lang.org/builtins/Str). There is a planned `roc-lang` package to handle use cases like capitalization and sorting—sorting can also vary by language as well as by things like country—but implementation work has not yet started on this package. - -### UTF-8 - -Earlier, we discussed how Unicode code points can be described as [`U32`](https://www.roc-lang.org/builtins/Num#U32) integers. However, many common code points are very low integers, and can fit into a `U8` instead of needing an entire `U32` to represent them in memory. UTF-8 takes advantage of this, using a variable-width encoding to represent code points in 1-4 bytes, which saves a lot of memory in the typical case—especially compared to [UTF-16](https://en.wikipedia.org/wiki/UTF-16), which always uses at least 2 bytes to represent each code point, or [UTF-32](https://en.wikipedia.org/wiki/UTF-32), which always uses the maximum 4 bytes. - -This guide won't cover all the details of UTF-8, but the basic idea is this: - -* If a code point is 127 or lower, UTF-8 stores it in 1 byte. -* If it's between 128 and 2047, UTF-8 stores it in 2 bytes. -* If it's between 2048 and 65535, UTF-8 stores it in 3 bytes. 
-* If it's higher than that, UTF-8 stores it in 4 bytes. - -The specific [UTF-8 encoding](https://en.wikipedia.org/wiki/UTF-8#Encoding) of these bytes involves using 1 to 5 bits of each byte for metadata about multi-byte sequences. - -A valuable feature of UTF-8 is that it is backwards-compatible with the [ASCII](https://en.wikipedia.org/wiki/ASCII) encoding that was widely used for many years. ASCII existed before Unicode did, and only used the integers 0 to 127 to represent its equivalent of code points. The Unicode code points 0 to 127 represent the same semantic information as ASCII, (e.g. the number 65 represents the letter "A" in both ASCII and in Unicode), and since UTF-8 represents code points 0 to 127 using one byte, all valid ASCII strings can be successfully parsed as UTF-8 without any need for conversion. - -Since many textual computer encodings—including [CSV](https://en.wikipedia.org/wiki/CSV), [XML](https://en.wikipedia.org/wiki/XML), and [JSON](https://en.wikipedia.org/wiki/JSON)—do not use any code points above 127 for their delimiters, it is often possible to write parsers for these formats using only `Str` functions which present UTF-8 as raw `U8` sequences, such as [`Str.walk_utf8`](https://www.roc-lang.org/builtins/Str#walk_utf8) and [`Str.to_utf8`](https://www.roc-lang.org/builtins/Str#to_utf8). In the typical case where they do not need to parse out individual Unicode code points, they can get everything they need from `Str` UTF-8 functions without needing to depend on other packages. - -### When to use code points, graphemes, and UTF-8 - -Deciding when to use code points, graphemes, and UTF-8 can be nonobvious to say the least! - -The way Roc organizes the `Str` module and supporting packages is designed to help answer this question. 
Every situation is different, but the following rules of thumb are typical: - -* Most often, using `Str` values along with helper functions like [`split_on`](https://www.roc-lang.org/builtins/Str#split_on), [`join_with`](https://www.roc-lang.org/builtins/Str#join_with), and so on, is the best option. -* If you are specifically implementing a parser, working in UTF-8 bytes is usually the best option. So functions like [`walk_utf8`](https://www.roc-lang.org/builtins/Str#walk_utf8), [to\_utf8](https://www.roc-lang.org/builtins/Str#to_utf8), and so on. (Note that single-quote literals produce number literals, so ASCII-range literals like `'a'` give an integer literal that works with a UTF-8 `U8`.) -* If you are implementing a Unicode library like [roc-lang/unicode](https://github.com/roc-lang/unicode), working in terms of code points will be unavoidable. Aside from basic readability considerations like `\u(...)` in string literals, if you have the option to avoid working in terms of code points, it is almost always correct to avoid them. -* If it seems like a good idea to split a string into "characters" (graphemes), you should definitely stop and reconsider whether this is really the best design. Almost always, doing this is some combination of more error-prone or slower (usually both) than doing something else that does not require taking graphemes into consideration. - -For this reason (among others), grapheme functions live in [roc-lang/unicode](https://github.com/roc-lang/unicode) rather than in [`Str`](https://www.roc-lang.org/builtins/Str). They are more niche than they seem, so they should not be reached for all the time! - -## Performance - -This section deals with how Roc strings are represented in memory, and their performance characteristics. - -A normal heap-allocated Roc `Str` is represented on the stack as: - -* A "capacity" unsigned integer, which represents how many bytes are allocated on the heap to hold the string's contents. 
-* A "length" unsigned integer, which represents how many of the "capacity" bytes are actually in use. (A `Str` can have more bytes allocated on the heap than are actually in use.) -* The memory address of the first byte in the string's actual contents. - -Each of these three fields is the same size: 64 bits on a 64-bit system, and 32 bits on a 32-bit system. The actual contents of the string are stored in one contiguous sequence of bytes, encoded as UTF-8, often on the heap but sometimes elsewhere—more on this later. Empty strings do not have heap allocations, so an empty `Str` on a 64-bit system still takes up 24 bytes on the stack (due to its three 64-bit fields). - -### Reference counting and opportunistic mutation - -Like lists, dictionaries, and sets, Roc strings are automatically reference-counted and can benefit from opportunistic in-place mutation. The reference count is stored on the heap immediately before the first byte of the string's contents, and it has the same size as a memory address. This means it can count so high that it's impossible to write a Roc program which overflows a reference count, because having that many simultaneous references (each of which is a memory address) would have exhausted the operating system's address space first. - -When the string's reference count is 1, functions like [`Str.concat`](https://www.roc-lang.org/builtins/Str#concat) and [`Str.replace_each`](https://www.roc-lang.org/builtins/Str#replace_each) mutate the string in-place rather than allocating a new string. This preserves semantic immutability because it is unobservable in terms of the operation's output; if the reference count is 1, it means that memory would have otherwise been deallocated immediately anyway, and it's more efficient to reuse it instead of deallocating it and then immediately making a new allocation. 
- -The contents of statically-known strings (today that means string literals) are stored in the readonly section of the binary, so they do not need heap allocations or reference counts. They are not eligible for in-place mutation, since mutating the readonly section of the binary would cause an operating system [access violation](https://en.wikipedia.org/wiki/Segmentation_fault). - -### Small String Optimization - -Roc uses a "small string optimization" when representing certain strings in memory. - -If you have a sufficiently long string, then on a 64-bit system it will be represented on the stack using 24 bytes, and on a 32-bit system it will take 12 bytes—plus however many bytes are in the string itself—on the heap. However, if there is a string shorter than either of these stack sizes (so, a string of up to 23 bytes on a 64-bit system, and up to 11 bytes on a 32-bit system), then that string will be stored entirely on the stack rather than having a separate heap allocation at all. - -This can be much more memory-efficient! However, `List` does not have this optimization (it has some runtime cost, and in the case of `List` it's not anticipated to come up nearly as often), which means when converting a small string to `List U8` it can result in a heap allocation. - -Note that this optimization is based entirely on how many UTF-8 bytes the string takes up in memory. It doesn't matter how many [graphemes](#graphemes), [code points](#code-points) or anything else it has; the only factor that determines whether a particular string is eligible for the small string optimization is the number of UTF-8 bytes it takes up in memory! - -### Seamless Slices - -Try putting this into `roc repl`: - -Copy - -» "foo/bar/baz" |> Str.split\_on("/") - -\["foo", "bar", "baz"\] : List Str - -All of these strings are small enough that the [small string optimization](#small) will apply, so none of them will be allocated on the heap. 
- -Now let's suppose they were long enough that this optimization no longer applied: - -Copy - -» "a much, much, much, much/longer/string compared to the last one!" |> Str.split\_on "/" - -\["a much, much, much, much", "longer", "string compared to the last one!"\] : List Str - -Here, the only strings small enough for the small string optimization are `"/"` and `"longer"`. They will be allocated on the stack. - -The first and last strings in the returned list `"a much, much, much, much"` and `"string compared to the last one!"` will not be allocated on the heap either. Instead, they will be _seamless slices_, which means they will share memory with the original input string. - -* `"a much, much, much, much"` will share the first 24 bytes of the original string. -* `"string compared to the last one!"` will share the last 32 bytes of the original string. - -All of these strings are semantically immutable, so sharing these bytes is an implementation detail that should only affect performance. By design, there is no way at either compile time or runtime to tell whether a string is a seamless slice. This allows the optimization's behavior to change in the future without affecting Roc programs' semantic behavior. - -Seamless slices create additional references to the original string, which make it ineligible for opportunistic mutation (along with the slices themselves; slices are never eligible for mutation), and which also make it take longer before the original string can be deallocated. A case where this might be noticeable in terms of performance would be: - -1. A function takes a very large string as an argument and returns a much smaller slice into that string. -2. The smaller slice is used for a long time in the program, whereas the much larger original string stops being used. -3. 
In this situation, it might have been better for total program memory usage (although not necessarily overall performance) if the original large string could have been deallocated sooner, even at the expense of having to copy the smaller string into a new allocation instead of reusing the bytes with a seamless slice. - -If a situation like this comes up, a slice can be turned into a separate string by using [`Str.concat`](https://www.roc-lang.org/builtins/Str#concat) to concatenate the slice onto an empty string (or one created with [`Str.with_capacity`](https://www.roc-lang.org/builtins/Str#with_capacity)). - -Currently, the only way to get seamless slices of strings is by calling certain `Str` functions which return them. In general, `Str` functions which accept a string and return a subset of that string tend to do this. [`Str.trim`](https://www.roc-lang.org/builtins/Str#trim) is another example of a function which returns a seamless slice. - -### [](Str#Utf8Problem)`Utf8Problem : [ InvalidStartByte, UnexpectedEndOfSequence, ExpectedContinuation, OverlongEncoding, CodepointTooLarge, EncodesSurrogateHalf ]` - -### [](Str#is_empty)`is_empty : Str -> Bool` - -Returns [`Bool.true`](/builtins/alpha4/Bool#true "Docs for Bool.true") if the string is empty, and [`Bool.false`](/builtins/alpha4/Bool#false "Docs for Bool.false") otherwise. - -Copy - -expect Str.is\_empty("hi!") == Bool.false -expect Str.is\_empty("") == Bool.true - -### [](Str#concat)`concat : Str, Str -> Str` - -Concatenates two strings together. - -Copy - -expect Str.concat("ab", "cd") == "abcd" -expect Str.concat("hello", "") == "hello" -expect Str.concat("", "") == "" - -### [](Str#with_capacity)`with_capacity : U64 -> Str` - -Returns a string of the specified capacity without any content. - -This is a performance optimization tool that's like calling [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") on an empty string. 
It's useful when you plan to build up a string incrementally, for example by calling [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") on it: - -Copy - -greeting = "Hello and welcome to Roc" -subject = "Awesome Programmer" - -# Evaluates to "Hello and welcome to Roc, Awesome Programmer!" -hello\_world = - Str.with\_capacity(45) - |> Str.concat(greeting) - |> Str.concat(", ") - |> Str.concat(subject) - |> Str.concat("!") - -In general, if you plan to use [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") on an empty string, it will be faster to start with [`Str.with_capacity`](/builtins/alpha4/Str#with_capacity "Docs for Str.with_capacity") than with `""`. Even if you don't know the exact capacity of the string, giving [`with_capacity`](/builtins/alpha4/Str#with_capacity "Docs for Str.with_capacity") a higher value than ends up being necessary can help prevent reallocation and copying—at the cost of using more memory than is necessary. - -For more details on how the performance optimization works, see [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve"). - -### [](Str#reserve)`reserve : Str, U64 -> Str` - -Increase a string's capacity by at least the given number of additional bytes. - -This can improve the performance of string concatenation operations like [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") by allocating extra capacity up front, which can prevent the need for reallocations and copies. Consider the following example which does not use [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve"): - -Copy - -greeting = "Hello and welcome to Roc" -subject = "Awesome Programmer" - -# Evaluates to "Hello and welcome to Roc, Awesome Programmer!" -hello\_world = - greeting - |> Str.concat(", ") - |> Str.concat(subject) - |> Str.concat("!") - -In this example: - -1. We start with `greeting`, which has both a length and capacity of 24 (bytes). -2. 
`|> Str.concat ", "` will see that there isn't enough capacity to add 2 more bytes for the `", "`, so it will create a new heap allocation with enough bytes to hold both. (This probably will be more than 7 bytes, because when [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") functions reallocate, they apply a multiplier to the exact capacity required. This makes it less likely that future reallocations will be needed. The multiplier amount is not specified, because it may change in future releases of Roc, but it will likely be around 1.5 to 2 times the exact capacity required.) Then it will copy the current bytes (`"Hello"`) into the new allocation, and finally concatenate the `", "` into the new allocation. The old allocation will then be deallocated because it's no longer referenced anywhere in the program. -3. `|> Str.concat subject` will again check if there is enough capacity in the string. If it doesn't find enough capacity once again, it will make a third allocation, copy the existing bytes (`"Hello, "`) into that third allocation, and then deallocate the second allocation because it's already no longer being referenced anywhere else in the program. (It may find enough capacity in this particular case, because the previous [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") allocated something like 1.5 to 2 times the necessary capacity in order to anticipate future concatenations like this...but if something longer than `"World"` were being concatenated here, it might still require further reallocation and copying.) -4. `|> Str.concat "!\n"` will repeat this process once more. - -This process can have significant performance costs due to multiple reallocation of new strings, copying between old strings and new strings, and deallocation of immediately obsolete strings. 
- -Here's a modified example which uses [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") to eliminate the need for all that reallocation, copying, and deallocation. - -Copy - -hello\_world = - greeting - |> Str.reserve(21) - |> Str.concat(", ") - |> Str.concat(subject) - |> Str.concat("!") - -In this example: - -1. We again start with `greeting`, which has both a length and capacity of 24 bytes. -2. `|> Str.reserve(21)` will ensure that there is enough capacity in the string for an additional 21 bytes (to make room for `", "`, `"Awesome Programmer"`, and `"!"`). Since the current capacity is only 24, it will create a new 45-byte (24 + 21) heap allocation and copy the contents of the existing allocation (`greeting`) into it. -3. `|> Str.concat(", ")` will concatenate `,` to the string. No reallocation, copying, or deallocation will be necessary, because the string already has a capacity of 45 btytes, and `greeting` will only use 24 of them. -4. `|> Str.concat(subject)` will concatenate `subject` (`"Awesome Programmer"`) to the string. Again, no reallocation, copying, or deallocation will be necessary. -5. `|> Str.concat "!\n"` will concatenate `"!\n"` to the string, still without any reallocation, copying, or deallocation. - -Here, [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") prevented multiple reallocations, copies, and deallocations during the [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") calls. Notice that it did perform a heap allocation before any [`Str.concat`](/builtins/alpha4/Str#concat "Docs for Str.concat") calls were made, which means that using [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") is not free! You should only use it if you actually expect to make use of the extra capacity. - -Ideally, you'd be able to predict exactly how many extra bytes of capacity will be needed, but this may not always be knowable. 
When you don't know exactly how many bytes to reserve, you can often get better performance by choosing a number of bytes that's too high, because a number that's too low could lead to reallocations. There's a limit to this, of course; if you always give it ten times what it turns out to need, that could prevent reallocations but will also waste a lot of memory! - -If you plan to use [`Str.reserve`](/builtins/alpha4/Str#reserve "Docs for Str.reserve") on an empty string, it's generally better to use [`Str.with_capacity`](/builtins/alpha4/Str#with_capacity "Docs for Str.with_capacity") instead. - -### [](Str#join_with)`join_with : List Str, Str -> Str` - -Combines a [`List`](/builtins/alpha4/List#List "Docs for List.List") of strings into a single string, with a separator string in between each. - -Copy - -expect Str.join\_with(\["one", "two", "three"\], ", ") == "one, two, three" -expect Str.join\_with(\["1", "2", "3", "4"\], ".") == "1.2.3.4" - -### [](Str#split_on)`split_on : Str, Str -> List Str` - -Split a string around a separator. - -Passing `""` for the separator is not useful; it returns the original string wrapped in a [`List`](/builtins/alpha4/List#List "Docs for List.List"). - -Copy - -expect Str.split\_on("1,2,3", ",") == \["1","2","3"\] -expect Str.split\_on("1,2,3", "") == \["1,2,3"\] - -### [](Str#repeat)`repeat : Str, U64 -> Str` - -Repeats a string the given number of times. - -Copy - -expect Str.repeat("z", 3) == "zzz" -expect Str.repeat("na", 8) == "nananananananana" - -Returns `""` when given `""` for the string or `0` for the count. - -Copy - -expect Str.repeat("", 10) == "" -expect Str.repeat("anything", 0) == "" - -### [](Str#len)`len : Str -> [LearnAboutStringsInRoc Str]` - -A stub function to help people discover [how they should handle this in Roc](https://www.roc-lang.org/faq.html#strings-in-roc). 
- -### [](Str#to_utf8)`to_utf8 : Str -> List U8` - -Returns a [`List`](/builtins/alpha4/List#List "Docs for List.List") of the string's [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") UTF-8 [code units](https://unicode.org/glossary/#code_unit). (To split the string into a [`List`](/builtins/alpha4/List#List "Docs for List.List") of smaller [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") values instead of [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") values, see [`Str.split_on`](/builtins/alpha4/Str#split_on "Docs for Str.split_on").) - -Copy - -expect Str.to\_utf8("Roc") == \[82, 111, 99\] -expect Str.to\_utf8("鹏") == \[233, 185, 143\] -expect Str.to\_utf8("சி") == \[224, 174, 154, 224, 174, 191\] -expect Str.to\_utf8("🐦") == \[240, 159, 144, 166\] - -### [](Str#from_utf8)`from_utf8 : List U8 -> Result Str [ BadUtf8 { problem : Utf8Problem, index : U64 } ]` - -Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") UTF-8 [code units](https://unicode.org/glossary/#code_unit) to a string. - -Returns `Err` if the given bytes are invalid UTF-8, and returns `Ok ""` when given `[]`. - -Copy - -expect Str.from\_utf8(\[82, 111, 99\]) == Ok("Roc") -expect Str.from\_utf8(\[233, 185, 143\]) == Ok("鹏") -expect Str.from\_utf8(\[224, 174, 154, 224, 174, 191\]) == Ok("சி") -expect Str.from\_utf8(\[240, 159, 144, 166\]) == Ok("🐦") -expect Str.from\_utf8(\[\]) == Ok("") -expect Str.from\_utf8(\[255\]) |> Result.is\_err - -### [](Str#from_utf8_lossy)`from_utf8_lossy : List U8 -> Str` - -Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") UTF-8 [code units](https://unicode.org/glossary/#code_unit) to a string. Any grouping of invalid byte sequences are replaced with a single unicode replacement character '�'. 
- -An invalid byte sequence is defined as - -* a 2-byte-sequence starting byte, followed by less than 1 continuation byte -* a 3-byte-sequence starting byte, followed by less than 2 continuation bytes -* a 4-byte-sequence starting byte, followed by less than 3 continuation bytes -* an invalid codepoint from the surrogate pair block -* an invalid codepoint greater than 0x110000 encoded as a 4-byte sequence -* any valid codepoint encoded as an incorrect sequence, for instance a codepoint that should be a 2-byte sequence encoded as a 3- or 4-byte sequence - -Copy - -expect (Str.from\_utf8\_lossy \[82, 111, 99, 240, 159, 144, 166\]) == "Roc🐦" -expect (Str.from\_utf8\_lossy \[82, 255, 99\]) == "R�c" -expect (Str.from\_utf8\_lossy \[82, 0xED, 0xA0, 0xBD, 99\]) == "R�c" - -### [](Str#from_utf16)`from_utf16 : List U16 -> Result Str [ BadUtf16 { problem : Utf8Problem, index : U64 } ]` - -Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U16`](/builtins/alpha4/Num#U16 "Docs for Num.U16") UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string. - -Copy - -expect Str.from\_utf16(\[82, 111, 99\]) == Ok("Roc") -expect Str.from\_utf16(\[0xb9a, 0xbbf\]) == Ok("சி") -expect Str.from\_utf16(\[0xd83d, 0xdc26\]) == Ok("🐦") -expect Str.from\_utf16(\[\]) == Ok("") -# unpaired surrogates, first and second halves -expect Str.from\_utf16(\[82, 0xd83d, 99\]) |> Result.is\_err -expect Str.from\_utf16(\[82, 0xdc96, 99\]) |> Result.is\_err - -### [](Str#from_utf16_lossy)`from_utf16_lossy : List U16 -> Str` - -Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U16`](/builtins/alpha4/Num#U16 "Docs for Num.U16") UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string. Any unpaired surrogate code unit is replaced with a single unicode replacement character '�'. 
- -Copy - -expect Str.from\_utf16\_lossy(\[82, 111, 99, 0xd83d, 0xdc26\]) == "Roc🐦" -expect Str.from\_utf16\_lossy(\[82, 0xdc96, 99\]) == "R�c" - -### [](Str#from_utf32)`from_utf32 : List U32 -> Result Str [ BadUtf32 { problem : Utf8Problem, index : U64 } ]` - -### [](Str#from_utf32_lossy)`from_utf32_lossy : List U32 -> Str` - -Converts a [`List`](/builtins/alpha4/List#List "Docs for List.List") of [`U32`](/builtins/alpha4/Num#U32 "Docs for Num.U32") UTF-32 [code units](https://unicode.org/glossary/#code_unit) to a string. Any invalid code points are replaced with a single unicode replacement character '�'. - -Copy - -expect Str.from\_utf32\_lossy(\[82, 111, 99, 0x1f426\]) == "Roc🐦" -expect Str.from\_utf32\_lossy(\[82, 0x110000, 99\]) == "R�c" - -### [](Str#starts_with)`starts_with : Str, Str -> Bool` - -Check if the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") starts with a value. - -Copy - -expect Str.starts\_with("ABC", "A") == Bool.true -expect Str.starts\_with("ABC", "X") == Bool.false - -### [](Str#ends_with)`ends_with : Str, Str -> Bool` - -Check if the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") ends with a value. - -Copy - -expect Str.ends\_with("ABC", "C") == Bool.true -expect Str.ends\_with("ABC", "X") == Bool.false - -### [](Str#trim)`trim : Str -> Str` - -Return the [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with all whitespace removed from both the beginning as well as the end. - -Copy - -expect Str.trim(" Hello \\n\\n") == "Hello" - -### [](Str#trim_start)`trim_start : Str -> Str` - -Return the [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with all whitespace removed from the beginning. - -Copy - -expect Str.trim\_start(" Hello \\n\\n") == "Hello \\n\\n" - -### [](Str#trim_end)`trim_end : Str -> Str` - -Return the [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with all whitespace removed from the end. 
- -Copy - -expect Str.trim\_end(" Hello \\n\\n") == " Hello" - -### [](Str#to_dec)`to_dec : Str -> Result Dec [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a [`Dec`](/builtins/alpha4/Num#Dec "Docs for Num.Dec"). A [`Dec`](/builtins/alpha4/Num#Dec "Docs for Num.Dec") value is a 128-bit decimal [fixed-point number](https://en.wikipedia.org/wiki/Fixed-point_arithmetic). - -Copy - -expect Str.to\_dec("10") == Ok(10dec) -expect Str.to\_dec("-0.25") == Ok(\-0.25dec) -expect Str.to\_dec("not a number") == Err(InvalidNumStr) - -### [](Str#to_f64)`to_f64 : Str -> Result F64 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a [`F64`](/builtins/alpha4/Num#F64 "Docs for Num.F64"). A [`F64`](/builtins/alpha4/Num#F64 "Docs for Num.F64") value is a 64-bit [floating-point number](https://en.wikipedia.org/wiki/IEEE_754) and can be specified with a `f64` suffix. - -Copy - -expect Str.to\_f64("0.10") == Ok(0.10f64) -expect Str.to\_f64("not a number") == Err(InvalidNumStr) - -### [](Str#to_f32)`to_f32 : Str -> Result F32 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a [`F32`](/builtins/alpha4/Num#F32 "Docs for Num.F32"). A [`F32`](/builtins/alpha4/Num#F32 "Docs for Num.F32") value is a 32-bit [floating-point number](https://en.wikipedia.org/wiki/IEEE_754) and can be specified with a `f32` suffix. - -Copy - -expect Str.to\_f32("0.10") == Ok(0.10f32) -expect Str.to\_f32("not a number") == Err(InvalidNumStr) - -### [](Str#to_u128)`to_u128 : Str -> Result U128 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U128`](/builtins/alpha4/Num#U128 "Docs for Num.U128") integer. A [`U128`](/builtins/alpha4/Num#U128 "Docs for Num.U128") value can hold numbers from `0` to `340_282_366_920_938_463_463_374_607_431_768_211_455` (over 340 undecillion). It can be specified with a u128 suffix. 
- -Copy - -expect Str.to\_u128("1500") == Ok(1500u128) -expect Str.to\_u128("0.1") == Err(InvalidNumStr) -expect Str.to\_u128("-1") == Err(InvalidNumStr) -expect Str.to\_u128("not a number") == Err(InvalidNumStr) - -### [](Str#to_i128)`to_i128 : Str -> Result I128 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I128`](/builtins/alpha4/Num#I128 "Docs for Num.I128") integer. A [`I128`](/builtins/alpha4/Num#I128 "Docs for Num.I128") value can hold numbers from `-170_141_183_460_469_231_731_687_303_715_884_105_728` to `170_141_183_460_469_231_731_687_303_715_884_105_727`. It can be specified with a i128 suffix. - -Copy - -expect Str.to\_i128("1500") == Ok(1500i128) -expect Str.to\_i128("-1") == Ok(\-1i128) -expect Str.to\_i128("0.1") == Err(InvalidNumStr) -expect Str.to\_i128("not a number") == Err(InvalidNumStr) - -### [](Str#to_u64)`to_u64 : Str -> Result U64 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U64`](/builtins/alpha4/Num#U64 "Docs for Num.U64") integer. A [`U64`](/builtins/alpha4/Num#U64 "Docs for Num.U64") value can hold numbers from `0` to `18_446_744_073_709_551_615` (over 18 quintillion). It can be specified with a u64 suffix. - -Copy - -expect Str.to\_u64("1500") == Ok(1500u64) -expect Str.to\_u64("0.1") == Err(InvalidNumStr) -expect Str.to\_u64("-1") == Err(InvalidNumStr) -expect Str.to\_u64("not a number") == Err(InvalidNumStr) - -### [](Str#to_i64)`to_i64 : Str -> Result I64 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I64`](/builtins/alpha4/Num#I64 "Docs for Num.I64") integer. A [`I64`](/builtins/alpha4/Num#I64 "Docs for Num.I64") value can hold numbers from `-9_223_372_036_854_775_808` to `9_223_372_036_854_775_807`. It can be specified with a i64 suffix. 
- -Copy - -expect Str.to\_i64("1500") == Ok(1500i64) -expect Str.to\_i64("-1") == Ok(\-1i64) -expect Str.to\_i64("0.1") == Err(InvalidNumStr) -expect Str.to\_i64("not a number") == Err(InvalidNumStr) - -### [](Str#to_u32)`to_u32 : Str -> Result U32 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U32`](/builtins/alpha4/Num#U32 "Docs for Num.U32") integer. A [`U32`](/builtins/alpha4/Num#U32 "Docs for Num.U32") value can hold numbers from `0` to `4_294_967_295` (over 4 billion). It can be specified with a u32 suffix. - -Copy - -expect Str.to\_u32("1500") == Ok(1500u32) -expect Str.to\_u32("0.1") == Err(InvalidNumStr) -expect Str.to\_u32("-1") == Err(InvalidNumStr) -expect Str.to\_u32("not a number") == Err(InvalidNumStr) - -### [](Str#to_i32)`to_i32 : Str -> Result I32 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I32`](/builtins/alpha4/Num#I32 "Docs for Num.I32") integer. A [`I32`](/builtins/alpha4/Num#I32 "Docs for Num.I32") value can hold numbers from `-2_147_483_648` to `2_147_483_647`. It can be specified with a i32 suffix. - -Copy - -expect Str.to\_i32("1500") == Ok(1500i32) -expect Str.to\_i32("-1") == Ok(\-1i32) -expect Str.to\_i32("0.1") == Err(InvalidNumStr) -expect Str.to\_i32("not a number") == Err(InvalidNumStr) - -### [](Str#to_u16)`to_u16 : Str -> Result U16 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U16`](/builtins/alpha4/Num#U16 "Docs for Num.U16") integer. A [`U16`](/builtins/alpha4/Num#U16 "Docs for Num.U16") value can hold numbers from `0` to `65_535`. It can be specified with a u16 suffix. 
- -Copy - -expect Str.to\_u16("1500") == Ok(1500u16) -expect Str.to\_u16("0.1") == Err(InvalidNumStr) -expect Str.to\_u16("-1") == Err(InvalidNumStr) -expect Str.to\_u16("not a number") == Err(InvalidNumStr) - -### [](Str#to_i16)`to_i16 : Str -> Result I16 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I16`](/builtins/alpha4/Num#I16 "Docs for Num.I16") integer. A [`I16`](/builtins/alpha4/Num#I16 "Docs for Num.I16") value can hold numbers from `-32_768` to `32_767`. It can be specified with a i16 suffix. - -Copy - -expect Str.to\_i16("1500") == Ok(1500i16) -expect Str.to\_i16("-1") == Ok(\-1i16) -expect Str.to\_i16("0.1") == Err(InvalidNumStr) -expect Str.to\_i16("not a number") == Err(InvalidNumStr) - -### [](Str#to_u8)`to_u8 : Str -> Result U8 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to an unsigned [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") integer. A [`U8`](/builtins/alpha4/Num#U8 "Docs for Num.U8") value can hold numbers from `0` to `255`. It can be specified with a u8 suffix. - -Copy - -expect Str.to\_u8("250") == Ok(250u8) -expect Str.to\_u8("-0.1") == Err(InvalidNumStr) -expect Str.to\_u8("not a number") == Err(InvalidNumStr) -expect Str.to\_u8("1500") == Err(InvalidNumStr) - -### [](Str#to_i8)`to_i8 : Str -> Result I8 [InvalidNumStr]` - -Encode a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") to a signed [`I8`](/builtins/alpha4/Num#I8 "Docs for Num.I8") integer. A [`I8`](/builtins/alpha4/Num#I8 "Docs for Num.I8") value can hold numbers from `-128` to `127`. It can be specified with a i8 suffix. - -Copy - -expect Str.to\_i8("-15") == Ok(\-15i8) -expect Str.to\_i8("150.00") == Err(InvalidNumStr) -expect Str.to\_i8("not a number") == Err(InvalidNumStr) - -### [](Str#count_utf8_bytes)`count_utf8_bytes : Str -> U64` - -Gives the number of bytes in a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") value. 
- -Copy - -expect Str.count\_utf8\_bytes("Hello World") == 11 - -### [](Str#replace_each)`replace_each : Str, Str, Str -> Str` - -Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with each occurrence of a substring replaced. If the substring is not found, returns the original string. - -Copy - -expect Str.replace\_each("foo/bar/baz", "/", "\_") == "foo\_bar\_baz" -expect Str.replace\_each("not here", "/", "\_") == "not here" - -### [](Str#replace_first)`replace_first : Str, Str, Str -> Str` - -Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with the first occurrence of a substring replaced. If the substring is not found, returns the original string. - -Copy - -expect Str.replace\_first("foo/bar/baz", "/", "\_") == "foo\_bar/baz" -expect Str.replace\_first("no slashes here", "/", "\_") == "no slashes here" - -### [](Str#replace_last)`replace_last : Str, Str, Str -> Str` - -Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") with the last occurrence of a substring replaced. If the substring is not found, returns the original string. - -Copy - -expect Str.replace\_last("foo/bar/baz", "/", "\_") == "foo/bar\_baz" -expect Str.replace\_last("no slashes here", "/", "\_") == "no slashes here" - -### [](Str#split_first)`split_first : Str, Str -> Result { before : Str, after : Str } [NotFound]` - -Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") before the first occurrence of a [delimiter](https://www.computerhope.com/jargon/d/delimite.htm), as well as the rest of the string after that occurrence. Returns \[Err NotFound\] if the delimiter is not found. 
- -Copy - -expect Str.split\_first("foo/bar/baz", "/") == Ok({ before: "foo", after: "bar/baz" }) -expect Str.split\_first("no slashes here", "/") == Err(NotFound) - -### [](Str#split_last)`split_last : Str, Str -> Result { before : Str, after : Str } [NotFound]` - -Returns the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") before the last occurrence of a delimiter, as well as the rest of the string after that occurrence. Returns \[Err NotFound\] if the delimiter is not found. - -Copy - -expect Str.split\_last("foo/bar/baz", "/") == Ok({ before: "foo/bar", after: "baz" }) -expect Str.split\_last("no slashes here", "/") == Err(NotFound) - -### [](Str#walk_utf8_with_index)`walk_utf8_with_index : Str, state, (state, U8, U64 -> state) -> state` - -Walks over the `UTF-8` bytes of the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") and calls a function to update state for each byte. The index for that byte in the string is provided to the update function. - -Copy - -f : List U8, U8, U64 -> List U8 -f = \\state, byte, \_ -> List.append(state, byte) -expect Str.walk\_utf8\_with\_index("ABC", \[\], f) == \[65, 66, 67\] - -### [](Str#walk_utf8)`walk_utf8 : Str, state, (state, U8 -> state) -> state` - -Walks over the `UTF-8` bytes of the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") and calls a function to update state for each byte. - -Copy - -sum\_of\_utf8\_bytes = - Str.walk\_utf8("Hello, World!", 0, (\\total, byte -> - total + byte - )) - -expect sum\_of\_utf8\_bytes == 105 - -### [](Str#release_excess_capacity)`release_excess_capacity : Str -> Str` - -Shrink the memory footprint of a str such that its capacity and length are equal. Note: This will also convert seamless slices to regular lists. - -### [](Str#with_prefix)`with_prefix : Str, Str -> Str` - -Adds a prefix to the given [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str"). 
- -Copy - -expect Str.with\_prefix("Awesome", "Roc") == "RocAwesome" - -### [](Str#contains)`contains : Str, Str -> Bool` - -Determines whether or not the first Str contains the second. - -Copy - -expect Str.contains("foobarbaz", "bar") -expect !Str.contains("apple", "orange") -expect Str.contains("anything", "") - -### [](Str#drop_prefix)`drop_prefix : Str, Str -> Str` - -Drops the given prefix [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") from the start of a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str"). If the prefix is not found, returns the original string. - -Copy - -expect Str.drop\_prefix("bar", "foo") == "bar" -expect Str.drop\_prefix("foobar", "foo") == "bar" - -### [](Str#drop_suffix)`drop_suffix : Str, Str -> Str` - -Drops the given suffix [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str") from the end of a [`Str`](/builtins/alpha4/Str#Str "Docs for Str.Str"). If the suffix is not found, returns the original string. - -Copy - -expect Str.drop\_suffix("bar", "foo") == "bar" -expect Str.drop\_suffix("barfoo", "foo") == "bar" - -### [](Str#with_ascii_lowercased)`with_ascii_lowercased : Str -> Str` - -Returns a version of the string with all [ASCII characters](https://en.wikipedia.org/wiki/ASCII) lowercased. Non-ASCII characters are left unmodified. For example: - -Copy - -expect Str.with\_ascii\_lowercased("CAFÉ") == "cafÉ" - -This function is useful for things like [command-line flags](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option) and [environment variable names](https://en.wikipedia.org/wiki/Environment_variable) where you know in advance that you're dealing with a string containing only ASCII characters. It has better performance than lowercasing operations which take Unicode into account. - -That said, strings received from user input can always contain non-ASCII Unicode characters, and lowercasing [Unicode](https://unicode.org) works differently in different languages. 
For example, the string `"I"` lowercases to `"i"` in English and to `"ı"` (a [dotless i](https://en.wikipedia.org/wiki/Dotless_I)) in Turkish. These rules can also change in each [Unicode release](https://www.unicode.org/releases/), so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization that can be upgraded independently from the language's builtins. - -To do a case-insensitive comparison of the ASCII characters in a string, you can use [`Str.caseless_ascii_equals`](/builtins/alpha4/Str#caseless_ascii_equals "Docs for Str.caseless_ascii_equals"). - -### [](Str#with_ascii_uppercased)`with_ascii_uppercased : Str -> Str` - -Returns a version of the string with all [ASCII characters](https://en.wikipedia.org/wiki/ASCII) uppercased. Non-ASCII characters are left unmodified. For example: - -Copy - - expect Str.with\_ascii\_uppercased("café") == "CAFé" - -This function is useful for things like [command-line flags](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option) and [environment variable names](https://en.wikipedia.org/wiki/Environment_variable) where you know in advance that you're dealing with a string containing only ASCII characters. It has better performance than uppercasing operations which take Unicode into account. - -That said, strings received from user input can always contain non-ASCII Unicode characters, and uppercasing [Unicode](https://unicode.org) works differently in different languages. For example, the string `"i"` uppercases to `"I"` in English and to `"İ"` (a [dotted I](https://en.wikipedia.org/wiki/%C4%B0)) in Turkish. These rules can also change in each Unicode release, so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization that can be upgraded independently from the language's builtins. 
- -To do a case-insensitive comparison of the ASCII characters in a string, you can use [`Str.caseless_ascii_equals`](/builtins/alpha4/Str#caseless_ascii_equals "Docs for Str.caseless_ascii_equals"). - -### [](Str#caseless_ascii_equals)`caseless_ascii_equals : Str, Str -> Bool` - -Returns `True` if all the [ASCII characters](https://en.wikipedia.org/wiki/ASCII) in the string are the same when ignoring differences in capitalization. Non-ASCII characters must all be exactly the same, including capitalization. For example: - -Copy - - expect Str.caseless\_ascii\_equals("café", "CAFé") - - expect !Str.caseless\_ascii\_equals("café", "CAFÉ") - -The first call returns `True` because all the ASCII characters are the same when ignoring differences in capitalization, and the only non-ASCII character (`é`) is the same in both strings. The second call returns `False` because `é` and `É` are not ASCII characters, and they are different. - -This function is useful for things like [command-line flags](https://en.wikipedia.org/wiki/Command-line_interface#Command-line_option) and [environment variable names](https://en.wikipedia.org/wiki/Environment_variable) where you know in advance that you're dealing with a string containing only ASCII characters. It has better performance than lowercasing operations which take Unicode into account. - -That said, strings received from user input can always contain non-ASCII Unicode characters, and lowercasing [Unicode](https://unicode.org) works differently in different languages. For example, the string `"I"` lowercases to `"i"` in English and to `"ı"` (a [dotless i](https://en.wikipedia.org/wiki/Dotless_I)) in Turkish. These rules can also change in each [Unicode release](https://www.unicode.org/releases/), so we have a separate [`unicode` package](https://github.com/roc-lang/unicode) for Unicode capitalization that can be upgraded independently from the language's builtins. 
- -To convert a string's ASCII characters to uppercase or lowercase, you can use [`Str.with_ascii_uppercased`](/builtins/alpha4/Str#with_ascii_uppercased "Docs for Str.with_ascii_uppercased") or [`Str.with_ascii_lowercased`](/builtins/alpha4/Str#with_ascii_lowercased "Docs for Str.with_ascii_lowercased"). - -Made by people who like to make nice things. diff --git a/test/fx/Parser.roc b/test/fx/Parser.roc deleted file mode 100644 index bf5ceec8c6..0000000000 --- a/test/fx/Parser.roc +++ /dev/null @@ -1,19 +0,0 @@ -Parser := [].{ - Result(input, a) : Try({ val : a, rem : input }, [ParsingFailure(Str)]) - Fn(input, a) : input -> Result(input, a) - - parse_partial : Fn(input, a), input -> Result(input, a) - parse_partial = |fn, input| fn(input) -} - -test_fn : Parser.Fn(List(U8), Bool) -test_fn = |bytes| if (bytes.len() > 0) Ok({ val : True, rem : [] }) else Err(ParsingFailure("no input")) - -test_input : List(U8) -test_input = [1,2,3] - -test_empty : List(U8) -test_empty = [] - -expect Parser.parse_partial(test_fn, test_empty) == Err(ParsingFailure("no input")) -expect Parser.parse_partial(test_fn, test_input) == Ok({ val : True, rem : [1] }) From 04980635736c1a7cc9d688ae703960d9dd2b477f Mon Sep 17 00:00:00 2001 From: Luke Boswell Date: Wed, 26 Nov 2025 12:11:51 +1100 Subject: [PATCH 04/10] add Str.reserve builtin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/build/builtin_compiler/main.zig | 3 +++ src/build/roc/Builtin.roc | 1 + src/canonicalize/Expression.zig | 1 + src/eval/interpreter.zig | 25 +++++++++++++++++++++++++ test/snapshots/repl/str_reserve.md | 19 +++++++++++++++++++ 5 files changed, 49 insertions(+) create mode 100644 test/snapshots/repl/str_reserve.md diff --git a/src/build/builtin_compiler/main.zig b/src/build/builtin_compiler/main.zig index 6b126776d0..3b0788cc62 100644 --- a/src/build/builtin_compiler/main.zig +++ 
b/src/build/builtin_compiler/main.zig @@ -146,6 +146,9 @@ fn replaceStrIsEmptyWithLowLevel(env: *ModuleEnv) !std.ArrayList(CIR.Def.Idx) { if (env.common.findIdent("Builtin.Str.with_capacity")) |str_with_capacity_ident| { try low_level_map.put(str_with_capacity_ident, .str_with_capacity); } + if (env.common.findIdent("Builtin.Str.reserve")) |str_reserve_ident| { + try low_level_map.put(str_reserve_ident, .str_reserve); + } if (env.common.findIdent("Builtin.List.len")) |list_len_ident| { try low_level_map.put(list_len_ident, .list_len); } diff --git a/src/build/roc/Builtin.roc b/src/build/roc/Builtin.roc index 2fd93fec62..430fb91e2f 100644 --- a/src/build/roc/Builtin.roc +++ b/src/build/roc/Builtin.roc @@ -17,6 +17,7 @@ Builtin :: [].{ drop_suffix : Str, Str -> Str count_utf8_bytes : Str -> U64 with_capacity : U64 -> Str + reserve : Str, U64 -> Str } List(_item) :: [ProvidedByCompiler].{ diff --git a/src/canonicalize/Expression.zig b/src/canonicalize/Expression.zig index 6b48ee7708..88e7c721d9 100644 --- a/src/canonicalize/Expression.zig +++ b/src/canonicalize/Expression.zig @@ -417,6 +417,7 @@ pub const Expr = union(enum) { str_drop_suffix, str_count_utf8_bytes, str_with_capacity, + str_reserve, // Numeric to_str operations u8_to_str, diff --git a/src/eval/interpreter.zig b/src/eval/interpreter.zig index 3a969168a5..7374bd5966 100644 --- a/src/eval/interpreter.zig +++ b/src/eval/interpreter.zig @@ -3128,6 +3128,31 @@ pub const Interpreter = struct { out.is_initialized = true; return out; }, + .str_reserve => { + // Str.reserve : Str, U64 -> Str + std.debug.assert(args.len == 2); + + const string_arg = args[0]; + const spare_arg = args[1]; + + std.debug.assert(string_arg.ptr != null); + + const string: *const RocStr = @ptrCast(@alignCast(string_arg.ptr.?)); + const spare_value = try self.extractNumericValue(spare_arg); + const spare: u64 = @intCast(spare_value.int); + + const result_str = builtins.str.reserveC(string.*, spare, roc_ops); + + const result_layout = 
string_arg.layout; + var out = try self.pushRaw(result_layout, 0); + out.is_initialized = false; + + const result_ptr: *RocStr = @ptrCast(@alignCast(out.ptr.?)); + result_ptr.* = result_str; + + out.is_initialized = true; + return out; + }, .list_len => { // List.len : List(a) -> U64 // Note: listLen returns usize, but List.len always returns U64. diff --git a/test/snapshots/repl/str_reserve.md b/test/snapshots/repl/str_reserve.md new file mode 100644 index 0000000000..8f9562540d --- /dev/null +++ b/test/snapshots/repl/str_reserve.md @@ -0,0 +1,19 @@ +# META +~~~ini +description=Str.reserve should return the same string with additional capacity reserved +type=repl +~~~ +# SOURCE +~~~roc +» Str.reserve("hello", 0) +» Str.reserve("hello", 10) +» Str.reserve("", 100) +~~~ +# OUTPUT +"hello" +--- +"hello" +--- +"" +# PROBLEMS +NIL From e81c896d2ad2ee67909720ab9f22c511600c64bc Mon Sep 17 00:00:00 2001 From: Luke Boswell Date: Wed, 26 Nov 2025 12:34:34 +1100 Subject: [PATCH 05/10] add Str.release_excess_capacity builtin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/build/builtin_compiler/main.zig | 3 +++ src/build/roc/Builtin.roc | 1 + src/canonicalize/Expression.zig | 1 + src/eval/interpreter.zig | 20 +++++++++++++++++++ .../repl/str_release_excess_capacity.md | 19 ++++++++++++++++++ 5 files changed, 44 insertions(+) create mode 100644 test/snapshots/repl/str_release_excess_capacity.md diff --git a/src/build/builtin_compiler/main.zig b/src/build/builtin_compiler/main.zig index 3b0788cc62..c82ce1a044 100644 --- a/src/build/builtin_compiler/main.zig +++ b/src/build/builtin_compiler/main.zig @@ -149,6 +149,9 @@ fn replaceStrIsEmptyWithLowLevel(env: *ModuleEnv) !std.ArrayList(CIR.Def.Idx) { if (env.common.findIdent("Builtin.Str.reserve")) |str_reserve_ident| { try low_level_map.put(str_reserve_ident, .str_reserve); } + if 
(env.common.findIdent("Builtin.Str.release_excess_capacity")) |str_release_excess_capacity_ident| { + try low_level_map.put(str_release_excess_capacity_ident, .str_release_excess_capacity); + } if (env.common.findIdent("Builtin.List.len")) |list_len_ident| { try low_level_map.put(list_len_ident, .list_len); } diff --git a/src/build/roc/Builtin.roc b/src/build/roc/Builtin.roc index 430fb91e2f..43ec70b67c 100644 --- a/src/build/roc/Builtin.roc +++ b/src/build/roc/Builtin.roc @@ -18,6 +18,7 @@ Builtin :: [].{ count_utf8_bytes : Str -> U64 with_capacity : U64 -> Str reserve : Str, U64 -> Str + release_excess_capacity : Str -> Str } List(_item) :: [ProvidedByCompiler].{ diff --git a/src/canonicalize/Expression.zig b/src/canonicalize/Expression.zig index 88e7c721d9..bb212db217 100644 --- a/src/canonicalize/Expression.zig +++ b/src/canonicalize/Expression.zig @@ -418,6 +418,7 @@ pub const Expr = union(enum) { str_count_utf8_bytes, str_with_capacity, str_reserve, + str_release_excess_capacity, // Numeric to_str operations u8_to_str, diff --git a/src/eval/interpreter.zig b/src/eval/interpreter.zig index 7374bd5966..cb51dbd82b 100644 --- a/src/eval/interpreter.zig +++ b/src/eval/interpreter.zig @@ -3153,6 +3153,26 @@ pub const Interpreter = struct { out.is_initialized = true; return out; }, + .str_release_excess_capacity => { + // Str.release_excess_capacity : Str -> Str + std.debug.assert(args.len == 1); + + const string_arg = args[0]; + std.debug.assert(string_arg.ptr != null); + + const string: *const RocStr = @ptrCast(@alignCast(string_arg.ptr.?)); + const result_str = builtins.str.strReleaseExcessCapacity(roc_ops, string.*); + + const result_layout = string_arg.layout; + var out = try self.pushRaw(result_layout, 0); + out.is_initialized = false; + + const result_ptr: *RocStr = @ptrCast(@alignCast(out.ptr.?)); + result_ptr.* = result_str; + + out.is_initialized = true; + return out; + }, .list_len => { // List.len : List(a) -> U64 // Note: listLen returns usize, but 
List.len always returns U64. diff --git a/test/snapshots/repl/str_release_excess_capacity.md b/test/snapshots/repl/str_release_excess_capacity.md new file mode 100644 index 0000000000..b0a89a168b --- /dev/null +++ b/test/snapshots/repl/str_release_excess_capacity.md @@ -0,0 +1,19 @@ +# META +~~~ini +description=Str.release_excess_capacity should return the same string with excess capacity released +type=repl +~~~ +# SOURCE +~~~roc +» Str.release_excess_capacity("hello") +» Str.release_excess_capacity("") +» Str.release_excess_capacity("hello world") +~~~ +# OUTPUT +"hello" +--- +"" +--- +"hello world" +# PROBLEMS +NIL From 8fcd482901902a3d85a5bbfc3bfeed2732c9e32e Mon Sep 17 00:00:00 2001 From: Luke Boswell Date: Wed, 26 Nov 2025 12:38:54 +1100 Subject: [PATCH 06/10] add Str.to_utf8 builtin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/build/builtin_compiler/main.zig | 3 +++ src/build/roc/Builtin.roc | 1 + src/canonicalize/Expression.zig | 1 + src/eval/interpreter.zig | 25 +++++++++++++++++++++++++ test/snapshots/repl/str_to_utf8.md | 22 ++++++++++++++++++++++ 5 files changed, 52 insertions(+) create mode 100644 test/snapshots/repl/str_to_utf8.md diff --git a/src/build/builtin_compiler/main.zig b/src/build/builtin_compiler/main.zig index c82ce1a044..029506ebf7 100644 --- a/src/build/builtin_compiler/main.zig +++ b/src/build/builtin_compiler/main.zig @@ -152,6 +152,9 @@ fn replaceStrIsEmptyWithLowLevel(env: *ModuleEnv) !std.ArrayList(CIR.Def.Idx) { if (env.common.findIdent("Builtin.Str.release_excess_capacity")) |str_release_excess_capacity_ident| { try low_level_map.put(str_release_excess_capacity_ident, .str_release_excess_capacity); } + if (env.common.findIdent("Builtin.Str.to_utf8")) |str_to_utf8_ident| { + try low_level_map.put(str_to_utf8_ident, .str_to_utf8); + } if (env.common.findIdent("Builtin.List.len")) |list_len_ident| 
{ try low_level_map.put(list_len_ident, .list_len); } diff --git a/src/build/roc/Builtin.roc b/src/build/roc/Builtin.roc index 43ec70b67c..1160c81bcc 100644 --- a/src/build/roc/Builtin.roc +++ b/src/build/roc/Builtin.roc @@ -19,6 +19,7 @@ Builtin :: [].{ with_capacity : U64 -> Str reserve : Str, U64 -> Str release_excess_capacity : Str -> Str + to_utf8 : Str -> List(U8) } List(_item) :: [ProvidedByCompiler].{ diff --git a/src/canonicalize/Expression.zig b/src/canonicalize/Expression.zig index bb212db217..f1228da0e4 100644 --- a/src/canonicalize/Expression.zig +++ b/src/canonicalize/Expression.zig @@ -419,6 +419,7 @@ pub const Expr = union(enum) { str_with_capacity, str_reserve, str_release_excess_capacity, + str_to_utf8, // Numeric to_str operations u8_to_str, diff --git a/src/eval/interpreter.zig b/src/eval/interpreter.zig index cb51dbd82b..aaf1c949d9 100644 --- a/src/eval/interpreter.zig +++ b/src/eval/interpreter.zig @@ -3173,6 +3173,31 @@ pub const Interpreter = struct { out.is_initialized = true; return out; }, + .str_to_utf8 => { + // Str.to_utf8 : Str -> List(U8) + std.debug.assert(args.len == 1); + + const string_arg = args[0]; + std.debug.assert(string_arg.ptr != null); + + const string: *const RocStr = @ptrCast(@alignCast(string_arg.ptr.?)); + const result_list = builtins.str.strToUtf8C(string.*, roc_ops); + + const result_rt_var = return_rt_var orelse { + self.triggerCrash("str_to_utf8 requires return type info", false, roc_ops); + return error.Crash; + }; + const result_layout = try self.getRuntimeLayout(result_rt_var); + + var out = try self.pushRaw(result_layout, 0); + out.is_initialized = false; + + const result_ptr: *builtins.list.RocList = @ptrCast(@alignCast(out.ptr.?)); + result_ptr.* = result_list; + + out.is_initialized = true; + return out; + }, .list_len => { // List.len : List(a) -> U64 // Note: listLen returns usize, but List.len always returns U64. 
diff --git a/test/snapshots/repl/str_to_utf8.md b/test/snapshots/repl/str_to_utf8.md new file mode 100644 index 0000000000..57f308e325 --- /dev/null +++ b/test/snapshots/repl/str_to_utf8.md @@ -0,0 +1,22 @@ +# META +~~~ini +description=Str.to_utf8 should convert a string to a list of UTF-8 bytes +type=repl +~~~ +# SOURCE +~~~roc +» List.len(Str.to_utf8("")) +» List.len(Str.to_utf8("hello")) +» List.len(Str.to_utf8("é")) +» List.len(Str.to_utf8("🎉")) +~~~ +# OUTPUT +0 +--- +5 +--- +2 +--- +4 +# PROBLEMS +NIL From 9b2e57eeef5e3d5151f803d53826ba5418fd3d3b Mon Sep 17 00:00:00 2001 From: Luke Boswell Date: Wed, 26 Nov 2025 12:40:48 +1100 Subject: [PATCH 07/10] add Str.from_utf8_lossy builtin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/build/builtin_compiler/main.zig | 3 +++ src/build/roc/Builtin.roc | 1 + src/canonicalize/Expression.zig | 1 + src/eval/interpreter.zig | 20 +++++++++++++++++ test/snapshots/repl/str_from_utf8_lossy.md | 25 ++++++++++++++++++++++ 5 files changed, 50 insertions(+) create mode 100644 test/snapshots/repl/str_from_utf8_lossy.md diff --git a/src/build/builtin_compiler/main.zig b/src/build/builtin_compiler/main.zig index 029506ebf7..ed5ff0f637 100644 --- a/src/build/builtin_compiler/main.zig +++ b/src/build/builtin_compiler/main.zig @@ -155,6 +155,9 @@ fn replaceStrIsEmptyWithLowLevel(env: *ModuleEnv) !std.ArrayList(CIR.Def.Idx) { if (env.common.findIdent("Builtin.Str.to_utf8")) |str_to_utf8_ident| { try low_level_map.put(str_to_utf8_ident, .str_to_utf8); } + if (env.common.findIdent("Builtin.Str.from_utf8_lossy")) |str_from_utf8_lossy_ident| { + try low_level_map.put(str_from_utf8_lossy_ident, .str_from_utf8_lossy); + } if (env.common.findIdent("Builtin.List.len")) |list_len_ident| { try low_level_map.put(list_len_ident, .list_len); } diff --git a/src/build/roc/Builtin.roc b/src/build/roc/Builtin.roc index 
1160c81bcc..df0caf6e9f 100644 --- a/src/build/roc/Builtin.roc +++ b/src/build/roc/Builtin.roc @@ -20,6 +20,7 @@ Builtin :: [].{ reserve : Str, U64 -> Str release_excess_capacity : Str -> Str to_utf8 : Str -> List(U8) + from_utf8_lossy : List(U8) -> Str } List(_item) :: [ProvidedByCompiler].{ diff --git a/src/canonicalize/Expression.zig b/src/canonicalize/Expression.zig index f1228da0e4..a27558b015 100644 --- a/src/canonicalize/Expression.zig +++ b/src/canonicalize/Expression.zig @@ -420,6 +420,7 @@ pub const Expr = union(enum) { str_reserve, str_release_excess_capacity, str_to_utf8, + str_from_utf8_lossy, // Numeric to_str operations u8_to_str, diff --git a/src/eval/interpreter.zig b/src/eval/interpreter.zig index aaf1c949d9..2ba81ff885 100644 --- a/src/eval/interpreter.zig +++ b/src/eval/interpreter.zig @@ -3198,6 +3198,26 @@ pub const Interpreter = struct { out.is_initialized = true; return out; }, + .str_from_utf8_lossy => { + // Str.from_utf8_lossy : List(U8) -> Str + std.debug.assert(args.len == 1); + + const list_arg = args[0]; + std.debug.assert(list_arg.ptr != null); + + const roc_list: *const builtins.list.RocList = @ptrCast(@alignCast(list_arg.ptr.?)); + const result_str = builtins.str.fromUtf8Lossy(roc_list.*, roc_ops); + + const result_layout = layout.Layout.str(); + var out = try self.pushRaw(result_layout, 0); + out.is_initialized = false; + + const result_ptr: *RocStr = @ptrCast(@alignCast(out.ptr.?)); + result_ptr.* = result_str; + + out.is_initialized = true; + return out; + }, .list_len => { // List.len : List(a) -> U64 // Note: listLen returns usize, but List.len always returns U64. 
diff --git a/test/snapshots/repl/str_from_utf8_lossy.md b/test/snapshots/repl/str_from_utf8_lossy.md new file mode 100644 index 0000000000..47ab86f280 --- /dev/null +++ b/test/snapshots/repl/str_from_utf8_lossy.md @@ -0,0 +1,25 @@ +# META +~~~ini +description=Str.from_utf8_lossy should convert a list of UTF-8 bytes to a string +type=repl +~~~ +# SOURCE +~~~roc +» Str.from_utf8_lossy(Str.to_utf8("")) +» Str.from_utf8_lossy(Str.to_utf8("hello")) +» Str.from_utf8_lossy(Str.to_utf8("hello world")) +» Str.from_utf8_lossy(Str.to_utf8("é")) +» Str.from_utf8_lossy(Str.to_utf8("🎉")) +~~~ +# OUTPUT +"" +--- +"hello" +--- +"hello world" +--- +"é" +--- +"🎉" +# PROBLEMS +NIL From f95228dca16313506e36f73dcf903147ed8b1d03 Mon Sep 17 00:00:00 2001 From: Luke Boswell Date: Wed, 26 Nov 2025 12:52:37 +1100 Subject: [PATCH 08/10] add Str.split_on builtin with comprehensive tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Includes interpreter tests in low_level_interp_test.zig for all new builtins: - count_utf8_bytes: tests empty, ASCII, multi-byte UTF-8, emoji - with_capacity: tests zero and non-zero capacity - reserve: tests content preservation - release_excess_capacity: tests content preservation - to_utf8: tests length verification via List.len - from_utf8_lossy: tests roundtrip with to_utf8 - split_on: tests count and actual substring content via List.first Also updates Color.md snapshot now that to_utf8 is a valid method. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/build/builtin_compiler/main.zig | 3 + src/build/roc/Builtin.roc | 1 + src/canonicalize/Expression.zig | 1 + src/eval/interpreter.zig | 30 ++++ src/eval/test/low_level_interp_test.zig | 198 ++++++++++++++++++++++++ test/snapshots/plume_package/Color.md | 72 ++++++++- test/snapshots/repl/str_split_on.md | 22 +++ 7 files changed, 321 insertions(+), 6 deletions(-) create mode 100644 test/snapshots/repl/str_split_on.md diff --git a/src/build/builtin_compiler/main.zig b/src/build/builtin_compiler/main.zig index ed5ff0f637..f65ad2aaf2 100644 --- a/src/build/builtin_compiler/main.zig +++ b/src/build/builtin_compiler/main.zig @@ -158,6 +158,9 @@ fn replaceStrIsEmptyWithLowLevel(env: *ModuleEnv) !std.ArrayList(CIR.Def.Idx) { if (env.common.findIdent("Builtin.Str.from_utf8_lossy")) |str_from_utf8_lossy_ident| { try low_level_map.put(str_from_utf8_lossy_ident, .str_from_utf8_lossy); } + if (env.common.findIdent("Builtin.Str.split_on")) |str_split_on_ident| { + try low_level_map.put(str_split_on_ident, .str_split_on); + } if (env.common.findIdent("Builtin.List.len")) |list_len_ident| { try low_level_map.put(list_len_ident, .list_len); } diff --git a/src/build/roc/Builtin.roc b/src/build/roc/Builtin.roc index df0caf6e9f..f6064b071e 100644 --- a/src/build/roc/Builtin.roc +++ b/src/build/roc/Builtin.roc @@ -21,6 +21,7 @@ Builtin :: [].{ release_excess_capacity : Str -> Str to_utf8 : Str -> List(U8) from_utf8_lossy : List(U8) -> Str + split_on : Str, Str -> List(Str) } List(_item) :: [ProvidedByCompiler].{ diff --git a/src/canonicalize/Expression.zig b/src/canonicalize/Expression.zig index a27558b015..1b7d75a5ba 100644 --- a/src/canonicalize/Expression.zig +++ b/src/canonicalize/Expression.zig @@ -421,6 +421,7 @@ pub const Expr = union(enum) { str_release_excess_capacity, str_to_utf8, str_from_utf8_lossy, + str_split_on, // Numeric to_str operations u8_to_str, diff --git 
a/src/eval/interpreter.zig b/src/eval/interpreter.zig index 2ba81ff885..66b5ebedfb 100644 --- a/src/eval/interpreter.zig +++ b/src/eval/interpreter.zig @@ -3218,6 +3218,36 @@ pub const Interpreter = struct { out.is_initialized = true; return out; }, + .str_split_on => { + // Str.split_on : Str, Str -> List(Str) + std.debug.assert(args.len == 2); + + const string_arg = args[0]; + const delimiter_arg = args[1]; + + std.debug.assert(string_arg.ptr != null); + std.debug.assert(delimiter_arg.ptr != null); + + const string: *const RocStr = @ptrCast(@alignCast(string_arg.ptr.?)); + const delimiter: *const RocStr = @ptrCast(@alignCast(delimiter_arg.ptr.?)); + + const result_list = builtins.str.strSplitOn(string.*, delimiter.*, roc_ops); + + const result_rt_var = return_rt_var orelse { + self.triggerCrash("str_split_on requires return type info", false, roc_ops); + return error.Crash; + }; + const result_layout = try self.getRuntimeLayout(result_rt_var); + + var out = try self.pushRaw(result_layout, 0); + out.is_initialized = false; + + const result_ptr: *builtins.list.RocList = @ptrCast(@alignCast(out.ptr.?)); + result_ptr.* = result_list; + + out.is_initialized = true; + return out; + }, .list_len => { // List.len : List(a) -> U64 // Note: listLen returns usize, but List.len always returns U64. 
diff --git a/src/eval/test/low_level_interp_test.zig b/src/eval/test/low_level_interp_test.zig index 160b50b0c9..28d5b77cfe 100644 --- a/src/eval/test/low_level_interp_test.zig +++ b/src/eval/test/low_level_interp_test.zig @@ -1120,3 +1120,201 @@ test "e_low_level_lambda - Str.drop_suffix suffix longer than string" { defer test_allocator.free(value); try testing.expectEqualStrings("\"hi\"", value); } + +// count_utf8_bytes tests +test "e_low_level_lambda - Str.count_utf8_bytes empty string" { + const src = + \\x = Str.count_utf8_bytes("") + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 0), value); +} + +test "e_low_level_lambda - Str.count_utf8_bytes ASCII string" { + const src = + \\x = Str.count_utf8_bytes("hello") + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 5), value); +} + +test "e_low_level_lambda - Str.count_utf8_bytes multi-byte UTF-8" { + const src = + \\x = Str.count_utf8_bytes("é") + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 2), value); +} + +test "e_low_level_lambda - Str.count_utf8_bytes emoji" { + const src = + \\x = Str.count_utf8_bytes("🎉") + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 4), value); +} + +// with_capacity tests +test "e_low_level_lambda - Str.with_capacity returns empty string" { + const src = + \\x = Str.with_capacity(0) + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"\"", value); +} + +test "e_low_level_lambda - Str.with_capacity with capacity returns empty string" { + const src = + \\x = Str.with_capacity(100) + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"\"", value); +} + +// reserve tests +test "e_low_level_lambda - Str.reserve preserves content" { + const src = + \\x = 
Str.reserve("hello", 100) + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"hello\"", value); +} + +test "e_low_level_lambda - Str.reserve empty string" { + const src = + \\x = Str.reserve("", 50) + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"\"", value); +} + +// release_excess_capacity tests +test "e_low_level_lambda - Str.release_excess_capacity preserves content" { + const src = + \\x = Str.release_excess_capacity("hello") + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"hello\"", value); +} + +test "e_low_level_lambda - Str.release_excess_capacity empty string" { + const src = + \\x = Str.release_excess_capacity("") + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"\"", value); +} + +// to_utf8 tests (using List.len to verify) +test "e_low_level_lambda - Str.to_utf8 empty string" { + const src = + \\x = List.len(Str.to_utf8("")) + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 0), value); +} + +test "e_low_level_lambda - Str.to_utf8 ASCII string" { + const src = + \\x = List.len(Str.to_utf8("hello")) + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 5), value); +} + +test "e_low_level_lambda - Str.to_utf8 multi-byte UTF-8" { + const src = + \\x = List.len(Str.to_utf8("é")) + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 2), value); +} + +// from_utf8_lossy tests (roundtrip through to_utf8) +test "e_low_level_lambda - Str.from_utf8_lossy roundtrip ASCII" { + const src = + \\x = Str.from_utf8_lossy(Str.to_utf8("hello")) + ; + const value = try evalModuleAndGetString(src, 
0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"hello\"", value); +} + +test "e_low_level_lambda - Str.from_utf8_lossy roundtrip empty" { + const src = + \\x = Str.from_utf8_lossy(Str.to_utf8("")) + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"\"", value); +} + +test "e_low_level_lambda - Str.from_utf8_lossy roundtrip UTF-8" { + const src = + \\x = Str.from_utf8_lossy(Str.to_utf8("hello 🎉 world")) + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"hello 🎉 world\"", value); +} + +// split_on tests +test "e_low_level_lambda - Str.split_on basic split count" { + const src = + \\x = List.len(Str.split_on("hello world", " ")) + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 2), value); +} + +test "e_low_level_lambda - Str.split_on basic split first element" { + const src = + \\parts = Str.split_on("hello world", " ") + \\first = List.first(parts) + ; + const value = try evalModuleAndGetString(src, 1, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("Ok(\"hello\")", value); +} + +test "e_low_level_lambda - Str.split_on multiple delimiters count" { + const src = + \\x = List.len(Str.split_on("a,b,c,d", ",")) + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 4), value); +} + +test "e_low_level_lambda - Str.split_on multiple delimiters first element" { + const src = + \\parts = Str.split_on("a,b,c,d", ",") + \\first = List.first(parts) + ; + const value = try evalModuleAndGetString(src, 1, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("Ok(\"a\")", value); +} + +test "e_low_level_lambda - Str.split_on no match" { + const src = + \\parts = Str.split_on("hello", "x") + \\first = 
List.first(parts) + ; + const value = try evalModuleAndGetString(src, 1, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("Ok(\"hello\")", value); +} + +test "e_low_level_lambda - Str.split_on empty string" { + const src = + \\x = List.len(Str.split_on("", ",")) + ; + const value = try evalModuleAndGetInt(src, 0); + try testing.expectEqual(@as(i128, 1), value); +} diff --git a/test/snapshots/plume_package/Color.md b/test/snapshots/plume_package/Color.md index 00a1997700..ca95ed1177 100644 --- a/test/snapshots/plume_package/Color.md +++ b/test/snapshots/plume_package/Color.md @@ -89,7 +89,12 @@ DOES NOT EXIST - Color.md:51:75:51:85 DOES NOT EXIST - Color.md:51:93:51:103 DOES NOT EXIST - Color.md:68:14:68:27 MISSING METHOD - Color.md:22:15:22:26 -MISSING METHOD - Color.md:29:13:29:26 +MISSING METHOD - Color.md:35:17:35:41 +MISSING METHOD - Color.md:36:21:36:45 +MISSING METHOD - Color.md:37:21:37:45 +MISSING METHOD - Color.md:38:21:38:45 +MISSING METHOD - Color.md:39:21:39:45 +MISSING METHOD - Color.md:40:21:40:45 TYPE MISMATCH - Color.md:32:5:45:6 MISSING METHOD - Color.md:62:8:62:28 MISSING METHOD - Color.md:56:8:56:34 @@ -221,15 +226,70 @@ This **to_frac** method is being called on the type **Num.U8**, which has no met **Hint: **For this to work, the type would need to have a method named **to_frac** associated with it in the type's declaration. **MISSING METHOD** -This **to_utf8** method is being called on the type **Str**, which has no method with that name: -**Color.md:29:13:29:26:** +This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: +**Color.md:35:17:35:41:** ```roc - bytes = str.to_utf8() + a.is_char_in_hex_range() ``` - ^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^ -**Hint: **For this to work, the type would need to have a method named **to_utf8** associated with it in the type's declaration. 
+**Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. + +**MISSING METHOD** +This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: +**Color.md:36:21:36:45:** +```roc + and b.is_char_in_hex_range() +``` + ^^^^^^^^^^^^^^^^^^^^^^^^ + + +**Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. + +**MISSING METHOD** +This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: +**Color.md:37:21:37:45:** +```roc + and c.is_char_in_hex_range() +``` + ^^^^^^^^^^^^^^^^^^^^^^^^ + + +**Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. + +**MISSING METHOD** +This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: +**Color.md:38:21:38:45:** +```roc + and d.is_char_in_hex_range() +``` + ^^^^^^^^^^^^^^^^^^^^^^^^ + + +**Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. + +**MISSING METHOD** +This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: +**Color.md:39:21:39:45:** +```roc + and e.is_char_in_hex_range() +``` + ^^^^^^^^^^^^^^^^^^^^^^^^ + + +**Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. 
+ +**MISSING METHOD** +This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: +**Color.md:40:21:40:45:** +```roc + and f.is_char_in_hex_range() +``` + ^^^^^^^^^^^^^^^^^^^^^^^^ + + +**Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. **TYPE MISMATCH** This expression is used in an unexpected way: diff --git a/test/snapshots/repl/str_split_on.md b/test/snapshots/repl/str_split_on.md new file mode 100644 index 0000000000..2c76888604 --- /dev/null +++ b/test/snapshots/repl/str_split_on.md @@ -0,0 +1,22 @@ +# META +~~~ini +description=Str.split_on should split a string on a delimiter +type=repl +~~~ +# SOURCE +~~~roc +» List.len(Str.split_on("hello world", " ")) +» List.len(Str.split_on("a,b,c", ",")) +» List.len(Str.split_on("no match", "x")) +» List.len(Str.split_on("", ",")) +~~~ +# OUTPUT +2 +--- +3 +--- +1 +--- +1 +# PROBLEMS +NIL From 173c00993bcec0756b3bcd1365e1f39729fa4609 Mon Sep 17 00:00:00 2001 From: Luke Boswell Date: Wed, 26 Nov 2025 12:54:53 +1100 Subject: [PATCH 09/10] add Str.join_with builtin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Includes interpreter tests: - basic join with two elements - multiple elements with comma separator - single element - empty list - roundtrip with split_on 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/build/builtin_compiler/main.zig | 3 ++ src/build/roc/Builtin.roc | 1 + src/canonicalize/Expression.zig | 1 + src/eval/interpreter.zig | 25 ++++++++++++++ src/eval/test/low_level_interp_test.zig | 46 +++++++++++++++++++++++++ test/snapshots/repl/str_join_with.md | 22 ++++++++++++ 6 files changed, 98 insertions(+) create mode 100644 test/snapshots/repl/str_join_with.md diff --git a/src/build/builtin_compiler/main.zig b/src/build/builtin_compiler/main.zig index f65ad2aaf2..17a9d877ad 
100644 --- a/src/build/builtin_compiler/main.zig +++ b/src/build/builtin_compiler/main.zig @@ -161,6 +161,9 @@ fn replaceStrIsEmptyWithLowLevel(env: *ModuleEnv) !std.ArrayList(CIR.Def.Idx) { if (env.common.findIdent("Builtin.Str.split_on")) |str_split_on_ident| { try low_level_map.put(str_split_on_ident, .str_split_on); } + if (env.common.findIdent("Builtin.Str.join_with")) |str_join_with_ident| { + try low_level_map.put(str_join_with_ident, .str_join_with); + } if (env.common.findIdent("Builtin.List.len")) |list_len_ident| { try low_level_map.put(list_len_ident, .list_len); } diff --git a/src/build/roc/Builtin.roc b/src/build/roc/Builtin.roc index f6064b071e..ecd83dfccf 100644 --- a/src/build/roc/Builtin.roc +++ b/src/build/roc/Builtin.roc @@ -22,6 +22,7 @@ Builtin :: [].{ to_utf8 : Str -> List(U8) from_utf8_lossy : List(U8) -> Str split_on : Str, Str -> List(Str) + join_with : List(Str), Str -> Str } List(_item) :: [ProvidedByCompiler].{ diff --git a/src/canonicalize/Expression.zig b/src/canonicalize/Expression.zig index 1b7d75a5ba..76732ff2dc 100644 --- a/src/canonicalize/Expression.zig +++ b/src/canonicalize/Expression.zig @@ -422,6 +422,7 @@ pub const Expr = union(enum) { str_to_utf8, str_from_utf8_lossy, str_split_on, + str_join_with, // Numeric to_str operations u8_to_str, diff --git a/src/eval/interpreter.zig b/src/eval/interpreter.zig index 66b5ebedfb..2847c09b77 100644 --- a/src/eval/interpreter.zig +++ b/src/eval/interpreter.zig @@ -3248,6 +3248,31 @@ pub const Interpreter = struct { out.is_initialized = true; return out; }, + .str_join_with => { + // Str.join_with : List(Str), Str -> Str + std.debug.assert(args.len == 2); + + const list_arg = args[0]; + const separator_arg = args[1]; + + std.debug.assert(list_arg.ptr != null); + std.debug.assert(separator_arg.ptr != null); + + const roc_list: *const builtins.list.RocList = @ptrCast(@alignCast(list_arg.ptr.?)); + const separator: *const RocStr = @ptrCast(@alignCast(separator_arg.ptr.?)); + + const 
result_str = builtins.str.strJoinWithC(roc_list.*, separator.*, roc_ops); + + const result_layout = layout.Layout.str(); + var out = try self.pushRaw(result_layout, 0); + out.is_initialized = false; + + const result_ptr: *RocStr = @ptrCast(@alignCast(out.ptr.?)); + result_ptr.* = result_str; + + out.is_initialized = true; + return out; + }, .list_len => { // List.len : List(a) -> U64 // Note: listLen returns usize, but List.len always returns U64. diff --git a/src/eval/test/low_level_interp_test.zig b/src/eval/test/low_level_interp_test.zig index 28d5b77cfe..9366326214 100644 --- a/src/eval/test/low_level_interp_test.zig +++ b/src/eval/test/low_level_interp_test.zig @@ -1318,3 +1318,49 @@ test "e_low_level_lambda - Str.split_on empty string" { const value = try evalModuleAndGetInt(src, 0); try testing.expectEqual(@as(i128, 1), value); } + +// join_with tests +test "e_low_level_lambda - Str.join_with basic join" { + const src = + \\x = Str.join_with(["hello", "world"], " ") + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"hello world\"", value); +} + +test "e_low_level_lambda - Str.join_with multiple elements" { + const src = + \\x = Str.join_with(["a", "b", "c", "d"], ",") + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"a,b,c,d\"", value); +} + +test "e_low_level_lambda - Str.join_with single element" { + const src = + \\x = Str.join_with(["hello"], "-") + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"hello\"", value); +} + +test "e_low_level_lambda - Str.join_with empty list" { + const src = + \\x = Str.join_with([], ",") + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"\"", value); +} + 
+test "e_low_level_lambda - Str.join_with roundtrip with split_on" { + const src = + \\x = Str.join_with(Str.split_on("hello world", " "), " ") + ; + const value = try evalModuleAndGetString(src, 0, test_allocator); + defer test_allocator.free(value); + try testing.expectEqualStrings("\"hello world\"", value); +} diff --git a/test/snapshots/repl/str_join_with.md b/test/snapshots/repl/str_join_with.md new file mode 100644 index 0000000000..1b0ade9365 --- /dev/null +++ b/test/snapshots/repl/str_join_with.md @@ -0,0 +1,22 @@ +# META +~~~ini +description=Str.join_with should join strings with a separator +type=repl +~~~ +# SOURCE +~~~roc +» Str.join_with(["hello", "world"], " ") +» Str.join_with(["a", "b", "c"], ",") +» Str.join_with(["single"], "-") +» Str.join_with([], ",") +~~~ +# OUTPUT +"hello world" +--- +"a,b,c" +--- +"single" +--- +"" +# PROBLEMS +NIL From b97f5fae9e1f052cca6c7733e3f6b6efb684d65d Mon Sep 17 00:00:00 2001 From: Anton-4 <17049058+Anton-4@users.noreply.github.com> Date: Thu, 27 Nov 2025 14:29:59 +0100 Subject: [PATCH 10/10] plum update, fix snapshot_tool_test: integer overflow --- src/eval/interpreter.zig | 22 ++++++-- src/eval/test/low_level_interp_test.zig | 1 - test/snapshots/plume_package/Color.md | 69 +++++++++++++++---------- 3 files changed, 60 insertions(+), 32 deletions(-) diff --git a/src/eval/interpreter.zig b/src/eval/interpreter.zig index 4241b3dc88..d55e3a2cd5 100644 --- a/src/eval/interpreter.zig +++ b/src/eval/interpreter.zig @@ -3421,11 +3421,25 @@ pub const Interpreter = struct { const result_list = builtins.str.strSplitOn(string.*, delimiter.*, roc_ops); - const result_rt_var = return_rt_var orelse { - self.triggerCrash("str_split_on requires return type info", false, roc_ops); - return error.Crash; + // str_split_on has a fixed return type of List(Str). + // Prefer the caller's return_rt_var when it matches that shape, but fall back + // to the known layout if type information is missing or incorrect. 
+ const result_layout = blk: { + const expected_idx = try self.runtime_layout_store.insertList(layout.Idx.str); + const expected_layout = self.runtime_layout_store.getLayout(expected_idx); + + if (return_rt_var) |rt_var| { + const candidate = self.getRuntimeLayout(rt_var) catch expected_layout; + if (candidate.tag == .list) { + const elem_layout = self.runtime_layout_store.getLayout(candidate.data.list); + if (elem_layout.tag == .scalar and elem_layout.data.scalar.tag == .str) { + break :blk candidate; + } + } + } + + break :blk expected_layout; }; - const result_layout = try self.getRuntimeLayout(result_rt_var); var out = try self.pushRaw(result_layout, 0); out.is_initialized = false; diff --git a/src/eval/test/low_level_interp_test.zig b/src/eval/test/low_level_interp_test.zig index 5e0e99bd41..d02cc840b8 100644 --- a/src/eval/test/low_level_interp_test.zig +++ b/src/eval/test/low_level_interp_test.zig @@ -6,7 +6,6 @@ const std = @import("std"); const parse = @import("parse"); -const types = @import("types"); const base = @import("base"); const can = @import("can"); const check = @import("check"); diff --git a/test/snapshots/plume_package/Color.md b/test/snapshots/plume_package/Color.md index 9a9157419e..99697ec85c 100644 --- a/test/snapshots/plume_package/Color.md +++ b/test/snapshots/plume_package/Color.md @@ -88,15 +88,13 @@ DOES NOT EXIST - Color.md:51:57:51:67 DOES NOT EXIST - Color.md:51:75:51:85 DOES NOT EXIST - Color.md:51:93:51:103 DOES NOT EXIST - Color.md:68:14:68:27 -MISSING METHOD - Color.md:22:15:22:26 -MISSING METHOD - Color.md:35:17:35:41 -MISSING METHOD - Color.md:36:21:36:45 -MISSING METHOD - Color.md:37:21:37:45 -MISSING METHOD - Color.md:38:21:38:45 -MISSING METHOD - Color.md:39:21:39:45 -MISSING METHOD - Color.md:40:21:40:45 MISSING METHOD - Color.md:22:17:22:24 -MISSING METHOD - Color.md:29:17:29:24 +MISSING METHOD - Color.md:35:19:35:39 +MISSING METHOD - Color.md:36:23:36:43 +MISSING METHOD - Color.md:37:23:37:43 +MISSING METHOD - 
Color.md:38:23:38:43 +MISSING METHOD - Color.md:39:23:39:43 +MISSING METHOD - Color.md:40:23:40:43 TYPE MISMATCH - Color.md:32:5:45:6 MISSING METHOD - Color.md:62:12:62:26 MISSING METHOD - Color.md:56:26:56:32 @@ -231,69 +229,86 @@ The value's type, which does not have a method named **to_frac**, is: **Hint: **For this to work, the type would need to have a method named **to_frac** associated with it in the type's declaration. **MISSING METHOD** -This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: -**Color.md:35:17:35:41:** +This **is_char_in_hex_range** method is being called on a value whose type doesn't have that method: +**Color.md:35:19:35:39:** ```roc a.is_char_in_hex_range() ``` - ^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^ +The value's type, which does not have a method named **is_char_in_hex_range**, is: + + _U8_ **Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. **MISSING METHOD** -This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: -**Color.md:36:21:36:45:** +This **is_char_in_hex_range** method is being called on a value whose type doesn't have that method: +**Color.md:36:23:36:43:** ```roc and b.is_char_in_hex_range() ``` - ^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^ +The value's type, which does not have a method named **is_char_in_hex_range**, is: + + _U8_ **Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. 
**MISSING METHOD** -This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: -**Color.md:37:21:37:45:** +This **is_char_in_hex_range** method is being called on a value whose type doesn't have that method: +**Color.md:37:23:37:43:** ```roc and c.is_char_in_hex_range() ``` - ^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^ +The value's type, which does not have a method named **is_char_in_hex_range**, is: + + _U8_ **Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. **MISSING METHOD** -This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: -**Color.md:38:21:38:45:** +This **is_char_in_hex_range** method is being called on a value whose type doesn't have that method: +**Color.md:38:23:38:43:** ```roc and d.is_char_in_hex_range() ``` - ^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^ +The value's type, which does not have a method named **is_char_in_hex_range**, is: + + _U8_ **Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. **MISSING METHOD** -This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: -**Color.md:39:21:39:45:** +This **is_char_in_hex_range** method is being called on a value whose type doesn't have that method: +**Color.md:39:23:39:43:** ```roc and e.is_char_in_hex_range() ``` - ^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^ +The value's type, which does not have a method named **is_char_in_hex_range**, is: + + _U8_ **Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration. 
**MISSING METHOD** -This **is_char_in_hex_range** method is being called on the type **Num.U8**, which has no method with that name: -**Color.md:40:21:40:45:** +This **is_char_in_hex_range** method is being called on a value whose type doesn't have that method: +**Color.md:40:23:40:43:** ```roc and f.is_char_in_hex_range() ``` - ^^^^^^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^ - _Str_ +The value's type, which does not have a method named **is_char_in_hex_range**, is: + + _U8_ **Hint: **For this to work, the type would need to have a method named **is_char_in_hex_range** associated with it in the type's declaration.