Merge branch 'main' into builtin-json

2025-08-03 11:52:19 +00:00 · 2023-04-04 17:21:08 +10:00 · 2023-04-04 17:21:08 +10:00 · dc43290647
commit dc43290647
parent addc4debef fe71c389d6
199 changed files with 10195 additions and 3810 deletions
--- a/crates/compiler/builtins/bitcode/src/str.zig
+++ b/crates/compiler/builtins/bitcode/src/str.zig
@ -1921,7 +1921,7 @@ pub fn fromUtf8Range(arg: RocList, start: usize, count: usize, update_mode: Upda
            .problem_code = Utf8ByteProblem.InvalidStartByte,
        };
    }
-    const bytes = @ptrCast([*]const u8, arg.bytes)[start..count];
+    const bytes = @ptrCast([*]const u8, arg.bytes)[start .. start + count];

    if (isValidUnicode(bytes)) {
        // Make a seamless slice of the input.
--- a/crates/compiler/builtins/roc/Str.roc
+++ b/crates/compiler/builtins/roc/Str.roc
@ -1,3 +1,21 @@
+## Roc strings are sequences of text values. This module includes functions for combining strings,
+## as well as breaking them up into smaller units—most commonly [extended grapheme clusters](http://www.unicode.org/glossary/#extended_grapheme_cluster)
+## (referred to in this module's documentation as "graphemes" rather than "characters" for clarity;
+## "characters" can mean very different things in different languages).
+##
+## This module focuses on graphemes (as opposed to, say, Unicode code points or LATIN-1 bytes)
+## because graphemes avoid common classes of bugs. Breaking strings up using code points often
+## leads to bugs around things like emoji, where multiple code points combine to form to a
+## single rendered glyph. Graphemes avoid these bugs by treating multi-code-point things like
+## emojis as indivisible units.
+##
+## Because graphemes can have variable length (there's no upper limit on how many code points one
+## grapheme can represent), it takes linear time to count the number of graphemes in a string,
+## and also linear time to find an individual grapheme within a string by its position (or "index")
+## among the string's other graphemes. The only way to get constant-time access to these is in a way
+## that can result in bugs if the string contains multi-code-point things like emojis, which is why
+## this module does not offer those.
+##
 ##
 ## ## Working with Unicode strings in Roc
 ##
@ -107,6 +125,7 @@ interface Str
        replaceLast,
        splitFirst,
        splitLast,
+        walkUtf8,
        walkUtf8WithIndex,
        reserve,
        releaseExcessCapacity,
@ -151,8 +170,89 @@ isEmpty : Str -> Bool
 concat : Str, Str -> Str

 ## Returns a string of the specified capacity without any content.
+##
+## This is a performance optimization tool that's like calling [Str.reserve] on an empty string.
+## It's useful when you plan to build up a string incrementally, for example by calling [Str.concat] on it:
+##
+## ```
+## greeting = "Hello and welcome to Roc"
+## subject = "Awesome Programmer"
+##
+## # Evaluates to "Hello and welcome to Roc, Awesome Programmer!"
+## helloWorld =
+##     Str.withCapacity 45
+##     |> Str.concat greeting
+##     |> Str.concat ", "
+##     |> Str.concat subject
+##     |> Str.concat "!"
+## ```
+##
+## In general, if you plan to use [Str.concat] on an empty string, it will be faster to start with
+## [Str.withCapacity] than with `""`. Even if you don't know the exact capacity of the string, giving [withCapacity]
+## a higher value than ends up being necessary can help prevent reallocation and copying—at
+## the cost of using more memory than is necessary.
+##
+## For more details on how the performance optimization works, see [Str.reserve].
 withCapacity : Nat -> Str

+## Increase a string's capacity by at least the given number of additional bytes.
+##
+## This can improve the performance of string concatenation operations like [Str.concat] by
+## allocating extra capacity up front, which can prevent the need for reallocations and copies.
+## Consider the following example which does not use [Str.reserve]:
+##
+## ```
+## greeting = "Hello and welcome to Roc"
+## subject = "Awesome Programmer"
+##
+## # Evaluates to "Hello and welcome to Roc, Awesome Programmer!"
+## helloWorld =
+##     greeting
+##     |> Str.concat ", "
+##     |> Str.concat subject
+##     |> Str.concat "!"
+## ```
+##
+## In this example:
+## 1. We start with `greeting`, which has both a length and capacity of 24 (bytes).
+## 2. `|> Str.concat ", "` will see that there isn't enough capacity to add 2 more bytes for the `", "`, so it will create a new heap allocation with enough bytes to hold both. (This probably will be more than 7 bytes, because when [Str] functions reallocate, they apply a multiplier to the exact capacity required. This makes it less likely that future realloctions will be needed. The multiplier amount is not specified, because it may change in future releases of Roc, but it will likely be around 1.5 to 2 times the exact capacity required.) Then it will copy the current bytes (`"Hello"`) into the new allocation, and finally concatenate the `", "` into the new allocation. The old allocation will then be deallocated because it's no longer referenced anywhere in the program.
+## 3. `|> Str.concat subject` will again check if there is enough capacity in the string. If it doesn't find enough capacity once again, it will make a third allocation, copy the existing bytes (`"Hello, "`) into that third allocation, and then deallocate the second allocation because it's already no longer being referenced anywhere else in the program. (It may find enough capacity in this prticular case, because the previous [Str.concat] allocated something like 1.5 to 2 times the necessary capacity in order to anticipate future concatenations like this...but if something longer than `"World"` were being concatenated here, it might still require further reallocation and copying.)
+## 4. `|> Str.concat "!\n"` will repeat this process once more.
+##
+## This process can have significant performance costs due to multiple reallocation of new strings, copying between old strings and new strings, and deallocation of immediately obsolete strings.
+##
+## Here's a modified example which uses [Str.reserve] to eliminate the need for all that reallocation, copying, and deallocation.
+##
+## ```
+## helloWorld =
+##     greeting
+##     |> Str.reserve 21
+##     |> Str.concat ", "
+##     |> Str.concat subject
+##     |> Str.concat "!"
+## ```
+##
+## In this example:
+## 1. We again start with `greeting`, which has both a length and capacity of 24 bytes.
+## 2. `|> Str.reserve 21` will ensure that there is enough capacity in the string for an additional 21 bytes (to make room for `", "`, `"Awesome Programmer"`, and `"!"`). Since the current capacity is only 24, it will create a new 45-byte (24 + 21) heap allocation and copy the contents of the existing allocation (`greeting`) into it.
+## 3. `|> Str.concat ", "` will concatenate `, ` to the string. No reallocation, copying, or deallocation will be necessary, because the string already has a capacity of 45 btytes, and `greeting` will only use 24 of them.
+## 4. `|> Str.concat subject` will concatenate `subject` (`"Awesome Programmer"`) to the string. Again, no reallocation, copying, or deallocation will be necessary.
+## 5. `|> Str.concat "!\n"` will concatenate `"!\n"` to the string, still without any reallocation, copying, or deallocation.
+##
+## Here, [Str.reserve] prevented multiple reallocations, copies, and deallocations during the
+## [Str.concat] calls. Notice that it did perform a heap allocation before any [Str.concat] calls
+## were made, which means that using [Str.reserve] is not free! You should only use it if you actually
+## expect to make use of the extra capacity.
+##
+## Ideally, you'd be able to predict exactly how many extra bytes of capacity will be needed, but this
+## may not always be knowable. When you don't know exactly how many bytes to reserve, you can often get better
+## performance by choosing a number of bytes that's too high, because a number that's too low could lead to reallocations. There's a limit to
+## this, of course; if you always give it ten times what it turns out to need, that could prevent
+## reallocations but will also waste a lot of memory!
+##
+## If you plan to use [Str.reserve] on an empty string, it's generally better to use [Str.withCapacity] instead.
+reserve : Str, Nat -> Str
+
 ## Combines a [List] of strings into a single string, with a separator
 ## string in between each.
 ## ```
@ -200,7 +300,21 @@ repeat : Str, Nat -> Str
 ## using a single Unicode code point.
 countGraphemes : Str -> Nat

-## Split a string into its constituent grapheme clusters
+## Split a string into its constituent graphemes.
+##
+## This function breaks a string into its individual [graphemes](https://stackoverflow.com/a/27331885/4200103),
+## returning them as a list of strings. This is useful for working with text that
+## contains complex characters, such as emojis.
+##
+## Examples:
+## ```
+## expect Str.graphemes "Roc" == ["R", "o", "c"]
+## expect Str.graphemes "नमस्ते" == ["न", "म", "स्", "ते"]
+## expect Str.graphemes "👩‍👩‍👦‍👦" == ["👩‍", "👩‍", "👦‍", "👦"]
+## ```
+##
+## Note that the "👩‍👩‍👦‍👦" example consists of 4 grapheme clusters, although it visually
+## appears as a single glyph. This is because it uses an emoji modifier sequence.
 graphemes : Str -> List Str

 ## If the string begins with a [Unicode code point](http://www.unicode.org/glossary/#code_point)
@ -273,6 +387,12 @@ fromUtf8 = \bytes ->
    else
        Err (BadUtf8 result.dProblemCode result.aByteIndex)

+expect (Str.fromUtf8 [82, 111, 99]) == Ok "Roc"
+expect (Str.fromUtf8 [224, 174, 154, 224, 174, 191]) == Ok "சி"
+expect (Str.fromUtf8 [240, 159, 144, 166]) == Ok "🐦"
+expect (Str.fromUtf8 []) == Ok ""
+expect (Str.fromUtf8 [255]) |> Result.isErr
+
 ## Encode part of a [List] of [U8] UTF-8 [code units](https://unicode.org/glossary/#code_unit)
 ## into a [Str]
 ## ```
@ -290,6 +410,12 @@ fromUtf8Range = \bytes, config ->
    else
        Err OutOfBounds

+expect (Str.fromUtf8Range [72, 105, 80, 103] { start: 0, count: 2 }) == Ok "Hi"
+expect (Str.fromUtf8Range [233, 185, 143, 224, 174, 154, 224, 174, 191] { start: 3, count: 3 }) == Ok "ச"
+expect (Str.fromUtf8Range [240, 159, 144, 166] { start: 0, count: 4 }) == Ok "🐦"
+expect (Str.fromUtf8Range [] { start: 0, count: 0 }) == Ok ""
+expect (Str.fromUtf8Range [72, 105, 80, 103] { start: 2, count: 3 }) |> Result.isErr
+
 FromUtf8Result : {
    aByteIndex : Nat,
    bString : Str,
@ -744,8 +870,29 @@ walkUtf8WithIndexHelp = \string, state, step, index, length ->
    else
        state

-## Enlarge a string for at least the given number additional bytes.
-reserve : Str, Nat -> Str
+## Walks over the `UTF-8` bytes of the given [Str] and calls a function to update
+## state for each byte.
+##
+## ```
+## result = walkUtf8 "hello, world!" "" (\state, byte -> state ++ String.fromCodePoint byte)
+## expect result == Ok "hello, world!"
+## ```
+walkUtf8 : Str, state, (state, U8 -> state) -> state
+walkUtf8 = \str, initial, step ->
+    walkUtf8Help str initial step 0 (Str.countUtf8Bytes str)
+
+walkUtf8Help : Str, state, (state, U8 -> state), Nat, Nat -> state
+walkUtf8Help = \str, state, step, index, length ->
+    if index < length then
+        byte = Str.getUnsafe str index
+        newState = step state byte
+
+        walkUtf8Help str newState step (index + 1) length
+    else
+        state
+
+expect (walkUtf8 "ABC" [] List.append) == [65, 66, 67]
+expect (walkUtf8 "鹏" [] List.append) == [233, 185, 143]

 ## Shrink the memory footprint of a str such that it's capacity and length are equal.
 ## Note: This will also convert seamless slices to regular lists.