add fromUtf16 and 32, as well as lossy variants

2025-08-04 20:28:02 +00:00 · 2025-01-15 00:14:58 +01:00 · 2025-01-15 00:14:58 +01:00 · 414fecd14d
commit 414fecd14d
parent 032f1cc5a4
17 changed files with 805 additions and 74 deletions
--- a/crates/compiler/builtins/bitcode/src/main.zig
+++ b/crates/compiler/builtins/bitcode/src/main.zig
@ -203,6 +203,7 @@ comptime {
    exportStrFn(str.reserveC, "reserve");
    exportStrFn(str.strToUtf8C, "to_utf8");
    exportStrFn(str.fromUtf8C, "from_utf8");
+    exportStrFn(str.fromUtf8Lossy, "from_utf8_lossy");
    exportStrFn(str.repeatC, "repeat");
    exportStrFn(str.strTrim, "trim");
    exportStrFn(str.strTrimStart, "trim_start");
--- a/crates/compiler/builtins/bitcode/src/str.zig
+++ b/crates/compiler/builtins/bitcode/src/str.zig
@ -1449,6 +1449,105 @@ pub fn fromUtf8C(
    return fromUtf8(list, update_mode);
 }

+const UNICODE_REPLACEMENT: u21 = 0xfffd;
+
+/// Cursor over the bytes of a `RocList` that yields one code point per step,
+/// substituting U+FFFD for every malformed sequence instead of failing.
+const Utf8Iterator = struct {
+    bytes: []u8,
+    i: usize,
+
+    /// Creates an iterator positioned at the first of the list's `length` bytes.
+    /// The bytes are viewed in place, not copied.
+    pub fn init(list: RocList) Utf8Iterator {
+        const bytes = @as([*]u8, @ptrCast(list.bytes))[0..list.length];
+        return Utf8Iterator{
+            .bytes = bytes,
+            .i = 0,
+        };
+    }
+
+    /// Returns the next code point, or `null` once every byte has been consumed.
+    ///
+    /// `UNICODE_REPLACEMENT` is returned (and the cursor advanced) when
+    /// - the current byte is not a valid start byte (skips 1 byte),
+    /// - the input ends, or a non-continuation byte appears, before the
+    ///   sequence is complete (skips only the bytes preceding the offender,
+    ///   so the offending byte is re-examined as a potential start byte),
+    /// - the complete sequence is rejected by `unicode.utf8Decode`, e.g. an
+    ///   overlong encoding or a surrogate half (skips the whole sequence).
+    pub fn nextLossy(it: *Utf8Iterator) ?u32 {
+        if (it.bytes.len <= it.i) {
+            return null;
+        }
+
+        const rest = it.bytes[it.i..];
+        const n = unicode.utf8ByteSequenceLength(rest[0]) catch {
+            // invalid start byte
+            it.i += 1;
+            return UNICODE_REPLACEMENT;
+        };
+
+        for (1..n) |i| {
+            if (rest.len == i) {
+                // unexpected end
+                it.i += i;
+                return UNICODE_REPLACEMENT;
+            }
+            // A continuation byte always has the form 0b10xx_xxxx (0x80...0xBF).
+            // Anything else (ASCII or another start byte) ends the sequence early
+            // and must not be consumed as part of it.
+            if ((rest[i] & 0xC0) != 0x80) {
+                // expected continuation byte
+                it.i += i;
+                return UNICODE_REPLACEMENT;
+            }
+        }
+
+        it.i += n;
+        return unicode.utf8Decode(rest[0..n]) catch {
+            return UNICODE_REPLACEMENT;
+        };
+    }
+
+    /// Rewinds the cursor to the first byte so the input can be walked again.
+    pub fn reset(it: *Utf8Iterator) void {
+        it.i = 0;
+    }
+};
+
+test "fromUtf8Lossy: start byte followed by ascii >= 0x70 keeps the ascii byte" {
+    var list = RocList.fromSlice(u8, "r\xCFz", false);
+    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
+
+    const res = fromUtf8Lossy(list);
+    defer res.decref();
+    const expected = RocStr.fromSlice("r\u{FFFD}z");
+    defer expected.decref();
+    try expect(expected.eq(res));
+}
+
+/// Number of UTF-8 bytes needed for `c`. Values that are not below 0x110000
+/// are measured as if they were `UNICODE_REPLACEMENT`.
+fn codepointSeqLengthLossy(c: u32) u3 {
+    // Clamp to the replacement character first so a single call path handles both cases.
+    const cp: u21 = if (c < 0x110000) @as(u21, @intCast(c)) else UNICODE_REPLACEMENT;
+    return unicode.utf8CodepointSequenceLength(cp) catch
+        (unicode.utf8CodepointSequenceLength(UNICODE_REPLACEMENT) catch unreachable);
+}
+
+/// Writes the UTF-8 encoding of `c` to the start of `out` and returns the
+/// number of bytes written. If `c` is not below 0x110000, or
+/// `unicode.utf8Encode` refuses it, `UNICODE_REPLACEMENT` is written instead.
+fn utf8EncodeLossy(c: u32, out: []u8) u3 {
+    const cp: u21 = if (c < 0x110000) @as(u21, @intCast(c)) else UNICODE_REPLACEMENT;
+    return unicode.utf8Encode(cp, out) catch
+        (unicode.utf8Encode(UNICODE_REPLACEMENT, out) catch unreachable);
+}
+
+/// Builds a `RocStr` from the bytes of `list`, replacing every malformed
+/// UTF-8 sequence with `UNICODE_REPLACEMENT`. An empty list yields the empty
+/// string. The input list is only read here.
+pub fn fromUtf8Lossy(
+    list: RocList,
+) callconv(.C) RocStr {
+    if (list.len() == 0) return RocStr.empty();
+
+    // PERF: we could try to reuse the input list if it's already valid utf-8, similar to fromUtf8
+
+    var iter = Utf8Iterator.init(list);
+
+    // First pass: measure how many bytes the repaired string needs.
+    var total_len: usize = 0;
+    while (iter.nextLossy()) |codepoint| {
+        total_len += codepointSeqLengthLossy(codepoint);
+    }
+
+    // Second pass: re-walk the input and encode into the freshly allocated string.
+    var result = RocStr.allocate(total_len);
+    const dest = result.asU8ptrMut()[0..total_len];
+    iter.reset();
+    var written: usize = 0;
+    while (iter.nextLossy()) |codepoint| {
+        written += utf8EncodeLossy(codepoint, dest[written..]);
+    }
+    result.setLen(written);
+    return result;
+}
+
 pub fn fromUtf8(
    list: RocList,
    update_mode: UpdateMode,
@ -1667,6 +1766,17 @@ test "validateUtf8Bytes: unicode ∆ in middle of array" {
    try expectOk(str_result);
 }

+// Well-formed input (ASCII plus a 4-byte emoji) must come back unchanged.
+test "fromUtf8Lossy: ascii, emoji" {
+    var list = RocList.fromSlice(u8, "r💖c", false);
+    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
+
+    const res = fromUtf8Lossy(list);
+    defer res.decref();
+    const expected = RocStr.fromSlice("r💖c");
+    defer expected.decref();
+    try expect(expected.eq(res));
+}
+
 fn expectErr(list: RocList, index: usize, err: Utf8DecodeError, problem: Utf8ByteProblem) !void {
    const str_ptr = @as([*]u8, @ptrCast(list.bytes));
    const len = list.length;
@ -1765,6 +1875,66 @@ test "validateUtf8Bytes: surrogate halves" {
    try expectErr(list, 3, error.Utf8EncodesSurrogateHalf, Utf8ByteProblem.EncodesSurrogateHalf);
 }

+// 0x80 is a continuation byte and can never start a sequence, so it becomes U+FFFD.
+test "fromUtf8Lossy: invalid start byte" {
+    var list = RocList.fromSlice(u8, "r\x80c", false);
+    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
+
+    const res = fromUtf8Lossy(list);
+    defer res.decref();
+    const expected = RocStr.fromSlice("r\u{FFFD}c");
+    defer expected.decref();
+    try expect(expected.eq(res));
+}
+
+// A complete 4-byte emoji followed by one extra continuation byte: the emoji is
+// kept and only the stray 0x80 is replaced (this is not an overlong encoding).
+test "fromUtf8Lossy: stray continuation byte after valid sequence" {
+    var list = RocList.fromSlice(u8, "r\xF0\x9F\x92\x96\x80c", false);
+    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
+
+    const res = fromUtf8Lossy(list);
+    defer res.decref();
+    const expected = RocStr.fromSlice("r💖\u{FFFD}c");
+    defer expected.decref();
+    try expect(expected.eq(res));
+}
+
+// 0xCF starts a 2-byte sequence but is followed by ASCII 'c': the start byte is
+// replaced and 'c' is kept.
+test "fromUtf8Lossy: expected continuation" {
+    var list = RocList.fromSlice(u8, "r\xCFc", false);
+    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
+
+    const res = fromUtf8Lossy(list);
+    defer res.decref();
+    const expected = RocStr.fromSlice("r\u{FFFD}c");
+    defer expected.decref();
+    try expect(expected.eq(res));
+}
+
+// 0xCF starts a 2-byte sequence but the input ends right after it.
+test "fromUtf8Lossy: unexpected end" {
+    var list = RocList.fromSlice(u8, "r\xCF", false);
+    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
+
+    const res = fromUtf8Lossy(list);
+    defer res.decref();
+    const expected = RocStr.fromSlice("r\u{FFFD}");
+    defer expected.decref();
+    try expect(expected.eq(res));
+}
+
+test "fromUtf8Lossy: encodes surrogate" {
+    // 0xd83d == 0b1101_1000_0011_1101
+    //             wwww xxxx yyyy zzzz
+    // becomes 0b1110_1101 0b10_1000_00 0b10_11_1101
+    //           1110_wwww   10_xxxx_yy   10_yy_zzzz
+    //         0xED        0xA0         0xBD
+    // The three bytes are structurally valid but decode to a surrogate half,
+    // so the whole sequence collapses into a single U+FFFD.
+    var list = RocList.fromSlice(u8, "r\xED\xA0\xBDc", false);
+    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
+
+    const res = fromUtf8Lossy(list);
+    defer res.decref();
+    const expected = RocStr.fromSlice("r\u{FFFD}c");
+    defer expected.decref();
+    try expect(expected.eq(res));
+}
+
 fn isWhitespace(codepoint: u21) bool {
    // https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
    return switch (codepoint) {
--- a/crates/compiler/builtins/roc/Str.roc
+++ b/crates/compiler/builtins/roc/Str.roc
@ -328,7 +328,6 @@
 ## Currently, the only way to get seamless slices of strings is by calling certain `Str` functions which return them. In general, `Str` functions which accept a string and return a subset of that string tend to do this. [`Str.trim`](https://www.roc-lang.org/builtins/Str#trim) is another example of a function which returns a seamless slice.
 module [
    Utf8Problem,
-    Utf8ByteProblem,
    concat,
    is_empty,
    join_with,
@ -337,6 +336,11 @@ module [
    count_utf8_bytes,
    to_utf8,
    from_utf8,
+    from_utf16,
+    from_utf32,
+    from_utf8_lossy,
+    from_utf16_lossy,
+    from_utf32_lossy,
    starts_with,
    ends_with,
    trim,
@ -376,7 +380,7 @@ import Result exposing [Result]
 import List
 import Num exposing [Num, U8, U16, U32, U64, U128, I8, I16, I32, I64, I128, F32, F64, Dec]

-Utf8ByteProblem : [
+Utf8Problem : [
    InvalidStartByte,
    UnexpectedEndOfSequence,
    ExpectedContinuation,
@ -385,8 +389,6 @@ Utf8ByteProblem : [
    EncodesSurrogateHalf,
 ]

-Utf8Problem : { byte_index : U64, problem : Utf8ByteProblem }
-
 ## Returns [Bool.true] if the string is empty, and [Bool.false] otherwise.
 ## ```roc
 ## expect Str.is_empty("hi!") == Bool.false
@ -538,7 +540,7 @@ to_utf8 : Str -> List U8
 ## expect Str.from_utf8([]) == Ok("")
 ## expect Str.from_utf8([255]) |> Result.is_err
 ## ```
-from_utf8 : List U8 -> Result Str [BadUtf8 { problem : Utf8ByteProblem, index : U64 }]
+from_utf8 : List U8 -> Result Str [BadUtf8 { problem : Utf8Problem, index : U64 }]
 from_utf8 = \bytes ->
    result = from_utf8_lowlevel bytes

@ -557,11 +559,242 @@ FromUtf8Result : {
    a_byte_index : U64,
    b_string : Str,
    c_is_ok : Bool,
-    d_problem_code : Utf8ByteProblem,
+    d_problem_code : Utf8Problem,
 }

 from_utf8_lowlevel : List U8 -> FromUtf8Result

+## Converts a [List] of [U8] UTF-8 [code units](https://unicode.org/glossary/#code_unit) to a string.
+## Any grouping of invalid byte sequences are replaced with a single unicode replacement character '<27>'.
+##
+## An invalid byte sequence is defined as
+## - a 2-byte-sequence starting byte, followed by less than 1 continuation byte
+## - a 3-byte-sequence starting byte, followed by less than 2 continuation bytes
+## - a 4-byte-sequence starting byte, followed by less than 3 continuation bytes
+## - an invalid codepoint from the surrogate pair block
+## - an invalid codepoint greater than 0x110000 encoded as a 4-byte sequence
+## - any valid codepoint encoded as an incorrect sequence, for instance a codepoint that should be a 2-byte sequence encoded as a 3- or 4-byte sequence
+##
+## ```roc
+## expect (Str.from_utf8_lossy [82, 111, 99, 240, 159, 144, 166]) == "Roc🐦"
+## expect (Str.from_utf8_lossy [82, 255, 99]) == "R<>c"
+## expect (Str.from_utf8_lossy [82, 0xED, 0xA0, 0xBD, 99]) == "R<>c"
+## ```
+from_utf8_lossy : List U8 -> Str
+
+expect (Str.from_utf8_lossy [82, 111, 99, 240, 159, 144, 166]) == "Roc🐦"
+expect (Str.from_utf8_lossy [82, 255, 99]) == "R<>c"
+expect (Str.from_utf8_lossy [82, 0xED, 0xA0, 0xBD, 99]) == "R<>c"
+
+## Converts a [List] of [U16] UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string.
+##
+## Fails with `BadUtf16` when a first surrogate half (0xD800-0xDBFF) is not
+## immediately followed by a second half (0xDC00-0xDFFF), or when a second half
+## appears on its own. The reported `index` is that of the offending code unit
+## (for an unfinished pair, the index of its first half).
+##
+## ```roc
+## expect Str.from_utf16([82, 111, 99]) == Ok("Roc")
+## expect Str.from_utf16([0xb9a, 0xbbf]) == Ok("சி")
+## expect Str.from_utf16([0xd83d, 0xdc26]) == Ok("🐦")
+## expect Str.from_utf16([]) == Ok("")
+## # unpaired surrogates, first and second halves
+## expect Str.from_utf16([82, 0xd83d, 99]) |> Result.is_err
+## expect Str.from_utf16([82, 0xdc96, 99]) |> Result.is_err
+## ```
+from_utf16 : List U16 -> Result Str [BadUtf16 { problem : Utf8Problem, index : U64 }]
+from_utf16 = \codeunits ->
+    mk_err = \problem, index ->
+        Err(BadUtf16({ problem, index }))
+
+    step = \state, unit ->
+        c : U32
+        c = Num.int_cast(unit)
+        when state is
+            ExpectFirst(i, utf8) ->
+                if (unit < 0xd800) || (unit >= 0xe000) then
+                    # ordinary BMP code unit: the surrogate block is only 0xD800-0xDFFF
+                    when encode_utf8(utf8, c) is
+                        Ok(utf8_next) -> ExpectFirst(i + 1, utf8_next)
+                        Err(err) -> mk_err(err, i)
+                else if unit < 0xdc00 then
+                    # first half of a surrogate pair; wait for the second half
+                    ExpectSecond(i, utf8, c)
+                else
+                    # second half without a preceding first half
+                    mk_err(EncodesSurrogateHalf, i)
+
+            ExpectSecond(i, utf8, first) ->
+                if (unit < 0xdc00) || (unit >= 0xe000) then
+                    # anything other than a second half leaves the first half unpaired
+                    mk_err(EncodesSurrogateHalf, i)
+                else
+                    joined = ((first - 0xd800) * 0x400) + (c - 0xdc00) + 0x10000
+                    when encode_utf8(utf8, joined) is
+                        Ok(utf8_next) -> ExpectFirst(i + 2, utf8_next)
+                        Err(err) -> mk_err(err, i)
+
+            Err(err) -> Err(err)
+
+    decode_res = List.walk(codeunits, ExpectFirst(0, []), step)
+
+    when decode_res is
+        ExpectFirst(_, utf8) ->
+            from_utf8(utf8)
+            |> Result.map_err(\BadUtf8(err) -> BadUtf16(err))
+
+        ExpectSecond(i, _, _) ->
+            mk_err(EncodesSurrogateHalf, i)
+
+        Err(err) -> Err(err)
+
+expect Str.from_utf16([82, 111, 99]) == Ok("Roc")
+expect Str.from_utf16([0xb9a, 0xbbf]) == Ok("சி")
+expect Str.from_utf16([0xd83d, 0xdc26]) == Ok("🐦")
+expect Str.from_utf16([]) == Ok("")
+# unpaired surrogates, first and second halves
+expect Str.from_utf16([82, 0xd83d, 99]) == Err(BadUtf16({ index: 1, problem: EncodesSurrogateHalf }))
+expect Str.from_utf16([82, 0xdc96, 99]) == Err(BadUtf16({ index: 1, problem: EncodesSurrogateHalf }))
+# BMP code units above the surrogate block are ordinary characters
+expect Str.from_utf16([0xff21]) == Ok("Ａ")
+# a second half is never valid without a preceding first half
+expect Str.from_utf16([0xdc96, 0xdc96]) == Err(BadUtf16({ index: 0, problem: EncodesSurrogateHalf }))
+# a first half followed by a non-surrogate must not be joined into a pair
+expect Str.from_utf16([0xd83d, 0xff21]) == Err(BadUtf16({ index: 0, problem: EncodesSurrogateHalf }))
+
+## Converts a [List] of [U16] UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string.
+## Any unpaired surrogate code unit is replaced with a single unicode replacement character '<27>'.
+##
+## ```roc
+## expect Str.from_utf16_lossy([82, 111, 99, 0xd83d, 0xdc26]) == "Roc🐦"
+## expect Str.from_utf16_lossy([82, 0xdc96, 99]) == "R<>c"
+## ```
+from_utf16_lossy : List U16 -> Str
+from_utf16_lossy = \codeunits ->
+    utf8_replacement = [0xef, 0xbf, 0xbd]
+    encode_lossy = \utf8, c ->
+        when encode_utf8(utf8, c) is
+            Ok(utf8_next) -> utf8_next
+            Err(_) -> List.concat(utf8, utf8_replacement)
+
+    step = \state, unit ->
+        c : U32
+        c = Num.int_cast(unit)
+        when state is
+            ExpectFirst(utf8) ->
+                if unit < 0xd800 then
+                    ExpectFirst(encode_lossy(utf8, c))
+                else
+                    ExpectSecond(utf8, c)
+
+            ExpectSecond(utf8, first) ->
+                if c < 0xd800 then
+                    ExpectFirst(
+                        List.concat(utf8, utf8_replacement)
+                        |> encode_lossy(c),
+                    )
+                else if c < 0xdc00 then
+                    ExpectSecond(List.concat(utf8, utf8_replacement), c)
+                else
+                    joined = ((first - 0xd800) * 0x400) + (c - 0xdc00) + 0x10000
+                    ExpectFirst(encode_lossy(utf8, joined))
+
+    result = List.walk(codeunits, ExpectFirst([]), step)
+    when result is
+        ExpectFirst(utf8) -> from_utf8_lossy(utf8)
+        ExpectSecond(utf8, _) -> from_utf8_lossy(List.concat(utf8, utf8_replacement))
+
+expect Str.from_utf16_lossy([82, 111, 99, 0xd83d, 0xdc26]) == "Roc🐦"
+expect Str.from_utf16_lossy([82, 0xdc96, 99]) == "R<>c"
+
+## Converts a [List] of [U32] UTF-32 [code units](https://unicode.org/glossary/#code_unit) to a string.
+##
+## Each code unit is encoded to UTF-8 in turn; the first one that cannot be
+## encoded produces `BadUtf32` with that code unit's index.
+##
+## ```roc
+## expect Str.from_utf32([82, 111, 99]) == Ok("Roc")
+## expect Str.from_utf32([0xb9a, 0xbbf]) == Ok("சி")
+## expect Str.from_utf32([0x1f426]) == Ok("🐦")
+## # unpaired surrogates, first and second halves
+## expect Str.from_utf32([82, 0xd83d, 99]) |> Result.is_err
+## expect Str.from_utf32([82, 0xdc96, 99]) |> Result.is_err
+## # invalid codepoint
+## expect Str.from_utf32([82, 0x110000, 99]) |> Result.is_err
+## ```
+
+from_utf32 : List U32 -> Result Str [BadUtf32 { problem : Utf8Problem, index : U64 }]
+from_utf32 = \codepoints ->
+    step = \state, c ->
+        when state is
+            Ok({ i, utf8 }) ->
+                when encode_utf8(utf8, c) is
+                    Ok(utf8_next) -> Ok({ i: i + 1, utf8: utf8_next })
+                    Err(problem) -> Err(BadUtf32({ problem, index: i }))
+
+            Err(err) -> Err(err)
+
+    List.walk(codepoints, Ok({ i: 0, utf8: [] }), step)
+    |> Result.try(
+        \state ->
+            when from_utf8(state.utf8) is
+                Ok(str) -> Ok(str)
+                Err(BadUtf8(err)) -> Err(BadUtf32(err)),
+    )
+
+# Appends the UTF-8 encoding of code point `c` to `list`.
+#
+# - c < 0x80       -> 1 byte
+# - c < 0x800      -> 2 bytes
+# - c < 0x10000    -> 3 bytes, unless c is in 0xD800..0xDFFF (EncodesSurrogateHalf)
+# - c < 0x110000   -> 4 bytes
+# - otherwise      -> CodepointTooLarge
+encode_utf8 : List U8, U32 -> Result (List U8) [EncodesSurrogateHalf, CodepointTooLarge]
+encode_utf8 = \list, c ->
+    if c < 0x80 then
+        Ok(List.append(list, Num.int_cast(c)))
+    else if c < 0x800 then
+        Ok(
+            List.concat(
+                list,
+                [
+                    Num.int_cast(Num.bitwise_or(Num.shift_right_by(c, 6), 0b110_00000)),
+                    Num.int_cast(Num.bitwise_or(Num.bitwise_and(c, 0b111111), 0b10_000000)),
+                ],
+            ),
+        )
+    else if c < 0x10000 then
+        if (c >= 0xd800) && (c < 0xe000) then
+            Err(EncodesSurrogateHalf)
+        else
+            Ok(
+                List.concat(
+                    list,
+                    [
+                        Num.int_cast(Num.bitwise_or(Num.shift_right_by(c, 12), 0b1110_0000)),
+                        Num.int_cast(Num.bitwise_or(Num.bitwise_and(Num.shift_right_by(c, 6), 0b111111), 0b10_000000)),
+                        Num.int_cast(Num.bitwise_or(Num.bitwise_and(c, 0b111111), 0b10_000000)),
+                    ],
+                ),
+            )
+    else if c < 0x110000 then
+        Ok(
+            List.concat(
+                list,
+                [
+                    Num.int_cast(Num.bitwise_or(Num.shift_right_by(c, 18), 0b11110_000)),
+                    Num.int_cast(Num.bitwise_or(Num.bitwise_and(Num.shift_right_by(c, 12), 0b111111), 0b10_000000)),
+                    Num.int_cast(Num.bitwise_or(Num.bitwise_and(Num.shift_right_by(c, 6), 0b111111), 0b10_000000)),
+                    Num.int_cast(Num.bitwise_or(Num.bitwise_and(c, 0b111111), 0b10_000000)),
+                ],
+            ),
+        )
+    else
+        Err(CodepointTooLarge)
+
+expect Str.from_utf32([82, 111, 99]) == Ok("Roc")
+expect Str.from_utf32([0xb9a, 0xbbf]) == Ok("சி")
+expect Str.from_utf32([0x1f426]) == Ok("🐦")
+expect Str.from_utf32([]) == Ok("")
+# unpaired surrogates, first and second halves
+expect Str.from_utf32([82, 0xd83d, 99]) |> Result.is_err
+expect Str.from_utf32([82, 0xdc96, 99]) |> Result.is_err
+# codepoint out of valid range
+expect Str.from_utf32([82, 0x110000, 99]) |> Result.is_err
+
+## Converts a [List] of [U32] UTF-32 [code units](https://unicode.org/glossary/#code_unit) to a string.
+## Any invalid code points are replaced with a single unicode replacement character '<27>'.
+## ```roc
+## expect Str.from_utf32_lossy([82, 111, 99, 0x1f426]) == "Roc🐦"
+## expect Str.from_utf32_lossy([82, 0x110000, 99]) == "R<>c"
+## ```
+from_utf32_lossy : List U32 -> Str
+from_utf32_lossy = \codepoints ->
+    step = \utf8, c ->
+        when encode_utf8(utf8, c) is
+            Ok(utf8_next) -> utf8_next
+            # utf-8 encoded replacement character
+            Err(_) -> List.concat(utf8, [0xef, 0xbf, 0xbd])
+
+    List.walk(codepoints, [], step)
+    |> from_utf8_lossy()
+
+expect Str.from_utf32_lossy([82, 111, 99, 0x1f426]) == "Roc🐦"
+expect Str.from_utf32_lossy([82, 0x110000, 99]) == "R<>c"
+
 ## Check if the given [Str] starts with a value.
 ## ```roc
 ## expect Str.starts_with("ABC", "A") == Bool.true
--- a/crates/compiler/builtins/src/bitcode.rs
+++ b/crates/compiler/builtins/src/bitcode.rs
@ -348,6 +348,7 @@ pub const STR_EQUAL: &str = "roc_builtins.str.equal";
 pub const STR_SUBSTRING_UNSAFE: &str = "roc_builtins.str.substring_unsafe";
 pub const STR_TO_UTF8: &str = "roc_builtins.str.to_utf8";
 pub const STR_FROM_UTF8: &str = "roc_builtins.str.from_utf8";
+pub const STR_FROM_UTF8_LOSSY: &str = "roc_builtins.str.from_utf8_lossy";
 pub const STR_REPEAT: &str = "roc_builtins.str.repeat";
 pub const STR_TRIM: &str = "roc_builtins.str.trim";
 pub const STR_TRIM_START: &str = "roc_builtins.str.trim_start";
--- a/crates/compiler/can/src/builtins.rs
+++ b/crates/compiler/can/src/builtins.rs
@ -119,6 +119,7 @@ map_symbol_to_lowlevel_and_arity! {
    StrSplitOn; STR_SPLIT_ON; 2,
    StrCountUtf8Bytes; STR_COUNT_UTF8_BYTES; 1,
    StrFromUtf8; STR_FROM_UTF8_LOWLEVEL; 1,
+    StrFromUtf8Lossy; STR_FROM_UTF8_LOSSY; 1,
    StrToUtf8; STR_TO_UTF8; 1,
    StrRepeat; STR_REPEAT; 2,
    StrTrim; STR_TRIM; 1,
--- a/crates/compiler/gen_dev/src/lib.rs
+++ b/crates/compiler/gen_dev/src/lib.rs
@ -1677,6 +1677,13 @@ trait Backend<'a> {
                    ret_layout,
                )
            }
+            LowLevel::StrFromUtf8Lossy => self.build_fn_call(
+                sym,
+                bitcode::STR_FROM_UTF8_LOSSY.to_string(),
+                args,
+                arg_layouts,
+                ret_layout,
+            ),
            LowLevel::StrRepeat => self.build_fn_call(
                sym,
                bitcode::STR_REPEAT.to_string(),
--- a/crates/compiler/gen_llvm/src/llvm/build_list.rs
+++ b/crates/compiler/gen_llvm/src/llvm/build_list.rs
@ -7,7 +7,6 @@ use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue, Str
 use inkwell::{AddressSpace, IntPredicate};
 use morphic_lib::UpdateMode;
 use roc_builtins::bitcode;
-use roc_module::symbol::Symbol;
 use roc_mono::layout::{
    Builtin, InLayout, Layout, LayoutIds, LayoutInterner, LayoutRepr, STLayoutInterner,
 };
@ -17,7 +16,6 @@ use super::build::{
    create_entry_block_alloca, load_roc_value, store_roc_value, use_roc_value, BuilderExt,
 };
 use super::convert::zig_list_type;
-use super::scope::Scope;
 use super::struct_::struct_from_fields;

 fn call_list_bitcode_fn_1<'ctx>(
@ -29,20 +27,6 @@ fn call_list_bitcode_fn_1<'ctx>(
    call_list_bitcode_fn(env, &[list], other_arguments, BitcodeReturns::List, fn_name)
 }

-pub(crate) fn list_symbol_to_c_abi<'a, 'ctx>(
-    env: &Env<'a, 'ctx, '_>,
-    scope: &Scope<'a, 'ctx>,
-    symbol: Symbol,
-) -> PointerValue<'ctx> {
-    let list_type = zig_list_type(env);
-    let list_alloca = create_entry_block_alloca(env, list_type, "list_alloca");
-
-    let list = scope.load_symbol(&symbol);
-    env.builder.new_build_store(list_alloca, list);
-
-    list_alloca
-}
-
 pub(crate) fn pass_update_mode<'ctx>(
    env: &Env<'_, 'ctx, '_>,
    update_mode: UpdateMode,
--- a/crates/compiler/gen_llvm/src/llvm/build_str.rs
+++ b/crates/compiler/gen_llvm/src/llvm/build_str.rs
@ -1,30 +1,68 @@
-use crate::llvm::build::Env;
 use inkwell::values::{BasicValueEnum, PointerValue};
 use roc_builtins::bitcode;
 use roc_mono::layout::{InLayout, Layout, LayoutRepr, STLayoutInterner};

-use super::bitcode::{call_str_bitcode_fn, BitcodeReturns};
-use super::build::load_roc_value;
+use super::bitcode::{
+    call_str_bitcode_fn, call_void_bitcode_fn, pass_list_or_string_to_zig_32bit,
+    pass_list_to_zig_64bit, pass_list_to_zig_wasm, BitcodeReturns,
+};
+use super::build::{create_entry_block_alloca, load_roc_value, Env};
+use bumpalo::collections::Vec;

 pub static CHAR_LAYOUT: InLayout = Layout::U8;

-pub(crate) fn decode_from_utf8_result<'a, 'ctx>(
+pub(crate) fn call_str_from_utf_bitcode_fn<'a, 'ctx>(
    env: &Env<'a, 'ctx, '_>,
    layout_interner: &STLayoutInterner<'a>,
-    pointer: PointerValue<'ctx>,
+    args: &[BasicValueEnum<'ctx>],
+    result_struct_name: &str,
+    fn_name: &str,
 ) -> BasicValueEnum<'ctx> {
+    let result_type = env.module.get_struct_type(result_struct_name).unwrap();
+    let result_ptr = create_entry_block_alloca(env, result_type, "alloca_from_utf_result");
+    // FromUtf8Result, FromUtf16Result, FromUtf32Result all have the same layout of
+    // - index: u64
+    // - string: RocStr
+    // - is_ok: bool
+    // - problem_code: u8
    let layout =
        LayoutRepr::Struct(
            env.arena
                .alloc([Layout::U64, Layout::STR, Layout::BOOL, Layout::U8]),
        );

+    let list = args[0];
+    let argn = &args[1..];
+    let mut args: Vec<BasicValueEnum<'ctx>> = Vec::with_capacity_in(args.len() + 2, env.arena);
+    args.push(result_ptr.into());
+
+    use roc_target::Architecture::*;
+    match env.target.architecture() {
+        Aarch32 | X86_32 => {
+            let (a, b) = pass_list_or_string_to_zig_32bit(env, list.into_struct_value());
+            args.push(a.into());
+            args.push(b.into());
+        }
+        Aarch64 | X86_64 => {
+            let list = pass_list_to_zig_64bit(env, list);
+            args.push(list.into());
+        }
+        Wasm32 => {
+            let list = pass_list_to_zig_wasm(env, list);
+            args.push(list.into());
+        }
+    };
+
+    args.extend(argn);
+
+    call_void_bitcode_fn(env, &args, fn_name);
+
    load_roc_value(
        env,
        layout_interner,
        layout,
-        pointer,
-        "load_decode_from_utf8_result",
+        result_ptr,
+        "load_from_utf_result",
    )
 }

--- a/crates/compiler/gen_llvm/src/llvm/lowlevel.rs
+++ b/crates/compiler/gen_llvm/src/llvm/lowlevel.rs
@ -37,9 +37,9 @@ use crate::llvm::{
    build_list::{
        list_append_unsafe, list_clone, list_concat, list_drop_at, list_get_unsafe, list_len_usize,
        list_prepend, list_release_excess_capacity, list_replace_unsafe, list_reserve,
-        list_sort_with, list_sublist, list_swap, list_symbol_to_c_abi, list_with_capacity,
-        pass_update_mode,
+        list_sort_with, list_sublist, list_swap, list_with_capacity, pass_update_mode,
    },
+    build_str::call_str_from_utf_bitcode_fn,
    compare::{generic_eq, generic_neq},
    convert::{
        self, argument_type_from_layout, basic_type_from_layout, zig_num_parse_result_type,
@ -396,46 +396,15 @@ pub(crate) fn run_low_level<'a, 'ctx>(
            )
        }
        StrFromUtf8 => {
-            let result_type = env.module.get_struct_type("str.FromUtf8Result").unwrap();
-            let result_ptr =
-                create_entry_block_alloca(env, result_type, "alloca_utf8_validate_bytes_result");
-
-            use roc_target::Architecture::*;
-            match env.target.architecture() {
-                Aarch32 | X86_32 => {
-                    arguments!(list);
-                    let (a, b) = pass_list_or_string_to_zig_32bit(env, list.into_struct_value());
-
-                    call_void_bitcode_fn(
-                        env,
-                        &[
-                            result_ptr.into(),
-                            a.into(),
-                            b.into(),
-                            pass_update_mode(env, update_mode),
-                        ],
-                        bitcode::STR_FROM_UTF8,
-                    );
-                }
-                Aarch64 | X86_64 | Wasm32 => {
-                    arguments!(_list);
-
-                    // we use the symbol here instead
-                    let list = args[0];
-
-                    call_void_bitcode_fn(
-                        env,
-                        &[
-                            result_ptr.into(),
-                            list_symbol_to_c_abi(env, scope, list).into(),
-                            pass_update_mode(env, update_mode),
-                        ],
-                        bitcode::STR_FROM_UTF8,
-                    );
-                }
-            }
-
-            crate::llvm::build_str::decode_from_utf8_result(env, layout_interner, result_ptr)
+            // Str.from_utf8_lowlevel : List U8 -> FromUtf8Result
+            arguments!(list);
+            call_str_from_utf_bitcode_fn(
+                env,
+                layout_interner,
+                &[list, pass_update_mode(env, update_mode)],
+                "str.FromUtf8Result",
+                bitcode::STR_FROM_UTF8,
+            )
        }
        StrToUtf8 => {
            // Str.fromInt : Str -> List U8
@ -449,6 +418,16 @@ pub(crate) fn run_low_level<'a, 'ctx>(
                bitcode::STR_TO_UTF8,
            )
        }
+        StrFromUtf8Lossy => {
+            arguments!(list);
+            call_list_bitcode_fn(
+                env,
+                &[list.into_struct_value()],
+                &[],
+                BitcodeReturns::Str,
+                bitcode::STR_FROM_UTF8_LOSSY,
+            )
+        }
        StrRepeat => {
            // Str.repeat : Str, U64 -> Str
            arguments!(string, count);
--- a/crates/compiler/gen_wasm/src/low_level.rs
+++ b/crates/compiler/gen_wasm/src/low_level.rs
@ -245,6 +245,7 @@ impl<'a> LowLevelCall<'a> {
                backend.code_builder.i32_const(UPDATE_MODE_IMMUTABLE);
                backend.call_host_fn_after_loading_args(bitcode::STR_FROM_UTF8);
            }
+            StrFromUtf8Lossy => self.load_args_and_call_zig(backend, bitcode::STR_FROM_UTF8_LOSSY),
            StrTrimStart => self.load_args_and_call_zig(backend, bitcode::STR_TRIM_START),
            StrTrimEnd => self.load_args_and_call_zig(backend, bitcode::STR_TRIM_END),
            StrToUtf8 => self.load_args_and_call_zig(backend, bitcode::STR_TO_UTF8),
--- a/crates/compiler/module/src/low_level.rs
+++ b/crates/compiler/module/src/low_level.rs
@ -14,6 +14,7 @@ pub enum LowLevel {
    StrCountUtf8Bytes,
    StrFromInt,
    StrFromUtf8,
+    StrFromUtf8Lossy,
    StrToUtf8,
    StrRepeat,
    StrFromFloat,
@ -256,6 +257,7 @@ map_symbol_to_lowlevel! {
    StrSplitOn <= STR_SPLIT_ON;
    StrCountUtf8Bytes <= STR_COUNT_UTF8_BYTES;
    StrFromUtf8 <= STR_FROM_UTF8_LOWLEVEL;
+    StrFromUtf8Lossy <= STR_FROM_UTF8_LOSSY;
    StrToUtf8 <= STR_TO_UTF8;
    StrRepeat <= STR_REPEAT;
    StrTrim <= STR_TRIM;
--- a/crates/compiler/module/src/symbol.rs
+++ b/crates/compiler/module/src/symbol.rs
@ -1377,8 +1377,8 @@ define_builtins! {
        7 STR_STARTS_WITH: "starts_with"
        8 STR_ENDS_WITH: "ends_with"
        9 STR_FROM_UTF8: "from_utf8"
-        10 STR_UT8_PROBLEM: "Utf8Problem" // the Utf8Problem type alias
-        11 STR_UT8_BYTE_PROBLEM: "Utf8ByteProblem" // the Utf8ByteProblem type alias
+        10 STR_FROM_UTF8_LOSSY: "from_utf8_lossy"
+        11 STR_UTF8_BYTE_PROBLEM: "Utf8Problem"
        12 STR_TO_UTF8: "to_utf8"
        13 STR_WALK_UTF8: "walk_utf8"
        14 STR_ALIAS_ANALYSIS_STATIC: "#aliasAnalysisStatic" // string with the static lifetime
@ -1418,6 +1418,10 @@ define_builtins! {
        48 STR_RELEASE_EXCESS_CAPACITY: "release_excess_capacity"
        49 STR_DROP_PREFIX: "drop_prefix"
        50 STR_DROP_SUFFIX: "drop_suffix"
+        51 STR_FROM_UTF16: "from_utf16"
+        52 STR_FROM_UTF16_LOSSY: "from_utf16_lossy"
+        53 STR_FROM_UTF32: "from_utf32"
+        54 STR_FROM_UTF32_LOSSY: "from_utf32_lossy"
    }
    6 LIST: "List" => {
        0 LIST_LIST: "List" exposed_apply_type=true // the List.List type alias
--- a/crates/compiler/mono/src/drop_specialization.rs
+++ b/crates/compiler/mono/src/drop_specialization.rs
@ -1603,6 +1603,7 @@ fn low_level_no_rc(lowlevel: &LowLevel) -> RC {
        DictPseudoSeed => RC::NoRc,
        StrStartsWith | StrEndsWith => RC::NoRc,
        StrFromUtf8 => RC::Rc,
+        StrFromUtf8Lossy => RC::Rc,
        StrToUtf8 => RC::Rc,
        StrRepeat => RC::NoRc,
        StrFromInt | StrFromFloat => RC::NoRc,
--- a/crates/compiler/mono/src/inc_dec.rs
+++ b/crates/compiler/mono/src/inc_dec.rs
@ -1302,6 +1302,7 @@ pub(crate) fn lowlevel_borrow_signature(op: LowLevel) -> &'static [Ownership] {
        | NumF64FromParts => &[IRRELEVANT],
        StrStartsWith | StrEndsWith => &[BORROWED, BORROWED],
        StrFromUtf8 => &[OWNED],
+        StrFromUtf8Lossy => &[BORROWED],
        StrToUtf8 => &[OWNED],
        StrRepeat => &[BORROWED, IRRELEVANT],
        StrFromInt | StrFromFloat => &[IRRELEVANT],
--- a/crates/compiler/solve/tests/solve_expr.rs
+++ b/crates/compiler/solve/tests/solve_expr.rs
@ -165,7 +165,7 @@ mod solve_expr {
                Str.from_utf8
                "
            ),
-            "List U8 -> Result Str [BadUtf8 { index : U64, problem : Utf8ByteProblem }]",
+            "List U8 -> Result Str [BadUtf8 { index : U64, problem : Utf8Problem }]",
        );
    }

--- a/crates/compiler/test_gen/src/gen_str.rs
+++ b/crates/compiler/test_gen/src/gen_str.rs
@@ -805,6 +805,164 @@ fn str_from_utf8_fail_surrogate_half() {
    );
 }

+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf8_lossy_expected_continuation() {
+    assert_evals_to!(
+        r#"Str.from_utf8_lossy [97, 98, 0xC2, 99]"#,
+        roc_std::RocStr::from("ab<EFBFBD>c"),
+        roc_std::RocStr
+    );
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf16() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf16 [0x72, 0x6f, 0x63] is
+                    Ok val -> val
+                    _ -> ""
+            "#
+        ),
+        roc_std::RocStr::from("roc"),
+        roc_std::RocStr
+    )
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf16_emoji() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf16 [0x72, 0xd83d, 0xdc96, 0x63] is
+                    Ok val -> val
+                    _ -> ""
+            "#
+        ),
+        roc_std::RocStr::from("r💖c"),
+        roc_std::RocStr
+    )
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf16_err_expected_second_surrogate_half() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf16 [0x72, 0xd83d, 0x63] is
+                    Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
+                    _ -> 42
+            "#
+        ),
+        1u64,
+        u64
+    )
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf16_err_unexpected_second_surrogate_half() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf16 [0x72, 0xdc96, 0x63] is
+                    Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
+                    _ -> 42
+            "#
+        ),
+        1u64,
+        u64
+    )
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf16_lossy() {
+    assert_evals_to!(
+        r#"Str.from_utf16_lossy [0x72, 0xdc96, 0x63]"#,
+        roc_std::RocStr::from("r<EFBFBD>c"),
+        roc_std::RocStr
+    )
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf32() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf32 [0x72, 0x6f, 0x63] is
+                    Ok val -> val
+                    _ -> ""
+            "#
+        ),
+        roc_std::RocStr::from("roc"),
+        roc_std::RocStr
+    )
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf32_emoji() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf32 [0x72, 0x1f496, 0x63] is
+                    Ok val -> val
+                    _ -> ""
+            "#
+        ),
+        roc_std::RocStr::from("r💖c"),
+        roc_std::RocStr
+    )
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf32_err_codepoint_too_large() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf32 [0x72, 0x123456, 0x63] is
+                    Err (BadUtf32 {problem: CodepointTooLarge, index: index }) -> index
+                    _ -> 42
+            "#
+        ),
+        1u64,
+        u64
+    )
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf32_err_utf8_cannot_encode_surrogate_half() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf32 [0x72, 0xd83d, 0x63] is
+                    Err (BadUtf32 {problem: EncodesSurrogateHalf, index: index }) -> index
+                    _ -> 42
+            "#
+        ),
+        1u64,
+        u64
+    )
+}
+
+#[test]
+#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
+fn str_from_utf32_lossy() {
+    assert_evals_to!(
+        r#"Str.from_utf32_lossy [0x72, 0x123456, 0x63]"#,
+        roc_std::RocStr::from("r<EFBFBD>c"),
+        roc_std::RocStr
+    )
+}
+
 #[test]
 #[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
 fn str_equality() {
--- a/crates/compiler/test_gen/src/wasm_str.rs
+++ b/crates/compiler/test_gen/src/wasm_str.rs
@@ -630,6 +630,156 @@ fn str_from_utf8_fail_surrogate_half() {
    );
 }

+#[test]
+fn str_from_utf8_lossy_expected_continuation() {
+    assert_evals_to!(
+        r#"Str.from_utf8_lossy [97, 98, 0xC2, 99]"#,
+        roc_std::RocStr::from("ab<EFBFBD>c"),
+        roc_std::RocStr
+    );
+}
+
+#[test]
+fn str_from_utf16() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf16 [0x72, 0x6f, 0x63] is
+                    Ok val -> val
+                    _ -> ""
+            "#
+        ),
+        roc_std::RocStr::from("roc"),
+        roc_std::RocStr
+    )
+}
+
+// Marked as should_panic because this test *does* panic, and it is not yet clear why.
+// If some change magically fixes this, great: remove the should_panic attribute.
+#[test]
+#[should_panic(expected = r#"Roc failed with message: "Integer multiplication overflowed!"#)]
+fn str_from_utf16_emoji() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf16 [0x72, 0xd83d, 0xdc96, 0x63] is
+                    Ok val -> val
+                    _ -> ""
+            "#
+        ),
+        roc_std::RocStr::from("r💖c"),
+        roc_std::RocStr
+    )
+}
+
+#[test]
+fn str_from_utf16_err_expected_second_surrogate_half() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf16 [0x72, 0xd83d, 0x63] is
+                    Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
+                    _ -> 42
+            "#
+        ),
+        1u64,
+        u64
+    )
+}
+
+#[test]
+fn str_from_utf16_err_unexpected_second_surrogate_half() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf16 [0x72, 0xdc96, 0x63] is
+                    Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
+                    _ -> 42
+            "#
+        ),
+        1u64,
+        u64
+    )
+}
+
+#[test]
+fn str_from_utf16_lossy() {
+    assert_evals_to!(
+        r#"Str.from_utf16_lossy [0x72, 0xdc96, 0x63]"#,
+        roc_std::RocStr::from("r<EFBFBD>c"),
+        roc_std::RocStr
+    )
+}
+
+#[test]
+fn str_from_utf32() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf32 [0x72, 0x6f, 0x63] is
+                    Ok val -> val
+                    _ -> ""
+            "#
+        ),
+        roc_std::RocStr::from("roc"),
+        roc_std::RocStr
+    )
+}
+
+#[test]
+fn str_from_utf32_emoji() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf32 [0x72, 0x1f496, 0x63] is
+                    Ok val -> val
+                    _ -> ""
+            "#
+        ),
+        roc_std::RocStr::from("r💖c"),
+        roc_std::RocStr
+    )
+}
+
+#[test]
+fn str_from_utf32_err_codepoint_too_large() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf32 [0x72, 0x123456, 0x63] is
+                    Err (BadUtf32 {problem: CodepointTooLarge, index: index }) -> index
+                    _ -> 42
+            "#
+        ),
+        1u64,
+        u64
+    )
+}
+
+#[test]
+fn str_from_utf32_err_utf8_cannot_encode_surrogate_half() {
+    assert_evals_to!(
+        indoc!(
+            r#"
+                when Str.from_utf32 [0x72, 0xd83d, 0x63] is
+                    Err (BadUtf32 {problem: EncodesSurrogateHalf, index: index }) -> index
+                    _ -> 42
+            "#
+        ),
+        1u64,
+        u64
+    )
+}
+
+#[test]
+fn str_from_utf32_lossy() {
+    assert_evals_to!(
+        r#"Str.from_utf32_lossy [0x72, 0x123456, 0x63]"#,
+        roc_std::RocStr::from("r<EFBFBD>c"),
+        roc_std::RocStr
+    )
+}
+
 #[test]
 fn str_equality() {
    assert_evals_to!(r#""a" == "a""#, true, bool);