make fromUtf8 do RC

2025-09-28 06:14:46 +00:00 · 2021-02-24 21:25:14 +01:00 · 2021-02-24 21:25:14 +01:00 · a6edc58323
commit a6edc58323
parent 17a44aab02
8 changed files with 172 additions and 168 deletions
--- a/cli/tests/cli_run.rs
+++ b/cli/tests/cli_run.rs
@ -240,6 +240,18 @@ mod cli_run {
        );
    }
    #[test]
    #[serial(base64)]
    fn base64() {
        // Runs the TestBase64 benchmark example with no extra flags and compares
        // its output against the expected text below.
        let roc_file = example_file("benchmarks", "TestBase64.roc");
        check_output(&roc_file, "test-base64", &[], "SGVsbG8gV29ybGQ=", true);
    }
    #[test]
    #[serial(closure)]
    fn closure() {
--- a/compiler/builtins/bitcode/src/main.zig
+++ b/compiler/builtins/bitcode/src/main.zig
@ -67,8 +67,8 @@ comptime {
    exportStrFn(str.strFromIntC, "from_int");
    exportStrFn(str.strFromFloatC, "from_float");
    exportStrFn(str.strEqual, "equal");
    exportStrFn(str.validateUtf8Bytes, "validate_utf8_bytes");
    exportStrFn(str.strToBytesC, "to_bytes");
    exportStrFn(str.fromUtf8C, "from_utf8");
 }
 // Export helpers - Must be run inside a comptime
--- a/compiler/builtins/bitcode/src/str.zig
+++ b/compiler/builtins/bitcode/src/str.zig
@ -15,6 +15,7 @@ const InPlace = packed enum(u8) {
    Clone,
 };
 const SMALL_STR_MAX_LENGTH = small_string_size - 1;
 const small_string_size = 2 * @sizeOf(usize);
 const blank_small_string: [16]u8 = init_blank_small_string(small_string_size);
@ -982,6 +983,71 @@ fn strToBytes(allocator: *Allocator, arg: RocStr) RocList {
    }
 }
 // Result of `fromUtf8`; `fromUtf8C` writes one of these through its out-pointer.
 // Declared `extern struct`, so the field order below is the in-memory (C ABI) order.
 // NOTE(review): the order looks like it is meant to line up with the `a_byteIndex`,
 // `b_str`, `c_isOk`, `d_problem` record fields read by the compiler side; confirm before reordering.
 const FromUtf8Result = extern struct {
    // On failure: the index reported by `errorToProblem`. On success: 0.
    byte_index: usize,
    // On success: the resulting string. On failure: `RocStr.empty()`.
    string: RocStr,
    // True when the input bytes were valid UTF-8.
    is_ok: bool,
    // On failure: the kind of problem found. On success: a filler value (`InvalidStartByte`).
    problem_code: Utf8ByteProblem,
 };
 // C-ABI entry point for `fromUtf8`: runs it (inlined) with the C allocator and
 // stores the result through `output` instead of returning it by value.
 pub fn fromUtf8C(arg: RocList, output: *FromUtf8Result) callconv(.C) void {
    const result = @call(.{ .modifier = always_inline }, fromUtf8, .{ std.heap.c_allocator, arg });
    output.* = result;
 }
 // Turns a list of bytes into a `RocStr`, provided the bytes are valid UTF-8.
 //
 // On success `is_ok` is true and `string` holds the text; `byte_index` and
 // `problem_code` are filler values (0 and `InvalidStartByte`) that carry no meaning.
 // On failure `is_ok` is false, `string` is empty, and `byte_index` / `problem_code`
 // are whatever `errorToProblem` reports for the input.
 //
 // Reference counting: this function consumes one RC token of `arg` on EVERY path.
 //   - small result: the bytes are copied into a small string and the list is decref'd
 //   - big result:   the string takes over the buffer returned by `arg.makeUnique`
 //   - invalid:      nothing is kept from the list, so it is decref'd
 fn fromUtf8(allocator: *Allocator, arg: RocList) FromUtf8Result {
    const bytes = @ptrCast([*]const u8, arg.bytes)[0..arg.length];
    if (unicode.utf8ValidateSlice(bytes)) {
        // the output will be correct. Now we need to take ownership of the input
        if (arg.len() <= SMALL_STR_MAX_LENGTH) {
            // turn the bytes into a small string
            const string = RocStr.init(allocator, @ptrCast([*]u8, arg.bytes), arg.len());
            // then decrement the input list
            const data_bytes = arg.len();
            utils.decref(allocator, @alignOf(usize), arg.bytes, data_bytes);
            return FromUtf8Result{ .is_ok = true, .string = string, .byte_index = 0, .problem_code = Utf8ByteProblem.InvalidStartByte };
        } else {
            // the (now unique) list buffer becomes the string's buffer, so no decref here
            const byte_list = arg.makeUnique(allocator, @alignOf(usize), @sizeOf(u8));
            const string = RocStr{ .str_bytes = byte_list.bytes, .str_len = byte_list.length };
            return FromUtf8Result{ .is_ok = true, .string = string, .byte_index = 0, .problem_code = Utf8ByteProblem.InvalidStartByte };
        }
    } else {
        // Locate the problem first: this still reads the list's bytes.
        const temp = errorToProblem(@ptrCast([*]u8, arg.bytes), arg.length);
        // The result keeps nothing from the input, so release our RC token now;
        // otherwise the list leaks whenever the bytes are not valid UTF-8.
        const data_bytes = arg.len();
        utils.decref(allocator, @alignOf(usize), arg.bytes, data_bytes);
        return FromUtf8Result{ .is_ok = false, .string = RocStr.empty(), .byte_index = temp.index, .problem_code = temp.problem };
    }
 }
 // Walks `bytes` one codepoint at a time and reports the first position at which
 // `numberOfNextCodepointBytes` fails, together with the matching `Utf8ByteProblem`.
 //
 // Must only be called on input that is known to contain a problem: if the whole
 // input is walked without an error, the trailing `unreachable` is hit.
 fn errorToProblem(bytes: [*]u8, length: usize) struct { index: usize, problem: Utf8ByteProblem } {
    var cursor: usize = 0;
    while (cursor < length) {
        if (numberOfNextCodepointBytes(bytes, length, cursor)) |step| {
            // valid codepoint: skip over all of its bytes
            cursor += step;
        } else |err| {
            const problem = switch (err) {
                error.UnexpectedEof => Utf8ByteProblem.UnexpectedEndOfSequence,
                error.Utf8InvalidStartByte => Utf8ByteProblem.InvalidStartByte,
                error.Utf8ExpectedContinuation => Utf8ByteProblem.ExpectedContinuation,
                error.Utf8OverlongEncoding => Utf8ByteProblem.OverlongEncoding,
                error.Utf8EncodesSurrogateHalf => Utf8ByteProblem.EncodesSurrogateHalf,
                error.Utf8CodepointTooLarge => Utf8ByteProblem.CodepointTooLarge,
            };
            return .{ .index = cursor, .problem = problem };
        }
    }
    unreachable;
 }
 pub fn isValidUnicode(ptr: [*]u8, len: usize) callconv(.C) bool {
    const bytes: []u8 = ptr[0..len];
    return @call(.{ .modifier = always_inline }, unicode.utf8ValidateSlice, .{bytes});
@ -1019,76 +1085,74 @@ pub const Utf8ByteProblem = packed enum(u8) {
    OverlongEncoding = 4,
    UnexpectedEndOfSequence = 5,
 };
 // Outcome of a UTF-8 validation without the resulting string: whether the bytes
 // were valid, plus a byte index and problem code describing a failure.
 // Declared `extern struct`, so the field order below is the in-memory (C ABI) order.
 pub const ValidateUtf8BytesResult = extern struct {
    is_ok: bool, byte_index: usize, problem_code: Utf8ByteProblem
 };
-const is_ok_utf8_byte_response =
+fn validateUtf8Bytes(bytes: [*]u8, length: usize) FromUtf8Result {
-    ValidateUtf8BytesResult{ .is_ok = true, .byte_index = 0, .problem_code = Utf8ByteProblem.UnexpectedEndOfSequence };
+    return fromUtf8(std.testing.allocator, RocList{ .bytes = bytes, .length = length });
 inline fn toErrUtf8ByteResponse(byte_index: usize, problem_code: Utf8ByteProblem) ValidateUtf8BytesResult {
    return ValidateUtf8BytesResult{ .is_ok = false, .byte_index = byte_index, .problem_code = problem_code };
 }
-// Validate that an array of bytes is valid UTF-8, but if it fails catch & return the error & byte index
+fn validateUtf8BytesX(str: RocList) FromUtf8Result {
-pub fn validateUtf8Bytes(ptr: [*]u8, len: usize) callconv(.C) ValidateUtf8BytesResult {
+    return fromUtf8(std.testing.allocator, str);
    var index: usize = 0;
    while (index < len) {
        const nextNumBytes = numberOfNextCodepointBytes(ptr, len, index) catch |err| {
            return toErrUtf8ByteResponse(
                index,
                switch (err) {
                    error.UnexpectedEof => Utf8ByteProblem.UnexpectedEndOfSequence,
                    error.Utf8InvalidStartByte => Utf8ByteProblem.InvalidStartByte,
                    error.Utf8ExpectedContinuation => Utf8ByteProblem.ExpectedContinuation,
                    error.Utf8OverlongEncoding => Utf8ByteProblem.OverlongEncoding,
                    error.Utf8EncodesSurrogateHalf => Utf8ByteProblem.EncodesSurrogateHalf,
                    error.Utf8CodepointTooLarge => Utf8ByteProblem.CodepointTooLarge,
                },
            );
        };
        index += nextNumBytes;
    }
    return is_ok_utf8_byte_response;
 }
 // Test helper: asserts that a `FromUtf8Result` reports success.
 fn expectOk(result: FromUtf8Result) void {
    // `expectEqual` takes (expected, actual); keeping that order makes a failure
    // report "expected true, found false" rather than the reverse.
    expectEqual(true, result.is_ok);
 }
 // Test helper: allocates a fresh `RocList` with the testing allocator and copies
 // `length` bytes from `bytes` into it.
 fn sliceHelp(bytes: [*]const u8, length: usize) RocList {
    var result = RocList.allocate(testing.allocator, @alignOf(usize), length, @sizeOf(u8));
    // the freshly allocated list is expected to have a backing buffer
    const destination = result.bytes orelse unreachable;
    @memcpy(destination, bytes, length);
    result.length = length;
    return result;
 }
 // Test helper: builds the failed `FromUtf8Result` (empty string, `is_ok` false)
 // for a problem of kind `problem` at byte `index`.
 fn toErrUtf8ByteResponse(index: usize, problem: Utf8ByteProblem) FromUtf8Result {
    return FromUtf8Result{
        .byte_index = index,
        .string = RocStr.empty(),
        .is_ok = false,
        .problem_code = problem,
    };
 }
 // NOTE on memory: the validate function consumes one RC token of the input. Since
 // we freshly created it (in `sliceHelp`), it has only one RC token, so the input list will be deallocated.
 //
 // If we tested with big strings, we'd have to deallocate the output string, but never the input list.
 test "validateUtf8Bytes: ascii" {
-    const str_len = 3;
+    const raw = "abc";
-    var str: [str_len]u8 = "abc".*;
+    const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
-    const str_ptr: [*]u8 = &str;
+    const list = sliceHelp(ptr, raw.len);
-    expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
+    expectOk(validateUtf8BytesX(list));
 }
 test "validateUtf8Bytes: unicode œ" {
-    const str_len = 2;
+    const raw = "œ";
-    var str: [str_len]u8 = "œ".*;
+    const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
-    const str_ptr: [*]u8 = &str;
+    const list = sliceHelp(ptr, raw.len);
-    expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
+    expectOk(validateUtf8BytesX(list));
 }
 test "validateUtf8Bytes: unicode ∆" {
-    const str_len = 3;
+    const raw = "∆";
-    var str: [str_len]u8 = "∆".*;
+    const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
-    const str_ptr: [*]u8 = &str;
+    const list = sliceHelp(ptr, raw.len);
-    expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
+    expectOk(validateUtf8BytesX(list));
 }
 test "validateUtf8Bytes: emoji" {
-    const str_len = 4;
+    const raw = "💖";
-    var str: [str_len]u8 = "💖".*;
+    const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
-    const str_ptr: [*]u8 = &str;
+    const list = sliceHelp(ptr, raw.len);
-    expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
+    expectOk(validateUtf8BytesX(list));
 }
 test "validateUtf8Bytes: unicode ∆ in middle of array" {
-    const str_len = 9;
+    const raw = "œb∆c¬";
-    var str: [str_len]u8 = "œb∆c¬".*;
+    const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
-    const str_ptr: [*]u8 = &str;
+    const list = sliceHelp(ptr, raw.len);
-    expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
+    expectOk(validateUtf8BytesX(list));
 }
 test "validateUtf8Bytes: invalid start byte" {
--- a/compiler/builtins/src/bitcode.rs
+++ b/compiler/builtins/src/bitcode.rs
@ -41,8 +41,8 @@ pub const STR_NUMBER_OF_BYTES: &str = "roc_builtins.str.number_of_bytes";
 pub const STR_FROM_INT: &str = "roc_builtins.str.from_int";
 pub const STR_FROM_FLOAT: &str = "roc_builtins.str.from_float";
 pub const STR_EQUAL: &str = "roc_builtins.str.equal";
 pub const STR_VALIDATE_UTF_BYTES: &str = "roc_builtins.str.validate_utf8_bytes";
 pub const STR_TO_BYTES: &str = "roc_builtins.str.to_bytes";
 pub const STR_FROM_UTF8: &str = "roc_builtins.str.from_utf8";
 pub const DICT_HASH: &str = "roc_builtins.dict.hash";
 pub const DICT_HASH_STR: &str = "roc_builtins.dict.hash_str";
--- a/compiler/can/src/builtins.rs
+++ b/compiler/can/src/builtins.rs
@ -1598,7 +1598,7 @@ fn str_from_utf8(symbol: Symbol, var_store: &mut VarStore) -> Def {
                Access {
                    record_var,
                    ext_var: var_store.fresh(),
-                    field: "isOk".into(),
+                    field: "c_isOk".into(),
                    field_var: var_store.fresh(),
                    loc_expr: Box::new(no_region(Var(Symbol::ARG_2))),
                },
@ -1610,7 +1610,7 @@ fn str_from_utf8(symbol: Symbol, var_store: &mut VarStore) -> Def {
                vec![Access {
                    record_var,
                    ext_var: var_store.fresh(),
-                    field: "str".into(),
+                    field: "b_str".into(),
                    field_var: var_store.fresh(),
                    loc_expr: Box::new(no_region(Var(Symbol::ARG_2))),
                }],
@ -1627,14 +1627,14 @@ fn str_from_utf8(symbol: Symbol, var_store: &mut VarStore) -> Def {
                        Access {
                            record_var,
                            ext_var: var_store.fresh(),
-                            field: "problem".into(),
+                            field: "d_problem".into(),
                            field_var: var_store.fresh(),
                            loc_expr: Box::new(no_region(Var(Symbol::ARG_2))),
                        },
                        Access {
                            record_var,
                            ext_var: var_store.fresh(),
-                            field: "byteIndex".into(),
+                            field: "a_byteIndex".into(),
                            field_var: var_store.fresh(),
                            loc_expr: Box::new(no_region(Var(Symbol::ARG_2))),
                        },
--- a/compiler/gen/src/llvm/build_str.rs
+++ b/compiler/gen/src/llvm/build_str.rs
@ -1,13 +1,11 @@
 use crate::llvm::bitcode::{call_bitcode_fn, call_void_bitcode_fn};
 use crate::llvm::build::{complex_bitcast, Env, InPlace, Scope};
-use crate::llvm::build_list::{
+use crate::llvm::build_list::{allocate_list, store_list};
-    allocate_list, build_basic_phi2, empty_polymorphic_list, list_len, load_list_ptr, store_list,
+use crate::llvm::convert::collection;
 };
 use crate::llvm::convert::{collection, get_ptr_type};
 use inkwell::builder::Builder;
-use inkwell::types::{BasicTypeEnum, StructType};
+use inkwell::types::BasicTypeEnum;
 use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue, StructValue};
-use inkwell::{AddressSpace, IntPredicate};
+use inkwell::AddressSpace;
 use roc_builtins::bitcode;
 use roc_module::symbol::Symbol;
 use roc_mono::layout::{Builtin, Layout};
@ -300,43 +298,28 @@ pub fn str_to_bytes<'a, 'ctx, 'env>(
 /// Str.fromUtf8 : List U8 -> { a : Bool, b : Str, c : Nat, d : I8 }
 pub fn str_from_utf8<'a, 'ctx, 'env>(
    env: &Env<'a, 'ctx, 'env>,
-    parent: FunctionValue<'ctx>,
+    _parent: FunctionValue<'ctx>,
    original_wrapper: StructValue<'ctx>,
 ) -> BasicValueEnum<'ctx> {
    let builder = env.builder;
    let ctx = env.context;
-    let list_len = list_len(builder, original_wrapper);
+    let result_type = env.module.get_struct_type("str.FromUtf8Result").unwrap();
    let ptr_type = get_ptr_type(&ctx.i8_type().into(), AddressSpace::Generic);
    let list_ptr = load_list_ptr(builder, original_wrapper, ptr_type);
    let result_type = env
        .module
        .get_struct_type("str.ValidateUtf8BytesResult")
        .unwrap();
    let result_ptr = builder.build_alloca(result_type, "alloca_utf8_validate_bytes_result");
    call_void_bitcode_fn(
        env,
-        &[result_ptr.into(), list_ptr.into(), list_len.into()],
+        &[
-        &bitcode::STR_VALIDATE_UTF_BYTES,
+            complex_bitcast(
                env.builder,
                original_wrapper.into(),
                env.context.i128_type().into(),
                "to_i128",
            ),
            result_ptr.into(),
        ],
        &bitcode::STR_FROM_UTF8,
    );
    let utf8_validate_bytes_result = builder
        .build_load(result_ptr, "load_utf8_validate_bytes_result")
        .into_struct_value();
    let is_ok = builder
        .build_extract_value(utf8_validate_bytes_result, 0, "extract_extract_is_ok")
        .unwrap()
        .into_int_value();
    let byte_index = builder
        .build_extract_value(utf8_validate_bytes_result, 1, "extract_byte_index")
        .unwrap()
        .into_int_value();
    let problem_code = builder
        .build_extract_value(utf8_validate_bytes_result, 2, "extract_problem_code")
        .unwrap()
        .into_int_value();
    let record_type = env.context.struct_type(
        &[
@ -348,71 +331,16 @@ pub fn str_from_utf8<'a, 'ctx, 'env>(
        false,
    );
-    let comparison = builder.build_int_compare(
+    let result_ptr_cast = env
-        IntPredicate::EQ,
+        .builder
-        is_ok,
+        .build_bitcast(
-        ctx.bool_type().const_int(1, false),
+            result_ptr,
-        "compare_is_ok",
+            record_type.ptr_type(AddressSpace::Generic),
-    );
+            "to_unnamed",
        )
        .into_pointer_value();
-    build_basic_phi2(
+    builder.build_load(result_ptr_cast, "load_utf8_validate_bytes_result")
        env,
        parent,
        comparison,
        || {
            // We have a valid utf8 byte sequence
            // TODO: Should we do something different here if we're doing this in place?
            let zig_str =
                call_bitcode_fn(env, &[list_ptr.into(), list_len.into()], &bitcode::STR_INIT)
                    .into_struct_value();
            build_struct(
                builder,
                record_type,
                vec![
                    (
                        env.ptr_int().const_int(0, false).into(),
                        "insert_zeroed_byte_index",
                    ),
                    (zig_str_to_struct(env, zig_str).into(), "insert_str"),
                    (ctx.bool_type().const_int(1, false).into(), "insert_is_ok"),
                    (
                        ctx.i8_type().const_int(0, false).into(),
                        "insert_zeroed_problem",
                    ),
                ],
            )
            .into()
        },
        || {
            // We do not have a valid utf8 byte sequence
            build_struct(
                builder,
                record_type,
                vec![
                    (byte_index.into(), "insert_byte_index"),
                    (empty_polymorphic_list(env), "insert_zeroed_str"),
                    (ctx.bool_type().const_int(0, false).into(), "insert_is_ok"),
                    (problem_code.into(), "insert_problem"),
                ],
            )
            .into()
        },
        BasicTypeEnum::StructType(record_type),
    )
 }
 /// Builds a value of `struct_type` by starting from `undef` and inserting each
 /// `(value, name)` pair at the field index matching its position in `values`.
 ///
 /// Panics if `build_insert_value` returns `None` for any field.
 fn build_struct<'env, 'ctx>(
    builder: &'env Builder<'ctx>,
    struct_type: StructType<'ctx>,
    values: Vec<(BasicValueEnum<'ctx>, &str)>,
 ) -> StructValue<'ctx> {
    let mut aggregate = struct_type.get_undef().into();
    for (field_index, (field_value, field_name)) in values.into_iter().enumerate() {
        let updated =
            builder.build_insert_value(aggregate, field_value, field_index as u32, field_name);
        aggregate = updated.unwrap();
    }
    aggregate.into_struct_value()
 }
 /// Str.fromInt : Int -> Str
--- a/examples/benchmarks/Base64.roc
+++ b/examples/benchmarks/Base64.roc
@ -1,24 +1,7 @@
-app "base64"
+interface Base64 exposes [ fromBytes ] imports [ Bytes.Decode ]
    packages { base: "platform" }
    imports [base.Task, Bytes.Decode.{Decoder} ]
    provides [ main ] to base
 IO a : Task.Task a []
 Decoder a : Bytes.Decode.Decoder a
 main : IO {}
 main =
    # when fromBytes [ 0 ] is
    when fromBytes (Str.toBytes "Hello World") is
        Ok str ->
            Task.putLine str
        Err _ ->
            Task.putLine "sadness"
 # ------
 fromBytes : List U8 -> Result Str Bytes.Decode.DecodeError
 fromBytes = \bytes ->
--- a/examples/benchmarks/TestBase64.roc
+++ b/examples/benchmarks/TestBase64.roc
@ -0,0 +1,17 @@
 app "test-base64"
    packages { base: "platform" }
    imports [base.Task, Base64 ]
    provides [ main ] to base
 IO a : Task.Task a []
 # Runs `Base64.fromBytes` on the bytes of "Hello World" and prints the resulting
 # string on `Ok`, or "sadness" on `Err`.
 main : IO {}
 main =
    # when fromBytes [ 0 ] is
    when Base64.fromBytes (Str.toBytes "Hello World") is
        Ok str ->
            Task.putLine str
        Err _ ->
            Task.putLine "sadness"