mirror of
https://github.com/roc-lang/roc.git
synced 2025-09-28 06:14:46 +00:00
Finish first pass of Str.fromUtf8
This commit is contained in:
parent
815f633a58
commit
f3d9c2f2bb
12 changed files with 591 additions and 159 deletions
|
@ -15,6 +15,7 @@ comptime {
|
|||
// Str Module
|
||||
const str = @import("str.zig");
|
||||
comptime {
|
||||
exportStrFn(str.init, "init");
|
||||
exportStrFn(str.strSplitInPlaceC, "str_split_in_place");
|
||||
exportStrFn(str.countSegments, "count_segments");
|
||||
exportStrFn(str.countGraphemeClusters, "count_grapheme_clusters");
|
||||
|
@ -25,7 +26,7 @@ comptime {
|
|||
exportStrFn(str.strNumberOfBytes, "number_of_bytes");
|
||||
exportStrFn(str.strFromIntC, "from_int");
|
||||
exportStrFn(str.strEqual, "equal");
|
||||
exportStrFn(str.numberOfNextCodepointBytesC, "number_of_next_codepoint_bytes");
|
||||
exportStrFn(str.validateUtf8Bytes, "validate_utf8_bytes");
|
||||
}
|
||||
|
||||
// Export helpers - Must be run inside a comptime
|
||||
|
|
|
@ -275,6 +275,10 @@ pub const RocStr = extern struct {
|
|||
}
|
||||
};
|
||||
|
||||
pub fn init(bytes_ptr: [*]const u8, length: usize) callconv(.C) RocStr {
|
||||
return @call(.{ .modifier = always_inline }, RocStr.init, .{ std.heap.c_allocator, bytes_ptr, length});
|
||||
}
|
||||
|
||||
// Str.equal
|
||||
pub fn strEqual(self: RocStr, other: RocStr) callconv(.C) bool {
|
||||
return self.eq(other);
|
||||
|
@ -973,165 +977,189 @@ pub fn numberOfNextCodepointBytes(ptr: [*]u8, len: usize, index: usize) Utf8Deco
|
|||
if (codepoint_end_index > len) {
|
||||
return error.UnexpectedEof;
|
||||
}
|
||||
_ = try unicode.utf8Decode(ptr[index .. codepoint_end_index]);
|
||||
_ = try unicode.utf8Decode(ptr[index..codepoint_end_index]);
|
||||
return codepoint_end_index - index;
|
||||
}
|
||||
|
||||
// Rather than dealing with structs in Rust, we catch the error and return an error code when we actually use this function
|
||||
pub fn numberOfNextCodepointBytesC(ptr: [*]u8, len: usize, index: usize) callconv(.C) usize {
|
||||
return @call(.{ .modifier = always_inline }, numberOfNextCodepointBytes, .{ ptr, len, index }) catch |err| {
|
||||
return switch (err) {
|
||||
error.UnexpectedEof => 5,
|
||||
error.Utf8InvalidStartByte => 6,
|
||||
error.Utf8ExpectedContinuation => 7,
|
||||
error.Utf8OverlongEncoding => 8,
|
||||
error.Utf8EncodesSurrogateHalf => 9,
|
||||
error.Utf8CodepointTooLarge => 10,
|
||||
};
|
||||
};
|
||||
// Return types for validateUtf8Bytes
|
||||
// Values must be in alphabetical order. That is, lowest values are the first alphabetically.
|
||||
pub const Utf8ByteProblem = packed enum(u8) {
|
||||
CodepointTooLarge = 0,
|
||||
EncodesSurrogateHalf = 1,
|
||||
ExpectedContinuation = 2,
|
||||
InvalidStartByte = 3,
|
||||
OverlongEncoding = 4,
|
||||
UnexpectedEndOfSequence = 5,
|
||||
|
||||
};
|
||||
pub const ValidateUtf8BytesResult = extern struct {
|
||||
is_ok: bool, byte_index: usize, problem_code: Utf8ByteProblem
|
||||
};
|
||||
|
||||
const is_ok_utf8_byte_response =
|
||||
ValidateUtf8BytesResult{ .is_ok = true, .byte_index = 0, .problem_code = Utf8ByteProblem.UnexpectedEndOfSequence };
|
||||
inline fn toErrUtf8ByteResponse(byte_index: usize, problem_code: Utf8ByteProblem) ValidateUtf8BytesResult {
|
||||
return ValidateUtf8BytesResult{ .is_ok = false, .byte_index = byte_index, .problem_code = problem_code };
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: ascii" {
|
||||
// Validate that an array of bytes is valid UTF-8, but if it fails catch & return the error & byte index
|
||||
pub fn validateUtf8Bytes(ptr: [*]u8, len: usize) callconv(.C) ValidateUtf8BytesResult {
|
||||
var index: usize = 0;
|
||||
while (index < len) {
|
||||
const nextNumBytes = numberOfNextCodepointBytes(ptr, len, index) catch |err| {
|
||||
return toErrUtf8ByteResponse(
|
||||
index,
|
||||
switch (err) {
|
||||
error.UnexpectedEof => Utf8ByteProblem.UnexpectedEndOfSequence,
|
||||
error.Utf8InvalidStartByte => Utf8ByteProblem.InvalidStartByte,
|
||||
error.Utf8ExpectedContinuation => Utf8ByteProblem.ExpectedContinuation,
|
||||
error.Utf8OverlongEncoding => Utf8ByteProblem.OverlongEncoding,
|
||||
error.Utf8EncodesSurrogateHalf => Utf8ByteProblem.EncodesSurrogateHalf,
|
||||
error.Utf8CodepointTooLarge => Utf8ByteProblem.CodepointTooLarge,
|
||||
},
|
||||
);
|
||||
};
|
||||
index += nextNumBytes;
|
||||
}
|
||||
return is_ok_utf8_byte_response;
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: ascii" {
|
||||
const str_len = 3;
|
||||
var str: [str_len]u8 = "abc".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
const expected: usize = 1;
|
||||
expectEqual(expected, numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: unicode œ" {
|
||||
test "validateUtf8Bytes: unicode œ" {
|
||||
const str_len = 2;
|
||||
var str: [str_len]u8 = "œ".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
const expected: usize = 2;
|
||||
expectEqual(expected, numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: unicode ∆" {
|
||||
test "validateUtf8Bytes: unicode ∆" {
|
||||
const str_len = 3;
|
||||
var str: [str_len]u8 = "∆".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
const expected: usize = 3;
|
||||
expectEqual(expected, numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: emoji" {
|
||||
test "validateUtf8Bytes: emoji" {
|
||||
const str_len = 4;
|
||||
var str: [str_len]u8 = "💖".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
const expected: usize = 4;
|
||||
expectEqual(expected, numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: unicode ∆ in middle of array" {
|
||||
test "validateUtf8Bytes: unicode ∆ in middle of array" {
|
||||
const str_len = 9;
|
||||
var str: [str_len]u8 = "œb∆c¬".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
const expected: usize = 3;
|
||||
expectEqual(expected, numberOfNextCodepointBytesC(str_ptr, str_len, 3));
|
||||
expectEqual(is_ok_utf8_byte_response, validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: invalid start byte" {
|
||||
// https://doc.rust-lang.org/std/str/fn.from_utf8.html#examples
|
||||
const str_len = 1;
|
||||
var str: [str_len]u8 = "\x80".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8InvalidStartByte, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 6), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: unexpected eof for 2 byte sequence" {
|
||||
test "validateUtf8Bytes: invalid start byte" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L426
|
||||
const str_len = 1;
|
||||
var str: [str_len]u8 = "\xc2".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.UnexpectedEof, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 5), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: expected continuation for 2 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L426
|
||||
const str_len = 2;
|
||||
var str: [str_len]u8 = "\xc2\x00".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8ExpectedContinuation, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 7), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: unexpected eof for 3 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L430
|
||||
const str_len = 2;
|
||||
var str: [str_len]u8 = "\xe0\x00".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.UnexpectedEof, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 5), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: expected continuation for 3 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L430
|
||||
const str_len = 3;
|
||||
var str: [str_len]u8 = "\xe0\xa0\xc0".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8ExpectedContinuation, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 7), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: unexpected eof for 4 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L437
|
||||
const str_len = 3;
|
||||
var str: [str_len]u8 = "\xf0\x90\x00".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.UnexpectedEof, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 5), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: expected continuation for 4 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L437
|
||||
const str_len = 4;
|
||||
var str: [str_len]u8 = "\xf0\x90\x80\x00".*;
|
||||
var str: [str_len]u8 = "ab\x80c".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8ExpectedContinuation, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 7), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
expectError(error.Utf8InvalidStartByte, numberOfNextCodepointBytes(str_ptr, str_len, 2));
|
||||
expectEqual(toErrUtf8ByteResponse(2, Utf8ByteProblem.InvalidStartByte), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: overlong" {
|
||||
test "validateUtf8Bytes: unexpected eof for 2 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L426
|
||||
const str_len = 4;
|
||||
var str: [str_len]u8 = "abc\xc2".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.UnexpectedEof, numberOfNextCodepointBytes(str_ptr, str_len, 3));
|
||||
expectEqual(toErrUtf8ByteResponse(3, Utf8ByteProblem.UnexpectedEndOfSequence), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: expected continuation for 2 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L426
|
||||
const str_len = 5;
|
||||
var str: [str_len]u8 = "abc\xc2\x00".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8ExpectedContinuation, numberOfNextCodepointBytes(str_ptr, str_len, 3));
|
||||
expectEqual(toErrUtf8ByteResponse(3, Utf8ByteProblem.ExpectedContinuation), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: unexpected eof for 3 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L430
|
||||
const str_len = 5;
|
||||
var str: [str_len]u8 = "abc\xe0\x00".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.UnexpectedEof, numberOfNextCodepointBytes(str_ptr, str_len, 3));
|
||||
expectEqual(toErrUtf8ByteResponse(3, Utf8ByteProblem.UnexpectedEndOfSequence), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: expected continuation for 3 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L430
|
||||
const str_len = 6;
|
||||
var str: [str_len]u8 = "abc\xe0\xa0\xc0".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8ExpectedContinuation, numberOfNextCodepointBytes(str_ptr, str_len, 3));
|
||||
expectEqual(toErrUtf8ByteResponse(3, Utf8ByteProblem.ExpectedContinuation), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: unexpected eof for 4 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L437
|
||||
const str_len = 6;
|
||||
var str: [str_len]u8 = "abc\xf0\x90\x00".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.UnexpectedEof, numberOfNextCodepointBytes(str_ptr, str_len, 3));
|
||||
expectEqual(toErrUtf8ByteResponse(3, Utf8ByteProblem.UnexpectedEndOfSequence), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: expected continuation for 4 byte sequence" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L437
|
||||
const str_len = 7;
|
||||
var str: [str_len]u8 = "abc\xf0\x90\x80\x00".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8ExpectedContinuation, numberOfNextCodepointBytes(str_ptr, str_len, 3));
|
||||
expectEqual(toErrUtf8ByteResponse(3, Utf8ByteProblem.ExpectedContinuation), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: overlong" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L451
|
||||
const str_len = 4;
|
||||
var str: [str_len]u8 = "\xf0\x80\x80\x80".*;
|
||||
const str_len = 7;
|
||||
var str: [str_len]u8 = "abc\xf0\x80\x80\x80".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8OverlongEncoding, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 8), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
expectError(error.Utf8OverlongEncoding, numberOfNextCodepointBytes(str_ptr, str_len, 3));
|
||||
expectEqual(toErrUtf8ByteResponse(3, Utf8ByteProblem.OverlongEncoding), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: codepoint out of bounds" {
|
||||
test "validateUtf8Bytes: codepoint out too large" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L465
|
||||
const str_len = 4;
|
||||
var str: [str_len]u8 = "\xf4\x90\x80\x80".*;
|
||||
const str_len = 7;
|
||||
var str: [str_len]u8 = "abc\xf4\x90\x80\x80".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8CodepointTooLarge, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 10), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
expectError(error.Utf8CodepointTooLarge, numberOfNextCodepointBytes(str_ptr, str_len, 3));
|
||||
expectEqual(toErrUtf8ByteResponse(3, Utf8ByteProblem.CodepointTooLarge), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
||||
test "numberOfNextCodepointBytes: surrogate halves" {
|
||||
test "validateUtf8Bytes: surrogate halves" {
|
||||
// https://github.com/ziglang/zig/blob/0.7.x/lib/std/unicode.zig#L468
|
||||
const str_len = 3;
|
||||
var str: [str_len]u8 = "\xed\xa0\x80".*;
|
||||
const str_len = 6;
|
||||
var str: [str_len]u8 = "abc\xed\xa0\x80".*;
|
||||
const str_ptr: [*]u8 = &str;
|
||||
|
||||
expectError(error.Utf8EncodesSurrogateHalf, numberOfNextCodepointBytes(str_ptr, str_len, 0));
|
||||
expectEqual(@intCast(usize, 9), numberOfNextCodepointBytesC(str_ptr, str_len, 0));
|
||||
expectError(error.Utf8EncodesSurrogateHalf, numberOfNextCodepointBytes(str_ptr, str_len, 3));
|
||||
expectEqual(toErrUtf8ByteResponse(3, Utf8ByteProblem.EncodesSurrogateHalf), validateUtf8Bytes(str_ptr, str_len));
|
||||
}
|
||||
|
|
|
@ -24,6 +24,7 @@ pub const NUM_ATAN: &str = "roc_builtins.num.atan";
|
|||
pub const NUM_IS_FINITE: &str = "roc_builtins.num.is_finite";
|
||||
pub const NUM_POW_INT: &str = "roc_builtins.num.pow_int";
|
||||
|
||||
pub const STR_INIT: &str = "roc_builtins.str.init";
|
||||
pub const STR_COUNT_SEGMENTS: &str = "roc_builtins.str.count_segments";
|
||||
pub const STR_CONCAT: &str = "roc_builtins.str.concat";
|
||||
pub const STR_JOIN_WITH: &str = "roc_builtins.str.joinWith";
|
||||
|
@ -34,4 +35,4 @@ pub const STR_ENDS_WITH: &str = "roc_builtins.str.ends_with";
|
|||
pub const STR_NUMBER_OF_BYTES: &str = "roc_builtins.str.number_of_bytes";
|
||||
pub const STR_FROM_INT: &str = "roc_builtins.str.from_int";
|
||||
pub const STR_EQUAL: &str = "roc_builtins.str.equal";
|
||||
pub const STR_NUMBER_OF_NEXT_CODEPOINT_BYTES: &str = "number_of_next_codepoint_bytes";
|
||||
pub const STR_VALIDATE_UTF_BYTES: &str = "roc_builtins.str.validate_utf8_bytes";
|
||||
|
|
|
@ -4,7 +4,7 @@ use roc_module::symbol::Symbol;
|
|||
use roc_region::all::Region;
|
||||
use roc_types::builtin_aliases::{
|
||||
bool_type, dict_type, float_type, int_type, list_type, nat_type, num_type, ordering_type,
|
||||
result_type, set_type, str_type, str_utf8_problem_type, u8_type,
|
||||
result_type, set_type, str_type, str_utf8_byte_problem_type, u8_type,
|
||||
};
|
||||
use roc_types::solved_types::SolvedType;
|
||||
use roc_types::subs::VarId;
|
||||
|
@ -543,7 +543,8 @@ pub fn types() -> MutMap<Symbol, (SolvedType, Region)> {
|
|||
let bad_utf8 = SolvedType::TagUnion(
|
||||
vec![(
|
||||
TagName::Global("BadUtf8".into()),
|
||||
vec![str_utf8_problem_type()],
|
||||
// vec![str_utf8_problem_type()],
|
||||
vec![str_utf8_byte_problem_type(), nat_type()],
|
||||
)],
|
||||
Box::new(SolvedType::Wildcard),
|
||||
);
|
||||
|
|
|
@ -1191,7 +1191,11 @@ pub fn types() -> MutMap<Symbol, (SolvedType, Region)> {
|
|||
let bad_utf8 = SolvedType::TagUnion(
|
||||
vec![(
|
||||
TagName::Global("BadUtf8".into()),
|
||||
vec![builtin_aliases::str_utf8_problem_type()],
|
||||
// vec![builtin_aliases::str_utf8_problem_type()],
|
||||
vec![
|
||||
builtin_aliases::str_utf8_byte_problem_type(),
|
||||
builtin_aliases::nat_type(),
|
||||
],
|
||||
)],
|
||||
Box::new(SolvedType::Wildcard),
|
||||
);
|
||||
|
|
|
@ -1379,20 +1379,104 @@ fn str_from_int(symbol: Symbol, var_store: &mut VarStore) -> Def {
|
|||
/// Str.fromUtf8 : List U8 -> Result Str [ BadUtf8 Utf8ByteProblem Nat ]*
|
||||
fn str_from_utf8(symbol: Symbol, var_store: &mut VarStore) -> Def {
|
||||
let bytes_var = var_store.fresh();
|
||||
let str_var = var_store.fresh();
|
||||
let bool_var = var_store.fresh();
|
||||
let record_var = var_store.fresh();
|
||||
let ret_var = var_store.fresh();
|
||||
|
||||
let body = RunLowLevel {
|
||||
// let arg_2 = RunLowLevel FromUtf8 arg_1
|
||||
//
|
||||
// arg_2 :
|
||||
// { a : Bool -- isOk
|
||||
// , b : String -- result_str
|
||||
// , c : Nat -- problem_byte_index
|
||||
// , d : I8 -- problem_code
|
||||
// }
|
||||
//
|
||||
// if arg_2.isOk then
|
||||
// # all is well
|
||||
// Ok arg_2.str
|
||||
// else
|
||||
// # problem
|
||||
// Err (BadUtf8 arg_2.problem arg_2.byteIndex)
|
||||
|
||||
let def = crate::def::Def {
|
||||
loc_pattern: no_region(Pattern::Identifier(Symbol::ARG_2)),
|
||||
loc_expr: no_region(RunLowLevel {
|
||||
op: LowLevel::StrFromUtf8,
|
||||
args: vec![(bytes_var, Var(Symbol::ARG_1))],
|
||||
ret_var: str_var,
|
||||
ret_var: record_var,
|
||||
}),
|
||||
expr_var: record_var,
|
||||
pattern_vars: SendMap::default(),
|
||||
annotation: None,
|
||||
};
|
||||
|
||||
let cont = If {
|
||||
branch_var: ret_var,
|
||||
cond_var: bool_var,
|
||||
branches: vec![(
|
||||
// if-condition
|
||||
no_region(
|
||||
// arg_2.isOk -> Bool
|
||||
Access {
|
||||
record_var,
|
||||
ext_var: var_store.fresh(),
|
||||
field: "isOk".into(),
|
||||
field_var: var_store.fresh(),
|
||||
loc_expr: Box::new(no_region(Var(Symbol::ARG_2))),
|
||||
},
|
||||
),
|
||||
// all is good
|
||||
no_region(tag(
|
||||
"Ok",
|
||||
// arg_2.str -> Str
|
||||
vec![Access {
|
||||
record_var,
|
||||
ext_var: var_store.fresh(),
|
||||
field: "str".into(),
|
||||
field_var: var_store.fresh(),
|
||||
loc_expr: Box::new(no_region(Var(Symbol::ARG_2))),
|
||||
}],
|
||||
var_store,
|
||||
)),
|
||||
)],
|
||||
final_else: Box::new(
|
||||
// bad!!
|
||||
no_region(tag(
|
||||
"Err",
|
||||
vec![tag(
|
||||
"BadUtf8",
|
||||
vec![
|
||||
Access {
|
||||
record_var,
|
||||
ext_var: var_store.fresh(),
|
||||
field: "problem".into(),
|
||||
field_var: var_store.fresh(),
|
||||
loc_expr: Box::new(no_region(Var(Symbol::ARG_2))),
|
||||
},
|
||||
Access {
|
||||
record_var,
|
||||
ext_var: var_store.fresh(),
|
||||
field: "byteIndex".into(),
|
||||
field_var: var_store.fresh(),
|
||||
loc_expr: Box::new(no_region(Var(Symbol::ARG_2))),
|
||||
},
|
||||
],
|
||||
var_store,
|
||||
)],
|
||||
var_store,
|
||||
)),
|
||||
),
|
||||
};
|
||||
|
||||
let body = LetNonRec(Box::new(def), Box::new(no_region(cont)), ret_var);
|
||||
|
||||
defn(
|
||||
symbol,
|
||||
vec![(bytes_var, Symbol::ARG_1)],
|
||||
var_store,
|
||||
body,
|
||||
str_var,
|
||||
ret_var,
|
||||
)
|
||||
}
|
||||
|
||||
|
@ -2277,6 +2361,18 @@ fn tag(name: &'static str, args: Vec<Expr>, var_store: &mut VarStore) -> Expr {
|
|||
}
|
||||
}
|
||||
|
||||
// #[inline(always)]
|
||||
// fn record(fields: Vec<(Lowercase, Field)>, var_store: &mut VarStore) -> Expr {
|
||||
// let mut send_map = SendMap::default();
|
||||
// for (k, v) in fields {
|
||||
// send_map.insert(k, v);
|
||||
// }
|
||||
// Expr::Record {
|
||||
// record_var: var_store.fresh(),
|
||||
// fields: send_map,
|
||||
// }
|
||||
// }
|
||||
|
||||
#[inline(always)]
|
||||
fn defn(
|
||||
fn_name: Symbol,
|
||||
|
|
|
@ -1,21 +0,0 @@
|
|||
use inkwell::types::BasicTypeEnum;
|
||||
use roc_module::low_level::LowLevel;
|
||||
|
||||
pub fn call_bitcode_fn<'a, 'ctx, 'env>(
|
||||
op: LowLevel,
|
||||
env: &Env<'a, 'ctx, 'env>,
|
||||
args: &[BasicValueEnum<'ctx>],
|
||||
fn_name: &str,
|
||||
) -> BasicValueEnum<'ctx> {
|
||||
let fn_val = env
|
||||
.module
|
||||
.get_function(fn_name)
|
||||
.unwrap_or_else(|| panic!("Unrecognized builtin function: {:?} - if you're working on the Roc compiler, do you need to rebuild the bitcode? See compiler/builtins/bitcode/README.md", fn_name));
|
||||
let call = env.builder.build_call(fn_val, args, "call_builtin");
|
||||
|
||||
call.set_call_convention(fn_val.get_call_conventions());
|
||||
|
||||
call.try_as_basic_value()
|
||||
.left()
|
||||
.unwrap_or_else(|| panic!("LLVM error: Invalid call for low-level op {:?}", op))
|
||||
}
|
|
@ -3516,7 +3516,9 @@ fn run_low_level<'a, 'ctx, 'env>(
|
|||
// Str.fromUtf8 : List U8 -> { byteIndex : Nat, str : Str, isOk : Bool, problem : I8 }
|
||||
debug_assert_eq!(args.len(), 1);
|
||||
|
||||
str_from_utf8(env, scope, args[0])
|
||||
let original_wrapper = load_symbol(env, scope, &args[0]).into_struct_value();
|
||||
|
||||
str_from_utf8(env, parent, original_wrapper)
|
||||
}
|
||||
StrSplit => {
|
||||
// Str.split : Str, Str -> List Str
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
use crate::llvm::build::{
|
||||
call_bitcode_fn, call_void_bitcode_fn, complex_bitcast, Env, InPlace, Scope,
|
||||
};
|
||||
use crate::llvm::build_list::{allocate_list, store_list};
|
||||
use crate::llvm::convert::collection;
|
||||
use inkwell::types::BasicTypeEnum;
|
||||
use inkwell::values::{BasicValueEnum, IntValue, StructValue};
|
||||
use inkwell::AddressSpace;
|
||||
use crate::llvm::build_list::{
|
||||
allocate_list, build_basic_phi2, empty_polymorphic_list, list_len, load_list_ptr, store_list,
|
||||
};
|
||||
use crate::llvm::convert::{collection, get_ptr_type};
|
||||
use inkwell::builder::Builder;
|
||||
use inkwell::types::{BasicTypeEnum, StructType};
|
||||
use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, StructValue};
|
||||
use inkwell::{AddressSpace, IntPredicate};
|
||||
use roc_builtins::bitcode;
|
||||
use roc_module::symbol::Symbol;
|
||||
use roc_mono::layout::{Builtin, Layout};
|
||||
|
@ -255,19 +258,128 @@ pub fn str_from_int<'a, 'ctx, 'env>(
|
|||
zig_str_to_struct(env, zig_result).into()
|
||||
}
|
||||
|
||||
/// Str.fromUtf8 : List U8 -> Result Str [ BadUtf8 Utf8Problem ]*
|
||||
/// Str.fromUtf8 : List U8 -> { byteIndex : Nat, str : Str, isOk : Bool, problem : I8 }
|
||||
pub fn str_from_utf8<'a, 'ctx, 'env>(
|
||||
env: &Env<'a, 'ctx, 'env>,
|
||||
scope: &Scope<'a, 'ctx>,
|
||||
bytes_symbol: Symbol,
|
||||
parent: FunctionValue<'ctx>,
|
||||
original_wrapper: StructValue<'ctx>,
|
||||
) -> BasicValueEnum<'ctx> {
|
||||
let bytes = load_symbol(env, scope, &bytes_symbol);
|
||||
let builder = env.builder;
|
||||
let ctx = env.context;
|
||||
|
||||
// TODO fromUtf8:
|
||||
// let zig_result = call_bitcode_fn(env, &[int], &bitcode::STR_FROM_INT).into_struct_value();
|
||||
// zig_str_to_struct(env, zig_result).into()
|
||||
let list_len = list_len(builder, original_wrapper);
|
||||
let ptr_type = get_ptr_type(&ctx.i8_type().into(), AddressSpace::Generic);
|
||||
let list_ptr = load_list_ptr(builder, original_wrapper, ptr_type);
|
||||
|
||||
panic!("TODO fromUtf8")
|
||||
let result_type = env
|
||||
.module
|
||||
.get_struct_type("str.ValidateUtf8BytesResult")
|
||||
.unwrap();
|
||||
let result_ptr = builder.build_alloca(result_type, "alloca_utf8_validate_bytes_result");
|
||||
|
||||
call_void_bitcode_fn(
|
||||
env,
|
||||
&[result_ptr.into(), list_ptr.into(), list_len.into()],
|
||||
&bitcode::STR_VALIDATE_UTF_BYTES,
|
||||
);
|
||||
let utf8_validate_bytes_result = builder
|
||||
.build_load(result_ptr, "load_utf8_validate_bytes_result")
|
||||
.into_struct_value();
|
||||
|
||||
let is_ok = builder
|
||||
.build_extract_value(utf8_validate_bytes_result, 0, "extract_extract_is_ok")
|
||||
.unwrap()
|
||||
.into_int_value();
|
||||
let byte_index = builder
|
||||
.build_extract_value(utf8_validate_bytes_result, 1, "extract_byte_index")
|
||||
.unwrap()
|
||||
.into_int_value();
|
||||
let problem_code = builder
|
||||
.build_extract_value(utf8_validate_bytes_result, 2, "extract_problem_code")
|
||||
.unwrap()
|
||||
.into_int_value();
|
||||
|
||||
let record_type = env.context.struct_type(
|
||||
&[
|
||||
env.ptr_int().into(),
|
||||
collection(env.context, env.ptr_bytes).into(),
|
||||
env.context.bool_type().into(),
|
||||
ctx.i8_type().into(),
|
||||
],
|
||||
false,
|
||||
);
|
||||
|
||||
let comparison = builder.build_int_compare(
|
||||
IntPredicate::EQ,
|
||||
is_ok,
|
||||
ctx.bool_type().const_int(1, false),
|
||||
"compare_is_ok",
|
||||
);
|
||||
|
||||
build_basic_phi2(
|
||||
env,
|
||||
parent,
|
||||
comparison,
|
||||
|| {
|
||||
// We have a valid utf8 byte sequence
|
||||
// TODO: Should we do something different here if we're doing this in place?
|
||||
let zig_str =
|
||||
call_bitcode_fn(env, &[list_ptr.into(), list_len.into()], &bitcode::STR_INIT)
|
||||
.into_struct_value();
|
||||
build_struct(
|
||||
builder,
|
||||
record_type,
|
||||
vec![
|
||||
(
|
||||
env.ptr_int().const_int(0 as u64, false).into(),
|
||||
"insert_zeroed_byte_index",
|
||||
),
|
||||
(zig_str_to_struct(env, zig_str).into(), "insert_str"),
|
||||
(
|
||||
ctx.bool_type().const_int(1 as u64, false).into(),
|
||||
"insert_is_ok",
|
||||
),
|
||||
(
|
||||
ctx.i8_type().const_int(0 as u64, false).into(),
|
||||
"insert_zeroed_problem",
|
||||
),
|
||||
],
|
||||
)
|
||||
.into()
|
||||
},
|
||||
|| {
|
||||
// We do not have a valid utf8 byte sequence
|
||||
build_struct(
|
||||
builder,
|
||||
record_type,
|
||||
vec![
|
||||
(byte_index.into(), "insert_byte_index"),
|
||||
(empty_polymorphic_list(env).into(), "insert_zeroed_str"),
|
||||
(
|
||||
ctx.bool_type().const_int(0 as u64, false).into(),
|
||||
"insert_is_ok",
|
||||
),
|
||||
(problem_code.into(), "insert_problem"),
|
||||
],
|
||||
)
|
||||
.into()
|
||||
},
|
||||
BasicTypeEnum::StructType(record_type),
|
||||
)
|
||||
}
|
||||
|
||||
fn build_struct<'env, 'ctx>(
|
||||
builder: &'env Builder<'ctx>,
|
||||
struct_type: StructType<'ctx>,
|
||||
values: Vec<(BasicValueEnum<'ctx>, &str)>,
|
||||
) -> StructValue<'ctx> {
|
||||
let mut val = struct_type.get_undef().into();
|
||||
for (index, (value, name)) in values.iter().enumerate() {
|
||||
val = builder
|
||||
.build_insert_value(val, *value, index as u32, name)
|
||||
.unwrap();
|
||||
}
|
||||
val.into_struct_value()
|
||||
}
|
||||
|
||||
/// Str.equal : Str, Str -> Bool
|
||||
|
|
|
@ -47,6 +47,7 @@ pub fn build_eq<'a, 'ctx, 'env>(
|
|||
(Builtin::Int16, Builtin::Int16) => int_cmp(IntPredicate::EQ, "eq_i16"),
|
||||
(Builtin::Int8, Builtin::Int8) => int_cmp(IntPredicate::EQ, "eq_i8"),
|
||||
(Builtin::Int1, Builtin::Int1) => int_cmp(IntPredicate::EQ, "eq_i1"),
|
||||
(Builtin::Usize, Builtin::Usize) => int_cmp(IntPredicate::EQ, "eq_usize"),
|
||||
(Builtin::Float64, Builtin::Float64) => float_cmp(FloatPredicate::OEQ, "eq_f64"),
|
||||
(Builtin::Float32, Builtin::Float32) => float_cmp(FloatPredicate::OEQ, "eq_f32"),
|
||||
(Builtin::Str, Builtin::Str) => str_equal(env, lhs_val, rhs_val),
|
||||
|
|
|
@ -517,7 +517,7 @@ mod gen_str {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8() {
|
||||
fn str_from_utf8_pass_single_ascii() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
|
@ -531,6 +531,210 @@ mod gen_str {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_pass_many_ascii() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 97, 98, 99, 0x7E ] is
|
||||
Ok val -> val
|
||||
Err _ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("abc~".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_pass_single_unicode() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 0xE2, 0x88, 0x86 ] is
|
||||
Ok val -> val
|
||||
Err _ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("∆".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_pass_many_unicode() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 0xE2, 0x88, 0x86, 0xC5, 0x93, 0xC2, 0xAC ] is
|
||||
Ok val -> val
|
||||
Err _ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("∆œ¬".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_pass_single_grapheme() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 0xF0, 0x9F, 0x92, 0x96 ] is
|
||||
Ok val -> val
|
||||
Err _ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("💖".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_pass_many_grapheme() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 0xF0, 0x9F, 0x92, 0x96, 0xF0, 0x9F, 0xA4, 0xA0, 0xF0, 0x9F, 0x9A, 0x80 ] is
|
||||
Ok val -> val
|
||||
Err _ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("💖🤠🚀".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_pass_all() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 0xF0, 0x9F, 0x92, 0x96, 98, 0xE2, 0x88, 0x86 ] is
|
||||
Ok val -> val
|
||||
Err _ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("💖b∆".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_fail_invalid_start_byte() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 97, 98, 0x80, 99 ] is
|
||||
Err (BadUtf8 InvalidStartByte byteIndex) ->
|
||||
if byteIndex == 2 then
|
||||
"a"
|
||||
else
|
||||
"b"
|
||||
_ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("a".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_fail_unexpected_end_of_sequence() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 97, 98, 99, 0xC2 ] is
|
||||
Err (BadUtf8 UnexpectedEndOfSequence byteIndex) ->
|
||||
if byteIndex == 3 then
|
||||
"a"
|
||||
else
|
||||
"b"
|
||||
_ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("a".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_fail_expected_continuation() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 97, 98, 99, 0xC2, 0x00 ] is
|
||||
Err (BadUtf8 ExpectedContinuation byteIndex) ->
|
||||
if byteIndex == 3 then
|
||||
"a"
|
||||
else
|
||||
"b"
|
||||
_ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("a".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_fail_overlong_encoding() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 97, 0xF0, 0x80, 0x80, 0x80 ] is
|
||||
Err (BadUtf8 OverlongEncoding byteIndex) ->
|
||||
if byteIndex == 1 then
|
||||
"a"
|
||||
else
|
||||
"b"
|
||||
_ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("a".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_fail_codepoint_too_large() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 97, 0xF4, 0x90, 0x80, 0x80 ] is
|
||||
Err (BadUtf8 CodepointTooLarge byteIndex) ->
|
||||
if byteIndex == 1 then
|
||||
"a"
|
||||
else
|
||||
"b"
|
||||
_ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("a".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_from_utf8_fail_surrogate_half() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.fromUtf8 [ 97, 98, 0xED, 0xA0, 0x80 ] is
|
||||
Err (BadUtf8 EncodesSurrogateHalf byteIndex) ->
|
||||
if byteIndex == 2 then
|
||||
"a"
|
||||
else
|
||||
"b"
|
||||
_ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from_slice("a".as_bytes()),
|
||||
roc_std::RocStr
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn str_equality() {
|
||||
assert_evals_to!(r#""a" == "a""#, true, bool);
|
||||
|
|
|
@ -717,15 +717,18 @@ pub fn str_utf8_byte_problem_type() -> SolvedType {
|
|||
|
||||
#[inline(always)]
|
||||
pub fn str_utf8_byte_problem_alias_content() -> SolvedType {
|
||||
// [ InvalidStartByte, UnexpectedEndOfSequence, ExpectedContinuation, OverlongEncoding, CodepointTooLarge, EncodesSurrogateHalf ]
|
||||
// 1. This must have the same values as the Zig enum Utf8ByteProblem in src/str.zig
|
||||
// 2. This must be in alphabetical order
|
||||
//
|
||||
// [ CodepointTooLarge, EncodesSurrogateHalf, ExpectedContinuation, InvalidStartByte, OverlongEncoding, UnexpectedEndOfSequence ]
|
||||
SolvedType::TagUnion(
|
||||
vec![
|
||||
(TagName::Global("InvalidStartByte".into()), vec![]),
|
||||
(TagName::Global("UnexpectedEndOfSequence".into()), vec![]),
|
||||
(TagName::Global("ExpectedContinuation".into()), vec![]),
|
||||
(TagName::Global("OverlongEncoding".into()), vec![]),
|
||||
(TagName::Global("CodepointTooLarge".into()), vec![]),
|
||||
(TagName::Global("EncodesSurrogateHalf".into()), vec![]),
|
||||
(TagName::Global("ExpectedContinuation".into()), vec![]),
|
||||
(TagName::Global("InvalidStartByte".into()), vec![]),
|
||||
(TagName::Global("OverlongEncoding".into()), vec![]),
|
||||
(TagName::Global("UnexpectedEndOfSequence".into()), vec![]),
|
||||
],
|
||||
Box::new(SolvedType::EmptyTagUnion),
|
||||
)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue