add fromUtf16 and 32, as well as lossy variants

This commit is contained in:
shua 2025-01-15 00:14:58 +01:00
parent 032f1cc5a4
commit 414fecd14d
No known key found for this signature in database
17 changed files with 805 additions and 74 deletions

View file

@ -203,6 +203,7 @@ comptime {
exportStrFn(str.reserveC, "reserve");
exportStrFn(str.strToUtf8C, "to_utf8");
exportStrFn(str.fromUtf8C, "from_utf8");
exportStrFn(str.fromUtf8Lossy, "from_utf8_lossy");
exportStrFn(str.repeatC, "repeat");
exportStrFn(str.strTrim, "trim");
exportStrFn(str.strTrimStart, "trim_start");

View file

@ -1449,6 +1449,105 @@ pub fn fromUtf8C(
return fromUtf8(list, update_mode);
}
const UNICODE_REPLACEMENT: u21 = 0xfffd;
const Utf8Iterator = struct {
bytes: []u8,
i: usize,
pub fn init(list: RocList) Utf8Iterator {
const bytes = @as([*]u8, @ptrCast(list.bytes))[0..list.length];
return Utf8Iterator{
.bytes = bytes,
.i = 0,
};
}
pub fn nextLossy(it: *Utf8Iterator) ?u32 {
if (it.bytes.len <= it.i) {
return null;
}
const rest = it.bytes[it.i..];
const n = unicode.utf8ByteSequenceLength(rest[0]) catch {
// invalid start byte
it.i += 1;
return UNICODE_REPLACEMENT;
};
for (1..n) |i| {
if (rest.len == i) {
// unexpected end
it.i += i;
return UNICODE_REPLACEMENT;
}
if (rest[i] < 0x70) {
// expected continuation byte (>= 0x70)
it.i += i;
return UNICODE_REPLACEMENT;
}
}
it.i += n;
return unicode.utf8Decode(rest[0..n]) catch {
return UNICODE_REPLACEMENT;
};
}
pub fn reset(it: *Utf8Iterator) void {
it.i = 0;
}
};
fn codepointSeqLengthLossy(c: u32) u3 {
if (c < 0x110000) {
if (unicode.utf8CodepointSequenceLength(@intCast(c))) |n| {
return n;
} else |_| {
// fallthrough
}
}
return unicode.utf8CodepointSequenceLength(UNICODE_REPLACEMENT) catch unreachable;
}
fn utf8EncodeLossy(c: u32, out: []u8) u3 {
if (c < 0x110000) {
if (unicode.utf8Encode(@intCast(c), out)) |n| {
return n;
} else |_| {
// fallthrough
}
}
return unicode.utf8Encode(UNICODE_REPLACEMENT, out) catch unreachable;
}
pub fn fromUtf8Lossy(
list: RocList,
) callconv(.C) RocStr {
if (list.len() == 0) {
return RocStr.empty();
}
// PERF: we could try to reuse the input list if it's already valid utf-8, similar to fromUtf8
var it = Utf8Iterator.init(list);
var enc_len: usize = 0;
while (it.nextLossy()) |c| {
enc_len += codepointSeqLengthLossy(c);
}
var str = RocStr.allocate(enc_len);
const ptr = str.asU8ptrMut()[0..enc_len];
var end_index: usize = 0;
it.reset();
while (it.nextLossy()) |c| {
end_index += utf8EncodeLossy(c, ptr[end_index..]);
}
str.setLen(end_index);
return str;
}
pub fn fromUtf8(
list: RocList,
update_mode: UpdateMode,
@ -1667,6 +1766,17 @@ test "validateUtf8Bytes: unicode ∆ in middle of array" {
try expectOk(str_result);
}
test "fromUtf8Lossy: ascii, emoji" {
var list = RocList.fromSlice(u8, "r💖c", false);
defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
const res = fromUtf8Lossy(list);
defer res.decref();
const expected = RocStr.fromSlice("r💖c");
defer expected.decref();
try expect(expected.eq(res));
}
fn expectErr(list: RocList, index: usize, err: Utf8DecodeError, problem: Utf8ByteProblem) !void {
const str_ptr = @as([*]u8, @ptrCast(list.bytes));
const len = list.length;
@ -1765,6 +1875,66 @@ test "validateUtf8Bytes: surrogate halves" {
try expectErr(list, 3, error.Utf8EncodesSurrogateHalf, Utf8ByteProblem.EncodesSurrogateHalf);
}
test "fromUtf8Lossy: invalid start byte" {
var list = RocList.fromSlice(u8, "r\x80c", false);
defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
const res = fromUtf8Lossy(list);
defer res.decref();
const expected = RocStr.fromSlice("r<EFBFBD>c");
defer expected.decref();
try expect(expected.eq(res));
}
test "fromUtf8Lossy: overlong encoding" {
var list = RocList.fromSlice(u8, "r\xF0\x9F\x92\x96\x80c", false);
defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
const res = fromUtf8Lossy(list);
defer res.decref();
const expected = RocStr.fromSlice("r💖<EFBFBD>c");
defer expected.decref();
try expect(expected.eq(res));
}
test "fromUtf8Lossy: expected continuation" {
var list = RocList.fromSlice(u8, "r\xCFc", false);
defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
const res = fromUtf8Lossy(list);
defer res.decref();
const expected = RocStr.fromSlice("r<EFBFBD>c");
defer expected.decref();
try expect(expected.eq(res));
}
test "fromUtf8Lossy: unexpected end" {
var list = RocList.fromSlice(u8, "r\xCF", false);
defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
const res = fromUtf8Lossy(list);
defer res.decref();
const expected = RocStr.fromSlice("r<EFBFBD>");
defer expected.decref();
try expect(expected.eq(res));
}
test "fromUtf8Lossy: encodes surrogate" {
// 0xd83d == 0b1101_1000_0011_1101
// wwww xxxx yyyy zzzz
// becomes 0b1110_1101 0b10_1000_00 0b10_11_1101
// 1110_wwww 10_xxxx_yy 10_yy_zzzz
// 0xED 0x90 0xBD
var list = RocList.fromSlice(u8, "r\xED\xA0\xBDc", false);
defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
const res = fromUtf8Lossy(list);
defer res.decref();
const expected = RocStr.fromSlice("r<EFBFBD>c");
defer expected.decref();
try expect(expected.eq(res));
}
fn isWhitespace(codepoint: u21) bool {
// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
return switch (codepoint) {

View file

@ -328,7 +328,6 @@
## Currently, the only way to get seamless slices of strings is by calling certain `Str` functions which return them. In general, `Str` functions which accept a string and return a subset of that string tend to do this. [`Str.trim`](https://www.roc-lang.org/builtins/Str#trim) is another example of a function which returns a seamless slice.
module [
Utf8Problem,
Utf8ByteProblem,
concat,
is_empty,
join_with,
@ -337,6 +336,11 @@ module [
count_utf8_bytes,
to_utf8,
from_utf8,
from_utf16,
from_utf32,
from_utf8_lossy,
from_utf16_lossy,
from_utf32_lossy,
starts_with,
ends_with,
trim,
@ -376,7 +380,7 @@ import Result exposing [Result]
import List
import Num exposing [Num, U8, U16, U32, U64, U128, I8, I16, I32, I64, I128, F32, F64, Dec]
Utf8ByteProblem : [
Utf8Problem : [
InvalidStartByte,
UnexpectedEndOfSequence,
ExpectedContinuation,
@ -385,8 +389,6 @@ Utf8ByteProblem : [
EncodesSurrogateHalf,
]
Utf8Problem : { byte_index : U64, problem : Utf8ByteProblem }
## Returns [Bool.true] if the string is empty, and [Bool.false] otherwise.
## ```roc
## expect Str.is_empty("hi!") == Bool.false
@ -538,7 +540,7 @@ to_utf8 : Str -> List U8
## expect Str.from_utf8([]) == Ok("")
## expect Str.from_utf8([255]) |> Result.is_err
## ```
from_utf8 : List U8 -> Result Str [BadUtf8 { problem : Utf8ByteProblem, index : U64 }]
from_utf8 : List U8 -> Result Str [BadUtf8 { problem : Utf8Problem, index : U64 }]
from_utf8 = \bytes ->
result = from_utf8_lowlevel bytes
@ -557,11 +559,242 @@ FromUtf8Result : {
a_byte_index : U64,
b_string : Str,
c_is_ok : Bool,
d_problem_code : Utf8ByteProblem,
d_problem_code : Utf8Problem,
}
from_utf8_lowlevel : List U8 -> FromUtf8Result
## Converts a [List] of [U8] UTF-8 [code units](https://unicode.org/glossary/#code_unit) to a string.
## Any grouping of invalid byte sequences are replaced with a single unicode replacement character '<27>'.
##
## An invalid byte sequence is defined as
## - a 2-byte-sequence starting byte, followed by less than 1 continuation byte
## - a 3-byte-sequence starting byte, followed by less than 2 continuation bytes
## - a 4-byte-sequence starting byte, followed by less than 3 continuation bytes
## - an invalid codepoint from the surrogate pair block
## - an invalid codepoint greater than 0x110000 encoded as a 4-byte sequence
## - any valid codepoint encoded as an incorrect sequence, for instance a codepoint that should be a 2-byte sequence encoded as a 3- or 4-byte sequence
##
## ```roc
## expect (Str.from_utf8_lossy [82, 111, 99, 240, 159, 144, 166]) == "Roc🐦"
## expect (Str.from_utf8_lossy [82, 255, 99]) == "R<>c"
## expect (Str.from_utf8_lossy [82, 0xED, 0xA0, 0xBD, 99]) == "R<>c"
## ```
from_utf8_lossy : List U8 -> Str
expect (Str.from_utf8_lossy [82, 111, 99, 240, 159, 144, 166]) == "Roc🐦"
expect (Str.from_utf8_lossy [82, 255, 99]) == "R<>c"
expect (Str.from_utf8_lossy [82, 0xED, 0xA0, 0xBD, 99]) == "R<>c"
## Converts a [List] of [U16] UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string.
##
## ```roc
## expect Str.from_utf16([82, 111, 99]) == Ok("Roc")
## expect Str.from_utf16([0xb9a, 0xbbf]) == Ok("சி")
## expect Str.from_utf16([0xd83d, 0xdc26]) == Ok("🐦")
## expect Str.from_utf16([]) == Ok("")
## # unpaired surrogates, first and second halves
## expect Str.from_utf16([82, 0xd83d, 99]) |> Result.isErr
## expect Str.from_utf16([82, 0xdc96, 99]) |> Result.isErr
## ```
from_utf16 : List U16 -> Result Str [BadUtf16 { problem : Utf8Problem, index : U64 }]
from_utf16 = \codeunits ->
mk_err = \problem, index ->
Err(BadUtf16({ problem, index }))
step = \state, unit ->
c : U32
c = Num.int_cast(unit)
when state is
ExpectFirst(i, utf8) ->
if unit < 0xd800 then
when encode_utf8(utf8, c) is
Ok(utf8_next) -> ExpectFirst(i + 1, utf8_next)
Err(err) -> mk_err(err, i)
else
ExpectSecond(i, utf8, c)
ExpectSecond(i, utf8, first) ->
if unit < 0xdc00 then
mk_err(EncodesSurrogateHalf, i)
else
joined = ((first - 0xd800) * 0x400) + (c - 0xdc00) + 0x10000
when encode_utf8(utf8, joined) is
Ok(utf8_next) -> ExpectFirst(i + 2, utf8_next)
Err(err) -> mk_err(err, i)
Err(err) -> Err(err)
decode_res = List.walk(codeunits, ExpectFirst(0, []), step)
when decode_res is
ExpectFirst(_, utf8) ->
from_utf8(utf8)
|> Result.map_err(\BadUtf8(err) -> BadUtf16(err))
ExpectSecond(i, _, _) ->
mk_err(EncodesSurrogateHalf, i)
Err(err) -> Err(err)
expect Str.from_utf16([82, 111, 99]) == Ok("Roc")
expect Str.from_utf16([0xb9a, 0xbbf]) == Ok("சி")
expect Str.from_utf16([0xd83d, 0xdc26]) == Ok("🐦")
expect Str.from_utf16([]) == Ok("")
# unpaired surrogates, first and second halves
expect Str.from_utf16([82, 0xd83d, 99]) == Err(BadUtf16({ index: 1, problem: EncodesSurrogateHalf }))
expect Str.from_utf16([82, 0xdc96, 99]) == Err(BadUtf16({ index: 1, problem: EncodesSurrogateHalf }))
## Converts a [List] of [U16] UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string.
## Any unpaired surrogate code unit is replaced with a single unicode replacement character '<27>'.
##
## ```roc
## expect Str.from_utf16_lossy([82, 111, 99, 0xd83d, 0xdc26]) == "Roc🐦"
## expect Str.from_utf16_lossy([82, 0xdc96, 99]) == "R<>c"
## ```
from_utf16_lossy : List U16 -> Str
from_utf16_lossy = \codeunits ->
utf8_replacement = [0xef, 0xbf, 0xbd]
encode_lossy = \utf8, c ->
when encode_utf8(utf8, c) is
Ok(utf8_next) -> utf8_next
Err(_) -> List.concat(utf8, utf8_replacement)
step = \state, unit ->
c : U32
c = Num.int_cast(unit)
when state is
ExpectFirst(utf8) ->
if unit < 0xd800 then
ExpectFirst(encode_lossy(utf8, c))
else
ExpectSecond(utf8, c)
ExpectSecond(utf8, first) ->
if c < 0xd800 then
ExpectFirst(
List.concat(utf8, utf8_replacement)
|> encode_lossy(c),
)
else if c < 0xdc00 then
ExpectSecond(List.concat(utf8, utf8_replacement), c)
else
joined = ((first - 0xd800) * 0x400) + (c - 0xdc00) + 0x10000
ExpectFirst(encode_lossy(utf8, joined))
result = List.walk(codeunits, ExpectFirst([]), step)
when result is
ExpectFirst(utf8) -> from_utf8_lossy(utf8)
ExpectSecond(utf8, _) -> from_utf8_lossy(List.concat(utf8, utf8_replacement))
expect Str.from_utf16_lossy([82, 111, 99, 0xd83d, 0xdc26]) == "Roc🐦"
expect Str.from_utf16_lossy([82, 0xdc96, 99]) == "R<>c"
## Converts a [List] of [U32] UTF-32 [code units](https://unicode.org/glossary/#code_unit) to a string.
##
## ```roc
## expect Str.from_utf32([82, 111, 99]) == Ok("Roc")
## expect Str.from_utf32([0xb9a, 0xbbf]) == Ok("சி")
## expect Str.from_utf32([0x1f426]) == Ok("🐦")
## # unpaired surrogates, first and second halves
## expect Str.from_utf32([82, 0xd83d, 99]) |> Result.isErr
## expect Str.from_utf32([82, 0xdc96, 99]) |> Result.isErr
## # invalid codepoint
## expect Str.from_utf32([82, 0x110000, 99]) |> Result.isErr
## ```
from_utf32 : List U32 -> Result Str [BadUtf32 { problem : Utf8Problem, index : U64 }]
from_utf32 = \codepoints ->
step = \state, c ->
when state is
Ok({ i, utf8 }) ->
when encode_utf8(utf8, c) is
Ok(utf8_next) -> Ok({ i: i + 1, utf8: utf8_next })
Err(problem) -> Err(BadUtf32({ problem, index: i }))
Err(err) -> Err(err)
List.walk(codepoints, Ok({ i: 0, utf8: [] }), step)
|> Result.try(
\state ->
when from_utf8(state.utf8) is
Ok(str) -> Ok(str)
Err(BadUtf8(err)) -> Err(BadUtf32(err)),
)
encode_utf8 : List U8, U32 -> Result (List U8) [EncodesSurrogateHalf, CodepointTooLarge]
encode_utf8 = \list, c ->
if c < 0x80 then
Ok(List.append(list, Num.int_cast(c)))
else if c < 0x800 then
Ok(
List.concat(
list,
[
Num.int_cast(Num.bitwise_or(Num.shift_right_by(c, 6), 0b110_00000)),
Num.int_cast(Num.bitwise_or(Num.bitwise_and(c, 0b111111), 0b10_000000)),
],
),
)
else if c < 0x10000 then
if (c >= 0xd800) && (c < 0xe000) then
Err(EncodesSurrogateHalf)
else
Ok(
List.concat(
list,
[
Num.int_cast(Num.bitwise_or(Num.shift_right_by(c, 12), 0b1110_0000)),
Num.int_cast(Num.bitwise_or(Num.bitwise_and(Num.shift_right_by(c, 6), 0b111111), 0b10_000000)),
Num.int_cast(Num.bitwise_or(Num.bitwise_and(c, 0b111111), 0b10_000000)),
],
),
)
else if c < 0x110000 then
Ok(
List.concat(
list,
[
Num.int_cast(Num.bitwise_or(Num.shift_right_by(c, 18), 0b11110_000)),
Num.int_cast(Num.bitwise_or(Num.bitwise_and(Num.shift_right_by(c, 12), 0b111111), 0b10_000000)),
Num.int_cast(Num.bitwise_or(Num.bitwise_and(Num.shift_right_by(c, 6), 0b111111), 0b10_000000)),
Num.int_cast(Num.bitwise_or(Num.bitwise_and(c, 0b111111), 0b10_000000)),
],
),
)
else
Err(CodepointTooLarge)
expect Str.from_utf32([82, 111, 99]) == Ok("Roc")
expect Str.from_utf32([0xb9a, 0xbbf]) == Ok("சி")
expect Str.from_utf32([0x1f426]) == Ok("🐦")
expect Str.from_utf32([]) == Ok("")
# unpaired surrogates, first and second halves
expect Str.from_utf32([82, 0xd83d, 99]) |> Result.is_err
expect Str.from_utf32([82, 0xdc96, 99]) |> Result.is_err
# codepoint out of valid range
expect Str.from_utf32([82, 0x110000, 99]) |> Result.is_err
## Converts a [List] of [U32] UTF-32 [code units](https://unicode.org/glossary/#code_unit) to a string.
## Any invalid code points are replaced with a single unicode replacement character '<27>'.
## ```roc
## expect Str.from_utf32_lossy([82, 111, 99, 0x1f426]) == "Roc🐦"
## expect Str.from_utf32_lossy([82, 0x110000, 99]) == "R<>c"
## ```
from_utf32_lossy : List U32 -> Str
from_utf32_lossy = \codepoints ->
step = \utf8, c ->
when encode_utf8(utf8, c) is
Ok(utf8_next) -> utf8_next
# utf-8 encoded replacement character
Err(_) -> List.concat(utf8, [0xef, 0xbf, 0xbd])
List.walk(codepoints, [], step)
|> from_utf8_lossy()
expect Str.from_utf32_lossy([82, 111, 99, 0x1f426]) == "Roc🐦"
expect Str.from_utf32_lossy([82, 0x110000, 99]) == "R<>c"
## Check if the given [Str] starts with a value.
## ```roc
## expect Str.starts_with("ABC", "A") == Bool.true

View file

@ -348,6 +348,7 @@ pub const STR_EQUAL: &str = "roc_builtins.str.equal";
pub const STR_SUBSTRING_UNSAFE: &str = "roc_builtins.str.substring_unsafe";
pub const STR_TO_UTF8: &str = "roc_builtins.str.to_utf8";
pub const STR_FROM_UTF8: &str = "roc_builtins.str.from_utf8";
pub const STR_FROM_UTF8_LOSSY: &str = "roc_builtins.str.from_utf8_lossy";
pub const STR_REPEAT: &str = "roc_builtins.str.repeat";
pub const STR_TRIM: &str = "roc_builtins.str.trim";
pub const STR_TRIM_START: &str = "roc_builtins.str.trim_start";

View file

@ -119,6 +119,7 @@ map_symbol_to_lowlevel_and_arity! {
StrSplitOn; STR_SPLIT_ON; 2,
StrCountUtf8Bytes; STR_COUNT_UTF8_BYTES; 1,
StrFromUtf8; STR_FROM_UTF8_LOWLEVEL; 1,
StrFromUtf8Lossy; STR_FROM_UTF8_LOSSY; 1,
StrToUtf8; STR_TO_UTF8; 1,
StrRepeat; STR_REPEAT; 2,
StrTrim; STR_TRIM; 1,

View file

@ -1677,6 +1677,13 @@ trait Backend<'a> {
ret_layout,
)
}
LowLevel::StrFromUtf8Lossy => self.build_fn_call(
sym,
bitcode::STR_FROM_UTF8_LOSSY.to_string(),
args,
arg_layouts,
ret_layout,
),
LowLevel::StrRepeat => self.build_fn_call(
sym,
bitcode::STR_REPEAT.to_string(),

View file

@ -7,7 +7,6 @@ use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue, Str
use inkwell::{AddressSpace, IntPredicate};
use morphic_lib::UpdateMode;
use roc_builtins::bitcode;
use roc_module::symbol::Symbol;
use roc_mono::layout::{
Builtin, InLayout, Layout, LayoutIds, LayoutInterner, LayoutRepr, STLayoutInterner,
};
@ -17,7 +16,6 @@ use super::build::{
create_entry_block_alloca, load_roc_value, store_roc_value, use_roc_value, BuilderExt,
};
use super::convert::zig_list_type;
use super::scope::Scope;
use super::struct_::struct_from_fields;
fn call_list_bitcode_fn_1<'ctx>(
@ -29,20 +27,6 @@ fn call_list_bitcode_fn_1<'ctx>(
call_list_bitcode_fn(env, &[list], other_arguments, BitcodeReturns::List, fn_name)
}
pub(crate) fn list_symbol_to_c_abi<'a, 'ctx>(
env: &Env<'a, 'ctx, '_>,
scope: &Scope<'a, 'ctx>,
symbol: Symbol,
) -> PointerValue<'ctx> {
let list_type = zig_list_type(env);
let list_alloca = create_entry_block_alloca(env, list_type, "list_alloca");
let list = scope.load_symbol(&symbol);
env.builder.new_build_store(list_alloca, list);
list_alloca
}
pub(crate) fn pass_update_mode<'ctx>(
env: &Env<'_, 'ctx, '_>,
update_mode: UpdateMode,

View file

@ -1,30 +1,68 @@
use crate::llvm::build::Env;
use inkwell::values::{BasicValueEnum, PointerValue};
use roc_builtins::bitcode;
use roc_mono::layout::{InLayout, Layout, LayoutRepr, STLayoutInterner};
use super::bitcode::{call_str_bitcode_fn, BitcodeReturns};
use super::build::load_roc_value;
use super::bitcode::{
call_str_bitcode_fn, call_void_bitcode_fn, pass_list_or_string_to_zig_32bit,
pass_list_to_zig_64bit, pass_list_to_zig_wasm, BitcodeReturns,
};
use super::build::{create_entry_block_alloca, load_roc_value, Env};
use bumpalo::collections::Vec;
pub static CHAR_LAYOUT: InLayout = Layout::U8;
pub(crate) fn decode_from_utf8_result<'a, 'ctx>(
pub(crate) fn call_str_from_utf_bitcode_fn<'a, 'ctx>(
env: &Env<'a, 'ctx, '_>,
layout_interner: &STLayoutInterner<'a>,
pointer: PointerValue<'ctx>,
args: &[BasicValueEnum<'ctx>],
result_struct_name: &str,
fn_name: &str,
) -> BasicValueEnum<'ctx> {
let result_type = env.module.get_struct_type(result_struct_name).unwrap();
let result_ptr = create_entry_block_alloca(env, result_type, "alloca_from_utf_result");
// FromUtf8Result, FromUtf16Result, FromUtf32Result all have the same layout of
// - index: u64
// - string: RocStr
// - is_ok: bool
// - problem_code: u8
let layout =
LayoutRepr::Struct(
env.arena
.alloc([Layout::U64, Layout::STR, Layout::BOOL, Layout::U8]),
);
let list = args[0];
let argn = &args[1..];
let mut args: Vec<BasicValueEnum<'ctx>> = Vec::with_capacity_in(args.len() + 2, env.arena);
args.push(result_ptr.into());
use roc_target::Architecture::*;
match env.target.architecture() {
Aarch32 | X86_32 => {
let (a, b) = pass_list_or_string_to_zig_32bit(env, list.into_struct_value());
args.push(a.into());
args.push(b.into());
}
Aarch64 | X86_64 => {
let list = pass_list_to_zig_64bit(env, list);
args.push(list.into());
}
Wasm32 => {
let list = pass_list_to_zig_wasm(env, list);
args.push(list.into());
}
};
args.extend(argn);
call_void_bitcode_fn(env, &args, fn_name);
load_roc_value(
env,
layout_interner,
layout,
pointer,
"load_decode_from_utf8_result",
result_ptr,
"load_from_utf_result",
)
}

View file

@ -37,9 +37,9 @@ use crate::llvm::{
build_list::{
list_append_unsafe, list_clone, list_concat, list_drop_at, list_get_unsafe, list_len_usize,
list_prepend, list_release_excess_capacity, list_replace_unsafe, list_reserve,
list_sort_with, list_sublist, list_swap, list_symbol_to_c_abi, list_with_capacity,
pass_update_mode,
list_sort_with, list_sublist, list_swap, list_with_capacity, pass_update_mode,
},
build_str::call_str_from_utf_bitcode_fn,
compare::{generic_eq, generic_neq},
convert::{
self, argument_type_from_layout, basic_type_from_layout, zig_num_parse_result_type,
@ -396,46 +396,15 @@ pub(crate) fn run_low_level<'a, 'ctx>(
)
}
StrFromUtf8 => {
let result_type = env.module.get_struct_type("str.FromUtf8Result").unwrap();
let result_ptr =
create_entry_block_alloca(env, result_type, "alloca_utf8_validate_bytes_result");
use roc_target::Architecture::*;
match env.target.architecture() {
Aarch32 | X86_32 => {
arguments!(list);
let (a, b) = pass_list_or_string_to_zig_32bit(env, list.into_struct_value());
call_void_bitcode_fn(
env,
&[
result_ptr.into(),
a.into(),
b.into(),
pass_update_mode(env, update_mode),
],
bitcode::STR_FROM_UTF8,
);
}
Aarch64 | X86_64 | Wasm32 => {
arguments!(_list);
// we use the symbol here instead
let list = args[0];
call_void_bitcode_fn(
env,
&[
result_ptr.into(),
list_symbol_to_c_abi(env, scope, list).into(),
pass_update_mode(env, update_mode),
],
bitcode::STR_FROM_UTF8,
);
}
}
crate::llvm::build_str::decode_from_utf8_result(env, layout_interner, result_ptr)
// Str.from_utf8_lowlevel : List U8 -> FromUtf8Result
arguments!(list);
call_str_from_utf_bitcode_fn(
env,
layout_interner,
&[list, pass_update_mode(env, update_mode)],
"str.FromUtf8Result",
bitcode::STR_FROM_UTF8,
)
}
StrToUtf8 => {
// Str.fromInt : Str -> List U8
@ -449,6 +418,16 @@ pub(crate) fn run_low_level<'a, 'ctx>(
bitcode::STR_TO_UTF8,
)
}
StrFromUtf8Lossy => {
arguments!(list);
call_list_bitcode_fn(
env,
&[list.into_struct_value()],
&[],
BitcodeReturns::Str,
bitcode::STR_FROM_UTF8_LOSSY,
)
}
StrRepeat => {
// Str.repeat : Str, U64 -> Str
arguments!(string, count);

View file

@ -245,6 +245,7 @@ impl<'a> LowLevelCall<'a> {
backend.code_builder.i32_const(UPDATE_MODE_IMMUTABLE);
backend.call_host_fn_after_loading_args(bitcode::STR_FROM_UTF8);
}
StrFromUtf8Lossy => self.load_args_and_call_zig(backend, bitcode::STR_FROM_UTF8_LOSSY),
StrTrimStart => self.load_args_and_call_zig(backend, bitcode::STR_TRIM_START),
StrTrimEnd => self.load_args_and_call_zig(backend, bitcode::STR_TRIM_END),
StrToUtf8 => self.load_args_and_call_zig(backend, bitcode::STR_TO_UTF8),

View file

@ -14,6 +14,7 @@ pub enum LowLevel {
StrCountUtf8Bytes,
StrFromInt,
StrFromUtf8,
StrFromUtf8Lossy,
StrToUtf8,
StrRepeat,
StrFromFloat,
@ -256,6 +257,7 @@ map_symbol_to_lowlevel! {
StrSplitOn <= STR_SPLIT_ON;
StrCountUtf8Bytes <= STR_COUNT_UTF8_BYTES;
StrFromUtf8 <= STR_FROM_UTF8_LOWLEVEL;
StrFromUtf8Lossy <= STR_FROM_UTF8_LOSSY;
StrToUtf8 <= STR_TO_UTF8;
StrRepeat <= STR_REPEAT;
StrTrim <= STR_TRIM;

View file

@ -1377,8 +1377,8 @@ define_builtins! {
7 STR_STARTS_WITH: "starts_with"
8 STR_ENDS_WITH: "ends_with"
9 STR_FROM_UTF8: "from_utf8"
10 STR_UT8_PROBLEM: "Utf8Problem" // the Utf8Problem type alias
11 STR_UT8_BYTE_PROBLEM: "Utf8ByteProblem" // the Utf8ByteProblem type alias
10 STR_FROM_UTF8_LOSSY: "from_utf8_lossy"
11 STR_UTF8_BYTE_PROBLEM: "Utf8Problem"
12 STR_TO_UTF8: "to_utf8"
13 STR_WALK_UTF8: "walk_utf8"
14 STR_ALIAS_ANALYSIS_STATIC: "#aliasAnalysisStatic" // string with the static lifetime
@ -1418,6 +1418,10 @@ define_builtins! {
48 STR_RELEASE_EXCESS_CAPACITY: "release_excess_capacity"
49 STR_DROP_PREFIX: "drop_prefix"
50 STR_DROP_SUFFIX: "drop_suffix"
51 STR_FROM_UTF16: "from_utf16"
52 STR_FROM_UTF16_LOSSY: "from_utf16_lossy"
53 STR_FROM_UTF32: "from_utf32"
54 STR_FROM_UTF32_LOSSY: "from_utf32_lossy"
}
6 LIST: "List" => {
0 LIST_LIST: "List" exposed_apply_type=true // the List.List type alias

View file

@ -1603,6 +1603,7 @@ fn low_level_no_rc(lowlevel: &LowLevel) -> RC {
DictPseudoSeed => RC::NoRc,
StrStartsWith | StrEndsWith => RC::NoRc,
StrFromUtf8 => RC::Rc,
StrFromUtf8Lossy => RC::Rc,
StrToUtf8 => RC::Rc,
StrRepeat => RC::NoRc,
StrFromInt | StrFromFloat => RC::NoRc,

View file

@ -1302,6 +1302,7 @@ pub(crate) fn lowlevel_borrow_signature(op: LowLevel) -> &'static [Ownership] {
| NumF64FromParts => &[IRRELEVANT],
StrStartsWith | StrEndsWith => &[BORROWED, BORROWED],
StrFromUtf8 => &[OWNED],
StrFromUtf8Lossy => &[BORROWED],
StrToUtf8 => &[OWNED],
StrRepeat => &[BORROWED, IRRELEVANT],
StrFromInt | StrFromFloat => &[IRRELEVANT],

View file

@ -165,7 +165,7 @@ mod solve_expr {
Str.from_utf8
"
),
"List U8 -> Result Str [BadUtf8 { index : U64, problem : Utf8ByteProblem }]",
"List U8 -> Result Str [BadUtf8 { index : U64, problem : Utf8Problem }]",
);
}

View file

@ -805,6 +805,164 @@ fn str_from_utf8_fail_surrogate_half() {
);
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf8_lossy_expected_continuation() {
assert_evals_to!(
r#"Str.from_utf8_lossy [97, 98, 0xC2, 99]"#,
roc_std::RocStr::from("ab<EFBFBD>c"),
roc_std::RocStr
);
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf16() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf16 [0x72, 0x6f, 0x63] is
Ok val -> val
_ -> ""
"#
),
roc_std::RocStr::from("roc"),
roc_std::RocStr
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf16_emoji() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf16 [0x72, 0xd83d, 0xdc96, 0x63] is
Ok val -> val
_ -> ""
"#
),
roc_std::RocStr::from("r💖c"),
roc_std::RocStr
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf16_err_expected_second_surrogate_half() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf16 [0x72, 0xd83d, 0x63] is
Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
_ -> 42
"#
),
1u64,
u64
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf16_err_unexpected_second_surrogate_half() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf16 [0x72, 0xdc96, 0x63] is
Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
_ -> 42
"#
),
1u64,
u64
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf16_lossy() {
assert_evals_to!(
r#"Str.from_utf16_lossy [0x72, 0xdc96, 0x63]"#,
roc_std::RocStr::from("r<EFBFBD>c"),
roc_std::RocStr
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf32() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf32 [0x72, 0x6f, 0x63] is
Ok val -> val
_ -> ""
"#
),
roc_std::RocStr::from("roc"),
roc_std::RocStr
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf32_emoji() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf32 [0x72, 0x1f496, 0x63] is
Ok val -> val
_ -> ""
"#
),
roc_std::RocStr::from("r💖c"),
roc_std::RocStr
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf32_err_codepoint_too_large() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf32 [0x72, 0x123456, 0x63] is
Err (BadUtf32 {problem: CodepointTooLarge, index: index }) -> index
_ -> 42
"#
),
1u64,
u64
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf32_err_utf8_cannot_encode_surrogate_half() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf32 [0x72, 0xd83d, 0x63] is
Err (BadUtf32 {problem: EncodesSurrogateHalf, index: index }) -> index
_ -> 42
"#
),
1u64,
u64
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf32_lossy() {
assert_evals_to!(
r#"Str.from_utf32_lossy [0x72, 0x123456, 0x63]"#,
roc_std::RocStr::from("r<EFBFBD>c"),
roc_std::RocStr
)
}
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_equality() {

View file

@ -630,6 +630,156 @@ fn str_from_utf8_fail_surrogate_half() {
);
}
#[test]
fn str_from_utf8_lossy_expected_continuation() {
assert_evals_to!(
r#"Str.from_utf8_lossy [97, 98, 0xC2, 99]"#,
roc_std::RocStr::from("ab<EFBFBD>c"),
roc_std::RocStr
);
}
#[test]
fn str_from_utf16() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf16 [0x72, 0x6f, 0x63] is
Ok val -> val
_ -> ""
"#
),
roc_std::RocStr::from("roc"),
roc_std::RocStr
)
}
// Marking this as should_panic, because it *does* panic and it is not clear why?
// If some change magically fixes this, great, remove the should_panic attribute.
#[test]
#[should_panic(expected = r#"Roc failed with message: "Integer multiplication overflowed!"#)]
fn str_from_utf16_emoji() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf16 [0x72, 0xd83d, 0xdc96, 0x63] is
Ok val -> val
_ -> ""
"#
),
roc_std::RocStr::from("r💖c"),
roc_std::RocStr
)
}
#[test]
fn str_from_utf16_err_expected_second_surrogate_half() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf16 [0x72, 0xd83d, 0x63] is
Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
_ -> 42
"#
),
1u64,
u64
)
}
#[test]
fn str_from_utf16_err_unexpected_second_surrogate_half() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf16 [0x72, 0xdc96, 0x63] is
Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
_ -> 42
"#
),
1u64,
u64
)
}
#[test]
fn str_from_utf16_lossy() {
assert_evals_to!(
r#"Str.from_utf16_lossy [0x72, 0xdc96, 0x63]"#,
roc_std::RocStr::from("r<EFBFBD>c"),
roc_std::RocStr
)
}
#[test]
fn str_from_utf32() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf32 [0x72, 0x6f, 0x63] is
Ok val -> val
_ -> ""
"#
),
roc_std::RocStr::from("roc"),
roc_std::RocStr
)
}
#[test]
fn str_from_utf32_emoji() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf32 [0x72, 0x1f496, 0x63] is
Ok val -> val
_ -> ""
"#
),
roc_std::RocStr::from("r💖c"),
roc_std::RocStr
)
}
#[test]
fn str_from_utf32_err_codepoint_too_large() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf32 [0x72, 0x123456, 0x63] is
Err (BadUtf32 {problem: CodepointTooLarge, index: index }) -> index
_ -> 42
"#
),
1u64,
u64
)
}
#[test]
fn str_from_utf32_err_utf8_cannot_encode_surrogate_half() {
assert_evals_to!(
indoc!(
r#"
when Str.from_utf32 [0x72, 0xd83d, 0x63] is
Err (BadUtf32 {problem: EncodesSurrogateHalf, index: index }) -> index
_ -> 42
"#
),
1u64,
u64
)
}
#[test]
fn str_from_utf32_lossy() {
assert_evals_to!(
r#"Str.from_utf32_lossy [0x72, 0x123456, 0x63]"#,
roc_std::RocStr::from("r<EFBFBD>c"),
roc_std::RocStr
)
}
#[test]
fn str_equality() {
assert_evals_to!(r#""a" == "a""#, true, bool);