mirror of
https://github.com/roc-lang/roc.git
synced 2025-08-04 20:28:02 +00:00
add fromUtf16 and 32, as well as lossy variants
This commit is contained in:
parent
032f1cc5a4
commit
414fecd14d
17 changed files with 805 additions and 74 deletions
|
@ -203,6 +203,7 @@ comptime {
|
|||
exportStrFn(str.reserveC, "reserve");
|
||||
exportStrFn(str.strToUtf8C, "to_utf8");
|
||||
exportStrFn(str.fromUtf8C, "from_utf8");
|
||||
exportStrFn(str.fromUtf8Lossy, "from_utf8_lossy");
|
||||
exportStrFn(str.repeatC, "repeat");
|
||||
exportStrFn(str.strTrim, "trim");
|
||||
exportStrFn(str.strTrimStart, "trim_start");
|
||||
|
|
|
@ -1449,6 +1449,105 @@ pub fn fromUtf8C(
|
|||
return fromUtf8(list, update_mode);
|
||||
}
|
||||
|
||||
const UNICODE_REPLACEMENT: u21 = 0xfffd;
|
||||
|
||||
/// Walks the bytes of a `RocList` of `u8` as UTF-8, yielding one code point
/// per call and substituting U+FFFD for every malformed sequence.
const Utf8Iterator = struct {
    bytes: []u8,
    i: usize,

    /// Views the list's bytes as a slice (no copy) and starts at index 0.
    pub fn init(list: RocList) Utf8Iterator {
        const bytes = @as([*]u8, @ptrCast(list.bytes))[0..list.length];
        return Utf8Iterator{
            .bytes = bytes,
            .i = 0,
        };
    }

    /// Returns the next code point, or `null` once every byte is consumed.
    /// Malformed input yields `UNICODE_REPLACEMENT` and only consumes the
    /// bytes that belong to the bad sequence, so the byte that broke the
    /// sequence is decoded again on the next call.
    pub fn nextLossy(it: *Utf8Iterator) ?u32 {
        if (it.bytes.len <= it.i) {
            return null;
        }

        const rest = it.bytes[it.i..];
        const n = unicode.utf8ByteSequenceLength(rest[0]) catch {
            // invalid start byte
            it.i += 1;
            return UNICODE_REPLACEMENT;
        };

        for (1..n) |i| {
            if (rest.len == i) {
                // unexpected end
                it.i += i;
                return UNICODE_REPLACEMENT;
            }
            if ((rest[i] & 0xC0) != 0x80) {
                // expected continuation byte (0b10xx_xxxx, i.e. 0x80...0xBF);
                // leave this byte in place so it starts the next sequence
                it.i += i;
                return UNICODE_REPLACEMENT;
            }
        }

        it.i += n;
        // Structurally complete sequences can still be invalid (overlong,
        // surrogate half, too large); those become a single replacement.
        return unicode.utf8Decode(rest[0..n]) catch {
            return UNICODE_REPLACEMENT;
        };
    }

    /// Rewinds to the first byte so the input can be walked again.
    pub fn reset(it: *Utf8Iterator) void {
        it.i = 0;
    }
};
|
||||
|
||||
fn codepointSeqLengthLossy(c: u32) u3 {
|
||||
if (c < 0x110000) {
|
||||
if (unicode.utf8CodepointSequenceLength(@intCast(c))) |n| {
|
||||
return n;
|
||||
} else |_| {
|
||||
// fallthrough
|
||||
}
|
||||
}
|
||||
return unicode.utf8CodepointSequenceLength(UNICODE_REPLACEMENT) catch unreachable;
|
||||
}
|
||||
|
||||
fn utf8EncodeLossy(c: u32, out: []u8) u3 {
|
||||
if (c < 0x110000) {
|
||||
if (unicode.utf8Encode(@intCast(c), out)) |n| {
|
||||
return n;
|
||||
} else |_| {
|
||||
// fallthrough
|
||||
}
|
||||
}
|
||||
return unicode.utf8Encode(UNICODE_REPLACEMENT, out) catch unreachable;
|
||||
}
|
||||
|
||||
pub fn fromUtf8Lossy(
|
||||
list: RocList,
|
||||
) callconv(.C) RocStr {
|
||||
if (list.len() == 0) {
|
||||
return RocStr.empty();
|
||||
}
|
||||
|
||||
// PERF: we could try to reuse the input list if it's already valid utf-8, similar to fromUtf8
|
||||
|
||||
var it = Utf8Iterator.init(list);
|
||||
|
||||
var enc_len: usize = 0;
|
||||
while (it.nextLossy()) |c| {
|
||||
enc_len += codepointSeqLengthLossy(c);
|
||||
}
|
||||
|
||||
var str = RocStr.allocate(enc_len);
|
||||
const ptr = str.asU8ptrMut()[0..enc_len];
|
||||
var end_index: usize = 0;
|
||||
it.reset();
|
||||
while (it.nextLossy()) |c| {
|
||||
end_index += utf8EncodeLossy(c, ptr[end_index..]);
|
||||
}
|
||||
str.setLen(end_index);
|
||||
return str;
|
||||
}
|
||||
|
||||
pub fn fromUtf8(
|
||||
list: RocList,
|
||||
update_mode: UpdateMode,
|
||||
|
@ -1667,6 +1766,17 @@ test "validateUtf8Bytes: unicode ∆ in middle of array" {
|
|||
try expectOk(str_result);
|
||||
}
|
||||
|
||||
test "fromUtf8Lossy: ascii, emoji" {
|
||||
var list = RocList.fromSlice(u8, "r💖c", false);
|
||||
defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);
|
||||
|
||||
const res = fromUtf8Lossy(list);
|
||||
defer res.decref();
|
||||
const expected = RocStr.fromSlice("r💖c");
|
||||
defer expected.decref();
|
||||
try expect(expected.eq(res));
|
||||
}
|
||||
|
||||
fn expectErr(list: RocList, index: usize, err: Utf8DecodeError, problem: Utf8ByteProblem) !void {
|
||||
const str_ptr = @as([*]u8, @ptrCast(list.bytes));
|
||||
const len = list.length;
|
||||
|
@ -1765,6 +1875,66 @@ test "validateUtf8Bytes: surrogate halves" {
|
|||
try expectErr(list, 3, error.Utf8EncodesSurrogateHalf, Utf8ByteProblem.EncodesSurrogateHalf);
|
||||
}
|
||||
|
||||
test "fromUtf8Lossy: invalid start byte" {
    // 0x80 is a continuation byte and can never start a sequence.
    var list = RocList.fromSlice(u8, "r\x80c", false);
    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);

    const res = fromUtf8Lossy(list);
    defer res.decref();
    // \u{FFFD} is the Unicode replacement character.
    const expected = RocStr.fromSlice("r\u{FFFD}c");
    defer expected.decref();
    try expect(expected.eq(res));
}
|
||||
|
||||
test "fromUtf8Lossy: overlong encoding" {
    // A complete 4-byte sequence (U+1F496) followed by one extra
    // continuation byte; only the extra byte is replaced.
    var list = RocList.fromSlice(u8, "r\xF0\x9F\x92\x96\x80c", false);
    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);

    const res = fromUtf8Lossy(list);
    defer res.decref();
    // \u{FFFD} is the Unicode replacement character.
    const expected = RocStr.fromSlice("r💖\u{FFFD}c");
    defer expected.decref();
    try expect(expected.eq(res));
}
|
||||
|
||||
test "fromUtf8Lossy: expected continuation" {
    // 0xCF opens a 2-byte sequence but is followed by ASCII 'c'.
    var list = RocList.fromSlice(u8, "r\xCFc", false);
    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);

    const res = fromUtf8Lossy(list);
    defer res.decref();
    // \u{FFFD} is the Unicode replacement character.
    const expected = RocStr.fromSlice("r\u{FFFD}c");
    defer expected.decref();
    try expect(expected.eq(res));
}
|
||||
|
||||
test "fromUtf8Lossy: unexpected end" {
    // 0xCF opens a 2-byte sequence but the input ends right after it.
    var list = RocList.fromSlice(u8, "r\xCF", false);
    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);

    const res = fromUtf8Lossy(list);
    defer res.decref();
    // \u{FFFD} is the Unicode replacement character.
    const expected = RocStr.fromSlice("r\u{FFFD}");
    defer expected.decref();
    try expect(expected.eq(res));
}
|
||||
|
||||
test "fromUtf8Lossy: encodes surrogate" {
    // 0xd83d == 0b1101_1000_0011_1101
    //             wwww xxxx yyyy zzzz
    // becomes 0b1110_1101 0b10_1000_00 0b10_11_1101
    //           1110_wwww   10_xxxx_yy   10_yy_zzzz
    //         0xED        0xA0         0xBD
    var list = RocList.fromSlice(u8, "r\xED\xA0\xBDc", false);
    defer list.decref(@alignOf(u8), @sizeOf(u8), false, rcNone);

    const res = fromUtf8Lossy(list);
    defer res.decref();
    // The three bytes are structurally valid but decode to a surrogate half,
    // so they collapse into one replacement character (\u{FFFD}).
    const expected = RocStr.fromSlice("r\u{FFFD}c");
    defer expected.decref();
    try expect(expected.eq(res));
}
|
||||
|
||||
fn isWhitespace(codepoint: u21) bool {
|
||||
// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
|
||||
return switch (codepoint) {
|
||||
|
|
|
@ -328,7 +328,6 @@
|
|||
## Currently, the only way to get seamless slices of strings is by calling certain `Str` functions which return them. In general, `Str` functions which accept a string and return a subset of that string tend to do this. [`Str.trim`](https://www.roc-lang.org/builtins/Str#trim) is another example of a function which returns a seamless slice.
|
||||
module [
|
||||
Utf8Problem,
|
||||
Utf8ByteProblem,
|
||||
concat,
|
||||
is_empty,
|
||||
join_with,
|
||||
|
@ -337,6 +336,11 @@ module [
|
|||
count_utf8_bytes,
|
||||
to_utf8,
|
||||
from_utf8,
|
||||
from_utf16,
|
||||
from_utf32,
|
||||
from_utf8_lossy,
|
||||
from_utf16_lossy,
|
||||
from_utf32_lossy,
|
||||
starts_with,
|
||||
ends_with,
|
||||
trim,
|
||||
|
@ -376,7 +380,7 @@ import Result exposing [Result]
|
|||
import List
|
||||
import Num exposing [Num, U8, U16, U32, U64, U128, I8, I16, I32, I64, I128, F32, F64, Dec]
|
||||
|
||||
Utf8ByteProblem : [
|
||||
Utf8Problem : [
|
||||
InvalidStartByte,
|
||||
UnexpectedEndOfSequence,
|
||||
ExpectedContinuation,
|
||||
|
@ -385,8 +389,6 @@ Utf8ByteProblem : [
|
|||
EncodesSurrogateHalf,
|
||||
]
|
||||
|
||||
Utf8Problem : { byte_index : U64, problem : Utf8ByteProblem }
|
||||
|
||||
## Returns [Bool.true] if the string is empty, and [Bool.false] otherwise.
|
||||
## ```roc
|
||||
## expect Str.is_empty("hi!") == Bool.false
|
||||
|
@ -538,7 +540,7 @@ to_utf8 : Str -> List U8
|
|||
## expect Str.from_utf8([]) == Ok("")
|
||||
## expect Str.from_utf8([255]) |> Result.is_err
|
||||
## ```
|
||||
from_utf8 : List U8 -> Result Str [BadUtf8 { problem : Utf8ByteProblem, index : U64 }]
|
||||
from_utf8 : List U8 -> Result Str [BadUtf8 { problem : Utf8Problem, index : U64 }]
|
||||
from_utf8 = \bytes ->
|
||||
result = from_utf8_lowlevel bytes
|
||||
|
||||
|
@ -557,11 +559,242 @@ FromUtf8Result : {
|
|||
a_byte_index : U64,
|
||||
b_string : Str,
|
||||
c_is_ok : Bool,
|
||||
d_problem_code : Utf8ByteProblem,
|
||||
d_problem_code : Utf8Problem,
|
||||
}
|
||||
|
||||
from_utf8_lowlevel : List U8 -> FromUtf8Result
|
||||
|
||||
## Converts a [List] of [U8] UTF-8 [code units](https://unicode.org/glossary/#code_unit) to a string.
|
||||
## Any grouping of invalid byte sequences are replaced with a single unicode replacement character '<27>'.
|
||||
##
|
||||
## An invalid byte sequence is defined as
|
||||
## - a 2-byte-sequence starting byte, followed by less than 1 continuation byte
|
||||
## - a 3-byte-sequence starting byte, followed by less than 2 continuation bytes
|
||||
## - a 4-byte-sequence starting byte, followed by less than 3 continuation bytes
|
||||
## - an invalid codepoint from the surrogate pair block
|
||||
## - an invalid codepoint greater than 0x10FFFF encoded as a 4-byte sequence
|
||||
## - any valid codepoint encoded as an incorrect sequence, for instance a codepoint that should be a 2-byte sequence encoded as a 3- or 4-byte sequence
|
||||
##
|
||||
## ```roc
|
||||
## expect (Str.from_utf8_lossy [82, 111, 99, 240, 159, 144, 166]) == "Roc🐦"
|
||||
## expect (Str.from_utf8_lossy [82, 255, 99]) == "R<>c"
|
||||
## expect (Str.from_utf8_lossy [82, 0xED, 0xA0, 0xBD, 99]) == "R<>c"
|
||||
## ```
|
||||
from_utf8_lossy : List U8 -> Str
|
||||
|
||||
expect (Str.from_utf8_lossy [82, 111, 99, 240, 159, 144, 166]) == "Roc🐦"
|
||||
expect (Str.from_utf8_lossy [82, 255, 99]) == "R<>c"
|
||||
expect (Str.from_utf8_lossy [82, 0xED, 0xA0, 0xBD, 99]) == "R<>c"
|
||||
|
||||
## Converts a [List] of [U16] UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string.
|
||||
##
|
||||
## ```roc
|
||||
## expect Str.from_utf16([82, 111, 99]) == Ok("Roc")
|
||||
## expect Str.from_utf16([0xb9a, 0xbbf]) == Ok("சி")
|
||||
## expect Str.from_utf16([0xd83d, 0xdc26]) == Ok("🐦")
|
||||
## expect Str.from_utf16([]) == Ok("")
|
||||
## # unpaired surrogates, first and second halves
|
||||
## expect Str.from_utf16([82, 0xd83d, 99]) |> Result.is_err
|
||||
## expect Str.from_utf16([82, 0xdc96, 99]) |> Result.is_err
|
||||
## ```
|
||||
from_utf16 : List U16 -> Result Str [BadUtf16 { problem : Utf8Problem, index : U64 }]
from_utf16 = \codeunits ->
    mk_err = \problem, index ->
        Err(BadUtf16({ problem, index }))

    # The walk state is one of:
    #   ExpectFirst(i, utf8)         - ready for a new code point; `i` is the index of the next unit
    #   ExpectSecond(i, utf8, first) - saw high surrogate `first` at index `i`, need a low surrogate
    #   Err(..)                      - a problem was found; later units are ignored
    step = \state, unit ->
        c : U32
        c = Num.int_cast(unit)
        when state is
            ExpectFirst(i, utf8) ->
                if (c < 0xd800) || (c >= 0xe000) then
                    # Outside the surrogate block: the unit is a code point by itself.
                    when encode_utf8(utf8, c) is
                        Ok(utf8_next) -> ExpectFirst(i + 1, utf8_next)
                        Err(err) -> mk_err(err, i)
                else if c < 0xdc00 then
                    # High surrogate (0xd800..0xdbff): wait for its partner.
                    ExpectSecond(i, utf8, c)
                else
                    # Low surrogate (0xdc00..0xdfff) without a preceding high surrogate.
                    mk_err(EncodesSurrogateHalf, i)

            ExpectSecond(i, utf8, first) ->
                if (c >= 0xdc00) && (c < 0xe000) then
                    joined = ((first - 0xd800) * 0x400) + (c - 0xdc00) + 0x10000
                    when encode_utf8(utf8, joined) is
                        Ok(utf8_next) -> ExpectFirst(i + 2, utf8_next)
                        Err(err) -> mk_err(err, i)
                else
                    # Anything but a low surrogate leaves the high surrogate at `i` unpaired.
                    mk_err(EncodesSurrogateHalf, i)

            Err(err) -> Err(err)

    decode_res = List.walk(codeunits, ExpectFirst(0, []), step)

    when decode_res is
        ExpectFirst(_, utf8) ->
            from_utf8(utf8)
            |> Result.map_err(\BadUtf8(err) -> BadUtf16(err))

        ExpectSecond(i, _, _) ->
            # Input ended right after a high surrogate.
            mk_err(EncodesSurrogateHalf, i)

        Err(err) -> Err(err)

expect Str.from_utf16([82, 111, 99]) == Ok("Roc")
expect Str.from_utf16([0xb9a, 0xbbf]) == Ok("சி")
expect Str.from_utf16([0xd83d, 0xdc26]) == Ok("🐦")
expect Str.from_utf16([]) == Ok("")
# code units above the surrogate block (here U+FF01) are ordinary code points
expect Str.from_utf16([0xff01]) == Ok("！")
# unpaired surrogates, first and second halves
expect Str.from_utf16([82, 0xd83d, 99]) == Err(BadUtf16({ index: 1, problem: EncodesSurrogateHalf }))
expect Str.from_utf16([82, 0xdc96, 99]) == Err(BadUtf16({ index: 1, problem: EncodesSurrogateHalf }))
# a high surrogate followed by a non-surrogate above the block is still unpaired
expect Str.from_utf16([0xd83d, 0xff01]) == Err(BadUtf16({ index: 0, problem: EncodesSurrogateHalf }))
|
||||
|
||||
## Converts a [List] of [U16] UTF-16 (little-endian) [code units](https://unicode.org/glossary/#code_unit) to a string.
|
||||
## Any unpaired surrogate code unit is replaced with a single unicode replacement character '<27>'.
|
||||
##
|
||||
## ```roc
|
||||
## expect Str.from_utf16_lossy([82, 111, 99, 0xd83d, 0xdc26]) == "Roc🐦"
|
||||
## expect Str.from_utf16_lossy([82, 0xdc96, 99]) == "R<>c"
|
||||
## ```
|
||||
from_utf16_lossy : List U16 -> Str
|
||||
from_utf16_lossy = \codeunits ->
|
||||
utf8_replacement = [0xef, 0xbf, 0xbd]
|
||||
encode_lossy = \utf8, c ->
|
||||
when encode_utf8(utf8, c) is
|
||||
Ok(utf8_next) -> utf8_next
|
||||
Err(_) -> List.concat(utf8, utf8_replacement)
|
||||
|
||||
step = \state, unit ->
|
||||
c : U32
|
||||
c = Num.int_cast(unit)
|
||||
when state is
|
||||
ExpectFirst(utf8) ->
|
||||
if unit < 0xd800 then
|
||||
ExpectFirst(encode_lossy(utf8, c))
|
||||
else
|
||||
ExpectSecond(utf8, c)
|
||||
|
||||
ExpectSecond(utf8, first) ->
|
||||
if c < 0xd800 then
|
||||
ExpectFirst(
|
||||
List.concat(utf8, utf8_replacement)
|
||||
|> encode_lossy(c),
|
||||
)
|
||||
else if c < 0xdc00 then
|
||||
ExpectSecond(List.concat(utf8, utf8_replacement), c)
|
||||
else
|
||||
joined = ((first - 0xd800) * 0x400) + (c - 0xdc00) + 0x10000
|
||||
ExpectFirst(encode_lossy(utf8, joined))
|
||||
|
||||
result = List.walk(codeunits, ExpectFirst([]), step)
|
||||
when result is
|
||||
ExpectFirst(utf8) -> from_utf8_lossy(utf8)
|
||||
ExpectSecond(utf8, _) -> from_utf8_lossy(List.concat(utf8, utf8_replacement))
|
||||
|
||||
expect Str.from_utf16_lossy([82, 111, 99, 0xd83d, 0xdc26]) == "Roc🐦"
|
||||
expect Str.from_utf16_lossy([82, 0xdc96, 99]) == "R<>c"
|
||||
|
||||
## Converts a [List] of [U32] UTF-32 [code units](https://unicode.org/glossary/#code_unit) to a string.
|
||||
##
|
||||
## ```roc
|
||||
## expect Str.from_utf32([82, 111, 99]) == Ok("Roc")
|
||||
## expect Str.from_utf32([0xb9a, 0xbbf]) == Ok("சி")
|
||||
## expect Str.from_utf32([0x1f426]) == Ok("🐦")
|
||||
## # unpaired surrogates, first and second halves
|
||||
## expect Str.from_utf32([82, 0xd83d, 99]) |> Result.is_err
|
||||
## expect Str.from_utf32([82, 0xdc96, 99]) |> Result.is_err
|
||||
## # invalid codepoint
|
||||
## expect Str.from_utf32([82, 0x110000, 99]) |> Result.is_err
|
||||
## ```
|
||||
|
||||
from_utf32 : List U32 -> Result Str [BadUtf32 { problem : Utf8Problem, index : U64 }]
|
||||
from_utf32 = \codepoints ->
|
||||
step = \state, c ->
|
||||
when state is
|
||||
Ok({ i, utf8 }) ->
|
||||
when encode_utf8(utf8, c) is
|
||||
Ok(utf8_next) -> Ok({ i: i + 1, utf8: utf8_next })
|
||||
Err(problem) -> Err(BadUtf32({ problem, index: i }))
|
||||
|
||||
Err(err) -> Err(err)
|
||||
|
||||
List.walk(codepoints, Ok({ i: 0, utf8: [] }), step)
|
||||
|> Result.try(
|
||||
\state ->
|
||||
when from_utf8(state.utf8) is
|
||||
Ok(str) -> Ok(str)
|
||||
Err(BadUtf8(err)) -> Err(BadUtf32(err)),
|
||||
)
|
||||
|
||||
encode_utf8 : List U8, U32 -> Result (List U8) [EncodesSurrogateHalf, CodepointTooLarge]
|
||||
encode_utf8 = \list, c ->
|
||||
if c < 0x80 then
|
||||
Ok(List.append(list, Num.int_cast(c)))
|
||||
else if c < 0x800 then
|
||||
Ok(
|
||||
List.concat(
|
||||
list,
|
||||
[
|
||||
Num.int_cast(Num.bitwise_or(Num.shift_right_by(c, 6), 0b110_00000)),
|
||||
Num.int_cast(Num.bitwise_or(Num.bitwise_and(c, 0b111111), 0b10_000000)),
|
||||
],
|
||||
),
|
||||
)
|
||||
else if c < 0x10000 then
|
||||
if (c >= 0xd800) && (c < 0xe000) then
|
||||
Err(EncodesSurrogateHalf)
|
||||
else
|
||||
Ok(
|
||||
List.concat(
|
||||
list,
|
||||
[
|
||||
Num.int_cast(Num.bitwise_or(Num.shift_right_by(c, 12), 0b1110_0000)),
|
||||
Num.int_cast(Num.bitwise_or(Num.bitwise_and(Num.shift_right_by(c, 6), 0b111111), 0b10_000000)),
|
||||
Num.int_cast(Num.bitwise_or(Num.bitwise_and(c, 0b111111), 0b10_000000)),
|
||||
],
|
||||
),
|
||||
)
|
||||
else if c < 0x110000 then
|
||||
Ok(
|
||||
List.concat(
|
||||
list,
|
||||
[
|
||||
Num.int_cast(Num.bitwise_or(Num.shift_right_by(c, 18), 0b11110_000)),
|
||||
Num.int_cast(Num.bitwise_or(Num.bitwise_and(Num.shift_right_by(c, 12), 0b111111), 0b10_000000)),
|
||||
Num.int_cast(Num.bitwise_or(Num.bitwise_and(Num.shift_right_by(c, 6), 0b111111), 0b10_000000)),
|
||||
Num.int_cast(Num.bitwise_or(Num.bitwise_and(c, 0b111111), 0b10_000000)),
|
||||
],
|
||||
),
|
||||
)
|
||||
else
|
||||
Err(CodepointTooLarge)
|
||||
|
||||
expect Str.from_utf32([82, 111, 99]) == Ok("Roc")
|
||||
expect Str.from_utf32([0xb9a, 0xbbf]) == Ok("சி")
|
||||
expect Str.from_utf32([0x1f426]) == Ok("🐦")
|
||||
expect Str.from_utf32([]) == Ok("")
|
||||
# unpaired surrogates, first and second halves
|
||||
expect Str.from_utf32([82, 0xd83d, 99]) |> Result.is_err
|
||||
expect Str.from_utf32([82, 0xdc96, 99]) |> Result.is_err
|
||||
# codepoint out of valid range
|
||||
expect Str.from_utf32([82, 0x110000, 99]) |> Result.is_err
|
||||
|
||||
## Converts a [List] of [U32] UTF-32 [code units](https://unicode.org/glossary/#code_unit) to a string.
|
||||
## Any invalid code points are replaced with a single unicode replacement character '<27>'.
|
||||
## ```roc
|
||||
## expect Str.from_utf32_lossy([82, 111, 99, 0x1f426]) == "Roc🐦"
|
||||
## expect Str.from_utf32_lossy([82, 0x110000, 99]) == "R<>c"
|
||||
## ```
|
||||
from_utf32_lossy : List U32 -> Str
|
||||
from_utf32_lossy = \codepoints ->
|
||||
step = \utf8, c ->
|
||||
when encode_utf8(utf8, c) is
|
||||
Ok(utf8_next) -> utf8_next
|
||||
# utf-8 encoded replacement character
|
||||
Err(_) -> List.concat(utf8, [0xef, 0xbf, 0xbd])
|
||||
|
||||
List.walk(codepoints, [], step)
|
||||
|> from_utf8_lossy()
|
||||
|
||||
expect Str.from_utf32_lossy([82, 111, 99, 0x1f426]) == "Roc🐦"
|
||||
expect Str.from_utf32_lossy([82, 0x110000, 99]) == "R<>c"
|
||||
|
||||
## Check if the given [Str] starts with a value.
|
||||
## ```roc
|
||||
## expect Str.starts_with("ABC", "A") == Bool.true
|
||||
|
|
|
@ -348,6 +348,7 @@ pub const STR_EQUAL: &str = "roc_builtins.str.equal";
|
|||
pub const STR_SUBSTRING_UNSAFE: &str = "roc_builtins.str.substring_unsafe";
|
||||
pub const STR_TO_UTF8: &str = "roc_builtins.str.to_utf8";
|
||||
pub const STR_FROM_UTF8: &str = "roc_builtins.str.from_utf8";
|
||||
pub const STR_FROM_UTF8_LOSSY: &str = "roc_builtins.str.from_utf8_lossy";
|
||||
pub const STR_REPEAT: &str = "roc_builtins.str.repeat";
|
||||
pub const STR_TRIM: &str = "roc_builtins.str.trim";
|
||||
pub const STR_TRIM_START: &str = "roc_builtins.str.trim_start";
|
||||
|
|
|
@ -119,6 +119,7 @@ map_symbol_to_lowlevel_and_arity! {
|
|||
StrSplitOn; STR_SPLIT_ON; 2,
|
||||
StrCountUtf8Bytes; STR_COUNT_UTF8_BYTES; 1,
|
||||
StrFromUtf8; STR_FROM_UTF8_LOWLEVEL; 1,
|
||||
StrFromUtf8Lossy; STR_FROM_UTF8_LOSSY; 1,
|
||||
StrToUtf8; STR_TO_UTF8; 1,
|
||||
StrRepeat; STR_REPEAT; 2,
|
||||
StrTrim; STR_TRIM; 1,
|
||||
|
|
|
@ -1677,6 +1677,13 @@ trait Backend<'a> {
|
|||
ret_layout,
|
||||
)
|
||||
}
|
||||
LowLevel::StrFromUtf8Lossy => self.build_fn_call(
|
||||
sym,
|
||||
bitcode::STR_FROM_UTF8_LOSSY.to_string(),
|
||||
args,
|
||||
arg_layouts,
|
||||
ret_layout,
|
||||
),
|
||||
LowLevel::StrRepeat => self.build_fn_call(
|
||||
sym,
|
||||
bitcode::STR_REPEAT.to_string(),
|
||||
|
|
|
@ -7,7 +7,6 @@ use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue, Str
|
|||
use inkwell::{AddressSpace, IntPredicate};
|
||||
use morphic_lib::UpdateMode;
|
||||
use roc_builtins::bitcode;
|
||||
use roc_module::symbol::Symbol;
|
||||
use roc_mono::layout::{
|
||||
Builtin, InLayout, Layout, LayoutIds, LayoutInterner, LayoutRepr, STLayoutInterner,
|
||||
};
|
||||
|
@ -17,7 +16,6 @@ use super::build::{
|
|||
create_entry_block_alloca, load_roc_value, store_roc_value, use_roc_value, BuilderExt,
|
||||
};
|
||||
use super::convert::zig_list_type;
|
||||
use super::scope::Scope;
|
||||
use super::struct_::struct_from_fields;
|
||||
|
||||
fn call_list_bitcode_fn_1<'ctx>(
|
||||
|
@ -29,20 +27,6 @@ fn call_list_bitcode_fn_1<'ctx>(
|
|||
call_list_bitcode_fn(env, &[list], other_arguments, BitcodeReturns::List, fn_name)
|
||||
}
|
||||
|
||||
pub(crate) fn list_symbol_to_c_abi<'a, 'ctx>(
|
||||
env: &Env<'a, 'ctx, '_>,
|
||||
scope: &Scope<'a, 'ctx>,
|
||||
symbol: Symbol,
|
||||
) -> PointerValue<'ctx> {
|
||||
let list_type = zig_list_type(env);
|
||||
let list_alloca = create_entry_block_alloca(env, list_type, "list_alloca");
|
||||
|
||||
let list = scope.load_symbol(&symbol);
|
||||
env.builder.new_build_store(list_alloca, list);
|
||||
|
||||
list_alloca
|
||||
}
|
||||
|
||||
pub(crate) fn pass_update_mode<'ctx>(
|
||||
env: &Env<'_, 'ctx, '_>,
|
||||
update_mode: UpdateMode,
|
||||
|
|
|
@ -1,30 +1,68 @@
|
|||
use crate::llvm::build::Env;
|
||||
use inkwell::values::{BasicValueEnum, PointerValue};
|
||||
use roc_builtins::bitcode;
|
||||
use roc_mono::layout::{InLayout, Layout, LayoutRepr, STLayoutInterner};
|
||||
|
||||
use super::bitcode::{call_str_bitcode_fn, BitcodeReturns};
|
||||
use super::build::load_roc_value;
|
||||
use super::bitcode::{
|
||||
call_str_bitcode_fn, call_void_bitcode_fn, pass_list_or_string_to_zig_32bit,
|
||||
pass_list_to_zig_64bit, pass_list_to_zig_wasm, BitcodeReturns,
|
||||
};
|
||||
use super::build::{create_entry_block_alloca, load_roc_value, Env};
|
||||
use bumpalo::collections::Vec;
|
||||
|
||||
pub static CHAR_LAYOUT: InLayout = Layout::U8;
|
||||
|
||||
pub(crate) fn decode_from_utf8_result<'a, 'ctx>(
|
||||
pub(crate) fn call_str_from_utf_bitcode_fn<'a, 'ctx>(
|
||||
env: &Env<'a, 'ctx, '_>,
|
||||
layout_interner: &STLayoutInterner<'a>,
|
||||
pointer: PointerValue<'ctx>,
|
||||
args: &[BasicValueEnum<'ctx>],
|
||||
result_struct_name: &str,
|
||||
fn_name: &str,
|
||||
) -> BasicValueEnum<'ctx> {
|
||||
let result_type = env.module.get_struct_type(result_struct_name).unwrap();
|
||||
let result_ptr = create_entry_block_alloca(env, result_type, "alloca_from_utf_result");
|
||||
// FromUtf8Result, FromUtf16Result, FromUtf32Result all have the same layout of
|
||||
// - index: u64
|
||||
// - string: RocStr
|
||||
// - is_ok: bool
|
||||
// - problem_code: u8
|
||||
let layout =
|
||||
LayoutRepr::Struct(
|
||||
env.arena
|
||||
.alloc([Layout::U64, Layout::STR, Layout::BOOL, Layout::U8]),
|
||||
);
|
||||
|
||||
let list = args[0];
|
||||
let argn = &args[1..];
|
||||
let mut args: Vec<BasicValueEnum<'ctx>> = Vec::with_capacity_in(args.len() + 2, env.arena);
|
||||
args.push(result_ptr.into());
|
||||
|
||||
use roc_target::Architecture::*;
|
||||
match env.target.architecture() {
|
||||
Aarch32 | X86_32 => {
|
||||
let (a, b) = pass_list_or_string_to_zig_32bit(env, list.into_struct_value());
|
||||
args.push(a.into());
|
||||
args.push(b.into());
|
||||
}
|
||||
Aarch64 | X86_64 => {
|
||||
let list = pass_list_to_zig_64bit(env, list);
|
||||
args.push(list.into());
|
||||
}
|
||||
Wasm32 => {
|
||||
let list = pass_list_to_zig_wasm(env, list);
|
||||
args.push(list.into());
|
||||
}
|
||||
};
|
||||
|
||||
args.extend(argn);
|
||||
|
||||
call_void_bitcode_fn(env, &args, fn_name);
|
||||
|
||||
load_roc_value(
|
||||
env,
|
||||
layout_interner,
|
||||
layout,
|
||||
pointer,
|
||||
"load_decode_from_utf8_result",
|
||||
result_ptr,
|
||||
"load_from_utf_result",
|
||||
)
|
||||
}
|
||||
|
||||
|
|
|
@ -37,9 +37,9 @@ use crate::llvm::{
|
|||
build_list::{
|
||||
list_append_unsafe, list_clone, list_concat, list_drop_at, list_get_unsafe, list_len_usize,
|
||||
list_prepend, list_release_excess_capacity, list_replace_unsafe, list_reserve,
|
||||
list_sort_with, list_sublist, list_swap, list_symbol_to_c_abi, list_with_capacity,
|
||||
pass_update_mode,
|
||||
list_sort_with, list_sublist, list_swap, list_with_capacity, pass_update_mode,
|
||||
},
|
||||
build_str::call_str_from_utf_bitcode_fn,
|
||||
compare::{generic_eq, generic_neq},
|
||||
convert::{
|
||||
self, argument_type_from_layout, basic_type_from_layout, zig_num_parse_result_type,
|
||||
|
@ -396,46 +396,15 @@ pub(crate) fn run_low_level<'a, 'ctx>(
|
|||
)
|
||||
}
|
||||
StrFromUtf8 => {
|
||||
let result_type = env.module.get_struct_type("str.FromUtf8Result").unwrap();
|
||||
let result_ptr =
|
||||
create_entry_block_alloca(env, result_type, "alloca_utf8_validate_bytes_result");
|
||||
|
||||
use roc_target::Architecture::*;
|
||||
match env.target.architecture() {
|
||||
Aarch32 | X86_32 => {
|
||||
arguments!(list);
|
||||
let (a, b) = pass_list_or_string_to_zig_32bit(env, list.into_struct_value());
|
||||
|
||||
call_void_bitcode_fn(
|
||||
env,
|
||||
&[
|
||||
result_ptr.into(),
|
||||
a.into(),
|
||||
b.into(),
|
||||
pass_update_mode(env, update_mode),
|
||||
],
|
||||
bitcode::STR_FROM_UTF8,
|
||||
);
|
||||
}
|
||||
Aarch64 | X86_64 | Wasm32 => {
|
||||
arguments!(_list);
|
||||
|
||||
// we use the symbol here instead
|
||||
let list = args[0];
|
||||
|
||||
call_void_bitcode_fn(
|
||||
env,
|
||||
&[
|
||||
result_ptr.into(),
|
||||
list_symbol_to_c_abi(env, scope, list).into(),
|
||||
pass_update_mode(env, update_mode),
|
||||
],
|
||||
bitcode::STR_FROM_UTF8,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
crate::llvm::build_str::decode_from_utf8_result(env, layout_interner, result_ptr)
|
||||
// Str.from_utf8_lowlevel : List U8 -> FromUtf8Result
|
||||
arguments!(list);
|
||||
call_str_from_utf_bitcode_fn(
|
||||
env,
|
||||
layout_interner,
|
||||
&[list, pass_update_mode(env, update_mode)],
|
||||
"str.FromUtf8Result",
|
||||
bitcode::STR_FROM_UTF8,
|
||||
)
|
||||
}
|
||||
StrToUtf8 => {
|
||||
// Str.to_utf8 : Str -> List U8
|
||||
|
@ -449,6 +418,16 @@ pub(crate) fn run_low_level<'a, 'ctx>(
|
|||
bitcode::STR_TO_UTF8,
|
||||
)
|
||||
}
|
||||
StrFromUtf8Lossy => {
|
||||
arguments!(list);
|
||||
call_list_bitcode_fn(
|
||||
env,
|
||||
&[list.into_struct_value()],
|
||||
&[],
|
||||
BitcodeReturns::Str,
|
||||
bitcode::STR_FROM_UTF8_LOSSY,
|
||||
)
|
||||
}
|
||||
StrRepeat => {
|
||||
// Str.repeat : Str, U64 -> Str
|
||||
arguments!(string, count);
|
||||
|
|
|
@ -245,6 +245,7 @@ impl<'a> LowLevelCall<'a> {
|
|||
backend.code_builder.i32_const(UPDATE_MODE_IMMUTABLE);
|
||||
backend.call_host_fn_after_loading_args(bitcode::STR_FROM_UTF8);
|
||||
}
|
||||
StrFromUtf8Lossy => self.load_args_and_call_zig(backend, bitcode::STR_FROM_UTF8_LOSSY),
|
||||
StrTrimStart => self.load_args_and_call_zig(backend, bitcode::STR_TRIM_START),
|
||||
StrTrimEnd => self.load_args_and_call_zig(backend, bitcode::STR_TRIM_END),
|
||||
StrToUtf8 => self.load_args_and_call_zig(backend, bitcode::STR_TO_UTF8),
|
||||
|
|
|
@ -14,6 +14,7 @@ pub enum LowLevel {
|
|||
StrCountUtf8Bytes,
|
||||
StrFromInt,
|
||||
StrFromUtf8,
|
||||
StrFromUtf8Lossy,
|
||||
StrToUtf8,
|
||||
StrRepeat,
|
||||
StrFromFloat,
|
||||
|
@ -256,6 +257,7 @@ map_symbol_to_lowlevel! {
|
|||
StrSplitOn <= STR_SPLIT_ON;
|
||||
StrCountUtf8Bytes <= STR_COUNT_UTF8_BYTES;
|
||||
StrFromUtf8 <= STR_FROM_UTF8_LOWLEVEL;
|
||||
StrFromUtf8Lossy <= STR_FROM_UTF8_LOSSY;
|
||||
StrToUtf8 <= STR_TO_UTF8;
|
||||
StrRepeat <= STR_REPEAT;
|
||||
StrTrim <= STR_TRIM;
|
||||
|
|
|
@ -1377,8 +1377,8 @@ define_builtins! {
|
|||
7 STR_STARTS_WITH: "starts_with"
|
||||
8 STR_ENDS_WITH: "ends_with"
|
||||
9 STR_FROM_UTF8: "from_utf8"
|
||||
10 STR_UT8_PROBLEM: "Utf8Problem" // the Utf8Problem type alias
|
||||
11 STR_UT8_BYTE_PROBLEM: "Utf8ByteProblem" // the Utf8ByteProblem type alias
|
||||
10 STR_FROM_UTF8_LOSSY: "from_utf8_lossy"
|
||||
11 STR_UTF8_BYTE_PROBLEM: "Utf8Problem"
|
||||
12 STR_TO_UTF8: "to_utf8"
|
||||
13 STR_WALK_UTF8: "walk_utf8"
|
||||
14 STR_ALIAS_ANALYSIS_STATIC: "#aliasAnalysisStatic" // string with the static lifetime
|
||||
|
@ -1418,6 +1418,10 @@ define_builtins! {
|
|||
48 STR_RELEASE_EXCESS_CAPACITY: "release_excess_capacity"
|
||||
49 STR_DROP_PREFIX: "drop_prefix"
|
||||
50 STR_DROP_SUFFIX: "drop_suffix"
|
||||
51 STR_FROM_UTF16: "from_utf16"
|
||||
52 STR_FROM_UTF16_LOSSY: "from_utf16_lossy"
|
||||
53 STR_FROM_UTF32: "from_utf32"
|
||||
54 STR_FROM_UTF32_LOSSY: "from_utf32_lossy"
|
||||
}
|
||||
6 LIST: "List" => {
|
||||
0 LIST_LIST: "List" exposed_apply_type=true // the List.List type alias
|
||||
|
|
|
@ -1603,6 +1603,7 @@ fn low_level_no_rc(lowlevel: &LowLevel) -> RC {
|
|||
DictPseudoSeed => RC::NoRc,
|
||||
StrStartsWith | StrEndsWith => RC::NoRc,
|
||||
StrFromUtf8 => RC::Rc,
|
||||
StrFromUtf8Lossy => RC::Rc,
|
||||
StrToUtf8 => RC::Rc,
|
||||
StrRepeat => RC::NoRc,
|
||||
StrFromInt | StrFromFloat => RC::NoRc,
|
||||
|
|
|
@ -1302,6 +1302,7 @@ pub(crate) fn lowlevel_borrow_signature(op: LowLevel) -> &'static [Ownership] {
|
|||
| NumF64FromParts => &[IRRELEVANT],
|
||||
StrStartsWith | StrEndsWith => &[BORROWED, BORROWED],
|
||||
StrFromUtf8 => &[OWNED],
|
||||
StrFromUtf8Lossy => &[BORROWED],
|
||||
StrToUtf8 => &[OWNED],
|
||||
StrRepeat => &[BORROWED, IRRELEVANT],
|
||||
StrFromInt | StrFromFloat => &[IRRELEVANT],
|
||||
|
|
|
@ -165,7 +165,7 @@ mod solve_expr {
|
|||
Str.from_utf8
|
||||
"
|
||||
),
|
||||
"List U8 -> Result Str [BadUtf8 { index : U64, problem : Utf8ByteProblem }]",
|
||||
"List U8 -> Result Str [BadUtf8 { index : U64, problem : Utf8Problem }]",
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
@ -805,6 +805,164 @@ fn str_from_utf8_fail_surrogate_half() {
|
|||
);
|
||||
}
|
||||
|
||||
/// `0xC2` opens a 2-byte sequence but is followed by ASCII `c`, so it is
/// replaced with U+FFFD and the `c` is kept.
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf8_lossy_expected_continuation() {
    assert_evals_to!(
        r#"Str.from_utf8_lossy [97, 98, 0xC2, 99]"#,
        roc_std::RocStr::from("ab\u{FFFD}c"),
        roc_std::RocStr
    );
}
|
||||
|
||||
#[test]
|
||||
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
|
||||
fn str_from_utf16() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.from_utf16 [0x72, 0x6f, 0x63] is
|
||||
Ok val -> val
|
||||
_ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from("roc"),
|
||||
roc_std::RocStr
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
|
||||
fn str_from_utf16_emoji() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.from_utf16 [0x72, 0xd83d, 0xdc96, 0x63] is
|
||||
Ok val -> val
|
||||
_ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from("r💖c"),
|
||||
roc_std::RocStr
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
|
||||
fn str_from_utf16_err_expected_second_surrogate_half() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.from_utf16 [0x72, 0xd83d, 0x63] is
|
||||
Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
|
||||
_ -> 42
|
||||
"#
|
||||
),
|
||||
1u64,
|
||||
u64
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
|
||||
fn str_from_utf16_err_unexpected_second_surrogate_half() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.from_utf16 [0x72, 0xdc96, 0x63] is
|
||||
Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
|
||||
_ -> 42
|
||||
"#
|
||||
),
|
||||
1u64,
|
||||
u64
|
||||
)
|
||||
}
|
||||
|
||||
/// A lone low surrogate (`0xdc96`) is replaced with U+FFFD.
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf16_lossy() {
    assert_evals_to!(
        r#"Str.from_utf16_lossy [0x72, 0xdc96, 0x63]"#,
        roc_std::RocStr::from("r\u{FFFD}c"),
        roc_std::RocStr
    )
}
|
||||
|
||||
#[test]
|
||||
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
|
||||
fn str_from_utf32() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
when Str.from_utf32 [0x72, 0x6f, 0x63] is
|
||||
Ok val -> val
|
||||
_ -> ""
|
||||
"#
|
||||
),
|
||||
roc_std::RocStr::from("roc"),
|
||||
roc_std::RocStr
|
||||
)
|
||||
}
|
||||
|
||||
// A code point outside the Basic Multilingual Plane (0x1f496, 💖) must be
// decoded from UTF-32 into the resulting string. Any `Err` result falls
// through to "" and fails the comparison.
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf32_emoji() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf32 [0x72, 0x1f496, 0x63] is
                Ok val -> val
                _ -> ""
            "#
        ),
        roc_std::RocStr::from("r💖c"),
        roc_std::RocStr
    )
}
|
||||
|
||||
// 0x123456 is above the largest Unicode code point (0x10FFFF), so decoding
// must fail with `CodepointTooLarge` at index 1. Any other outcome takes the
// `_ -> 42` branch and fails the comparison.
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf32_err_codepoint_too_large() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf32 [0x72, 0x123456, 0x63] is
                Err (BadUtf32 {problem: CodepointTooLarge, index: index }) -> index
                _ -> 42
            "#
        ),
        1u64,
        u64
    )
}
|
||||
|
||||
// 0xd83d lies in the surrogate range and is therefore not a valid scalar
// value; decoding must fail with `EncodesSurrogateHalf` at index 1. Any other
// outcome takes the `_ -> 42` branch and fails the comparison.
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf32_err_utf8_cannot_encode_surrogate_half() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf32 [0x72, 0xd83d, 0x63] is
                Err (BadUtf32 {problem: EncodesSurrogateHalf, index: index }) -> index
                _ -> 42
            "#
        ),
        1u64,
        u64
    )
}
|
||||
|
||||
// The lossy decoder must replace the out-of-range value 0x123456 with
// U+FFFD (REPLACEMENT CHARACTER) and keep the surrounding 'r' and 'c'.
#[test]
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
fn str_from_utf32_lossy() {
    assert_evals_to!(
        r#"Str.from_utf32_lossy [0x72, 0x123456, 0x63]"#,
        // Written as an escape so the expectation survives encoding-unaware tooling.
        roc_std::RocStr::from("r\u{fffd}c"),
        roc_std::RocStr
    )
}
|
||||
|
||||
#[test]
|
||||
#[cfg(any(feature = "gen-llvm", feature = "gen-dev"))]
|
||||
fn str_equality() {
|
||||
|
|
|
@ -630,6 +630,156 @@ fn str_from_utf8_fail_surrogate_half() {
|
|||
);
|
||||
}
|
||||
|
||||
// 0xC2 opens a two-byte UTF-8 sequence, but the next byte (99, 'c') is not a
// continuation byte. The lossy decoder must emit U+FFFD for the broken
// sequence and then decode 'c' normally.
#[test]
fn str_from_utf8_lossy_expected_continuation() {
    assert_evals_to!(
        r#"Str.from_utf8_lossy [97, 98, 0xC2, 99]"#,
        // Written as an escape so the expectation survives encoding-unaware tooling.
        roc_std::RocStr::from("ab\u{fffd}c"),
        roc_std::RocStr
    );
}
|
||||
|
||||
// Three ASCII code units given as UTF-16 must decode to "roc". Any `Err`
// result falls through to "" and fails the comparison.
#[test]
fn str_from_utf16() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf16 [0x72, 0x6f, 0x63] is
                Ok val -> val
                _ -> ""
            "#
        ),
        roc_std::RocStr::from("roc"),
        roc_std::RocStr
    )
}
|
||||
|
||||
// A UTF-16 surrogate pair (0xd83d, 0xdc96) should decode to U+1F496 (💖).
//
// This is marked `should_panic` because the test currently *does* panic with
// an integer-multiplication overflow, and the cause is not yet understood.
// If a later change makes the test pass, remove the `should_panic` attribute.
#[test]
#[should_panic(expected = r#"Roc failed with message: "Integer multiplication overflowed!"#)]
fn str_from_utf16_emoji() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf16 [0x72, 0xd83d, 0xdc96, 0x63] is
                Ok val -> val
                _ -> ""
            "#
        ),
        roc_std::RocStr::from("r💖c"),
        roc_std::RocStr
    )
}
|
||||
|
||||
// A high surrogate (0xd83d) followed by an ordinary code unit (0x63) instead
// of a low surrogate must be reported as `EncodesSurrogateHalf` at index 1.
// Any other outcome takes the `_ -> 42` branch and fails the comparison.
#[test]
fn str_from_utf16_err_expected_second_surrogate_half() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf16 [0x72, 0xd83d, 0x63] is
                Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
                _ -> 42
            "#
        ),
        1u64,
        u64
    )
}
|
||||
|
||||
// A low surrogate (0xdc96) that is not preceded by a high surrogate must be
// reported as `EncodesSurrogateHalf` at index 1. Any other outcome takes the
// `_ -> 42` branch and fails the comparison.
#[test]
fn str_from_utf16_err_unexpected_second_surrogate_half() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf16 [0x72, 0xdc96, 0x63] is
                Err (BadUtf16 {problem: EncodesSurrogateHalf, index: index }) -> index
                _ -> 42
            "#
        ),
        1u64,
        u64
    )
}
|
||||
|
||||
// The lossy decoder must replace the unpaired low surrogate 0xdc96 with
// U+FFFD (REPLACEMENT CHARACTER) and keep the surrounding 'r' and 'c'.
#[test]
fn str_from_utf16_lossy() {
    assert_evals_to!(
        r#"Str.from_utf16_lossy [0x72, 0xdc96, 0x63]"#,
        // Written as an escape so the expectation survives encoding-unaware tooling.
        roc_std::RocStr::from("r\u{fffd}c"),
        roc_std::RocStr
    )
}
|
||||
|
||||
// Three ASCII code points given as UTF-32 must decode to "roc". Any `Err`
// result falls through to "" and fails the comparison.
#[test]
fn str_from_utf32() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf32 [0x72, 0x6f, 0x63] is
                Ok val -> val
                _ -> ""
            "#
        ),
        roc_std::RocStr::from("roc"),
        roc_std::RocStr
    )
}
|
||||
|
||||
// A code point outside the Basic Multilingual Plane (0x1f496, 💖) must be
// decoded from UTF-32 into the resulting string. Any `Err` result falls
// through to "" and fails the comparison.
#[test]
fn str_from_utf32_emoji() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf32 [0x72, 0x1f496, 0x63] is
                Ok val -> val
                _ -> ""
            "#
        ),
        roc_std::RocStr::from("r💖c"),
        roc_std::RocStr
    )
}
|
||||
|
||||
// 0x123456 is above the largest Unicode code point (0x10FFFF), so decoding
// must fail with `CodepointTooLarge` at index 1. Any other outcome takes the
// `_ -> 42` branch and fails the comparison.
#[test]
fn str_from_utf32_err_codepoint_too_large() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf32 [0x72, 0x123456, 0x63] is
                Err (BadUtf32 {problem: CodepointTooLarge, index: index }) -> index
                _ -> 42
            "#
        ),
        1u64,
        u64
    )
}
|
||||
|
||||
// 0xd83d lies in the surrogate range and is therefore not a valid scalar
// value; decoding must fail with `EncodesSurrogateHalf` at index 1. Any other
// outcome takes the `_ -> 42` branch and fails the comparison.
#[test]
fn str_from_utf32_err_utf8_cannot_encode_surrogate_half() {
    assert_evals_to!(
        indoc!(
            r#"
            when Str.from_utf32 [0x72, 0xd83d, 0x63] is
                Err (BadUtf32 {problem: EncodesSurrogateHalf, index: index }) -> index
                _ -> 42
            "#
        ),
        1u64,
        u64
    )
}
|
||||
|
||||
// The lossy decoder must replace the out-of-range value 0x123456 with
// U+FFFD (REPLACEMENT CHARACTER) and keep the surrounding 'r' and 'c'.
#[test]
fn str_from_utf32_lossy() {
    assert_evals_to!(
        r#"Str.from_utf32_lossy [0x72, 0x123456, 0x63]"#,
        // Written as an escape so the expectation survives encoding-unaware tooling.
        roc_std::RocStr::from("r\u{fffd}c"),
        roc_std::RocStr
    )
}
|
||||
|
||||
#[test]
|
||||
fn str_equality() {
|
||||
assert_evals_to!(r#""a" == "a""#, true, bool);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue