mirror of
https://github.com/roc-lang/roc.git
synced 2025-09-26 13:29:12 +00:00
Remove scalars and graphemes from Str
This commit is contained in:
parent
65738acb26
commit
b48fa0698a
26 changed files with 140 additions and 12721 deletions
|
@ -1,6 +1,5 @@
|
|||
const utils = @import("utils.zig");
|
||||
const RocList = @import("list.zig").RocList;
|
||||
const grapheme = @import("helpers/grapheme.zig");
|
||||
const UpdateMode = utils.UpdateMode;
|
||||
const std = @import("std");
|
||||
const mem = std.mem;
|
||||
|
@ -552,242 +551,6 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
|
|||
return string.len();
|
||||
}
|
||||
|
||||
// Str.toScalars
|
||||
pub fn strToScalarsC(str: RocStr) callconv(.C) RocList {
|
||||
return @call(.always_inline, strToScalars, .{str});
|
||||
}
|
||||
|
||||
fn strToScalars(string: RocStr) callconv(.C) RocList {
|
||||
const len = string.len();
|
||||
|
||||
if (len == 0) {
|
||||
return RocList.empty();
|
||||
}
|
||||
|
||||
var capacity = len;
|
||||
|
||||
if (!string.isSmallStr()) {
|
||||
capacity = string.getCapacity();
|
||||
}
|
||||
|
||||
// For purposes of preallocation, assume the number of code points is the same
|
||||
// as the number of bytes. This might be longer than necessary, but definitely
|
||||
// should not require a second allocation.
|
||||
var answer = RocList.allocate(@alignOf(u32), capacity, @sizeOf(u32));
|
||||
|
||||
// `orelse unreachable` is fine here, because we already did an early
|
||||
// return to verify the string was nonempty.
|
||||
var answer_elems = answer.elements(u32) orelse unreachable;
|
||||
var src_index: usize = 0;
|
||||
var answer_index: usize = 0;
|
||||
|
||||
while (src_index < len) {
|
||||
src_index += writeNextScalar(string, src_index, answer_elems, answer_index);
|
||||
answer_index += 1;
|
||||
}
|
||||
|
||||
answer.length = answer_index;
|
||||
|
||||
return answer;
|
||||
}
|
||||
|
||||
// Given a non-empty RocStr, and a src_index byte index into that string,
|
||||
// and a destination [*]u32, and an index into that destination,
|
||||
// Parses the next scalar value out of the string (at the given byte index),
|
||||
// writes it into the destination, and returns the number of bytes parsed.
|
||||
inline fn writeNextScalar(non_empty_string: RocStr, src_index: usize, dest: [*]u32, dest_index: usize) usize {
|
||||
const utf8_byte = non_empty_string.getUnchecked(src_index);
|
||||
|
||||
// How UTF-8 bytes work:
|
||||
// https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
|
||||
if (utf8_byte <= 127) {
|
||||
// It's an ASCII character. Copy it over directly.
|
||||
dest[dest_index] = @as(u32, @intCast(utf8_byte));
|
||||
|
||||
return 1;
|
||||
} else if (utf8_byte >> 5 == 0b0000_0110) {
|
||||
// Its three high order bits are 110, so this is a two-byte sequence.
|
||||
|
||||
// Example:
|
||||
// utf-8: 1100 1111 1011 0001
|
||||
// code pt: 0000 0011 1111 0001 (decimal: 1009)
|
||||
|
||||
// Discard the first byte's high order bits of 110.
|
||||
var code_pt = @as(u32, @intCast(utf8_byte & 0b0001_1111));
|
||||
|
||||
// Discard the second byte's high order bits of 10.
|
||||
code_pt <<= 6;
|
||||
code_pt |= non_empty_string.getUnchecked(src_index + 1) & 0b0011_1111;
|
||||
|
||||
dest[dest_index] = code_pt;
|
||||
|
||||
return 2;
|
||||
} else if (utf8_byte >> 4 == 0b0000_1110) {
|
||||
// Its four high order bits are 1110, so this is a three-byte sequence.
|
||||
|
||||
// Discard the first byte's high order bits of 1110.
|
||||
var code_pt = @as(u32, @intCast(utf8_byte & 0b0000_1111));
|
||||
|
||||
// Discard the second byte's high order bits of 10.
|
||||
code_pt <<= 6;
|
||||
code_pt |= non_empty_string.getUnchecked(src_index + 1) & 0b0011_1111;
|
||||
|
||||
// Discard the third byte's high order bits of 10 (same as second byte).
|
||||
code_pt <<= 6;
|
||||
code_pt |= non_empty_string.getUnchecked(src_index + 2) & 0b0011_1111;
|
||||
|
||||
dest[dest_index] = code_pt;
|
||||
|
||||
return 3;
|
||||
} else {
|
||||
// This must be a four-byte sequence, so the five high order bits should be 11110.
|
||||
|
||||
// Discard the first byte's high order bits of 11110.
|
||||
var code_pt = @as(u32, @intCast(utf8_byte & 0b0000_0111));
|
||||
|
||||
// Discard the second byte's high order bits of 10.
|
||||
code_pt <<= 6;
|
||||
code_pt |= non_empty_string.getUnchecked(src_index + 1) & 0b0011_1111;
|
||||
|
||||
// Discard the third byte's high order bits of 10 (same as second byte).
|
||||
code_pt <<= 6;
|
||||
code_pt |= non_empty_string.getUnchecked(src_index + 2) & 0b0011_1111;
|
||||
|
||||
// Discard the fourth byte's high order bits of 10 (same as second and third).
|
||||
code_pt <<= 6;
|
||||
code_pt |= non_empty_string.getUnchecked(src_index + 3) & 0b0011_1111;
|
||||
|
||||
dest[dest_index] = code_pt;
|
||||
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
test "strToScalars: empty string" {
|
||||
const str = RocStr.fromSlice("");
|
||||
defer RocStr.decref(str);
|
||||
|
||||
const expected = RocList.empty();
|
||||
const actual = strToScalars(str);
|
||||
defer actual.decref(@sizeOf(u32));
|
||||
|
||||
try expect(RocList.eql(actual, expected));
|
||||
}
|
||||
|
||||
test "strToScalars: One ASCII char" {
|
||||
const str = RocStr.fromSlice("R");
|
||||
defer RocStr.decref(str);
|
||||
|
||||
const expected_array = [_]u32{82};
|
||||
const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
|
||||
defer expected.decref(@sizeOf(u32));
|
||||
|
||||
const actual = strToScalars(str);
|
||||
defer actual.decref(@sizeOf(u32));
|
||||
|
||||
try expect(RocList.eql(actual, expected));
|
||||
}
|
||||
|
||||
test "strToScalars: Multiple ASCII chars" {
|
||||
const str = RocStr.fromSlice("Roc!");
|
||||
defer RocStr.decref(str);
|
||||
|
||||
const expected_array = [_]u32{ 82, 111, 99, 33 };
|
||||
const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
|
||||
defer expected.decref(@sizeOf(u32));
|
||||
|
||||
const actual = strToScalars(str);
|
||||
defer actual.decref(@sizeOf(u32));
|
||||
|
||||
try expect(RocList.eql(actual, expected));
|
||||
}
|
||||
|
||||
test "strToScalars: One 2-byte UTF-8 character" {
|
||||
const str = RocStr.fromSlice("é");
|
||||
defer RocStr.decref(str);
|
||||
|
||||
const expected_array = [_]u32{233};
|
||||
const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
|
||||
defer expected.decref(@sizeOf(u32));
|
||||
|
||||
const actual = strToScalars(str);
|
||||
defer actual.decref(@sizeOf(u32));
|
||||
|
||||
try expect(RocList.eql(actual, expected));
|
||||
}
|
||||
|
||||
test "strToScalars: Multiple 2-byte UTF-8 characters" {
|
||||
const str = RocStr.fromSlice("Cäfés");
|
||||
defer RocStr.decref(str);
|
||||
|
||||
const expected_array = [_]u32{ 67, 228, 102, 233, 115 };
|
||||
const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
|
||||
defer expected.decref(@sizeOf(u32));
|
||||
|
||||
const actual = strToScalars(str);
|
||||
defer actual.decref(@sizeOf(u32));
|
||||
|
||||
try expect(RocList.eql(actual, expected));
|
||||
}
|
||||
|
||||
test "strToScalars: One 3-byte UTF-8 character" {
|
||||
const str = RocStr.fromSlice("鹏");
|
||||
defer RocStr.decref(str);
|
||||
|
||||
const expected_array = [_]u32{40527};
|
||||
const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
|
||||
defer expected.decref(@sizeOf(u32));
|
||||
|
||||
const actual = strToScalars(str);
|
||||
defer actual.decref(@sizeOf(u32));
|
||||
|
||||
try expect(RocList.eql(actual, expected));
|
||||
}
|
||||
|
||||
test "strToScalars: Multiple 3-byte UTF-8 characters" {
|
||||
const str = RocStr.fromSlice("鹏很有趣");
|
||||
defer RocStr.decref(str);
|
||||
|
||||
const expected_array = [_]u32{ 40527, 24456, 26377, 36259 };
|
||||
const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
|
||||
defer expected.decref(@sizeOf(u32));
|
||||
|
||||
const actual = strToScalars(str);
|
||||
defer actual.decref(@sizeOf(u32));
|
||||
|
||||
try expect(RocList.eql(actual, expected));
|
||||
}
|
||||
|
||||
test "strToScalars: One 4-byte UTF-8 character" {
|
||||
// from https://design215.com/toolbox/utf8-4byte-characters.php
|
||||
const str = RocStr.fromSlice("𒀀");
|
||||
defer RocStr.decref(str);
|
||||
|
||||
const expected_array = [_]u32{73728};
|
||||
const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
|
||||
defer expected.decref(@sizeOf(u32));
|
||||
|
||||
const actual = strToScalars(str);
|
||||
defer actual.decref(@sizeOf(u32));
|
||||
|
||||
try expect(RocList.eql(actual, expected));
|
||||
}
|
||||
|
||||
test "strToScalars: Multiple 4-byte UTF-8 characters" {
|
||||
// from https://design215.com/toolbox/utf8-4byte-characters.php
|
||||
const str = RocStr.fromSlice("𒀀𒀁");
|
||||
defer RocStr.decref(str);
|
||||
|
||||
const expected_array = [_]u32{ 73728, 73729 };
|
||||
const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
|
||||
defer expected.decref(@sizeOf(u32));
|
||||
|
||||
const actual = strToScalars(str);
|
||||
defer actual.decref(@sizeOf(u32));
|
||||
|
||||
try expect(RocList.eql(actual, expected));
|
||||
}
|
||||
|
||||
// Str.fromInt
|
||||
pub fn exportFromInt(comptime T: type, comptime name: []const u8) void {
|
||||
comptime var f = struct {
|
||||
|
@ -1371,125 +1134,6 @@ test "countSegments: overlapping delimiter 2" {
|
|||
try expectEqual(segments_count, 3);
|
||||
}
|
||||
|
||||
// Str.countGraphemeClusters
|
||||
pub fn countGraphemeClusters(string: RocStr) callconv(.C) usize {
|
||||
if (string.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const bytes_len = string.len();
|
||||
const bytes_ptr = string.asU8ptr();
|
||||
|
||||
var bytes = bytes_ptr[0..bytes_len];
|
||||
var iter = (unicode.Utf8View.init(bytes) catch unreachable).iterator();
|
||||
|
||||
var count: usize = 0;
|
||||
var grapheme_break_state: ?grapheme.BoundClass = null;
|
||||
var grapheme_break_state_ptr = &grapheme_break_state;
|
||||
var opt_last_codepoint: ?u21 = null;
|
||||
while (iter.nextCodepoint()) |cur_codepoint| {
|
||||
if (opt_last_codepoint) |last_codepoint| {
|
||||
var did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, grapheme_break_state_ptr);
|
||||
if (did_break) {
|
||||
count += 1;
|
||||
grapheme_break_state = null;
|
||||
}
|
||||
}
|
||||
opt_last_codepoint = cur_codepoint;
|
||||
}
|
||||
|
||||
// If there are no breaks, but the str is not empty, then there
|
||||
// must be a single grapheme
|
||||
if (bytes_len != 0) {
|
||||
count += 1;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
// Str.graphemes
|
||||
pub fn strGraphemes(roc_str: RocStr) callconv(.C) RocList {
|
||||
var break_state: ?grapheme.BoundClass = null;
|
||||
var opt_last_codepoint: ?u21 = null;
|
||||
var index: usize = 0;
|
||||
var last_codepoint_len: u8 = 0;
|
||||
|
||||
const alloc_ptr = @intFromPtr(roc_str.getAllocationPtr()) >> 1;
|
||||
const init_fn = if (roc_str.isSmallStr())
|
||||
&initFromSmallStr
|
||||
else
|
||||
&initFromBigStr;
|
||||
|
||||
var result = RocList.allocate(@alignOf(RocStr), countGraphemeClusters(roc_str), @sizeOf(RocStr));
|
||||
const graphemes = result.elements(RocStr) orelse return result;
|
||||
var slice = roc_str.asSlice();
|
||||
var iter = (unicode.Utf8View.init(slice) catch unreachable).iterator();
|
||||
|
||||
while (iter.nextCodepoint()) |cur_codepoint| {
|
||||
const cur_codepoint_len = unicode.utf8CodepointSequenceLength(cur_codepoint) catch unreachable;
|
||||
if (opt_last_codepoint) |last_codepoint| {
|
||||
var did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, &break_state);
|
||||
if (did_break) {
|
||||
graphemes[index] = init_fn(@constCast(slice.ptr), last_codepoint_len, alloc_ptr);
|
||||
slice = slice[last_codepoint_len..];
|
||||
index += 1;
|
||||
break_state = null;
|
||||
last_codepoint_len = 0;
|
||||
}
|
||||
}
|
||||
last_codepoint_len += cur_codepoint_len;
|
||||
opt_last_codepoint = cur_codepoint;
|
||||
}
|
||||
// Append last grapheme
|
||||
graphemes[index] = init_fn(@constCast(slice.ptr), slice.len, alloc_ptr);
|
||||
|
||||
if (!roc_str.isSmallStr()) {
|
||||
// Correct refcount for all of the splits made.
|
||||
roc_str.incref(index + 1);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// these test both countGraphemeClusters() and strGraphemes()
|
||||
fn graphemesTest(input: []const u8, expected: []const []const u8) !void {
|
||||
const rocstr = RocStr.fromSlice(input);
|
||||
defer rocstr.decref();
|
||||
const count = countGraphemeClusters(rocstr);
|
||||
try expectEqual(expected.len, count);
|
||||
|
||||
const graphemes = strGraphemes(rocstr);
|
||||
defer graphemes.decref(@sizeOf(u8));
|
||||
if (input.len == 0) return; // empty string
|
||||
const elems = graphemes.elements(RocStr) orelse unreachable;
|
||||
for (expected, 0..) |g, i| {
|
||||
try std.testing.expectEqualStrings(g, elems[i].asSlice());
|
||||
}
|
||||
}
|
||||
|
||||
test "graphemes: empty string" {
|
||||
try graphemesTest("", &.{});
|
||||
}
|
||||
|
||||
test "graphemes: ascii characters" {
|
||||
try graphemesTest("abcd", &.{ "a", "b", "c", "d" });
|
||||
}
|
||||
|
||||
test "graphemes: utf8 characters" {
|
||||
try graphemesTest("ãxā", &.{ "ã", "x", "ā" });
|
||||
}
|
||||
|
||||
test "graphemes: emojis" {
|
||||
try graphemesTest("🤔🤔🤔", &.{ "🤔", "🤔", "🤔" });
|
||||
}
|
||||
|
||||
test "graphemes: emojis and ut8 characters" {
|
||||
try graphemesTest("🤔å🤔¥🤔ç", &.{ "🤔", "å", "🤔", "¥", "🤔", "ç" });
|
||||
}
|
||||
|
||||
test "graphemes: emojis, ut8, and ascii characters" {
|
||||
try graphemesTest("6🤔å🤔e¥🤔çpp", &.{ "6", "🤔", "å", "🤔", "e", "¥", "🤔", "ç", "p", "p" });
|
||||
}
|
||||
|
||||
pub fn countUtf8Bytes(string: RocStr) callconv(.C) usize {
|
||||
return string.len();
|
||||
}
|
||||
|
@ -1614,44 +1258,6 @@ pub fn repeat(string: RocStr, count: usize) callconv(.C) RocStr {
|
|||
return ret_string;
|
||||
}
|
||||
|
||||
// Str.startsWithScalar
|
||||
pub fn startsWithScalar(string: RocStr, prefix: u32) callconv(.C) bool {
|
||||
const len = string.len();
|
||||
|
||||
if (len == 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Write this (non-empty) string's first scalar into `first_scalar`
|
||||
var first_scalar: [1]u32 = undefined;
|
||||
|
||||
_ = writeNextScalar(string, 0, &first_scalar, 0);
|
||||
|
||||
// Return whether `first_scalar` equals `prefix`
|
||||
return @as(*u32, @ptrCast(&first_scalar)).* == prefix;
|
||||
}
|
||||
|
||||
test "startsWithScalar: empty string" {
|
||||
const whole = RocStr.empty();
|
||||
const prefix: u32 = 'x';
|
||||
try expect(!startsWithScalar(whole, prefix));
|
||||
}
|
||||
|
||||
test "startsWithScalar: ascii char" {
|
||||
const whole = RocStr.fromSlice("foobar");
|
||||
const prefix: u32 = 'f';
|
||||
try expect(startsWithScalar(whole, prefix));
|
||||
}
|
||||
|
||||
test "startsWithScalar: emoji" {
|
||||
const yes = RocStr.fromSlice("💖foobar");
|
||||
const no = RocStr.fromSlice("foobar");
|
||||
const prefix: u32 = '💖';
|
||||
|
||||
try expect(startsWithScalar(yes, prefix));
|
||||
try expect(!startsWithScalar(no, prefix));
|
||||
}
|
||||
|
||||
test "startsWith: foo starts with fo" {
|
||||
const foo = RocStr.fromSlice("foo");
|
||||
const fo = RocStr.fromSlice("fo");
|
||||
|
@ -2761,78 +2367,6 @@ test "capacity: big string" {
|
|||
try expect(data.getCapacity() >= data_bytes.len);
|
||||
}
|
||||
|
||||
pub fn appendScalar(string: RocStr, scalar_u32: u32) callconv(.C) RocStr {
|
||||
const scalar = @as(u21, @intCast(scalar_u32));
|
||||
const width = std.unicode.utf8CodepointSequenceLength(scalar) catch unreachable;
|
||||
|
||||
var output = string.reallocate(string.len() + width);
|
||||
var slice = output.asSliceWithCapacityMut();
|
||||
|
||||
_ = std.unicode.utf8Encode(scalar, slice[string.len() .. string.len() + width]) catch unreachable;
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
test "appendScalar: small A" {
|
||||
const A: []const u8 = "A";
|
||||
|
||||
const data_bytes = "hello";
|
||||
var data = RocStr.init(data_bytes, data_bytes.len);
|
||||
|
||||
const actual = appendScalar(data, A[0]);
|
||||
defer actual.decref();
|
||||
|
||||
const expected_bytes = "helloA";
|
||||
const expected = RocStr.init(expected_bytes, expected_bytes.len);
|
||||
defer expected.decref();
|
||||
|
||||
try expect(actual.eq(expected));
|
||||
}
|
||||
|
||||
test "appendScalar: small 😀" {
|
||||
const data_bytes = "hello";
|
||||
var data = RocStr.init(data_bytes, data_bytes.len);
|
||||
|
||||
const actual = appendScalar(data, 0x1F600);
|
||||
defer actual.decref();
|
||||
|
||||
const expected_bytes = "hello😀";
|
||||
const expected = RocStr.init(expected_bytes, expected_bytes.len);
|
||||
defer expected.decref();
|
||||
|
||||
try expect(actual.eq(expected));
|
||||
}
|
||||
|
||||
test "appendScalar: big A" {
|
||||
const A: []const u8 = "A";
|
||||
|
||||
const data_bytes = "a string so large that it must be heap-allocated";
|
||||
var data = RocStr.init(data_bytes, data_bytes.len);
|
||||
|
||||
const actual = appendScalar(data, A[0]);
|
||||
defer actual.decref();
|
||||
|
||||
const expected_bytes = "a string so large that it must be heap-allocatedA";
|
||||
const expected = RocStr.init(expected_bytes, expected_bytes.len);
|
||||
defer expected.decref();
|
||||
|
||||
try expect(actual.eq(expected));
|
||||
}
|
||||
|
||||
test "appendScalar: big 😀" {
|
||||
const data_bytes = "a string so large that it must be heap-allocated";
|
||||
var data = RocStr.init(data_bytes, data_bytes.len);
|
||||
|
||||
const actual = appendScalar(data, 0x1F600);
|
||||
defer actual.decref();
|
||||
|
||||
const expected_bytes = "a string so large that it must be heap-allocated😀";
|
||||
const expected = RocStr.init(expected_bytes, expected_bytes.len);
|
||||
defer expected.decref();
|
||||
|
||||
try expect(actual.eq(expected));
|
||||
}
|
||||
|
||||
pub fn reserve(string: RocStr, spare: usize) callconv(.C) RocStr {
|
||||
const old_length = string.len();
|
||||
if (string.getCapacity() >= old_length + spare) {
|
||||
|
@ -2849,27 +2383,6 @@ pub fn withCapacity(capacity: usize) callconv(.C) RocStr {
|
|||
str.setLen(0);
|
||||
return str;
|
||||
}
|
||||
|
||||
pub fn getScalarUnsafe(string: RocStr, index: usize) callconv(.C) extern struct { bytesParsed: usize, scalar: u32 } {
|
||||
const slice = string.asSlice();
|
||||
const bytesParsed = @as(usize, @intCast(std.unicode.utf8ByteSequenceLength(slice[index]) catch unreachable));
|
||||
const scalar = std.unicode.utf8Decode(slice[index .. index + bytesParsed]) catch unreachable;
|
||||
|
||||
return .{ .bytesParsed = bytesParsed, .scalar = @as(u32, @intCast(scalar)) };
|
||||
}
|
||||
|
||||
test "getScalarUnsafe" {
|
||||
const data_bytes = "A";
|
||||
var data = RocStr.init(data_bytes, data_bytes.len);
|
||||
|
||||
const result = getScalarUnsafe(data, 0);
|
||||
|
||||
const expected = try std.unicode.utf8Decode("A");
|
||||
|
||||
try expectEqual(result.scalar, @as(u32, @intCast(expected)));
|
||||
try expectEqual(result.bytesParsed, 1);
|
||||
}
|
||||
|
||||
pub fn strCloneTo(
|
||||
string: RocStr,
|
||||
ptr: [*]u8,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue