Remove scalars and graphemes from Str

This commit is contained in:
Richard Feldman 2024-01-17 21:30:34 -05:00
parent 65738acb26
commit b48fa0698a
No known key found for this signature in database
GPG key ID: F1F21AA5B1D9E43B
26 changed files with 140 additions and 12721 deletions

View file

@ -1,6 +1,5 @@
const utils = @import("utils.zig");
const RocList = @import("list.zig").RocList;
const grapheme = @import("helpers/grapheme.zig");
const UpdateMode = utils.UpdateMode;
const std = @import("std");
const mem = std.mem;
@ -552,242 +551,6 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
return string.len();
}
// Str.toScalars
/// Exported, C-calling-convention entry point for `Str.toScalars`.
/// Forwards to `strToScalars`, requesting that the call be inlined.
pub fn strToScalarsC(str: RocStr) callconv(.C) RocList {
    const scalars = @call(.always_inline, strToScalars, .{str});
    return scalars;
}
/// Decodes every scalar value in `string` into a freshly allocated list of `u32`.
///
/// Returns `RocList.empty()` for an empty string. Otherwise the list is
/// allocated once, up front, with one `u32` slot per byte of the string
/// (or per byte of its capacity when the string is not a small string), so
/// decoding never needs to grow the list. The list's length is set to the
/// number of scalars actually written.
fn strToScalars(string: RocStr) callconv(.C) RocList {
    const byte_count = string.len();
    if (byte_count == 0) return RocList.empty();

    // A scalar occupies at least one byte, so the byte count (or the full
    // capacity for non-small strings) is an upper bound on the scalar count.
    const slot_count = if (string.isSmallStr()) byte_count else string.getCapacity();
    var scalars = RocList.allocate(@alignOf(u32), slot_count, @sizeOf(u32));

    // The early return above already ruled out the empty string, so the
    // allocation is expected to have backing elements.
    const out = scalars.elements(u32) orelse unreachable;

    var read_at: usize = 0;
    var written: usize = 0;
    while (read_at < byte_count) : (written += 1) {
        // Each call decodes one scalar and reports how many bytes it consumed.
        read_at += writeNextScalar(string, read_at, out, written);
    }

    scalars.length = written;
    return scalars;
}
/// Decodes the scalar value that starts at byte `src_index` of
/// `non_empty_string`, stores it in `dest[dest_index]`, and returns the number
/// of bytes that were consumed (1 to 4).
///
/// No validation is performed: bytes are read with `getUnchecked`, and any
/// leading byte that is not ASCII, a two-byte lead (110xxxxx) or a three-byte
/// lead (1110xxxx) is treated as a four-byte lead (11110xxx).
///
/// How UTF-8 bytes work:
/// https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
inline fn writeNextScalar(non_empty_string: RocStr, src_index: usize, dest: [*]u32, dest_index: usize) usize {
    const lead = non_empty_string.getUnchecked(src_index);

    // ASCII: the byte is the code point.
    if (lead <= 127) {
        dest[dest_index] = lead;
        return 1;
    }

    // The lead byte's high bits give the sequence length:
    //   110xxxxx -> 2 bytes, 1110xxxx -> 3 bytes, otherwise assumed 11110xxx -> 4 bytes.
    const seq_len: usize = if (lead >> 5 == 0b0000_0110)
        2
    else if (lead >> 4 == 0b0000_1110)
        3
    else
        4;

    // Mask that strips the length marker from the lead byte, keeping its payload bits.
    const lead_mask: u8 = switch (seq_len) {
        2 => 0b0001_1111,
        3 => 0b0000_1111,
        else => 0b0000_0111,
    };

    // Example (two bytes):
    //   utf-8:   1100 1111 1011 0001
    //   code pt: 0000 0011 1111 0001 (decimal: 1009)
    var scalar: u32 = lead & lead_mask;

    // Every continuation byte is 10xxxxxx: drop the marker, append the low six bits.
    var offset: usize = 1;
    while (offset < seq_len) : (offset += 1) {
        const payload = non_empty_string.getUnchecked(src_index + offset) & 0b0011_1111;
        scalar = (scalar << 6) | payload;
    }

    dest[dest_index] = scalar;
    return seq_len;
}
test "strToScalars: empty string" {
    // An empty input is expected to compare equal to the empty list.
    const input = RocStr.fromSlice("");
    defer RocStr.decref(input);

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, RocList.empty()));
}
test "strToScalars: One ASCII char" {
    // 'R' is code point 82.
    const input = RocStr.fromSlice("R");
    defer RocStr.decref(input);

    const want = RocList.fromSlice(u32, &[_]u32{82});
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: Multiple ASCII chars" {
    // One scalar per ASCII byte: 'R', 'o', 'c', '!'.
    const input = RocStr.fromSlice("Roc!");
    defer RocStr.decref(input);

    const want = RocList.fromSlice(u32, &[_]u32{ 82, 111, 99, 33 });
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: One 2-byte UTF-8 character" {
    // 'é' is U+00E9 (233), encoded in two bytes.
    const input = RocStr.fromSlice("é");
    defer RocStr.decref(input);

    const want = RocList.fromSlice(u32, &[_]u32{233});
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: Multiple 2-byte UTF-8 characters" {
    // Mixes one-byte ASCII with the two-byte characters 'ä' (228) and 'é' (233).
    const input = RocStr.fromSlice("Cäfés");
    defer RocStr.decref(input);

    const want = RocList.fromSlice(u32, &[_]u32{ 67, 228, 102, 233, 115 });
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: One 3-byte UTF-8 character" {
    // '鹏' is U+9E4F (decimal 40527), which UTF-8 encodes in three bytes.
    // The input must contain that character: an empty literal would make
    // strToScalars return the empty list and never match `expected`.
    const str = RocStr.fromSlice("鹏");
    defer RocStr.decref(str);

    const expected_array = [_]u32{40527};
    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
    defer expected.decref(@sizeOf(u32));

    const actual = strToScalars(str);
    defer actual.decref(@sizeOf(u32));

    try expect(RocList.eql(actual, expected));
}
test "strToScalars: Multiple 3-byte UTF-8 characters" {
    // Four characters, each encoded in three bytes.
    const input = RocStr.fromSlice("鹏很有趣");
    defer RocStr.decref(input);

    const want = RocList.fromSlice(u32, &[_]u32{ 40527, 24456, 26377, 36259 });
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: One 4-byte UTF-8 character" {
    // Sample character taken from https://design215.com/toolbox/utf8-4byte-characters.php
    // U+12000 (decimal 73728) needs four bytes in UTF-8.
    const input = RocStr.fromSlice("𒀀");
    defer RocStr.decref(input);

    const want = RocList.fromSlice(u32, &[_]u32{73728});
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
test "strToScalars: Multiple 4-byte UTF-8 characters" {
    // Sample characters taken from https://design215.com/toolbox/utf8-4byte-characters.php
    // Two consecutive four-byte characters: U+12000 and U+12001.
    const input = RocStr.fromSlice("𒀀𒀁");
    defer RocStr.decref(input);

    const want = RocList.fromSlice(u32, &[_]u32{ 73728, 73729 });
    defer want.decref(@sizeOf(u32));

    const got = strToScalars(input);
    defer got.decref(@sizeOf(u32));

    try expect(RocList.eql(got, want));
}
// Str.fromInt
pub fn exportFromInt(comptime T: type, comptime name: []const u8) void {
comptime var f = struct {
@ -1371,125 +1134,6 @@ test "countSegments: overlapping delimiter 2" {
try expectEqual(segments_count, 3);
}
// Str.countGraphemeClusters
/// Counts the extended grapheme clusters in `string`.
///
/// Returns 0 for an empty string. Otherwise walks the string one code point at
/// a time, asks `grapheme.isGraphemeBreak` whether a cluster boundary lies
/// between each adjacent pair, and returns the number of boundaries plus one.
/// The bytes are assumed to be valid UTF-8 (`Utf8View.init` failing is treated
/// as unreachable).
pub fn countGraphemeClusters(string: RocStr) callconv(.C) usize {
    if (string.isEmpty()) return 0;

    const byte_len = string.len();
    const utf8 = string.asU8ptr()[0..byte_len];
    var codepoints = (unicode.Utf8View.init(utf8) catch unreachable).iterator();

    var boundaries: usize = 0;
    // State threaded through isGraphemeBreak; reset whenever a boundary is found.
    var break_state: ?grapheme.BoundClass = null;
    var previous: ?u21 = null;

    while (codepoints.nextCodepoint()) |current| {
        if (previous) |prev| {
            if (grapheme.isGraphemeBreak(prev, current, &break_state)) {
                boundaries += 1;
                break_state = null;
            }
        }
        previous = current;
    }

    // The loop only counts boundaries *between* clusters; a non-empty string
    // has one more cluster than it has boundaries.
    return if (byte_len != 0) boundaries + 1 else boundaries;
}
// Str.graphemes
/// Splits `roc_str` into its extended grapheme clusters and returns them as a
/// list of `RocStr`.
///
/// The list is allocated with exactly `countGraphemeClusters(roc_str)` slots.
/// Each element is built by `initFromSmallStr` or `initFromBigStr` (chosen by
/// whether `roc_str` is a small string) from a pointer into `roc_str`'s bytes
/// and the cluster's byte length. When `roc_str` is not a small string, its
/// refcount is incremented once per produced element.
///
/// If the allocated list has no elements (as for an empty string), it is
/// returned as-is. The bytes are assumed to be valid UTF-8.
pub fn strGraphemes(roc_str: RocStr) callconv(.C) RocList {
    var break_state: ?grapheme.BoundClass = null;
    var opt_last_codepoint: ?u21 = null;
    var index: usize = 0;
    // Byte length of the cluster currently being accumulated. This must be a
    // usize: a single grapheme cluster (e.g. a base character followed by many
    // combining marks) can be longer than 255 bytes, which would overflow a u8.
    var cluster_len: usize = 0;

    // NOTE(review): the allocation pointer is shifted right by one, presumably
    // the encoding initFromBigStr expects — confirm against its definition.
    const alloc_ptr = @intFromPtr(roc_str.getAllocationPtr()) >> 1;

    const init_fn = if (roc_str.isSmallStr())
        &initFromSmallStr
    else
        &initFromBigStr;

    var result = RocList.allocate(@alignOf(RocStr), countGraphemeClusters(roc_str), @sizeOf(RocStr));
    const graphemes = result.elements(RocStr) orelse return result;

    // `slice` always begins at the first byte of the cluster being accumulated.
    var slice = roc_str.asSlice();
    var iter = (unicode.Utf8View.init(slice) catch unreachable).iterator();

    while (iter.nextCodepoint()) |cur_codepoint| {
        const cur_codepoint_len = unicode.utf8CodepointSequenceLength(cur_codepoint) catch unreachable;

        if (opt_last_codepoint) |last_codepoint| {
            const did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, &break_state);
            if (did_break) {
                // Emit the finished cluster, then advance past it.
                graphemes[index] = init_fn(@constCast(slice.ptr), cluster_len, alloc_ptr);
                slice = slice[cluster_len..];
                index += 1;
                break_state = null;
                cluster_len = 0;
            }
        }

        cluster_len += cur_codepoint_len;
        opt_last_codepoint = cur_codepoint;
    }

    // Append last grapheme: whatever remains of `slice` is the final cluster.
    graphemes[index] = init_fn(@constCast(slice.ptr), slice.len, alloc_ptr);

    if (!roc_str.isSmallStr()) {
        // Correct refcount for all of the splits made.
        roc_str.incref(index + 1);
    }
    return result;
}
// these test both countGraphemeClusters() and strGraphemes()
/// Shared test helper: checks that `countGraphemeClusters` reports
/// `expected.len` clusters for `input`, and that `strGraphemes` yields exactly
/// the strings in `expected`, in order.
fn graphemesTest(input: []const u8, expected: []const []const u8) !void {
    const subject = RocStr.fromSlice(input);
    defer subject.decref();

    try expectEqual(expected.len, countGraphemeClusters(subject));

    const clusters = strGraphemes(subject);
    // NOTE(review): the list holds RocStr elements, yet `@sizeOf(u8)` is passed
    // here — confirm against RocList.decref's parameter that this is intended.
    defer clusters.decref(@sizeOf(u8));

    // An empty input has no elements to compare.
    if (input.len == 0) return;

    const items = clusters.elements(RocStr) orelse unreachable;
    var i: usize = 0;
    while (i < expected.len) : (i += 1) {
        try std.testing.expectEqualStrings(expected[i], items[i].asSlice());
    }
}
test "graphemes: empty string" {
    // No input bytes means no clusters at all.
    const none = [_][]const u8{};
    try graphemesTest("", &none);
}
test "graphemes: ascii characters" {
    // Every ASCII letter is its own cluster.
    const want = [_][]const u8{ "a", "b", "c", "d" };
    try graphemesTest("abcd", &want);
}
test "graphemes: utf8 characters" {
    // Multi-byte characters surrounding a one-byte character.
    const want = [_][]const u8{ "ã", "x", "ā" };
    try graphemesTest("ãxā", &want);
}
test "graphemes: emojis" {
    // Three identical emoji must come back as three separate clusters.
    const want = [_][]const u8{ "🤔", "🤔", "🤔" };
    try graphemesTest("🤔🤔🤔", &want);
}
test "graphemes: emojis and ut8 characters" {
    // Emoji alternating with shorter multi-byte characters.
    const want = [_][]const u8{ "🤔", "å", "🤔", "¥", "🤔", "ç" };
    try graphemesTest("🤔å🤔¥🤔ç", &want);
}
test "graphemes: emojis, ut8, and ascii characters" {
    // Mixed input: ASCII, shorter multi-byte characters and emoji.
    const want = [_][]const u8{ "6", "🤔", "å", "🤔", "e", "¥", "🤔", "ç", "p", "p" };
    try graphemesTest("6🤔å🤔e¥🤔çpp", &want);
}
/// Returns the value of `string.len()`.
pub fn countUtf8Bytes(string: RocStr) callconv(.C) usize {
    const byte_count = string.len();
    return byte_count;
}
@ -1614,44 +1258,6 @@ pub fn repeat(string: RocStr, count: usize) callconv(.C) RocStr {
return ret_string;
}
// Str.startsWithScalar
/// Reports whether the first scalar value of `string` equals `prefix`.
/// An empty string never matches.
pub fn startsWithScalar(string: RocStr, prefix: u32) callconv(.C) bool {
    // No bytes means there is no first scalar to compare.
    if (string.len() == 0) return false;

    // Decode only the leading scalar into a one-element scratch buffer;
    // the number of bytes consumed is not needed.
    var scratch: [1]u32 = undefined;
    _ = writeNextScalar(string, 0, &scratch, 0);

    return scratch[0] == prefix;
}
test "startsWithScalar: empty string" {
    // The empty string starts with nothing.
    const scalar: u32 = 'x';
    const subject = RocStr.empty();
    const matched = startsWithScalar(subject, scalar);
    try expect(!matched);
}
test "startsWithScalar: ascii char" {
    // "foobar" begins with the one-byte scalar 'f'.
    const scalar: u32 = 'f';
    const subject = RocStr.fromSlice("foobar");
    const matched = startsWithScalar(subject, scalar);
    try expect(matched);
}
test "startsWithScalar: emoji" {
    // A multi-byte leading scalar must match as a whole; a string that
    // begins with an ASCII letter must not match it.
    const scalar: u32 = '💖';
    const with_emoji = RocStr.fromSlice("💖foobar");
    const without_emoji = RocStr.fromSlice("foobar");

    const emoji_matched = startsWithScalar(with_emoji, scalar);
    try expect(emoji_matched);

    const plain_matched = startsWithScalar(without_emoji, scalar);
    try expect(!plain_matched);
}
test "startsWith: foo starts with fo" {
const foo = RocStr.fromSlice("foo");
const fo = RocStr.fromSlice("fo");
@ -2761,78 +2367,6 @@ test "capacity: big string" {
try expect(data.getCapacity() >= data_bytes.len);
}
/// Returns `string` with the UTF-8 encoding of `scalar_u32` appended.
///
/// The string is reallocated to hold the extra bytes and the encoding is
/// written directly after the original contents. No validation is done:
/// `scalar_u32` is cast to `u21` unchecked, and a value that cannot be
/// encoded is treated as unreachable.
pub fn appendScalar(string: RocStr, scalar_u32: u32) callconv(.C) RocStr {
    const code_point = @as(u21, @intCast(scalar_u32));
    const encoded_len = std.unicode.utf8CodepointSequenceLength(code_point) catch unreachable;

    const old_len = string.len();
    const new_len = old_len + encoded_len;

    var grown = string.reallocate(new_len);
    // The window of the grown buffer that receives the encoded bytes.
    const tail = grown.asSliceWithCapacityMut()[old_len..new_len];
    _ = std.unicode.utf8Encode(code_point, tail) catch unreachable;

    return grown;
}
test "appendScalar: small A" {
    // Appending a one-byte scalar to a short string.
    const initial = "hello";
    const want_bytes = "helloA";

    const got = appendScalar(RocStr.init(initial, initial.len), 'A');
    defer got.decref();

    const want = RocStr.init(want_bytes, want_bytes.len);
    defer want.decref();

    try expect(got.eq(want));
}
test "appendScalar: small 😀" {
    // Appending U+1F600, a four-byte scalar, to a short string.
    const initial = "hello";
    const want_bytes = "hello😀";

    const got = appendScalar(RocStr.init(initial, initial.len), 0x1F600);
    defer got.decref();

    const want = RocStr.init(want_bytes, want_bytes.len);
    defer want.decref();

    try expect(got.eq(want));
}
test "appendScalar: big A" {
    // Appending a one-byte scalar to a long string.
    const initial = "a string so large that it must be heap-allocated";
    const want_bytes = "a string so large that it must be heap-allocatedA";

    const got = appendScalar(RocStr.init(initial, initial.len), 'A');
    defer got.decref();

    const want = RocStr.init(want_bytes, want_bytes.len);
    defer want.decref();

    try expect(got.eq(want));
}
test "appendScalar: big 😀" {
    // Appending U+1F600, a four-byte scalar, to a long string.
    const initial = "a string so large that it must be heap-allocated";
    const want_bytes = "a string so large that it must be heap-allocated😀";

    const got = appendScalar(RocStr.init(initial, initial.len), 0x1F600);
    defer got.decref();

    const want = RocStr.init(want_bytes, want_bytes.len);
    defer want.decref();

    try expect(got.eq(want));
}
pub fn reserve(string: RocStr, spare: usize) callconv(.C) RocStr {
const old_length = string.len();
if (string.getCapacity() >= old_length + spare) {
@ -2849,27 +2383,6 @@ pub fn withCapacity(capacity: usize) callconv(.C) RocStr {
str.setLen(0);
return str;
}
/// Decodes the scalar that begins at byte `index` of `string`.
///
/// Returns the decoded scalar together with the number of bytes it occupies.
/// Nothing is validated: a byte at `index` that is not a valid UTF-8 lead
/// byte, or a sequence that fails to decode, is treated as unreachable.
pub fn getScalarUnsafe(string: RocStr, index: usize) callconv(.C) extern struct { bytesParsed: usize, scalar: u32 } {
    const bytes = string.asSlice();

    // The lead byte alone determines how long the encoded sequence is.
    const lead_len = std.unicode.utf8ByteSequenceLength(bytes[index]) catch unreachable;
    const seq_len: usize = lead_len;

    const code_point = std.unicode.utf8Decode(bytes[index .. index + seq_len]) catch unreachable;

    return .{ .bytesParsed = seq_len, .scalar = code_point };
}
test "getScalarUnsafe" {
    // Decoding at index 0 of "A" must yield the scalar for 'A' and consume one byte.
    const text = "A";
    const subject = RocStr.init(text, text.len);

    const decoded = getScalarUnsafe(subject, 0);
    const want_scalar: u32 = try std.unicode.utf8Decode("A");

    try expectEqual(decoded.scalar, want_scalar);
    try expectEqual(decoded.bytesParsed, 1);
}
pub fn strCloneTo(
string: RocStr,
ptr: [*]u8,