Remove scalars and graphemes from Str

2025-09-26 13:29:12 +00:00 · 2024-01-17 21:30:34 -05:00 · 2024-01-17 21:30:34 -05:00 · b48fa0698a
commit b48fa0698a
parent 65738acb26
26 changed files with 140 additions and 12721 deletions
--- a/crates/compiler/builtins/bitcode/src/str.zig
+++ b/crates/compiler/builtins/bitcode/src/str.zig
@@ -1,6 +1,5 @@
 const utils = @import("utils.zig");
 const RocList = @import("list.zig").RocList;
-const grapheme = @import("helpers/grapheme.zig");
 const UpdateMode = utils.UpdateMode;
 const std = @import("std");
 const mem = std.mem;
@@ -552,242 +551,6 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
    return string.len();
 }

-// Str.toScalars
-pub fn strToScalarsC(str: RocStr) callconv(.C) RocList {
-    return @call(.always_inline, strToScalars, .{str});
-}
-
-fn strToScalars(string: RocStr) callconv(.C) RocList {
-    const len = string.len();
-
-    if (len == 0) {
-        return RocList.empty();
-    }
-
-    var capacity = len;
-
-    if (!string.isSmallStr()) {
-        capacity = string.getCapacity();
-    }
-
-    // For purposes of preallocation, assume the number of code points is the same
-    // as the number of bytes. This might be longer than necessary, but definitely
-    // should not require a second allocation.
-    var answer = RocList.allocate(@alignOf(u32), capacity, @sizeOf(u32));
-
-    // `orelse unreachable` is fine here, because we already did an early
-    // return to verify the string was nonempty.
-    var answer_elems = answer.elements(u32) orelse unreachable;
-    var src_index: usize = 0;
-    var answer_index: usize = 0;
-
-    while (src_index < len) {
-        src_index += writeNextScalar(string, src_index, answer_elems, answer_index);
-        answer_index += 1;
-    }
-
-    answer.length = answer_index;
-
-    return answer;
-}
-
-// Given a non-empty RocStr, and a src_index byte index into that string,
-// and a destination [*]u32, and an index into that destination,
-// Parses the next scalar value out of the string (at the given byte index),
-// writes it into the destination, and returns the number of bytes parsed.
-inline fn writeNextScalar(non_empty_string: RocStr, src_index: usize, dest: [*]u32, dest_index: usize) usize {
-    const utf8_byte = non_empty_string.getUnchecked(src_index);
-
-    // How UTF-8 bytes work:
-    // https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
-    if (utf8_byte <= 127) {
-        // It's an ASCII character. Copy it over directly.
-        dest[dest_index] = @as(u32, @intCast(utf8_byte));
-
-        return 1;
-    } else if (utf8_byte >> 5 == 0b0000_0110) {
-        // Its three high order bits are 110, so this is a two-byte sequence.
-
-        // Example:
-        //     utf-8:   1100 1111   1011 0001
-        //     code pt: 0000 0011   1111 0001 (decimal: 1009)
-
-        // Discard the first byte's high order bits of 110.
-        var code_pt = @as(u32, @intCast(utf8_byte & 0b0001_1111));
-
-        // Discard the second byte's high order bits of 10.
-        code_pt <<= 6;
-        code_pt |= non_empty_string.getUnchecked(src_index + 1) & 0b0011_1111;
-
-        dest[dest_index] = code_pt;
-
-        return 2;
-    } else if (utf8_byte >> 4 == 0b0000_1110) {
-        // Its four high order bits are 1110, so this is a three-byte sequence.
-
-        // Discard the first byte's high order bits of 1110.
-        var code_pt = @as(u32, @intCast(utf8_byte & 0b0000_1111));
-
-        // Discard the second byte's high order bits of 10.
-        code_pt <<= 6;
-        code_pt |= non_empty_string.getUnchecked(src_index + 1) & 0b0011_1111;
-
-        // Discard the third byte's high order bits of 10 (same as second byte).
-        code_pt <<= 6;
-        code_pt |= non_empty_string.getUnchecked(src_index + 2) & 0b0011_1111;
-
-        dest[dest_index] = code_pt;
-
-        return 3;
-    } else {
-        // This must be a four-byte sequence, so the five high order bits should be 11110.
-
-        // Discard the first byte's high order bits of 11110.
-        var code_pt = @as(u32, @intCast(utf8_byte & 0b0000_0111));
-
-        // Discard the second byte's high order bits of 10.
-        code_pt <<= 6;
-        code_pt |= non_empty_string.getUnchecked(src_index + 1) & 0b0011_1111;
-
-        // Discard the third byte's high order bits of 10 (same as second byte).
-        code_pt <<= 6;
-        code_pt |= non_empty_string.getUnchecked(src_index + 2) & 0b0011_1111;
-
-        // Discard the fourth byte's high order bits of 10 (same as second and third).
-        code_pt <<= 6;
-        code_pt |= non_empty_string.getUnchecked(src_index + 3) & 0b0011_1111;
-
-        dest[dest_index] = code_pt;
-
-        return 4;
-    }
-}
-
-test "strToScalars: empty string" {
-    const str = RocStr.fromSlice("");
-    defer RocStr.decref(str);
-
-    const expected = RocList.empty();
-    const actual = strToScalars(str);
-    defer actual.decref(@sizeOf(u32));
-
-    try expect(RocList.eql(actual, expected));
-}
-
-test "strToScalars: One ASCII char" {
-    const str = RocStr.fromSlice("R");
-    defer RocStr.decref(str);
-
-    const expected_array = [_]u32{82};
-    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
-    defer expected.decref(@sizeOf(u32));
-
-    const actual = strToScalars(str);
-    defer actual.decref(@sizeOf(u32));
-
-    try expect(RocList.eql(actual, expected));
-}
-
-test "strToScalars: Multiple ASCII chars" {
-    const str = RocStr.fromSlice("Roc!");
-    defer RocStr.decref(str);
-
-    const expected_array = [_]u32{ 82, 111, 99, 33 };
-    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
-    defer expected.decref(@sizeOf(u32));
-
-    const actual = strToScalars(str);
-    defer actual.decref(@sizeOf(u32));
-
-    try expect(RocList.eql(actual, expected));
-}
-
-test "strToScalars: One 2-byte UTF-8 character" {
-    const str = RocStr.fromSlice("é");
-    defer RocStr.decref(str);
-
-    const expected_array = [_]u32{233};
-    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
-    defer expected.decref(@sizeOf(u32));
-
-    const actual = strToScalars(str);
-    defer actual.decref(@sizeOf(u32));
-
-    try expect(RocList.eql(actual, expected));
-}
-
-test "strToScalars: Multiple 2-byte UTF-8 characters" {
-    const str = RocStr.fromSlice("Cäfés");
-    defer RocStr.decref(str);
-
-    const expected_array = [_]u32{ 67, 228, 102, 233, 115 };
-    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
-    defer expected.decref(@sizeOf(u32));
-
-    const actual = strToScalars(str);
-    defer actual.decref(@sizeOf(u32));
-
-    try expect(RocList.eql(actual, expected));
-}
-
-test "strToScalars: One 3-byte UTF-8 character" {
-    const str = RocStr.fromSlice("鹏");
-    defer RocStr.decref(str);
-
-    const expected_array = [_]u32{40527};
-    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
-    defer expected.decref(@sizeOf(u32));
-
-    const actual = strToScalars(str);
-    defer actual.decref(@sizeOf(u32));
-
-    try expect(RocList.eql(actual, expected));
-}
-
-test "strToScalars: Multiple 3-byte UTF-8 characters" {
-    const str = RocStr.fromSlice("鹏很有趣");
-    defer RocStr.decref(str);
-
-    const expected_array = [_]u32{ 40527, 24456, 26377, 36259 };
-    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
-    defer expected.decref(@sizeOf(u32));
-
-    const actual = strToScalars(str);
-    defer actual.decref(@sizeOf(u32));
-
-    try expect(RocList.eql(actual, expected));
-}
-
-test "strToScalars: One 4-byte UTF-8 character" {
-    // from https://design215.com/toolbox/utf8-4byte-characters.php
-    const str = RocStr.fromSlice("𒀀");
-    defer RocStr.decref(str);
-
-    const expected_array = [_]u32{73728};
-    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
-    defer expected.decref(@sizeOf(u32));
-
-    const actual = strToScalars(str);
-    defer actual.decref(@sizeOf(u32));
-
-    try expect(RocList.eql(actual, expected));
-}
-
-test "strToScalars: Multiple 4-byte UTF-8 characters" {
-    // from https://design215.com/toolbox/utf8-4byte-characters.php
-    const str = RocStr.fromSlice("𒀀𒀁");
-    defer RocStr.decref(str);
-
-    const expected_array = [_]u32{ 73728, 73729 };
-    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
-    defer expected.decref(@sizeOf(u32));
-
-    const actual = strToScalars(str);
-    defer actual.decref(@sizeOf(u32));
-
-    try expect(RocList.eql(actual, expected));
-}
-
 // Str.fromInt
 pub fn exportFromInt(comptime T: type, comptime name: []const u8) void {
    comptime var f = struct {
@@ -1371,125 +1134,6 @@ test "countSegments: overlapping delimiter 2" {
    try expectEqual(segments_count, 3);
 }

-// Str.countGraphemeClusters
-pub fn countGraphemeClusters(string: RocStr) callconv(.C) usize {
-    if (string.isEmpty()) {
-        return 0;
-    }
-
-    const bytes_len = string.len();
-    const bytes_ptr = string.asU8ptr();
-
-    var bytes = bytes_ptr[0..bytes_len];
-    var iter = (unicode.Utf8View.init(bytes) catch unreachable).iterator();
-
-    var count: usize = 0;
-    var grapheme_break_state: ?grapheme.BoundClass = null;
-    var grapheme_break_state_ptr = &grapheme_break_state;
-    var opt_last_codepoint: ?u21 = null;
-    while (iter.nextCodepoint()) |cur_codepoint| {
-        if (opt_last_codepoint) |last_codepoint| {
-            var did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, grapheme_break_state_ptr);
-            if (did_break) {
-                count += 1;
-                grapheme_break_state = null;
-            }
-        }
-        opt_last_codepoint = cur_codepoint;
-    }
-
-    // If there are no breaks, but the str is not empty, then there
-    // must be a single grapheme
-    if (bytes_len != 0) {
-        count += 1;
-    }
-
-    return count;
-}
-
-// Str.graphemes
-pub fn strGraphemes(roc_str: RocStr) callconv(.C) RocList {
-    var break_state: ?grapheme.BoundClass = null;
-    var opt_last_codepoint: ?u21 = null;
-    var index: usize = 0;
-    var last_codepoint_len: u8 = 0;
-
-    const alloc_ptr = @intFromPtr(roc_str.getAllocationPtr()) >> 1;
-    const init_fn = if (roc_str.isSmallStr())
-        &initFromSmallStr
-    else
-        &initFromBigStr;
-
-    var result = RocList.allocate(@alignOf(RocStr), countGraphemeClusters(roc_str), @sizeOf(RocStr));
-    const graphemes = result.elements(RocStr) orelse return result;
-    var slice = roc_str.asSlice();
-    var iter = (unicode.Utf8View.init(slice) catch unreachable).iterator();
-
-    while (iter.nextCodepoint()) |cur_codepoint| {
-        const cur_codepoint_len = unicode.utf8CodepointSequenceLength(cur_codepoint) catch unreachable;
-        if (opt_last_codepoint) |last_codepoint| {
-            var did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, &break_state);
-            if (did_break) {
-                graphemes[index] = init_fn(@constCast(slice.ptr), last_codepoint_len, alloc_ptr);
-                slice = slice[last_codepoint_len..];
-                index += 1;
-                break_state = null;
-                last_codepoint_len = 0;
-            }
-        }
-        last_codepoint_len += cur_codepoint_len;
-        opt_last_codepoint = cur_codepoint;
-    }
-    // Append last grapheme
-    graphemes[index] = init_fn(@constCast(slice.ptr), slice.len, alloc_ptr);
-
-    if (!roc_str.isSmallStr()) {
-        // Correct refcount for all of the splits made.
-        roc_str.incref(index + 1);
-    }
-    return result;
-}
-
-// these test both countGraphemeClusters() and strGraphemes()
-fn graphemesTest(input: []const u8, expected: []const []const u8) !void {
-    const rocstr = RocStr.fromSlice(input);
-    defer rocstr.decref();
-    const count = countGraphemeClusters(rocstr);
-    try expectEqual(expected.len, count);
-
-    const graphemes = strGraphemes(rocstr);
-    defer graphemes.decref(@sizeOf(u8));
-    if (input.len == 0) return; // empty string
-    const elems = graphemes.elements(RocStr) orelse unreachable;
-    for (expected, 0..) |g, i| {
-        try std.testing.expectEqualStrings(g, elems[i].asSlice());
-    }
-}
-
-test "graphemes: empty string" {
-    try graphemesTest("", &.{});
-}
-
-test "graphemes: ascii characters" {
-    try graphemesTest("abcd", &.{ "a", "b", "c", "d" });
-}
-
-test "graphemes: utf8 characters" {
-    try graphemesTest("ãxā", &.{ "ã", "x", "ā" });
-}
-
-test "graphemes: emojis" {
-    try graphemesTest("🤔🤔🤔", &.{ "🤔", "🤔", "🤔" });
-}
-
-test "graphemes: emojis and ut8 characters" {
-    try graphemesTest("🤔å🤔¥🤔ç", &.{ "🤔", "å", "🤔", "¥", "🤔", "ç" });
-}
-
-test "graphemes: emojis, ut8, and ascii characters" {
-    try graphemesTest("6🤔å🤔e¥🤔çpp", &.{ "6", "🤔", "å", "🤔", "e", "¥", "🤔", "ç", "p", "p" });
-}
-
 pub fn countUtf8Bytes(string: RocStr) callconv(.C) usize {
    return string.len();
 }
@@ -1614,44 +1258,6 @@ pub fn repeat(string: RocStr, count: usize) callconv(.C) RocStr {
    return ret_string;
 }

-// Str.startsWithScalar
-pub fn startsWithScalar(string: RocStr, prefix: u32) callconv(.C) bool {
-    const len = string.len();
-
-    if (len == 0) {
-        return false;
-    }
-
-    // Write this (non-empty) string's first scalar into `first_scalar`
-    var first_scalar: [1]u32 = undefined;
-
-    _ = writeNextScalar(string, 0, &first_scalar, 0);
-
-    // Return whether `first_scalar` equals `prefix`
-    return @as(*u32, @ptrCast(&first_scalar)).* == prefix;
-}
-
-test "startsWithScalar: empty string" {
-    const whole = RocStr.empty();
-    const prefix: u32 = 'x';
-    try expect(!startsWithScalar(whole, prefix));
-}
-
-test "startsWithScalar: ascii char" {
-    const whole = RocStr.fromSlice("foobar");
-    const prefix: u32 = 'f';
-    try expect(startsWithScalar(whole, prefix));
-}
-
-test "startsWithScalar: emoji" {
-    const yes = RocStr.fromSlice("💖foobar");
-    const no = RocStr.fromSlice("foobar");
-    const prefix: u32 = '💖';
-
-    try expect(startsWithScalar(yes, prefix));
-    try expect(!startsWithScalar(no, prefix));
-}
-
 test "startsWith: foo starts with fo" {
    const foo = RocStr.fromSlice("foo");
    const fo = RocStr.fromSlice("fo");
@@ -2761,78 +2367,6 @@ test "capacity: big string" {
    try expect(data.getCapacity() >= data_bytes.len);
 }

-pub fn appendScalar(string: RocStr, scalar_u32: u32) callconv(.C) RocStr {
-    const scalar = @as(u21, @intCast(scalar_u32));
-    const width = std.unicode.utf8CodepointSequenceLength(scalar) catch unreachable;
-
-    var output = string.reallocate(string.len() + width);
-    var slice = output.asSliceWithCapacityMut();
-
-    _ = std.unicode.utf8Encode(scalar, slice[string.len() .. string.len() + width]) catch unreachable;
-
-    return output;
-}
-
-test "appendScalar: small A" {
-    const A: []const u8 = "A";
-
-    const data_bytes = "hello";
-    var data = RocStr.init(data_bytes, data_bytes.len);
-
-    const actual = appendScalar(data, A[0]);
-    defer actual.decref();
-
-    const expected_bytes = "helloA";
-    const expected = RocStr.init(expected_bytes, expected_bytes.len);
-    defer expected.decref();
-
-    try expect(actual.eq(expected));
-}
-
-test "appendScalar: small 😀" {
-    const data_bytes = "hello";
-    var data = RocStr.init(data_bytes, data_bytes.len);
-
-    const actual = appendScalar(data, 0x1F600);
-    defer actual.decref();
-
-    const expected_bytes = "hello😀";
-    const expected = RocStr.init(expected_bytes, expected_bytes.len);
-    defer expected.decref();
-
-    try expect(actual.eq(expected));
-}
-
-test "appendScalar: big A" {
-    const A: []const u8 = "A";
-
-    const data_bytes = "a string so large that it must be heap-allocated";
-    var data = RocStr.init(data_bytes, data_bytes.len);
-
-    const actual = appendScalar(data, A[0]);
-    defer actual.decref();
-
-    const expected_bytes = "a string so large that it must be heap-allocatedA";
-    const expected = RocStr.init(expected_bytes, expected_bytes.len);
-    defer expected.decref();
-
-    try expect(actual.eq(expected));
-}
-
-test "appendScalar: big 😀" {
-    const data_bytes = "a string so large that it must be heap-allocated";
-    var data = RocStr.init(data_bytes, data_bytes.len);
-
-    const actual = appendScalar(data, 0x1F600);
-    defer actual.decref();
-
-    const expected_bytes = "a string so large that it must be heap-allocated😀";
-    const expected = RocStr.init(expected_bytes, expected_bytes.len);
-    defer expected.decref();
-
-    try expect(actual.eq(expected));
-}
-
 pub fn reserve(string: RocStr, spare: usize) callconv(.C) RocStr {
    const old_length = string.len();
    if (string.getCapacity() >= old_length + spare) {
@@ -2849,27 +2383,6 @@ pub fn withCapacity(capacity: usize) callconv(.C) RocStr {
    str.setLen(0);
    return str;
 }
-
-pub fn getScalarUnsafe(string: RocStr, index: usize) callconv(.C) extern struct { bytesParsed: usize, scalar: u32 } {
-    const slice = string.asSlice();
-    const bytesParsed = @as(usize, @intCast(std.unicode.utf8ByteSequenceLength(slice[index]) catch unreachable));
-    const scalar = std.unicode.utf8Decode(slice[index .. index + bytesParsed]) catch unreachable;
-
-    return .{ .bytesParsed = bytesParsed, .scalar = @as(u32, @intCast(scalar)) };
-}
-
-test "getScalarUnsafe" {
-    const data_bytes = "A";
-    var data = RocStr.init(data_bytes, data_bytes.len);
-
-    const result = getScalarUnsafe(data, 0);
-
-    const expected = try std.unicode.utf8Decode("A");
-
-    try expectEqual(result.scalar, @as(u32, @intCast(expected)));
-    try expectEqual(result.bytesParsed, 1);
-}
-
 pub fn strCloneTo(
    string: RocStr,
    ptr: [*]u8,