Add RocStr.strToCodePts in Zig

2025-11-02 22:01:20 +00:00 · 2022-07-01 17:25:04 -04:00 · 2022-07-01 17:25:04 -04:00 · 47216b9905
commit 47216b9905
parent 6cdf63b319
1 changed files with 107 additions and 0 deletions
--- a/crates/compiler/builtins/bitcode/src/str.zig
+++ b/crates/compiler/builtins/bitcode/src/str.zig
@ -227,6 +227,17 @@ pub const RocStr = extern struct {
        return self.str_capacity ^ MASK;
    }

+    // This does a small string check, but no bounds checking whatsoever!
+    pub fn getUnchecked(self: RocStr, index: usize) u8 {
+        if (self.isSmallStr()) {
+            return self.asArray()[index];
+        } else {
+            const bytes = self.str_bytes orelse unreachable;
+
+            return bytes[index];
+        }
+    }
+
    pub fn isEmpty(self: RocStr) bool {
        return self.len() == 0;
    }
@ -452,6 +463,102 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
    return string.len();
 }

+// Str.toCodePts : Str -> List U32
+pub fn strToCodePts(string: RocStr) callconv(.C) RocList {
+    const str_len = string.len();
+
+    // For purposes of preallocation, assume the number of code points is the same
+    // as the number of bytes. This might be longer than necessary, but definitely
+    // should not require a second allocation.
+    var answer = RocList.allocate(@alignOf(u32), string.capacity(), @sizeOf(u32));
+    var answer_elems = answer.elements(u32);
+    var src_index: usize = 0;
+    var answer_index: usize = 0;
+
+    while (src_index < str_len) {
+        const utf8_byte = string.getUnchecked(src_index);
+
+        // How UTF-8 bytes work:
+        // https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
+        if (utf8_byte <= 127) {
+            // It's an ASCII character. Copy it over directly.
+            answer_elems[answer_index] = @intCast(u32, utf8_byte);
+            src_index += 1;
+        } else if (utf8_byte >> 5 == 0b0000_0110) {
+            // Its three high order bits are 110, so this is a two-byte sequence.
+
+            // Example:
+            //     utf-8:   1100 1111   1011 0001
+            //     code pt: 0000 0011   1111 0001 (decimal: 1009)
+
+            // Discard the first byte's high order bits of 110.
+            var code_pt = @intCast(u32, utf8_byte & 0b0001_1111);
+
+            // Discard the second byte's high order bits of 10.
+            code_pt <<= 6;
+            code_pt |= string.getUnchecked(src_index + 1) & 0b0011_1111;
+
+            answer_elems[answer_index] = code_pt;
+            src_index += 2;
+        } else if (utf8_byte >> 4 == 0b0000_1110) {
+            // Its four high order bits are 1110, so this is a three-byte sequence.
+
+            // Discard the first byte's high order bits of 110.
+            var code_pt = @intCast(u32, utf8_byte & 0b0001_1111);
+
+            // Discard the second byte's high order bits of 10.
+            code_pt <<= 6;
+            code_pt |= string.getUnchecked(src_index + 1) & 0b0011_1111;
+
+            // Discard the third byte's high order bits of 10 (same as second byte).
+            code_pt <<= 6;
+            code_pt |= string.getUnchecked(src_index + 2) & 0b0011_1111;
+
+            answer_elems[answer_index] = code_pt;
+            src_index += 3;
+        } else {
+            // This must be a four-byte sequence, so the five high order bits should be 11110.
+
+            // Discard the first byte's high order bits of 110.
+            var code_pt = @intCast(u32, utf8_byte & 0b0001_1111);
+
+            // Discard the second byte's high order bits of 10.
+            code_pt <<= 6;
+            code_pt |= string.getUnchecked(src_index + 1) & 0b0011_1111;
+
+            // Discard the third byte's high order bits of 10 (same as second byte).
+            code_pt <<= 6;
+            code_pt |= string.getUnchecked(src_index + 2) & 0b0011_1111;
+
+            // Discard the fourth byte's high order bits of 10 (same as second and third).
+            code_pt <<= 6;
+            code_pt |= string.getUnchecked(src_index + 3) & 0b0011_1111;
+
+            answer_elems[answer_index] = code_pt;
+            src_index += 4;
+        }
+
+        answer_index += 1;
+    }
+
+    return answer;
+}
+
+test "strToCodePts: One ASCII char" {
+    const str = RocStr.init("R", 1);
+    const array = [_]u32{ 1, 2, 3, 4 };
+    var known_at_runtime_zero: usize = 0;
+    const slice = array[known_at_runtime_zero..array.len];
+    const expected = RocList.fromSlice(u32, slice);
+    const actual = strToCodePts(str);
+
+    try expect(RocList.isEq(actual, expected));
+
+    RocList.deinit(actual, u32);
+    RocList.deinit(expected, u32);
+    RocStr.deinit(str);
+}
+
 // Str.fromInt
 pub fn exportFromInt(comptime T: type, comptime name: []const u8) void {
    comptime var f = struct {