Add RocStr.strToCodePts in Zig

This commit is contained in:
Richard Feldman 2022-07-01 17:25:04 -04:00
parent 6cdf63b319
commit 47216b9905
No known key found for this signature in database
GPG key ID: 7E4127D1E4241798

View file

@ -227,6 +227,17 @@ pub const RocStr = extern struct {
return self.str_capacity ^ MASK;
}
// This does a small string check, but no bounds checking whatsoever!
pub fn getUnchecked(self: RocStr, index: usize) u8 {
if (self.isSmallStr()) {
return self.asArray()[index];
} else {
const bytes = self.str_bytes orelse unreachable;
return bytes[index];
}
}
pub fn isEmpty(self: RocStr) bool {
return self.len() == 0;
}
@ -452,6 +463,102 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
return string.len();
}
// Str.toCodePts : Str -> List U32
pub fn strToCodePts(string: RocStr) callconv(.C) RocList {
const str_len = string.len();
// For purposes of preallocation, assume the number of code points is the same
// as the number of bytes. This might be longer than necessary, but definitely
// should not require a second allocation.
var answer = RocList.allocate(@alignOf(u32), string.capacity(), @sizeOf(u32));
var answer_elems = answer.elements(u32);
var src_index: usize = 0;
var answer_index: usize = 0;
while (src_index < str_len) {
const utf8_byte = string.getUnchecked(src_index);
// How UTF-8 bytes work:
// https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
if (utf8_byte <= 127) {
// It's an ASCII character. Copy it over directly.
answer_elems[answer_index] = @intCast(u32, utf8_byte);
src_index += 1;
} else if (utf8_byte >> 5 == 0b0000_0110) {
// Its three high order bits are 110, so this is a two-byte sequence.
// Example:
// utf-8: 1100 1111 1011 0001
// code pt: 0000 0011 1111 0001 (decimal: 1009)
// Discard the first byte's high order bits of 110.
var code_pt = @intCast(u32, utf8_byte & 0b0001_1111);
// Discard the second byte's high order bits of 10.
code_pt <<= 6;
code_pt |= string.getUnchecked(src_index + 1) & 0b0011_1111;
answer_elems[answer_index] = code_pt;
src_index += 2;
} else if (utf8_byte >> 4 == 0b0000_1110) {
// Its four high order bits are 1110, so this is a three-byte sequence.
// Discard the first byte's high order bits of 110.
var code_pt = @intCast(u32, utf8_byte & 0b0001_1111);
// Discard the second byte's high order bits of 10.
code_pt <<= 6;
code_pt |= string.getUnchecked(src_index + 1) & 0b0011_1111;
// Discard the third byte's high order bits of 10 (same as second byte).
code_pt <<= 6;
code_pt |= string.getUnchecked(src_index + 2) & 0b0011_1111;
answer_elems[answer_index] = code_pt;
src_index += 3;
} else {
// This must be a four-byte sequence, so the five high order bits should be 11110.
// Discard the first byte's high order bits of 110.
var code_pt = @intCast(u32, utf8_byte & 0b0001_1111);
// Discard the second byte's high order bits of 10.
code_pt <<= 6;
code_pt |= string.getUnchecked(src_index + 1) & 0b0011_1111;
// Discard the third byte's high order bits of 10 (same as second byte).
code_pt <<= 6;
code_pt |= string.getUnchecked(src_index + 2) & 0b0011_1111;
// Discard the fourth byte's high order bits of 10 (same as second and third).
code_pt <<= 6;
code_pt |= string.getUnchecked(src_index + 3) & 0b0011_1111;
answer_elems[answer_index] = code_pt;
src_index += 4;
}
answer_index += 1;
}
return answer;
}
test "strToCodePts: One ASCII char" {
const str = RocStr.init("R", 1);
const array = [_]u32{ 1, 2, 3, 4 };
var known_at_runtime_zero: usize = 0;
const slice = array[known_at_runtime_zero..array.len];
const expected = RocList.fromSlice(u32, slice);
const actual = strToCodePts(str);
try expect(RocList.isEq(actual, expected));
RocList.deinit(actual, u32);
RocList.deinit(expected, u32);
RocStr.deinit(str);
}
// Str.fromInt
pub fn exportFromInt(comptime T: type, comptime name: []const u8) void {
comptime var f = struct {