mirror of
https://github.com/roc-lang/roc.git
synced 2025-08-04 04:08:19 +00:00
Add RocStr.strToCodePts in Zig
This commit is contained in:
parent
6cdf63b319
commit
47216b9905
1 changed files with 107 additions and 0 deletions
|
@ -227,6 +227,17 @@ pub const RocStr = extern struct {
|
|||
return self.str_capacity ^ MASK;
|
||||
}
|
||||
|
||||
// This does a small string check, but no bounds checking whatsoever!
|
||||
pub fn getUnchecked(self: RocStr, index: usize) u8 {
|
||||
if (self.isSmallStr()) {
|
||||
return self.asArray()[index];
|
||||
} else {
|
||||
const bytes = self.str_bytes orelse unreachable;
|
||||
|
||||
return bytes[index];
|
||||
}
|
||||
}
|
||||
|
||||
pub fn isEmpty(self: RocStr) bool {
|
||||
return self.len() == 0;
|
||||
}
|
||||
|
@ -452,6 +463,102 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
|
|||
return string.len();
|
||||
}
|
||||
|
||||
// Str.toCodePts : Str -> List U32
|
||||
pub fn strToCodePts(string: RocStr) callconv(.C) RocList {
|
||||
const str_len = string.len();
|
||||
|
||||
// For purposes of preallocation, assume the number of code points is the same
|
||||
// as the number of bytes. This might be longer than necessary, but definitely
|
||||
// should not require a second allocation.
|
||||
var answer = RocList.allocate(@alignOf(u32), string.capacity(), @sizeOf(u32));
|
||||
var answer_elems = answer.elements(u32);
|
||||
var src_index: usize = 0;
|
||||
var answer_index: usize = 0;
|
||||
|
||||
while (src_index < str_len) {
|
||||
const utf8_byte = string.getUnchecked(src_index);
|
||||
|
||||
// How UTF-8 bytes work:
|
||||
// https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
|
||||
if (utf8_byte <= 127) {
|
||||
// It's an ASCII character. Copy it over directly.
|
||||
answer_elems[answer_index] = @intCast(u32, utf8_byte);
|
||||
src_index += 1;
|
||||
} else if (utf8_byte >> 5 == 0b0000_0110) {
|
||||
// Its three high order bits are 110, so this is a two-byte sequence.
|
||||
|
||||
// Example:
|
||||
// utf-8: 1100 1111 1011 0001
|
||||
// code pt: 0000 0011 1111 0001 (decimal: 1009)
|
||||
|
||||
// Discard the first byte's high order bits of 110.
|
||||
var code_pt = @intCast(u32, utf8_byte & 0b0001_1111);
|
||||
|
||||
// Discard the second byte's high order bits of 10.
|
||||
code_pt <<= 6;
|
||||
code_pt |= string.getUnchecked(src_index + 1) & 0b0011_1111;
|
||||
|
||||
answer_elems[answer_index] = code_pt;
|
||||
src_index += 2;
|
||||
} else if (utf8_byte >> 4 == 0b0000_1110) {
|
||||
// Its four high order bits are 1110, so this is a three-byte sequence.
|
||||
|
||||
// Discard the first byte's high order bits of 110.
|
||||
var code_pt = @intCast(u32, utf8_byte & 0b0001_1111);
|
||||
|
||||
// Discard the second byte's high order bits of 10.
|
||||
code_pt <<= 6;
|
||||
code_pt |= string.getUnchecked(src_index + 1) & 0b0011_1111;
|
||||
|
||||
// Discard the third byte's high order bits of 10 (same as second byte).
|
||||
code_pt <<= 6;
|
||||
code_pt |= string.getUnchecked(src_index + 2) & 0b0011_1111;
|
||||
|
||||
answer_elems[answer_index] = code_pt;
|
||||
src_index += 3;
|
||||
} else {
|
||||
// This must be a four-byte sequence, so the five high order bits should be 11110.
|
||||
|
||||
// Discard the first byte's high order bits of 110.
|
||||
var code_pt = @intCast(u32, utf8_byte & 0b0001_1111);
|
||||
|
||||
// Discard the second byte's high order bits of 10.
|
||||
code_pt <<= 6;
|
||||
code_pt |= string.getUnchecked(src_index + 1) & 0b0011_1111;
|
||||
|
||||
// Discard the third byte's high order bits of 10 (same as second byte).
|
||||
code_pt <<= 6;
|
||||
code_pt |= string.getUnchecked(src_index + 2) & 0b0011_1111;
|
||||
|
||||
// Discard the fourth byte's high order bits of 10 (same as second and third).
|
||||
code_pt <<= 6;
|
||||
code_pt |= string.getUnchecked(src_index + 3) & 0b0011_1111;
|
||||
|
||||
answer_elems[answer_index] = code_pt;
|
||||
src_index += 4;
|
||||
}
|
||||
|
||||
answer_index += 1;
|
||||
}
|
||||
|
||||
return answer;
|
||||
}
|
||||
|
||||
test "strToCodePts: One ASCII char" {
|
||||
const str = RocStr.init("R", 1);
|
||||
const array = [_]u32{ 1, 2, 3, 4 };
|
||||
var known_at_runtime_zero: usize = 0;
|
||||
const slice = array[known_at_runtime_zero..array.len];
|
||||
const expected = RocList.fromSlice(u32, slice);
|
||||
const actual = strToCodePts(str);
|
||||
|
||||
try expect(RocList.isEq(actual, expected));
|
||||
|
||||
RocList.deinit(actual, u32);
|
||||
RocList.deinit(expected, u32);
|
||||
RocStr.deinit(str);
|
||||
}
|
||||
|
||||
// Str.fromInt
|
||||
pub fn exportFromInt(comptime T: type, comptime name: []const u8) void {
|
||||
comptime var f = struct {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue