Merge pull request #3365 from rtfeldman/to-scalars

Add Str.toScalars builtin
This commit is contained in:
Folkert de Vries 2022-07-02 23:04:28 +02:00 committed by GitHub
commit 75b4b3a206
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
17 changed files with 460 additions and 20 deletions

View file

@ -3,6 +3,7 @@ const utils = @import("utils.zig");
const RocResult = utils.RocResult;
const UpdateMode = utils.UpdateMode;
const mem = std.mem;
const math = std.math;
const EqFn = fn (?[*]u8, ?[*]u8) callconv(.C) bool;
const CompareFn = fn (?[*]u8, ?[*]u8, ?[*]u8) callconv(.C) u8;
@ -30,6 +31,57 @@ pub const RocList = extern struct {
return RocList{ .bytes = null, .length = 0, .capacity = 0 };
}
// Reports whether two lists have the same length and the same leading bytes.
//
// Returns false when the lengths differ and true when both lists are empty;
// otherwise the first `len()` bytes of each backing buffer are compared.
//
// NOTE(review): the comparison is byte-indexed but bounded by `len()`, which
// appears to count elements rather than bytes (see `fromSlice`). For element
// types wider than one byte this would only inspect a prefix of the data —
// confirm whether that is intended.
pub fn eql(self: RocList, other: RocList) bool {
    const count = self.len();

    if (count != other.len()) return false;

    // Equal lengths: if one side is empty, so is the other.
    if (self.isEmpty()) return true;

    const lhs = self.bytes orelse unreachable;
    const rhs = other.bytes orelse unreachable;

    return mem.eql(u8, lhs[0..count], rhs[0..count]);
}
// Builds a RocList holding a byte-for-byte copy of `slice`.
//
// An empty slice yields `RocList.empty()`. Otherwise a list sized for
// `slice.len` elements of `T` is obtained from `allocate` and
// `slice.len * @sizeOf(T)` bytes are copied into it.
pub fn fromSlice(comptime T: type, slice: []const T) RocList {
    if (slice.len == 0) return RocList.empty();

    const list = allocate(@alignOf(T), slice.len, @sizeOf(T));

    // The slice is known to be nonempty here, so the list must have storage.
    const dest = list.bytes orelse unreachable;
    @memcpy(dest, @ptrCast([*]const u8, slice.ptr), slice.len * @sizeOf(T));

    return list;
}
// Releases this list's reference by handing its bytes, its `len()` and the
// alignment of `T` to `utils.decref`.
pub fn deinit(self: RocList, comptime T: type) void {
    const alignment = @alignOf(T);
    utils.decref(self.bytes, self.len(), alignment);
}
// Reinterprets the list's byte pointer as a many-item pointer to `T`.
// The result is null exactly when `self.bytes` is null.
pub fn elements(self: RocList, comptime T: type) ?[*]T {
    const aligned_bytes = @alignCast(@alignOf(T), self.bytes);
    return @ptrCast(?[*]T, aligned_bytes);
}
pub fn isUnique(self: RocList) bool {
// the empty list is unique (in the sense that copying it will not leak memory)
if (self.isEmpty()) {

View file

@ -152,6 +152,7 @@ comptime {
const str = @import("str.zig");
comptime {
exportStrFn(str.init, "init");
exportStrFn(str.strToScalarsC, "to_scalars");
exportStrFn(str.strSplitInPlaceC, "str_split_in_place");
exportStrFn(str.countSegments, "count_segments");
exportStrFn(str.countGraphemeClusters, "count_grapheme_clusters");

View file

@ -56,6 +56,10 @@ pub const RocStr = extern struct {
return result;
}
// Convenience constructor: forwards the slice's pointer and length to
// `RocStr.init`.
pub fn fromSlice(slice: []const u8) RocStr {
    const bytes = slice.ptr;
    const length = slice.len;
    return RocStr.init(bytes, length);
}
pub fn initBig(_: InPlace, number_of_chars: usize) RocStr {
const first_element = utils.allocateWithRefcount(number_of_chars, @sizeOf(usize));
@ -227,6 +231,17 @@ pub const RocStr = extern struct {
return self.str_capacity ^ MASK;
}
// Returns the byte at `index` without any bounds checking.
//
// The only check performed is the small-string check, which selects where
// the byte is read from: `asArray()` for small strings, `str_bytes` otherwise.
// The caller is responsible for passing an in-range index.
pub fn getUnchecked(self: RocStr, index: usize) u8 {
    if (!self.isSmallStr()) {
        const big_bytes = self.str_bytes orelse unreachable;
        return big_bytes[index];
    }

    return self.asArray()[index];
}
// True when the string's length is zero.
pub fn isEmpty(self: RocStr) bool {
    const length = self.len();
    return length == 0;
}
@ -239,7 +254,7 @@ pub const RocStr = extern struct {
const length = self.len();
const longest_small_str = @sizeOf(RocStr) - 1;
// NOTE: We want to compare length here, *NOT* check for is_small_str!
// NOTE: We want to compare length here, *NOT* check for isSmallStr!
// This is because we explicitly want the empty string to be handled in
// this branch, even though the empty string is not a small string.
//
@ -452,6 +467,230 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
return string.len();
}
// Str.toScalars
//
// C-ABI entry point; forwards to `strToScalars` with an always-inline call.
pub fn strToScalarsC(str: RocStr) callconv(.C) RocList {
    const args = .{str};
    return @call(.{ .modifier = always_inline }, strToScalars, args);
}
// Decodes the UTF-8 bytes of `string` into a RocList of u32 scalar values,
// one element per encoded sequence. An empty string yields `RocList.empty()`.
//
// Bytes are read with `getUnchecked`, which does no bounds checking, and any
// lead byte that is not ASCII, 110xxxxx or 1110xxxx is decoded as the start
// of a four-byte sequence.
// NOTE(review): this presumably relies on RocStr always holding well-formed
// UTF-8 — confirm against the code that constructs RocStr values.
fn strToScalars(string: RocStr) callconv(.C) RocList {
    const str_len = string.len();

    if (str_len == 0) {
        return RocList.empty();
    }

    // Every sequence consumes at least one byte and produces exactly one
    // scalar, so the byte length is an upper bound on the element count.
    // Sizing by it may be longer than necessary, but never requires a
    // second allocation.
    var answer = RocList.allocate(@alignOf(u32), str_len, @sizeOf(u32));

    // We already did an early return to verify the string was nonempty.
    const answer_elems = answer.elements(u32) orelse unreachable;
    var src_index: usize = 0;
    var answer_index: usize = 0;

    while (src_index < str_len) {
        const lead = string.getUnchecked(src_index);

        // How UTF-8 bytes work:
        // https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
        //
        // The lead byte's high order bits give the sequence length:
        //   0xxx xxxx -> 1 byte (ASCII)
        //   110x xxxx -> 2 bytes
        //   1110 xxxx -> 3 bytes
        //   anything else is taken to be 1111 0xxx -> 4 bytes
        const seq_len: usize = if (lead <= 127)
            1
        else if (lead >> 5 == 0b0000_0110)
            2
        else if (lead >> 4 == 0b0000_1110)
            3
        else
            4;

        // Mask that discards the length-marker bits of the lead byte.
        const lead_mask: u8 = switch (seq_len) {
            1 => 0b0111_1111,
            2 => 0b0001_1111,
            3 => 0b0000_1111,
            else => 0b0000_0111,
        };

        // Example (two-byte sequence):
        // utf-8:   1100 1111 1011 0001
        // code pt: 0000 0011 1111 0001 (decimal: 1009)
        var code_pt = @as(u32, lead & lead_mask);

        // Each continuation byte contributes its low six bits; its high
        // order bits of 10 are discarded.
        var offset: usize = 1;
        while (offset < seq_len) : (offset += 1) {
            code_pt <<= 6;
            code_pt |= string.getUnchecked(src_index + offset) & 0b0011_1111;
        }

        answer_elems[answer_index] = code_pt;
        src_index += seq_len;
        answer_index += 1;
    }

    // Shrink the reported length to the number of scalars actually written.
    answer.length = answer_index;

    return answer;
}
test "strToScalars: empty string" {
    // An empty input must produce the empty list.
    const input = RocStr.fromSlice("");
    defer input.deinit();

    const want = RocList.empty();

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
test "strToScalars: One ASCII char" {
    // "R" is U+0052 (decimal 82).
    const input = RocStr.fromSlice("R");
    defer input.deinit();

    const want_scalars = [_]u32{82};
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
test "strToScalars: Multiple ASCII chars" {
    // Each ASCII byte maps directly to its own scalar value.
    const input = RocStr.fromSlice("Roc!");
    defer input.deinit();

    const want_scalars = [_]u32{ 82, 111, 99, 33 };
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
test "strToScalars: One 2-byte UTF-8 character" {
    // "é" is U+00E9 (decimal 233), encoded as two bytes.
    const input = RocStr.fromSlice("é");
    defer input.deinit();

    const want_scalars = [_]u32{233};
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
test "strToScalars: Multiple 2-byte UTF-8 characters" {
    // Mixes ASCII with the two-byte characters "ä" (228) and "é" (233).
    const input = RocStr.fromSlice("Cäfés");
    defer input.deinit();

    const want_scalars = [_]u32{ 67, 228, 102, 233, 115 };
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
test "strToScalars: One 3-byte UTF-8 character" {
    // "鹏" is U+9E4F (decimal 40527), encoded as three bytes. The input must
    // contain the character itself; an empty literal would decode to an empty
    // list and could never equal the one-element expectation below.
    const str = RocStr.fromSlice("鹏");
    defer RocStr.deinit(str);

    const expected_array = [_]u32{40527};
    const expected = RocList.fromSlice(u32, expected_array[0..expected_array.len]);
    defer RocList.deinit(expected, u32);

    const actual = strToScalars(str);
    defer RocList.deinit(actual, u32);

    try expect(RocList.eql(actual, expected));
}
test "strToScalars: Multiple 3-byte UTF-8 characters" {
    // Four consecutive three-byte characters.
    const input = RocStr.fromSlice("鹏很有趣");
    defer input.deinit();

    const want_scalars = [_]u32{ 40527, 24456, 26377, 36259 };
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
test "strToScalars: One 4-byte UTF-8 character" {
    // from https://design215.com/toolbox/utf8-4byte-characters.php
    // "𒀀" is U+12000 (decimal 73728), encoded as four bytes.
    const input = RocStr.fromSlice("𒀀");
    defer input.deinit();

    const want_scalars = [_]u32{73728};
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
test "strToScalars: Multiple 4-byte UTF-8 characters" {
    // from https://design215.com/toolbox/utf8-4byte-characters.php
    // Two consecutive four-byte characters, U+12000 and U+12001.
    const input = RocStr.fromSlice("𒀀𒀁");
    defer input.deinit();

    const want_scalars = [_]u32{ 73728, 73729 };
    const want = RocList.fromSlice(u32, &want_scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
// Str.fromInt
pub fn exportFromInt(comptime T: type, comptime name: []const u8) void {
comptime var f = struct {

View file

@ -183,7 +183,7 @@ pub fn decref(
var bytes = bytes_or_null orelse return;
const isizes: [*]isize = @ptrCast([*]isize, @alignCast(@sizeOf(isize), bytes));
const isizes: [*]isize = @ptrCast([*]isize, @alignCast(@alignOf(isize), bytes));
decref_ptr_to_refcount(isizes - 1, alignment);
}