mirror of
https://github.com/roc-lang/roc.git
synced 2025-11-01 13:34:15 +00:00
Merge pull request #3365 from rtfeldman/to-scalars
Add Str.toScalars builtin
This commit is contained in:
commit
75b4b3a206
17 changed files with 460 additions and 20 deletions
|
|
@ -3,6 +3,7 @@ const utils = @import("utils.zig");
|
|||
const RocResult = utils.RocResult;
|
||||
const UpdateMode = utils.UpdateMode;
|
||||
const mem = std.mem;
|
||||
const math = std.math;
|
||||
|
||||
const EqFn = fn (?[*]u8, ?[*]u8) callconv(.C) bool;
|
||||
const CompareFn = fn (?[*]u8, ?[*]u8, ?[*]u8) callconv(.C) u8;
|
||||
|
|
@ -30,6 +31,57 @@ pub const RocList = extern struct {
|
|||
return RocList{ .bytes = null, .length = 0, .capacity = 0 };
|
||||
}
|
||||
|
||||
/// Reports whether `self` and `other` have equal lengths and equal
/// leading bytes.
///
/// NOTE(review): the loop visits `self.len()` positions of `bytes`, not
/// `len * element size`. If `bytes` is a byte pointer and `len()` counts
/// elements, lists of multi-byte elements are only compared on a prefix
/// of their data — confirm whether that is intended.
pub fn eql(self: RocList, other: RocList) bool {
    const count = self.len();

    if (count != other.len()) return false;

    // The lengths match, so if one side is empty then both are.
    if (self.isEmpty()) return true;

    // Non-empty lists are expected to always carry a backing pointer.
    const lhs = self.bytes orelse unreachable;
    const rhs = other.bytes orelse unreachable;

    var i: usize = 0;
    while (i < count) : (i += 1) {
        if (lhs[i] != rhs[i]) return false;
    }

    return true;
}
|
||||
|
||||
/// Builds a list holding a copy of the elements of `slice`.
///
/// An empty slice yields `RocList.empty()` without allocating. Otherwise a
/// list is obtained from `allocate` (using the alignment and size of `T`
/// and `slice.len`) and `slice.len * @sizeOf(T)` bytes are copied into it.
pub fn fromSlice(comptime T: type, slice: []const T) RocList {
    if (slice.len == 0) {
        return RocList.empty();
    }

    const list = allocate(@alignOf(T), slice.len, @sizeOf(T));

    // The empty case has already returned, so the slice is known to be
    // non-empty here; no second length check is needed before copying.
    const dest = list.bytes orelse unreachable;
    const src = @ptrCast([*]const u8, slice.ptr);
    const num_bytes = slice.len * @sizeOf(T);

    @memcpy(dest, src, num_bytes);

    return list;
}
|
||||
|
||||
/// Hands the list's backing pointer, its `len()` and the alignment of `T`
/// to `utils.decref`.
pub fn deinit(self: RocList, comptime T: type) void {
    const length = self.len();
    const alignment = @alignOf(T);

    utils.decref(self.bytes, length, alignment);
}
|
||||
|
||||
/// Reinterprets the list's backing pointer as an optional many-item
/// pointer to `T`, after asserting it is aligned for `T`.
pub fn elements(self: RocList, comptime T: type) ?[*]T {
    const aligned = @alignCast(@alignOf(T), self.bytes);

    return @ptrCast(?[*]T, aligned);
}
|
||||
|
||||
pub fn isUnique(self: RocList) bool {
|
||||
// the empty list is unique (in the sense that copying it will not leak memory)
|
||||
if (self.isEmpty()) {
|
||||
|
|
|
|||
|
|
@ -152,6 +152,7 @@ comptime {
|
|||
const str = @import("str.zig");
|
||||
comptime {
|
||||
exportStrFn(str.init, "init");
|
||||
exportStrFn(str.strToScalarsC, "to_scalars");
|
||||
exportStrFn(str.strSplitInPlaceC, "str_split_in_place");
|
||||
exportStrFn(str.countSegments, "count_segments");
|
||||
exportStrFn(str.countGraphemeClusters, "count_grapheme_clusters");
|
||||
|
|
|
|||
|
|
@ -56,6 +56,10 @@ pub const RocStr = extern struct {
|
|||
return result;
|
||||
}
|
||||
|
||||
/// Creates a `RocStr` from a byte slice by forwarding the slice's pointer
/// and length to `RocStr.init`.
pub fn fromSlice(slice: []const u8) RocStr {
    const ptr = slice.ptr;
    const length = slice.len;

    return RocStr.init(ptr, length);
}
|
||||
|
||||
pub fn initBig(_: InPlace, number_of_chars: usize) RocStr {
|
||||
const first_element = utils.allocateWithRefcount(number_of_chars, @sizeOf(usize));
|
||||
|
||||
|
|
@ -227,6 +231,17 @@ pub const RocStr = extern struct {
|
|||
return self.str_capacity ^ MASK;
|
||||
}
|
||||
|
||||
// Returns the byte at `index`, reading from `asArray()` for a small string
// and from `str_bytes` otherwise. Apart from that small string check there
// is no bounds checking whatsoever!
pub fn getUnchecked(self: RocStr, index: usize) u8 {
    if (!self.isSmallStr()) {
        const big_bytes = self.str_bytes orelse unreachable;

        return big_bytes[index];
    }

    return self.asArray()[index];
}
|
||||
|
||||
/// True when the string's `len()` is zero.
pub fn isEmpty(self: RocStr) bool {
    const length = self.len();

    return length == 0;
}
|
||||
|
|
@ -239,7 +254,7 @@ pub const RocStr = extern struct {
|
|||
const length = self.len();
|
||||
const longest_small_str = @sizeOf(RocStr) - 1;
|
||||
|
||||
// NOTE: We want to compare length here, *NOT* check for is_small_str!
|
||||
// NOTE: We want to compare length here, *NOT* check for isSmallStr!
|
||||
// This is because we explicitly want the empty string to be handled in
|
||||
// this branch, even though the empty string is not a small string.
|
||||
//
|
||||
|
|
@ -452,6 +467,230 @@ pub fn strNumberOfBytes(string: RocStr) callconv(.C) usize {
|
|||
return string.len();
|
||||
}
|
||||
|
||||
// Str.toScalars
|
||||
/// C calling convention entry point for Str.toScalars; it forwards `str`
/// to `strToScalars` through an always-inline call and returns the result.
pub fn strToScalarsC(str: RocStr) callconv(.C) RocList {
    const scalars = @call(.{ .modifier = always_inline }, strToScalars, .{str});

    return scalars;
}
|
||||
|
||||
/// Decodes the UTF-8 bytes of `string` into a list of `u32` code points,
/// one list element per encoded sequence.
///
/// The result is allocated once, with one element per input byte for a
/// small string and one element per byte of `capacity()` otherwise, and its
/// length is then set to the number of sequences actually decoded.
///
/// Nothing is validated here: the lead byte alone decides the sequence
/// length (anything that is not ASCII, `110xxxxx` or `1110xxxx` is decoded
/// as a four-byte sequence) and continuation bytes are fetched with
/// `getUnchecked`.
/// NOTE(review): presumably callers guarantee well-formed UTF-8 — confirm.
fn strToScalars(string: RocStr) callconv(.C) RocList {
    const byte_count = string.len();

    if (byte_count == 0) return RocList.empty();

    // For purposes of preallocation, assume there are as many code points
    // as bytes. That may be more room than needed, but it means a second
    // allocation should never be required.
    const capacity = if (string.isSmallStr()) byte_count else string.capacity();

    var answer = RocList.allocate(@alignOf(u32), capacity, @sizeOf(u32));

    // The early return above already established the string is nonempty.
    const scalars = answer.elements(u32) orelse unreachable;

    var read_at: usize = 0;
    var written: usize = 0;

    while (read_at < byte_count) : (written += 1) {
        const lead = string.getUnchecked(read_at);

        // How UTF-8 bytes work:
        // https://docs.teradata.com/r/Teradata-Database-International-Character-Set-Support/June-2017/Client-Character-Set-Options/UTF8-Client-Character-Set-Support/UTF8-Multibyte-Sequences
        //
        // The high order bits of the lead byte give the sequence length:
        //   0xxx xxxx -> 1 byte (ASCII)
        //   110x xxxx -> 2 bytes
        //   1110 xxxx -> 3 bytes
        //   otherwise -> 4 bytes (the high order bits should be 11110)
        const seq_len: usize = if (lead <= 127)
            1
        else if (lead >> 5 == 0b110)
            2
        else if (lead >> 4 == 0b1110)
            3
        else
            4;

        // Mask that discards the lead byte's length-marker bits.
        const lead_mask: u8 = switch (seq_len) {
            1 => 0b0111_1111,
            2 => 0b0001_1111,
            3 => 0b0000_1111,
            else => 0b0000_0111,
        };

        // Example (two-byte sequence):
        //   utf-8:   1100 1111   1011 0001
        //   code pt: 0000 0011   1111 0001 (decimal: 1009)
        var code_pt: u32 = lead & lead_mask;

        // Each following byte drops its high order bits of 10 and
        // contributes its low six bits to the code point.
        var offset: usize = 1;
        while (offset < seq_len) : (offset += 1) {
            code_pt <<= 6;
            code_pt |= string.getUnchecked(read_at + offset) & 0b0011_1111;
        }

        scalars[written] = code_pt;
        read_at += seq_len;
    }

    answer.length = written;

    return answer;
}
|
||||
|
||||
test "strToScalars: empty string" {
    // The empty string should decode to the empty list.
    const input = RocStr.fromSlice("");
    defer input.deinit();

    const want = RocList.empty();
    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
|
||||
|
||||
test "strToScalars: One ASCII char" {
    const input = RocStr.fromSlice("R");
    defer input.deinit();

    // 'R' is code point 82.
    const scalars = [_]u32{82};
    const want = RocList.fromSlice(u32, &scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
|
||||
|
||||
test "strToScalars: Multiple ASCII chars" {
    const input = RocStr.fromSlice("Roc!");
    defer input.deinit();

    // One code point per ASCII byte: 'R', 'o', 'c', '!'.
    const scalars = [_]u32{ 82, 111, 99, 33 };
    const want = RocList.fromSlice(u32, &scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
|
||||
|
||||
test "strToScalars: One 2-byte UTF-8 character" {
    const input = RocStr.fromSlice("é");
    defer input.deinit();

    // A single two-byte sequence should collapse to one code point.
    const scalars = [_]u32{233};
    const want = RocList.fromSlice(u32, &scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
|
||||
|
||||
test "strToScalars: Multiple 2-byte UTF-8 characters" {
    const input = RocStr.fromSlice("Cäfés");
    defer input.deinit();

    // ASCII and two-byte sequences interleaved.
    const scalars = [_]u32{ 67, 228, 102, 233, 115 };
    const want = RocList.fromSlice(u32, &scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
|
||||
|
||||
test "strToScalars: One 3-byte UTF-8 character" {
    const input = RocStr.fromSlice("鹏");
    defer input.deinit();

    // A single three-byte sequence should collapse to one code point.
    const scalars = [_]u32{40527};
    const want = RocList.fromSlice(u32, &scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
|
||||
|
||||
test "strToScalars: Multiple 3-byte UTF-8 characters" {
    const input = RocStr.fromSlice("鹏很有趣");
    defer input.deinit();

    // Four consecutive three-byte sequences.
    const scalars = [_]u32{ 40527, 24456, 26377, 36259 };
    const want = RocList.fromSlice(u32, &scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
|
||||
|
||||
test "strToScalars: One 4-byte UTF-8 character" {
    // Sample character taken from
    // https://design215.com/toolbox/utf8-4byte-characters.php
    const input = RocStr.fromSlice("𒀀");
    defer input.deinit();

    const scalars = [_]u32{73728};
    const want = RocList.fromSlice(u32, &scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
|
||||
|
||||
test "strToScalars: Multiple 4-byte UTF-8 characters" {
    // Sample characters taken from
    // https://design215.com/toolbox/utf8-4byte-characters.php
    const input = RocStr.fromSlice("𒀀𒀁");
    defer input.deinit();

    const scalars = [_]u32{ 73728, 73729 };
    const want = RocList.fromSlice(u32, &scalars);
    defer want.deinit(u32);

    const got = strToScalars(input);
    defer got.deinit(u32);

    try expect(got.eql(want));
}
|
||||
|
||||
// Str.fromInt
|
||||
pub fn exportFromInt(comptime T: type, comptime name: []const u8) void {
|
||||
comptime var f = struct {
|
||||
|
|
|
|||
|
|
@ -183,7 +183,7 @@ pub fn decref(
|
|||
|
||||
var bytes = bytes_or_null orelse return;
|
||||
|
||||
const isizes: [*]isize = @ptrCast([*]isize, @alignCast(@sizeOf(isize), bytes));
|
||||
const isizes: [*]isize = @ptrCast([*]isize, @alignCast(@alignOf(isize), bytes));
|
||||
|
||||
decref_ptr_to_refcount(isizes - 1, alignment);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue