mirror of
https://github.com/roc-lang/roc.git
synced 2025-07-24 06:55:15 +00:00
Merge pull request #5142 from roc-lang/seamless-slices-str
Seamless slices str
This commit is contained in:
commit
a80b25d044
8 changed files with 274 additions and 164 deletions
|
@ -155,6 +155,7 @@ comptime {
|
|||
exportStrFn(str.strCloneTo, "clone_to");
|
||||
exportStrFn(str.withCapacity, "with_capacity");
|
||||
exportStrFn(str.strGraphemes, "graphemes");
|
||||
exportStrFn(str.strRefcountPtr, "refcount_ptr");
|
||||
|
||||
inline for (INTEGERS) |T| {
|
||||
str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");
|
||||
|
|
|
@ -18,6 +18,7 @@ const InPlace = enum(u8) {
|
|||
|
||||
const MASK_ISIZE: isize = std.math.minInt(isize);
|
||||
const MASK: usize = @bitCast(usize, MASK_ISIZE);
|
||||
const SEAMLESS_SLICE_BIT: usize = MASK;
|
||||
|
||||
const SMALL_STR_MAX_LENGTH = SMALL_STRING_SIZE - 1;
|
||||
const SMALL_STRING_SIZE = @sizeOf(RocStr);
|
||||
|
@ -57,18 +58,35 @@ pub const RocStr = extern struct {
|
|||
return result;
|
||||
}
|
||||
|
||||
pub fn fromByteList(list: RocList) RocStr {
|
||||
// TODO: upon adding string seamless slices, I believe this branch can be changed to bit manipulation.
|
||||
// This requires that the list is non-null.
|
||||
// It also requires that start and count define a slice that does not go outside the bounds of the list.
|
||||
pub fn fromSubListUnsafe(list: RocList, start: usize, count: usize, update_mode: UpdateMode) RocStr {
|
||||
const start_byte = @ptrCast([*]u8, list.bytes) + start;
|
||||
if (list.isSeamlessSlice()) {
|
||||
// Str doesn't have seamless slices yet.
|
||||
// Need to copy.
|
||||
return RocStr.init(@ptrCast([*]const u8, list.bytes), list.length);
|
||||
return RocStr{
|
||||
.str_bytes = start_byte,
|
||||
.str_len = count | SEAMLESS_SLICE_BIT,
|
||||
.str_capacity = list.capacity_or_ref_ptr & (~SEAMLESS_SLICE_BIT),
|
||||
};
|
||||
} else if (start == 0 and (update_mode == .InPlace or list.isUnique())) {
|
||||
// Rare case, we can take over the original list.
|
||||
return RocStr{
|
||||
.str_bytes = start_byte,
|
||||
.str_len = count,
|
||||
.str_capacity = list.capacity_or_ref_ptr, // This is guaranteed to be a proper capacity.
|
||||
};
|
||||
} else {
|
||||
// Create seamless slice pointing to the list.
|
||||
return RocStr{
|
||||
.str_bytes = start_byte,
|
||||
.str_len = count | SEAMLESS_SLICE_BIT,
|
||||
.str_capacity = @ptrToInt(list.bytes) >> 1,
|
||||
};
|
||||
}
|
||||
return RocStr{
|
||||
.str_bytes = list.bytes,
|
||||
.str_len = list.length,
|
||||
.str_capacity = list.capacity_or_ref_ptr, // This is guaranteed to be a proper capacity.
|
||||
};
|
||||
}
|
||||
|
||||
// Reports whether this string is a seamless slice.
// A small string is never one; for any other string the answer is whether
// str_len, reinterpreted as a signed integer, is negative (highest bit set).
pub fn isSeamlessSlice(self: RocStr) bool {
    if (self.isSmallStr()) {
        return false;
    }
    const signed_len = @bitCast(isize, self.str_len);
    return signed_len < 0;
}
|
||||
|
||||
pub fn fromSlice(slice: []const u8) RocStr {
|
||||
|
@ -103,9 +121,39 @@ pub const RocStr = extern struct {
|
|||
}
|
||||
}
|
||||
|
||||
// Returns a word of all ones when the highest bit of str_len is set
// (the seamless slice flag), and all zeros otherwise.
// Computed with an arithmetic shift instead of a branch for optimization purposes.
pub fn seamlessSliceMask(self: RocStr) usize {
    const signed_len = @bitCast(isize, self.str_len);
    // Shifting a signed value right by (bit width - 1) fills every bit with the sign bit.
    const sign_fill = signed_len >> (@bitSizeOf(isize) - 1);
    return @bitCast(usize, sign_fill);
}
|
||||
|
||||
// Returns a pointer to just after the refcount.
// It is just after the refcount as an optimization for other shared code paths.
// For a regular string, this is simply the str_bytes pointer.
// For a seamless slice, this is the value stored in str_capacity, shifted left by one bit.
// The choice between the two is made with seamlessSliceMask rather than a branch.
// This does not return a valid value if the input is a small string.
pub fn getRefcountPtr(self: RocStr) ?[*]u8 {
    const mask = self.seamlessSliceMask();
    // Kept only when the mask is all zeros (regular string).
    const from_bytes = @ptrToInt(self.str_bytes) & ~mask;
    // Kept only when the mask is all ones (seamless slice).
    const from_capacity = (self.str_capacity << 1) & mask;
    return @intToPtr(?[*]u8, from_bytes | from_capacity);
}
|
||||
|
||||
// Increments the refcount of this string by `n`.
// Small strings are skipped, and so are strings whose refcount pointer is null.
pub fn incref(self: RocStr, n: usize) void {
    if (self.isSmallStr()) return;

    const ref_ptr = self.getRefcountPtr();
    if (ref_ptr == null) return;

    // getRefcountPtr points just after the refcount, so the refcount itself
    // is the isize immediately before it.
    const after_refcount: [*]isize = @ptrCast([*]isize, @alignCast(@alignOf(isize), ref_ptr));
    const refcount = @ptrCast(*isize, after_refcount - 1);
    utils.increfC(refcount, @intCast(isize, n));
}
|
||||
|
||||
pub fn decref(self: RocStr) void {
|
||||
if (!self.isSmallStr()) {
|
||||
utils.decref(self.str_bytes, self.str_capacity, RocStr.alignment);
|
||||
utils.decref(self.getRefcountPtr(), self.str_capacity, RocStr.alignment);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -126,20 +174,11 @@ pub const RocStr = extern struct {
|
|||
// Now we have to look at the string contents
|
||||
const self_bytes = self.asU8ptr();
|
||||
const other_bytes = other.asU8ptr();
|
||||
|
||||
// It's faster to compare pointer-sized words rather than bytes, as far as possible
|
||||
// The bytes are always pointer-size aligned due to the refcount
|
||||
const self_words = @ptrCast([*]const usize, @alignCast(@alignOf(usize), self_bytes));
|
||||
const other_words = @ptrCast([*]const usize, @alignCast(@alignOf(usize), other_bytes));
|
||||
var w: usize = 0;
|
||||
while (w < self_len / @sizeOf(usize)) : (w += 1) {
|
||||
if (self_words[w] != other_words[w]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Compare the leftover bytes
|
||||
var b = w * @sizeOf(usize);
|
||||
// TODO: we can make an optimization like memcmp does in glibc.
|
||||
// We can check the min shared alignment 1, 2, 4, or 8.
|
||||
// Then do a copy at that alignment before falling back on one byte at a time.
|
||||
// Currently we have to be unaligned because slices can be at any alignment.
|
||||
var b: usize = 0;
|
||||
while (b < self_len) : (b += 1) {
|
||||
if (self_bytes[b] != other_bytes[b]) {
|
||||
return false;
|
||||
|
@ -172,7 +211,7 @@ pub const RocStr = extern struct {
|
|||
const element_width = 1;
|
||||
const old_capacity = self.getCapacity();
|
||||
|
||||
if (self.isSmallStr() or !self.isUnique()) {
|
||||
if (self.isSmallStr() or self.isSeamlessSlice() or !self.isUnique()) {
|
||||
return self.reallocateFresh(new_length);
|
||||
}
|
||||
|
||||
|
@ -238,7 +277,7 @@ pub const RocStr = extern struct {
|
|||
if (self.isSmallStr()) {
|
||||
return self.asArray()[@sizeOf(RocStr) - 1] ^ 0b1000_0000;
|
||||
} else {
|
||||
return self.str_len;
|
||||
return self.str_len & (~SEAMLESS_SLICE_BIT);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -246,13 +285,15 @@ pub const RocStr = extern struct {
|
|||
if (self.isSmallStr()) {
|
||||
self.asU8ptrMut()[@sizeOf(RocStr) - 1] = @intCast(u8, length) | 0b1000_0000;
|
||||
} else {
|
||||
self.str_len = length;
|
||||
self.str_len = length | (SEAMLESS_SLICE_BIT & self.str_len);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn getCapacity(self: RocStr) usize {
|
||||
if (self.isSmallStr()) {
|
||||
return SMALL_STR_MAX_LENGTH;
|
||||
} else if (self.isSeamlessSlice()) {
|
||||
return self.str_len & (~SEAMLESS_SLICE_BIT);
|
||||
} else {
|
||||
return self.str_capacity;
|
||||
}
|
||||
|
@ -299,6 +340,9 @@ pub const RocStr = extern struct {
|
|||
// then the next byte is off the end of the struct;
|
||||
// in that case, we are also not null-terminated!
|
||||
return length != 0 and length != longest_small_str;
|
||||
} else if (self.isSeamlessSlice()) {
|
||||
// Seamless slices can not use the character past the end even if it is null.
|
||||
return false;
|
||||
} else {
|
||||
// This is a big string, and it's not empty, so we can safely
|
||||
// dereference the pointer.
|
||||
|
@ -334,7 +378,7 @@ pub const RocStr = extern struct {
|
|||
}
|
||||
|
||||
fn refcountMachine(self: RocStr) usize {
|
||||
if (self.getCapacity() == 0 or self.isSmallStr()) {
|
||||
if ((self.getCapacity() == 0 and !self.isSeamlessSlice()) or self.isSmallStr()) {
|
||||
return utils.REFCOUNT_ONE;
|
||||
}
|
||||
|
||||
|
@ -811,6 +855,27 @@ pub fn strSplit(string: RocStr, delimiter: RocStr) callconv(.C) RocList {
|
|||
return list;
|
||||
}
|
||||
|
||||
// Builds a string from `len` bytes starting at `slice_bytes` using RocStr.init.
// The third argument is ignored; it is present so that this function and
// initFromBigStr share one signature and can be selected between at runtime.
fn initFromSmallStr(slice_bytes: [*]u8, len: usize, _: usize) RocStr {
    const result = RocStr.init(slice_bytes, len);
    return result;
}
|
||||
|
||||
// Builds a seamless slice over `len` bytes starting at `slice_bytes`,
// instead of copying the bytes into a new string.
// `ref_ptr` must already be shifted so that it is ready to be stored in a
// seamless slice; it is written to str_capacity unchanged.
fn initFromBigStr(slice_bytes: [*]u8, len: usize, ref_ptr: usize) RocStr {
    // The seamless slice flag lives in the highest bit of the length.
    const flagged_len = len | SEAMLESS_SLICE_BIT;
    return RocStr{
        .str_capacity = ref_ptr,
        .str_len = flagged_len,
        .str_bytes = slice_bytes,
    };
}
|
||||
|
||||
// TODO: replace this with @qualCast or @constCast in a future version of Zig.
// Drops the const qualifier from a many-item byte pointer.
// The returned pointer holds the same address as `ptr`.
fn constCast(ptr: [*]const u8) [*]u8 {
    const address = @ptrToInt(ptr);
    return @intToPtr([*]u8, address);
}
|
||||
|
||||
fn strSplitHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
|
||||
var ret_array_index: usize = 0;
|
||||
var slice_start_index: usize = 0;
|
||||
|
@ -818,6 +883,11 @@ fn strSplitHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
|
|||
|
||||
const str_bytes = string.asU8ptr();
|
||||
const str_len = string.len();
|
||||
const ref_ptr = @ptrToInt(string.getRefcountPtr()) >> 1;
|
||||
const init_fn = if (string.isSmallStr())
|
||||
initFromSmallStr
|
||||
else
|
||||
initFromBigStr;
|
||||
|
||||
const delimiter_bytes_ptrs = delimiter.asU8ptr();
|
||||
const delimiter_len = delimiter.len();
|
||||
|
@ -849,7 +919,7 @@ fn strSplitHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
|
|||
if (matches_delimiter) {
|
||||
const segment_len: usize = str_index - slice_start_index;
|
||||
|
||||
array[ret_array_index] = RocStr.init(str_bytes + slice_start_index, segment_len);
|
||||
array[ret_array_index] = init_fn(constCast(str_bytes) + slice_start_index, segment_len, ref_ptr);
|
||||
slice_start_index = str_index + delimiter_len;
|
||||
ret_array_index += 1;
|
||||
str_index += delimiter_len;
|
||||
|
@ -859,7 +929,12 @@ fn strSplitHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
|
|||
}
|
||||
}
|
||||
|
||||
array[ret_array_index] = RocStr.init(str_bytes + slice_start_index, str_len - slice_start_index);
|
||||
array[ret_array_index] = init_fn(constCast(str_bytes) + slice_start_index, str_len - slice_start_index, ref_ptr);
|
||||
|
||||
if (!string.isSmallStr()) {
|
||||
// Correct refcount for all of the splits made.
|
||||
string.incref(ret_array_index + 1);
|
||||
}
|
||||
}
|
||||
|
||||
test "strSplitHelp: empty delimiter" {
|
||||
|
@ -1354,6 +1429,12 @@ pub fn strGraphemes(roc_str: RocStr) callconv(.C) RocList {
|
|||
var index: usize = 0;
|
||||
var last_codepoint_len: u8 = 0;
|
||||
|
||||
const ref_ptr = @ptrToInt(roc_str.getRefcountPtr()) >> 1;
|
||||
const init_fn = if (roc_str.isSmallStr())
|
||||
initFromSmallStr
|
||||
else
|
||||
initFromBigStr;
|
||||
|
||||
var result = RocList.allocate(@alignOf(RocStr), countGraphemeClusters(roc_str), @sizeOf(RocStr));
|
||||
const graphemes = result.elements(RocStr) orelse return result;
|
||||
var slice = roc_str.asSlice();
|
||||
|
@ -1364,7 +1445,7 @@ pub fn strGraphemes(roc_str: RocStr) callconv(.C) RocList {
|
|||
if (opt_last_codepoint) |last_codepoint| {
|
||||
var did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, &break_state);
|
||||
if (did_break) {
|
||||
graphemes[index] = RocStr.fromSlice(slice[0..last_codepoint_len]);
|
||||
graphemes[index] = init_fn(constCast(slice.ptr), last_codepoint_len, ref_ptr);
|
||||
slice = slice[last_codepoint_len..];
|
||||
index += 1;
|
||||
break_state = null;
|
||||
|
@ -1375,7 +1456,12 @@ pub fn strGraphemes(roc_str: RocStr) callconv(.C) RocList {
|
|||
opt_last_codepoint = cur_codepoint;
|
||||
}
|
||||
// Append last grapheme
|
||||
graphemes[index] = RocStr.fromSlice(slice);
|
||||
graphemes[index] = init_fn(constCast(slice.ptr), slice.len, ref_ptr);
|
||||
|
||||
if (!roc_str.isSmallStr()) {
|
||||
// Correct refcount for all of the splits made.
|
||||
roc_str.incref(index + 1);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -1782,7 +1868,8 @@ inline fn strToBytes(arg: RocStr) RocList {
|
|||
|
||||
return RocList{ .length = length, .bytes = ptr, .capacity_or_ref_ptr = length };
|
||||
} else {
|
||||
return RocList{ .length = length, .bytes = arg.str_bytes, .capacity_or_ref_ptr = arg.str_capacity };
|
||||
const is_seamless_slice = arg.str_len & SEAMLESS_SLICE_BIT;
|
||||
return RocList{ .length = length, .bytes = arg.str_bytes, .capacity_or_ref_ptr = arg.str_capacity | is_seamless_slice };
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1809,37 +1896,26 @@ pub fn fromUtf8RangeC(
|
|||
}
|
||||
|
||||
pub fn fromUtf8Range(arg: RocList, start: usize, count: usize, update_mode: UpdateMode) FromUtf8Result {
|
||||
if (arg.len() == 0 or count == 0) {
|
||||
arg.decref(RocStr.alignment);
|
||||
return FromUtf8Result{
|
||||
.is_ok = true,
|
||||
.string = RocStr.empty(),
|
||||
.byte_index = 0,
|
||||
.problem_code = Utf8ByteProblem.InvalidStartByte,
|
||||
};
|
||||
}
|
||||
const bytes = @ptrCast([*]const u8, arg.bytes)[start..count];
|
||||
|
||||
if (isValidUnicode(bytes)) {
|
||||
// the output will be correct. Now we need to clone the input
|
||||
|
||||
// TODO: rework this to properly take advantage of seamless slices.
|
||||
if (count == arg.len() and count > SMALL_STR_MAX_LENGTH) {
|
||||
const byte_list = arg.makeUniqueExtra(RocStr.alignment, @sizeOf(u8), update_mode);
|
||||
|
||||
const string = RocStr.fromByteList(byte_list);
|
||||
|
||||
return FromUtf8Result{
|
||||
.is_ok = true,
|
||||
.string = string,
|
||||
.byte_index = 0,
|
||||
.problem_code = Utf8ByteProblem.InvalidStartByte,
|
||||
};
|
||||
} else {
|
||||
// turn the bytes into a small string
|
||||
const string = RocStr.init(@ptrCast([*]const u8, bytes), count);
|
||||
|
||||
// decref the list
|
||||
arg.decref(RocStr.alignment);
|
||||
|
||||
return FromUtf8Result{
|
||||
.is_ok = true,
|
||||
.string = string,
|
||||
.byte_index = 0,
|
||||
.problem_code = Utf8ByteProblem.InvalidStartByte,
|
||||
};
|
||||
}
|
||||
// Make a seamless slice of the input.
|
||||
const string = RocStr.fromSubListUnsafe(arg, start, count, update_mode);
|
||||
return FromUtf8Result{
|
||||
.is_ok = true,
|
||||
.string = string,
|
||||
.byte_index = 0,
|
||||
.problem_code = Utf8ByteProblem.InvalidStartByte,
|
||||
};
|
||||
} else {
|
||||
const temp = errorToProblem(@ptrCast([*]u8, arg.bytes), arg.length);
|
||||
|
||||
|
@ -1988,7 +2064,9 @@ test "validateUtf8Bytes: ascii" {
|
|||
const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
|
||||
const list = sliceHelp(ptr, raw.len);
|
||||
|
||||
try expectOk(validateUtf8BytesX(list));
|
||||
const str_result = validateUtf8BytesX(list);
|
||||
defer str_result.string.decref();
|
||||
try expectOk(str_result);
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: unicode œ" {
|
||||
|
@ -1996,7 +2074,9 @@ test "validateUtf8Bytes: unicode œ" {
|
|||
const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
|
||||
const list = sliceHelp(ptr, raw.len);
|
||||
|
||||
try expectOk(validateUtf8BytesX(list));
|
||||
const str_result = validateUtf8BytesX(list);
|
||||
defer str_result.string.decref();
|
||||
try expectOk(str_result);
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: unicode ∆" {
|
||||
|
@ -2004,7 +2084,9 @@ test "validateUtf8Bytes: unicode ∆" {
|
|||
const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
|
||||
const list = sliceHelp(ptr, raw.len);
|
||||
|
||||
try expectOk(validateUtf8BytesX(list));
|
||||
const str_result = validateUtf8BytesX(list);
|
||||
defer str_result.string.decref();
|
||||
try expectOk(str_result);
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: emoji" {
|
||||
|
@ -2012,7 +2094,9 @@ test "validateUtf8Bytes: emoji" {
|
|||
const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
|
||||
const list = sliceHelp(ptr, raw.len);
|
||||
|
||||
try expectOk(validateUtf8BytesX(list));
|
||||
const str_result = validateUtf8BytesX(list);
|
||||
defer str_result.string.decref();
|
||||
try expectOk(str_result);
|
||||
}
|
||||
|
||||
test "validateUtf8Bytes: unicode ∆ in middle of array" {
|
||||
|
@ -2020,7 +2104,9 @@ test "validateUtf8Bytes: unicode ∆ in middle of array" {
|
|||
const ptr: [*]const u8 = @ptrCast([*]const u8, raw);
|
||||
const list = sliceHelp(ptr, raw.len);
|
||||
|
||||
try expectOk(validateUtf8BytesX(list));
|
||||
const str_result = validateUtf8BytesX(list);
|
||||
defer str_result.string.decref();
|
||||
try expectOk(str_result);
|
||||
}
|
||||
|
||||
fn expectErr(list: RocList, index: usize, err: Utf8DecodeError, problem: Utf8ByteProblem) !void {
|
||||
|
@ -2168,33 +2254,31 @@ pub fn strTrim(input_string: RocStr) callconv(.C) RocStr {
|
|||
const trailing_bytes = countTrailingWhitespaceBytes(string);
|
||||
const new_len = original_len - leading_bytes - trailing_bytes;
|
||||
|
||||
if (string.isSmallStr() or !string.isRefcountOne()) {
|
||||
// consume the input string; this will not free the
|
||||
// bytes because the string is small or shared
|
||||
const result = RocStr.init(string.asU8ptr() + leading_bytes, new_len);
|
||||
|
||||
string.decref();
|
||||
|
||||
return result;
|
||||
} else {
|
||||
// nonempty, large, and unique: shift everything over in-place if necessary.
|
||||
// Note: must use memmove over memcpy, because the bytes definitely overlap!
|
||||
if (leading_bytes > 0) {
|
||||
// Zig doesn't seem to have `memmove` in the stdlib anymore; this is based on:
|
||||
// https://github.com/ziglang/zig/blob/52ba2c3a43a88a4db30cff47f2f3eff8c3d5be19/lib/std/special/c.zig#L115
|
||||
// Copyright Andrew Kelley, MIT licensed.
|
||||
const src = bytes_ptr + leading_bytes;
|
||||
var index: usize = 0;
|
||||
|
||||
while (index != new_len) : (index += 1) {
|
||||
bytes_ptr[index] = src[index];
|
||||
}
|
||||
}
|
||||
|
||||
if (string.isSmallStr()) {
|
||||
// Just create another small string of the correct bytes.
|
||||
// No need to decref because it is a small string.
|
||||
return RocStr.init(string.asU8ptr() + leading_bytes, new_len);
|
||||
} else if (leading_bytes == 0 and string.isUnique()) {
|
||||
// Big and unique with no leading bytes to remove.
|
||||
// Just take ownership and shrink the length.
|
||||
var new_string = string;
|
||||
new_string.str_len = new_len;
|
||||
|
||||
return new_string;
|
||||
} else if (string.isSeamlessSlice()) {
|
||||
// Already a seamless slice, just update the range.
|
||||
return RocStr{
|
||||
.str_bytes = bytes_ptr + leading_bytes,
|
||||
.str_len = new_len | SEAMLESS_SLICE_BIT,
|
||||
.str_capacity = string.str_capacity,
|
||||
};
|
||||
} else {
|
||||
// Not unique or removing leading bytes, just make a slice.
|
||||
return RocStr{
|
||||
.str_bytes = bytes_ptr + leading_bytes,
|
||||
.str_len = new_len | SEAMLESS_SLICE_BIT,
|
||||
.str_capacity = @ptrToInt(bytes_ptr) >> 1,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2218,33 +2302,31 @@ pub fn strTrimLeft(input_string: RocStr) callconv(.C) RocStr {
|
|||
|
||||
const new_len = original_len - leading_bytes;
|
||||
|
||||
if (string.isSmallStr() or !string.isRefcountOne()) {
|
||||
// if the trimmed string fits in a small string,
|
||||
// make the result a small string and decref the original string
|
||||
const result = RocStr.init(string.asU8ptr() + leading_bytes, new_len);
|
||||
|
||||
string.decref();
|
||||
|
||||
return result;
|
||||
} else {
|
||||
// nonempty, large, and unique: shift everything over in-place if necessary.
|
||||
// Note: must use memmove over memcpy, because the bytes definitely overlap!
|
||||
if (leading_bytes > 0) {
|
||||
// Zig doesn't seem to have `memmove` in the stdlib anymore; this is based on:
|
||||
// https://github.com/ziglang/zig/blob/52ba2c3a43a88a4db30cff47f2f3eff8c3d5be19/lib/std/special/c.zig#L115
|
||||
// Copyright Andrew Kelley, MIT licensed.
|
||||
const src = bytes_ptr + leading_bytes;
|
||||
var index: usize = 0;
|
||||
|
||||
while (index != new_len) : (index += 1) {
|
||||
bytes_ptr[index] = src[index];
|
||||
}
|
||||
}
|
||||
|
||||
if (string.isSmallStr()) {
|
||||
// Just create another small string of the correct bytes.
|
||||
// No need to decref because it is a small string.
|
||||
return RocStr.init(string.asU8ptr() + leading_bytes, new_len);
|
||||
} else if (leading_bytes == 0 and string.isUnique()) {
|
||||
// Big and unique with no leading bytes to remove.
|
||||
// Just take ownership and shrink the length.
|
||||
var new_string = string;
|
||||
new_string.str_len = new_len;
|
||||
|
||||
return new_string;
|
||||
} else if (string.isSeamlessSlice()) {
|
||||
// Already a seamless slice, just update the range.
|
||||
return RocStr{
|
||||
.str_bytes = bytes_ptr + leading_bytes,
|
||||
.str_len = new_len | SEAMLESS_SLICE_BIT,
|
||||
.str_capacity = string.str_capacity,
|
||||
};
|
||||
} else {
|
||||
// Not unique or removing leading bytes, just make a slice.
|
||||
return RocStr{
|
||||
.str_bytes = bytes_ptr + leading_bytes,
|
||||
.str_len = new_len | SEAMLESS_SLICE_BIT,
|
||||
.str_capacity = @ptrToInt(bytes_ptr) >> 1,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2268,27 +2350,32 @@ pub fn strTrimRight(input_string: RocStr) callconv(.C) RocStr {
|
|||
|
||||
const new_len = original_len - trailing_bytes;
|
||||
|
||||
if (string.isSmallStr() or !string.isRefcountOne()) {
|
||||
const result = RocStr.init(string.asU8ptr(), new_len);
|
||||
if (string.isSmallStr()) {
|
||||
// Just create another small string of the correct bytes.
|
||||
// No need to decref because it is a small string.
|
||||
return RocStr.init(string.asU8ptr(), new_len);
|
||||
} else if (string.isUnique()) {
|
||||
// Big and unique with no leading bytes to remove.
|
||||
// Just take ownership and shrink the length.
|
||||
var new_string = string;
|
||||
new_string.str_len = new_len;
|
||||
|
||||
string.decref();
|
||||
|
||||
return result;
|
||||
return new_string;
|
||||
} else if (string.isSeamlessSlice()) {
|
||||
// Already a seamless slice, just update the range.
|
||||
return RocStr{
|
||||
.str_bytes = bytes_ptr,
|
||||
.str_len = new_len | SEAMLESS_SLICE_BIT,
|
||||
.str_capacity = string.str_capacity,
|
||||
};
|
||||
} else {
|
||||
// Not unique, just make a slice.
|
||||
return RocStr{
|
||||
.str_bytes = bytes_ptr,
|
||||
.str_len = new_len | SEAMLESS_SLICE_BIT,
|
||||
.str_capacity = @ptrToInt(bytes_ptr) >> 1,
|
||||
};
|
||||
}
|
||||
|
||||
// nonempty, large, and unique:
|
||||
|
||||
var i: usize = 0;
|
||||
while (i < new_len) : (i += 1) {
|
||||
const dest = bytes_ptr + i;
|
||||
const source = dest;
|
||||
@memcpy(dest, source, 1);
|
||||
}
|
||||
|
||||
var new_string = string;
|
||||
new_string.str_len = new_len;
|
||||
|
||||
return new_string;
|
||||
}
|
||||
|
||||
fn countLeadingWhitespaceBytes(string: RocStr) usize {
|
||||
|
@ -2412,9 +2499,9 @@ test "strTrim: null byte" {
|
|||
test "strTrim: blank" {
|
||||
const original_bytes = " ";
|
||||
const original = RocStr.init(original_bytes, original_bytes.len);
|
||||
defer original.decref();
|
||||
|
||||
const trimmed = strTrim(original);
|
||||
defer trimmed.decref();
|
||||
|
||||
try expect(trimmed.eq(RocStr.empty()));
|
||||
}
|
||||
|
@ -2422,7 +2509,6 @@ test "strTrim: blank" {
|
|||
test "strTrim: large to large" {
|
||||
const original_bytes = " hello even more giant world ";
|
||||
const original = RocStr.init(original_bytes, original_bytes.len);
|
||||
defer original.decref();
|
||||
|
||||
try expect(!original.isSmallStr());
|
||||
|
||||
|
@ -2433,14 +2519,14 @@ test "strTrim: large to large" {
|
|||
try expect(!expected.isSmallStr());
|
||||
|
||||
const trimmed = strTrim(original);
|
||||
defer trimmed.decref();
|
||||
|
||||
try expect(trimmed.eq(expected));
|
||||
}
|
||||
|
||||
test "strTrim: large to small" {
|
||||
test "strTrim: large to small sized slice" {
|
||||
const original_bytes = " hello ";
|
||||
const original = RocStr.init(original_bytes, original_bytes.len);
|
||||
defer original.decref();
|
||||
|
||||
try expect(!original.isSmallStr());
|
||||
|
||||
|
@ -2452,11 +2538,10 @@ test "strTrim: large to small" {
|
|||
|
||||
try expect(original.isUnique());
|
||||
const trimmed = strTrim(original);
|
||||
defer trimmed.decref();
|
||||
|
||||
try expect(trimmed.eq(expected));
|
||||
try expect(!trimmed.isSmallStr());
|
||||
|
||||
try expect(trimmed.getCapacity() >= original.len());
|
||||
}
|
||||
|
||||
test "strTrim: small to small" {
|
||||
|
@ -2800,3 +2885,9 @@ pub fn strCloneTo(
|
|||
return extra_offset + slice.len;
|
||||
}
|
||||
}
|
||||
|
||||
// C calling convention wrapper around RocStr.getRefcountPtr.
// Returns whatever getRefcountPtr returns for `string`.
pub fn strRefcountPtr(
    string: RocStr,
) callconv(.C) ?[*]u8 {
    const ref_ptr = string.getRefcountPtr();
    return ref_ptr;
}
|
||||
|
|
|
@ -333,6 +333,7 @@ pub const STR_GET_SCALAR_UNSAFE: &str = "roc_builtins.str.get_scalar_unsafe";
|
|||
pub const STR_CLONE_TO: &str = "roc_builtins.str.clone_to";
|
||||
pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
|
||||
pub const STR_GRAPHEMES: &str = "roc_builtins.str.graphemes";
|
||||
pub const STR_REFCOUNT_PTR: &str = "roc_builtins.str.refcount_ptr";
|
||||
|
||||
pub const LIST_MAP: &str = "roc_builtins.list.map";
|
||||
pub const LIST_MAP2: &str = "roc_builtins.list.map2";
|
||||
|
|
|
@ -63,3 +63,19 @@ pub(crate) fn str_equal<'a, 'ctx, 'env>(
|
|||
bitcode::STR_EQUAL,
|
||||
)
|
||||
}
|
||||
|
||||
// Gets a pointer to just after the refcount for a list or seamless slice.
|
||||
// The value is just after the refcount so that normal lists and seamless slices can share code paths easily.
|
||||
pub(crate) fn str_refcount_ptr<'a, 'ctx, 'env>(
|
||||
env: &Env<'a, 'ctx, 'env>,
|
||||
value: BasicValueEnum<'ctx>,
|
||||
) -> PointerValue<'ctx> {
|
||||
call_str_bitcode_fn(
|
||||
env,
|
||||
&[value],
|
||||
&[],
|
||||
BitcodeReturns::Basic,
|
||||
bitcode::STR_REFCOUNT_PTR,
|
||||
)
|
||||
.into_pointer_value()
|
||||
}
|
||||
|
|
|
@ -8,12 +8,13 @@ use crate::llvm::build::{
|
|||
use crate::llvm::build_list::{
|
||||
incrementing_elem_loop, list_capacity_or_ref_ptr, list_refcount_ptr, load_list,
|
||||
};
|
||||
use crate::llvm::build_str::str_refcount_ptr;
|
||||
use crate::llvm::convert::{basic_type_from_layout, zig_str_type, RocUnion};
|
||||
use bumpalo::collections::Vec;
|
||||
use inkwell::basic_block::BasicBlock;
|
||||
use inkwell::module::Linkage;
|
||||
use inkwell::types::{AnyTypeEnum, BasicMetadataTypeEnum, BasicType, BasicTypeEnum};
|
||||
use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue, StructValue};
|
||||
use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue};
|
||||
use inkwell::{AddressSpace, IntPredicate};
|
||||
use roc_module::symbol::Interns;
|
||||
use roc_module::symbol::Symbol;
|
||||
|
@ -75,16 +76,6 @@ impl<'ctx> PointerToRefcount<'ctx> {
|
|||
}
|
||||
}
|
||||
|
||||
fn from_list_wrapper(env: &Env<'_, 'ctx, '_>, list_wrapper: StructValue<'ctx>) -> Self {
|
||||
let data_ptr = env
|
||||
.builder
|
||||
.build_extract_value(list_wrapper, Builtin::WRAPPER_PTR, "read_list_ptr")
|
||||
.unwrap()
|
||||
.into_pointer_value();
|
||||
|
||||
Self::from_ptr_to_data(env, data_ptr)
|
||||
}
|
||||
|
||||
pub fn is_1<'a, 'env>(&self, env: &Env<'a, 'ctx, 'env>) -> IntValue<'ctx> {
|
||||
let current = self.get_refcount(env);
|
||||
let one = match env.target_info.ptr_width() {
|
||||
|
@ -815,9 +806,9 @@ fn modify_refcount_str_help<'a, 'ctx, 'env>(
|
|||
|
||||
let parent = fn_val;
|
||||
|
||||
let arg_val =
|
||||
let str_type = zig_str_type(env);
|
||||
let str_wrapper =
|
||||
if Layout::Builtin(Builtin::Str).is_passed_by_reference(layout_interner, env.target_info) {
|
||||
let str_type = zig_str_type(env);
|
||||
env.builder
|
||||
.new_build_load(str_type, arg_val.into_pointer_value(), "load_str_to_stack")
|
||||
} else {
|
||||
|
@ -825,7 +816,7 @@ fn modify_refcount_str_help<'a, 'ctx, 'env>(
|
|||
debug_assert!(arg_val.is_struct_value());
|
||||
arg_val
|
||||
};
|
||||
let str_wrapper = arg_val.into_struct_value();
|
||||
let str_wrapper = str_wrapper.into_struct_value();
|
||||
|
||||
let capacity = builder
|
||||
.build_extract_value(str_wrapper, Builtin::WRAPPER_CAPACITY, "read_str_capacity")
|
||||
|
@ -848,7 +839,7 @@ fn modify_refcount_str_help<'a, 'ctx, 'env>(
|
|||
builder.build_conditional_branch(is_big_and_non_empty, modification_block, cont_block);
|
||||
builder.position_at_end(modification_block);
|
||||
|
||||
let refcount_ptr = PointerToRefcount::from_list_wrapper(env, str_wrapper);
|
||||
let refcount_ptr = PointerToRefcount::from_ptr_to_data(env, str_refcount_ptr(env, arg_val));
|
||||
let call_mode = mode_to_call_mode(fn_val, mode);
|
||||
refcount_ptr.modify(call_mode, layout, env, layout_interner);
|
||||
|
||||
|
|
|
@ -60,7 +60,10 @@ impl FromWasm32Memory for RocStr {
|
|||
let str_words: &[u32; 3] = unsafe { std::mem::transmute(&str_bytes) };
|
||||
|
||||
let big_elem_ptr = str_words[Builtin::WRAPPER_PTR as usize] as usize;
|
||||
let big_length = str_words[Builtin::WRAPPER_LEN as usize] as usize;
|
||||
// If the str is a seamless slice, its highest bit will be set to 1.
|
||||
// We need to remove that bit or we will get an incorrect negative length.
|
||||
// Since the wasm length is 32 bits, bitwise-AND it with i32::MAX (a 0 bit followed by all 1s in 32 bits).
|
||||
let big_length = str_words[Builtin::WRAPPER_LEN as usize] as usize & (i32::MAX as usize);
|
||||
let big_capacity = str_words[Builtin::WRAPPER_CAPACITY as usize] as usize;
|
||||
|
||||
let last_byte = str_bytes[11];
|
||||
|
|
|
@ -94,18 +94,18 @@ impl<T> RocList<T> {
|
|||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.length
|
||||
self.length & (isize::MAX as usize)
|
||||
}
|
||||
|
||||
pub fn is_seamless_slice(&self) -> bool {
|
||||
(self.capacity_or_ref_ptr as isize) < 0
|
||||
((self.length | self.capacity_or_ref_ptr) as isize) < 0
|
||||
}
|
||||
|
||||
pub fn capacity(&self) -> usize {
|
||||
if !self.is_seamless_slice() {
|
||||
self.capacity_or_ref_ptr
|
||||
} else {
|
||||
self.length
|
||||
self.len()
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -257,8 +257,10 @@ where
|
|||
|
||||
// Copy the old elements to the new allocation.
|
||||
unsafe {
|
||||
copy_nonoverlapping(elements.as_ptr(), new_elements.as_ptr(), self.length);
|
||||
copy_nonoverlapping(elements.as_ptr(), new_elements.as_ptr(), self.len());
|
||||
}
|
||||
// Clear the seamless slice bit since we now have clear ownership.
|
||||
self.length = self.len();
|
||||
|
||||
new_elements
|
||||
}
|
||||
|
@ -295,7 +297,7 @@ impl<T> RocList<T> {
|
|||
///
|
||||
/// May return a new RocList, if the provided one was not unique.
|
||||
pub fn reserve(&mut self, num_elems: usize) {
|
||||
let new_len = num_elems + self.length;
|
||||
let new_len = num_elems + self.len();
|
||||
let new_elems;
|
||||
let old_elements_ptr;
|
||||
|
||||
|
@ -338,7 +340,7 @@ impl<T> RocList<T> {
|
|||
|
||||
unsafe {
|
||||
// Copy the old elements to the new allocation.
|
||||
copy_nonoverlapping(old_elements_ptr, new_elems.as_ptr(), self.length);
|
||||
copy_nonoverlapping(old_elements_ptr, new_elems.as_ptr(), self.len());
|
||||
}
|
||||
|
||||
// Decrease the current allocation's reference count.
|
||||
|
@ -371,7 +373,7 @@ impl<T> RocList<T> {
|
|||
|
||||
self.update_to(Self {
|
||||
elements: Some(new_elems),
|
||||
length: self.length,
|
||||
length: self.len(),
|
||||
capacity_or_ref_ptr: new_len,
|
||||
});
|
||||
}
|
||||
|
@ -392,7 +394,7 @@ impl<T> Deref for RocList<T> {
|
|||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
if let Some(elements) = self.elements {
|
||||
let elements = ptr::slice_from_raw_parts(elements.as_ptr().cast::<T>(), self.length);
|
||||
let elements = ptr::slice_from_raw_parts(elements.as_ptr().cast::<T>(), self.len());
|
||||
|
||||
unsafe { &*elements }
|
||||
} else {
|
||||
|
@ -424,7 +426,7 @@ where
|
|||
{
|
||||
fn partial_cmp(&self, other: &RocList<U>) -> Option<cmp::Ordering> {
|
||||
// If one is longer than the other, use that as the ordering.
|
||||
match self.length.partial_cmp(&other.length) {
|
||||
match self.len().partial_cmp(&other.len()) {
|
||||
Some(Ordering::Equal) => {}
|
||||
ord => return ord,
|
||||
}
|
||||
|
@ -448,7 +450,7 @@ where
|
|||
{
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
// If one is longer than the other, use that as the ordering.
|
||||
match self.length.cmp(&other.length) {
|
||||
match self.len().cmp(&other.len()) {
|
||||
Ordering::Equal => {}
|
||||
ord => return ord,
|
||||
}
|
||||
|
|
|
@ -698,6 +698,11 @@ impl From<SendSafeRocStr> for RocStr {
|
|||
|
||||
#[repr(C)]
|
||||
union RocStrInner {
|
||||
// TODO: this really should be separated from the List type.
|
||||
// Due to length specifying seamless slices for Str and capacity for Lists they should not share the same code.
|
||||
// Currently, there are workarounds in RocList to handle both via removing the highest bit of length in many cases.
|
||||
// With glue changes, we should probably rewrite these cleanly to match what is in the zig bitcode.
|
||||
// It is definitely a bit stale now and I think the storage mechanism can be quite confusing with our extra pieces of state.
|
||||
heap_allocated: ManuallyDrop<RocList<u8>>,
|
||||
small_string: SmallString,
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue