add seamless slices for str

This commit is contained in:
Brendan Hansknecht 2023-03-13 22:26:04 -07:00
parent 24c403eba0
commit 3978059aa2
No known key found for this signature in database
GPG key ID: 0EA784685083E75B
5 changed files with 122 additions and 35 deletions

View file

@ -155,6 +155,7 @@ comptime {
exportStrFn(str.strCloneTo, "clone_to");
exportStrFn(str.withCapacity, "with_capacity");
exportStrFn(str.strGraphemes, "graphemes");
exportStrFn(str.strRefcountPtr, "refcount_ptr");
inline for (INTEGERS) |T| {
str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");

View file

@ -18,6 +18,7 @@ const InPlace = enum(u8) {
// Signed word with only the most significant (sign) bit set.
const MASK_ISIZE: isize = std.math.minInt(isize);
// The same bit pattern reinterpreted as an unsigned word.
const MASK: usize = @bitCast(usize, MASK_ISIZE);
// Bit that is OR-ed into str_len to mark a RocStr as a seamless slice.
const SEAMLESS_SLICE_BIT: usize = MASK;
// One less than the struct size: the last byte of the struct holds the
// small-string length, so it is not available for character data.
const SMALL_STR_MAX_LENGTH = SMALL_STRING_SIZE - 1;
// Size in bytes of the RocStr struct itself.
const SMALL_STRING_SIZE = @sizeOf(RocStr);
@ -58,11 +59,12 @@ pub const RocStr = extern struct {
}
pub fn fromByteList(list: RocList) RocStr {
// TODO: upon adding string seamless slices, I believe this branch can be changed to bit manipulation.
if (list.isSeamlessSlice()) {
// Str doesn't have seamless slices yet.
// Need to copy.
return RocStr.init(@ptrCast([*]const u8, list.bytes), list.length);
return RocStr{
.str_bytes = list.bytes,
.str_len = list.length | SEAMLESS_SLICE_BIT,
.str_capacity = list.capacity_or_ref_ptr & (~SEAMLESS_SLICE_BIT),
};
}
return RocStr{
.str_bytes = list.bytes,
@ -71,6 +73,10 @@ pub const RocStr = extern struct {
};
}
// Reports whether this string is a seamless slice.
// Small strings are never treated as slices; for every other string the
// sign bit of str_len is the marker.
pub fn isSeamlessSlice(self: RocStr) bool {
    if (self.isSmallStr()) {
        return false;
    }
    const signed_len = @bitCast(isize, self.str_len);
    return signed_len < 0;
}
// Builds a RocStr from a byte slice by forwarding the slice's pointer and
// length to RocStr.init.
pub fn fromSlice(slice: []const u8) RocStr {
    const bytes = slice.ptr;
    const length = slice.len;
    return RocStr.init(bytes, length);
}
@ -103,9 +109,39 @@ pub const RocStr = extern struct {
}
}
// Produces a word of all ones when the top bit of str_len is set (the
// seamless slice marker) and a word of all zeros otherwise.
// No branch is used: an arithmetic right shift of the signed length copies
// the sign bit into every bit position.
// Only str_len is inspected here; small strings are not checked for.
pub fn seamlessSliceMask(self: RocStr) usize {
    const signed_len = @bitCast(isize, self.str_len);
    const sign_fill = signed_len >> (@bitSizeOf(isize) - 1);
    return @bitCast(usize, sign_fill);
}
// Returns a pointer to the position just after the refcount.
// Pointing just after the refcount lets other shared code paths treat both
// kinds of string the same way.
// For a regular string this is simply its bytes pointer.
// For a seamless slice this is the value stored in str_capacity, shifted
// left by one bit to undo the shift applied when it was stored.
// The result is not valid when the input is a small string.
pub fn getRefcountPtr(self: RocStr) ?[*]u8 {
    const mask = self.seamlessSliceMask();
    // Exactly one of the two candidates survives the masking; the other
    // becomes zero, so OR-ing them selects the right one without branching.
    const from_bytes = @ptrToInt(self.str_bytes) & ~mask;
    const from_slice = (self.str_capacity << 1) & mask;
    return @intToPtr(?[*]u8, from_bytes | from_slice);
}
// Adds n to the refcount of the allocation backing this string.
// Nothing happens for small strings or when the refcount pointer is null.
pub fn incref(self: RocStr, n: usize) void {
    if (self.isSmallStr()) {
        return;
    }

    const ref_ptr = self.getRefcountPtr();
    if (ref_ptr == null) {
        return;
    }

    // getRefcountPtr points just past the refcount, so the refcount itself
    // is the isize-sized slot immediately before it.
    const after_refcount: [*]isize = @ptrCast([*]isize, @alignCast(@alignOf(isize), ref_ptr));
    const refcount_slot = @ptrCast(*isize, after_refcount - 1);
    utils.increfC(refcount_slot, @intCast(isize, n));
}
// Releases one reference to the allocation backing this string.
// Small strings own no allocation and are left untouched.
// The decrement goes through getRefcountPtr() exactly once. A seamless
// slice's str_bytes may point into the middle of another string's bytes, so
// decrementing via str_bytes would target the wrong address, and issuing
// both calls would release two references instead of one.
pub fn decref(self: RocStr) void {
    if (!self.isSmallStr()) {
        utils.decref(self.getRefcountPtr(), self.str_capacity, RocStr.alignment);
    }
}
@ -129,17 +165,18 @@ pub const RocStr = extern struct {
// It's faster to compare pointer-sized words rather than bytes, as far as possible
// The bytes are always pointer-size aligned due to the refcount
const self_words = @ptrCast([*]const usize, @alignCast(@alignOf(usize), self_bytes));
const other_words = @ptrCast([*]const usize, @alignCast(@alignOf(usize), other_bytes));
var w: usize = 0;
while (w < self_len / @sizeOf(usize)) : (w += 1) {
if (self_words[w] != other_words[w]) {
return false;
}
}
// const self_words = @ptrCast([*]const usize, @alignCast(@alignOf(usize), self_bytes));
// const other_words = @ptrCast([*]const usize, @alignCast(@alignOf(usize), other_bytes));
// var w: usize = 0;
// while (w < self_len / @sizeOf(usize)) : (w += 1) {
// if (self_words[w] != other_words[w]) {
// return false;
// }
// }
// Compare the leftover bytes
var b = w * @sizeOf(usize);
// var b = w * @sizeOf(usize);
var b: usize = 0;
while (b < self_len) : (b += 1) {
if (self_bytes[b] != other_bytes[b]) {
return false;
@ -238,7 +275,7 @@ pub const RocStr = extern struct {
if (self.isSmallStr()) {
return self.asArray()[@sizeOf(RocStr) - 1] ^ 0b1000_0000;
} else {
return self.str_len;
return self.str_len & (~SEAMLESS_SLICE_BIT);
}
}
@ -246,13 +283,15 @@ pub const RocStr = extern struct {
if (self.isSmallStr()) {
self.asU8ptrMut()[@sizeOf(RocStr) - 1] = @intCast(u8, length) | 0b1000_0000;
} else {
self.str_len = length;
self.str_len = length | (SEAMLESS_SLICE_BIT & self.str_len);
}
}
pub fn getCapacity(self: RocStr) usize {
if (self.isSmallStr()) {
return SMALL_STR_MAX_LENGTH;
} else if (self.isSeamlessSlice()) {
return self.str_len & (~SEAMLESS_SLICE_BIT);
} else {
return self.str_capacity;
}
@ -299,6 +338,9 @@ pub const RocStr = extern struct {
// then the next byte is off the end of the struct;
// in that case, we are also not null-terminated!
return length != 0 and length != longest_small_str;
} else if (self.isSeamlessSlice()) {
// Seamless slices can not use the character past the end even if it is null.
return false;
} else {
// This is a big string, and it's not empty, so we can safely
// dereference the pointer.
@ -334,7 +376,7 @@ pub const RocStr = extern struct {
}
fn refcountMachine(self: RocStr) usize {
if (self.getCapacity() == 0 or self.isSmallStr()) {
if ((self.getCapacity() == 0 and !self.isSeamlessSlice()) or self.isSmallStr()) {
return utils.REFCOUNT_ONE;
}
@ -811,13 +853,34 @@ pub fn strSplit(string: RocStr, delimiter: RocStr) callconv(.C) RocList {
return list;
}
// Function type for building one split segment from the source bytes, a
// start offset, a length, and a refcount-pointer word. Both initFromSmallStr
// and initFromBigStr have this signature.
const Init = fn (bytes: [*]u8, offset: usize, len: usize, ref_ptr: usize) RocStr;
// Builds a segment by handing the bytes starting at `offset` to RocStr.init.
// The refcount-pointer argument is ignored.
fn initFromSmallStr(bytes: [*]u8, offset: usize, len: usize, _: usize) RocStr {
    const segment_start = bytes + offset;
    return RocStr.init(segment_start, len);
}
// Builds a segment as a seamless slice that points into the source bytes
// rather than copying them into a new string.
// `ref_ptr` is stored as-is in str_capacity, so the caller must pass it
// already shifted into the form a seamless slice stores.
fn initFromBigStr(bytes: [*]u8, offset: usize, len: usize, ref_ptr: usize) RocStr {
    const segment = RocStr{
        .str_bytes = bytes + offset,
        // The marker bit on the length is what identifies a seamless slice.
        .str_len = len | SEAMLESS_SLICE_BIT,
        .str_capacity = ref_ptr,
    };
    return segment;
}
fn strSplitHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
var ret_array_index: usize = 0;
var slice_start_index: usize = 0;
var str_index: usize = 0;
const str_bytes = string.asU8ptr();
var mut_str = string;
const str_bytes = mut_str.asU8ptrMut();
const str_len = string.len();
const ref_ptr = @ptrToInt(string.getRefcountPtr()) >> 1;
const init_fn = if (string.isSmallStr())
initFromSmallStr
else
initFromBigStr;
const delimiter_bytes_ptrs = delimiter.asU8ptr();
const delimiter_len = delimiter.len();
@ -849,7 +912,7 @@ fn strSplitHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
if (matches_delimiter) {
const segment_len: usize = str_index - slice_start_index;
array[ret_array_index] = RocStr.init(str_bytes + slice_start_index, segment_len);
array[ret_array_index] = init_fn(str_bytes, slice_start_index, segment_len, ref_ptr);
slice_start_index = str_index + delimiter_len;
ret_array_index += 1;
str_index += delimiter_len;
@ -859,7 +922,12 @@ fn strSplitHelp(array: [*]RocStr, string: RocStr, delimiter: RocStr) void {
}
}
array[ret_array_index] = RocStr.init(str_bytes + slice_start_index, str_len - slice_start_index);
array[ret_array_index] = init_fn(str_bytes, slice_start_index, str_len - slice_start_index, ref_ptr);
if (!string.isSmallStr()) {
// Correct refcount for all of the splits made.
mut_str.incref(ret_array_index + 1);
}
}
test "strSplitHelp: empty delimiter" {
@ -2800,3 +2868,9 @@ pub fn strCloneTo(
return extra_offset + slice.len;
}
}
// C calling convention entry point that forwards to RocStr.getRefcountPtr
// and returns its result unchanged.
pub fn strRefcountPtr(string: RocStr) callconv(.C) ?[*]u8 {
    const ref_ptr = string.getRefcountPtr();
    return ref_ptr;
}

View file

@ -333,6 +333,7 @@ pub const STR_GET_SCALAR_UNSAFE: &str = "roc_builtins.str.get_scalar_unsafe";
pub const STR_CLONE_TO: &str = "roc_builtins.str.clone_to";
pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
pub const STR_GRAPHEMES: &str = "roc_builtins.str.graphemes";
// Symbol name used to call the builtin that returns a string's refcount pointer.
// NOTE(review): presumably this must match the name produced on the Zig side by
// `exportStrFn(str.strRefcountPtr, "refcount_ptr")` — confirm the prefix.
pub const STR_REFCOUNT_PTR: &str = "roc_builtins.str.refcount_ptr";
pub const LIST_MAP: &str = "roc_builtins.list.map";
pub const LIST_MAP2: &str = "roc_builtins.list.map2";

View file

@ -63,3 +63,19 @@ pub(crate) fn str_equal<'a, 'ctx, 'env>(
bitcode::STR_EQUAL,
)
}
// Returns the pointer produced by the `STR_REFCOUNT_PTR` bitcode builtin for
// the given string value.
// The pointer is just after the refcount, so regular strings and seamless
// slices can share the same code path afterwards.
pub(crate) fn str_refcount_ptr<'a, 'ctx, 'env>(
    env: &Env<'a, 'ctx, 'env>,
    value: BasicValueEnum<'ctx>,
) -> PointerValue<'ctx> {
    let args = [value];
    let result = call_str_bitcode_fn(
        env,
        &args,
        &[],
        BitcodeReturns::Basic,
        bitcode::STR_REFCOUNT_PTR,
    );

    result.into_pointer_value()
}

View file

@ -2,18 +2,19 @@ use crate::debug_info_init;
use crate::llvm::bitcode::call_void_bitcode_fn;
use crate::llvm::build::BuilderExt;
use crate::llvm::build::{
add_func, cast_basic_basic, get_tag_id, tag_pointer_clear_tag_id, use_roc_value, Env,
FAST_CALL_CONV,
add_func, cast_basic_basic, create_entry_block_alloca, get_tag_id, tag_pointer_clear_tag_id,
use_roc_value, Env, FAST_CALL_CONV,
};
use crate::llvm::build_list::{
incrementing_elem_loop, list_capacity_or_ref_ptr, list_refcount_ptr, load_list,
};
use crate::llvm::build_str::str_refcount_ptr;
use crate::llvm::convert::{basic_type_from_layout, zig_str_type, RocUnion};
use bumpalo::collections::Vec;
use inkwell::basic_block::BasicBlock;
use inkwell::module::Linkage;
use inkwell::types::{AnyTypeEnum, BasicMetadataTypeEnum, BasicType, BasicTypeEnum};
use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue, StructValue};
use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue};
use inkwell::{AddressSpace, IntPredicate};
use roc_module::symbol::Interns;
use roc_module::symbol::Symbol;
@ -75,16 +76,6 @@ impl<'ctx> PointerToRefcount<'ctx> {
}
}
// Builds a refcount pointer from a list wrapper struct: reads the data
// pointer stored in the `Builtin::WRAPPER_PTR` field and passes it to
// `from_ptr_to_data`.
fn from_list_wrapper(env: &Env<'_, 'ctx, '_>, list_wrapper: StructValue<'ctx>) -> Self {
    let extracted =
        env.builder
            .build_extract_value(list_wrapper, Builtin::WRAPPER_PTR, "read_list_ptr");
    let data_ptr = extracted.unwrap().into_pointer_value();

    Self::from_ptr_to_data(env, data_ptr)
}
pub fn is_1<'a, 'env>(&self, env: &Env<'a, 'ctx, 'env>) -> IntValue<'ctx> {
let current = self.get_refcount(env);
let one = match env.target_info.ptr_width() {
@ -815,9 +806,9 @@ fn modify_refcount_str_help<'a, 'ctx, 'env>(
let parent = fn_val;
let str_type = zig_str_type(env);
let arg_val =
if Layout::Builtin(Builtin::Str).is_passed_by_reference(layout_interner, env.target_info) {
let str_type = zig_str_type(env);
env.builder
.new_build_load(str_type, arg_val.into_pointer_value(), "load_str_to_stack")
} else {
@ -848,7 +839,11 @@ fn modify_refcount_str_help<'a, 'ctx, 'env>(
builder.build_conditional_branch(is_big_and_non_empty, modification_block, cont_block);
builder.position_at_end(modification_block);
let refcount_ptr = PointerToRefcount::from_list_wrapper(env, str_wrapper);
let str_alloca = create_entry_block_alloca(env, parent, str_type.into(), "str_alloca");
env.builder.build_store(str_alloca, str_wrapper);
let refcount_ptr =
PointerToRefcount::from_ptr_to_data(env, str_refcount_ptr(env, str_alloca.into()));
let call_mode = mode_to_call_mode(fn_val, mode);
refcount_ptr.modify(call_mode, layout, env, layout_interner);