const std = @import("std");
const unicode = std.unicode;
const testing = std.testing;
const expectEqual = testing.expectEqual;
const expect = testing.expect;

extern fn malloc(size: usize) ?*u8;
extern fn free([*]u8) void;

// A string with two in-memory representations that share the same
// (pointer, length) layout:
//
// * "big": `str_bytes` points at the content and `str_len` is its length.
// * "small": the content is stored inline in the bytes of the struct
//   itself. The final byte of the struct holds the length with its
//   highest bit set, which makes `str_len` negative when reinterpreted as
//   an `isize`; that is how `is_small_str` tells the two apart.
const RocStr = extern struct {
    str_bytes: ?[*]u8,
    str_len: usize,

    // The empty string: a null pointer and a length of zero.
    pub fn empty() RocStr {
        return RocStr{
            .str_len = 0,
            .str_bytes = null,
        };
    }

    // Build a RocStr holding a copy of the first `length` bytes at `bytes`.
    //
    // Contents shorter than `@sizeOf(RocStr)` are stored inline (small
    // string). Anything longer is copied into a buffer obtained from
    // `malloc`. In both cases the input bytes are copied, so the caller
    // keeps ownership of `bytes`.
    pub fn init(bytes: [*]const u8, length: usize) RocStr {
        const roc_str_size = @sizeOf(RocStr);

        if (length < roc_str_size) {
            // `empty()` sets both fields to zero, so every byte of the
            // struct starts out zeroed.
            var ret_small_str = RocStr.empty();
            const target: [*]u8 = @ptrCast([*]u8, &ret_small_str);

            @memcpy(target, bytes, length);

            // The final byte is the length with the highest bit set.
            target[roc_str_size - 1] = @truncate(u8, length) ^ 0b1000_0000;

            return ret_small_str;
        } else {
            // NOTE(review): a null result from malloc is not handled here.
            var new_bytes: [*]u8 = @ptrCast([*]u8, malloc(length));

            @memcpy(new_bytes, bytes, length);

            return RocStr{
                .str_bytes = new_bytes,
                .str_len = length,
            };
        }
    }

    // Release the heap buffer of a big string. Small strings own no heap
    // memory, so this is a no-op for them.
    //
    // NOTE(review): strings produced by `allocate_str` point past a header
    // at the start of their allocation, so passing that pointer to `free`
    // looks wrong - confirm how such strings are meant to be released.
    pub fn drop(self: RocStr) void {
        if (!self.is_small_str()) {
            const str_bytes: [*]u8 = self.str_bytes orelse unreachable;

            free(str_bytes);
        }
    }

    // Pointer to the first content byte: the struct's own bytes for a small
    // string, otherwise `str_bytes` (which may be null, e.g. for `empty()`).
    fn content_ptr(self: *const RocStr) ?[*]const u8 {
        if (self.is_small_str()) {
            return @ptrCast([*]const u8, self);
        }

        return self.str_bytes;
    }

    // Compare two strings by content, regardless of representation.
    pub fn eq(self: RocStr, other: RocStr) bool {
        const self_bytes_ptr: ?[*]const u8 = self.str_bytes;
        const other_bytes_ptr: ?[*]const u8 = other.str_bytes;

        // If they are byte-for-byte equal, they're definitely equal!
        if (self_bytes_ptr == other_bytes_ptr and self.str_len == other.str_len) {
            return true;
        }

        const self_len = self.len();
        const other_len = other.len();

        // If their lengths are different, they're definitely unequal.
        if (self_len != other_len) {
            return false;
        }

        // Two empty strings are equal even when one is `empty()` (null
        // pointer) and the other is an inline string of length zero.
        // Returning here also means no pointer is unwrapped when there is
        // nothing to compare.
        if (self_len == 0) {
            return true;
        }

        // Both strings are non-empty from here on, so their content
        // pointers are expected to be non-null.
        const self_bytes: [*]const u8 = RocStr.content_ptr(&self) orelse unreachable;
        const other_bytes: [*]const u8 = RocStr.content_ptr(&other) orelse unreachable;

        // Use the decoded length as the bound. The raw `str_len` field has
        // its highest bit set for small strings and must not be used here.
        var index: usize = 0;
        while (index < self_len) : (index += 1) {
            if (self_bytes[index] != other_bytes[index]) {
                return false;
            }
        }

        return true;
    }

    // A small string has the highest bit of `str_len` set, which makes the
    // field negative when reinterpreted as a signed integer.
    pub fn is_small_str(self: RocStr) bool {
        return @bitCast(isize, self.str_len) < 0;
    }

    // Length of the content in bytes, for either representation.
    pub fn len(self: RocStr) usize {
        const bytes: [*]const u8 = @ptrCast([*]const u8, &self);
        const last_byte = bytes[@sizeOf(RocStr) - 1];
        const small_len = @as(usize, last_byte ^ 0b1000_0000);
        const big_len = self.str_len;

        // Since this conditional would be prone to branch misprediction,
        // make sure it will compile to a cmov.
        return if (self.is_small_str()) small_len else big_len;
    }

    pub fn is_empty(self: RocStr) bool {
        return self.len() == 0;
    }

    // Given a pointer to some bytes, write the first `length` bytes of this
    // RocStr's contents into it.
    //
    // One use for this function is writing into an `alloca` for a C string that
    // only needs to live long enough to be passed as an argument to
    // a C function - like the file path argument to `fopen`.
    //
    // For a small string the bytes are copied directly from `self`; for a
    // big string they are copied from the pointed-to bytes. If there is no
    // content pointer at all (as for `empty()`), nothing is copied.
    pub fn memcpy(self: RocStr, dest: [*]u8, length: usize) void {
        const src: [*]const u8 = RocStr.content_ptr(&self) orelse return;

        @memcpy(dest, src, length);
    }

    test "RocStr.eq: equal" {
        const str1_len = 3;
        var str1: [str1_len]u8 = "abc".*;
        const str1_ptr: [*]u8 = &str1;
        var roc_str1 = RocStr.init(str1_ptr, str1_len);

        const str2_len = 3;
        var str2: [str2_len]u8 = "abc".*;
        const str2_ptr: [*]u8 = &str2;
        var roc_str2 = RocStr.init(str2_ptr, str2_len);

        expect(roc_str1.eq(roc_str2));

        roc_str1.drop();
        roc_str2.drop();
    }

    test "RocStr.eq: not equal different length" {
        const str1_len = 4;
        var str1: [str1_len]u8 = "abcd".*;
        const str1_ptr: [*]u8 = &str1;
        var roc_str1 = RocStr.init(str1_ptr, str1_len);

        const str2_len = 3;
        var str2: [str2_len]u8 = "abc".*;
        const str2_ptr: [*]u8 = &str2;
        var roc_str2 = RocStr.init(str2_ptr, str2_len);

        expect(!roc_str1.eq(roc_str2));

        roc_str1.drop();
        roc_str2.drop();
    }

    test "RocStr.eq: not equal same length" {
        const str1_len = 3;
        var str1: [str1_len]u8 = "acb".*;
        const str1_ptr: [*]u8 = &str1;
        var roc_str1 = RocStr.init(str1_ptr, str1_len);

        const str2_len = 3;
        var str2: [str2_len]u8 = "abc".*;
        const str2_ptr: [*]u8 = &str2;
        var roc_str2 = RocStr.init(str2_ptr, str2_len);

        expect(!roc_str1.eq(roc_str2));

        roc_str1.drop();
        roc_str2.drop();
    }

    test "RocStr.eq: empty representations are equal" {
        var bytes: [1]u8 = "a".*;
        const bytes_ptr: [*]u8 = &bytes;

        // An inline string of length zero versus the null-pointer form.
        var small_empty = RocStr.init(bytes_ptr, 0);
        var null_empty = RocStr.empty();

        expect(small_empty.eq(null_empty));
        expect(null_empty.eq(small_empty));
    }
};

// Returns true when the `delimiter_len` bytes starting at
// `str_bytes[str_index]` equal the delimiter. The caller must ensure that
// `str_index + delimiter_len` does not exceed the length of the string.
fn delimiterMatchesAt(str_bytes: [*]const u8, str_index: usize, delimiter_bytes: [*]const u8, delimiter_len: usize) bool {
    var delimiter_index: usize = 0;

    while (delimiter_index < delimiter_len) : (delimiter_index += 1) {
        if (delimiter_bytes[delimiter_index] != str_bytes[str_index + delimiter_index]) {
            return false;
        }
    }

    return true;
}

// Str.split

// Split the string on every non-overlapping occurrence of the delimiter
// (scanning left to right) and write one RocStr per segment into `array`.
//
// `array` must have room for as many elements as `countSegments` returns
// for the same string and delimiter. `array_len` is currently not consulted.
//
// An empty delimiter, or one longer than the string, never matches; the
// whole string is then written as the single segment.
pub fn strSplitInPlace(array: [*]RocStr, array_len: usize, str_bytes: [*]const u8, str_len: usize, delimiter_bytes_ptrs: [*]const u8, delimiter_len: usize) callconv(.C) void {
    var ret_array_index: usize = 0;
    var slice_start_index: usize = 0;
    var str_index: usize = 0;

    if (delimiter_len > 0 and str_len >= delimiter_len) {
        // One past the last index at which a full delimiter still fits.
        const end_index: usize = str_len - delimiter_len + 1;

        while (str_index < end_index) {
            if (delimiterMatchesAt(str_bytes, str_index, delimiter_bytes_ptrs, delimiter_len)) {
                const segment_len: usize = str_index - slice_start_index;

                array[ret_array_index] = RocStr.init(str_bytes + slice_start_index, segment_len);
                slice_start_index = str_index + delimiter_len;
                ret_array_index += 1;
                str_index += delimiter_len;
            } else {
                str_index += 1;
            }
        }
    }

    // Whatever follows the last delimiter (possibly nothing) is the final segment.
    array[ret_array_index] = RocStr.init(str_bytes + slice_start_index, str_len - slice_start_index);
}

test "strSplitInPlace: no delimiter" {
    // Str.split "abc" "!" == [ "abc" ]
    var str: [3]u8 = "abc".*;
    const str_ptr: [*]const u8 = &str;

    var delimiter: [1]u8 = "!".*;
    const delimiter_ptr: [*]const u8 = &delimiter;

    var array: [1]RocStr = undefined;
    const array_ptr: [*]RocStr = &array;

    strSplitInPlace(array_ptr, 1, str_ptr, 3, delimiter_ptr, 1);

    var expected = [1]RocStr{
        RocStr.init(str_ptr, 3),
    };

    expectEqual(array.len, expected.len);
    expect(array[0].eq(expected[0]));

    for (array) |roc_str| {
        roc_str.drop();
    }

    for (expected) |roc_str| {
        roc_str.drop();
    }
}

test "strSplitInPlace: empty end" {
    const str_len: usize = 50;
    var str: [str_len]u8 = "1---- ---- ---- ---- ----2---- ---- ---- ---- ----".*;
    const str_ptr: [*]u8 = &str;

    const delimiter_len = 24;
    const delimiter: [delimiter_len:0]u8 = "---- ---- ---- ---- ----".*;
    const delimiter_ptr: [*]const u8 = &delimiter;

    const array_len: usize = 3;
    var array: [array_len]RocStr = [_]RocStr{
        undefined,
        undefined,
        undefined,
    };
    const array_ptr: [*]RocStr = &array;

    strSplitInPlace(array_ptr, array_len, str_ptr, str_len, delimiter_ptr, delimiter_len);

    const first_expected_str_len: usize = 1;
    var first_expected_str: [first_expected_str_len]u8 = "1".*;
    const first_expected_str_ptr: [*]u8 = &first_expected_str;
    var first_expected_roc_str = RocStr.init(first_expected_str_ptr, first_expected_str_len);

    const second_expected_str_len: usize = 1;
    var second_expected_str: [second_expected_str_len]u8 = "2".*;
    const second_expected_str_ptr: [*]u8 = &second_expected_str;
    var second_expected_roc_str = RocStr.init(second_expected_str_ptr, second_expected_str_len);

    expectEqual(array.len, 3);
    expectEqual(@as(usize, 1), array[0].len());
    expect(array[0].eq(first_expected_roc_str));
    expect(array[1].eq(second_expected_roc_str));
    expectEqual(@as(usize, 0), array[2].len());
}

test "strSplitInPlace: delimiter on sides" {
    // Str.split "tttghittt" "ttt" == [ "", "ghi", "" ]
    const str_len: usize = 9;
    var str: [str_len]u8 = "tttghittt".*;
    const str_ptr: [*]u8 = &str;

    const delimiter_len = 3;
    var delimiter: [delimiter_len]u8 = "ttt".*;
    const delimiter_ptr: [*]u8 = &delimiter;

    const array_len: usize = 3;
    var array: [array_len]RocStr = [_]RocStr{
        undefined,
        undefined,
        undefined,
    };
    const array_ptr: [*]RocStr = &array;

    strSplitInPlace(array_ptr, array_len, str_ptr, str_len, delimiter_ptr, delimiter_len);

    const expected_str_len: usize = 3;
    var expected_str: [expected_str_len]u8 = "ghi".*;
    const expected_str_ptr: [*]const u8 = &expected_str;
    var expected_roc_str = RocStr.init(expected_str_ptr, expected_str_len);

    expectEqual(array.len, 3);
    expectEqual(@as(usize, 0), array[0].len());
    expect(array[1].eq(expected_roc_str));
    expectEqual(@as(usize, 0), array[2].len());
}

test "strSplitInPlace: three pieces" {
    // Str.split "a!b!c" "!" == [ "a", "b", "c" ]
    const str_len: usize = 5;
    var str: [str_len]u8 = "a!b!c".*;
    const str_ptr: [*]u8 = &str;

    const delimiter_len = 1;
    var delimiter: [delimiter_len]u8 = "!".*;
    const delimiter_ptr: [*]u8 = &delimiter;

    const array_len: usize = 3;
    var array: [array_len]RocStr = undefined;
    const array_ptr: [*]RocStr = &array;

    strSplitInPlace(array_ptr, array_len, str_ptr, str_len, delimiter_ptr, delimiter_len);

    var a: [1]u8 = "a".*;
    const a_ptr: [*]u8 = &a;

    var b: [1]u8 = "b".*;
    const b_ptr: [*]u8 = &b;

    var c: [1]u8 = "c".*;
    const c_ptr: [*]u8 = &c;

    // The expected values use the pointer representation on purpose, so
    // this also exercises comparing an inline string with a pointed-to one.
    var expected_array = [array_len]RocStr{
        RocStr{
            .str_bytes = a_ptr,
            .str_len = 1,
        },
        RocStr{
            .str_bytes = b_ptr,
            .str_len = 1,
        },
        RocStr{
            .str_bytes = c_ptr,
            .str_len = 1,
        },
    };

    expectEqual(expected_array.len, array.len);
    expect(array[0].eq(expected_array[0]));
    expect(array[1].eq(expected_array[1]));
    expect(array[2].eq(expected_array[2]));
}

test "strSplitInPlace: string equals delimiter" {
    // Str.split "abc" "abc" == [ "", "" ]
    var str: [3]u8 = "abc".*;
    const str_ptr: [*]const u8 = &str;

    var array: [2]RocStr = undefined;
    const array_ptr: [*]RocStr = &array;

    strSplitInPlace(array_ptr, 2, str_ptr, 3, str_ptr, 3);

    expectEqual(@as(usize, 0), array[0].len());
    expectEqual(@as(usize, 0), array[1].len());
}

// This is used for `Str.split : Str, Str -> Array Str`
// It is used to count how many segments the input string
// needs to be broken into, so that we can allocate an array
// of that size. It always returns at least 1.
//
// The scan is the same one `strSplitInPlace` performs (left to right,
// skipping past each matched delimiter), so the count always equals the
// number of elements `strSplitInPlace` writes.
pub fn countSegments(str_bytes: [*]const u8, str_len: usize, delimiter_bytes_ptrs: [*]const u8, delimiter_len: usize) callconv(.C) usize {
    var count: usize = 1;

    if (delimiter_len > 0 and str_len >= delimiter_len) {
        var str_index: usize = 0;
        // One past the last index at which a full delimiter still fits.
        const end_cond: usize = str_len - delimiter_len + 1;

        while (str_index < end_cond) {
            if (delimiterMatchesAt(str_bytes, str_index, delimiter_bytes_ptrs, delimiter_len)) {
                count += 1;
                // Matches must not overlap, so resume after this delimiter.
                str_index += delimiter_len;
            } else {
                str_index += 1;
            }
        }
    }

    return count;
}

test "countSegments: long delimiter" {
    // Str.split "str" "delimiter" == [ "str" ]
    // 1 segment
    const str_len: usize = 3;
    var str: [str_len]u8 = "str".*;
    const str_ptr: [*]u8 = &str;

    const delimiter_len = 9;
    var delimiter: [delimiter_len]u8 = "delimiter".*;
    const delimiter_ptr: [*]u8 = &delimiter;

    const segments_count = countSegments(str_ptr, str_len, delimiter_ptr, delimiter_len);

    expectEqual(segments_count, 1);
}

test "countSegments: delimiter at start" {
    // Str.split "hello there" "hello" == [ "", " there" ]
    // 2 segments
    const str_len: usize = 11;
    var str: [str_len]u8 = "hello there".*;
    const str_ptr: [*]u8 = &str;

    const delimiter_len = 5;
    var delimiter: [delimiter_len]u8 = "hello".*;
    const delimiter_ptr: [*]u8 = &delimiter;

    const segments_count = countSegments(str_ptr, str_len, delimiter_ptr, delimiter_len);

    expectEqual(segments_count, 2);
}

test "countSegments: delimiter interspered" {
    // Str.split "a!b!c" "!" == [ "a", "b", "c" ]
    // 3 segments
    const str_len: usize = 5;
    var str: [str_len]u8 = "a!b!c".*;
    const str_ptr: [*]u8 = &str;

    const delimiter_len = 1;
    var delimiter: [delimiter_len]u8 = "!".*;
    const delimiter_ptr: [*]u8 = &delimiter;

    const segments_count = countSegments(str_ptr, str_len, delimiter_ptr, delimiter_len);

    expectEqual(segments_count, 3);
}

test "countSegments: overlapping delimiter" {
    // Str.split "aaa" "aa" == [ "", "a" ]
    // 2 segments: the match at index 1 overlaps the one at index 0
    var str: [3]u8 = "aaa".*;
    const str_ptr: [*]u8 = &str;

    var delimiter: [2]u8 = "aa".*;
    const delimiter_ptr: [*]u8 = &delimiter;

    const segments_count = countSegments(str_ptr, 3, delimiter_ptr, 2);

    expectEqual(segments_count, 2);
}

test "countSegments: empty delimiter" {
    // An empty delimiter never matches, so there is a single segment.
    var str: [3]u8 = "abc".*;
    const str_ptr: [*]u8 = &str;

    const segments_count = countSegments(str_ptr, 3, str_ptr, 0);

    expectEqual(segments_count, 1);
}

// Str.countGraphemeClusters
const grapheme = @import("helpers/grapheme.zig");

// Count the grapheme clusters in the given bytes by walking the code
// points and counting every boundary reported by `grapheme.isGraphemeBreak`.
//
// The bytes must be valid UTF-8; invalid input reaches `unreachable`.
pub fn countGraphemeClusters(bytes_ptr: [*]const u8, bytes_len: usize) callconv(.C) usize {
    const bytes = bytes_ptr[0..bytes_len];
    var iter = (unicode.Utf8View.init(bytes) catch unreachable).iterator();

    var count: usize = 0;
    var grapheme_break_state: ?grapheme.BoundClass = null;
    var grapheme_break_state_ptr = &grapheme_break_state;
    var opt_last_codepoint: ?u21 = null;

    while (iter.nextCodepoint()) |cur_codepoint| {
        if (opt_last_codepoint) |last_codepoint| {
            const did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, grapheme_break_state_ptr);

            if (did_break) {
                count += 1;
                grapheme_break_state = null;
            }
        }

        opt_last_codepoint = cur_codepoint;
    }

    // The loop counts boundaries between clusters, so a non-empty string
    // has one more cluster than it has boundaries.
    if (bytes_len != 0) {
        count += 1;
    }

    return count;
}

test "countGraphemeClusters: empty string" {
    var bytes_arr = "".*;
    var bytes_len = bytes_arr.len;
    var bytes_ptr: [*]u8 = &bytes_arr;
    var count = countGraphemeClusters(bytes_ptr, bytes_len);
    expectEqual(count, 0);
}

test "countGraphemeClusters: ascii characters" {
    var bytes_arr = "abcd".*;
    var bytes_len = bytes_arr.len;
    var bytes_ptr: [*]u8 = &bytes_arr;
    var count = countGraphemeClusters(bytes_ptr, bytes_len);
    expectEqual(count, 4);
}

test "countGraphemeClusters: utf8 characters" {
    var bytes_arr = "ãxā".*;
    var bytes_len = bytes_arr.len;
    var bytes_ptr: [*]u8 = &bytes_arr;
    var count = countGraphemeClusters(bytes_ptr, bytes_len);
    expectEqual(count, 3);
}

test "countGraphemeClusters: emojis" {
    var bytes_arr = "🤔🤔🤔".*;
    var bytes_len = bytes_arr.len;
    var bytes_ptr: [*]u8 = &bytes_arr;
    var count = countGraphemeClusters(bytes_ptr, bytes_len);
    expectEqual(count, 3);
}

test "countGraphemeClusters: emojis and ut8 characters" {
    var bytes_arr = "🤔å🤔¥🤔ç".*;
    var bytes_len = bytes_arr.len;
    var bytes_ptr: [*]u8 = &bytes_arr;
    var count = countGraphemeClusters(bytes_ptr, bytes_len);
    expectEqual(count, 6);
}

test "countGraphemeClusters: emojis, ut8, and ascii characters" {
    var bytes_arr = "6🤔å🤔e¥🤔çpp".*;
    var bytes_len = bytes_arr.len;
    var bytes_ptr: [*]u8 = &bytes_arr;
    var count = countGraphemeClusters(bytes_ptr, bytes_len);
    expectEqual(count, 10);
}

// Str.startsWith

// Returns true when the first `prefix_len` bytes of the string equal the
// prefix. A prefix longer than the string never matches.
pub fn startsWith(bytes_ptr: [*]const u8, bytes_len: usize, prefix_ptr: [*]const u8, prefix_len: usize) callconv(.C) bool {
    if (prefix_len > bytes_len) {
        return false;
    }

    // we won't exceed bytes_len due to the previous check
    var i: usize = 0;
    while (i < prefix_len) : (i += 1) {
        if (bytes_ptr[i] != prefix_ptr[i]) {
            return false;
        }
    }

    return true;
}

test "startsWith: 123456789123456789 starts with 123456789123456789" {
    const str_len: usize = 18;
    var str: [str_len]u8 = "123456789123456789".*;
    const str_ptr: [*]u8 = &str;

    expect(startsWith(str_ptr, str_len, str_ptr, str_len));
}

test "startsWith: 12345678912345678910 starts with 123456789123456789" {
    const str_len: usize = 20;
    var str: [str_len]u8 = "12345678912345678910".*;
    const str_ptr: [*]u8 = &str;

    const prefix_len: usize = 18;
    var prefix: [prefix_len]u8 = "123456789123456789".*;
    const prefix_ptr: [*]u8 = &prefix;

    expect(startsWith(str_ptr, str_len, prefix_ptr, prefix_len));
}

// Str.concat

test "RocStr.concat: small concat small" {
    const str1_len = 3;
    var str1: [str1_len]u8 = "foo".*;
    const str1_ptr: [*]u8 = &str1;
    var roc_str1 = RocStr.init(str1_ptr, str1_len);

    const str2_len = 3;
    var str2: [str2_len]u8 = "abc".*;
    const str2_ptr: [*]u8 = &str2;
    var roc_str2 = RocStr.init(str2_ptr, str2_len);

    const str3_len = 6;
    var str3: [str3_len]u8 = "fooabc".*;
    const str3_ptr: [*]u8 = &str3;
    var roc_str3 = RocStr.init(str3_ptr, str3_len);

    const result = strConcat(8, InPlace.Clone, roc_str1, roc_str2);

    expect(roc_str3.eq(result));

    roc_str1.drop();
    roc_str2.drop();
    roc_str3.drop();
    result.drop();
}

// Concatenate two strings. `ptr_size` selects the integer type used for
// the header of a heap-allocated result; only 4 and 8 are supported.
pub fn strConcat(ptr_size: u32, result_in_place: InPlace, arg1: RocStr, arg2: RocStr) callconv(.C) RocStr {
    return switch (ptr_size) {
        4 => strConcatHelp(i32, result_in_place, arg1, arg2),
        8 => strConcatHelp(i64, result_in_place, arg1, arg2),
        else => unreachable,
    };
}

// Concatenation for a given header integer type `T`.
//
// If either argument is empty the other one is returned (cloned when it
// lives on the heap). Otherwise the result is built inline when the
// combined length is below `2 * @sizeOf(T)`, and on the heap when not.
fn strConcatHelp(comptime T: type, result_in_place: InPlace, arg1: RocStr, arg2: RocStr) RocStr {
    if (arg1.is_empty()) {
        return cloneStr(T, result_in_place, arg2);
    }

    if (arg2.is_empty()) {
        return cloneStr(T, result_in_place, arg1);
    }

    const first_len = arg1.len();
    const second_len = arg2.len();
    const combined_length = first_len + second_len;
    const small_str_bytes = 2 * @sizeOf(T);

    // Both arguments are non-empty here, so both have content to copy.
    const first_bytes: [*]const u8 = RocStr.content_ptr(&arg1) orelse unreachable;
    const second_bytes: [*]const u8 = RocStr.content_ptr(&arg2) orelse unreachable;

    if (combined_length >= small_str_bytes) {
        const result = allocate_str(T, result_in_place, combined_length);
        const new_bytes: [*]u8 = result.str_bytes orelse unreachable;

        @memcpy(new_bytes, first_bytes, first_len);
        @memcpy(new_bytes + first_len, second_bytes, second_len);

        return result;
    }

    // if the result is small, then for sure arg1 and arg2 are also small
    var result = [_]u8{0} ** 16;
    const new_bytes: [*]u8 = &result;

    @memcpy(new_bytes, first_bytes, first_len);
    @memcpy(new_bytes + first_len, second_bytes, second_len);

    // The final byte of a small string is its length with the highest bit set.
    const mask: u8 = 0b1000_0000;
    const final_byte = @truncate(u8, combined_length) | mask;

    result[small_str_bytes - 1] = final_byte;

    return @bitCast(RocStr, result);
}

const InPlace = packed enum(u8) {
    InPlace,
    Clone,
};

// Return a string with the same contents as `str`. Small and empty
// strings are returned as they are; heap strings are copied into a fresh
// allocation.
fn cloneStr(comptime T: type, in_place: InPlace, str: RocStr) RocStr {
    if (str.is_small_str() or str.is_empty()) {
        // just return the bytes
        return str;
    }

    const new_str = allocate_str(T, in_place, str.str_len);

    // `str` is a non-empty heap string here, and `allocate_str` never
    // returns a null content pointer.
    const old_bytes: [*]const u8 = str.str_bytes orelse unreachable;
    const new_bytes: [*]u8 = new_str.str_bytes orelse unreachable;

    @memcpy(new_bytes, old_bytes, str.str_len);

    return new_str;
}

// Allocate room for a header of type `T` followed by `number_of_chars`
// content bytes, and return a RocStr pointing just past the header.
//
// The header is set to `number_of_chars` when allocating in place, and to
// the smallest value of `T` otherwise (presumably the initial reference
// count - TODO confirm).
fn allocate_str(comptime T: type, in_place: InPlace, number_of_chars: u64) RocStr {
    const length = @sizeOf(T) + number_of_chars;
    // NOTE(review): a null result from malloc is not handled here.
    var new_bytes: [*]T = @ptrCast([*]T, @alignCast(@alignOf(T), malloc(length)));

    if (in_place == InPlace.InPlace) {
        new_bytes[0] = @intCast(T, number_of_chars);
    } else {
        new_bytes[0] = std.math.minInt(T);
    }

    // The content starts right after the header, which is `@sizeOf(T)`
    // bytes wide. A fixed offset of 8 would run past the end of the
    // allocation when `T` is 4 bytes wide.
    var first_element = @ptrCast([*]align(@alignOf(T)) u8, new_bytes);
    first_element += @sizeOf(T);

    return RocStr{
        .str_bytes = first_element,
        .str_len = number_of_chars,
    };
}