diff --git a/crates/compiler/build/src/link.rs b/crates/compiler/build/src/link.rs
index 601a21bd56..385f82d11c 100644
--- a/crates/compiler/build/src/link.rs
+++ b/crates/compiler/build/src/link.rs
@@ -434,7 +434,12 @@ pub fn build_c_host_native(
             _ => {
                 command.args(&[
                     shared_lib_path.to_str().unwrap(),
-                    &bitcode::get_builtins_host_obj_path(),
+                    // NOTE(@bhansconnect): the builtins host object is deliberately
+                    // not linked here. With the addition of Str.graphemes, always
+                    // linking the built-ins led to a surgical linker bug for
+                    // optimized builds. Disabled until it is needed for dev
+                    // builds.
+                    // &bitcode::get_builtins_host_obj_path(),
                     "-fPIE",
                     "-pie",
                     "-lm",
diff --git a/crates/compiler/builtins/bitcode/src/main.zig b/crates/compiler/builtins/bitcode/src/main.zig
index 22a82d328a..61a3065e17 100644
--- a/crates/compiler/builtins/bitcode/src/main.zig
+++ b/crates/compiler/builtins/bitcode/src/main.zig
@@ -145,7 +145,7 @@ comptime {
     exportStrFn(str.strTrimRight, "trim_right");
     exportStrFn(str.strCloneTo, "clone_to");
     exportStrFn(str.withCapacity, "with_capacity");
-    exportStrFn(str.strGraphemes, "str_graphemes");
+    exportStrFn(str.strGraphemes, "graphemes");
 
     inline for (INTEGERS) |T| {
         str.exportFromInt(T, ROC_BUILTINS ++ "." ++ STR ++ ".from_int.");
diff --git a/crates/compiler/builtins/bitcode/src/str.zig b/crates/compiler/builtins/bitcode/src/str.zig
index de77e65cf7..8bf25b72b3 100644
--- a/crates/compiler/builtins/bitcode/src/str.zig
+++ b/crates/compiler/builtins/bitcode/src/str.zig
@@ -1248,96 +1248,80 @@ pub fn countGraphemeClusters(string: RocStr) callconv(.C) usize {
     return count;
 }
 
-test "countGraphemeClusters: empty string" {
-    const count = countGraphemeClusters(RocStr.empty());
-    try expectEqual(count, 0);
-}
-
-test "countGraphemeClusters: ascii characters" {
-    const bytes_arr = "abcd";
-    const bytes_len = bytes_arr.len;
-    const str = RocStr.init(bytes_arr, bytes_len);
-    defer str.deinit();
-
-    const count = countGraphemeClusters(str);
-    try expectEqual(count, 4);
-}
-
-test "countGraphemeClusters: utf8 characters" {
-    const bytes_arr = "ãxā";
-    const bytes_len = bytes_arr.len;
-    const str = RocStr.init(bytes_arr, bytes_len);
-    defer str.deinit();
-
-    const count = countGraphemeClusters(str);
-    try expectEqual(count, 3);
-}
-
-test "countGraphemeClusters: emojis" {
-    const bytes_arr = "🤔🤔🤔";
-    const bytes_len = bytes_arr.len;
-    const str = RocStr.init(bytes_arr, bytes_len);
-    defer str.deinit();
-
-    const count = countGraphemeClusters(str);
-    try expectEqual(count, 3);
-}
-
-test "countGraphemeClusters: emojis and ut8 characters" {
-    const bytes_arr = "🤔å🤔¥🤔ç";
-    const bytes_len = bytes_arr.len;
-    const str = RocStr.init(bytes_arr, bytes_len);
-    defer str.deinit();
-
-    const count = countGraphemeClusters(str);
-    try expectEqual(count, 6);
-}
-
-test "countGraphemeClusters: emojis, ut8, and ascii characters" {
-    const bytes_arr = "6🤔å🤔e¥🤔çpp";
-    const bytes_len = bytes_arr.len;
-    const str = RocStr.init(bytes_arr, bytes_len);
-    defer str.deinit();
-
-    const count = countGraphemeClusters(str);
-    try expectEqual(count, 10);
-}
-
 // Str.graphemes
-pub fn strGraphemes(string: RocStr) callconv(.C) RocList {
-    var list = RocList.allocate(@alignOf(RocStr), countGraphemeClusters(string), @sizeOf(RocStr));
-    const graphemes = @ptrCast([*]RocStr, @alignCast(@alignOf(RocStr), list.bytes));
-
-    const bytes_ptr = string.asU8ptr();
-    var bytes = bytes_ptr[0..string.len()];
-    var iter = (unicode.Utf8View.init(bytes) catch unreachable).iterator();
-    var grapheme_break_state: ?grapheme.BoundClass = null;
-    var grapheme_break_state_ptr = &grapheme_break_state;
+// Splits `roc_str` into a list holding one RocStr per grapheme cluster.
+pub fn strGraphemes(roc_str: RocStr) callconv(.C) RocList {
+    var break_state: ?grapheme.BoundClass = null;
     var opt_last_codepoint: ?u21 = null;
+    var index: usize = 0;
+    // Byte length of the cluster accumulated so far. This is a usize because a
+    // single cluster (e.g. a base character followed by many combining marks)
+    // can be longer than 255 bytes, which would overflow a u8.
+    var last_codepoint_len: usize = 0;
 
-    var list_index: usize = 0;
-    var start_index: usize = 0;
-    var str_index: usize = 0;
-    var cur_codepoint_len: usize = 0;
+    var result = RocList.allocate(@alignOf(RocStr), countGraphemeClusters(roc_str), @sizeOf(RocStr));
+    const graphemes = result.elements(RocStr) orelse return result;
+    var slice = roc_str.asSlice();
+    var iter = (unicode.Utf8View.init(slice) catch unreachable).iterator();
 
     while (iter.nextCodepoint()) |cur_codepoint| {
-        cur_codepoint_len = unicode.utf8CodepointSequenceLength(cur_codepoint) catch unreachable;
+        const cur_codepoint_len = unicode.utf8CodepointSequenceLength(cur_codepoint) catch unreachable;
         if (opt_last_codepoint) |last_codepoint| {
-            var did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, grapheme_break_state_ptr);
+            var did_break = grapheme.isGraphemeBreak(last_codepoint, cur_codepoint, &break_state);
             if (did_break) {
-                graphemes[list_index] = RocStr.init(bytes_ptr + start_index, str_index - start_index + cur_codepoint_len);
-                list_index += 1;
-                start_index = str_index + cur_codepoint_len;
-                grapheme_break_state = null;
+                graphemes[index] = RocStr.fromSlice(slice[0..last_codepoint_len]);
+                slice = slice[last_codepoint_len..];
+                index += 1;
+                break_state = null;
+                last_codepoint_len = 0;
             }
-            str_index += cur_codepoint_len;
         }
+        last_codepoint_len += cur_codepoint_len;
         opt_last_codepoint = cur_codepoint;
     }
     // Append last grapheme
-    graphemes[list_index] = RocStr.init(bytes_ptr + start_index, str_index - start_index + cur_codepoint_len);
+    graphemes[index] = RocStr.fromSlice(slice);
+    return result;
+}
 
-    return list;
+// these test both countGraphemeClusters() and strGraphemes()
+fn graphemesTest(input: []const u8, expected: []const []const u8) !void {
+    const rocstr = RocStr.fromSlice(input);
+    defer rocstr.deinit();
+    const count = countGraphemeClusters(rocstr);
+    try expectEqual(expected.len, count);
+
+    const graphemes = strGraphemes(rocstr);
+    defer graphemes.deinit(u8);
+    if (input.len == 0) return; // empty string
+    const elems = graphemes.elements(RocStr) orelse unreachable;
+    for (expected) |g, i| {
+        try std.testing.expectEqualStrings(g, elems[i].asSlice());
+    }
+}
+
+test "graphemes: empty string" {
+    try graphemesTest("", &.{});
+}
+
+test "graphemes: ascii characters" {
+    try graphemesTest("abcd", &.{ "a", "b", "c", "d" });
+}
+
+test "graphemes: utf8 characters" {
+    try graphemesTest("ãxā", &.{ "ã", "x", "ā" });
+}
+
+test "graphemes: emojis" {
+    try graphemesTest("🤔🤔🤔", &.{ "🤔", "🤔", "🤔" });
+}
+
+test "graphemes: emojis and ut8 characters" {
+    try graphemesTest("🤔å🤔¥🤔ç", &.{ "🤔", "å", "🤔", "¥", "🤔", "ç" });
+}
+
+test "graphemes: emojis, ut8, and ascii characters" {
+    try graphemesTest("6🤔å🤔e¥🤔çpp", &.{ "6", "🤔", "å", "🤔", "e", "¥", "🤔", "ç", "p", "p" });
 }
 
 pub fn countUtf8Bytes(string: RocStr) callconv(.C) usize {
diff --git a/crates/compiler/builtins/src/bitcode.rs b/crates/compiler/builtins/src/bitcode.rs
index f5fdb7dc05..58145098f5 100644
--- a/crates/compiler/builtins/src/bitcode.rs
+++ b/crates/compiler/builtins/src/bitcode.rs
@@ -362,7 +362,7 @@ pub const STR_APPEND_SCALAR: &str = "roc_builtins.str.append_scalar";
 pub const STR_GET_SCALAR_UNSAFE: &str = "roc_builtins.str.get_scalar_unsafe";
 pub const STR_CLONE_TO: &str = "roc_builtins.str.clone_to";
 pub const STR_WITH_CAPACITY: &str = "roc_builtins.str.with_capacity";
-pub const STR_GRAPHEMES: &str = "roc_builtins.str.str_graphemes";
+pub const STR_GRAPHEMES: &str = "roc_builtins.str.graphemes";
 
 pub const LIST_MAP: &str = "roc_builtins.list.map";
 pub const LIST_MAP2: &str = "roc_builtins.list.map2";