diff --git a/cli/tests/repl_eval.rs b/cli/tests/repl_eval.rs index de25e58e20..5142c728d0 100644 --- a/cli/tests/repl_eval.rs +++ b/cli/tests/repl_eval.rs @@ -110,6 +110,14 @@ mod repl_eval { ); } + #[test] + fn str_count_graphemes() { + expect_success( + "Str.countGraphemes \"å🤔\"", + "2 : Int", + ); + } + #[test] fn literal_empty_list() { expect_success("[]", "[] : List *"); diff --git a/compiler/builtins/bitcode/src/grapheme.zig b/compiler/builtins/bitcode/src/helpers/grapheme.zig similarity index 100% rename from compiler/builtins/bitcode/src/grapheme.zig rename to compiler/builtins/bitcode/src/helpers/grapheme.zig diff --git a/compiler/builtins/bitcode/src/main.zig b/compiler/builtins/bitcode/src/main.zig index 2e447fe22c..73875f11a0 100644 --- a/compiler/builtins/bitcode/src/main.zig +++ b/compiler/builtins/bitcode/src/main.zig @@ -1,476 +1,33 @@ +const builtin = @import("builtin"); const std = @import("std"); -const math = std.math; -const unicode = std.unicode; const testing = std.testing; -const expectEqual = testing.expectEqual; -const expect = testing.expect; -const roc_builtins_namespace = "roc_builtins"; +// Num Module +const num = @import("num.zig"); +comptime { exportNumFn(num.atan, "atan"); } +comptime { exportNumFn(num.isFinite, "is_finite"); } +comptime { exportNumFn(num.powInt, "pow_int"); } +comptime { exportNumFn(num.acos, "acos"); } +comptime { exportNumFn(num.asin, "asin"); } -// MATH -const math_namespace = roc_builtins_namespace ++ ".math"; +// Str Module +const str = @import("str.zig"); +comptime { exportStrFn(str.strSplitInPlace, "str_split_in_place"); } +comptime { exportStrFn(str.countSegments, "count_segements"); } +comptime { exportStrFn(str.countGraphemeClusters, "count_grapheme_clusters"); } -comptime { @export(atan, .{ .name = math_namespace ++ ".atan", .linkage = .Strong }); } -fn atan(num: f64) callconv(.C) f64 { - return math.atan(num); -} - -comptime { @export(isFinite, .{ .name = math_namespace ++ ".is_finite", .linkage = .Strong }); 
} -fn isFinite(num: f64) callconv(.C) bool { - return math.isFinite(num); -} - -comptime { @export(powInt, .{ .name = math_namespace ++ ".pow_int", .linkage = .Strong }); } -fn powInt(base: i64, exp: i64) callconv(.C) i64 { - return math.pow(i64, base, exp); -} - -comptime { @export(acos, .{ .name = math_namespace ++ ".acos", .linkage = .Strong }); } -fn acos(num: f64) callconv(.C) f64 { - return math.acos(num); -} - -comptime { @export(asin, .{ .name = math_namespace ++ ".asin", .linkage = .Strong }); } -fn asin(num: f64) callconv(.C) f64 { - return math.asin(num); -} - -// STR -const str_namespace = roc_builtins_namespace ++ ".str"; - -// Str.split - -const RocStr = struct { - str_bytes_ptrs: [*]u8, - str_len: usize, - - pub fn init(bytes: [*]u8, len: usize) RocStr { - return RocStr { - .str_bytes_ptrs = bytes, - .str_len = len - }; - } - - pub fn eq(self: *RocStr, other: RocStr) bool { - if (self.str_len != other.str_len) { - return false; - } - - var areEq: bool = true; - var index: usize = 0; - while (index < self.str_len and areEq) { - areEq = areEq and self.str_bytes_ptrs[index] == other.str_bytes_ptrs[index]; - index = index + 1; - } - - return areEq; - } - - test "RocStr.eq: equal" { - const str1_len = 3; - var str1: [str1_len]u8 = "abc".*; - const str1_ptr: [*]u8 = &str1; - var roc_str1 = RocStr.init(str1_ptr, str1_len); - - const str2_len = 3; - var str2: [str2_len]u8 = "abc".*; - const str2_ptr: [*]u8 = &str2; - var roc_str2 = RocStr.init(str2_ptr, str2_len); - - expect(roc_str1.eq(roc_str2)); - } - - test "RocStr.eq: not equal different length" { - const str1_len = 4; - var str1: [str1_len]u8 = "abcd".*; - const str1_ptr: [*]u8 = &str1; - var roc_str1 = RocStr.init(str1_ptr, str1_len); - - const str2_len = 3; - var str2: [str2_len]u8 = "abc".*; - const str2_ptr: [*]u8 = &str2; - var roc_str2 = RocStr.init(str2_ptr, str2_len); - - expect(!roc_str1.eq(roc_str2)); - } - - test "RocStr.eq: not equal same length" { - const str1_len = 3; - var str1: 
[str1_len]u8 = "acb".*; - const str1_ptr: [*]u8 = &str1; - var roc_str1 = RocStr.init(str1_ptr, str1_len); - - const str2_len = 3; - var str2: [str2_len]u8 = "abc".*; - const str2_ptr: [*]u8 = &str2; - var roc_str2 = RocStr.init(str2_ptr, str2_len); - - expect(!roc_str1.eq(roc_str2)); - } -}; - -comptime { @export(strSplitInPlace, .{ .name = str_namespace ++ ".str_split_in_place", .linkage = .Strong }); } -fn strSplitInPlace( - array: [*]RocStr, - array_len: usize, - str_bytes_ptrs: [*]u8, - str_len: usize, - delimiter_bytes: [*]u8, - delimiter_len: usize -) callconv(.C) void { - var ret_array_index : usize = 0; - - var sliceStart_index : usize = 0; - - var str_index : usize = 0; - - if (str_len > delimiter_len) { - const end_index : usize = str_len - delimiter_len; - while (str_index <= end_index) { - var delimiter_index : usize = 0; - var matches_delimiter = true; - - while (delimiter_index < delimiter_len) { - var delimiterChar = delimiter_bytes[delimiter_index]; - var strChar = str_bytes_ptrs[str_index + delimiter_index]; - - if (delimiterChar != strChar) { - matches_delimiter = false; - break; - } - - delimiter_index += 1; - } - - if (matches_delimiter) { - array[ret_array_index] = RocStr.init(str_bytes_ptrs + sliceStart_index, str_index - sliceStart_index); - sliceStart_index = str_index + delimiter_len; - ret_array_index += 1; - str_index += delimiter_len; - } else { - str_index += 1; - } - } - } - - array[ret_array_index] = RocStr.init(str_bytes_ptrs + sliceStart_index, str_len - sliceStart_index); -} - -test "strSplitInPlace: no delimiter" { - // Str.split "abc" "!" 
== [ "abc" ] - - var str: [3]u8 = "abc".*; - const str_ptr: [*]u8 = &str; - - var delimiter: [1]u8 = "!".*; - const delimiter_ptr: [*]u8 = &delimiter; - - var array: [1]RocStr = undefined; - const array_ptr: [*]RocStr = &array; - - strSplitInPlace( - array_ptr, - 1, - str_ptr, - 3, - delimiter_ptr, - 1 - ); - - var expected = [1]RocStr{ - RocStr.init(str_ptr, 3), - }; - - expectEqual(array.len, expected.len); - expect(array[0].eq(expected[0])); -} - -test "strSplitInPlace: delimiter on sides" { - // Str.split "tttghittt" "ttt" == [ "", "ghi", "" ] - - const str_len: usize = 9; - var str: [str_len]u8 = "tttghittt".*; - const str_ptr: [*]u8 = &str; - - const delimiter_len = 3; - var delimiter: [delimiter_len]u8 = "ttt".*; - const delimiter_ptr: [*]u8 = &delimiter; - - const array_len : usize = 3; - var array: [array_len]RocStr = [_]RocStr{ - undefined , - undefined, - undefined, - }; - const array_ptr: [*]RocStr = &array; - - strSplitInPlace( - array_ptr, - array_len, - str_ptr, - str_len, - delimiter_ptr, - delimiter_len - ); - - const expected_str_len: usize = 3; - var expected_str: [expected_str_len]u8 = "ghi".*; - const expected_str_ptr: [*]u8 = &expected_str; - var expectedRocStr = RocStr.init(expected_str_ptr, expected_str_len); - - expectEqual(array.len, 3); - expectEqual(array[0].str_len, 0); - expect(array[1].eq(expectedRocStr)); - expectEqual(array[2].str_len, 0); -} - -test "strSplitInPlace: three pieces" { - // Str.split "a!b!c" "!" 
== [ "a", "b", "c" ] - - const str_len: usize = 5; - var str: [str_len]u8 = "a!b!c".*; - const str_ptr: [*]u8 = &str; - - const delimiter_len = 1; - var delimiter: [delimiter_len]u8 = "!".*; - const delimiter_ptr: [*]u8 = &delimiter; - - const array_len : usize = 3; - var array: [array_len]RocStr = undefined; - const array_ptr: [*]RocStr = &array; - - strSplitInPlace( - array_ptr, - array_len, - str_ptr, - str_len, - delimiter_ptr, - delimiter_len - ); - - var a: [1]u8 = "a".*; - const a_ptr: [*]u8 = &a; - - var b: [1]u8 = "b".*; - const b_ptr: [*]u8 = &b; - - var c: [1]u8 = "c".*; - const c_ptr: [*]u8 = &c; - - var expected_array = [array_len]RocStr{ - RocStr{ - .str_bytes_ptrs = a_ptr, - .str_len = 1, - }, - RocStr{ - .str_bytes_ptrs = b_ptr, - .str_len = 1, - }, - RocStr{ - .str_bytes_ptrs = c_ptr, - .str_len = 1, - } - }; - - expectEqual(expected_array.len, array.len); - expect(array[0].eq(expected_array[0])); - expect(array[1].eq(expected_array[1])); - expect(array[2].eq(expected_array[2])); -} - -// This is used for `Str.split : Str, Str -> Array Str -// It is used to count how many segments the input `_str` -// needs to be broken into, so that we can allocate a array -// of that size. It always returns at least 1. 
-comptime { @export(countSegments, .{ .name = str_namespace ++ ".count_segements", .linkage = .Strong }); } -fn countSegments( - str_bytes_ptrs: [*]u8, - str_len: usize, - delimiter_bytes: [*]u8, - delimiter_len: usize -) callconv(.C) i64 { - var count: i64 = 1; - - if (str_len > delimiter_len) { - var str_index: usize = 0; - const end_cond: usize = str_len - delimiter_len; - - while (str_index < end_cond) { - var delimiter_index: usize = 0; - - var matches_delimiter = true; - - while (delimiter_index < delimiter_len) { - const delimiterChar = delimiter_bytes[delimiter_index]; - const strChar = str_bytes_ptrs[str_index + delimiter_index]; - - if (delimiterChar != strChar) { - matches_delimiter = false; - break; - } - - delimiter_index += 1; - } - - if (matches_delimiter) { - count += 1; - } - - str_index += 1; - } - } - - return count; -} - -test "countSegments: long delimiter" { - // Str.split "str" "delimiter" == [ "str" ] - // 1 segment - - const str_len: usize = 3; - var str: [str_len]u8 = "str".*; - const str_ptr: [*]u8 = &str; - - const delimiter_len = 9; - var delimiter: [delimiter_len]u8 = "delimiter".*; - const delimiter_ptr: [*]u8 = &delimiter; - - const segments_count = countSegments( - str_ptr, - str_len, - delimiter_ptr, - delimiter_len - ); - - expectEqual(segments_count, 1); -} - -test "countSegments: delimiter at start" { - // Str.split "hello there" "hello" == [ "", " there" ] - // 2 segments - - const str_len: usize = 11; - var str: [str_len]u8 = "hello there".*; - const str_ptr: [*]u8 = &str; - - const delimiter_len = 5; - var delimiter: [delimiter_len]u8 = "hello".*; - const delimiter_ptr: [*]u8 = &delimiter; - - const segments_count = countSegments( - str_ptr, - str_len, - delimiter_ptr, - delimiter_len - ); - - expectEqual(segments_count, 2); -} - -test "countSegments: delimiter interspered" { - // Str.split "a!b!c" "!" 
== [ "a", "b", "c" ] - // 3 segments - - const str_len: usize = 5; - var str: [str_len]u8 = "a!b!c".*; - const str_ptr: [*]u8 = &str; - - const delimiter_len = 1; - var delimiter: [delimiter_len]u8 = "!".*; - const delimiter_ptr: [*]u8 = &delimiter; - - const segments_count = countSegments( - str_ptr, - str_len, - delimiter_ptr, - delimiter_len - ); - - expectEqual(segments_count, 3); -} - -// Str.countGraphemeClusters -const grapheme = @import("grapheme.zig"); - -comptime { @export(countGraphemeClusters, .{ .name = str_namespace ++ ".count_grapheme_clusters", .linkage = .Strong }); } -fn countGraphemeClusters(bytes_ptr: [*]u8, bytes_len: usize) callconv(.C) usize { - var bytes = bytes_ptr[0..bytes_len]; - var iter = (unicode.Utf8View.init(bytes) catch unreachable).iterator(); - - var count: usize = 0; - var grapheme_break_state: ?grapheme.BoundClass = null; - var grapheme_break_state_ptr = &grapheme_break_state; - var opt_last_codepoint: ?u21 = null; - while (iter.nextCodepoint()) |cur_codepoint| { - if (opt_last_codepoint) |last_codepoint| { - var did_break = grapheme.isGraphemeBreak( - last_codepoint, - cur_codepoint, - grapheme_break_state_ptr - ); - if (did_break) { - count += 1; - grapheme_break_state = null; - } - } - opt_last_codepoint = cur_codepoint; - } - - if (bytes_len != 0) { - count += 1; - } - - return count; -} - -test "countGraphemeClusters: empty string" { - var bytes_arr = "".*; - var bytes_len = bytes_arr.len; - var bytes_ptr: [*]u8 = &bytes_arr; - var count = countGraphemeClusters(bytes_ptr, bytes_len); - expectEqual(count, 0); -} - -test "countGraphemeClusters: ascii characters" { - var bytes_arr = "abcd".*; - var bytes_len = bytes_arr.len; - var bytes_ptr: [*]u8 = &bytes_arr; - var count = countGraphemeClusters(bytes_ptr, bytes_len); - expectEqual(count, 4); -} - -test "countGraphemeClusters: utf8 characters" { - var bytes_arr = "ãxā".*; - var bytes_len = bytes_arr.len; - var bytes_ptr: [*]u8 = &bytes_arr; - var count = 
countGraphemeClusters(bytes_ptr, bytes_len); - expectEqual(count, 3); -} - -test "countGraphemeClusters: emojis" { - var bytes_arr = "🤔🤔🤔".*; - var bytes_len = bytes_arr.len; - var bytes_ptr: [*]u8 = &bytes_arr; - var count = countGraphemeClusters(bytes_ptr, bytes_len); - expectEqual(count, 3); -} - -test "countGraphemeClusters: emojis and ut8 characters" { - var bytes_arr = "🤔å🤔¥🤔ç".*; - var bytes_len = bytes_arr.len; - var bytes_ptr: [*]u8 = &bytes_arr; - var count = countGraphemeClusters(bytes_ptr, bytes_len); - expectEqual(count, 6); -} - -test "countGraphemeClusters: emojis, ut8, and ascii characters" { - var bytes_arr = "6🤔å🤔e¥🤔çpp".*; - var bytes_len = bytes_arr.len; - var bytes_ptr: [*]u8 = &bytes_arr; - var count = countGraphemeClusters(bytes_ptr, bytes_len); - expectEqual(count, 10); +// Export helpers - Must be run inside a comptime +fn exportBuiltinFn(comptime fn_target: anytype, comptime fn_name: []const u8) void { + @export(fn_target, .{ .name = "roc_builtins." ++ fn_name, .linkage = .Strong }); +} +fn exportNumFn(comptime fn_target: anytype, comptime fn_name: []const u8) void { + exportBuiltinFn(fn_target, "num." ++ fn_name); +} +fn exportStrFn(comptime fn_target: anytype, comptime fn_name: []const u8) void { + exportBuiltinFn(fn_target, "str." 
++ fn_name); } +// Run all tests in imported modules // https://github.com/ziglang/zig/blob/master/lib/std/std.zig#L94 test "" { testing.refAllDecls(@This()); diff --git a/compiler/builtins/bitcode/src/num.zig b/compiler/builtins/bitcode/src/num.zig new file mode 100644 index 0000000000..4e9922288b --- /dev/null +++ b/compiler/builtins/bitcode/src/num.zig @@ -0,0 +1,22 @@ +const std = @import("std"); +const math = std.math; + +pub fn atan(num: f64) callconv(.C) f64 { + return math.atan(num); +} + +pub fn isFinite(num: f64) callconv(.C) bool { + return math.isFinite(num); +} + +pub fn powInt(base: i64, exp: i64) callconv(.C) i64 { + return math.pow(i64, base, exp); +} + +pub fn acos(num: f64) callconv(.C) f64 { + return math.acos(num); +} + +pub fn asin(num: f64) callconv(.C) f64 { + return math.asin(num); +} diff --git a/compiler/builtins/bitcode/src/str.zig b/compiler/builtins/bitcode/src/str.zig new file mode 100644 index 0000000000..80966f977e --- /dev/null +++ b/compiler/builtins/bitcode/src/str.zig @@ -0,0 +1,437 @@ +const std = @import("std"); +const unicode = std.unicode; +const testing = std.testing; +const expectEqual = testing.expectEqual; +const expect = testing.expect; + +const RocStr = struct { + str_bytes_ptrs: [*]u8, + str_len: usize, + + pub fn init(bytes: [*]u8, len: usize) RocStr { + return RocStr { + .str_bytes_ptrs = bytes, + .str_len = len + }; + } + + pub fn eq(self: *RocStr, other: RocStr) bool { + if (self.str_len != other.str_len) { + return false; + } + + var areEq: bool = true; + var index: usize = 0; + while (index < self.str_len and areEq) { + areEq = areEq and self.str_bytes_ptrs[index] == other.str_bytes_ptrs[index]; + index = index + 1; + } + + return areEq; + } + + test "RocStr.eq: equal" { + const str1_len = 3; + var str1: [str1_len]u8 = "abc".*; + const str1_ptr: [*]u8 = &str1; + var roc_str1 = RocStr.init(str1_ptr, str1_len); + + const str2_len = 3; + var str2: [str2_len]u8 = "abc".*; + const str2_ptr: [*]u8 = &str2; + var 
roc_str2 = RocStr.init(str2_ptr, str2_len); + + expect(roc_str1.eq(roc_str2)); + } + + test "RocStr.eq: not equal different length" { + const str1_len = 4; + var str1: [str1_len]u8 = "abcd".*; + const str1_ptr: [*]u8 = &str1; + var roc_str1 = RocStr.init(str1_ptr, str1_len); + + const str2_len = 3; + var str2: [str2_len]u8 = "abc".*; + const str2_ptr: [*]u8 = &str2; + var roc_str2 = RocStr.init(str2_ptr, str2_len); + + expect(!roc_str1.eq(roc_str2)); + } + + test "RocStr.eq: not equal same length" { + const str1_len = 3; + var str1: [str1_len]u8 = "acb".*; + const str1_ptr: [*]u8 = &str1; + var roc_str1 = RocStr.init(str1_ptr, str1_len); + + const str2_len = 3; + var str2: [str2_len]u8 = "abc".*; + const str2_ptr: [*]u8 = &str2; + var roc_str2 = RocStr.init(str2_ptr, str2_len); + + expect(!roc_str1.eq(roc_str2)); + } +}; + +// Str.split + +pub fn strSplitInPlace( + array: [*]RocStr, + array_len: usize, + str_bytes_ptrs: [*]u8, + str_len: usize, + delimiter_bytes: [*]u8, + delimiter_len: usize +) callconv(.C) void { + var ret_array_index : usize = 0; + + var sliceStart_index : usize = 0; + + var str_index : usize = 0; + + if (str_len > delimiter_len) { + const end_index : usize = str_len - delimiter_len; + while (str_index <= end_index) { + var delimiter_index : usize = 0; + var matches_delimiter = true; + + while (delimiter_index < delimiter_len) { + var delimiterChar = delimiter_bytes[delimiter_index]; + var strChar = str_bytes_ptrs[str_index + delimiter_index]; + + if (delimiterChar != strChar) { + matches_delimiter = false; + break; + } + + delimiter_index += 1; + } + + if (matches_delimiter) { + array[ret_array_index] = RocStr.init(str_bytes_ptrs + sliceStart_index, str_index - sliceStart_index); + sliceStart_index = str_index + delimiter_len; + ret_array_index += 1; + str_index += delimiter_len; + } else { + str_index += 1; + } + } + } + + array[ret_array_index] = RocStr.init(str_bytes_ptrs + sliceStart_index, str_len - sliceStart_index); +} + +test 
"strSplitInPlace: no delimiter" { + // Str.split "abc" "!" == [ "abc" ] + + var str: [3]u8 = "abc".*; + const str_ptr: [*]u8 = &str; + + var delimiter: [1]u8 = "!".*; + const delimiter_ptr: [*]u8 = &delimiter; + + var array: [1]RocStr = undefined; + const array_ptr: [*]RocStr = &array; + + strSplitInPlace( + array_ptr, + 1, + str_ptr, + 3, + delimiter_ptr, + 1 + ); + + var expected = [1]RocStr{ + RocStr.init(str_ptr, 3), + }; + + expectEqual(array.len, expected.len); + expect(array[0].eq(expected[0])); +} + +test "strSplitInPlace: delimiter on sides" { + // Str.split "tttghittt" "ttt" == [ "", "ghi", "" ] + + const str_len: usize = 9; + var str: [str_len]u8 = "tttghittt".*; + const str_ptr: [*]u8 = &str; + + const delimiter_len = 3; + var delimiter: [delimiter_len]u8 = "ttt".*; + const delimiter_ptr: [*]u8 = &delimiter; + + const array_len : usize = 3; + var array: [array_len]RocStr = [_]RocStr{ + undefined , + undefined, + undefined, + }; + const array_ptr: [*]RocStr = &array; + + strSplitInPlace( + array_ptr, + array_len, + str_ptr, + str_len, + delimiter_ptr, + delimiter_len + ); + + const expected_str_len: usize = 3; + var expected_str: [expected_str_len]u8 = "ghi".*; + const expected_str_ptr: [*]u8 = &expected_str; + var expectedRocStr = RocStr.init(expected_str_ptr, expected_str_len); + + expectEqual(array.len, 3); + expectEqual(array[0].str_len, 0); + expect(array[1].eq(expectedRocStr)); + expectEqual(array[2].str_len, 0); +} + +test "strSplitInPlace: three pieces" { + // Str.split "a!b!c" "!" 
== [ "a", "b", "c" ] + + const str_len: usize = 5; + var str: [str_len]u8 = "a!b!c".*; + const str_ptr: [*]u8 = &str; + + const delimiter_len = 1; + var delimiter: [delimiter_len]u8 = "!".*; + const delimiter_ptr: [*]u8 = &delimiter; + + const array_len : usize = 3; + var array: [array_len]RocStr = undefined; + const array_ptr: [*]RocStr = &array; + + strSplitInPlace( + array_ptr, + array_len, + str_ptr, + str_len, + delimiter_ptr, + delimiter_len + ); + + var a: [1]u8 = "a".*; + const a_ptr: [*]u8 = &a; + + var b: [1]u8 = "b".*; + const b_ptr: [*]u8 = &b; + + var c: [1]u8 = "c".*; + const c_ptr: [*]u8 = &c; + + var expected_array = [array_len]RocStr{ + RocStr{ + .str_bytes_ptrs = a_ptr, + .str_len = 1, + }, + RocStr{ + .str_bytes_ptrs = b_ptr, + .str_len = 1, + }, + RocStr{ + .str_bytes_ptrs = c_ptr, + .str_len = 1, + } + }; + + expectEqual(expected_array.len, array.len); + expect(array[0].eq(expected_array[0])); + expect(array[1].eq(expected_array[1])); + expect(array[2].eq(expected_array[2])); +} + +// This is used for `Str.split : Str, Str -> Array Str` +// It is used to count how many segments the input `_str` +// needs to be broken into, so that we can allocate an array +// of that size. It always returns at least 1. 
+pub fn countSegments( + str_bytes_ptrs: [*]u8, + str_len: usize, + delimiter_bytes: [*]u8, + delimiter_len: usize +) callconv(.C) i64 { + var count: i64 = 1; + + if (str_len > delimiter_len) { + var str_index: usize = 0; + const end_cond: usize = str_len - delimiter_len; + + while (str_index <= end_cond) { + var delimiter_index: usize = 0; + + var matches_delimiter = true; + + while (delimiter_index < delimiter_len) { + const delimiterChar = delimiter_bytes[delimiter_index]; + const strChar = str_bytes_ptrs[str_index + delimiter_index]; + + if (delimiterChar != strChar) { + matches_delimiter = false; + break; + } + + delimiter_index += 1; + } + + if (matches_delimiter) { + count += 1; + } + + str_index += 1; + } + } + + return count; +} + +test "countSegments: long delimiter" { + // Str.split "str" "delimiter" == [ "str" ] + // 1 segment + + const str_len: usize = 3; + var str: [str_len]u8 = "str".*; + const str_ptr: [*]u8 = &str; + + const delimiter_len = 9; + var delimiter: [delimiter_len]u8 = "delimiter".*; + const delimiter_ptr: [*]u8 = &delimiter; + + const segments_count = countSegments( + str_ptr, + str_len, + delimiter_ptr, + delimiter_len + ); + + expectEqual(segments_count, 1); +} + +test "countSegments: delimiter at start" { + // Str.split "hello there" "hello" == [ "", " there" ] + // 2 segments + + const str_len: usize = 11; + var str: [str_len]u8 = "hello there".*; + const str_ptr: [*]u8 = &str; + + const delimiter_len = 5; + var delimiter: [delimiter_len]u8 = "hello".*; + const delimiter_ptr: [*]u8 = &delimiter; + + const segments_count = countSegments( + str_ptr, + str_len, + delimiter_ptr, + delimiter_len + ); + + expectEqual(segments_count, 2); +} + +test "countSegments: delimiter interspered" { + // Str.split "a!b!c" "!" 
== [ "a", "b", "c" ] + // 3 segments + + const str_len: usize = 5; + var str: [str_len]u8 = "a!b!c".*; + const str_ptr: [*]u8 = &str; + + const delimiter_len = 1; + var delimiter: [delimiter_len]u8 = "!".*; + const delimiter_ptr: [*]u8 = &delimiter; + + const segments_count = countSegments( + str_ptr, + str_len, + delimiter_ptr, + delimiter_len + ); + + expectEqual(segments_count, 3); +} + +// Str.countGraphemeClusters +const grapheme = @import("helpers/grapheme.zig"); + +pub fn countGraphemeClusters(bytes_ptr: [*]u8, bytes_len: usize) callconv(.C) usize { + var bytes = bytes_ptr[0..bytes_len]; + var iter = (unicode.Utf8View.init(bytes) catch unreachable).iterator(); + + var count: usize = 0; + var grapheme_break_state: ?grapheme.BoundClass = null; + var grapheme_break_state_ptr = &grapheme_break_state; + var opt_last_codepoint: ?u21 = null; + while (iter.nextCodepoint()) |cur_codepoint| { + if (opt_last_codepoint) |last_codepoint| { + var did_break = grapheme.isGraphemeBreak( + last_codepoint, + cur_codepoint, + grapheme_break_state_ptr + ); + if (did_break) { + count += 1; + grapheme_break_state = null; + } + } + opt_last_codepoint = cur_codepoint; + } + + // Breaks are counted between clusters, so a non-empty str has + // one more grapheme cluster than the number of breaks found + if (bytes_len != 0) { + count += 1; + } + + return count; +} + +test "countGraphemeClusters: empty string" { + var bytes_arr = "".*; + var bytes_len = bytes_arr.len; + var bytes_ptr: [*]u8 = &bytes_arr; + var count = countGraphemeClusters(bytes_ptr, bytes_len); + expectEqual(count, 0); +} + +test "countGraphemeClusters: ascii characters" { + var bytes_arr = "abcd".*; + var bytes_len = bytes_arr.len; + var bytes_ptr: [*]u8 = &bytes_arr; + var count = countGraphemeClusters(bytes_ptr, bytes_len); + expectEqual(count, 4); +} + +test "countGraphemeClusters: utf8 characters" { + var bytes_arr = "ãxā".*; + var bytes_len = bytes_arr.len; + var bytes_ptr: [*]u8 = &bytes_arr; + var count = countGraphemeClusters(bytes_ptr, 
bytes_len); + expectEqual(count, 3); +} + +test "countGraphemeClusters: emojis" { + var bytes_arr = "🤔🤔🤔".*; + var bytes_len = bytes_arr.len; + var bytes_ptr: [*]u8 = &bytes_arr; + var count = countGraphemeClusters(bytes_ptr, bytes_len); + expectEqual(count, 3); +} + +test "countGraphemeClusters: emojis and ut8 characters" { + var bytes_arr = "🤔å🤔¥🤔ç".*; + var bytes_len = bytes_arr.len; + var bytes_ptr: [*]u8 = &bytes_arr; + var count = countGraphemeClusters(bytes_ptr, bytes_len); + expectEqual(count, 6); +} + +test "countGraphemeClusters: emojis, ut8, and ascii characters" { + var bytes_arr = "6🤔å🤔e¥🤔çpp".*; + var bytes_len = bytes_arr.len; + var bytes_ptr: [*]u8 = &bytes_arr; + var count = countGraphemeClusters(bytes_ptr, bytes_len); + expectEqual(count, 10); +} diff --git a/compiler/builtins/src/bitcode.rs b/compiler/builtins/src/bitcode.rs index 265247a58e..dedf409dbf 100644 --- a/compiler/builtins/src/bitcode.rs +++ b/compiler/builtins/src/bitcode.rs @@ -17,11 +17,11 @@ pub fn get_bytes() -> Vec { buffer } -pub const MATH_ASIN: &str = "roc_builtins.math.asin"; -pub const MATH_ACOS: &str = "roc_builtins.math.acos"; -pub const MATH_ATAN: &str = "roc_builtins.math.atan"; -pub const MATH_IS_FINITE: &str = "roc_builtins.math.is_finite"; -pub const MATH_POW_INT: &str = "roc_builtins.math.pow_int"; +pub const NUM_ASIN: &str = "roc_builtins.num.asin"; +pub const NUM_ACOS: &str = "roc_builtins.num.acos"; +pub const NUM_ATAN: &str = "roc_builtins.num.atan"; +pub const NUM_IS_FINITE: &str = "roc_builtins.num.is_finite"; +pub const NUM_POW_INT: &str = "roc_builtins.num.pow_int"; pub const STR_COUNT_SEGEMENTS: &str = "roc_builtins.str.count_segements"; pub const STR_SPLIT_IN_PLACE: &str = "roc_builtins.str.str_split_in_place"; diff --git a/compiler/gen/src/llvm/bitcode.rs b/compiler/gen/src/llvm/bitcode.rs index a9140a623d..bfd6557365 100644 --- a/compiler/gen/src/llvm/bitcode.rs +++ b/compiler/gen/src/llvm/bitcode.rs @@ -1,7 +1,7 @@ use inkwell::types::BasicTypeEnum; use 
roc_module::low_level::LowLevel; -fn call_bitcode_fn<'a, 'ctx, 'env>( +pub fn call_bitcode_fn<'a, 'ctx, 'env>( op: LowLevel, env: &Env<'a, 'ctx, 'env>, args: &[BasicValueEnum<'ctx>], diff --git a/compiler/gen/src/llvm/build.rs b/compiler/gen/src/llvm/build.rs index 3ee3614efc..261df4ab24 100644 --- a/compiler/gen/src/llvm/build.rs +++ b/compiler/gen/src/llvm/build.rs @@ -3033,7 +3033,7 @@ fn build_int_binop<'a, 'ctx, 'env>( NumPowInt, env, &[lhs.into(), rhs.into()], - &bitcode::MATH_POW_INT, + &bitcode::NUM_POW_INT, ), _ => { unreachable!("Unrecognized int binary operation: {:?}", op); @@ -3041,7 +3041,7 @@ fn build_int_binop<'a, 'ctx, 'env>( } } -fn call_bitcode_fn<'a, 'ctx, 'env>( +pub fn call_bitcode_fn<'a, 'ctx, 'env>( op: LowLevel, env: &Env<'a, 'ctx, 'env>, args: &[BasicValueEnum<'ctx>], @@ -3082,7 +3082,7 @@ fn build_float_binop<'a, 'ctx, 'env>( let result = bd.build_float_add(lhs, rhs, "add_float"); let is_finite = - call_bitcode_fn(NumIsFinite, env, &[result.into()], &bitcode::MATH_IS_FINITE) + call_bitcode_fn(NumIsFinite, env, &[result.into()], &bitcode::NUM_IS_FINITE) .into_int_value(); let then_block = context.append_basic_block(parent, "then_block"); @@ -3104,7 +3104,7 @@ fn build_float_binop<'a, 'ctx, 'env>( let result = bd.build_float_add(lhs, rhs, "add_float"); let is_finite = - call_bitcode_fn(NumIsFinite, env, &[result.into()], &bitcode::MATH_IS_FINITE) + call_bitcode_fn(NumIsFinite, env, &[result.into()], &bitcode::NUM_IS_FINITE) .into_int_value(); let is_infinite = bd.build_not(is_finite, "negate"); @@ -3234,10 +3234,10 @@ fn build_float_unary_op<'a, 'ctx, 'env>( env.context.i64_type(), "num_floor", ), - NumIsFinite => call_bitcode_fn(NumIsFinite, env, &[arg.into()], &bitcode::MATH_IS_FINITE), - NumAtan => call_bitcode_fn(NumAtan, env, &[arg.into()], &bitcode::MATH_ATAN), - NumAcos => call_bitcode_fn(NumAcos, env, &[arg.into()], &bitcode::MATH_ACOS), - NumAsin => call_bitcode_fn(NumAsin, env, &[arg.into()], &bitcode::MATH_ASIN), + NumIsFinite => 
call_bitcode_fn(NumIsFinite, env, &[arg.into()], &bitcode::NUM_IS_FINITE), + NumAtan => call_bitcode_fn(NumAtan, env, &[arg.into()], &bitcode::NUM_ATAN), + NumAcos => call_bitcode_fn(NumAcos, env, &[arg.into()], &bitcode::NUM_ACOS), + NumAsin => call_bitcode_fn(NumAsin, env, &[arg.into()], &bitcode::NUM_ASIN), _ => { unreachable!("Unrecognized int unary operation: {:?}", op); } diff --git a/compiler/gen/src/llvm/build_str.rs b/compiler/gen/src/llvm/build_str.rs index 2c50ea0db7..32e71ccc95 100644 --- a/compiler/gen/src/llvm/build_str.rs +++ b/compiler/gen/src/llvm/build_str.rs @@ -1,4 +1,4 @@ -use crate::llvm::build::{ptr_from_symbol, Env, InPlace, Scope}; +use crate::llvm::build::{ptr_from_symbol, Env, InPlace, Scope, call_bitcode_fn}; use crate::llvm::build_list::{ allocate_list, build_basic_phi2, empty_list, incrementing_elem_loop, load_list_ptr, store_list, }; @@ -29,19 +29,19 @@ pub fn str_concat<'a, 'ctx, 'env>( let second_str_ptr = ptr_from_symbol(scope, second_str_symbol); let first_str_ptr = ptr_from_symbol(scope, first_str_symbol); - let str_wrapper_type = BasicTypeEnum::StructType(collection(ctx, env.ptr_bytes)); + let ret_type = BasicTypeEnum::StructType(collection(ctx, env.ptr_bytes)); load_str( env, parent, *second_str_ptr, - str_wrapper_type, + ret_type, |second_str_ptr, second_str_len, second_str_smallness| { load_str( env, parent, *first_str_ptr, - str_wrapper_type, + ret_type, |first_str_ptr, first_str_len, first_str_smallness| { // first_str_len > 0 // We do this check to avoid allocating memory. 
If the first input @@ -74,7 +74,7 @@ pub fn str_concat<'a, 'ctx, 'env>( second_str_length_comparison, if_second_str_is_nonempty, if_second_str_is_empty, - str_wrapper_type, + ret_type, ) }; @@ -604,13 +604,13 @@ pub fn str_count_graphemes<'a, 'ctx, 'env>( let ctx = env.context; let sym_str_ptr = ptr_from_symbol(scope, str_symbol); - let str_wrapper_type = BasicTypeEnum::StructType(collection(ctx, env.ptr_bytes)); + let ret_type = BasicTypeEnum::IntType(ctx.i64_type()); load_str( env, parent, *sym_str_ptr, - str_wrapper_type, + ret_type, |str_ptr, str_len, _str_smallness| { call_bitcode_fn( LowLevel::StrCountGraphemes, @@ -624,24 +624,3 @@ pub fn str_count_graphemes<'a, 'ctx, 'env>( }, ) } - -// Duplicated from build.rs for now, once it's all working I'll delete this and import it form a -// common place -fn call_bitcode_fn<'a, 'ctx, 'env>( - op: LowLevel, - env: &Env<'a, 'ctx, 'env>, - args: &[BasicValueEnum<'ctx>], - fn_name: &str, -) -> BasicValueEnum<'ctx> { - let fn_val = env - .module - .get_function(fn_name) - .unwrap_or_else(|| panic!("Unrecognized builtin function: {:?} - if you're working on the Roc compiler, do you need to rebuild the bitcode? 
See compiler/builtins/bitcode/README.md", fn_name)); - let call = env.builder.build_call(fn_val, args, "call_builtin"); - - call.set_call_convention(fn_val.get_call_conventions()); - - call.try_as_basic_value() - .left() - .unwrap_or_else(|| panic!("LLVM error: Invalid call for low-level op {:?}", op)) -} diff --git a/compiler/gen/tests/gen_str.rs b/compiler/gen/tests/gen_str.rs index ee37500058..ccc2b57320 100644 --- a/compiler/gen/tests/gen_str.rs +++ b/compiler/gen/tests/gen_str.rs @@ -204,7 +204,12 @@ mod gen_str { } #[test] - fn str_count_graphemes() { - assert_evals_to!(r#"Str.countGraphemes "6🤔å🤔e¥🤔çpp""#, 10, usize); + fn str_count_graphemes_small_str() { + assert_evals_to!(r#"Str.countGraphemes "å🤔""#, 2, usize); + } + + #[test] + fn str_count_graphemes_big_str() { + assert_evals_to!(r#"Str.countGraphemes "6🤔å🤔e¥🤔çppkd🙃1jdal🦯asdfa∆ltråø˚waia8918.,🏅jjc""#, 45, usize); } } diff --git a/compiler/mono/src/borrow.rs b/compiler/mono/src/borrow.rs index fa6a7da8ee..848b829638 100644 --- a/compiler/mono/src/borrow.rs +++ b/compiler/mono/src/borrow.rs @@ -506,12 +506,11 @@ pub fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[bool] { // - arguments that we may want to update destructively must be Owned // - other refcounted arguments are Borrowed match op { - ListLen | StrIsEmpty => arena.alloc_slice_copy(&[borrowed]), + ListLen | StrIsEmpty | StrCountGraphemes => arena.alloc_slice_copy(&[borrowed]), ListSet => arena.alloc_slice_copy(&[owned, irrelevant, irrelevant]), ListSetInPlace => arena.alloc_slice_copy(&[owned, irrelevant, irrelevant]), ListGetUnsafe => arena.alloc_slice_copy(&[borrowed, irrelevant]), ListConcat | StrConcat => arena.alloc_slice_copy(&[owned, borrowed]), - StrCountGraphemes => arena.alloc_slice_copy(&[borrowed]), ListSingle => arena.alloc_slice_copy(&[irrelevant]), ListRepeat => arena.alloc_slice_copy(&[irrelevant, irrelevant]),