diff --git a/compiler/builtins/bitcode/src/main.zig b/compiler/builtins/bitcode/src/main.zig index d33149fbdc..a356964e25 100644 --- a/compiler/builtins/bitcode/src/main.zig +++ b/compiler/builtins/bitcode/src/main.zig @@ -121,6 +121,7 @@ comptime { exportStrFn(str.fromUtf8C, "from_utf8"); exportStrFn(str.fromUtf8RangeC, "from_utf8_range"); exportStrFn(str.repeat, "repeat"); + exportStrFn(str.strTrim, "trim"); } // Utils diff --git a/compiler/builtins/bitcode/src/str.zig b/compiler/builtins/bitcode/src/str.zig index 4985e6bbc5..7ca72c7cc4 100644 --- a/compiler/builtins/bitcode/src/str.zig +++ b/compiler/builtins/bitcode/src/str.zig @@ -163,7 +163,7 @@ pub const RocStr = extern struct { ) RocStr { const element_width = 1; - if (self.bytes) |source_ptr| { + if (self.str_bytes) |source_ptr| { if (self.isUnique()) { const new_source = utils.unsafeReallocate(source_ptr, RocStr.alignment, self.len(), new_length, element_width); @@ -171,7 +171,7 @@ pub const RocStr = extern struct { } } - return self.reallocateFresh(RocStr.alignment, new_length, element_width); + return self.reallocateFresh(new_length); } /// reallocate by explicitly making a new allocation and copying elements over @@ -294,7 +294,7 @@ pub const RocStr = extern struct { } pub fn isUnique(self: RocStr) bool { - // the empty list is unique (in the sense that copying it will not leak memory) + // the empty string is unique (in the sense that copying it will not leak memory) if (self.isEmpty()) { return true; } @@ -305,6 +305,10 @@ pub const RocStr = extern struct { } // otherwise, check if the refcount is one + return @call(.{ .modifier = always_inline }, RocStr.isRefcountOne, .{self}); + } + + fn isRefcountOne(self: RocStr) bool { const ptr: [*]usize = @ptrCast([*]usize, @alignCast(8, self.str_bytes)); return (ptr - 1)[0] == utils.REFCOUNT_ONE; } @@ -1473,3 +1477,253 @@ test "validateUtf8Bytes: surrogate halves" { try expectErr(list, 3, error.Utf8EncodesSurrogateHalf, Utf8ByteProblem.EncodesSurrogateHalf); } + +fn isWhitespace(codepoint: u21) bool { + // https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt + return switch (codepoint) { + 0x0009...0x000D => true, // control characters + 0x0020 => true, // space + 0x0085 => true, // control character + 0x00A0 => true, // no-break space + 0x1680 => true, // ogham space + 0x2000...0x200A => true, // en quad..hair space + 0x200E...0x200F => true, // left-to-right & right-to-left marks + 0x2028 => true, // line separator + 0x2029 => true, // paragraph separator + 0x202F => true, // narrow no-break space + 0x205F => true, // medium mathematical space + 0x3000 => true, // ideographic space + + else => false, + }; +} + +test "isWhitespace" { + try expect(isWhitespace(' ')); + try expect(isWhitespace('\u{00A0}')); + try expect(!isWhitespace('x')); +} + +pub fn strTrim(string: RocStr) callconv(.C) RocStr { + if (string.str_bytes) |bytes_ptr| { + const leading_bytes = countLeadingWhitespaceBytes(string); + const original_len = string.len(); + + if (original_len == leading_bytes) { + string.deinit(); + return RocStr.empty(); + } + + const trailing_bytes = countTrailingWhitespaceBytes(string); + const new_len = original_len - leading_bytes - trailing_bytes; + + const small_or_shared = new_len <= SMALL_STR_MAX_LENGTH or !string.isRefcountOne(); + if (small_or_shared) { + return RocStr.init(string.asU8ptr() + leading_bytes, new_len); + } + + // nonempty, large, and unique: + + if (leading_bytes > 0) { + var i: usize = 0; + while (i < new_len) : (i += 1) { + const dest = bytes_ptr + i; + const source = dest + leading_bytes; + @memcpy(dest, source, 1); + } + } + + var new_string = string; + new_string.str_len = new_len; + + return new_string; + } + + return RocStr.empty(); +} + +fn countLeadingWhitespaceBytes(string: RocStr) usize { + var byte_count: usize = 0; + + var bytes = string.asU8ptr()[0..string.len()]; + var iter = unicode.Utf8View.initUnchecked(bytes).iterator(); + while (iter.nextCodepoint()) |codepoint| { + if (isWhitespace(codepoint)) { + byte_count += unicode.utf8CodepointSequenceLength(codepoint) catch break; + } else { + break; + } + } + + return byte_count; +} + +fn countTrailingWhitespaceBytes(string: RocStr) usize { + var byte_count: usize = 0; + + var bytes = string.asU8ptr()[0..string.len()]; + var iter = ReverseUtf8View.initUnchecked(bytes).iterator(); + while (iter.nextCodepoint()) |codepoint| { + if (isWhitespace(codepoint)) { + byte_count += unicode.utf8CodepointSequenceLength(codepoint) catch break; + } else { + break; + } + } + + return byte_count; +} + +/// A backwards version of Utf8View from std.unicode +const ReverseUtf8View = struct { + bytes: []const u8, + + pub fn initUnchecked(s: []const u8) ReverseUtf8View { + return ReverseUtf8View{ .bytes = s }; + } + + pub fn iterator(s: ReverseUtf8View) ReverseUtf8Iterator { + return ReverseUtf8Iterator{ + .bytes = s.bytes, + .i = if (s.bytes.len > 0) s.bytes.len - 1 else null, + }; + } +}; + +/// A backwards version of Utf8Iterator from std.unicode +const ReverseUtf8Iterator = struct { + bytes: []const u8, + // NOTE null signifies complete/empty + i: ?usize, + + pub fn nextCodepointSlice(it: *ReverseUtf8Iterator) ?[]const u8 { + if (it.i) |index| { + var i = index; + + // NOTE this relies on the string being valid utf8 to not run off the end + while (!utf8BeginByte(it.bytes[i])) { + i -= 1; + } + + const cp_len = unicode.utf8ByteSequenceLength(it.bytes[i]) catch unreachable; + const slice = it.bytes[i .. i + cp_len]; + + it.i = if (i == 0) null else i - 1; + + return slice; + } else { + return null; + } + } + + pub fn nextCodepoint(it: *ReverseUtf8Iterator) ?u21 { + const slice = it.nextCodepointSlice() orelse return null; + + return switch (slice.len) { + 1 => @as(u21, slice[0]), + 2 => unicode.utf8Decode2(slice) catch unreachable, + 3 => unicode.utf8Decode3(slice) catch unreachable, + 4 => unicode.utf8Decode4(slice) catch unreachable, + else => unreachable, + }; + } +}; + +fn utf8BeginByte(byte: u8) bool { + return switch (byte) { + 0b1000_0000...0b1011_1111 => false, + else => true, + }; +} + +test "strTrim: empty" { + const trimmedEmpty = strTrim(RocStr.empty()); + try expect(trimmedEmpty.eq(RocStr.empty())); +} + +test "strTrim: blank" { + const original_bytes = " "; + const original = RocStr.init(original_bytes, original_bytes.len); + defer original.deinit(); + + const trimmed = strTrim(original); + + try expect(trimmed.eq(RocStr.empty())); +} + +test "strTrim: large to large" { + const original_bytes = " hello giant world "; + const original = RocStr.init(original_bytes, original_bytes.len); + defer original.deinit(); + + try expect(!original.isSmallStr()); + + const expected_bytes = "hello giant world"; + const expected = RocStr.init(expected_bytes, expected_bytes.len); + defer expected.deinit(); + + try expect(!expected.isSmallStr()); + + const trimmed = strTrim(original); + + try expect(trimmed.eq(expected)); +} + +test "strTrim: large to small" { + const original_bytes = " hello world "; + const original = RocStr.init(original_bytes, original_bytes.len); + defer original.deinit(); + + try expect(!original.isSmallStr()); + + const expected_bytes = "hello world"; + const expected = RocStr.init(expected_bytes, expected_bytes.len); + defer expected.deinit(); + + try expect(expected.isSmallStr()); + + const trimmed = strTrim(original); + + try expect(trimmed.eq(expected)); + try expect(trimmed.isSmallStr()); +} + +test "strTrim: small to small" { + const original_bytes = " hello world "; + const original = RocStr.init(original_bytes, original_bytes.len); + defer original.deinit(); + + try expect(original.isSmallStr()); + + const expected_bytes = "hello world"; + const expected = RocStr.init(expected_bytes, expected_bytes.len); + defer expected.deinit(); + + try expect(expected.isSmallStr()); + + const trimmed = strTrim(original); + + try expect(trimmed.eq(expected)); + try expect(trimmed.isSmallStr()); +} + +test "ReverseUtf8View: hello world" { + const original_bytes = "hello world"; + const expected_bytes = "dlrow olleh"; + + var i: usize = 0; + var iter = ReverseUtf8View.initUnchecked(original_bytes).iterator(); + while (iter.nextCodepoint()) |codepoint| { + try expect(expected_bytes[i] == codepoint); + i += 1; + } +} + +test "ReverseUtf8View: empty" { + const original_bytes = ""; + + var iter = ReverseUtf8View.initUnchecked(original_bytes).iterator(); + while (iter.nextCodepoint()) |codepoint| { + try expect(false); + } +} diff --git a/compiler/builtins/src/bitcode.rs b/compiler/builtins/src/bitcode.rs index 2623b2ad3b..c517094b8a 100644 --- a/compiler/builtins/src/bitcode.rs +++ b/compiler/builtins/src/bitcode.rs @@ -142,6 +142,7 @@ pub const STR_TO_UTF8: &str = "roc_builtins.str.to_utf8"; pub const STR_FROM_UTF8: &str = "roc_builtins.str.from_utf8"; pub const STR_FROM_UTF8_RANGE: &str = "roc_builtins.str.from_utf8_range"; pub const STR_REPEAT: &str = "roc_builtins.str.repeat"; +pub const STR_TRIM: &str = "roc_builtins.str.trim"; pub const DICT_HASH: &str = "roc_builtins.dict.hash"; pub const DICT_HASH_STR: &str = "roc_builtins.dict.hash_str"; diff --git a/compiler/builtins/src/std.rs b/compiler/builtins/src/std.rs index b39bd4f6e8..e416912b1f 100644 --- a/compiler/builtins/src/std.rs +++ b/compiler/builtins/src/std.rs @@ -632,6 +632,9 @@ pub fn types() -> MutMap { Box::new(str_type()) ); + // trim : Str -> Str + add_top_level_function_type!(Symbol::STR_TRIM, vec![str_type()], Box::new(str_type())); + // fromUtf8 : List U8 -> Result Str [ BadUtf8 Utf8Problem ]* { let bad_utf8 = SolvedType::TagUnion( diff --git a/compiler/can/src/builtins.rs b/compiler/can/src/builtins.rs index d670491361..c21ed5f27c 100644 --- a/compiler/can/src/builtins.rs +++ b/compiler/can/src/builtins.rs @@ -67,6 +67,7 @@ pub fn builtin_defs_map(symbol: Symbol, var_store: &mut VarStore) -> Option STR_TO_UTF8 => str_to_utf8, STR_FROM_FLOAT=> str_from_float, STR_REPEAT => str_repeat, + STR_TRIM => str_trim, LIST_LEN => list_len, LIST_GET => list_get, LIST_SET => list_set, @@ -1238,6 +1239,11 @@ fn str_split(symbol: Symbol, var_store: &mut VarStore) -> Def { ) } +/// Str.trim : Str -> Str +fn str_trim(symbol: Symbol, var_store: &mut VarStore) -> Def { + lowlevel_1(symbol, LowLevel::StrTrim, var_store) +} + /// Str.repeat : Str, Nat -> Str fn str_repeat(symbol: Symbol, var_store: &mut VarStore) -> Def { let str_var = var_store.fresh(); diff --git a/compiler/gen_llvm/src/llvm/build.rs b/compiler/gen_llvm/src/llvm/build.rs index 34461198d8..55d35a481d 100644 --- a/compiler/gen_llvm/src/llvm/build.rs +++ b/compiler/gen_llvm/src/llvm/build.rs @@ -17,7 +17,7 @@ use crate::llvm::build_list::{ use crate::llvm::build_str::{ empty_str, str_concat, str_count_graphemes, str_ends_with, str_from_float, str_from_int, str_from_utf8, str_from_utf8_range, str_join_with, str_number_of_bytes, str_repeat, str_split, - str_starts_with, str_starts_with_code_point, str_to_utf8, + str_starts_with, str_starts_with_code_point, str_to_utf8, str_trim, }; use crate::llvm::compare::{generic_eq, generic_neq}; use crate::llvm::convert::{ @@ -4953,6 +4953,12 @@ fn run_low_level<'a, 'ctx, 'env>( str_count_graphemes(env, scope, args[0]) } + StrTrim => { + // Str.trim : Str -> Str + debug_assert_eq!(args.len(), 1); + + str_trim(env, scope, args[0]) + } ListLen => { // List.len : List * -> Int debug_assert_eq!(args.len(), 1); diff --git a/compiler/gen_llvm/src/llvm/build_str.rs b/compiler/gen_llvm/src/llvm/build_str.rs index 564f35625e..8a8241719b 100644 --- a/compiler/gen_llvm/src/llvm/build_str.rs +++ b/compiler/gen_llvm/src/llvm/build_str.rs @@ -249,6 +249,16 @@ pub fn str_count_graphemes<'a, 'ctx, 'env>( ) } +/// Str.trim : Str -> Str +pub fn str_trim<'a, 'ctx, 'env>( + env: &Env<'a, 'ctx, 'env>, + scope: &Scope<'a, 'ctx>, + str_symbol: Symbol, +) -> BasicValueEnum<'ctx> { + let str_i128 = str_symbol_to_c_abi(env, scope, str_symbol); + call_bitcode_fn(env, &[str_i128.into()], bitcode::STR_TRIM) +} + /// Str.fromInt : Int -> Str pub fn str_from_int<'a, 'ctx, 'env>( env: &Env<'a, 'ctx, 'env>, diff --git a/compiler/module/src/low_level.rs b/compiler/module/src/low_level.rs index 9be88c2945..c38acfc71e 100644 --- a/compiler/module/src/low_level.rs +++ b/compiler/module/src/low_level.rs @@ -17,6 +17,7 @@ pub enum LowLevel { StrToUtf8, StrRepeat, StrFromFloat, + StrTrim, ListLen, ListGetUnsafe, ListSet, @@ -123,6 +124,7 @@ macro_rules! first_order { | StrFromUtf8Range | StrToUtf8 | StrRepeat + | StrTrim | StrFromFloat | ListLen | ListGetUnsafe diff --git a/compiler/module/src/symbol.rs b/compiler/module/src/symbol.rs index d6acdf24db..c4961522c8 100644 --- a/compiler/module/src/symbol.rs +++ b/compiler/module/src/symbol.rs @@ -1015,6 +1015,7 @@ define_builtins! { 17 STR_ALIAS_ANALYSIS_STATIC: "#aliasAnalysisStatic" // string with the static lifetime 18 STR_FROM_UTF8_RANGE: "fromUtf8Range" 19 STR_REPEAT: "repeat" + 20 STR_TRIM: "trim" } 4 LIST: "List" => { 0 LIST_LIST: "List" imported // the List.List type alias diff --git a/compiler/mono/src/borrow.rs b/compiler/mono/src/borrow.rs index d890c136c7..2c2430d0fb 100644 --- a/compiler/mono/src/borrow.rs +++ b/compiler/mono/src/borrow.rs @@ -922,6 +922,7 @@ pub fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[bool] { ListGetUnsafe => arena.alloc_slice_copy(&[borrowed, irrelevant]), ListConcat => arena.alloc_slice_copy(&[owned, owned]), StrConcat => arena.alloc_slice_copy(&[owned, borrowed]), + StrTrim => arena.alloc_slice_copy(&[owned]), StrSplit => arena.alloc_slice_copy(&[borrowed, borrowed]), ListSingle => arena.alloc_slice_copy(&[irrelevant]), ListRepeat => arena.alloc_slice_copy(&[irrelevant, borrowed]), diff --git a/compiler/solve/tests/solve_expr.rs b/compiler/solve/tests/solve_expr.rs index 43b97738a4..3b0c758c26 100644 --- a/compiler/solve/tests/solve_expr.rs +++ b/compiler/solve/tests/solve_expr.rs @@ -3733,6 +3733,18 @@ mod solve_expr { ); } + #[test] + fn str_trim() { + infer_eq_without_problem( + indoc!( + r#" + Str.trim + "# + ), + "Str -> Str", + ); + } + #[test] fn list_drop_last() { infer_eq_without_problem( diff --git a/compiler/test_gen/src/gen_list.rs b/compiler/test_gen/src/gen_list.rs index 4dddce1997..0aa87308cd 100644 --- a/compiler/test_gen/src/gen_list.rs +++ b/compiler/test_gen/src/gen_list.rs @@ -215,7 +215,7 @@ fn list_drop_at() { } #[test] -fn list_drop_at_mutable() { +fn list_drop_at_shared() { assert_evals_to!( indoc!( r#" diff --git a/compiler/test_gen/src/gen_str.rs b/compiler/test_gen/src/gen_str.rs index 292946403e..aebfd70c37 100644 --- a/compiler/test_gen/src/gen_str.rs +++ b/compiler/test_gen/src/gen_str.rs @@ -977,3 +977,94 @@ fn str_repeat_empty_string() { fn str_repeat_zero_times() { assert_evals_to!(indoc!(r#"Str.repeat "Roc" 0"#), RocStr::from(""), RocStr); } + +#[test] +fn str_trim_empty_string() { + assert_evals_to!(indoc!(r#"Str.trim """#), RocStr::from(""), RocStr); +} + +#[test] +fn str_trim_small_blank_string() { + assert_evals_to!(indoc!(r#"Str.trim " ""#), RocStr::from(""), RocStr); +} + +#[test] +fn str_trim_small_to_small() { + assert_evals_to!( + indoc!(r#"Str.trim " hello world ""#), + RocStr::from("hello world"), + RocStr + ); +} + +#[test] +fn str_trim_large_to_large_unique() { + assert_evals_to!( + indoc!(r#"Str.trim (Str.concat " " "hello world from a large string ")"#), + RocStr::from("hello world from a large string"), + RocStr + ); +} + +#[test] +fn str_trim_large_to_small_unique() { + assert_evals_to!( + indoc!(r#"Str.trim (Str.concat " " "hello world ")"#), + RocStr::from("hello world"), + RocStr + ); +} + +#[test] +fn str_trim_large_to_large_shared() { + assert_evals_to!( + indoc!( + r#" + original : Str + original = " hello world world " + + { trimmed: Str.trim original, original: original } + "# + ), + ( + RocStr::from(" hello world world "), + RocStr::from("hello world world"), + ), + (RocStr, RocStr) + ); +} + +#[test] +fn str_trim_large_to_small_shared() { + assert_evals_to!( + indoc!( + r#" + original : Str + original = " hello world " + + { trimmed: Str.trim original, original: original } + "# + ), + ( + RocStr::from(" hello world "), + RocStr::from("hello world"), + ), + (RocStr, RocStr) + ); +} + +#[test] +fn str_trim_small_to_small_shared() { + assert_evals_to!( + indoc!( + r#" + original : Str + original = " hello world " + + { trimmed: Str.trim original, original: original } + "# + ), + (RocStr::from(" hello world "), RocStr::from("hello world"),), + (RocStr, RocStr) + ); +} diff --git a/examples/hello-rust/hello-world b/examples/hello-rust/hello-world new file mode 100755 index 0000000000..04439a158d Binary files /dev/null and b/examples/hello-rust/hello-world differ