Merge branch 'trunk' into list_min

2025-09-29 14:54:47 +00:00 · 2021-10-28 20:32:27 +02:00 · 2021-10-28 20:32:27 +02:00 · aa978ae6d9
commit aa978ae6d9
parent 329c035e24 1e6b5db1e3
14 changed files with 393 additions and 5 deletions
--- a/compiler/builtins/bitcode/src/main.zig
+++ b/compiler/builtins/bitcode/src/main.zig
@ -121,6 +121,7 @@ comptime {
    exportStrFn(str.fromUtf8C, "from_utf8");
    exportStrFn(str.fromUtf8RangeC, "from_utf8_range");
    exportStrFn(str.repeat, "repeat");
+    exportStrFn(str.strTrim, "trim");
 }

 // Utils
--- a/compiler/builtins/bitcode/src/str.zig
+++ b/compiler/builtins/bitcode/src/str.zig
@ -163,7 +163,7 @@ pub const RocStr = extern struct {
    ) RocStr {
        const element_width = 1;

-        if (self.bytes) |source_ptr| {
+        if (self.str_bytes) |source_ptr| {
            if (self.isUnique()) {
                const new_source = utils.unsafeReallocate(source_ptr, RocStr.alignment, self.len(), new_length, element_width);

@ -171,7 +171,7 @@ pub const RocStr = extern struct {
            }
        }

-        return self.reallocateFresh(RocStr.alignment, new_length, element_width);
+        return self.reallocateFresh(new_length);
    }

    /// reallocate by explicitly making a new allocation and copying elements over
@ -294,7 +294,7 @@ pub const RocStr = extern struct {
    }

    pub fn isUnique(self: RocStr) bool {
-        // the empty list is unique (in the sense that copying it will not leak memory)
+        // the empty string is unique (in the sense that copying it will not leak memory)
        if (self.isEmpty()) {
            return true;
        }
@ -305,6 +305,10 @@ pub const RocStr = extern struct {
        }

        // otherwise, check if the refcount is one
+        return @call(.{ .modifier = always_inline }, RocStr.isRefcountOne, .{self});
+    }
+
+    /// Reads the `usize` stored immediately before `str_bytes` and reports
+    /// whether it equals `utils.REFCOUNT_ONE`.
+    ///
+    /// No emptiness check is made here; `isUnique` performs its own checks
+    /// before delegating to this function.
+    ///
+    /// NOTE(review): the alignment passed to `@alignCast` is the literal 8,
+    /// which presumably assumes a 64-bit `usize` - confirm for 32-bit targets.
+    fn isRefcountOne(self: RocStr) bool {
        const ptr: [*]usize = @ptrCast([*]usize, @alignCast(8, self.str_bytes));
        return (ptr - 1)[0] == utils.REFCOUNT_ONE;
    }
@ -1473,3 +1477,253 @@ test "validateUtf8Bytes: surrogate halves" {

    try expectErr(list, 3, error.Utf8EncodesSurrogateHalf, Utf8ByteProblem.EncodesSurrogateHalf);
 }
+
+/// Reports whether `codepoint` has the Unicode `White_Space` property.
+///
+/// The ranges mirror the `White_Space` entries of
+/// https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+///
+/// U+200E and U+200F (left-to-right / right-to-left marks) are intentionally
+/// not matched: PropList.txt lists them under `Pattern_White_Space` only.
+/// They are invisible format characters that steer bidirectional layout, so
+/// treating them as trimmable whitespace would change how the remaining text
+/// is displayed.
+fn isWhitespace(codepoint: u21) bool {
+    return switch (codepoint) {
+        0x0009...0x000D => true, // tab, line feed, vertical tab, form feed, carriage return
+        0x0020 => true, // space
+        0x0085 => true, // next line (NEL)
+        0x00A0 => true, // no-break space
+        0x1680 => true, // ogham space mark
+        0x2000...0x200A => true, // en quad..hair space
+        0x2028 => true, // line separator
+        0x2029 => true, // paragraph separator
+        0x202F => true, // narrow no-break space
+        0x205F => true, // medium mathematical space
+        0x3000 => true, // ideographic space
+
+        else => false,
+    };
+}
+
+test "isWhitespace" {
+    // representative members
+    try expect(isWhitespace(' '));
+    try expect(isWhitespace('\u{00A0}'));
+    try expect(isWhitespace('\u{3000}'));
+
+    // first and last code point of the multi-element ranges
+    try expect(isWhitespace(0x0009));
+    try expect(isWhitespace(0x000D));
+    try expect(isWhitespace(0x2000));
+    try expect(isWhitespace(0x200A));
+
+    // neighbours just outside those ranges, plus an ordinary letter
+    try expect(!isWhitespace(0x0008));
+    try expect(!isWhitespace(0x000E));
+    try expect(!isWhitespace(0x1FFF));
+    try expect(!isWhitespace(0x200B));
+    try expect(!isWhitespace('x'));
+}
+
+/// Str.trim: removes leading and trailing whitespace (as classified by
+/// `isWhitespace`) from `string`.
+///
+/// Outcomes:
+/// - `str_bytes` is null: an empty string is returned.
+/// - every byte is whitespace: `string` is deinitialized and an empty string
+///   is returned.
+/// - the trimmed length is at most `SMALL_STR_MAX_LENGTH`, or the refcount is
+///   not one: the surviving bytes are handed to `RocStr.init`.
+/// - otherwise: the surviving bytes are moved to the front of the existing
+///   buffer and the same string is returned with a shorter `str_len`.
+///
+/// NOTE(review): the `RocStr.init` branch returns without calling
+/// `string.deinit()`, unlike the all-whitespace branch. If the caller hands
+/// over ownership of `string`, that reference is never released - confirm.
+pub fn strTrim(string: RocStr) callconv(.C) RocStr {
+    const buffer = string.str_bytes orelse return RocStr.empty();
+
+    const skip_front = countLeadingWhitespaceBytes(string);
+    const total_len = string.len();
+
+    // nothing but whitespace: release the input and hand back an empty string
+    if (skip_front == total_len) {
+        string.deinit();
+        return RocStr.empty();
+    }
+
+    const skip_back = countTrailingWhitespaceBytes(string);
+    const kept_len = total_len - skip_front - skip_back;
+
+    // the length test must come first: the refcount is only inspected when
+    // the result is too long to be a small string
+    if (kept_len <= SMALL_STR_MAX_LENGTH or !string.isRefcountOne()) {
+        return RocStr.init(string.asU8ptr() + skip_front, kept_len);
+    }
+
+    // nonempty, large, and unique: reuse the buffer
+    if (skip_front > 0) {
+        // the source lies above the destination, so copying in ascending
+        // order never overwrites a byte that still has to be read
+        var offset: usize = 0;
+        while (offset < kept_len) : (offset += 1) {
+            buffer[offset] = buffer[offset + skip_front];
+        }
+    }
+
+    var trimmed = string;
+    trimmed.str_len = kept_len;
+
+    return trimmed;
+}
+
+/// Returns how many bytes at the start of `string` are occupied by code
+/// points that `isWhitespace` accepts. The bytes are decoded without
+/// validation (`Utf8View.initUnchecked`).
+fn countLeadingWhitespaceBytes(string: RocStr) usize {
+    const utf8 = string.asU8ptr()[0..string.len()];
+    var codepoints = unicode.Utf8View.initUnchecked(utf8).iterator();
+
+    var total: usize = 0;
+    while (codepoints.nextCodepoint()) |cp| {
+        if (!isWhitespace(cp)) break;
+
+        // stop counting if the code point has no UTF-8 encoding length
+        total += unicode.utf8CodepointSequenceLength(cp) catch break;
+    }
+
+    return total;
+}
+
+/// Returns how many bytes at the end of `string` are occupied by code points
+/// that `isWhitespace` accepts, walking backwards with `ReverseUtf8View`.
+/// The bytes are not validated (`initUnchecked`).
+fn countTrailingWhitespaceBytes(string: RocStr) usize {
+    const utf8 = string.asU8ptr()[0..string.len()];
+    var codepoints = ReverseUtf8View.initUnchecked(utf8).iterator();
+
+    var total: usize = 0;
+    while (codepoints.nextCodepoint()) |cp| {
+        if (!isWhitespace(cp)) break;
+
+        // stop counting if the code point has no UTF-8 encoding length
+        total += unicode.utf8CodepointSequenceLength(cp) catch break;
+    }
+
+    return total;
+}
+
+/// Mirror image of `std.unicode.Utf8View`: wraps a byte slice so that its
+/// code points can be visited from the last one to the first.
+const ReverseUtf8View = struct {
+    bytes: []const u8,
+
+    /// Wraps `s` as-is; no UTF-8 validation is performed.
+    pub fn initUnchecked(s: []const u8) ReverseUtf8View {
+        return .{ .bytes = s };
+    }
+
+    /// Creates an iterator positioned on the final byte of the slice, or an
+    /// already-finished iterator when the slice is empty.
+    pub fn iterator(s: ReverseUtf8View) ReverseUtf8Iterator {
+        const last: ?usize = if (s.bytes.len == 0) null else s.bytes.len - 1;
+
+        return .{ .bytes = s.bytes, .i = last };
+    }
+};
+
+/// Mirror image of `std.unicode.Utf8Iterator`: yields the code points of
+/// `bytes` starting from the end of the slice.
+const ReverseUtf8Iterator = struct {
+    bytes: []const u8,
+    // index of the next byte to examine; null means finished (or empty input)
+    i: ?usize,
+
+    /// Returns the encoded bytes of the next code point, moving towards the
+    /// front of the slice, or null once every code point has been produced.
+    pub fn nextCodepointSlice(it: *ReverseUtf8Iterator) ?[]const u8 {
+        var start = it.i orelse return null;
+
+        // Step back over continuation bytes to the byte that opens the sequence.
+        // NOTE this relies on the string being valid utf8 to not run off the end
+        while (!utf8BeginByte(it.bytes[start])) : (start -= 1) {}
+
+        const width = unicode.utf8ByteSequenceLength(it.bytes[start]) catch unreachable;
+        const encoded = it.bytes[start .. start + width];
+
+        // the next call resumes on the byte just before this sequence, if any
+        it.i = if (start > 0) start - 1 else null;
+
+        return encoded;
+    }
+
+    /// Decodes and returns the next code point, or null when finished.
+    pub fn nextCodepoint(it: *ReverseUtf8Iterator) ?u21 {
+        const encoded = it.nextCodepointSlice() orelse return null;
+
+        switch (encoded.len) {
+            1 => return @as(u21, encoded[0]),
+            2 => return unicode.utf8Decode2(encoded) catch unreachable,
+            3 => return unicode.utf8Decode3(encoded) catch unreachable,
+            4 => return unicode.utf8Decode4(encoded) catch unreachable,
+            else => unreachable,
+        }
+    }
+};
+
+/// Reports whether `byte` is anything other than a UTF-8 continuation byte
+/// (bit pattern 10xxxxxx, i.e. 0x80 through 0xBF).
+fn utf8BeginByte(byte: u8) bool {
+    const top_two_bits = byte & 0b1100_0000;
+    return top_two_bits != 0b1000_0000;
+}
+
+test "strTrim: empty" {
+    // trimming the empty string must give back the empty string
+    const result = strTrim(RocStr.empty());
+
+    try expect(result.eq(RocStr.empty()));
+}
+
+test "strTrim: blank" {
+    // a string made only of spaces trims down to the empty string
+    const spaces = "   ";
+    const input = RocStr.init(spaces, spaces.len);
+    defer input.deinit();
+
+    try expect(strTrim(input).eq(RocStr.empty()));
+}
+
+test "strTrim: large to large" {
+    // both the input and the expected result are too long to be small strings
+    const padded_bytes = " hello giant world ";
+    const padded = RocStr.init(padded_bytes, padded_bytes.len);
+    defer padded.deinit();
+
+    try expect(!padded.isSmallStr());
+
+    const bare_bytes = "hello giant world";
+    const bare = RocStr.init(bare_bytes, bare_bytes.len);
+    defer bare.deinit();
+
+    try expect(!bare.isSmallStr());
+
+    try expect(strTrim(padded).eq(bare));
+}
+
+test "strTrim: large to small" {
+    // the padded input is not a small string, but the trimmed result is
+    const padded_bytes = "             hello world         ";
+    const padded = RocStr.init(padded_bytes, padded_bytes.len);
+    defer padded.deinit();
+
+    try expect(!padded.isSmallStr());
+
+    const bare_bytes = "hello world";
+    const bare = RocStr.init(bare_bytes, bare_bytes.len);
+    defer bare.deinit();
+
+    try expect(bare.isSmallStr());
+
+    const result = strTrim(padded);
+
+    try expect(result.eq(bare));
+    try expect(result.isSmallStr());
+}
+
+test "strTrim: small to small" {
+    // input and result both fit the small-string representation
+    const padded_bytes = " hello world ";
+    const padded = RocStr.init(padded_bytes, padded_bytes.len);
+    defer padded.deinit();
+
+    try expect(padded.isSmallStr());
+
+    const bare_bytes = "hello world";
+    const bare = RocStr.init(bare_bytes, bare_bytes.len);
+    defer bare.deinit();
+
+    try expect(bare.isSmallStr());
+
+    const result = strTrim(padded);
+
+    try expect(result.eq(bare));
+    try expect(result.isSmallStr());
+}
+
+test "ReverseUtf8View: hello world" {
+    const original_bytes = "hello world";
+    const expected_bytes = "dlrow olleh";
+
+    var i: usize = 0;
+    var iter = ReverseUtf8View.initUnchecked(original_bytes).iterator();
+    while (iter.nextCodepoint()) |codepoint| {
+        // fail cleanly instead of indexing past the expectation
+        try expect(i < expected_bytes.len);
+        try expect(expected_bytes[i] == codepoint);
+        i += 1;
+    }
+
+    // the loop alone would also pass if the iterator stopped early (or
+    // yielded nothing), so check that every code point was visited
+    try expect(i == expected_bytes.len);
+}
+
+test "ReverseUtf8View: empty" {
+    const original_bytes = "";
+
+    // an empty slice must produce an iterator that is finished from the start
+    var iter = ReverseUtf8View.initUnchecked(original_bytes).iterator();
+    try expect(iter.nextCodepoint() == null);
+}
--- a/compiler/builtins/src/bitcode.rs
+++ b/compiler/builtins/src/bitcode.rs
@ -142,6 +142,7 @@ pub const STR_TO_UTF8: &str = "roc_builtins.str.to_utf8";
 pub const STR_FROM_UTF8: &str = "roc_builtins.str.from_utf8";
 pub const STR_FROM_UTF8_RANGE: &str = "roc_builtins.str.from_utf8_range";
 pub const STR_REPEAT: &str = "roc_builtins.str.repeat";
+pub const STR_TRIM: &str = "roc_builtins.str.trim";

 pub const DICT_HASH: &str = "roc_builtins.dict.hash";
 pub const DICT_HASH_STR: &str = "roc_builtins.dict.hash_str";
--- a/compiler/builtins/src/std.rs
+++ b/compiler/builtins/src/std.rs
@ -632,6 +632,9 @@ pub fn types() -> MutMap<Symbol, (SolvedType, Region)> {
        Box::new(str_type())
    );

+    // trim : Str -> Str
+    add_top_level_function_type!(Symbol::STR_TRIM, vec![str_type()], Box::new(str_type()));
+
    // fromUtf8 : List U8 -> Result Str [ BadUtf8 Utf8Problem ]*
    {
        let bad_utf8 = SolvedType::TagUnion(
--- a/compiler/can/src/builtins.rs
+++ b/compiler/can/src/builtins.rs
@ -67,6 +67,7 @@ pub fn builtin_defs_map(symbol: Symbol, var_store: &mut VarStore) -> Option<Def>
        STR_TO_UTF8 => str_to_utf8,
        STR_FROM_FLOAT=> str_from_float,
        STR_REPEAT => str_repeat,
+        STR_TRIM => str_trim,
        LIST_LEN => list_len,
        LIST_GET => list_get,
        LIST_SET => list_set,
@ -1238,6 +1239,11 @@ fn str_split(symbol: Symbol, var_store: &mut VarStore) -> Def {
    )
 }

+/// Str.trim : Str -> Str
+///
+/// Builds the definition by delegating to `lowlevel_1` with the
+/// `LowLevel::StrTrim` operation.
+fn str_trim(symbol: Symbol, var_store: &mut VarStore) -> Def {
+    let op = LowLevel::StrTrim;
+
+    lowlevel_1(symbol, op, var_store)
+}
+
 /// Str.repeat : Str, Nat -> Str
 fn str_repeat(symbol: Symbol, var_store: &mut VarStore) -> Def {
    let str_var = var_store.fresh();
--- a/compiler/gen_llvm/src/llvm/build.rs
+++ b/compiler/gen_llvm/src/llvm/build.rs
@ -17,7 +17,7 @@ use crate::llvm::build_list::{
 use crate::llvm::build_str::{
    empty_str, str_concat, str_count_graphemes, str_ends_with, str_from_float, str_from_int,
    str_from_utf8, str_from_utf8_range, str_join_with, str_number_of_bytes, str_repeat, str_split,
-    str_starts_with, str_starts_with_code_point, str_to_utf8,
+    str_starts_with, str_starts_with_code_point, str_to_utf8, str_trim,
 };
 use crate::llvm::compare::{generic_eq, generic_neq};
 use crate::llvm::convert::{
@ -4953,6 +4953,12 @@ fn run_low_level<'a, 'ctx, 'env>(

            str_count_graphemes(env, scope, args[0])
        }
+        StrTrim => {
+            // Str.trim : Str -> Str
+            debug_assert_eq!(args.len(), 1);
+
+            str_trim(env, scope, args[0])
+        }
        ListLen => {
            // List.len : List * -> Int
            debug_assert_eq!(args.len(), 1);
--- a/compiler/gen_llvm/src/llvm/build_str.rs
+++ b/compiler/gen_llvm/src/llvm/build_str.rs
@ -249,6 +249,16 @@ pub fn str_count_graphemes<'a, 'ctx, 'env>(
    )
 }

+/// Str.trim : Str -> Str
+///
+/// Converts the string bound to `str_symbol` with `str_symbol_to_c_abi` and
+/// passes it as the only argument to the `bitcode::STR_TRIM` builtin.
+pub fn str_trim<'a, 'ctx, 'env>(
+    env: &Env<'a, 'ctx, 'env>,
+    scope: &Scope<'a, 'ctx>,
+    str_symbol: Symbol,
+) -> BasicValueEnum<'ctx> {
+    let c_abi_str = str_symbol_to_c_abi(env, scope, str_symbol);
+    let builtin_name = bitcode::STR_TRIM;
+
+    call_bitcode_fn(env, &[c_abi_str.into()], builtin_name)
+}
+
 /// Str.fromInt : Int -> Str
 pub fn str_from_int<'a, 'ctx, 'env>(
    env: &Env<'a, 'ctx, 'env>,
--- a/compiler/module/src/low_level.rs
+++ b/compiler/module/src/low_level.rs
@ -17,6 +17,7 @@ pub enum LowLevel {
    StrToUtf8,
    StrRepeat,
    StrFromFloat,
+    StrTrim,
    ListLen,
    ListGetUnsafe,
    ListSet,
@ -123,6 +124,7 @@ macro_rules! first_order {
            | StrFromUtf8Range
            | StrToUtf8
            | StrRepeat
+            | StrTrim
            | StrFromFloat
            | ListLen
            | ListGetUnsafe
--- a/compiler/module/src/symbol.rs
+++ b/compiler/module/src/symbol.rs
@ -1015,6 +1015,7 @@ define_builtins! {
        17 STR_ALIAS_ANALYSIS_STATIC: "#aliasAnalysisStatic" // string with the static lifetime
        18 STR_FROM_UTF8_RANGE: "fromUtf8Range"
        19 STR_REPEAT: "repeat"
+        20 STR_TRIM: "trim"
    }
    4 LIST: "List" => {
        0 LIST_LIST: "List" imported // the List.List type alias
--- a/compiler/mono/src/borrow.rs
+++ b/compiler/mono/src/borrow.rs
@ -922,6 +922,7 @@ pub fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[bool] {
        ListGetUnsafe => arena.alloc_slice_copy(&[borrowed, irrelevant]),
        ListConcat => arena.alloc_slice_copy(&[owned, owned]),
        StrConcat => arena.alloc_slice_copy(&[owned, borrowed]),
+        StrTrim => arena.alloc_slice_copy(&[owned]),
        StrSplit => arena.alloc_slice_copy(&[borrowed, borrowed]),
        ListSingle => arena.alloc_slice_copy(&[irrelevant]),
        ListRepeat => arena.alloc_slice_copy(&[irrelevant, borrowed]),
--- a/compiler/solve/tests/solve_expr.rs
+++ b/compiler/solve/tests/solve_expr.rs
@ -3733,6 +3733,18 @@ mod solve_expr {
        );
    }

+    #[test]
+    fn str_trim() {
+        // a bare reference to `Str.trim` should infer as a function from Str to Str
+        let source = indoc!(
+            r#"
+                Str.trim
+                "#
+        );
+
+        infer_eq_without_problem(source, "Str -> Str");
+    }
+
    #[test]
    fn list_drop_last() {
        infer_eq_without_problem(
--- a/compiler/test_gen/src/gen_list.rs
+++ b/compiler/test_gen/src/gen_list.rs
@ -215,7 +215,7 @@ fn list_drop_at() {
 }

 #[test]
-fn list_drop_at_mutable() {
+fn list_drop_at_shared() {
    assert_evals_to!(
        indoc!(
            r#"
--- a/compiler/test_gen/src/gen_str.rs
+++ b/compiler/test_gen/src/gen_str.rs
@ -977,3 +977,94 @@ fn str_repeat_empty_string() {
 fn str_repeat_zero_times() {
    assert_evals_to!(indoc!(r#"Str.repeat "Roc" 0"#), RocStr::from(""), RocStr);
 }
+
+#[test]
+fn str_trim_empty_string() {
+    // Trimming the empty string literal evaluates to the empty string.
+    assert_evals_to!(indoc!(r#"Str.trim """#), RocStr::from(""), RocStr);
+}
+
+#[test]
+fn str_trim_small_blank_string() {
+    // A string holding a single space evaluates to the empty string.
+    assert_evals_to!(indoc!(r#"Str.trim " ""#), RocStr::from(""), RocStr);
+}
+
+#[test]
+fn str_trim_small_to_small() {
+    // Two spaces on each side of a short literal are removed.
+    assert_evals_to!(
+        indoc!(r#"Str.trim "  hello world  ""#),
+        RocStr::from("hello world"),
+        RocStr
+    );
+}
+
+#[test]
+fn str_trim_large_to_large_unique() {
+    // The argument is built with Str.concat instead of being written as one
+    // literal - presumably so that it is uniquely referenced, as the test
+    // name says (confirm). Leading and trailing spaces are removed.
+    assert_evals_to!(
+        indoc!(r#"Str.trim (Str.concat "  " "hello world from a large string ")"#),
+        RocStr::from("hello world from a large string"),
+        RocStr
+    );
+}
+
+#[test]
+fn str_trim_large_to_small_unique() {
+    // Same construction via Str.concat, but most of the input is padding, so
+    // the expected result is much shorter than the argument.
+    assert_evals_to!(
+        indoc!(r#"Str.trim (Str.concat "  " "hello world        ")"#),
+        RocStr::from("hello world"),
+        RocStr
+    );
+}
+
+#[test]
+fn str_trim_large_to_large_shared() {
+    // `original` is used both as the argument to Str.trim and as a field of
+    // the returned record, so it must come back untouched next to the trimmed
+    // copy. The expected tuple lists `original` first, then `trimmed`.
+    assert_evals_to!(
+        indoc!(
+            r#"
+               original : Str
+               original = " hello world world "
+
+               { trimmed: Str.trim original, original: original }
+               "#
+        ),
+        (
+            RocStr::from(" hello world world "),
+            RocStr::from("hello world world"),
+        ),
+        (RocStr, RocStr)
+    );
+}
+
+#[test]
+fn str_trim_large_to_small_shared() {
+    // `original` is returned alongside its trimmed copy and must be
+    // unchanged; here the trimmed result is much shorter than the input.
+    // The expected tuple lists `original` first, then `trimmed`.
+    assert_evals_to!(
+        indoc!(
+            r#"
+               original : Str
+               original = " hello world             "
+
+               { trimmed: Str.trim original, original: original }
+               "#
+        ),
+        (
+            RocStr::from(" hello world             "),
+            RocStr::from("hello world"),
+        ),
+        (RocStr, RocStr)
+    );
+}
+
+#[test]
+fn str_trim_small_to_small_shared() {
+    // A short `original` is returned alongside its trimmed copy and must be
+    // unchanged. The expected tuple lists `original` first, then `trimmed`.
+    assert_evals_to!(
+        indoc!(
+            r#"
+               original : Str
+               original = " hello world "
+
+               { trimmed: Str.trim original, original: original }
+               "#
+        ),
+        (RocStr::from(" hello world "), RocStr::from("hello world"),),
+        (RocStr, RocStr)
+    );
+}
--- a/examples/hello-rust/hello-world
+++ b/examples/hello-rust/hello-world