mirror of
https://github.com/roc-lang/roc.git
synced 2025-09-29 14:54:47 +00:00
Merge branch 'trunk' into list_min
This commit is contained in:
commit
aa978ae6d9
14 changed files with 393 additions and 5 deletions
|
@ -121,6 +121,7 @@ comptime {
|
|||
exportStrFn(str.fromUtf8C, "from_utf8");
|
||||
exportStrFn(str.fromUtf8RangeC, "from_utf8_range");
|
||||
exportStrFn(str.repeat, "repeat");
|
||||
exportStrFn(str.strTrim, "trim");
|
||||
}
|
||||
|
||||
// Utils
|
||||
|
|
|
@ -163,7 +163,7 @@ pub const RocStr = extern struct {
|
|||
) RocStr {
|
||||
const element_width = 1;
|
||||
|
||||
if (self.bytes) |source_ptr| {
|
||||
if (self.str_bytes) |source_ptr| {
|
||||
if (self.isUnique()) {
|
||||
const new_source = utils.unsafeReallocate(source_ptr, RocStr.alignment, self.len(), new_length, element_width);
|
||||
|
||||
|
@ -171,7 +171,7 @@ pub const RocStr = extern struct {
|
|||
}
|
||||
}
|
||||
|
||||
return self.reallocateFresh(RocStr.alignment, new_length, element_width);
|
||||
return self.reallocateFresh(new_length);
|
||||
}
|
||||
|
||||
/// reallocate by explicitly making a new allocation and copying elements over
|
||||
|
@ -294,7 +294,7 @@ pub const RocStr = extern struct {
|
|||
}
|
||||
|
||||
pub fn isUnique(self: RocStr) bool {
|
||||
// the empty list is unique (in the sense that copying it will not leak memory)
|
||||
// the empty string is unique (in the sense that copying it will not leak memory)
|
||||
if (self.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
@ -305,6 +305,10 @@ pub const RocStr = extern struct {
|
|||
}
|
||||
|
||||
// otherwise, check if the refcount is one
|
||||
return @call(.{ .modifier = always_inline }, RocStr.isRefcountOne, .{self});
|
||||
}
|
||||
|
||||
/// Reports whether the `usize` stored immediately before the string's bytes
/// equals `utils.REFCOUNT_ONE`.
///
/// NOTE(review): this reads memory in front of `str_bytes`, so it presumably
/// must only be called on heap-backed (non-small, non-empty) strings — confirm
/// at every call site.
fn isRefcountOne(self: RocStr) bool {
    const words = @ptrCast([*]usize, @alignCast(8, self.str_bytes));
    const refcount_slot = words - 1;

    return refcount_slot[0] == utils.REFCOUNT_ONE;
}
|
||||
|
@ -1473,3 +1477,253 @@ test "validateUtf8Bytes: surrogate halves" {
|
|||
|
||||
try expectErr(list, 3, error.Utf8EncodesSurrogateHalf, Utf8ByteProblem.EncodesSurrogateHalf);
|
||||
}
|
||||
|
||||
/// Returns true when `codepoint` is one of the code points this module
/// treats as whitespace for trimming purposes.
///
/// The list is based on https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
///
/// NOTE(review): U+200E and U+200F (left-to-right / right-to-left marks) are
/// listed under Pattern_White_Space rather than White_Space in PropList.txt —
/// confirm that treating them as trimmable whitespace is intended.
fn isWhitespace(codepoint: u21) bool {
    return switch (codepoint) {
        0x0009...0x000D, // control characters
        0x0020, // space
        0x0085, // control character
        0x00A0, // no-break space
        0x1680, // ogham space
        0x2000...0x200A, // en quad..hair space
        0x200E...0x200F, // left-to-right & right-to-left marks
        0x2028, // line separator
        0x2029, // paragraph separator
        0x202F, // narrow no-break space
        0x205F, // medium mathematical space
        0x3000, // ideographic space
        => true,

        else => false,
    };
}
|
||||
|
||||
test "isWhitespace" {
    // an ASCII space and a no-break space both count as whitespace
    const whitespace = [_]u21{ ' ', '\u{00A0}' };
    for (whitespace) |codepoint| {
        try expect(isWhitespace(codepoint));
    }

    // an ordinary letter does not
    try expect(!isWhitespace('x'));
}
|
||||
|
||||
/// Returns `string` with leading and trailing whitespace (as decided by
/// `isWhitespace`) removed.
///
/// - A string whose `str_bytes` is null yields `RocStr.empty()`.
/// - A string consisting only of whitespace is deinitialized and
///   `RocStr.empty()` is returned.
/// - When the trimmed length is at most `SMALL_STR_MAX_LENGTH`, or the
///   refcount is not one, a new string is built with `RocStr.init` from the
///   trimmed byte range.
/// - Otherwise the remaining bytes are shifted to the front of the existing
///   buffer and the same string is returned with a shorter `str_len`.
///
/// NOTE(review): the `RocStr.init` path returns without calling `deinit` on
/// `string`. If callers hand over ownership of the argument, confirm that this
/// does not leak the original allocation (or a reference to it).
pub fn strTrim(string: RocStr) callconv(.C) RocStr {
    const bytes_ptr = string.str_bytes orelse return RocStr.empty();

    const leading_bytes = countLeadingWhitespaceBytes(string);
    const original_len = string.len();

    // nothing but whitespace
    if (leading_bytes == original_len) {
        string.deinit();
        return RocStr.empty();
    }

    const trailing_bytes = countTrailingWhitespaceBytes(string);
    const new_len = original_len - leading_bytes - trailing_bytes;

    // `or` short-circuits: the refcount is only inspected when the result is
    // too long to be a small string
    if (new_len <= SMALL_STR_MAX_LENGTH or !string.isRefcountOne()) {
        return RocStr.init(string.asU8ptr() + leading_bytes, new_len);
    }

    // nonempty, large, and unique: trim inside the existing buffer

    if (leading_bytes > 0) {
        var offset: usize = 0;
        while (offset < new_len) : (offset += 1) {
            // one byte at a time, front to back; the source is always ahead of
            // the destination
            const dest = bytes_ptr + offset;
            @memcpy(dest, dest + leading_bytes, 1);
        }
    }

    var trimmed = string;
    trimmed.str_len = new_len;

    return trimmed;
}
|
||||
|
||||
/// Counts how many bytes at the start of `string` belong to an unbroken run
/// of whitespace code points (per `isWhitespace`).
///
/// The bytes are iterated with `Utf8View.initUnchecked`, i.e. without
/// validating them first.
fn countLeadingWhitespaceBytes(string: RocStr) usize {
    const bytes = string.asU8ptr()[0..string.len()];
    var iter = unicode.Utf8View.initUnchecked(bytes).iterator();

    var byte_count: usize = 0;
    while (iter.nextCodepoint()) |codepoint| {
        // the first non-whitespace code point ends the run
        if (!isWhitespace(codepoint)) break;

        byte_count += unicode.utf8CodepointSequenceLength(codepoint) catch break;
    }

    return byte_count;
}
|
||||
|
||||
/// Counts how many bytes at the end of `string` belong to an unbroken run of
/// whitespace code points (per `isWhitespace`), walking backwards with
/// `ReverseUtf8View`.
///
/// The bytes are not validated before iteration.
fn countTrailingWhitespaceBytes(string: RocStr) usize {
    const bytes = string.asU8ptr()[0..string.len()];
    var iter = ReverseUtf8View.initUnchecked(bytes).iterator();

    var byte_count: usize = 0;
    while (iter.nextCodepoint()) |codepoint| {
        // the first non-whitespace code point (from the back) ends the run
        if (!isWhitespace(codepoint)) break;

        byte_count += unicode.utf8CodepointSequenceLength(codepoint) catch break;
    }

    return byte_count;
}
|
||||
|
||||
/// A backwards version of Utf8View from std.unicode: wraps a byte slice and
/// hands out an iterator that walks its code points from last to first.
const ReverseUtf8View = struct {
    bytes: []const u8,

    /// Wraps `s` without checking that it is valid UTF-8.
    pub fn initUnchecked(s: []const u8) ReverseUtf8View {
        return .{ .bytes = s };
    }

    /// Creates an iterator positioned on the last byte of the view, or an
    /// already-finished iterator (`i == null`) when the view is empty.
    pub fn iterator(s: ReverseUtf8View) ReverseUtf8Iterator {
        const last_index: ?usize = if (s.bytes.len == 0) null else s.bytes.len - 1;

        return ReverseUtf8Iterator{
            .bytes = s.bytes,
            .i = last_index,
        };
    }
};
|
||||
|
||||
/// A backwards version of Utf8Iterator from std.unicode: yields the code
/// points of `bytes` starting with the last one.
const ReverseUtf8Iterator = struct {
    bytes: []const u8,
    // NOTE null signifies complete/empty
    i: ?usize,

    /// Returns the bytes of the next code point (moving towards the front),
    /// or null once the iterator is exhausted.
    pub fn nextCodepointSlice(it: *ReverseUtf8Iterator) ?[]const u8 {
        var start = it.i orelse return null;

        // Step back over continuation bytes until the sequence's first byte.
        // NOTE this relies on the string being valid utf8 to not run off the end
        while (!utf8BeginByte(it.bytes[start])) {
            start -= 1;
        }

        const cp_len = unicode.utf8ByteSequenceLength(it.bytes[start]) catch unreachable;
        const slice = it.bytes[start .. start + cp_len];

        // the next call resumes on the byte just before this sequence, if any
        it.i = if (start > 0) start - 1 else null;

        return slice;
    }

    /// Decodes and returns the next code point (moving towards the front),
    /// or null once the iterator is exhausted.
    pub fn nextCodepoint(it: *ReverseUtf8Iterator) ?u21 {
        if (it.nextCodepointSlice()) |slice| {
            return switch (slice.len) {
                1 => @as(u21, slice[0]),
                2 => unicode.utf8Decode2(slice) catch unreachable,
                3 => unicode.utf8Decode3(slice) catch unreachable,
                4 => unicode.utf8Decode4(slice) catch unreachable,
                else => unreachable,
            };
        }

        return null;
    }
};
|
||||
|
||||
/// Returns true unless `byte` lies in 0b1000_0000...0b1011_1111, i.e. unless
/// its two highest bits are `10` (the UTF-8 continuation-byte pattern).
fn utf8BeginByte(byte: u8) bool {
    const top_two_bits = byte & 0b1100_0000;

    return top_two_bits != 0b1000_0000;
}
|
||||
|
||||
test "strTrim: empty" {
    // trimming the empty string gives back the empty string
    const empty = RocStr.empty();
    const trimmedEmpty = strTrim(empty);

    try expect(trimmedEmpty.eq(empty));
}
|
||||
|
||||
test "strTrim: blank" {
    // a string made up only of whitespace trims down to the empty string
    const blank = " ";
    const original = RocStr.init(blank, blank.len);
    defer original.deinit();

    const expected = RocStr.empty();
    const trimmed = strTrim(original);

    try expect(trimmed.eq(expected));
}
|
||||
|
||||
test "strTrim: large to large" {
    // both the input and the trimmed result must be too long for a small string
    const padded = " hello giant world ";
    const original = RocStr.init(padded, padded.len);
    defer original.deinit();

    const bare = "hello giant world";
    const expected = RocStr.init(bare, bare.len);
    defer expected.deinit();

    try expect(!original.isSmallStr());
    try expect(!expected.isSmallStr());

    const trimmed = strTrim(original);

    try expect(trimmed.eq(expected));
}
|
||||
|
||||
test "strTrim: large to small" {
    // NOTE(review): the input literal must be long enough that `isSmallStr()`
    // is false for the first assertion to hold — verify that the surrounding
    // whitespace in this literal has not been collapsed.
    const padded = " hello world ";
    const original = RocStr.init(padded, padded.len);
    defer original.deinit();

    const bare = "hello world";
    const expected = RocStr.init(bare, bare.len);
    defer expected.deinit();

    try expect(!original.isSmallStr());
    try expect(expected.isSmallStr());

    const trimmed = strTrim(original);

    // the result matches and is itself stored as a small string
    try expect(trimmed.eq(expected));
    try expect(trimmed.isSmallStr());
}
|
||||
|
||||
test "strTrim: small to small" {
    // input and result both fit in a small string
    const padded = " hello world ";
    const original = RocStr.init(padded, padded.len);
    defer original.deinit();

    const bare = "hello world";
    const expected = RocStr.init(bare, bare.len);
    defer expected.deinit();

    try expect(original.isSmallStr());
    try expect(expected.isSmallStr());

    const trimmed = strTrim(original);

    try expect(trimmed.eq(expected));
    try expect(trimmed.isSmallStr());
}
|
||||
|
||||
test "ReverseUtf8View: hello world" {
    const original_bytes = "hello world";
    const expected_bytes = "dlrow olleh";

    var i: usize = 0;
    var iter = ReverseUtf8View.initUnchecked(original_bytes).iterator();
    while (iter.nextCodepoint()) |codepoint| {
        // fail cleanly, rather than index past the end, if too many code points come back
        try expect(i < expected_bytes.len);
        try expect(expected_bytes[i] == codepoint);
        i += 1;
    }

    // without this the test would pass even if the iterator stopped early
    // (or yielded nothing at all)
    try expect(i == expected_bytes.len);
}
|
||||
|
||||
test "ReverseUtf8View: empty" {
    const original_bytes = "";

    // An empty view must produce an iterator that is finished from the start.
    // Checking the first result directly avoids an unused `|codepoint|`
    // capture, which newer Zig compilers reject.
    var iter = ReverseUtf8View.initUnchecked(original_bytes).iterator();
    try expect(iter.nextCodepoint() == null);
}
|
||||
|
|
|
@ -142,6 +142,7 @@ pub const STR_TO_UTF8: &str = "roc_builtins.str.to_utf8";
|
|||
pub const STR_FROM_UTF8: &str = "roc_builtins.str.from_utf8";
|
||||
pub const STR_FROM_UTF8_RANGE: &str = "roc_builtins.str.from_utf8_range";
|
||||
pub const STR_REPEAT: &str = "roc_builtins.str.repeat";
|
||||
pub const STR_TRIM: &str = "roc_builtins.str.trim";
|
||||
|
||||
pub const DICT_HASH: &str = "roc_builtins.dict.hash";
|
||||
pub const DICT_HASH_STR: &str = "roc_builtins.dict.hash_str";
|
||||
|
|
|
@ -632,6 +632,9 @@ pub fn types() -> MutMap<Symbol, (SolvedType, Region)> {
|
|||
Box::new(str_type())
|
||||
);
|
||||
|
||||
// trim : Str -> Str
|
||||
add_top_level_function_type!(Symbol::STR_TRIM, vec![str_type()], Box::new(str_type()));
|
||||
|
||||
// fromUtf8 : List U8 -> Result Str [ BadUtf8 Utf8Problem ]*
|
||||
{
|
||||
let bad_utf8 = SolvedType::TagUnion(
|
||||
|
|
|
@ -67,6 +67,7 @@ pub fn builtin_defs_map(symbol: Symbol, var_store: &mut VarStore) -> Option<Def>
|
|||
STR_TO_UTF8 => str_to_utf8,
|
||||
STR_FROM_FLOAT=> str_from_float,
|
||||
STR_REPEAT => str_repeat,
|
||||
STR_TRIM => str_trim,
|
||||
LIST_LEN => list_len,
|
||||
LIST_GET => list_get,
|
||||
LIST_SET => list_set,
|
||||
|
@ -1238,6 +1239,11 @@ fn str_split(symbol: Symbol, var_store: &mut VarStore) -> Def {
|
|||
)
|
||||
}
|
||||
|
||||
/// Str.trim : Str -> Str
|
||||
fn str_trim(symbol: Symbol, var_store: &mut VarStore) -> Def {
|
||||
lowlevel_1(symbol, LowLevel::StrTrim, var_store)
|
||||
}
|
||||
|
||||
/// Str.repeat : Str, Nat -> Str
|
||||
fn str_repeat(symbol: Symbol, var_store: &mut VarStore) -> Def {
|
||||
let str_var = var_store.fresh();
|
||||
|
|
|
@ -17,7 +17,7 @@ use crate::llvm::build_list::{
|
|||
use crate::llvm::build_str::{
|
||||
empty_str, str_concat, str_count_graphemes, str_ends_with, str_from_float, str_from_int,
|
||||
str_from_utf8, str_from_utf8_range, str_join_with, str_number_of_bytes, str_repeat, str_split,
|
||||
str_starts_with, str_starts_with_code_point, str_to_utf8,
|
||||
str_starts_with, str_starts_with_code_point, str_to_utf8, str_trim,
|
||||
};
|
||||
use crate::llvm::compare::{generic_eq, generic_neq};
|
||||
use crate::llvm::convert::{
|
||||
|
@ -4953,6 +4953,12 @@ fn run_low_level<'a, 'ctx, 'env>(
|
|||
|
||||
str_count_graphemes(env, scope, args[0])
|
||||
}
|
||||
StrTrim => {
|
||||
// Str.trim : Str -> Str
|
||||
debug_assert_eq!(args.len(), 1);
|
||||
|
||||
str_trim(env, scope, args[0])
|
||||
}
|
||||
ListLen => {
|
||||
// List.len : List * -> Int
|
||||
debug_assert_eq!(args.len(), 1);
|
||||
|
|
|
@ -249,6 +249,16 @@ pub fn str_count_graphemes<'a, 'ctx, 'env>(
|
|||
)
|
||||
}
|
||||
|
||||
/// Str.trim : Str -> Str
|
||||
pub fn str_trim<'a, 'ctx, 'env>(
|
||||
env: &Env<'a, 'ctx, 'env>,
|
||||
scope: &Scope<'a, 'ctx>,
|
||||
str_symbol: Symbol,
|
||||
) -> BasicValueEnum<'ctx> {
|
||||
let str_i128 = str_symbol_to_c_abi(env, scope, str_symbol);
|
||||
call_bitcode_fn(env, &[str_i128.into()], bitcode::STR_TRIM)
|
||||
}
|
||||
|
||||
/// Str.fromInt : Int -> Str
|
||||
pub fn str_from_int<'a, 'ctx, 'env>(
|
||||
env: &Env<'a, 'ctx, 'env>,
|
||||
|
|
|
@ -17,6 +17,7 @@ pub enum LowLevel {
|
|||
StrToUtf8,
|
||||
StrRepeat,
|
||||
StrFromFloat,
|
||||
StrTrim,
|
||||
ListLen,
|
||||
ListGetUnsafe,
|
||||
ListSet,
|
||||
|
@ -123,6 +124,7 @@ macro_rules! first_order {
|
|||
| StrFromUtf8Range
|
||||
| StrToUtf8
|
||||
| StrRepeat
|
||||
| StrTrim
|
||||
| StrFromFloat
|
||||
| ListLen
|
||||
| ListGetUnsafe
|
||||
|
|
|
@ -1015,6 +1015,7 @@ define_builtins! {
|
|||
17 STR_ALIAS_ANALYSIS_STATIC: "#aliasAnalysisStatic" // string with the static lifetime
|
||||
18 STR_FROM_UTF8_RANGE: "fromUtf8Range"
|
||||
19 STR_REPEAT: "repeat"
|
||||
20 STR_TRIM: "trim"
|
||||
}
|
||||
4 LIST: "List" => {
|
||||
0 LIST_LIST: "List" imported // the List.List type alias
|
||||
|
|
|
@ -922,6 +922,7 @@ pub fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[bool] {
|
|||
ListGetUnsafe => arena.alloc_slice_copy(&[borrowed, irrelevant]),
|
||||
ListConcat => arena.alloc_slice_copy(&[owned, owned]),
|
||||
StrConcat => arena.alloc_slice_copy(&[owned, borrowed]),
|
||||
StrTrim => arena.alloc_slice_copy(&[owned]),
|
||||
StrSplit => arena.alloc_slice_copy(&[borrowed, borrowed]),
|
||||
ListSingle => arena.alloc_slice_copy(&[irrelevant]),
|
||||
ListRepeat => arena.alloc_slice_copy(&[irrelevant, borrowed]),
|
||||
|
|
|
@ -3733,6 +3733,18 @@ mod solve_expr {
|
|||
);
|
||||
}
|
||||
|
||||
#[test]
fn str_trim() {
    // `Str.trim` on its own should be inferred as a function from Str to Str.
    let source = indoc!(
        r#"
        Str.trim
        "#
    );

    infer_eq_without_problem(source, "Str -> Str");
}
|
||||
|
||||
#[test]
|
||||
fn list_drop_last() {
|
||||
infer_eq_without_problem(
|
||||
|
|
|
@ -215,7 +215,7 @@ fn list_drop_at() {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn list_drop_at_mutable() {
|
||||
fn list_drop_at_shared() {
|
||||
assert_evals_to!(
|
||||
indoc!(
|
||||
r#"
|
||||
|
|
|
@ -977,3 +977,94 @@ fn str_repeat_empty_string() {
|
|||
fn str_repeat_zero_times() {
|
||||
assert_evals_to!(indoc!(r#"Str.repeat "Roc" 0"#), RocStr::from(""), RocStr);
|
||||
}
|
||||
|
||||
#[test]
fn str_trim_empty_string() {
    // Trimming the empty string evaluates to the empty string.
    let program = indoc!(r#"Str.trim """#);

    assert_evals_to!(program, RocStr::from(""), RocStr);
}
|
||||
|
||||
#[test]
fn str_trim_small_blank_string() {
    // A whitespace-only string evaluates to the empty string.
    let program = indoc!(r#"Str.trim " ""#);

    assert_evals_to!(program, RocStr::from(""), RocStr);
}
|
||||
|
||||
#[test]
fn str_trim_small_to_small() {
    // Whitespace on both sides of a short literal is removed.
    let program = indoc!(r#"Str.trim " hello world ""#);

    assert_evals_to!(program, RocStr::from("hello world"), RocStr);
}
|
||||
|
||||
#[test]
fn str_trim_large_to_large_unique() {
    // The argument is produced by `Str.concat` rather than written as a
    // single literal; the trimmed result is the long sentence without the
    // surrounding whitespace.
    let program = indoc!(r#"Str.trim (Str.concat " " "hello world from a large string ")"#);

    assert_evals_to!(
        program,
        RocStr::from("hello world from a large string"),
        RocStr
    );
}
|
||||
|
||||
#[test]
fn str_trim_large_to_small_unique() {
    // The argument is produced by `Str.concat`; the trimmed result is the
    // short "hello world".
    let program = indoc!(r#"Str.trim (Str.concat " " "hello world ")"#);

    assert_evals_to!(program, RocStr::from("hello world"), RocStr);
}
|
||||
|
||||
#[test]
fn str_trim_large_to_large_shared() {
    // `original` is used both as the argument to `Str.trim` and as a record
    // field, so the untrimmed value must come back unchanged next to the
    // trimmed one. The expected tuple lists the untrimmed string first.
    let program = indoc!(
        r#"
        original : Str
        original = " hello world world "

        { trimmed: Str.trim original, original: original }
        "#
    );

    assert_evals_to!(
        program,
        (
            RocStr::from(" hello world world "),
            RocStr::from("hello world world"),
        ),
        (RocStr, RocStr)
    );
}
|
||||
|
||||
#[test]
fn str_trim_large_to_small_shared() {
    // `original` is used both as the argument to `Str.trim` and as a record
    // field; the expected tuple lists the untrimmed string first.
    //
    // NOTE(review): as shown here the program text is identical to the one in
    // `str_trim_small_to_small_shared` — verify that the whitespace inside
    // these literals has not been collapsed.
    let program = indoc!(
        r#"
        original : Str
        original = " hello world "

        { trimmed: Str.trim original, original: original }
        "#
    );

    assert_evals_to!(
        program,
        (
            RocStr::from(" hello world "),
            RocStr::from("hello world"),
        ),
        (RocStr, RocStr)
    );
}
|
||||
|
||||
#[test]
fn str_trim_small_to_small_shared() {
    // `original` is used both as the argument to `Str.trim` and as a record
    // field; the expected tuple lists the untrimmed string first.
    let program = indoc!(
        r#"
        original : Str
        original = " hello world "

        { trimmed: Str.trim original, original: original }
        "#
    );

    assert_evals_to!(
        program,
        (RocStr::from(" hello world "), RocStr::from("hello world"),),
        (RocStr, RocStr)
    );
}
|
||||
|
|
BIN
examples/hello-rust/hello-world
Executable file
BIN
examples/hello-rust/hello-world
Executable file
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue