Merge branch 'trunk' into list_min

This commit is contained in:
Folkert de Vries 2021-10-28 20:32:27 +02:00 committed by GitHub
commit aa978ae6d9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 393 additions and 5 deletions

View file

@ -121,6 +121,7 @@ comptime {
exportStrFn(str.fromUtf8C, "from_utf8");
exportStrFn(str.fromUtf8RangeC, "from_utf8_range");
exportStrFn(str.repeat, "repeat");
exportStrFn(str.strTrim, "trim");
}
// Utils

View file

@ -163,7 +163,7 @@ pub const RocStr = extern struct {
) RocStr {
const element_width = 1;
if (self.bytes) |source_ptr| {
if (self.str_bytes) |source_ptr| {
if (self.isUnique()) {
const new_source = utils.unsafeReallocate(source_ptr, RocStr.alignment, self.len(), new_length, element_width);
@ -171,7 +171,7 @@ pub const RocStr = extern struct {
}
}
return self.reallocateFresh(RocStr.alignment, new_length, element_width);
return self.reallocateFresh(new_length);
}
/// reallocate by explicitly making a new allocation and copying elements over
@ -294,7 +294,7 @@ pub const RocStr = extern struct {
}
pub fn isUnique(self: RocStr) bool {
// the empty list is unique (in the sense that copying it will not leak memory)
// the empty string is unique (in the sense that copying it will not leak memory)
if (self.isEmpty()) {
return true;
}
@ -305,6 +305,10 @@ pub const RocStr = extern struct {
}
// otherwise, check if the refcount is one
return @call(.{ .modifier = always_inline }, RocStr.isRefcountOne, .{self});
}
// Reports whether the refcount slot of this string equals utils.REFCOUNT_ONE.
// The slot is read as the usize stored immediately before the bytes that
// str_bytes points at.
// NOTE(review): presumably only meaningful for heap-allocated (non-small)
// strings - confirm against callers.
fn isRefcountOne(self: RocStr) bool {
// view the byte pointer as an 8-byte-aligned pointer to usize
const ptr: [*]usize = @ptrCast([*]usize, @alignCast(8, self.str_bytes));
// step back one usize to reach the refcount slot and compare it
return (ptr - 1)[0] == utils.REFCOUNT_ONE;
}
@ -1473,3 +1477,253 @@ test "validateUtf8Bytes: surrogate halves" {
try expectErr(list, 3, error.Utf8EncodesSurrogateHalf, Utf8ByteProblem.EncodesSurrogateHalf);
}
/// Returns true when `codepoint` belongs to the set of code points that
/// `strTrim` strips from the ends of a string.
///
/// Source list: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
/// NOTE(review): U+200E..U+200F look like Pattern_White_Space rather than
/// White_Space entries in that file - confirm they are intended here.
fn isWhitespace(codepoint: u21) bool {
    switch (codepoint) {
        0x0009...0x000D, // control characters
        0x0020, // space
        0x0085, // control character
        0x00A0, // no-break space
        0x1680, // ogham space
        0x2000...0x200A, // en quad..hair space
        0x200E...0x200F, // left-to-right & right-to-left marks
        0x2028, // line separator
        0x2029, // paragraph separator
        0x202F, // narrow no-break space
        0x205F, // medium mathematical space
        0x3000, // ideographic space
        => return true,
        else => return false,
    }
}
test "isWhitespace" {
    // code points that must be classified as whitespace
    const blanks = [_]u21{ ' ', '\u{00A0}' };
    for (blanks) |cp| {
        try expect(isWhitespace(cp));
    }

    // an ordinary letter must not be
    try expect(!isWhitespace('x'));
}
/// Str.trim: returns `string` without its leading and trailing whitespace
/// (as classified by `isWhitespace`).
///
/// - A string whose `str_bytes` is null yields the empty string.
/// - A string made only of whitespace is released via `deinit` and the
///   empty string is returned.
/// - If the result fits a small string, or the input's refcount is not one,
///   the kept bytes are copied into a new RocStr.
/// - Otherwise the kept bytes are shifted to the front of the existing
///   buffer and only the length is updated.
pub fn strTrim(string: RocStr) callconv(.C) RocStr {
    const bytes_ptr = string.str_bytes orelse return RocStr.empty();

    const original_len = string.len();
    const leading_bytes = countLeadingWhitespaceBytes(string);

    // nothing but whitespace
    if (leading_bytes == original_len) {
        string.deinit();
        return RocStr.empty();
    }

    const trailing_bytes = countTrailingWhitespaceBytes(string);
    const new_len = original_len - leading_bytes - trailing_bytes;

    // The size check comes first so isRefcountOne is only consulted when the
    // result is too long for a small string.
    // NOTE(review): this copy path returns without calling string.deinit(),
    // unlike the all-whitespace path above - confirm who releases the input.
    if (new_len <= SMALL_STR_MAX_LENGTH or !string.isRefcountOne()) {
        return RocStr.init(string.asU8ptr() + leading_bytes, new_len);
    }

    // nonempty, large, and unique: reuse the allocation.
    // Source and destination overlap, so move one byte at a time, front to back.
    if (leading_bytes > 0) {
        var offset: usize = 0;
        while (offset < new_len) : (offset += 1) {
            bytes_ptr[offset] = bytes_ptr[offset + leading_bytes];
        }
    }

    var trimmed = string;
    trimmed.str_len = new_len;
    return trimmed;
}
/// Number of bytes occupied by the run of whitespace code points at the
/// start of `string`. The bytes are decoded without validation.
fn countLeadingWhitespaceBytes(string: RocStr) usize {
    const bytes = string.asU8ptr()[0..string.len()];
    var iter = unicode.Utf8View.initUnchecked(bytes).iterator();

    var total: usize = 0;
    while (iter.nextCodepoint()) |codepoint| {
        // stop at the first code point that is not whitespace
        if (!isWhitespace(codepoint)) break;
        total += unicode.utf8CodepointSequenceLength(codepoint) catch break;
    }

    return total;
}
/// Number of bytes occupied by the run of whitespace code points at the
/// end of `string`, found by walking the bytes back to front.
fn countTrailingWhitespaceBytes(string: RocStr) usize {
    const bytes = string.asU8ptr()[0..string.len()];
    var iter = ReverseUtf8View.initUnchecked(bytes).iterator();

    var total: usize = 0;
    while (iter.nextCodepoint()) |codepoint| {
        // stop at the first code point (from the end) that is not whitespace
        if (!isWhitespace(codepoint)) break;
        total += unicode.utf8CodepointSequenceLength(codepoint) catch break;
    }

    return total;
}
/// Back-to-front counterpart of std.unicode's Utf8View: wraps a byte slice
/// so its code points can be iterated starting from the end.
const ReverseUtf8View = struct {
    bytes: []const u8,

    /// Wraps `s` as-is; no UTF-8 validation is performed.
    pub fn initUnchecked(s: []const u8) ReverseUtf8View {
        return ReverseUtf8View{ .bytes = s };
    }

    /// Creates an iterator positioned on the last byte of the slice,
    /// or already exhausted (null index) when the slice is empty.
    pub fn iterator(s: ReverseUtf8View) ReverseUtf8Iterator {
        const last: ?usize = if (s.bytes.len == 0) null else s.bytes.len - 1;
        return ReverseUtf8Iterator{
            .bytes = s.bytes,
            .i = last,
        };
    }
};
/// Back-to-front counterpart of std.unicode's Utf8Iterator: yields the code
/// points of a byte slice from the last one to the first.
const ReverseUtf8Iterator = struct {
    bytes: []const u8,
    // Index of the last byte of the next code point to yield.
    // NOTE null signifies complete/empty
    i: ?usize,

    /// Returns the encoded bytes of the next code point (moving toward the
    /// front of the slice), or null once everything has been consumed.
    pub fn nextCodepointSlice(it: *ReverseUtf8Iterator) ?[]const u8 {
        var start = it.i orelse return null;

        // Walk back over continuation bytes until the lead byte is found.
        // NOTE this relies on the string being valid utf8 to not run off the end
        while (!utf8BeginByte(it.bytes[start])) {
            start -= 1;
        }

        const width = unicode.utf8ByteSequenceLength(it.bytes[start]) catch unreachable;

        // The next call resumes on the byte just before this code point.
        it.i = if (start == 0) null else start - 1;

        return it.bytes[start .. start + width];
    }

    /// Decodes and returns the next code point, or null when exhausted.
    pub fn nextCodepoint(it: *ReverseUtf8Iterator) ?u21 {
        if (it.nextCodepointSlice()) |encoded| {
            return switch (encoded.len) {
                1 => @as(u21, encoded[0]),
                2 => unicode.utf8Decode2(encoded) catch unreachable,
                3 => unicode.utf8Decode3(encoded) catch unreachable,
                4 => unicode.utf8Decode4(encoded) catch unreachable,
                else => unreachable,
            };
        }
        return null;
    }
};
/// Returns false for the bytes 0b1000_0000 through 0b1011_1111 (top two bits
/// `10`, the UTF-8 continuation-byte pattern) and true for every other byte.
fn utf8BeginByte(byte: u8) bool {
    return (byte & 0b1100_0000) != 0b1000_0000;
}
test "strTrim: empty" {
    // trimming the empty string gives back the empty string
    const expected = RocStr.empty();
    try expect(strTrim(RocStr.empty()).eq(expected));
}
test "strTrim: blank" {
    // a whitespace-only string trims down to the empty string
    const blank_bytes = " ";
    const blank = RocStr.init(blank_bytes, blank_bytes.len);
    defer blank.deinit();

    const expected = RocStr.empty();
    try expect(strTrim(blank).eq(expected));
}
test "strTrim: large to large" {
    // both the padded input and the trimmed result must be non-small strings
    const padded_bytes = " hello giant world ";
    const bare_bytes = "hello giant world";

    const padded = RocStr.init(padded_bytes, padded_bytes.len);
    defer padded.deinit();
    const bare = RocStr.init(bare_bytes, bare_bytes.len);
    defer bare.deinit();

    try expect(!padded.isSmallStr());
    try expect(!bare.isSmallStr());

    try expect(strTrim(padded).eq(bare));
}
test "strTrim: large to small" {
    // The input needs enough surrounding whitespace to be stored as a large
    // string while the trimmed text still fits a small one. With single
    // spaces it would be byte-identical to the "small to small" input, and
    // the `!isSmallStr()` assertion below could not hold together with that
    // test's `isSmallStr()` assertion.
    const original_bytes = "          hello world          ";
    const original = RocStr.init(original_bytes, original_bytes.len);
    defer original.deinit();

    try expect(!original.isSmallStr());

    const expected_bytes = "hello world";
    const expected = RocStr.init(expected_bytes, expected_bytes.len);
    defer expected.deinit();

    try expect(expected.isSmallStr());

    const trimmed = strTrim(original);

    try expect(trimmed.eq(expected));
    try expect(trimmed.isSmallStr());
}
test "strTrim: small to small" {
    // input and result both fit in a small string
    const padded_bytes = " hello world ";
    const bare_bytes = "hello world";

    const padded = RocStr.init(padded_bytes, padded_bytes.len);
    defer padded.deinit();
    const bare = RocStr.init(bare_bytes, bare_bytes.len);
    defer bare.deinit();

    try expect(padded.isSmallStr());
    try expect(bare.isSmallStr());

    const result = strTrim(padded);
    try expect(result.eq(bare));
    try expect(result.isSmallStr());
}
test "ReverseUtf8View: hello world" {
    const original_bytes = "hello world";
    const expected_bytes = "dlrow olleh";

    var i: usize = 0;
    var iter = ReverseUtf8View.initUnchecked(original_bytes).iterator();
    while (iter.nextCodepoint()) |codepoint| {
        try expect(expected_bytes[i] == codepoint);
        i += 1;
    }

    // Without this check an iterator that stops early (or yields nothing at
    // all) would still pass, because the loop body would simply not run.
    try expect(i == expected_bytes.len);
}
test "ReverseUtf8View: empty" {
    const original_bytes = "";
    var iter = ReverseUtf8View.initUnchecked(original_bytes).iterator();

    // an empty view must not yield any code point
    try expect(iter.nextCodepoint() == null);
}

View file

@ -142,6 +142,7 @@ pub const STR_TO_UTF8: &str = "roc_builtins.str.to_utf8";
pub const STR_FROM_UTF8: &str = "roc_builtins.str.from_utf8";
pub const STR_FROM_UTF8_RANGE: &str = "roc_builtins.str.from_utf8_range";
pub const STR_REPEAT: &str = "roc_builtins.str.repeat";
pub const STR_TRIM: &str = "roc_builtins.str.trim";
pub const DICT_HASH: &str = "roc_builtins.dict.hash";
pub const DICT_HASH_STR: &str = "roc_builtins.dict.hash_str";

View file

@ -632,6 +632,9 @@ pub fn types() -> MutMap<Symbol, (SolvedType, Region)> {
Box::new(str_type())
);
// trim : Str -> Str
add_top_level_function_type!(Symbol::STR_TRIM, vec![str_type()], Box::new(str_type()));
// fromUtf8 : List U8 -> Result Str [ BadUtf8 Utf8Problem ]*
{
let bad_utf8 = SolvedType::TagUnion(

View file

@ -67,6 +67,7 @@ pub fn builtin_defs_map(symbol: Symbol, var_store: &mut VarStore) -> Option<Def>
STR_TO_UTF8 => str_to_utf8,
STR_FROM_FLOAT=> str_from_float,
STR_REPEAT => str_repeat,
STR_TRIM => str_trim,
LIST_LEN => list_len,
LIST_GET => list_get,
LIST_SET => list_set,
@ -1238,6 +1239,11 @@ fn str_split(symbol: Symbol, var_store: &mut VarStore) -> Def {
)
}
/// Str.trim : Str -> Str
///
/// Builds the definition for `Str.trim` by forwarding its single argument
/// to the `StrTrim` low-level operation.
fn str_trim(symbol: Symbol, var_store: &mut VarStore) -> Def {
    let op = LowLevel::StrTrim;

    lowlevel_1(symbol, op, var_store)
}
/// Str.repeat : Str, Nat -> Str
fn str_repeat(symbol: Symbol, var_store: &mut VarStore) -> Def {
let str_var = var_store.fresh();

View file

@ -17,7 +17,7 @@ use crate::llvm::build_list::{
use crate::llvm::build_str::{
empty_str, str_concat, str_count_graphemes, str_ends_with, str_from_float, str_from_int,
str_from_utf8, str_from_utf8_range, str_join_with, str_number_of_bytes, str_repeat, str_split,
str_starts_with, str_starts_with_code_point, str_to_utf8,
str_starts_with, str_starts_with_code_point, str_to_utf8, str_trim,
};
use crate::llvm::compare::{generic_eq, generic_neq};
use crate::llvm::convert::{
@ -4953,6 +4953,12 @@ fn run_low_level<'a, 'ctx, 'env>(
str_count_graphemes(env, scope, args[0])
}
StrTrim => {
// Str.trim : Str -> Str
debug_assert_eq!(args.len(), 1);
str_trim(env, scope, args[0])
}
ListLen => {
// List.len : List * -> Int
debug_assert_eq!(args.len(), 1);

View file

@ -249,6 +249,16 @@ pub fn str_count_graphemes<'a, 'ctx, 'env>(
)
}
/// Str.trim : Str -> Str
///
/// Converts the string bound to `str_symbol` to its C ABI representation and
/// passes it to the `bitcode::STR_TRIM` builtin.
pub fn str_trim<'a, 'ctx, 'env>(
    env: &Env<'a, 'ctx, 'env>,
    scope: &Scope<'a, 'ctx>,
    str_symbol: Symbol,
) -> BasicValueEnum<'ctx> {
    let str_arg: BasicValueEnum<'ctx> = str_symbol_to_c_abi(env, scope, str_symbol).into();

    call_bitcode_fn(env, &[str_arg], bitcode::STR_TRIM)
}
/// Str.fromInt : Int -> Str
pub fn str_from_int<'a, 'ctx, 'env>(
env: &Env<'a, 'ctx, 'env>,

View file

@ -17,6 +17,7 @@ pub enum LowLevel {
StrToUtf8,
StrRepeat,
StrFromFloat,
StrTrim,
ListLen,
ListGetUnsafe,
ListSet,
@ -123,6 +124,7 @@ macro_rules! first_order {
| StrFromUtf8Range
| StrToUtf8
| StrRepeat
| StrTrim
| StrFromFloat
| ListLen
| ListGetUnsafe

View file

@ -1015,6 +1015,7 @@ define_builtins! {
17 STR_ALIAS_ANALYSIS_STATIC: "#aliasAnalysisStatic" // string with the static lifetime
18 STR_FROM_UTF8_RANGE: "fromUtf8Range"
19 STR_REPEAT: "repeat"
20 STR_TRIM: "trim"
}
4 LIST: "List" => {
0 LIST_LIST: "List" imported // the List.List type alias

View file

@ -922,6 +922,7 @@ pub fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[bool] {
ListGetUnsafe => arena.alloc_slice_copy(&[borrowed, irrelevant]),
ListConcat => arena.alloc_slice_copy(&[owned, owned]),
StrConcat => arena.alloc_slice_copy(&[owned, borrowed]),
StrTrim => arena.alloc_slice_copy(&[owned]),
StrSplit => arena.alloc_slice_copy(&[borrowed, borrowed]),
ListSingle => arena.alloc_slice_copy(&[irrelevant]),
ListRepeat => arena.alloc_slice_copy(&[irrelevant, borrowed]),

View file

@ -3733,6 +3733,18 @@ mod solve_expr {
);
}
// `Str.trim` must be inferred as a function from Str to Str.
#[test]
fn str_trim() {
    let source = indoc!(
        r#"
        Str.trim
        "#
    );

    infer_eq_without_problem(source, "Str -> Str");
}
#[test]
fn list_drop_last() {
infer_eq_without_problem(

View file

@ -215,7 +215,7 @@ fn list_drop_at() {
}
#[test]
fn list_drop_at_mutable() {
fn list_drop_at_shared() {
assert_evals_to!(
indoc!(
r#"

View file

@ -977,3 +977,94 @@ fn str_repeat_empty_string() {
fn str_repeat_zero_times() {
assert_evals_to!(indoc!(r#"Str.repeat "Roc" 0"#), RocStr::from(""), RocStr);
}
// Trimming the empty string yields the empty string.
#[test]
fn str_trim_empty_string() {
    let source = indoc!(r#"Str.trim """#);

    assert_evals_to!(source, RocStr::from(""), RocStr);
}
// A whitespace-only string trims down to the empty string.
#[test]
fn str_trim_small_blank_string() {
    let source = indoc!(r#"Str.trim " ""#);

    assert_evals_to!(source, RocStr::from(""), RocStr);
}
// Surrounding whitespace is removed from a string literal.
#[test]
fn str_trim_small_to_small() {
    let source = indoc!(r#"Str.trim " hello world ""#);

    assert_evals_to!(source, RocStr::from("hello world"), RocStr);
}
// The input is produced by Str.concat before being trimmed.
#[test]
fn str_trim_large_to_large_unique() {
    let source = indoc!(r#"Str.trim (Str.concat " " "hello world from a large string ")"#);

    assert_evals_to!(
        source,
        RocStr::from("hello world from a large string"),
        RocStr
    );
}
// A large string built by Str.concat is trimmed down to a result short
// enough for a small string. The operands carry ten spaces of padding on
// each side so the concatenated input (31 bytes) is actually large; with
// single spaces it would only be 13 bytes and the test would not cover the
// large-input case its name describes.
#[test]
fn str_trim_large_to_small_unique() {
    assert_evals_to!(
        indoc!(r#"Str.trim (Str.concat "          " "hello world          ")"#),
        RocStr::from("hello world"),
        RocStr
    );
}
// The untrimmed string is still referenced by the record, so it must come
// back unchanged alongside the trimmed copy.
#[test]
fn str_trim_large_to_large_shared() {
    let source = indoc!(
        r#"
        original : Str
        original = " hello world world "
        { trimmed: Str.trim original, original: original }
        "#
    );

    assert_evals_to!(
        source,
        (
            RocStr::from(" hello world world "),
            RocStr::from("hello world world"),
        ),
        (RocStr, RocStr)
    );
}
// A large string that is still referenced by the record is trimmed to a
// result short enough for a small string; the original must come back
// unchanged. The input carries ten spaces of padding on each side (31 bytes)
// so it is actually large - with single spaces this test would be identical
// to `str_trim_small_to_small_shared`.
#[test]
fn str_trim_large_to_small_shared() {
    assert_evals_to!(
        indoc!(
            r#"
            original : Str
            original = "          hello world          "
            { trimmed: Str.trim original, original: original }
            "#
        ),
        (
            RocStr::from("          hello world          "),
            RocStr::from("hello world"),
        ),
        (RocStr, RocStr)
    );
}
// The untrimmed string is still referenced by the record, so it must come
// back unchanged alongside the trimmed copy.
#[test]
fn str_trim_small_to_small_shared() {
    let source = indoc!(
        r#"
        original : Str
        original = " hello world "
        { trimmed: Str.trim original, original: original }
        "#
    );

    assert_evals_to!(
        source,
        (RocStr::from(" hello world "), RocStr::from("hello world"),),
        (RocStr, RocStr)
    );
}

BIN
examples/hello-rust/hello-world Executable file

Binary file not shown.