Add Str.countGraphemes

This commit is contained in:
Jared Ramirez 2020-11-02 10:22:36 -06:00
parent 00130c6dc0
commit 63308d90e1
18 changed files with 11345 additions and 44 deletions

View file

@ -64,6 +64,34 @@ One of the cool things about Roc is that it evaluates if a value in memory is sh
We have to define the uniqueness constraints of a function just like we have to define a type signature. That is what happens in `unique.rs`. This can be tricky so it would be a good step to ask for help on if it is confusing.
## Testing it
### solve/tests/solve_expr.rs
To make sure that Roc is properly inferring the type of the new builtin, add a test to this file similar to:
```
#[test]
fn atan() {
infer_eq_without_problem(
indoc!(
r#"
Num.atan
"#
),
"Float -> Float",
);
}
```
But replace `Num.atan` and the type signature with the new builtin.
### gen/test/*.rs
In this directory, there are a couple files like `gen_num.rs`, `gen_str.rs`, etc. For the `Str` module builtins, put the test in `gen_str.rs`, etc. Find the one for the new builtin, and add a test like:
```
#[test]
fn atan() {
assert_evals_to!("Num.atan 10", 1.4711276743037347, f64);
}
```
But replace `Num.atan`, the return value, and the return type with your new builtin.
# Mistakes that are easy to make!!
When implementing a new builtin, it is often easy to copy and paste the implementation for an existing builtin. This can take you quite far since many builtins are very similar, but it also risks forgetting to change one small part of what you copied and pasted and losing a lot of time later on when you can't figure out why things don't work. So, speaking from experience, even if you are copying an existing builtin, try and implement it manually without copying and pasting. Two recent instances of this (as of September 7th, 2020):

View file

@ -1,2 +1,4 @@
zig-cache
src/zig-cache
builtins.ll
builtins.bc

View file

@ -1,5 +1,7 @@
# Bitcode for Builtins
## How it works
Roc's builtins are implemented in the compiler using LLVM only.
When their implementations are simple enough (e.g. addition), they
can be implemented directly in Inkwell.
@ -22,3 +24,10 @@ There will be two directories like `roc_builtins-[some random characters]`, look
## Calling bitcode functions
Use the `call_bitcode_fn` function defined in `llvm/src/build.rs` to call bitcode functions.
## Developing
To add a builtin, just add it to `src/main.zig`. For anything you add, you must add tests for it! Not only does this make the builtins more maintainable, it's also the easiest way to test these functions in Zig. To run the tests, run
```
zig build test
```

View file

@ -0,0 +1,59 @@
const builtin = @import("builtin");
const std = @import("std");
const mem = std.mem;
const Builder = std.build.Builder;
/// Declares the build graph for the builtins:
///   * `test` runs the tests found in the main source file,
///   * `ir`   emits LLVM IR for the main source file (this is the default step),
///   * `bc`   runs `ir` and then invokes `llvm-as-10` on the emitted `.ll` file.
///
/// The main source path and the bitcode output path can be overridden with the
/// `main-path` and `bc-path` options respectively.
pub fn build(b: *Builder) void {
    b.setPreferredReleaseMode(builtin.Mode.ReleaseFast);
    const mode = b.standardReleaseOptions();

    // Options
    const default_main_path = "./src/main.zig";
    const main_path = b.option(
        []const u8,
        "main-path",
        b.fmt("Override path to main.zig. Used by \"ir\", \"bc\", and \"test\". Defaults to \"{}\". ", .{default_main_path}),
    ) orelse default_main_path;

    const default_bitcode_path = "./builtins.bc";
    const bitcode_path = b.option(
        []const u8,
        "bc-path",
        b.fmt("Override path to generated bitcode file. Used by \"ir\" and \"bc\". Defaults to \"{}\". ", .{default_bitcode_path}),
    ) orelse default_bitcode_path;

    // Tests
    var tests = b.addTest(main_path);
    tests.setBuildMode(mode);
    const test_step = b.step("test", "Run tests");
    test_step.dependOn(&tests.step);

    // Lib: an object that is only used for its LLVM IR output, never for a binary.
    const object_name = "builtins";
    const ir_object = b.addObject(object_name, main_path);
    ir_object.setBuildMode(mode);
    ir_object.strip = true;
    ir_object.emit_llvm_ir = true;
    ir_object.emit_bin = false;
    const ir_step = b.step("ir", "Build LLVM ir");
    ir_step.dependOn(&ir_object.step);

    // IR to Bitcode
    const output_arg = b.fmt("-o={}", .{bitcode_path});
    const ir_file = b.fmt("{}.ll", .{object_name});
    const assemble = b.addSystemCommand(&[_][]const u8{
        "llvm-as-10",
        ir_file,
        output_arg,
    });
    const bitcode_step = b.step("bc", "Build LLVM ir and convert to bitcode");
    bitcode_step.dependOn(ir_step);
    bitcode_step.dependOn(&assemble.step);

    b.default_step = ir_step;
    removeInstallSteps(b);
}
/// Removes the "install" and "uninstall" steps from the builder's list of
/// top-level steps.
///
/// The list is walked with an explicit index because `swapRemove` moves the
/// last element into the vacated slot: after a removal the same index has to
/// be examined again, and the bound has to be re-read since the list shrank.
/// Iterating the items with a `for` loop while removing would skip the
/// swapped-in element and keep visiting slots past the new end of the list.
fn removeInstallSteps(b: *Builder) void {
    var i: usize = 0;
    while (i < b.top_level_steps.items.len) {
        const step_name = b.top_level_steps.items[i].step.name;
        const is_install_step = mem.eql(u8, step_name, "install") or mem.eql(u8, step_name, "uninstall");

        if (is_install_step) {
            // Do not advance: index `i` now holds the element swapped in from the end.
            _ = b.top_level_steps.swapRemove(i);
        } else {
            i += 1;
        }
    }
}

File diff suppressed because it is too large Load diff

View file

@ -1,10 +1,14 @@
const std = @import("std");
const math = std.math;
const expect = std.testing.expect;
const unicode = std.unicode;
const testing = std.testing;
const expectEqual = testing.expectEqual;
const expect = testing.expect;
const roc_builtins_namespace = "roc_builtins";
// MATH
const math_namespace = roc_builtins_namespace ++ ".math";
const str_namespace = roc_builtins_namespace ++ ".str";
comptime { @export(atan, .{ .name = math_namespace ++ ".atan", .linkage = .Strong }); }
fn atan(num: f64) callconv(.C) f64 {
@ -23,7 +27,7 @@ fn powInt(base: i64, exp: i64) callconv(.C) i64 {
comptime { @export(acos, .{ .name = math_namespace ++ ".acos", .linkage = .Strong }); }
fn acos(num: f64) callconv(.C) f64 {
return math.acos(num);
return math.acos(num);
}
comptime { @export(asin, .{ .name = math_namespace ++ ".asin", .linkage = .Strong }); }
@ -31,6 +35,8 @@ fn asin(num: f64) callconv(.C) f64 {
return math.asin(num);
}
// STR
const str_namespace = roc_builtins_namespace ++ ".str";
// Str.split
@ -45,7 +51,7 @@ const RocStr = struct {
};
}
pub fn eq(self: RocStr, other: RocStr) bool {
pub fn eq(self: *RocStr, other: RocStr) bool {
if (self.str_len != other.str_len) {
return false;
}
@ -71,7 +77,7 @@ const RocStr = struct {
const str2_ptr: [*]u8 = &str2;
var roc_str2 = RocStr.init(str2_ptr, str2_len);
expect(RocStr.eq(roc_str1, roc_str2));
expect(roc_str1.eq(roc_str2));
}
test "RocStr.eq: not equal different length" {
@ -85,7 +91,7 @@ const RocStr = struct {
const str2_ptr: [*]u8 = &str2;
var roc_str2 = RocStr.init(str2_ptr, str2_len);
expect(!RocStr.eq(roc_str1, roc_str2));
expect(!roc_str1.eq(roc_str2));
}
test "RocStr.eq: not equal same length" {
@ -99,7 +105,7 @@ const RocStr = struct {
const str2_ptr: [*]u8 = &str2;
var roc_str2 = RocStr.init(str2_ptr, str2_len);
expect(!RocStr.eq(roc_str1, roc_str2));
expect(!roc_str1.eq(roc_str2));
}
};
@ -175,8 +181,8 @@ test "strSplitInPlace: no delimiter" {
RocStr.init(str_ptr, 3),
};
expect(array.len == expected.len);
expect(RocStr.eq(array[0], expected[0]));
expectEqual(array.len, expected.len);
expect(array[0].eq(expected[0]));
}
test "strSplitInPlace: delimiter on sides" {
@ -212,10 +218,10 @@ test "strSplitInPlace: delimiter on sides" {
const expected_str_ptr: [*]u8 = &expected_str;
var expectedRocStr = RocStr.init(expected_str_ptr, expected_str_len);
expect(array.len == 3);
expect(array[0].str_len == 0);
expect(RocStr.eq(array[1], expectedRocStr));
expect(array[2].str_len == 0);
expectEqual(array.len, 3);
expectEqual(array[0].str_len, 0);
expect(array[1].eq(expectedRocStr));
expectEqual(array[2].str_len, 0);
}
test "strSplitInPlace: three pieces" {
@ -266,10 +272,10 @@ test "strSplitInPlace: three pieces" {
}
};
expect(expected_array.len == array.len);
expect(RocStr.eq(array[0], expected_array[0]));
expect(RocStr.eq(array[1], expected_array[1]));
expect(RocStr.eq(array[2], expected_array[2]));
expectEqual(expected_array.len, array.len);
expect(array[0].eq(expected_array[0]));
expect(array[1].eq(expected_array[1]));
expect(array[2].eq(expected_array[2]));
}
// This is used for `Str.split : Str, Str -> Array Str
@ -336,7 +342,7 @@ test "countSegments: long delimiter" {
delimiter_len
);
expect(segments_count == 1);
expectEqual(segments_count, 1);
}
test "countSegments: delimiter at start" {
@ -358,7 +364,7 @@ test "countSegments: delimiter at start" {
delimiter_len
);
expect(segments_count == 2);
expectEqual(segments_count, 2);
}
test "countSegments: delimiter interspered" {
@ -380,5 +386,92 @@ test "countSegments: delimiter interspered" {
delimiter_len
);
expect(segments_count == 3);
expectEqual(segments_count, 3);
}
// Str.countGraphemeClusters
const grapheme = @import("grapheme.zig");
comptime {
    @export(countGraphemeClusters, .{ .name = str_namespace ++ ".count_grapheme_clusters", .linkage = .Strong });
}

/// Counts the grapheme clusters in the `bytes_len` bytes starting at `bytes_ptr`.
///
/// The bytes are decoded as UTF-8; invalid UTF-8 is treated as unreachable, so
/// callers must only pass valid UTF-8. Returns 0 when `bytes_len` is 0.
fn countGraphemeClusters(bytes_ptr: [*]u8, bytes_len: usize) callconv(.C) usize {
    const bytes = bytes_ptr[0..bytes_len];
    var codepoints = (unicode.Utf8View.init(bytes) catch unreachable).iterator();

    var boundary_count: usize = 0;
    var break_state: ?grapheme.BoundClass = null;
    var previous: ?u21 = null;

    while (codepoints.nextCodepoint()) |current| {
        if (previous) |prev| {
            if (grapheme.isGraphemeBreak(prev, current, &break_state)) {
                boundary_count += 1;
                // The state is reset whenever a break is reported.
                break_state = null;
            }
        }
        previous = current;
    }

    // The loop only counts breaks *between* code points, so any non-empty
    // input contains one more cluster than the number of breaks found.
    return if (bytes_len != 0) boundary_count + 1 else boundary_count;
}
// An empty string contains no grapheme clusters.
test "countGraphemeClusters: empty string" {
    var bytes_arr = "".*;
    const bytes_ptr: [*]u8 = &bytes_arr;

    const count = countGraphemeClusters(bytes_ptr, bytes_arr.len);

    // `expectEqual` takes the expected value first, so that a failure reports
    // the two values under the right labels.
    expectEqual(@as(usize, 0), count);
}
// Four ASCII letters are expected to count as four grapheme clusters.
test "countGraphemeClusters: ascii characters" {
    var bytes_arr = "abcd".*;
    const bytes_ptr: [*]u8 = &bytes_arr;

    const count = countGraphemeClusters(bytes_ptr, bytes_arr.len);

    // `expectEqual` takes the expected value first, so that a failure reports
    // the two values under the right labels.
    expectEqual(@as(usize, 4), count);
}
// Multi-byte characters mixed with ASCII: three characters, three clusters expected.
test "countGraphemeClusters: utf8 characters" {
    var bytes_arr = "ãxā".*;
    const bytes_ptr: [*]u8 = &bytes_arr;

    const count = countGraphemeClusters(bytes_ptr, bytes_arr.len);

    // `expectEqual` takes the expected value first, so that a failure reports
    // the two values under the right labels.
    expectEqual(@as(usize, 3), count);
}
// Three emoji are expected to count as three grapheme clusters.
test "countGraphemeClusters: emojis" {
    var bytes_arr = "🤔🤔🤔".*;
    const bytes_ptr: [*]u8 = &bytes_arr;

    const count = countGraphemeClusters(bytes_ptr, bytes_arr.len);

    // `expectEqual` takes the expected value first, so that a failure reports
    // the two values under the right labels.
    expectEqual(@as(usize, 3), count);
}
// Emoji interleaved with other multi-byte characters: six clusters expected.
test "countGraphemeClusters: emojis and ut8 characters" {
    var bytes_arr = "🤔å🤔¥🤔ç".*;
    const bytes_ptr: [*]u8 = &bytes_arr;

    const count = countGraphemeClusters(bytes_ptr, bytes_arr.len);

    // `expectEqual` takes the expected value first, so that a failure reports
    // the two values under the right labels.
    expectEqual(@as(usize, 6), count);
}
// Emoji, other multi-byte characters and ASCII mixed together: ten clusters expected.
test "countGraphemeClusters: emojis, ut8, and ascii characters" {
    var bytes_arr = "6🤔å🤔e¥🤔çpp".*;
    const bytes_ptr: [*]u8 = &bytes_arr;

    const count = countGraphemeClusters(bytes_ptr, bytes_arr.len);

    // `expectEqual` takes the expected value first, so that a failure reports
    // the two values under the right labels.
    expectEqual(@as(usize, 10), count);
}
// Reference every declaration of this file so that tests declared inside
// nested containers are picked up by the test runner as well. See:
// https://github.com/ziglang/zig/blob/master/lib/std/std.zig#L94
test "" {
testing.refAllDecls(@This());
}

View file

@ -5,26 +5,7 @@ use std::path::Path;
use std::process::Command;
use std::str;
/// Runs `command` with `args` and collects its output.
///
/// Returns normally only when the process could be started and exited
/// successfully. Otherwise panics with `"<command> failed: <details>"`, where
/// the details are the spawn error, the captured stderr, or a generic message
/// when stderr is not valid UTF-8.
fn run_command<S, I>(command: &str, args: I)
where
I: IntoIterator<Item = S>,
S: AsRef<OsStr>,
{
let output_result = Command::new(OsStr::new(&command)).args(args).output();
match output_result {
Ok(output) => match output.status.success() {
true => (),
false => {
// Prefer the process' own stderr as the failure reason.
let error_str = match str::from_utf8(&output.stderr) {
Ok(stderr) => stderr.to_string(),
Err(_) => format!("Failed to run \"{}\"", command),
};
panic!("{} failed: {}", command, error_str);
}
},
// The process could not be started at all.
Err(reason) => panic!("{} failed: {}", command, reason),
}
}
// TODO: Use zig build system command instead
fn main() {
let out_dir = env::var_os("OUT_DIR").unwrap();
@ -68,3 +49,24 @@ fn main() {
println!("cargo:rerun-if-changed={}", src_path_str);
println!("cargo:rustc-env=BUILTINS_BC={}", dest_bc);
}
/// Runs `command` with `args` and collects its output.
///
/// Returns normally only when the process could be started and exited
/// successfully. Otherwise panics with `"<command> failed: <details>"`, where
/// the details are the spawn error, the captured stderr, or a generic message
/// when stderr is not valid UTF-8.
fn run_command<S, I>(command: &str, args: I)
where
    I: IntoIterator<Item = S>,
    S: AsRef<OsStr>,
{
    // The process could not be started at all.
    let output = match Command::new(OsStr::new(&command)).args(args).output() {
        Ok(output) => output,
        Err(reason) => panic!("{} failed: {}", command, reason),
    };

    if output.status.success() {
        return;
    }

    // Prefer the process' own stderr as the failure reason.
    let error_str = str::from_utf8(&output.stderr)
        .map(|stderr| stderr.to_string())
        .unwrap_or_else(|_| format!("Failed to run \"{}\"", command));

    panic!("{} failed: {}", command, error_str);
}

View file

@ -24,4 +24,5 @@ pub const MATH_IS_FINITE: &str = "roc_builtins.math.is_finite";
pub const MATH_POW_INT: &str = "roc_builtins.math.pow_int";
pub const STR_COUNT_SEGEMENTS: &str = "roc_builtins.str.count_segements";
pub const STR_STR_SPLIT_IN_PLACE: &str = "roc_builtins.str.str_split_in_place";
pub const STR_SPLIT_IN_PLACE: &str = "roc_builtins.str.str_split_in_place";
pub const STR_COUNT_GRAPEHEME_CLUSTERS: &str = "roc_builtins.str.count_grapheme_clusters";

View file

@ -402,6 +402,12 @@ pub fn types() -> MutMap<Symbol, (SolvedType, Region)> {
top_level_function(vec![str_type()], Box::new(bool_type())),
);
// countGraphemes : Str -> Int
add_type(
Symbol::STR_COUNT_GRAPHEMES,
top_level_function(vec![str_type()], Box::new(int_type())),
);
// List module
// get : List elem, Int -> Result elem [ OutOfBounds ]*

View file

@ -1028,6 +1028,12 @@ pub fn types() -> MutMap<Symbol, (SolvedType, Region)> {
unique_function(vec![str_type(star1), str_type(star2)], str_type(star3))
});
// Str.countGraphemes : Attr * Str -> Attr * Int
add_type(Symbol::STR_COUNT_GRAPHEMES, {
let_tvars! { star1, star2 };
unique_function(vec![str_type(star1)], int_type(star2))
});
// Result module
// map : Attr * (Result (Attr a e))

View file

@ -52,6 +52,7 @@ pub fn builtin_defs(var_store: &mut VarStore) -> MutMap<Symbol, Def> {
Symbol::BOOL_NOT => bool_not,
Symbol::STR_CONCAT => str_concat,
Symbol::STR_IS_EMPTY => str_is_empty,
Symbol::STR_COUNT_GRAPHEMES => str_count_graphemes,
Symbol::LIST_LEN => list_len,
Symbol::LIST_GET => list_get,
Symbol::LIST_SET => list_set,
@ -924,7 +925,7 @@ fn str_concat(symbol: Symbol, var_store: &mut VarStore) -> Def {
)
}
/// Str.isEmpty : List * -> Bool
/// Str.isEmpty : Str -> Bool
fn str_is_empty(symbol: Symbol, var_store: &mut VarStore) -> Def {
let str_var = var_store.fresh();
let bool_var = var_store.fresh();
@ -944,6 +945,26 @@ fn str_is_empty(symbol: Symbol, var_store: &mut VarStore) -> Def {
)
}
/// Str.countGraphemes : Str -> Int
///
/// Builds the definition for `symbol` as a one-argument function whose body
/// runs the `StrCountGraphemes` low-level operation on that argument.
fn str_count_graphemes(symbol: Symbol, var_store: &mut VarStore) -> Def {
    // Fresh variables for the argument and the result, in that order.
    let arg_var = var_store.fresh();
    let ret_var = var_store.fresh();

    let params = vec![(arg_var, Symbol::ARG_1)];
    let body = RunLowLevel {
        op: LowLevel::StrCountGraphemes,
        args: vec![(arg_var, Var(Symbol::ARG_1))],
        ret_var,
    };

    defn(symbol, params, var_store, body, ret_var)
}
/// List.concat : List elem, List elem -> List elem
fn list_concat(symbol: Symbol, var_store: &mut VarStore) -> Def {
let list_var = var_store.fresh();

View file

@ -0,0 +1,21 @@
use inkwell::types::BasicTypeEnum;
use roc_module::low_level::LowLevel;
/// Emits a call to the function named `fn_name` in `env.module`, passing
/// `args`, and returns the call's result value.
///
/// Panics if the module has no function with that name, or if the call does
/// not produce a basic value; `op` is only used in the latter panic message.
fn call_bitcode_fn<'a, 'ctx, 'env>(
    op: LowLevel,
    env: &Env<'a, 'ctx, 'env>,
    args: &[BasicValueEnum<'ctx>],
    fn_name: &str,
) -> BasicValueEnum<'ctx> {
    let builtin_fn = match env.module.get_function(fn_name) {
        Some(function) => function,
        None => panic!("Unrecognized builtin function: {:?} - if you're working on the Roc compiler, do you need to rebuild the bitcode? See compiler/builtins/bitcode/README.md", fn_name),
    };

    let call_site = env.builder.build_call(builtin_fn, args, "call_builtin");

    // Call with the same calling convention the function was declared with.
    call_site.set_call_convention(builtin_fn.get_call_conventions());

    match call_site.try_as_basic_value().left() {
        Some(value) => value,
        None => panic!("LLVM error: Invalid call for low-level op {:?}", op),
    }
}

View file

@ -4,7 +4,7 @@ use crate::llvm::build_list::{
list_get_unsafe, list_join, list_keep_if, list_len, list_map, list_prepend, list_repeat,
list_reverse, list_set, list_single, list_walk_right,
};
use crate::llvm::build_str::{str_concat, str_len, CHAR_LAYOUT};
use crate::llvm::build_str::{str_concat, str_count_graphemes, str_len, CHAR_LAYOUT};
use crate::llvm::compare::{build_eq, build_neq};
use crate::llvm::convert::{
basic_type_from_layout, block_of_memory, collection, get_fn_type, get_ptr_type, ptr_int,
@ -2527,6 +2527,12 @@ fn run_low_level<'a, 'ctx, 'env>(
);
BasicValueEnum::IntValue(is_zero)
}
StrCountGraphemes => {
// Str.countGraphemes : Str -> Int
debug_assert_eq!(args.len(), 1);
str_count_graphemes(env, scope, parent, args[0])
}
ListLen => {
// List.len : List * -> Int
debug_assert_eq!(args.len(), 1);

View file

@ -7,6 +7,8 @@ use inkwell::builder::Builder;
use inkwell::types::BasicTypeEnum;
use inkwell::values::{BasicValueEnum, FunctionValue, IntValue, PointerValue, StructValue};
use inkwell::{AddressSpace, IntPredicate};
use roc_builtins::bitcode;
use roc_module::low_level::LowLevel;
use roc_module::symbol::Symbol;
use roc_mono::layout::{Builtin, Layout};
@ -591,3 +593,55 @@ fn str_is_not_empty<'ctx>(env: &Env<'_, 'ctx, '_>, len: IntValue<'ctx>) -> IntVa
"str_len_is_nonzero",
)
}
/// Str.countGraphemes : Str -> Int
///
/// Looks up the string bound to `str_symbol`, loads it via `load_str`, and
/// calls the `STR_COUNT_GRAPEHEME_CLUSTERS` bitcode function with the pointer
/// and length that `load_str` hands to the callback.
pub fn str_count_graphemes<'a, 'ctx, 'env>(
    env: &Env<'a, 'ctx, 'env>,
    scope: &Scope<'a, 'ctx>,
    parent: FunctionValue<'ctx>,
    str_symbol: Symbol,
) -> BasicValueEnum<'ctx> {
    let wrapper_ptr = ptr_from_symbol(scope, str_symbol);
    let wrapper_type = BasicTypeEnum::StructType(collection(env.context, env.ptr_bytes));

    load_str(
        env,
        parent,
        *wrapper_ptr,
        wrapper_type,
        |bytes_ptr, bytes_len, _smallness| {
            // Argument order matches the bitcode function: pointer first, then length.
            let bitcode_args = [
                BasicValueEnum::PointerValue(bytes_ptr),
                BasicValueEnum::IntValue(bytes_len),
            ];

            call_bitcode_fn(
                LowLevel::StrCountGraphemes,
                env,
                &bitcode_args,
                bitcode::STR_COUNT_GRAPEHEME_CLUSTERS,
            )
        },
    )
}
// Duplicated from build.rs for now; once it's all working I'll delete this and
// import it from a common place.
//
// Emits a call to the function named `fn_name` in `env.module`, passing `args`,
// and returns the call's result value. Panics if the module has no function
// with that name, or if the call does not produce a basic value; `op` is only
// used in the latter panic message.
fn call_bitcode_fn<'a, 'ctx, 'env>(
    op: LowLevel,
    env: &Env<'a, 'ctx, 'env>,
    args: &[BasicValueEnum<'ctx>],
    fn_name: &str,
) -> BasicValueEnum<'ctx> {
    let builtin_fn = match env.module.get_function(fn_name) {
        Some(function) => function,
        None => panic!("Unrecognized builtin function: {:?} - if you're working on the Roc compiler, do you need to rebuild the bitcode? See compiler/builtins/bitcode/README.md", fn_name),
    };

    let call_site = env.builder.build_call(builtin_fn, args, "call_builtin");

    // Call with the same calling convention the function was declared with.
    call_site.set_call_convention(builtin_fn.get_call_conventions());

    match call_site.try_as_basic_value().left() {
        Some(value) => value,
        None => panic!("LLVM error: Invalid call for low-level op {:?}", op),
    }
}

View file

@ -202,4 +202,9 @@ mod gen_str {
fn empty_str_is_empty() {
assert_evals_to!(r#"Str.isEmpty """#, true, bool);
}
#[test]
fn str_count_graphemes() {
// The literal mixes ASCII, other multi-byte characters and emoji: ten
// characters in total, each expected to count as one grapheme.
assert_evals_to!(r#"Str.countGraphemes "6🤔å🤔e¥🤔çpp""#, 10, usize);
}
}

View file

@ -5,6 +5,7 @@
pub enum LowLevel {
StrConcat,
StrIsEmpty,
StrCountGraphemes,
ListLen,
ListGetUnsafe,
ListSet,

View file

@ -670,6 +670,7 @@ define_builtins! {
2 STR_IS_EMPTY: "isEmpty"
3 STR_APPEND: "append"
4 STR_CONCAT: "concat"
5 STR_COUNT_GRAPHEMES: "countGraphemes"
}
4 LIST: "List" => {
0 LIST_LIST: "List" imported // the List.List type alias

View file

@ -510,7 +510,7 @@ pub fn lowlevel_borrow_signature(arena: &Bump, op: LowLevel) -> &[bool] {
ListSet => arena.alloc_slice_copy(&[owned, irrelevant, irrelevant]),
ListSetInPlace => arena.alloc_slice_copy(&[owned, irrelevant, irrelevant]),
ListGetUnsafe => arena.alloc_slice_copy(&[borrowed, irrelevant]),
ListConcat | StrConcat => arena.alloc_slice_copy(&[owned, borrowed]),
ListConcat | StrConcat | StrCountGraphemes => arena.alloc_slice_copy(&[owned, borrowed]),
ListSingle => arena.alloc_slice_copy(&[irrelevant]),
ListRepeat => arena.alloc_slice_copy(&[irrelevant, irrelevant]),