// Source of `compiler/ident/src/lib.rs` as introduced by the patch "Add IdentStr"
// (commit 00eefa19472b1fa10c0e19acbae28b5222ba0590, Richard Feldman,
// Mon, 2 Aug 2021 22:17:13 -0400).
//
// The same patch also registers the crate:
//   * Cargo.lock: adds `[[package]] name = "roc_ident"`, `version = "0.1.0"`.
//   * Cargo.toml: adds "compiler/ident" to the workspace `members`.
//   * compiler/ident/Cargo.toml:
//       [package]
//       name = "roc_ident"
//       version = "0.1.0"
//       authors = ["The Roc Contributors"]
//       license = "UPL-1.0"
//       edition = "2018"

#![warn(clippy::dbg_macro)]

use std::alloc::{handle_alloc_error, GlobalAlloc, Layout, System};
use std::cmp::Ordering;
use std::convert::From;
use std::{fmt, mem, ptr, slice};

/// A string which can store identifiers using the small string optimization.
/// It relies on the invariant that it cannot store null characters to store
/// an extra character; if the last byte is 0, that means it's a large string.
///
/// Because the most significant byte of the length is always 0 for a large
/// string, this can only store up to 2^56 bytes on a 64-bit target, or 2^24
/// bytes on a 32-bit target. That's way more than enough for an identifier!
///
/// If it's a small string, that discriminant byte is used to store the length,
/// except it stores it as (255 - length) so that it will be in the range
/// 192 - 255 (all of which are invalid UTF-8 when in the final position of
/// a UTF-8 string). This design works on little-endian targets, but a different
/// design for storing length might be necessary on big-endian targets.
///
/// Invariants maintained by every constructor in this file:
/// * a large (heap) string always has a non-null `elements` pointer and a
///   non-zero `length` whose most significant byte is 0;
/// * the empty string is always stored inline (discriminant byte 0xFF).
// For big-endian, field order must be swapped!
// Otherwise, the discriminant byte will be in the wrong place.
#[cfg(target_endian = "little")]
#[repr(C)]
pub struct IdentStr {
    elements: *mut u8,
    length: usize,
}

impl IdentStr {
    /// Number of bytes in the struct, i.e. the longest string storable inline.
    const SMALL_CAPACITY: usize = mem::size_of::<IdentStr>();

    /// Index of the discriminant byte inside `length.to_ne_bytes()`.
    const DISCRIMINANT_INDEX: usize = mem::size_of::<usize>() - 1;

    /// The last byte of the struct: 0 for a heap string, anything else for an
    /// inline string.
    fn discriminant(&self) -> u8 {
        self.length.to_ne_bytes()[IdentStr::DISCRIMINANT_INDEX]
    }

    /// Layout used for heap strings. Allocation and deallocation both go
    /// through here so the two can never disagree.
    fn heap_layout(len: usize) -> Layout {
        // SAFETY: the alignment of `u8` is a non-zero power of two, and heap
        // lengths have a zero most significant byte, so `len` cannot overflow
        // `isize` when rounded up to that alignment.
        unsafe { Layout::from_size_align_unchecked(len, mem::align_of::<u8>()) }
    }

    /// Returns the length of the string in bytes.
    pub fn len(&self) -> usize {
        let last_byte = self.discriminant();

        // We always perform this subtraction so that the following
        // conditionals can all be cmov instructions.
        let small_str_variable_len = (u8::MAX - last_byte) as usize;

        // The numbers 192 - 255 (0xC0 - 0xFF) are not valid as the final
        // byte of a UTF-8 string. Hence they are unused and we can use them
        // to store the length of a small string!
        //
        // Reference: https://en.wikipedia.org/wiki/UTF-8#Codepage_layout
        if last_byte >= 0xC0 {
            small_str_variable_len
        } else if last_byte == 0 {
            // This is a big string, so return its length.
            self.length
        } else {
            // This is a valid final UTF-8 byte, meaning the entire struct
            // must be in use for storing characters.
            IdentStr::SMALL_CAPACITY
        }
    }

    /// Returns `true` when the string holds no bytes.
    ///
    /// This goes through `len()` rather than comparing the raw `length`
    /// field, because an inline empty string has a non-zero `length` field
    /// (its last byte is the 0xFF marker).
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns `true` when the bytes are stored inline in the struct rather
    /// than in a heap allocation.
    pub fn is_small_str(&self) -> bool {
        self.discriminant() != 0
    }

    /// Creates an empty string.
    ///
    /// The result is stored inline, exactly like `IdentStr::from("")`, so no
    /// value ever carries a null heap pointer.
    pub fn empty() -> Self {
        Self::from_slice(&[])
    }

    /// Returns a reference to the byte at `index`, or `None` when `index` is
    /// out of bounds.
    pub fn get(&self, index: usize) -> Option<&u8> {
        self.as_slice().get(index)
    }

    /// Returns a pointer to the first byte of the string: the struct itself
    /// for an inline string, the heap allocation otherwise.
    pub fn get_bytes(&self) -> *const u8 {
        if self.is_small_str() {
            self.get_small_str_ptr()
        } else {
            self.elements
        }
    }

    /// Pointer to the struct's own bytes, where inline strings live.
    fn get_small_str_ptr(&self) -> *const u8 {
        (self as *const IdentStr).cast()
    }

    /// Builds an `IdentStr` holding a copy of `bytes`.
    ///
    /// Inputs shorter than the struct are stored inline with a length marker
    /// in the last byte. Inputs exactly as long as the struct are stored
    /// inline only when their last byte cannot be mistaken for a marker
    /// (it must be in 0x01..=0xBF); everything else goes to the heap.
    ///
    /// # Panics
    /// Panics when the input is too long for its length to leave the
    /// discriminant byte at 0.
    fn from_slice(bytes: &[u8]) -> Self {
        let len = bytes.len();

        let fits_inline = match len.cmp(&IdentStr::SMALL_CAPACITY) {
            Ordering::Less => true,
            // The final character doubles as the discriminant here. A 0 would
            // read back as "heap string" and 0xC0..=0xFF as a length marker,
            // so those inputs must not be stored inline.
            Ordering::Equal => bytes.last().map_or(false, |&b| b != 0 && b < 0xC0),
            Ordering::Greater => false,
        };

        if fits_inline {
            let mut inline = [0u8; mem::size_of::<IdentStr>()];

            inline[..len].copy_from_slice(bytes);

            if len < IdentStr::SMALL_CAPACITY {
                // Record the length as (255 - len) in the last byte, which on
                // little-endian targets is the last byte of `length`.
                // `len` is below the struct size here, so it fits in a u8.
                inline[IdentStr::SMALL_CAPACITY - 1] = u8::MAX - len as u8;
            }

            // SAFETY: the array has exactly the size of `IdentStr` (checked by
            // `transmute` at compile time). The inline representation never
            // dereferences `elements`, so its bytes are only ever read back
            // as plain bytes.
            return unsafe { mem::transmute::<[u8; mem::size_of::<IdentStr>()], IdentStr>(inline) };
        }

        // A heap string is recognised by a zero most significant length byte.
        assert!(
            len.to_ne_bytes()[IdentStr::DISCRIMINANT_INDEX] == 0,
            "identifier of {} bytes is too long for IdentStr",
            len
        );

        let layout = IdentStr::heap_layout(len);

        // SAFETY: this path is only reached for inputs at least as long as
        // the struct, so the layout has a non-zero size.
        let elements = unsafe { System.alloc(layout) };

        if elements.is_null() {
            handle_alloc_error(layout);
        }

        // SAFETY: `elements` is a fresh, non-null allocation of `len` bytes
        // and cannot overlap the borrowed input.
        unsafe { ptr::copy_nonoverlapping(bytes.as_ptr(), elements, len) };

        IdentStr {
            elements,
            length: len,
        }
    }

    /// Returns the string's bytes as a slice.
    pub fn as_slice(&self) -> &[u8] {
        // SAFETY: for an inline string the pointer is `self` and `len()` is
        // at most the struct size; for a heap string the pointer is the
        // non-null allocation of exactly `length` bytes made in `from_slice`.
        unsafe { slice::from_raw_parts(self.get_bytes(), self.len()) }
    }

    /// Returns the contents as a `&str` without checking for valid UTF-8.
    ///
    /// # Safety
    /// The stored bytes must be valid UTF-8. The only conversion into
    /// `IdentStr` defined in this file is `From<&str>`, which preserves that
    /// property.
    pub unsafe fn as_str(&self) -> &str {
        let slice = self.as_slice();

        std::str::from_utf8_unchecked(slice)
    }

    /// Write a CStr (null-terminated) representation of this IdentStr into
    /// the given buffer and return the buffer pointer.
    ///
    /// The return type is `*mut char` for compatibility with existing
    /// callers; the pointer is simply `buf` and addresses bytes, not Rust
    /// `char` values.
    ///
    /// # Safety
    /// This assumes the given buffer has enough space: it must be valid for
    /// writes of `self.len() + 1` bytes (the contents plus the terminating
    /// null byte) and must not overlap this string's storage.
    pub unsafe fn write_c_str(&self, buf: *mut u8) -> *mut char {
        let bytes = self.as_slice();

        ptr::copy_nonoverlapping(bytes.as_ptr(), buf, bytes.len());

        // null-terminate
        *(buf.add(bytes.len())) = 0;

        buf as *mut char
    }
}

impl From<&str> for IdentStr {
    fn from(string: &str) -> Self {
        Self::from_slice(string.as_bytes())
    }
}

impl fmt::Debug for IdentStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // IdentStr { is_small_str: true, elements: [1, 2, 3, 4] }
        f.debug_struct("IdentStr")
            .field("is_small_str", &self.is_small_str())
            .field("elements", &self.as_slice())
            .finish()
    }
}

impl PartialEq for IdentStr {
    fn eq(&self, other: &Self) -> bool {
        self.as_slice() == other.as_slice()
    }
}

impl Eq for IdentStr {}

impl Clone for IdentStr {
    fn clone(&self) -> Self {
        if self.is_small_str() {
            // Inline strings own no allocation, so a field-wise copy is a
            // complete copy.
            IdentStr {
                elements: self.elements,
                length: self.length,
            }
        } else {
            // Heap strings get a fresh allocation of exactly `length` bytes,
            // matching the layout `Drop` will later free it with.
            Self::from_slice(self.as_slice())
        }
    }
}

impl Drop for IdentStr {
    fn drop(&mut self) {
        // Only heap strings own an allocation. The null check is purely
        // defensive: no constructor in this file produces a null heap pointer.
        if !self.is_small_str() && !self.elements.is_null() {
            // SAFETY: `elements` was allocated by `System` in `from_slice`
            // with this same layout.
            unsafe { System.dealloc(self.elements, IdentStr::heap_layout(self.length)) };
        }
    }
}

/// Shared assertions: `string` survives conversion into `IdentStr`, cloning,
/// and conversion back.
#[cfg(test)]
fn assert_round_trips(string: &str) {
    let answer = IdentStr::from(string);

    assert_eq!(answer.len(), string.len());
    assert_eq!(answer, answer);
    assert_eq!(answer.clone(), answer);
    assert_eq!(answer.clone(), answer.clone());
    assert_eq!(unsafe { answer.as_str() }, string);
    assert_eq!(unsafe { answer.clone().as_str() }, string);
}

#[test]
fn empty() {
    let answer = IdentStr::empty();

    assert_eq!(answer.len(), 0);
    assert_eq!(answer, answer);
    assert_eq!(answer.clone(), answer);
    assert_eq!(answer.clone(), answer.clone());
    assert_eq!(unsafe { answer.as_str() }, "");
    assert_eq!(unsafe { answer.clone().as_str() }, "");
}

#[test]
fn big_str() {
    for &string in &[
        "0123456789abcdefg",
        "0123456789abcdefgh",
        "0123456789abcdefghi",
    ] {
        assert_round_trips(string);
    }
}

#[cfg(target_pointer_width = "64")]
#[test]
fn small_var_length() {
    for &string in &[
        "",
        "0",
        "01",
        "012",
        "0123",
        "01234",
        "012345",
        "0123456",
        "01234567",
        "012345678",
        "0123456789",
        "0123456789a",
        "0123456789ab",
        "0123456789abc",
        "0123456789abcd",
        "0123456789abcde",
    ] {
        assert_round_trips(string);
    }
}

#[cfg(target_pointer_width = "32")]
#[test]
fn small_var_length() {
    for &string in &[
        "", "0", "01", "012", "0123", "01234", "012345", "0123456", "01234567",
    ] {
        assert_round_trips(string);
    }
}

#[cfg(target_pointer_width = "64")]
#[test]
fn small_max_length() {
    assert_round_trips("0123456789abcdef");
}

#[cfg(target_pointer_width = "32")]
#[test]
fn small_max_length() {
    assert_round_trips("01234567");
}