mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-31 15:47:41 +00:00
perf: Optimize UTF8/ASCII byte offset index (#3439)
This commit is contained in:
parent
cc8b13d3a7
commit
d2988043af
1 changed files with 258 additions and 149 deletions
|
@ -10,94 +10,6 @@ pub struct Locator<'a> {
|
||||||
index: OnceCell<Index>,
|
index: OnceCell<Index>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub enum Index {
|
|
||||||
Ascii(Vec<usize>),
|
|
||||||
Utf8(Vec<Vec<usize>>),
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Compute the starting byte index of each line in ASCII source code.
|
|
||||||
fn index_ascii(contents: &str) -> Vec<usize> {
|
|
||||||
let mut index = Vec::with_capacity(48);
|
|
||||||
index.push(0);
|
|
||||||
let bytes = contents.as_bytes();
|
|
||||||
for (i, byte) in bytes.iter().enumerate() {
|
|
||||||
if *byte == b'\n' {
|
|
||||||
index.push(i + 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
index
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Compute the starting byte index of each character in UTF-8 source code.
|
|
||||||
fn index_utf8(contents: &str) -> Vec<Vec<usize>> {
|
|
||||||
let mut index = Vec::with_capacity(48);
|
|
||||||
let mut current_row = Vec::with_capacity(48);
|
|
||||||
let mut current_byte_offset = 0;
|
|
||||||
let mut previous_char = '\0';
|
|
||||||
for char in contents.chars() {
|
|
||||||
// Skip BOM.
|
|
||||||
if previous_char == '\0' && char == '\u{feff}' {
|
|
||||||
current_byte_offset += char.len_utf8();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
current_row.push(current_byte_offset);
|
|
||||||
if char == '\n' {
|
|
||||||
if previous_char == '\r' {
|
|
||||||
current_row.pop();
|
|
||||||
}
|
|
||||||
index.push(current_row);
|
|
||||||
current_row = Vec::with_capacity(48);
|
|
||||||
}
|
|
||||||
current_byte_offset += char.len_utf8();
|
|
||||||
previous_char = char;
|
|
||||||
}
|
|
||||||
index.push(current_row);
|
|
||||||
index
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Compute the starting byte index of each line in source code.
|
|
||||||
pub fn index(contents: &str) -> Index {
|
|
||||||
if contents.is_ascii() {
|
|
||||||
Index::Ascii(index_ascii(contents))
|
|
||||||
} else {
|
|
||||||
Index::Utf8(index_utf8(contents))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Truncate a [`Location`] to a byte offset in ASCII source code.
|
|
||||||
fn truncate_ascii(location: Location, index: &[usize], contents: &str) -> usize {
|
|
||||||
if location.row() - 1 == index.len() && location.column() == 0
|
|
||||||
|| (!index.is_empty()
|
|
||||||
&& location.row() - 1 == index.len() - 1
|
|
||||||
&& index[location.row() - 1] + location.column() >= contents.len())
|
|
||||||
{
|
|
||||||
contents.len()
|
|
||||||
} else {
|
|
||||||
index[location.row() - 1] + location.column()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Truncate a [`Location`] to a byte offset in UTF-8 source code.
|
|
||||||
fn truncate_utf8(location: Location, index: &[Vec<usize>], contents: &str) -> usize {
|
|
||||||
if (location.row() - 1 == index.len() && location.column() == 0)
|
|
||||||
|| (location.row() - 1 == index.len() - 1
|
|
||||||
&& location.column() == index[location.row() - 1].len())
|
|
||||||
{
|
|
||||||
contents.len()
|
|
||||||
} else {
|
|
||||||
index[location.row() - 1][location.column()]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Truncate a [`Location`] to a byte offset in source code.
|
|
||||||
fn truncate(location: Location, index: &Index, contents: &str) -> usize {
|
|
||||||
match index {
|
|
||||||
Index::Ascii(index) => truncate_ascii(location, index, contents),
|
|
||||||
Index::Utf8(index) => truncate_utf8(location, index, contents),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> Locator<'a> {
|
impl<'a> Locator<'a> {
|
||||||
pub const fn new(contents: &'a str) -> Self {
|
pub const fn new(contents: &'a str) -> Self {
|
||||||
Self {
|
Self {
|
||||||
|
@ -107,20 +19,20 @@ impl<'a> Locator<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_or_init_index(&self) -> &Index {
|
fn get_or_init_index(&self) -> &Index {
|
||||||
self.index.get_or_init(|| index(self.contents))
|
self.index.get_or_init(|| Index::from(self.contents))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Take the source code up to the given [`Location`].
|
/// Take the source code up to the given [`Location`].
|
||||||
pub fn take(&self, location: Location) -> &'a str {
|
pub fn take(&self, location: Location) -> &'a str {
|
||||||
let index = self.get_or_init_index();
|
let index = self.get_or_init_index();
|
||||||
let offset = truncate(location, index, self.contents);
|
let offset = index.byte_offset(location, self.contents);
|
||||||
&self.contents[..offset]
|
&self.contents[..offset]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Take the source code after the given [`Location`].
|
/// Take the source code after the given [`Location`].
|
||||||
pub fn skip(&self, location: Location) -> &'a str {
|
pub fn skip(&self, location: Location) -> &'a str {
|
||||||
let index = self.get_or_init_index();
|
let index = self.get_or_init_index();
|
||||||
let offset = truncate(location, index, self.contents);
|
let offset = index.byte_offset(location, self.contents);
|
||||||
&self.contents[offset..]
|
&self.contents[offset..]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -128,15 +40,15 @@ impl<'a> Locator<'a> {
|
||||||
pub fn slice<R: Into<Range>>(&self, range: R) -> &'a str {
|
pub fn slice<R: Into<Range>>(&self, range: R) -> &'a str {
|
||||||
let index = self.get_or_init_index();
|
let index = self.get_or_init_index();
|
||||||
let range = range.into();
|
let range = range.into();
|
||||||
let start = truncate(range.location, index, self.contents);
|
let start = index.byte_offset(range.location, self.contents);
|
||||||
let end = truncate(range.end_location, index, self.contents);
|
let end = index.byte_offset(range.end_location, self.contents);
|
||||||
&self.contents[start..end]
|
&self.contents[start..end]
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the byte offset of the given [`Location`].
|
/// Return the byte offset of the given [`Location`].
|
||||||
pub fn offset(&self, location: Location) -> usize {
|
pub fn offset(&self, location: Location) -> usize {
|
||||||
let index = self.get_or_init_index();
|
let index = self.get_or_init_index();
|
||||||
truncate(location, index, self.contents)
|
index.byte_offset(location, self.contents)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the underlying source code.
|
/// Return the underlying source code.
|
||||||
|
@ -153,116 +65,313 @@ impl<'a> Locator<'a> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Index for fast [`Location`] to byte offset conversions.
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
enum Index {
|
||||||
|
/// Optimized index for an ASCII only document
|
||||||
|
Ascii(AsciiIndex),
|
||||||
|
|
||||||
|
/// Index for UTF8 documents
|
||||||
|
Utf8(Utf8Index),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Index {
|
||||||
|
/// Truncate a [`Location`] to a byte offset in source code.
|
||||||
|
fn byte_offset(&self, location: Location, contents: &str) -> usize {
|
||||||
|
match self {
|
||||||
|
Index::Ascii(ascii) => ascii.byte_offset(location, contents),
|
||||||
|
Index::Utf8(utf8) => utf8.byte_offset(location, contents),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<&str> for Index {
|
||||||
|
fn from(contents: &str) -> Self {
|
||||||
|
assert!(u32::try_from(contents.len()).is_ok());
|
||||||
|
|
||||||
|
let mut line_start_offsets: Vec<u32> = Vec::with_capacity(48);
|
||||||
|
line_start_offsets.push(0);
|
||||||
|
|
||||||
|
// SAFE because of length assertion above
|
||||||
|
#[allow(clippy::cast_possible_truncation)]
|
||||||
|
for (i, byte) in contents.bytes().enumerate() {
|
||||||
|
if !byte.is_ascii() {
|
||||||
|
return Self::Utf8(continue_utf8_index(&contents[i..], i, line_start_offsets));
|
||||||
|
}
|
||||||
|
|
||||||
|
match byte {
|
||||||
|
// Only track one line break for `\r\n`.
|
||||||
|
b'\r' if contents.as_bytes().get(i + 1) == Some(&b'\n') => continue,
|
||||||
|
b'\n' | b'\r' => {
|
||||||
|
line_start_offsets.push((i + 1) as u32);
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Self::Ascii(AsciiIndex::new(line_start_offsets))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// SAFE because of length assertion in `Index::from(&str)`
|
||||||
|
#[allow(clippy::cast_possible_truncation)]
|
||||||
|
fn continue_utf8_index(
|
||||||
|
non_ascii_part: &str,
|
||||||
|
offset: usize,
|
||||||
|
line_start_offsets: Vec<u32>,
|
||||||
|
) -> Utf8Index {
|
||||||
|
let mut lines = line_start_offsets;
|
||||||
|
|
||||||
|
for (position, char) in non_ascii_part.char_indices() {
|
||||||
|
match char {
|
||||||
|
// Only track `\n` for `\r\n`
|
||||||
|
'\r' if non_ascii_part.as_bytes().get(position + 1) == Some(&b'\n') => continue,
|
||||||
|
'\r' | '\n' => {
|
||||||
|
let absolute_offset = offset + position + 1;
|
||||||
|
lines.push(absolute_offset as u32);
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Utf8Index::new(lines)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Index for fast [`Location`] to byte offset conversions for ASCII documents.
|
||||||
|
///
|
||||||
|
/// The index stores the byte offsets for every line. It computes the byte offset for a [`Location`]
|
||||||
|
/// by retrieving the line offset from its index and adding the column.
|
||||||
|
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||||
|
struct AsciiIndex {
|
||||||
|
line_start_byte_offsets: Vec<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AsciiIndex {
|
||||||
|
fn new(line_start_positions: Vec<u32>) -> Self {
|
||||||
|
Self {
|
||||||
|
line_start_byte_offsets: line_start_positions,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Truncate a [`Location`] to a byte offset in ASCII source code.
|
||||||
|
fn byte_offset(&self, location: Location, contents: &str) -> usize {
|
||||||
|
let index = &self.line_start_byte_offsets;
|
||||||
|
|
||||||
|
// If start-of-line position after last line
|
||||||
|
if location.row() - 1 == index.len() && location.column() == 0 {
|
||||||
|
contents.len()
|
||||||
|
} else {
|
||||||
|
let byte_offset = index[location.row() - 1] as usize + location.column();
|
||||||
|
byte_offset.min(contents.len())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Index for fast [`Location`] to byte offset conversions for UTF8 documents.
|
||||||
|
///
|
||||||
|
/// The index stores the byte offset of every line. The column offset is lazily computed by
|
||||||
|
/// adding the line start offset and then iterating to the `nth` character.
|
||||||
|
#[derive(Debug, Clone, PartialEq)]
|
||||||
|
struct Utf8Index {
|
||||||
|
line_start_byte_offsets: Vec<u32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Utf8Index {
|
||||||
|
fn new(line_byte_positions: Vec<u32>) -> Self {
|
||||||
|
Self {
|
||||||
|
line_start_byte_offsets: line_byte_positions,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Truncate a [`Location`] to a byte offset in UTF-8 source code.
|
||||||
|
fn byte_offset(&self, location: Location, contents: &str) -> usize {
|
||||||
|
let index = &self.line_start_byte_offsets;
|
||||||
|
|
||||||
|
if location.row() - 1 == index.len() && location.column() == 0 {
|
||||||
|
contents.len()
|
||||||
|
} else {
|
||||||
|
// Casting is safe because the length of utf8 characters is always between 1-4
|
||||||
|
#[allow(clippy::cast_possible_truncation)]
|
||||||
|
let line_start = if location.row() == 1 && contents.starts_with('\u{feff}') {
|
||||||
|
'\u{feff}'.len_utf8() as u32
|
||||||
|
} else {
|
||||||
|
index[location.row() - 1]
|
||||||
|
};
|
||||||
|
|
||||||
|
let rest = &contents[line_start as usize..];
|
||||||
|
|
||||||
|
let column_offset = match rest.char_indices().nth(location.column()) {
|
||||||
|
Some((offset, _)) => offset,
|
||||||
|
None => contents.len(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let offset = line_start as usize + column_offset;
|
||||||
|
offset.min(contents.len())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use crate::source_code::locator::{AsciiIndex, Index, Utf8Index};
|
||||||
use rustpython_parser::ast::Location;
|
use rustpython_parser::ast::Location;
|
||||||
|
|
||||||
use super::{index_ascii, index_utf8, truncate_ascii, truncate_utf8};
|
fn index_ascii(content: &str) -> AsciiIndex {
|
||||||
|
match Index::from(content) {
|
||||||
|
Index::Ascii(ascii) => ascii,
|
||||||
|
Index::Utf8(_) => {
|
||||||
|
panic!("Expected ASCII index")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn index_utf8(content: &str) -> Utf8Index {
|
||||||
|
match Index::from(content) {
|
||||||
|
Index::Utf8(utf8) => utf8,
|
||||||
|
Index::Ascii(_) => {
|
||||||
|
panic!("Expected UTF8 index")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn ascii_index() {
|
fn ascii_index() {
|
||||||
let contents = "";
|
let contents = "";
|
||||||
let index = index_ascii(contents);
|
let index = index_ascii(contents);
|
||||||
assert_eq!(index, [0]);
|
assert_eq!(index, AsciiIndex::new(vec![0]));
|
||||||
|
|
||||||
let contents = "x = 1";
|
let contents = "x = 1";
|
||||||
let index = index_ascii(contents);
|
let index = index_ascii(contents);
|
||||||
assert_eq!(index, [0]);
|
assert_eq!(index, AsciiIndex::new(vec![0]));
|
||||||
|
|
||||||
let contents = "x = 1\n";
|
let contents = "x = 1\n";
|
||||||
let index = index_ascii(contents);
|
let index = index_ascii(contents);
|
||||||
assert_eq!(index, [0, 6]);
|
assert_eq!(index, AsciiIndex::new(vec![0, 6]));
|
||||||
|
|
||||||
let contents = "x = 1\r\n";
|
|
||||||
let index = index_ascii(contents);
|
|
||||||
assert_eq!(index, [0, 7]);
|
|
||||||
|
|
||||||
let contents = "x = 1\ny = 2\nz = x + y\n";
|
let contents = "x = 1\ny = 2\nz = x + y\n";
|
||||||
let index = index_ascii(contents);
|
let index = index_ascii(contents);
|
||||||
assert_eq!(index, [0, 6, 12, 22]);
|
assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22]));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn ascii_truncate() {
|
fn ascii_byte_offset() {
|
||||||
let contents = "x = 1\ny = 2";
|
let contents = "x = 1\ny = 2";
|
||||||
let index = index_ascii(contents);
|
let index = index_ascii(contents);
|
||||||
|
|
||||||
// First row.
|
// First row.
|
||||||
let loc = truncate_ascii(Location::new(1, 0), &index, contents);
|
let loc = index.byte_offset(Location::new(1, 0), contents);
|
||||||
assert_eq!(loc, 0);
|
assert_eq!(loc, 0);
|
||||||
|
|
||||||
// Second row.
|
// Second row.
|
||||||
let loc = truncate_ascii(Location::new(2, 0), &index, contents);
|
let loc = index.byte_offset(Location::new(2, 0), contents);
|
||||||
assert_eq!(loc, 6);
|
assert_eq!(loc, 6);
|
||||||
|
|
||||||
// One-past-the-end.
|
// One-past-the-end.
|
||||||
let loc = truncate_ascii(Location::new(3, 0), &index, contents);
|
let loc = index.byte_offset(Location::new(3, 0), contents);
|
||||||
assert_eq!(loc, 11);
|
assert_eq!(loc, 11);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn utf8_index() {
|
fn ascii_carriage_return() {
|
||||||
let contents = "";
|
let contents = "x = 4\ry = 3";
|
||||||
let index = index_utf8(contents);
|
let index = index_ascii(contents);
|
||||||
assert_eq!(index.len(), 1);
|
assert_eq!(index, AsciiIndex::new(vec![0, 6]));
|
||||||
assert_eq!(index[0], Vec::<usize>::new());
|
|
||||||
|
|
||||||
let contents = "x = 1";
|
assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
|
||||||
let index = index_utf8(contents);
|
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 6);
|
||||||
assert_eq!(index.len(), 1);
|
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7);
|
||||||
assert_eq!(index[0], [0, 1, 2, 3, 4]);
|
|
||||||
|
|
||||||
let contents = "x = 1\n";
|
|
||||||
let index = index_utf8(contents);
|
|
||||||
assert_eq!(index.len(), 2);
|
|
||||||
assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
|
|
||||||
assert_eq!(index[1], Vec::<usize>::new());
|
|
||||||
|
|
||||||
let contents = "x = 1\r\n";
|
|
||||||
let index = index_utf8(contents);
|
|
||||||
assert_eq!(index.len(), 2);
|
|
||||||
assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
|
|
||||||
assert_eq!(index[1], Vec::<usize>::new());
|
|
||||||
|
|
||||||
let contents = "x = 1\ny = 2\nz = x + y\n";
|
|
||||||
let index = index_utf8(contents);
|
|
||||||
assert_eq!(index.len(), 4);
|
|
||||||
assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
|
|
||||||
assert_eq!(index[1], [6, 7, 8, 9, 10, 11]);
|
|
||||||
assert_eq!(index[2], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]);
|
|
||||||
assert_eq!(index[3], Vec::<usize>::new());
|
|
||||||
|
|
||||||
let contents = "# \u{4e9c}\nclass Foo:\n \"\"\".\"\"\"";
|
|
||||||
let index = index_utf8(contents);
|
|
||||||
assert_eq!(index.len(), 3);
|
|
||||||
assert_eq!(index[0], [0, 1, 2, 5]);
|
|
||||||
assert_eq!(index[1], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
|
|
||||||
assert_eq!(index[2], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn utf8_truncate() {
|
fn ascii_carriage_return_newline() {
|
||||||
|
let contents = "x = 4\r\ny = 3";
|
||||||
|
let index = index_ascii(contents);
|
||||||
|
assert_eq!(index, AsciiIndex::new(vec![0, 7]));
|
||||||
|
|
||||||
|
assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
|
||||||
|
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 7);
|
||||||
|
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Utf8Index {
|
||||||
|
fn line_count(&self) -> usize {
|
||||||
|
self.line_start_byte_offsets.len()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn utf8_index() {
|
||||||
|
let contents = "x = '🫣'";
|
||||||
|
let index = index_utf8(contents);
|
||||||
|
assert_eq!(index.line_count(), 1);
|
||||||
|
assert_eq!(index, Utf8Index::new(vec![0]));
|
||||||
|
|
||||||
|
let contents = "x = '🫣'\n";
|
||||||
|
let index = index_utf8(contents);
|
||||||
|
assert_eq!(index.line_count(), 2);
|
||||||
|
assert_eq!(index, Utf8Index::new(vec![0, 11]));
|
||||||
|
|
||||||
|
let contents = "x = '🫣'\ny = 2\nz = x + y\n";
|
||||||
|
let index = index_utf8(contents);
|
||||||
|
assert_eq!(index.line_count(), 4);
|
||||||
|
assert_eq!(index, Utf8Index::new(vec![0, 11, 17, 27]));
|
||||||
|
|
||||||
|
let contents = "# 🫣\nclass Foo:\n \"\"\".\"\"\"";
|
||||||
|
let index = index_utf8(contents);
|
||||||
|
assert_eq!(index.line_count(), 3);
|
||||||
|
assert_eq!(index, Utf8Index::new(vec![0, 7, 18]));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn utf8_carriage_return() {
|
||||||
|
let contents = "x = '🫣'\ry = 3";
|
||||||
|
let index = index_utf8(contents);
|
||||||
|
assert_eq!(index.line_count(), 2);
|
||||||
|
assert_eq!(index, Utf8Index::new(vec![0, 11]));
|
||||||
|
|
||||||
|
// Second '
|
||||||
|
assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
|
||||||
|
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 11);
|
||||||
|
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn utf8_carriage_return_newline() {
|
||||||
|
let contents = "x = '🫣'\r\ny = 3";
|
||||||
|
let index = index_utf8(contents);
|
||||||
|
assert_eq!(index.line_count(), 2);
|
||||||
|
assert_eq!(index, Utf8Index::new(vec![0, 12]));
|
||||||
|
|
||||||
|
// Second '
|
||||||
|
assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
|
||||||
|
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 12);
|
||||||
|
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 13);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn utf8_byte_offset() {
|
||||||
let contents = "x = '☃'\ny = 2";
|
let contents = "x = '☃'\ny = 2";
|
||||||
let index = index_utf8(contents);
|
let index = index_utf8(contents);
|
||||||
|
assert_eq!(index, Utf8Index::new(vec![0, 10]));
|
||||||
|
|
||||||
// First row.
|
// First row.
|
||||||
let loc = truncate_utf8(Location::new(1, 0), &index, contents);
|
let loc = index.byte_offset(Location::new(1, 0), contents);
|
||||||
assert_eq!(loc, 0);
|
assert_eq!(loc, 0);
|
||||||
|
|
||||||
let loc = truncate_utf8(Location::new(1, 5), &index, contents);
|
let loc = index.byte_offset(Location::new(1, 5), contents);
|
||||||
assert_eq!(loc, 5);
|
assert_eq!(loc, 5);
|
||||||
assert_eq!(&contents[loc..], "☃'\ny = 2");
|
assert_eq!(&contents[loc..], "☃'\ny = 2");
|
||||||
|
|
||||||
let loc = truncate_utf8(Location::new(1, 6), &index, contents);
|
let loc = index.byte_offset(Location::new(1, 6), contents);
|
||||||
assert_eq!(loc, 8);
|
assert_eq!(loc, 8);
|
||||||
assert_eq!(&contents[loc..], "'\ny = 2");
|
assert_eq!(&contents[loc..], "'\ny = 2");
|
||||||
|
|
||||||
// Second row.
|
// Second row.
|
||||||
let loc = truncate_utf8(Location::new(2, 0), &index, contents);
|
let loc = index.byte_offset(Location::new(2, 0), contents);
|
||||||
assert_eq!(loc, 10);
|
assert_eq!(loc, 10);
|
||||||
|
|
||||||
// One-past-the-end.
|
// One-past-the-end.
|
||||||
let loc = truncate_utf8(Location::new(3, 0), &index, contents);
|
let loc = index.byte_offset(Location::new(3, 0), contents);
|
||||||
assert_eq!(loc, 15);
|
assert_eq!(loc, 15);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue