Cheap cloneable LineIndex (#3896)

This commit is contained in:
Micha Reiser 2023-04-11 09:33:40 +02:00 committed by GitHub
parent 9209e57c5a
commit 76c47a9a43
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 465 additions and 335 deletions

View file

@ -1,13 +1,15 @@
//! Struct used to efficiently slice source code at (row, column) Locations.
use crate::source_code::line_index::LineIndex;
use once_cell::unsync::OnceCell;
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::ast::Location;
use crate::types::Range;
pub struct Locator<'a> {
contents: &'a str,
index: OnceCell<Index>,
index: OnceCell<LineIndex>,
}
impl<'a> Locator<'a> {
@ -18,37 +20,38 @@ impl<'a> Locator<'a> {
}
}
fn get_or_init_index(&self) -> &Index {
self.index.get_or_init(|| Index::from(self.contents))
fn get_or_init_index(&self) -> &LineIndex {
self.index
.get_or_init(|| LineIndex::from_source_text(self.contents))
}
/// Take the source code up to the given [`Location`].
pub fn take(&self, location: Location) -> &'a str {
let index = self.get_or_init_index();
let offset = index.byte_offset(location, self.contents);
&self.contents[..offset]
let offset = index.location_offset(location, self.contents);
&self.contents[TextRange::up_to(offset)]
}
/// Take the source code after the given [`Location`].
pub fn skip(&self, location: Location) -> &'a str {
let index = self.get_or_init_index();
let offset = index.byte_offset(location, self.contents);
&self.contents[offset..]
let offset = index.location_offset(location, self.contents);
&self.contents[usize::from(offset)..]
}
/// Take the source code between the given [`Range`].
pub fn slice<R: Into<Range>>(&self, range: R) -> &'a str {
let index = self.get_or_init_index();
let range = range.into();
let start = index.byte_offset(range.location, self.contents);
let end = index.byte_offset(range.end_location, self.contents);
&self.contents[start..end]
let start = index.location_offset(range.location, self.contents);
let end = index.location_offset(range.end_location, self.contents);
&self.contents[TextRange::new(start, end)]
}
/// Return the byte offset of the given [`Location`].
pub fn offset(&self, location: Location) -> usize {
pub fn offset(&self, location: Location) -> TextSize {
let index = self.get_or_init_index();
index.byte_offset(location, self.contents)
index.location_offset(location, self.contents)
}
/// Return the underlying source code.
@ -59,7 +62,7 @@ impl<'a> Locator<'a> {
/// Return the number of lines in the source code.
pub fn count_lines(&self) -> usize {
let index = self.get_or_init_index();
index.count_lines()
index.lines_count()
}
/// Return the number of bytes in the source code.
@ -72,302 +75,3 @@ impl<'a> Locator<'a> {
self.contents.is_empty()
}
}
/// Index for fast [`Location`] to byte offset conversions.
#[derive(Debug, Clone)]
enum Index {
/// Optimized index for an ASCII only document
Ascii(AsciiIndex),
/// Index for UTF8 documents
Utf8(Utf8Index),
}
impl Index {
/// Truncate a [`Location`] to a byte offset in source code.
fn byte_offset(&self, location: Location, contents: &str) -> usize {
match self {
Index::Ascii(ascii) => ascii.byte_offset(location, contents),
Index::Utf8(utf8) => utf8.byte_offset(location, contents),
}
}
/// Return the number of lines in the source code.
fn count_lines(&self) -> usize {
match self {
Index::Ascii(ascii) => ascii.line_start_byte_offsets.len(),
Index::Utf8(utf8) => utf8.line_start_byte_offsets.len(),
}
}
}
impl From<&str> for Index {
fn from(contents: &str) -> Self {
assert!(u32::try_from(contents.len()).is_ok());
let mut line_start_offsets: Vec<u32> = Vec::with_capacity(48);
line_start_offsets.push(0);
let mut utf8 = false;
// SAFE because of length assertion above
#[allow(clippy::cast_possible_truncation)]
for (i, byte) in contents.bytes().enumerate() {
utf8 |= !byte.is_ascii();
match byte {
// Only track one line break for `\r\n`.
b'\r' if contents.as_bytes().get(i + 1) == Some(&b'\n') => continue,
b'\n' | b'\r' => {
line_start_offsets.push((i + 1) as u32);
}
_ => {}
}
}
if utf8 {
Self::Utf8(Utf8Index::new(line_start_offsets))
} else {
Self::Ascii(AsciiIndex::new(line_start_offsets))
}
}
}
/// Index for fast [`Location`] to byte offset conversions for ASCII documents.
///
/// The index stores the byte offsets for every line. It computes the byte offset for a [`Location`]
/// by retrieving the line offset from its index and adding the column.
#[derive(Debug, Clone, Eq, PartialEq)]
struct AsciiIndex {
line_start_byte_offsets: Vec<u32>,
}
impl AsciiIndex {
fn new(line_start_positions: Vec<u32>) -> Self {
Self {
line_start_byte_offsets: line_start_positions,
}
}
/// Truncate a [`Location`] to a byte offset in ASCII source code.
fn byte_offset(&self, location: Location, contents: &str) -> usize {
let index = &self.line_start_byte_offsets;
// If start-of-line position after last line
if location.row() - 1 == index.len() && location.column() == 0 {
contents.len()
} else {
let byte_offset = index[location.row() - 1] as usize + location.column();
byte_offset.min(contents.len())
}
}
}
/// Index for fast [`Location`] to byte offset conversions for UTF8 documents.
///
/// The index stores the byte offset of every line. The column offset is lazily computed by
/// adding the line start offset and then iterating to the `nth` character.
#[derive(Debug, Clone, PartialEq)]
struct Utf8Index {
line_start_byte_offsets: Vec<u32>,
}
impl Utf8Index {
fn new(line_byte_positions: Vec<u32>) -> Self {
Self {
line_start_byte_offsets: line_byte_positions,
}
}
/// Truncate a [`Location`] to a byte offset in UTF-8 source code.
fn byte_offset(&self, location: Location, contents: &str) -> usize {
let index = &self.line_start_byte_offsets;
if location.row() - 1 == index.len() && location.column() == 0 {
contents.len()
} else {
// Casting is safe because the length of utf8 characters is always between 1-4
#[allow(clippy::cast_possible_truncation)]
let line_start = if location.row() == 1 && contents.starts_with('\u{feff}') {
'\u{feff}'.len_utf8() as u32
} else {
index[location.row() - 1]
};
let rest = &contents[line_start as usize..];
let column_offset = match rest.char_indices().nth(location.column()) {
Some((offset, _)) => offset,
None => contents.len(),
};
let offset = line_start as usize + column_offset;
offset.min(contents.len())
}
}
}
#[cfg(test)]
mod tests {
use rustpython_parser::ast::Location;
use crate::source_code::locator::{AsciiIndex, Index, Utf8Index};
fn index_ascii(content: &str) -> AsciiIndex {
match Index::from(content) {
Index::Ascii(ascii) => ascii,
Index::Utf8(_) => {
panic!("Expected ASCII index")
}
}
}
fn index_utf8(content: &str) -> Utf8Index {
match Index::from(content) {
Index::Utf8(utf8) => utf8,
Index::Ascii(_) => {
panic!("Expected UTF8 index")
}
}
}
#[test]
fn ascii_index() {
let contents = "";
let index = index_ascii(contents);
assert_eq!(index, AsciiIndex::new(vec![0]));
let contents = "x = 1";
let index = index_ascii(contents);
assert_eq!(index, AsciiIndex::new(vec![0]));
let contents = "x = 1\n";
let index = index_ascii(contents);
assert_eq!(index, AsciiIndex::new(vec![0, 6]));
let contents = "x = 1\ny = 2\nz = x + y\n";
let index = index_ascii(contents);
assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22]));
}
#[test]
fn ascii_byte_offset() {
let contents = "x = 1\ny = 2";
let index = index_ascii(contents);
// First row.
let loc = index.byte_offset(Location::new(1, 0), contents);
assert_eq!(loc, 0);
// Second row.
let loc = index.byte_offset(Location::new(2, 0), contents);
assert_eq!(loc, 6);
// One-past-the-end.
let loc = index.byte_offset(Location::new(3, 0), contents);
assert_eq!(loc, 11);
}
#[test]
fn ascii_carriage_return() {
let contents = "x = 4\ry = 3";
let index = index_ascii(contents);
assert_eq!(index, AsciiIndex::new(vec![0, 6]));
assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 6);
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7);
}
#[test]
fn ascii_carriage_return_newline() {
let contents = "x = 4\r\ny = 3";
let index = index_ascii(contents);
assert_eq!(index, AsciiIndex::new(vec![0, 7]));
assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 7);
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 8);
}
impl Utf8Index {
fn line_count(&self) -> usize {
self.line_start_byte_offsets.len()
}
}
#[test]
fn utf8_index() {
let contents = "x = '🫣'";
let index = index_utf8(contents);
assert_eq!(index.line_count(), 1);
assert_eq!(index, Utf8Index::new(vec![0]));
let contents = "x = '🫣'\n";
let index = index_utf8(contents);
assert_eq!(index.line_count(), 2);
assert_eq!(index, Utf8Index::new(vec![0, 11]));
let contents = "x = '🫣'\ny = 2\nz = x + y\n";
let index = index_utf8(contents);
assert_eq!(index.line_count(), 4);
assert_eq!(index, Utf8Index::new(vec![0, 11, 17, 27]));
let contents = "# 🫣\nclass Foo:\n \"\"\".\"\"\"";
let index = index_utf8(contents);
assert_eq!(index.line_count(), 3);
assert_eq!(index, Utf8Index::new(vec![0, 7, 18]));
}
#[test]
fn utf8_carriage_return() {
let contents = "x = '🫣'\ry = 3";
let index = index_utf8(contents);
assert_eq!(index.line_count(), 2);
assert_eq!(index, Utf8Index::new(vec![0, 11]));
// Second '
assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 11);
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12);
}
#[test]
fn utf8_carriage_return_newline() {
let contents = "x = '🫣'\r\ny = 3";
let index = index_utf8(contents);
assert_eq!(index.line_count(), 2);
assert_eq!(index, Utf8Index::new(vec![0, 12]));
// Second '
assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 12);
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 13);
}
#[test]
fn utf8_byte_offset() {
let contents = "x = '☃'\ny = 2";
let index = index_utf8(contents);
assert_eq!(index, Utf8Index::new(vec![0, 10]));
// First row.
let loc = index.byte_offset(Location::new(1, 0), contents);
assert_eq!(loc, 0);
let loc = index.byte_offset(Location::new(1, 5), contents);
assert_eq!(loc, 5);
assert_eq!(&contents[loc..], "☃'\ny = 2");
let loc = index.byte_offset(Location::new(1, 6), contents);
assert_eq!(loc, 8);
assert_eq!(&contents[loc..], "'\ny = 2");
// Second row.
let loc = index.byte_offset(Location::new(2, 0), contents);
assert_eq!(loc, 10);
// One-past-the-end.
let loc = index.byte_offset(Location::new(3, 0), contents);
assert_eq!(loc, 15);
}
}