perf: Optimize UTF8/ASCII byte offset index (#3439)

2025-08-19 01:51:30 +00:00 · 2023-03-11 13:12:10 +01:00 · 2023-03-11 13:12:10 +01:00 · d2988043af
commit d2988043af
parent cc8b13d3a7
1 changed files with 258 additions and 149 deletions
--- a/crates/ruff_python_ast/src/source_code/locator.rs
+++ b/crates/ruff_python_ast/src/source_code/locator.rs
@ -10,94 +10,6 @@ pub struct Locator<'a> {
    index: OnceCell<Index>,
 }

-pub enum Index {
-    Ascii(Vec<usize>),
-    Utf8(Vec<Vec<usize>>),
-}
-
-/// Compute the starting byte index of each line in ASCII source code.
-fn index_ascii(contents: &str) -> Vec<usize> {
-    let mut index = Vec::with_capacity(48);
-    index.push(0);
-    let bytes = contents.as_bytes();
-    for (i, byte) in bytes.iter().enumerate() {
-        if *byte == b'\n' {
-            index.push(i + 1);
-        }
-    }
-    index
-}
-
-/// Compute the starting byte index of each character in UTF-8 source code.
-fn index_utf8(contents: &str) -> Vec<Vec<usize>> {
-    let mut index = Vec::with_capacity(48);
-    let mut current_row = Vec::with_capacity(48);
-    let mut current_byte_offset = 0;
-    let mut previous_char = '\0';
-    for char in contents.chars() {
-        // Skip BOM.
-        if previous_char == '\0' && char == '\u{feff}' {
-            current_byte_offset += char.len_utf8();
-            continue;
-        }
-
-        current_row.push(current_byte_offset);
-        if char == '\n' {
-            if previous_char == '\r' {
-                current_row.pop();
-            }
-            index.push(current_row);
-            current_row = Vec::with_capacity(48);
-        }
-        current_byte_offset += char.len_utf8();
-        previous_char = char;
-    }
-    index.push(current_row);
-    index
-}
-
-/// Compute the starting byte index of each line in source code.
-pub fn index(contents: &str) -> Index {
-    if contents.is_ascii() {
-        Index::Ascii(index_ascii(contents))
-    } else {
-        Index::Utf8(index_utf8(contents))
-    }
-}
-
-/// Truncate a [`Location`] to a byte offset in ASCII source code.
-fn truncate_ascii(location: Location, index: &[usize], contents: &str) -> usize {
-    if location.row() - 1 == index.len() && location.column() == 0
-        || (!index.is_empty()
-            && location.row() - 1 == index.len() - 1
-            && index[location.row() - 1] + location.column() >= contents.len())
-    {
-        contents.len()
-    } else {
-        index[location.row() - 1] + location.column()
-    }
-}
-
-/// Truncate a [`Location`] to a byte offset in UTF-8 source code.
-fn truncate_utf8(location: Location, index: &[Vec<usize>], contents: &str) -> usize {
-    if (location.row() - 1 == index.len() && location.column() == 0)
-        || (location.row() - 1 == index.len() - 1
-            && location.column() == index[location.row() - 1].len())
-    {
-        contents.len()
-    } else {
-        index[location.row() - 1][location.column()]
-    }
-}
-
-/// Truncate a [`Location`] to a byte offset in source code.
-fn truncate(location: Location, index: &Index, contents: &str) -> usize {
-    match index {
-        Index::Ascii(index) => truncate_ascii(location, index, contents),
-        Index::Utf8(index) => truncate_utf8(location, index, contents),
-    }
-}
-
 impl<'a> Locator<'a> {
    pub const fn new(contents: &'a str) -> Self {
        Self {
@ -107,20 +19,20 @@ impl<'a> Locator<'a> {
    }

    fn get_or_init_index(&self) -> &Index {
-        self.index.get_or_init(|| index(self.contents))
+        self.index.get_or_init(|| Index::from(self.contents))
    }

    /// Take the source code up to the given [`Location`].
    pub fn take(&self, location: Location) -> &'a str {
        let index = self.get_or_init_index();
-        let offset = truncate(location, index, self.contents);
+        let offset = index.byte_offset(location, self.contents);
        &self.contents[..offset]
    }

    /// Take the source code after the given [`Location`].
    pub fn skip(&self, location: Location) -> &'a str {
        let index = self.get_or_init_index();
-        let offset = truncate(location, index, self.contents);
+        let offset = index.byte_offset(location, self.contents);
        &self.contents[offset..]
    }

@ -128,15 +40,15 @@ impl<'a> Locator<'a> {
    pub fn slice<R: Into<Range>>(&self, range: R) -> &'a str {
        let index = self.get_or_init_index();
        let range = range.into();
-        let start = truncate(range.location, index, self.contents);
-        let end = truncate(range.end_location, index, self.contents);
+        let start = index.byte_offset(range.location, self.contents);
+        let end = index.byte_offset(range.end_location, self.contents);
        &self.contents[start..end]
    }

    /// Return the byte offset of the given [`Location`].
    pub fn offset(&self, location: Location) -> usize {
        let index = self.get_or_init_index();
-        truncate(location, index, self.contents)
+        index.byte_offset(location, self.contents)
    }

    /// Return the underlying source code.
@ -153,116 +65,313 @@ impl<'a> Locator<'a> {
    }
 }

+/// Index for fast [`Location`] to byte offset conversions.
+#[derive(Debug, Clone)]
+enum Index {
+    /// Optimized index for an ASCII only document
+    Ascii(AsciiIndex),
+
+    /// Index for UTF8 documents
+    Utf8(Utf8Index),
+}
+
+impl Index {
+    /// Truncate a [`Location`] to a byte offset in source code.
+    fn byte_offset(&self, location: Location, contents: &str) -> usize {
+        match self {
+            Index::Ascii(ascii) => ascii.byte_offset(location, contents),
+            Index::Utf8(utf8) => utf8.byte_offset(location, contents),
+        }
+    }
+}
+
+impl From<&str> for Index {
+    fn from(contents: &str) -> Self {
+        assert!(u32::try_from(contents.len()).is_ok());
+
+        let mut line_start_offsets: Vec<u32> = Vec::with_capacity(48);
+        line_start_offsets.push(0);
+
+        // SAFE because of length assertion above
+        #[allow(clippy::cast_possible_truncation)]
+        for (i, byte) in contents.bytes().enumerate() {
+            if !byte.is_ascii() {
+                return Self::Utf8(continue_utf8_index(&contents[i..], i, line_start_offsets));
+            }
+
+            match byte {
+                // Only track one line break for `\r\n`.
+                b'\r' if contents.as_bytes().get(i + 1) == Some(&b'\n') => continue,
+                b'\n' | b'\r' => {
+                    line_start_offsets.push((i + 1) as u32);
+                }
+                _ => {}
+            }
+        }
+
+        Self::Ascii(AsciiIndex::new(line_start_offsets))
+    }
+}
+
+// SAFE because of length assertion in `Index::from(&str)`
+#[allow(clippy::cast_possible_truncation)]
+fn continue_utf8_index(
+    non_ascii_part: &str,
+    offset: usize,
+    line_start_offsets: Vec<u32>,
+) -> Utf8Index {
+    let mut lines = line_start_offsets;
+
+    for (position, char) in non_ascii_part.char_indices() {
+        match char {
+            // Only track `\n` for `\r\n`
+            '\r' if non_ascii_part.as_bytes().get(position + 1) == Some(&b'\n') => continue,
+            '\r' | '\n' => {
+                let absolute_offset = offset + position + 1;
+                lines.push(absolute_offset as u32);
+            }
+            _ => {}
+        }
+    }
+
+    Utf8Index::new(lines)
+}
+
+/// Index for fast [`Location`] to byte offset conversions for ASCII documents.
+///
+/// The index stores the byte offsets for every line. It computes the byte offset for a [`Location`]
+/// by retrieving the line offset from its index and adding the column.
+#[derive(Debug, Clone, Eq, PartialEq)]
+struct AsciiIndex {
+    line_start_byte_offsets: Vec<u32>,
+}
+
+impl AsciiIndex {
+    fn new(line_start_positions: Vec<u32>) -> Self {
+        Self {
+            line_start_byte_offsets: line_start_positions,
+        }
+    }
+
+    /// Truncate a [`Location`] to a byte offset in ASCII source code.
+    fn byte_offset(&self, location: Location, contents: &str) -> usize {
+        let index = &self.line_start_byte_offsets;
+
+        // If start-of-line position after last line
+        if location.row() - 1 == index.len() && location.column() == 0 {
+            contents.len()
+        } else {
+            let byte_offset = index[location.row() - 1] as usize + location.column();
+            byte_offset.min(contents.len())
+        }
+    }
+}
+
+/// Index for fast [`Location`] to byte offset conversions for UTF8 documents.
+///
+/// The index stores the byte offset of every line. The column offset is lazily computed by
+/// adding the line start offset and then iterating to the `nth` character.
+#[derive(Debug, Clone, PartialEq)]
+struct Utf8Index {
+    line_start_byte_offsets: Vec<u32>,
+}
+
+impl Utf8Index {
+    fn new(line_byte_positions: Vec<u32>) -> Self {
+        Self {
+            line_start_byte_offsets: line_byte_positions,
+        }
+    }
+
+    /// Truncate a [`Location`] to a byte offset in UTF-8 source code.
+    fn byte_offset(&self, location: Location, contents: &str) -> usize {
+        let index = &self.line_start_byte_offsets;
+
+        if location.row() - 1 == index.len() && location.column() == 0 {
+            contents.len()
+        } else {
+            // Casting is safe because the length of utf8 characters is always between 1-4
+            #[allow(clippy::cast_possible_truncation)]
+            let line_start = if location.row() == 1 && contents.starts_with('\u{feff}') {
+                '\u{feff}'.len_utf8() as u32
+            } else {
+                index[location.row() - 1]
+            };
+
+            let rest = &contents[line_start as usize..];
+
+            let column_offset = match rest.char_indices().nth(location.column()) {
+                Some((offset, _)) => offset,
+                None => contents.len(),
+            };
+
+            let offset = line_start as usize + column_offset;
+            offset.min(contents.len())
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
+    use crate::source_code::locator::{AsciiIndex, Index, Utf8Index};
    use rustpython_parser::ast::Location;

-    use super::{index_ascii, index_utf8, truncate_ascii, truncate_utf8};
+    fn index_ascii(content: &str) -> AsciiIndex {
+        match Index::from(content) {
+            Index::Ascii(ascii) => ascii,
+            Index::Utf8(_) => {
+                panic!("Expected ASCII index")
+            }
+        }
+    }
+
+    fn index_utf8(content: &str) -> Utf8Index {
+        match Index::from(content) {
+            Index::Utf8(utf8) => utf8,
+            Index::Ascii(_) => {
+                panic!("Expected UTF8 index")
+            }
+        }
+    }

    #[test]
    fn ascii_index() {
        let contents = "";
        let index = index_ascii(contents);
-        assert_eq!(index, [0]);
+        assert_eq!(index, AsciiIndex::new(vec![0]));

        let contents = "x = 1";
        let index = index_ascii(contents);
-        assert_eq!(index, [0]);
+        assert_eq!(index, AsciiIndex::new(vec![0]));

        let contents = "x = 1\n";
        let index = index_ascii(contents);
-        assert_eq!(index, [0, 6]);
-
-        let contents = "x = 1\r\n";
-        let index = index_ascii(contents);
-        assert_eq!(index, [0, 7]);
+        assert_eq!(index, AsciiIndex::new(vec![0, 6]));

        let contents = "x = 1\ny = 2\nz = x + y\n";
        let index = index_ascii(contents);
-        assert_eq!(index, [0, 6, 12, 22]);
+        assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22]));
    }

    #[test]
-    fn ascii_truncate() {
+    fn ascii_byte_offset() {
        let contents = "x = 1\ny = 2";
        let index = index_ascii(contents);

        // First row.
-        let loc = truncate_ascii(Location::new(1, 0), &index, contents);
+        let loc = index.byte_offset(Location::new(1, 0), contents);
        assert_eq!(loc, 0);

        // Second row.
-        let loc = truncate_ascii(Location::new(2, 0), &index, contents);
+        let loc = index.byte_offset(Location::new(2, 0), contents);
        assert_eq!(loc, 6);

        // One-past-the-end.
-        let loc = truncate_ascii(Location::new(3, 0), &index, contents);
+        let loc = index.byte_offset(Location::new(3, 0), contents);
        assert_eq!(loc, 11);
    }

    #[test]
-    fn utf8_index() {
-        let contents = "";
-        let index = index_utf8(contents);
-        assert_eq!(index.len(), 1);
-        assert_eq!(index[0], Vec::<usize>::new());
+    fn ascii_carriage_return() {
+        let contents = "x = 4\ry = 3";
+        let index = index_ascii(contents);
+        assert_eq!(index, AsciiIndex::new(vec![0, 6]));

-        let contents = "x = 1";
-        let index = index_utf8(contents);
-        assert_eq!(index.len(), 1);
-        assert_eq!(index[0], [0, 1, 2, 3, 4]);
-
-        let contents = "x = 1\n";
-        let index = index_utf8(contents);
-        assert_eq!(index.len(), 2);
-        assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
-        assert_eq!(index[1], Vec::<usize>::new());
-
-        let contents = "x = 1\r\n";
-        let index = index_utf8(contents);
-        assert_eq!(index.len(), 2);
-        assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
-        assert_eq!(index[1], Vec::<usize>::new());
-
-        let contents = "x = 1\ny = 2\nz = x + y\n";
-        let index = index_utf8(contents);
-        assert_eq!(index.len(), 4);
-        assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
-        assert_eq!(index[1], [6, 7, 8, 9, 10, 11]);
-        assert_eq!(index[2], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]);
-        assert_eq!(index[3], Vec::<usize>::new());
-
-        let contents = "# \u{4e9c}\nclass Foo:\n    \"\"\".\"\"\"";
-        let index = index_utf8(contents);
-        assert_eq!(index.len(), 3);
-        assert_eq!(index[0], [0, 1, 2, 5]);
-        assert_eq!(index[1], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
-        assert_eq!(index[2], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]);
+        assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
+        assert_eq!(index.byte_offset(Location::new(2, 0), contents), 6);
+        assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7);
    }

    #[test]
-    fn utf8_truncate() {
+    fn ascii_carriage_return_newline() {
+        let contents = "x = 4\r\ny = 3";
+        let index = index_ascii(contents);
+        assert_eq!(index, AsciiIndex::new(vec![0, 7]));
+
+        assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
+        assert_eq!(index.byte_offset(Location::new(2, 0), contents), 7);
+        assert_eq!(index.byte_offset(Location::new(2, 1), contents), 8);
+    }
+
+    impl Utf8Index {
+        fn line_count(&self) -> usize {
+            self.line_start_byte_offsets.len()
+        }
+    }
+
+    #[test]
+    fn utf8_index() {
+        let contents = "x = '🫣'";
+        let index = index_utf8(contents);
+        assert_eq!(index.line_count(), 1);
+        assert_eq!(index, Utf8Index::new(vec![0]));
+
+        let contents = "x = '🫣'\n";
+        let index = index_utf8(contents);
+        assert_eq!(index.line_count(), 2);
+        assert_eq!(index, Utf8Index::new(vec![0, 11]));
+
+        let contents = "x = '🫣'\ny = 2\nz = x + y\n";
+        let index = index_utf8(contents);
+        assert_eq!(index.line_count(), 4);
+        assert_eq!(index, Utf8Index::new(vec![0, 11, 17, 27]));
+
+        let contents = "# 🫣\nclass Foo:\n    \"\"\".\"\"\"";
+        let index = index_utf8(contents);
+        assert_eq!(index.line_count(), 3);
+        assert_eq!(index, Utf8Index::new(vec![0, 7, 18]));
+    }
+
+    #[test]
+    fn utf8_carriage_return() {
+        let contents = "x = '🫣'\ry = 3";
+        let index = index_utf8(contents);
+        assert_eq!(index.line_count(), 2);
+        assert_eq!(index, Utf8Index::new(vec![0, 11]));
+
+        // Second '
+        assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
+        assert_eq!(index.byte_offset(Location::new(2, 0), contents), 11);
+        assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12);
+    }
+
+    #[test]
+    fn utf8_carriage_return_newline() {
+        let contents = "x = '🫣'\r\ny = 3";
+        let index = index_utf8(contents);
+        assert_eq!(index.line_count(), 2);
+        assert_eq!(index, Utf8Index::new(vec![0, 12]));
+
+        // Second '
+        assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
+        assert_eq!(index.byte_offset(Location::new(2, 0), contents), 12);
+        assert_eq!(index.byte_offset(Location::new(2, 1), contents), 13);
+    }
+
+    #[test]
+    fn utf8_byte_offset() {
        let contents = "x = '☃'\ny = 2";
        let index = index_utf8(contents);
+        assert_eq!(index, Utf8Index::new(vec![0, 10]));

        // First row.
-        let loc = truncate_utf8(Location::new(1, 0), &index, contents);
+        let loc = index.byte_offset(Location::new(1, 0), contents);
        assert_eq!(loc, 0);

-        let loc = truncate_utf8(Location::new(1, 5), &index, contents);
+        let loc = index.byte_offset(Location::new(1, 5), contents);
        assert_eq!(loc, 5);
        assert_eq!(&contents[loc..], "☃'\ny = 2");

-        let loc = truncate_utf8(Location::new(1, 6), &index, contents);
+        let loc = index.byte_offset(Location::new(1, 6), contents);
        assert_eq!(loc, 8);
        assert_eq!(&contents[loc..], "'\ny = 2");

        // Second row.
-        let loc = truncate_utf8(Location::new(2, 0), &index, contents);
+        let loc = index.byte_offset(Location::new(2, 0), contents);
        assert_eq!(loc, 10);

        // One-past-the-end.
-        let loc = truncate_utf8(Location::new(3, 0), &index, contents);
+        let loc = index.byte_offset(Location::new(3, 0), contents);
        assert_eq!(loc, 15);
    }
 }