Cheap cloneable LineIndex (#3896)

2025-11-26 14:42:58 +00:00 · 2023-04-11 09:33:40 +02:00 · 2023-04-11 09:33:40 +02:00 · 76c47a9a43
commit 76c47a9a43
parent 9209e57c5a
10 changed files with 465 additions and 335 deletions
--- a/crates/ruff_python_ast/Cargo.toml
+++ b/crates/ruff_python_ast/Cargo.toml
@ -9,6 +9,7 @@ rust-version = { workspace = true }

 [dependencies]
 ruff_rustpython = { path = "../ruff_rustpython" }
+ruff_text_size = { path = "../ruff_text_size" }

 anyhow = { workspace = true }
 bitflags = { workspace = true }
--- a/crates/ruff_python_ast/src/source_code/line_index.rs
+++ b/crates/ruff_python_ast/src/source_code/line_index.rs
@ -0,0 +1,418 @@
+use ruff_text_size::{TextLen, TextRange, TextSize};
+use rustpython_parser::ast::Location;
+use std::fmt;
+use std::fmt::{Debug, Formatter};
+use std::num::NonZeroUsize;
+use std::ops::Deref;
+use std::sync::Arc;
+
+/// Index for fast [`Location`] to [byte offset](TextSize) conversions.
+///
+/// Cloning a [`LineIndex`] is cheap because it only requires bumping a reference count.
+#[derive(Clone)]
+pub struct LineIndex {
+    inner: Arc<LineIndexInner>,
+}
+
+struct LineIndexInner {
+    line_starts: Vec<TextSize>,
+    kind: IndexKind,
+}
+
+impl LineIndex {
+    /// Builds the [`LineIndex`] from the source text of a file.
+    ///
+    /// Records the byte offset at which every line starts. `\n`, a lone `\r`, and the
+    /// pair `\r\n` (counted as a single break) each terminate a line. The first line
+    /// always starts at offset `0`, so the index is never empty.
+    ///
+    /// While scanning, the text is also classified as ASCII-only or UTF-8, which decides
+    /// how columns are later turned into byte offsets.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the length of `text` in bytes does not fit into a `u32`.
+    pub fn from_source_text(text: &str) -> Self {
+        assert!(u32::try_from(text.len()).is_ok());
+
+        // NOTE(review): `88` looks like an assumed average line length used to size the
+        // initial allocation - confirm.
+        let mut line_starts: Vec<TextSize> = Vec::with_capacity(text.len() / 88);
+        line_starts.push(TextSize::default());
+
+        let bytes = text.as_bytes();
+        let mut utf8 = false;
+
+        for (i, byte) in bytes.iter().enumerate() {
+            // A single non-ASCII byte is enough to mark the whole document as UTF-8.
+            utf8 |= !byte.is_ascii();
+
+            match byte {
+                // Only track one line break for `\r\n`.
+                b'\r' if bytes.get(i + 1) == Some(&b'\n') => continue,
+                b'\n' | b'\r' => {
+                    // The next line starts right after the line break. The conversion cannot
+                    // fail because of the length assertion above.
+                    line_starts.push(TextSize::try_from(i + 1).unwrap());
+                }
+                _ => {}
+            }
+        }
+
+        let kind = if utf8 {
+            IndexKind::Utf8
+        } else {
+            IndexKind::Ascii
+        };
+
+        Self {
+            inner: Arc::new(LineIndexInner { line_starts, kind }),
+        }
+    }
+
+    fn kind(&self) -> IndexKind {
+        self.inner.kind
+    }
+
+    /// Converts a [`Location`] to its [byte offset](TextSize) in the source code.
+    ///
+    /// `location.row()` is one-based and `location.column()` is a zero-based character
+    /// count from the start of that row (not counting a leading byte order mark on the
+    /// first row). A column that points past the end of the row is clamped to the end of
+    /// the row's range, so the result never exceeds the length of `contents` as long as
+    /// the index was built from `contents`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `location.row()` is `0` or lies more than one row past the last line.
+    pub fn location_offset(&self, location: Location, contents: &str) -> TextSize {
+        let line_index = OneIndexed::new(location.row()).unwrap();
+        let line_range = self.line_range(line_index, contents);
+
+        let column_offset = match self.kind() {
+            // Every character is one byte wide, so the column already is the byte offset.
+            // Clamp it to the row: an out-of-range column must not yield an offset inside
+            // a later row or beyond the end of `contents`.
+            IndexKind::Ascii => TextSize::try_from(location.column())
+                .unwrap()
+                .min(line_range.len()),
+            IndexKind::Utf8 => {
+                let line = &contents[line_range];
+
+                // Skip the bom character
+                let bom_len =
+                    usize::from(line_index.to_zero_indexed() == 0 && line.starts_with('\u{feff}'));
+
+                // Walk the characters to find the byte offset of the requested column;
+                // fall back to the end of the row if the column is out of range.
+                match line.char_indices().nth(location.column() + bom_len) {
+                    Some((offset, _)) => TextSize::try_from(offset).unwrap(),
+                    None => line_range.len(),
+                }
+            }
+        };
+
+        line_range.start() + column_offset
+    }
+
+    /// Return the number of lines in the source code.
+    pub(crate) fn lines_count(&self) -> usize {
+        self.line_starts().len()
+    }
+
+    /// Returns the [byte offset](TextSize) for the `line` with the given index.
+    ///
+    /// For the line directly after the last line, the length of `contents` is returned.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `line` lies more than one line past the last line of the index.
+    fn line_start(&self, line: OneIndexed, contents: &str) -> TextSize {
+        let row_index = line.to_zero_indexed();
+        let starts = self.line_starts();
+
+        // If start-of-line position after last line
+        if row_index == starts.len() {
+            contents.text_len()
+        } else {
+            starts[row_index]
+        }
+    }
+
+    /// Returns the [`TextRange`] of the `line` with the given index.
+    ///
+    /// The range starts at the [byte offset](TextSize) of the line's first character and
+    /// ends where the next line starts, so it includes the line break terminating the
+    /// line (if any). For the line directly after the last line, an empty range
+    /// positioned at the end of `contents` is returned.
+    fn line_range(&self, line: OneIndexed, contents: &str) -> TextRange {
+        let starts = self.line_starts();
+
+        // One-past-the-end line: there is no text, only the end-of-file position.
+        if starts.len() == line.to_zero_indexed() {
+            TextRange::empty(contents.text_len())
+        } else {
+            TextRange::new(
+                self.line_start(line, contents),
+                self.line_start(line.saturating_add(1), contents),
+            )
+        }
+    }
+
+    /// Returns the [byte offsets](TextSize) for every line
+    pub fn line_starts(&self) -> &[TextSize] {
+        &self.inner.line_starts
+    }
+}
+
+impl Deref for LineIndex {
+    type Target = [TextSize];
+
+    fn deref(&self) -> &Self::Target {
+        self.line_starts()
+    }
+}
+
+impl Debug for LineIndex {
+    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
+        f.debug_list().entries(self.line_starts()).finish()
+    }
+}
+
+#[derive(Debug, Clone, Copy)]
+enum IndexKind {
+    /// Optimized index for an ASCII only document
+    Ascii,
+
+    /// Index for UTF8 documents
+    Utf8,
+}
+
+/// Type-safe wrapper for a value whose logical range starts at `1`, for
+/// instance the line or column numbers in a file
+///
+/// Internally this is represented as a [`NonZeroUsize`], this enables some
+/// memory optimizations
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct OneIndexed(NonZeroUsize);
+
+const ONE: NonZeroUsize = unwrap(NonZeroUsize::new(1));
+
+impl OneIndexed {
+    // SAFETY: These constants are being initialized with non-zero values
+    /// The smallest value that can be represented by this integer type.
+    pub const MIN: Self = unwrap(Self::new(1));
+    /// The largest value that can be represented by this integer type
+    pub const MAX: Self = unwrap(Self::new(usize::MAX));
+
+    /// Creates a [`OneIndexed`] from a one-based `value`.
+    ///
+    /// Returns `None` if `value` is `0`.
+    pub const fn new(value: usize) -> Option<Self> {
+        match NonZeroUsize::new(value) {
+            Some(value) => Some(Self(value)),
+            None => None,
+        }
+    }
+
+    /// Construct a new [`OneIndexed`] from a zero-indexed value
+    pub const fn from_zero_indexed(value: usize) -> Self {
+        Self(ONE.saturating_add(value))
+    }
+
+    /// Return the zero-indexed primitive value for this [`OneIndexed`]
+    pub const fn to_zero_indexed(self) -> usize {
+        self.0.get() - 1
+    }
+
+    /// Saturating integer addition. Computes `self + rhs`, saturating at
+    /// the numeric bounds instead of overflowing.
+    #[must_use]
+    pub const fn saturating_add(self, rhs: usize) -> Self {
+        match NonZeroUsize::new(self.0.get().saturating_add(rhs)) {
+            Some(value) => Self(value),
+            None => Self::MAX,
+        }
+    }
+
+    /// Saturating integer subtraction. Computes `self - rhs`, saturating
+    /// at the numeric bounds instead of overflowing.
+    #[must_use]
+    pub const fn saturating_sub(self, rhs: usize) -> Self {
+        match NonZeroUsize::new(self.0.get().saturating_sub(rhs)) {
+            Some(value) => Self(value),
+            None => Self::MIN,
+        }
+    }
+}
+
+impl std::fmt::Display for OneIndexed {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Debug::fmt(&self.0.get(), f)
+    }
+}
+
+/// A const `Option::unwrap` without nightly features:
+/// [Tracking issue](https://github.com/rust-lang/rust/issues/67441)
+const fn unwrap<T: Copy>(option: Option<T>) -> T {
+    match option {
+        Some(value) => value,
+        None => panic!("unwrapping None"),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::source_code::line_index::LineIndex;
+    use ruff_text_size::TextSize;
+    use rustpython_parser::ast::Location;
+
+    #[test]
+    fn ascii_index() {
+        let index = LineIndex::from_source_text("");
+        assert_eq!(index.line_starts(), &[TextSize::from(0)]);
+
+        let index = LineIndex::from_source_text("x = 1");
+        assert_eq!(index.line_starts(), &[TextSize::from(0)]);
+
+        let index = LineIndex::from_source_text("x = 1\n");
+        assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(6)]);
+
+        let index = LineIndex::from_source_text("x = 1\ny = 2\nz = x + y\n");
+        assert_eq!(
+            index.line_starts(),
+            &[
+                TextSize::from(0),
+                TextSize::from(6),
+                TextSize::from(12),
+                TextSize::from(22)
+            ]
+        );
+    }
+
+    #[test]
+    fn ascii_byte_offset() {
+        let contents = "x = 1\ny = 2";
+        let index = LineIndex::from_source_text(contents);
+
+        // First row.
+        let loc = index.location_offset(Location::new(1, 0), contents);
+        assert_eq!(loc, TextSize::from(0));
+
+        // Second row.
+        let loc = index.location_offset(Location::new(2, 0), contents);
+        assert_eq!(loc, TextSize::from(6));
+
+        // One-past-the-end.
+        let loc = index.location_offset(Location::new(3, 0), contents);
+        assert_eq!(loc, TextSize::from(11));
+    }
+
+    #[test]
+    fn ascii_carriage_return() {
+        let contents = "x = 4\ry = 3";
+        let index = LineIndex::from_source_text(contents);
+        assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(6)]);
+
+        assert_eq!(
+            index.location_offset(Location::new(1, 4), contents),
+            TextSize::from(4)
+        );
+        assert_eq!(
+            index.location_offset(Location::new(2, 0), contents),
+            TextSize::from(6)
+        );
+        assert_eq!(
+            index.location_offset(Location::new(2, 1), contents),
+            TextSize::from(7)
+        );
+    }
+
+    #[test]
+    fn ascii_carriage_return_newline() {
+        let contents = "x = 4\r\ny = 3";
+        let index = LineIndex::from_source_text(contents);
+        assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(7)]);
+
+        assert_eq!(
+            index.location_offset(Location::new(1, 4), contents),
+            TextSize::from(4)
+        );
+        assert_eq!(
+            index.location_offset(Location::new(2, 0), contents),
+            TextSize::from(7)
+        );
+        assert_eq!(
+            index.location_offset(Location::new(2, 1), contents),
+            TextSize::from(8)
+        );
+    }
+
+    #[test]
+    fn utf8_index() {
+        let index = LineIndex::from_source_text("x = '🫣'");
+        assert_eq!(index.lines_count(), 1);
+        assert_eq!(index.line_starts(), &[TextSize::from(0)]);
+
+        let index = LineIndex::from_source_text("x = '🫣'\n");
+        assert_eq!(index.lines_count(), 2);
+        assert_eq!(
+            index.line_starts(),
+            &[TextSize::from(0), TextSize::from(11)]
+        );
+
+        let index = LineIndex::from_source_text("x = '🫣'\ny = 2\nz = x + y\n");
+        assert_eq!(index.lines_count(), 4);
+        assert_eq!(
+            index.line_starts(),
+            &[
+                TextSize::from(0),
+                TextSize::from(11),
+                TextSize::from(17),
+                TextSize::from(27)
+            ]
+        );
+
+        let index = LineIndex::from_source_text("# 🫣\nclass Foo:\n    \"\"\".\"\"\"");
+        assert_eq!(index.lines_count(), 3);
+        assert_eq!(
+            index.line_starts(),
+            &[TextSize::from(0), TextSize::from(7), TextSize::from(18)]
+        );
+    }
+
+    #[test]
+    fn utf8_carriage_return() {
+        let contents = "x = '🫣'\ry = 3";
+        let index = LineIndex::from_source_text(contents);
+        assert_eq!(index.lines_count(), 2);
+        assert_eq!(
+            index.line_starts(),
+            &[TextSize::from(0), TextSize::from(11)]
+        );
+
+        // Second '
+        assert_eq!(
+            index.location_offset(Location::new(1, 6), contents),
+            TextSize::from(9)
+        );
+        assert_eq!(
+            index.location_offset(Location::new(2, 0), contents),
+            TextSize::from(11)
+        );
+        assert_eq!(
+            index.location_offset(Location::new(2, 1), contents),
+            TextSize::from(12)
+        );
+    }
+
+    #[test]
+    fn utf8_carriage_return_newline() {
+        let contents = "x = '🫣'\r\ny = 3";
+        let index = LineIndex::from_source_text(contents);
+        assert_eq!(index.lines_count(), 2);
+        assert_eq!(
+            index.line_starts(),
+            &[TextSize::from(0), TextSize::from(12)]
+        );
+
+        // Second '
+        assert_eq!(
+            index.location_offset(Location::new(1, 6), contents),
+            TextSize::from(9)
+        );
+        assert_eq!(
+            index.location_offset(Location::new(2, 0), contents),
+            TextSize::from(12)
+        );
+        assert_eq!(
+            index.location_offset(Location::new(2, 1), contents),
+            TextSize::from(13)
+        );
+    }
+
+    #[test]
+    fn utf8_byte_offset() {
+        let contents = "x = '☃'\ny = 2";
+        let index = LineIndex::from_source_text(contents);
+        assert_eq!(
+            index.line_starts(),
+            &[TextSize::from(0), TextSize::from(10)]
+        );
+
+        // First row.
+        let loc = index.location_offset(Location::new(1, 0), contents);
+        assert_eq!(loc, TextSize::from(0));
+
+        let loc = index.location_offset(Location::new(1, 5), contents);
+        assert_eq!(loc, TextSize::from(5));
+        assert_eq!(&"x = '☃'\ny = 2"[usize::from(loc)..], "☃'\ny = 2");
+
+        let loc = index.location_offset(Location::new(1, 6), contents);
+        assert_eq!(loc, TextSize::from(8));
+        assert_eq!(&"x = '☃'\ny = 2"[usize::from(loc)..], "'\ny = 2");
+
+        // Second row.
+        let loc = index.location_offset(Location::new(2, 0), contents);
+        assert_eq!(loc, TextSize::from(10));
+
+        // One-past-the-end.
+        let loc = index.location_offset(Location::new(3, 0), contents);
+        assert_eq!(loc, TextSize::from(15));
+    }
+}
--- a/crates/ruff_python_ast/src/source_code/locator.rs
+++ b/crates/ruff_python_ast/src/source_code/locator.rs
@ -1,13 +1,15 @@
 //! Struct used to efficiently slice source code at (row, column) Locations.

+use crate::source_code::line_index::LineIndex;
 use once_cell::unsync::OnceCell;
+use ruff_text_size::{TextRange, TextSize};
 use rustpython_parser::ast::Location;

 use crate::types::Range;

 pub struct Locator<'a> {
    contents: &'a str,
-    index: OnceCell<Index>,
+    index: OnceCell<LineIndex>,
 }

 impl<'a> Locator<'a> {
@ -18,37 +20,38 @@ impl<'a> Locator<'a> {
        }
    }

-    fn get_or_init_index(&self) -> &Index {
-        self.index.get_or_init(|| Index::from(self.contents))
+    fn get_or_init_index(&self) -> &LineIndex {
+        self.index
+            .get_or_init(|| LineIndex::from_source_text(self.contents))
    }

    /// Take the source code up to the given [`Location`].
    pub fn take(&self, location: Location) -> &'a str {
        let index = self.get_or_init_index();
-        let offset = index.byte_offset(location, self.contents);
-        &self.contents[..offset]
+        let offset = index.location_offset(location, self.contents);
+        &self.contents[TextRange::up_to(offset)]
    }

    /// Take the source code after the given [`Location`].
    pub fn skip(&self, location: Location) -> &'a str {
        let index = self.get_or_init_index();
-        let offset = index.byte_offset(location, self.contents);
-        &self.contents[offset..]
+        let offset = index.location_offset(location, self.contents);
+        &self.contents[usize::from(offset)..]
    }

    /// Take the source code between the given [`Range`].
    pub fn slice<R: Into<Range>>(&self, range: R) -> &'a str {
        let index = self.get_or_init_index();
        let range = range.into();
-        let start = index.byte_offset(range.location, self.contents);
-        let end = index.byte_offset(range.end_location, self.contents);
-        &self.contents[start..end]
+        let start = index.location_offset(range.location, self.contents);
+        let end = index.location_offset(range.end_location, self.contents);
+        &self.contents[TextRange::new(start, end)]
    }

    /// Return the byte offset of the given [`Location`].
-    pub fn offset(&self, location: Location) -> usize {
+    pub fn offset(&self, location: Location) -> TextSize {
        let index = self.get_or_init_index();
-        index.byte_offset(location, self.contents)
+        index.location_offset(location, self.contents)
    }

    /// Return the underlying source code.
@ -59,7 +62,7 @@ impl<'a> Locator<'a> {
    /// Return the number of lines in the source code.
    pub fn count_lines(&self) -> usize {
        let index = self.get_or_init_index();
-        index.count_lines()
+        index.lines_count()
    }

    /// Return the number of bytes in the source code.
@ -72,302 +75,3 @@ impl<'a> Locator<'a> {
        self.contents.is_empty()
    }
 }
-
-/// Index for fast [`Location`] to byte offset conversions.
-#[derive(Debug, Clone)]
-enum Index {
-    /// Optimized index for an ASCII only document
-    Ascii(AsciiIndex),
-
-    /// Index for UTF8 documents
-    Utf8(Utf8Index),
-}
-
-impl Index {
-    /// Truncate a [`Location`] to a byte offset in source code.
-    fn byte_offset(&self, location: Location, contents: &str) -> usize {
-        match self {
-            Index::Ascii(ascii) => ascii.byte_offset(location, contents),
-            Index::Utf8(utf8) => utf8.byte_offset(location, contents),
-        }
-    }
-
-    /// Return the number of lines in the source code.
-    fn count_lines(&self) -> usize {
-        match self {
-            Index::Ascii(ascii) => ascii.line_start_byte_offsets.len(),
-            Index::Utf8(utf8) => utf8.line_start_byte_offsets.len(),
-        }
-    }
-}
-
-impl From<&str> for Index {
-    fn from(contents: &str) -> Self {
-        assert!(u32::try_from(contents.len()).is_ok());
-
-        let mut line_start_offsets: Vec<u32> = Vec::with_capacity(48);
-        line_start_offsets.push(0);
-        let mut utf8 = false;
-
-        // SAFE because of length assertion above
-        #[allow(clippy::cast_possible_truncation)]
-        for (i, byte) in contents.bytes().enumerate() {
-            utf8 |= !byte.is_ascii();
-
-            match byte {
-                // Only track one line break for `\r\n`.
-                b'\r' if contents.as_bytes().get(i + 1) == Some(&b'\n') => continue,
-                b'\n' | b'\r' => {
-                    line_start_offsets.push((i + 1) as u32);
-                }
-                _ => {}
-            }
-        }
-
-        if utf8 {
-            Self::Utf8(Utf8Index::new(line_start_offsets))
-        } else {
-            Self::Ascii(AsciiIndex::new(line_start_offsets))
-        }
-    }
-}
-
-/// Index for fast [`Location`] to byte offset conversions for ASCII documents.
-///
-/// The index stores the byte offsets for every line. It computes the byte offset for a [`Location`]
-/// by retrieving the line offset from its index and adding the column.
-#[derive(Debug, Clone, Eq, PartialEq)]
-struct AsciiIndex {
-    line_start_byte_offsets: Vec<u32>,
-}
-
-impl AsciiIndex {
-    fn new(line_start_positions: Vec<u32>) -> Self {
-        Self {
-            line_start_byte_offsets: line_start_positions,
-        }
-    }
-
-    /// Truncate a [`Location`] to a byte offset in ASCII source code.
-    fn byte_offset(&self, location: Location, contents: &str) -> usize {
-        let index = &self.line_start_byte_offsets;
-
-        // If start-of-line position after last line
-        if location.row() - 1 == index.len() && location.column() == 0 {
-            contents.len()
-        } else {
-            let byte_offset = index[location.row() - 1] as usize + location.column();
-            byte_offset.min(contents.len())
-        }
-    }
-}
-
-/// Index for fast [`Location`] to byte offset conversions for UTF8 documents.
-///
-/// The index stores the byte offset of every line. The column offset is lazily computed by
-/// adding the line start offset and then iterating to the `nth` character.
-#[derive(Debug, Clone, PartialEq)]
-struct Utf8Index {
-    line_start_byte_offsets: Vec<u32>,
-}
-
-impl Utf8Index {
-    fn new(line_byte_positions: Vec<u32>) -> Self {
-        Self {
-            line_start_byte_offsets: line_byte_positions,
-        }
-    }
-
-    /// Truncate a [`Location`] to a byte offset in UTF-8 source code.
-    fn byte_offset(&self, location: Location, contents: &str) -> usize {
-        let index = &self.line_start_byte_offsets;
-
-        if location.row() - 1 == index.len() && location.column() == 0 {
-            contents.len()
-        } else {
-            // Casting is safe because the length of utf8 characters is always between 1-4
-            #[allow(clippy::cast_possible_truncation)]
-            let line_start = if location.row() == 1 && contents.starts_with('\u{feff}') {
-                '\u{feff}'.len_utf8() as u32
-            } else {
-                index[location.row() - 1]
-            };
-
-            let rest = &contents[line_start as usize..];
-
-            let column_offset = match rest.char_indices().nth(location.column()) {
-                Some((offset, _)) => offset,
-                None => contents.len(),
-            };
-
-            let offset = line_start as usize + column_offset;
-            offset.min(contents.len())
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use rustpython_parser::ast::Location;
-
-    use crate::source_code::locator::{AsciiIndex, Index, Utf8Index};
-
-    fn index_ascii(content: &str) -> AsciiIndex {
-        match Index::from(content) {
-            Index::Ascii(ascii) => ascii,
-            Index::Utf8(_) => {
-                panic!("Expected ASCII index")
-            }
-        }
-    }
-
-    fn index_utf8(content: &str) -> Utf8Index {
-        match Index::from(content) {
-            Index::Utf8(utf8) => utf8,
-            Index::Ascii(_) => {
-                panic!("Expected UTF8 index")
-            }
-        }
-    }
-
-    #[test]
-    fn ascii_index() {
-        let contents = "";
-        let index = index_ascii(contents);
-        assert_eq!(index, AsciiIndex::new(vec![0]));
-
-        let contents = "x = 1";
-        let index = index_ascii(contents);
-        assert_eq!(index, AsciiIndex::new(vec![0]));
-
-        let contents = "x = 1\n";
-        let index = index_ascii(contents);
-        assert_eq!(index, AsciiIndex::new(vec![0, 6]));
-
-        let contents = "x = 1\ny = 2\nz = x + y\n";
-        let index = index_ascii(contents);
-        assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22]));
-    }
-
-    #[test]
-    fn ascii_byte_offset() {
-        let contents = "x = 1\ny = 2";
-        let index = index_ascii(contents);
-
-        // First row.
-        let loc = index.byte_offset(Location::new(1, 0), contents);
-        assert_eq!(loc, 0);
-
-        // Second row.
-        let loc = index.byte_offset(Location::new(2, 0), contents);
-        assert_eq!(loc, 6);
-
-        // One-past-the-end.
-        let loc = index.byte_offset(Location::new(3, 0), contents);
-        assert_eq!(loc, 11);
-    }
-
-    #[test]
-    fn ascii_carriage_return() {
-        let contents = "x = 4\ry = 3";
-        let index = index_ascii(contents);
-        assert_eq!(index, AsciiIndex::new(vec![0, 6]));
-
-        assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
-        assert_eq!(index.byte_offset(Location::new(2, 0), contents), 6);
-        assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7);
-    }
-
-    #[test]
-    fn ascii_carriage_return_newline() {
-        let contents = "x = 4\r\ny = 3";
-        let index = index_ascii(contents);
-        assert_eq!(index, AsciiIndex::new(vec![0, 7]));
-
-        assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
-        assert_eq!(index.byte_offset(Location::new(2, 0), contents), 7);
-        assert_eq!(index.byte_offset(Location::new(2, 1), contents), 8);
-    }
-
-    impl Utf8Index {
-        fn line_count(&self) -> usize {
-            self.line_start_byte_offsets.len()
-        }
-    }
-
-    #[test]
-    fn utf8_index() {
-        let contents = "x = '🫣'";
-        let index = index_utf8(contents);
-        assert_eq!(index.line_count(), 1);
-        assert_eq!(index, Utf8Index::new(vec![0]));
-
-        let contents = "x = '🫣'\n";
-        let index = index_utf8(contents);
-        assert_eq!(index.line_count(), 2);
-        assert_eq!(index, Utf8Index::new(vec![0, 11]));
-
-        let contents = "x = '🫣'\ny = 2\nz = x + y\n";
-        let index = index_utf8(contents);
-        assert_eq!(index.line_count(), 4);
-        assert_eq!(index, Utf8Index::new(vec![0, 11, 17, 27]));
-
-        let contents = "# 🫣\nclass Foo:\n    \"\"\".\"\"\"";
-        let index = index_utf8(contents);
-        assert_eq!(index.line_count(), 3);
-        assert_eq!(index, Utf8Index::new(vec![0, 7, 18]));
-    }
-
-    #[test]
-    fn utf8_carriage_return() {
-        let contents = "x = '🫣'\ry = 3";
-        let index = index_utf8(contents);
-        assert_eq!(index.line_count(), 2);
-        assert_eq!(index, Utf8Index::new(vec![0, 11]));
-
-        // Second '
-        assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
-        assert_eq!(index.byte_offset(Location::new(2, 0), contents), 11);
-        assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12);
-    }
-
-    #[test]
-    fn utf8_carriage_return_newline() {
-        let contents = "x = '🫣'\r\ny = 3";
-        let index = index_utf8(contents);
-        assert_eq!(index.line_count(), 2);
-        assert_eq!(index, Utf8Index::new(vec![0, 12]));
-
-        // Second '
-        assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
-        assert_eq!(index.byte_offset(Location::new(2, 0), contents), 12);
-        assert_eq!(index.byte_offset(Location::new(2, 1), contents), 13);
-    }
-
-    #[test]
-    fn utf8_byte_offset() {
-        let contents = "x = '☃'\ny = 2";
-        let index = index_utf8(contents);
-        assert_eq!(index, Utf8Index::new(vec![0, 10]));
-
-        // First row.
-        let loc = index.byte_offset(Location::new(1, 0), contents);
-        assert_eq!(loc, 0);
-
-        let loc = index.byte_offset(Location::new(1, 5), contents);
-        assert_eq!(loc, 5);
-        assert_eq!(&contents[loc..], "☃'\ny = 2");
-
-        let loc = index.byte_offset(Location::new(1, 6), contents);
-        assert_eq!(loc, 8);
-        assert_eq!(&contents[loc..], "'\ny = 2");
-
-        // Second row.
-        let loc = index.byte_offset(Location::new(2, 0), contents);
-        assert_eq!(loc, 10);
-
-        // One-past-the-end.
-        let loc = index.byte_offset(Location::new(3, 0), contents);
-        assert_eq!(loc, 15);
-    }
-}
--- a/crates/ruff_python_ast/src/source_code/mod.rs
+++ b/crates/ruff_python_ast/src/source_code/mod.rs
@ -1,13 +1,16 @@
 mod generator;
 mod indexer;
+mod line_index;
 mod locator;
 mod stylist;

+pub use crate::source_code::line_index::{LineIndex, OneIndexed};
 pub use generator::Generator;
 pub use indexer::Indexer;
 pub use locator::Locator;
 use rustpython_parser as parser;
 use rustpython_parser::{lexer, Mode, ParseError};
+
 pub use stylist::{LineEnding, Stylist};

 /// Run round-trip source code generation on a given Python code.