mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-04 02:38:25 +00:00
Cheap cloneable LineIndex (#3896)
This commit is contained in:
parent
9209e57c5a
commit
76c47a9a43
10 changed files with 465 additions and 335 deletions
|
@ -9,6 +9,7 @@ rust-version = { workspace = true }
|
|||
|
||||
[dependencies]
|
||||
ruff_rustpython = { path = "../ruff_rustpython" }
|
||||
ruff_text_size = { path = "../ruff_text_size" }
|
||||
|
||||
anyhow = { workspace = true }
|
||||
bitflags = { workspace = true }
|
||||
|
|
418
crates/ruff_python_ast/src/source_code/line_index.rs
Normal file
418
crates/ruff_python_ast/src/source_code/line_index.rs
Normal file
|
@ -0,0 +1,418 @@
|
|||
use ruff_text_size::{TextLen, TextRange, TextSize};
|
||||
use rustpython_parser::ast::Location;
|
||||
use std::fmt;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
use std::num::NonZeroUsize;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// Index for fast [`Location`] to [byte offset](TextSize) conversions.
|
||||
///
|
||||
/// Cloning a [`LineIndex`] is cheap because it only requires bumping a reference count.
|
||||
#[derive(Clone)]
|
||||
pub struct LineIndex {
|
||||
inner: Arc<LineIndexInner>,
|
||||
}
|
||||
|
||||
struct LineIndexInner {
|
||||
line_starts: Vec<TextSize>,
|
||||
kind: IndexKind,
|
||||
}
|
||||
|
||||
impl LineIndex {
|
||||
/// Builds the [`LineIndex`] from the source text of a file.
|
||||
pub fn from_source_text(text: &str) -> Self {
|
||||
assert!(u32::try_from(text.len()).is_ok());
|
||||
|
||||
let mut line_starts: Vec<TextSize> = Vec::with_capacity(text.len() / 88);
|
||||
line_starts.push(TextSize::default());
|
||||
|
||||
let bytes = text.as_bytes();
|
||||
let mut utf8 = false;
|
||||
|
||||
for (i, byte) in bytes.iter().enumerate() {
|
||||
utf8 |= !byte.is_ascii();
|
||||
|
||||
match byte {
|
||||
// Only track one line break for `\r\n`.
|
||||
b'\r' if bytes.get(i + 1) == Some(&b'\n') => continue,
|
||||
b'\n' | b'\r' => {
|
||||
line_starts.push(TextSize::try_from(i + 1).unwrap());
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let kind = if utf8 {
|
||||
IndexKind::Utf8
|
||||
} else {
|
||||
IndexKind::Ascii
|
||||
};
|
||||
|
||||
Self {
|
||||
inner: Arc::new(LineIndexInner { line_starts, kind }),
|
||||
}
|
||||
}
|
||||
|
||||
fn kind(&self) -> IndexKind {
|
||||
self.inner.kind
|
||||
}
|
||||
|
||||
/// Converts a [`Location`] to it's [byte offset](TextSize) in the source code.
|
||||
pub fn location_offset(&self, location: Location, contents: &str) -> TextSize {
|
||||
let line_index = OneIndexed::new(location.row()).unwrap();
|
||||
let line_range = self.line_range(line_index, contents);
|
||||
|
||||
let column_offset = match self.kind() {
|
||||
IndexKind::Ascii => TextSize::try_from(location.column()).unwrap(),
|
||||
IndexKind::Utf8 => {
|
||||
let line = &contents[line_range];
|
||||
|
||||
// Skip the bom character
|
||||
let bom_len =
|
||||
usize::from(line_index.to_zero_indexed() == 0 && line.starts_with('\u{feff}'));
|
||||
|
||||
match line.char_indices().nth(location.column() + bom_len) {
|
||||
Some((offset, _)) => TextSize::try_from(offset).unwrap(),
|
||||
None => line_range.len(),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
line_range.start() + column_offset
|
||||
}
|
||||
|
||||
/// Return the number of lines in the source code.
|
||||
pub(crate) fn lines_count(&self) -> usize {
|
||||
self.line_starts().len()
|
||||
}
|
||||
|
||||
/// Returns the [byte offset](TextSize) for the `line` with the given index.
|
||||
fn line_start(&self, line: OneIndexed, contents: &str) -> TextSize {
|
||||
let row_index = line.to_zero_indexed();
|
||||
let starts = self.line_starts();
|
||||
|
||||
// If start-of-line position after last line
|
||||
if row_index == starts.len() {
|
||||
contents.text_len()
|
||||
} else {
|
||||
starts[row_index]
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the [`TextRange`] of the `line` with the given index.
|
||||
/// The start points to the first character's [byte offset](TextSize), the end up to, and including
|
||||
/// the newline character ending the line (if any).
|
||||
fn line_range(&self, line: OneIndexed, contents: &str) -> TextRange {
|
||||
let starts = self.line_starts();
|
||||
|
||||
if starts.len() == line.to_zero_indexed() {
|
||||
TextRange::empty(contents.text_len())
|
||||
} else {
|
||||
TextRange::new(
|
||||
self.line_start(line, contents),
|
||||
self.line_start(line.saturating_add(1), contents),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the [byte offsets](TextSize) for every line
|
||||
pub fn line_starts(&self) -> &[TextSize] {
|
||||
&self.inner.line_starts
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for LineIndex {
|
||||
type Target = [TextSize];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.line_starts()
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for LineIndex {
|
||||
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
|
||||
f.debug_list().entries(self.line_starts()).finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
enum IndexKind {
|
||||
/// Optimized index for an ASCII only document
|
||||
Ascii,
|
||||
|
||||
/// Index for UTF8 documents
|
||||
Utf8,
|
||||
}
|
||||
|
||||
/// Type-safe wrapper for a value whose logical range starts at `1`, for
/// instance the line or column numbers in a file
///
/// Internally this is represented as a [`NonZeroUsize`], this enables some
/// memory optimizations
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct OneIndexed(NonZeroUsize);

// `1` is non-zero, so this `unwrap` can never panic during const evaluation.
const ONE: NonZeroUsize = unwrap(NonZeroUsize::new(1));

impl OneIndexed {
    // Both constants below are built from non-zero values, so the const `unwrap`
    // calls cannot panic.

    /// The smallest value that can be represented by this integer type.
    pub const MIN: Self = unwrap(Self::new(1));

    /// The largest value that can be represented by this integer type
    pub const MAX: Self = unwrap(Self::new(usize::MAX));

    /// Creates a [`OneIndexed`] if the given value is not zero, `None` otherwise.
    pub const fn new(value: usize) -> Option<Self> {
        match NonZeroUsize::new(value) {
            Some(value) => Some(Self(value)),
            None => None,
        }
    }

    /// Construct a new [`OneIndexed`] from a zero-indexed value
    ///
    /// Saturates at [`OneIndexed::MAX`] when `value` is `usize::MAX`.
    pub const fn from_zero_indexed(value: usize) -> Self {
        Self(ONE.saturating_add(value))
    }

    /// Return the zero-indexed primitive value for this [`OneIndexed`]
    pub const fn to_zero_indexed(self) -> usize {
        // The wrapped value is at least `1`, so this subtraction cannot underflow.
        self.0.get() - 1
    }

    /// Saturating integer addition. Computes `self + rhs`, saturating at
    /// the numeric bounds instead of overflowing.
    #[must_use]
    pub const fn saturating_add(self, rhs: usize) -> Self {
        // Adding to a non-zero value can never produce zero, so the non-zero
        // saturating addition covers every case without a fallback branch.
        Self(self.0.saturating_add(rhs))
    }

    /// Saturating integer subtraction. Computes `self - rhs`, saturating
    /// at the numeric bounds instead of overflowing.
    #[must_use]
    pub const fn saturating_sub(self, rhs: usize) -> Self {
        match NonZeroUsize::new(self.0.get().saturating_sub(rhs)) {
            Some(value) => Self(value),
            // The plain subtraction bottomed out at zero: clamp to the smallest valid value.
            None => Self::MIN,
        }
    }
}

/// Displays the one-indexed value as a plain integer.
impl std::fmt::Display for OneIndexed {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        std::fmt::Display::fmt(&self.0.get(), f)
    }
}

/// A const `Option::unwrap` without nightly features:
/// [Tracking issue](https://github.com/rust-lang/rust/issues/67441)
///
/// # Panics
///
/// Panics if `option` is `None`.
const fn unwrap<T: Copy>(option: Option<T>) -> T {
    match option {
        Some(value) => value,
        None => panic!("unwrapping None"),
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::LineIndex;
    use ruff_text_size::TextSize;
    use rustpython_parser::ast::Location;

    /// Converts raw byte offsets into the [`TextSize`] values stored by the index.
    fn sizes(offsets: &[u32]) -> Vec<TextSize> {
        offsets.iter().copied().map(TextSize::from).collect()
    }

    /// Resolves the one-indexed `row` and zero-indexed `column` to a byte offset.
    fn offset_at(index: &LineIndex, contents: &str, row: usize, column: usize) -> TextSize {
        index.location_offset(Location::new(row, column), contents)
    }

    #[test]
    fn ascii_index() {
        let empty = LineIndex::from_source_text("");
        assert_eq!(empty.line_starts(), sizes(&[0]).as_slice());

        let unterminated = LineIndex::from_source_text("x = 1");
        assert_eq!(unterminated.line_starts(), sizes(&[0]).as_slice());

        let single_line = LineIndex::from_source_text("x = 1\n");
        assert_eq!(single_line.line_starts(), sizes(&[0, 6]).as_slice());

        let multi_line = LineIndex::from_source_text("x = 1\ny = 2\nz = x + y\n");
        assert_eq!(multi_line.line_starts(), sizes(&[0, 6, 12, 22]).as_slice());
    }

    #[test]
    fn ascii_byte_offset() {
        let contents = "x = 1\ny = 2";
        let index = LineIndex::from_source_text(contents);

        // First row.
        assert_eq!(offset_at(&index, contents, 1, 0), TextSize::from(0));

        // Second row.
        assert_eq!(offset_at(&index, contents, 2, 0), TextSize::from(6));

        // One-past-the-end.
        assert_eq!(offset_at(&index, contents, 3, 0), TextSize::from(11));
    }

    #[test]
    fn ascii_carriage_return() {
        let contents = "x = 4\ry = 3";
        let index = LineIndex::from_source_text(contents);
        assert_eq!(index.line_starts(), sizes(&[0, 6]).as_slice());

        assert_eq!(offset_at(&index, contents, 1, 4), TextSize::from(4));
        assert_eq!(offset_at(&index, contents, 2, 0), TextSize::from(6));
        assert_eq!(offset_at(&index, contents, 2, 1), TextSize::from(7));
    }

    #[test]
    fn ascii_carriage_return_newline() {
        let contents = "x = 4\r\ny = 3";
        let index = LineIndex::from_source_text(contents);
        assert_eq!(index.line_starts(), sizes(&[0, 7]).as_slice());

        assert_eq!(offset_at(&index, contents, 1, 4), TextSize::from(4));
        assert_eq!(offset_at(&index, contents, 2, 0), TextSize::from(7));
        assert_eq!(offset_at(&index, contents, 2, 1), TextSize::from(8));
    }

    #[test]
    fn utf8_index() {
        let unterminated = LineIndex::from_source_text("x = '🫣'");
        assert_eq!(unterminated.lines_count(), 1);
        assert_eq!(unterminated.line_starts(), sizes(&[0]).as_slice());

        let single_line = LineIndex::from_source_text("x = '🫣'\n");
        assert_eq!(single_line.lines_count(), 2);
        assert_eq!(single_line.line_starts(), sizes(&[0, 11]).as_slice());

        let multi_line = LineIndex::from_source_text("x = '🫣'\ny = 2\nz = x + y\n");
        assert_eq!(multi_line.lines_count(), 4);
        assert_eq!(multi_line.line_starts(), sizes(&[0, 11, 17, 27]).as_slice());

        let with_comment = LineIndex::from_source_text("# 🫣\nclass Foo:\n \"\"\".\"\"\"");
        assert_eq!(with_comment.lines_count(), 3);
        assert_eq!(with_comment.line_starts(), sizes(&[0, 7, 18]).as_slice());
    }

    #[test]
    fn utf8_carriage_return() {
        let contents = "x = '🫣'\ry = 3";
        let index = LineIndex::from_source_text(contents);
        assert_eq!(index.lines_count(), 2);
        assert_eq!(index.line_starts(), sizes(&[0, 11]).as_slice());

        // Second '
        assert_eq!(offset_at(&index, contents, 1, 6), TextSize::from(9));
        assert_eq!(offset_at(&index, contents, 2, 0), TextSize::from(11));
        assert_eq!(offset_at(&index, contents, 2, 1), TextSize::from(12));
    }

    #[test]
    fn utf8_carriage_return_newline() {
        let contents = "x = '🫣'\r\ny = 3";
        let index = LineIndex::from_source_text(contents);
        assert_eq!(index.lines_count(), 2);
        assert_eq!(index.line_starts(), sizes(&[0, 12]).as_slice());

        // Second '
        assert_eq!(offset_at(&index, contents, 1, 6), TextSize::from(9));
        assert_eq!(offset_at(&index, contents, 2, 0), TextSize::from(12));
        assert_eq!(offset_at(&index, contents, 2, 1), TextSize::from(13));
    }

    #[test]
    fn utf8_byte_offset() {
        let contents = "x = '☃'\ny = 2";
        let index = LineIndex::from_source_text(contents);
        assert_eq!(index.line_starts(), sizes(&[0, 10]).as_slice());

        // First row.
        assert_eq!(offset_at(&index, contents, 1, 0), TextSize::from(0));

        // Column of the snowman: the offset must land on its first byte.
        let snowman = offset_at(&index, contents, 1, 5);
        assert_eq!(snowman, TextSize::from(5));
        assert_eq!(&"x = '☃'\ny = 2"[usize::from(snowman)..], "☃'\ny = 2");

        // Column after the snowman: the offset must skip all three of its bytes.
        let closing_quote = offset_at(&index, contents, 1, 6);
        assert_eq!(closing_quote, TextSize::from(8));
        assert_eq!(&"x = '☃'\ny = 2"[usize::from(closing_quote)..], "'\ny = 2");

        // Second row.
        assert_eq!(offset_at(&index, contents, 2, 0), TextSize::from(10));

        // One-past-the-end.
        assert_eq!(offset_at(&index, contents, 3, 0), TextSize::from(15));
    }
}
|
|
@ -1,13 +1,15 @@
|
|||
//! Struct used to efficiently slice source code at (row, column) Locations.
|
||||
|
||||
use crate::source_code::line_index::LineIndex;
|
||||
use once_cell::unsync::OnceCell;
|
||||
use ruff_text_size::{TextRange, TextSize};
|
||||
use rustpython_parser::ast::Location;
|
||||
|
||||
use crate::types::Range;
|
||||
|
||||
pub struct Locator<'a> {
|
||||
contents: &'a str,
|
||||
index: OnceCell<Index>,
|
||||
index: OnceCell<LineIndex>,
|
||||
}
|
||||
|
||||
impl<'a> Locator<'a> {
|
||||
|
@ -18,37 +20,38 @@ impl<'a> Locator<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
fn get_or_init_index(&self) -> &Index {
|
||||
self.index.get_or_init(|| Index::from(self.contents))
|
||||
fn get_or_init_index(&self) -> &LineIndex {
|
||||
self.index
|
||||
.get_or_init(|| LineIndex::from_source_text(self.contents))
|
||||
}
|
||||
|
||||
/// Take the source code up to the given [`Location`].
|
||||
pub fn take(&self, location: Location) -> &'a str {
|
||||
let index = self.get_or_init_index();
|
||||
let offset = index.byte_offset(location, self.contents);
|
||||
&self.contents[..offset]
|
||||
let offset = index.location_offset(location, self.contents);
|
||||
&self.contents[TextRange::up_to(offset)]
|
||||
}
|
||||
|
||||
/// Take the source code after the given [`Location`].
|
||||
pub fn skip(&self, location: Location) -> &'a str {
|
||||
let index = self.get_or_init_index();
|
||||
let offset = index.byte_offset(location, self.contents);
|
||||
&self.contents[offset..]
|
||||
let offset = index.location_offset(location, self.contents);
|
||||
&self.contents[usize::from(offset)..]
|
||||
}
|
||||
|
||||
/// Take the source code between the given [`Range`].
|
||||
pub fn slice<R: Into<Range>>(&self, range: R) -> &'a str {
|
||||
let index = self.get_or_init_index();
|
||||
let range = range.into();
|
||||
let start = index.byte_offset(range.location, self.contents);
|
||||
let end = index.byte_offset(range.end_location, self.contents);
|
||||
&self.contents[start..end]
|
||||
let start = index.location_offset(range.location, self.contents);
|
||||
let end = index.location_offset(range.end_location, self.contents);
|
||||
&self.contents[TextRange::new(start, end)]
|
||||
}
|
||||
|
||||
/// Return the byte offset of the given [`Location`].
|
||||
pub fn offset(&self, location: Location) -> usize {
|
||||
pub fn offset(&self, location: Location) -> TextSize {
|
||||
let index = self.get_or_init_index();
|
||||
index.byte_offset(location, self.contents)
|
||||
index.location_offset(location, self.contents)
|
||||
}
|
||||
|
||||
/// Return the underlying source code.
|
||||
|
@ -59,7 +62,7 @@ impl<'a> Locator<'a> {
|
|||
/// Return the number of lines in the source code.
|
||||
pub fn count_lines(&self) -> usize {
|
||||
let index = self.get_or_init_index();
|
||||
index.count_lines()
|
||||
index.lines_count()
|
||||
}
|
||||
|
||||
/// Return the number of bytes in the source code.
|
||||
|
@ -72,302 +75,3 @@ impl<'a> Locator<'a> {
|
|||
self.contents.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
/// Index for fast [`Location`] to byte offset conversions.
|
||||
#[derive(Debug, Clone)]
|
||||
enum Index {
|
||||
/// Optimized index for an ASCII only document
|
||||
Ascii(AsciiIndex),
|
||||
|
||||
/// Index for UTF8 documents
|
||||
Utf8(Utf8Index),
|
||||
}
|
||||
|
||||
impl Index {
|
||||
/// Truncate a [`Location`] to a byte offset in source code.
|
||||
fn byte_offset(&self, location: Location, contents: &str) -> usize {
|
||||
match self {
|
||||
Index::Ascii(ascii) => ascii.byte_offset(location, contents),
|
||||
Index::Utf8(utf8) => utf8.byte_offset(location, contents),
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the number of lines in the source code.
|
||||
fn count_lines(&self) -> usize {
|
||||
match self {
|
||||
Index::Ascii(ascii) => ascii.line_start_byte_offsets.len(),
|
||||
Index::Utf8(utf8) => utf8.line_start_byte_offsets.len(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&str> for Index {
|
||||
fn from(contents: &str) -> Self {
|
||||
assert!(u32::try_from(contents.len()).is_ok());
|
||||
|
||||
let mut line_start_offsets: Vec<u32> = Vec::with_capacity(48);
|
||||
line_start_offsets.push(0);
|
||||
let mut utf8 = false;
|
||||
|
||||
// SAFE because of length assertion above
|
||||
#[allow(clippy::cast_possible_truncation)]
|
||||
for (i, byte) in contents.bytes().enumerate() {
|
||||
utf8 |= !byte.is_ascii();
|
||||
|
||||
match byte {
|
||||
// Only track one line break for `\r\n`.
|
||||
b'\r' if contents.as_bytes().get(i + 1) == Some(&b'\n') => continue,
|
||||
b'\n' | b'\r' => {
|
||||
line_start_offsets.push((i + 1) as u32);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
if utf8 {
|
||||
Self::Utf8(Utf8Index::new(line_start_offsets))
|
||||
} else {
|
||||
Self::Ascii(AsciiIndex::new(line_start_offsets))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Index for fast [`Location`] to byte offset conversions for ASCII documents.
|
||||
///
|
||||
/// The index stores the byte offsets for every line. It computes the byte offset for a [`Location`]
|
||||
/// by retrieving the line offset from its index and adding the column.
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct AsciiIndex {
|
||||
line_start_byte_offsets: Vec<u32>,
|
||||
}
|
||||
|
||||
impl AsciiIndex {
|
||||
fn new(line_start_positions: Vec<u32>) -> Self {
|
||||
Self {
|
||||
line_start_byte_offsets: line_start_positions,
|
||||
}
|
||||
}
|
||||
|
||||
/// Truncate a [`Location`] to a byte offset in ASCII source code.
|
||||
fn byte_offset(&self, location: Location, contents: &str) -> usize {
|
||||
let index = &self.line_start_byte_offsets;
|
||||
|
||||
// If start-of-line position after last line
|
||||
if location.row() - 1 == index.len() && location.column() == 0 {
|
||||
contents.len()
|
||||
} else {
|
||||
let byte_offset = index[location.row() - 1] as usize + location.column();
|
||||
byte_offset.min(contents.len())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Index for fast [`Location`] to byte offset conversions for UTF8 documents.
|
||||
///
|
||||
/// The index stores the byte offset of every line. The column offset is lazily computed by
|
||||
/// adding the line start offset and then iterating to the `nth` character.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
struct Utf8Index {
|
||||
line_start_byte_offsets: Vec<u32>,
|
||||
}
|
||||
|
||||
impl Utf8Index {
|
||||
fn new(line_byte_positions: Vec<u32>) -> Self {
|
||||
Self {
|
||||
line_start_byte_offsets: line_byte_positions,
|
||||
}
|
||||
}
|
||||
|
||||
/// Truncate a [`Location`] to a byte offset in UTF-8 source code.
|
||||
fn byte_offset(&self, location: Location, contents: &str) -> usize {
|
||||
let index = &self.line_start_byte_offsets;
|
||||
|
||||
if location.row() - 1 == index.len() && location.column() == 0 {
|
||||
contents.len()
|
||||
} else {
|
||||
// Casting is safe because the length of utf8 characters is always between 1-4
|
||||
#[allow(clippy::cast_possible_truncation)]
|
||||
let line_start = if location.row() == 1 && contents.starts_with('\u{feff}') {
|
||||
'\u{feff}'.len_utf8() as u32
|
||||
} else {
|
||||
index[location.row() - 1]
|
||||
};
|
||||
|
||||
let rest = &contents[line_start as usize..];
|
||||
|
||||
let column_offset = match rest.char_indices().nth(location.column()) {
|
||||
Some((offset, _)) => offset,
|
||||
None => contents.len(),
|
||||
};
|
||||
|
||||
let offset = line_start as usize + column_offset;
|
||||
offset.min(contents.len())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rustpython_parser::ast::Location;
|
||||
|
||||
use crate::source_code::locator::{AsciiIndex, Index, Utf8Index};
|
||||
|
||||
fn index_ascii(content: &str) -> AsciiIndex {
|
||||
match Index::from(content) {
|
||||
Index::Ascii(ascii) => ascii,
|
||||
Index::Utf8(_) => {
|
||||
panic!("Expected ASCII index")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn index_utf8(content: &str) -> Utf8Index {
|
||||
match Index::from(content) {
|
||||
Index::Utf8(utf8) => utf8,
|
||||
Index::Ascii(_) => {
|
||||
panic!("Expected UTF8 index")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ascii_index() {
|
||||
let contents = "";
|
||||
let index = index_ascii(contents);
|
||||
assert_eq!(index, AsciiIndex::new(vec![0]));
|
||||
|
||||
let contents = "x = 1";
|
||||
let index = index_ascii(contents);
|
||||
assert_eq!(index, AsciiIndex::new(vec![0]));
|
||||
|
||||
let contents = "x = 1\n";
|
||||
let index = index_ascii(contents);
|
||||
assert_eq!(index, AsciiIndex::new(vec![0, 6]));
|
||||
|
||||
let contents = "x = 1\ny = 2\nz = x + y\n";
|
||||
let index = index_ascii(contents);
|
||||
assert_eq!(index, AsciiIndex::new(vec![0, 6, 12, 22]));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ascii_byte_offset() {
|
||||
let contents = "x = 1\ny = 2";
|
||||
let index = index_ascii(contents);
|
||||
|
||||
// First row.
|
||||
let loc = index.byte_offset(Location::new(1, 0), contents);
|
||||
assert_eq!(loc, 0);
|
||||
|
||||
// Second row.
|
||||
let loc = index.byte_offset(Location::new(2, 0), contents);
|
||||
assert_eq!(loc, 6);
|
||||
|
||||
// One-past-the-end.
|
||||
let loc = index.byte_offset(Location::new(3, 0), contents);
|
||||
assert_eq!(loc, 11);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ascii_carriage_return() {
|
||||
let contents = "x = 4\ry = 3";
|
||||
let index = index_ascii(contents);
|
||||
assert_eq!(index, AsciiIndex::new(vec![0, 6]));
|
||||
|
||||
assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
|
||||
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 6);
|
||||
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 7);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ascii_carriage_return_newline() {
|
||||
let contents = "x = 4\r\ny = 3";
|
||||
let index = index_ascii(contents);
|
||||
assert_eq!(index, AsciiIndex::new(vec![0, 7]));
|
||||
|
||||
assert_eq!(index.byte_offset(Location::new(1, 4), contents), 4);
|
||||
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 7);
|
||||
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 8);
|
||||
}
|
||||
|
||||
impl Utf8Index {
|
||||
fn line_count(&self) -> usize {
|
||||
self.line_start_byte_offsets.len()
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn utf8_index() {
|
||||
let contents = "x = '🫣'";
|
||||
let index = index_utf8(contents);
|
||||
assert_eq!(index.line_count(), 1);
|
||||
assert_eq!(index, Utf8Index::new(vec![0]));
|
||||
|
||||
let contents = "x = '🫣'\n";
|
||||
let index = index_utf8(contents);
|
||||
assert_eq!(index.line_count(), 2);
|
||||
assert_eq!(index, Utf8Index::new(vec![0, 11]));
|
||||
|
||||
let contents = "x = '🫣'\ny = 2\nz = x + y\n";
|
||||
let index = index_utf8(contents);
|
||||
assert_eq!(index.line_count(), 4);
|
||||
assert_eq!(index, Utf8Index::new(vec![0, 11, 17, 27]));
|
||||
|
||||
let contents = "# 🫣\nclass Foo:\n \"\"\".\"\"\"";
|
||||
let index = index_utf8(contents);
|
||||
assert_eq!(index.line_count(), 3);
|
||||
assert_eq!(index, Utf8Index::new(vec![0, 7, 18]));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn utf8_carriage_return() {
|
||||
let contents = "x = '🫣'\ry = 3";
|
||||
let index = index_utf8(contents);
|
||||
assert_eq!(index.line_count(), 2);
|
||||
assert_eq!(index, Utf8Index::new(vec![0, 11]));
|
||||
|
||||
// Second '
|
||||
assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
|
||||
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 11);
|
||||
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 12);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn utf8_carriage_return_newline() {
|
||||
let contents = "x = '🫣'\r\ny = 3";
|
||||
let index = index_utf8(contents);
|
||||
assert_eq!(index.line_count(), 2);
|
||||
assert_eq!(index, Utf8Index::new(vec![0, 12]));
|
||||
|
||||
// Second '
|
||||
assert_eq!(index.byte_offset(Location::new(1, 6), contents), 9);
|
||||
assert_eq!(index.byte_offset(Location::new(2, 0), contents), 12);
|
||||
assert_eq!(index.byte_offset(Location::new(2, 1), contents), 13);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn utf8_byte_offset() {
|
||||
let contents = "x = '☃'\ny = 2";
|
||||
let index = index_utf8(contents);
|
||||
assert_eq!(index, Utf8Index::new(vec![0, 10]));
|
||||
|
||||
// First row.
|
||||
let loc = index.byte_offset(Location::new(1, 0), contents);
|
||||
assert_eq!(loc, 0);
|
||||
|
||||
let loc = index.byte_offset(Location::new(1, 5), contents);
|
||||
assert_eq!(loc, 5);
|
||||
assert_eq!(&contents[loc..], "☃'\ny = 2");
|
||||
|
||||
let loc = index.byte_offset(Location::new(1, 6), contents);
|
||||
assert_eq!(loc, 8);
|
||||
assert_eq!(&contents[loc..], "'\ny = 2");
|
||||
|
||||
// Second row.
|
||||
let loc = index.byte_offset(Location::new(2, 0), contents);
|
||||
assert_eq!(loc, 10);
|
||||
|
||||
// One-past-the-end.
|
||||
let loc = index.byte_offset(Location::new(3, 0), contents);
|
||||
assert_eq!(loc, 15);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
mod generator;
|
||||
mod indexer;
|
||||
mod line_index;
|
||||
mod locator;
|
||||
mod stylist;
|
||||
|
||||
pub use crate::source_code::line_index::{LineIndex, OneIndexed};
|
||||
pub use generator::Generator;
|
||||
pub use indexer::Indexer;
|
||||
pub use locator::Locator;
|
||||
use rustpython_parser as parser;
|
||||
use rustpython_parser::{lexer, Mode, ParseError};
|
||||
|
||||
pub use stylist::{LineEnding, Stylist};
|
||||
|
||||
/// Run round-trip source code generation on a given Python code.
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue