Split SourceLocation into LineColumn and SourceLocation (#17587)

This commit is contained in:
Micha Reiser 2025-04-27 11:27:33 +01:00 committed by GitHub
parent 4443f6653c
commit 1c65e0ad25
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
29 changed files with 695 additions and 537 deletions

View file

@ -5,13 +5,12 @@ use std::ops::Deref;
use std::str::FromStr;
use std::sync::Arc;
use crate::{LineColumn, SourceLocation};
use ruff_text_size::{TextLen, TextRange, TextSize};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use crate::SourceLocation;
/// Index for fast [byte offset](TextSize) to [`SourceLocation`] conversions.
/// Index for fast [byte offset](TextSize) to [`LineColumn`] conversions.
///
/// Cloning a [`LineIndex`] is cheap because it only requires bumping a reference count.
#[derive(Clone, Eq, PartialEq)]
@ -66,60 +65,146 @@ impl LineIndex {
self.inner.kind
}
/// Returns the row and column index for an offset.
/// Returns the line and column number for an UTF-8 byte offset.
///
/// The `column` number is the nth-character of the line, except for the first line
/// where it doesn't include the UTF-8 BOM marker at the start of the file.
///
/// ### BOM handling
///
/// For files starting with a UTF-8 BOM marker, the byte offsets
/// in the range `0...3` are all mapped to line 0 and column 0.
/// Because of this, the conversion isn't losless.
///
/// ## Examples
///
/// ```
/// # use ruff_text_size::TextSize;
/// # use ruff_source_file::{LineIndex, OneIndexed, SourceLocation};
/// let source = "def a():\n pass";
/// let index = LineIndex::from_source_text(source);
/// # use ruff_source_file::{LineIndex, OneIndexed, LineColumn};
/// let source = format!("\u{FEFF}{}", "def a():\n pass");
/// let index = LineIndex::from_source_text(&source);
///
/// // Before BOM, maps to after BOM
/// assert_eq!(
/// index.source_location(TextSize::from(0), source),
/// SourceLocation { row: OneIndexed::from_zero_indexed(0), column: OneIndexed::from_zero_indexed(0) }
/// index.line_column(TextSize::from(0), &source),
/// LineColumn { line: OneIndexed::from_zero_indexed(0), column: OneIndexed::from_zero_indexed(0) }
/// );
///
/// // After BOM, maps to after BOM
/// assert_eq!(
/// index.line_column(TextSize::from(3), &source),
/// LineColumn { line: OneIndexed::from_zero_indexed(0), column: OneIndexed::from_zero_indexed(0) }
/// );
///
/// assert_eq!(
/// index.source_location(TextSize::from(4), source),
/// SourceLocation { row: OneIndexed::from_zero_indexed(0), column: OneIndexed::from_zero_indexed(4) }
/// index.line_column(TextSize::from(7), &source),
/// LineColumn { line: OneIndexed::from_zero_indexed(0), column: OneIndexed::from_zero_indexed(4) }
/// );
/// assert_eq!(
/// index.source_location(TextSize::from(13), source),
/// SourceLocation { row: OneIndexed::from_zero_indexed(1), column: OneIndexed::from_zero_indexed(4) }
/// index.line_column(TextSize::from(16), &source),
/// LineColumn { line: OneIndexed::from_zero_indexed(1), column: OneIndexed::from_zero_indexed(4) }
/// );
/// ```
///
/// ## Panics
///
/// If the offset is out of bounds.
pub fn source_location(&self, offset: TextSize, content: &str) -> SourceLocation {
match self.line_starts().binary_search(&offset) {
// Offset is at the start of a line
Ok(row) => SourceLocation {
row: OneIndexed::from_zero_indexed(row),
column: OneIndexed::from_zero_indexed(0),
},
Err(next_row) => {
// SAFETY: Safe because the index always contains an entry for the offset 0
let row = next_row - 1;
let mut line_start = self.line_starts()[row];
/// If the byte offset isn't within the bounds of `content`.
pub fn line_column(&self, offset: TextSize, content: &str) -> LineColumn {
let location = self.source_location(offset, content, PositionEncoding::Utf32);
let column = if self.kind().is_ascii() {
usize::from(offset) - usize::from(line_start)
} else {
// Don't count the BOM character as a column.
if line_start == TextSize::from(0) && content.starts_with('\u{feff}') {
line_start = '\u{feff}'.text_len();
}
// Don't count the BOM character as a column, but only on the first line.
let column = if location.line.to_zero_indexed() == 0 && content.starts_with('\u{feff}') {
location.character_offset.saturating_sub(1)
} else {
location.character_offset
};
content[TextRange::new(line_start, offset)].chars().count()
};
LineColumn {
line: location.line,
column,
}
}
/// Given a UTF-8 byte offset, returns the line and character offset according to the given encoding.
///
/// ### BOM handling
///
/// Unlike [`Self::line_column`], this method does not skip the BOM character at the start of the file.
/// This allows for bidirectional mapping between [`SourceLocation`] and [`TextSize`] (see [`Self::offset`]).
///
/// ## Examples
///
/// ```
/// # use ruff_text_size::TextSize;
/// # use ruff_source_file::{LineIndex, OneIndexed, LineColumn, SourceLocation, PositionEncoding, Line};
/// let source = format!("\u{FEFF}{}", "def a():\n pass");
/// let index = LineIndex::from_source_text(&source);
///
/// // Before BOM, maps to character 0
/// assert_eq!(
/// index.source_location(TextSize::from(0), &source, PositionEncoding::Utf32),
/// SourceLocation { line: OneIndexed::from_zero_indexed(0), character_offset: OneIndexed::from_zero_indexed(0) }
/// );
///
/// // After BOM, maps to after BOM
/// assert_eq!(
/// index.source_location(TextSize::from(3), &source, PositionEncoding::Utf32),
/// SourceLocation { line: OneIndexed::from_zero_indexed(0), character_offset: OneIndexed::from_zero_indexed(1) }
/// );
///
/// assert_eq!(
/// index.source_location(TextSize::from(7), &source, PositionEncoding::Utf32),
/// SourceLocation { line: OneIndexed::from_zero_indexed(0), character_offset: OneIndexed::from_zero_indexed(5) }
/// );
/// assert_eq!(
/// index.source_location(TextSize::from(16), &source, PositionEncoding::Utf32),
/// SourceLocation { line: OneIndexed::from_zero_indexed(1), character_offset: OneIndexed::from_zero_indexed(4) }
/// );
/// ```
///
/// ## Panics
///
/// If the UTF-8 byte offset is out of bounds of `text`.
pub fn source_location(
&self,
offset: TextSize,
text: &str,
encoding: PositionEncoding,
) -> SourceLocation {
let line = self.line_index(offset);
let line_start = self.line_start(line, text);
if self.is_ascii() {
return SourceLocation {
line,
character_offset: OneIndexed::from_zero_indexed((offset - line_start).to_usize()),
};
}
match encoding {
PositionEncoding::Utf8 => {
let character_offset = offset - line_start;
SourceLocation {
line,
character_offset: OneIndexed::from_zero_indexed(character_offset.to_usize()),
}
}
PositionEncoding::Utf16 => {
let up_to_character = &text[TextRange::new(line_start, offset)];
let character = up_to_character.encode_utf16().count();
SourceLocation {
row: OneIndexed::from_zero_indexed(row),
column: OneIndexed::from_zero_indexed(column),
line,
character_offset: OneIndexed::from_zero_indexed(character),
}
}
PositionEncoding::Utf32 => {
let up_to_character = &text[TextRange::new(line_start, offset)];
let character = up_to_character.chars().count();
SourceLocation {
line,
character_offset: OneIndexed::from_zero_indexed(character),
}
}
}
@ -141,7 +226,7 @@ impl LineIndex {
///
/// ```
/// # use ruff_text_size::TextSize;
/// # use ruff_source_file::{LineIndex, OneIndexed, SourceLocation};
/// # use ruff_source_file::{LineIndex, OneIndexed, LineColumn};
/// let source = "def a():\n pass";
/// let index = LineIndex::from_source_text(source);
///
@ -221,83 +306,211 @@ impl LineIndex {
}
}
/// Returns the [byte offset](TextSize) at `line` and `column`.
/// Returns the [UTF-8 byte offset](TextSize) at `line` and `character` where character is counted using the given encoding.
///
/// ## Examples
///
/// ### ASCII
/// ### ASCII only source text
///
/// ```
/// use ruff_source_file::{LineIndex, OneIndexed};
/// use ruff_text_size::TextSize;
/// # use ruff_source_file::{SourceLocation, LineIndex, OneIndexed, PositionEncoding};
/// # use ruff_text_size::TextSize;
/// let source = r#"a = 4
/// c = "some string"
/// x = b"#;
///
/// let index = LineIndex::from_source_text(source);
///
/// // First line, first column
/// assert_eq!(index.offset(OneIndexed::from_zero_indexed(0), OneIndexed::from_zero_indexed(0), source), TextSize::new(0));
/// // First line, first character
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(0),
/// character_offset: OneIndexed::from_zero_indexed(0)
/// },
/// source,
/// PositionEncoding::Utf32,
/// ),
/// TextSize::new(0)
/// );
///
/// // Second line, 4th column
/// assert_eq!(index.offset(OneIndexed::from_zero_indexed(1), OneIndexed::from_zero_indexed(4), source), TextSize::new(10));
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(1),
/// character_offset: OneIndexed::from_zero_indexed(4)
/// },
/// source,
/// PositionEncoding::Utf32,
/// ),
/// TextSize::new(10)
/// );
///
/// // Offset past the end of the first line
/// assert_eq!(index.offset(OneIndexed::from_zero_indexed(0), OneIndexed::from_zero_indexed(10), source), TextSize::new(6));
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(0),
/// character_offset: OneIndexed::from_zero_indexed(10)
/// },
/// source,
/// PositionEncoding::Utf32,
/// ),
/// TextSize::new(6)
/// );
///
/// // Offset past the end of the file
/// assert_eq!(index.offset(OneIndexed::from_zero_indexed(3), OneIndexed::from_zero_indexed(0), source), TextSize::new(29));
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(3),
/// character_offset: OneIndexed::from_zero_indexed(0)
/// },
/// source,
/// PositionEncoding::Utf32,
/// ),
/// TextSize::new(29)
/// );
/// ```
///
/// ### UTF8
/// ### Non-ASCII source text
///
/// ```
/// use ruff_source_file::{LineIndex, OneIndexed};
/// use ruff_source_file::{LineIndex, OneIndexed, SourceLocation, PositionEncoding};
/// use ruff_text_size::TextSize;
/// let source = r#"a = 4
/// let source = format!("\u{FEFF}{}", r#"a = 4
/// c = "❤️"
/// x = b"#;
/// x = b"#);
///
/// let index = LineIndex::from_source_text(source);
/// let index = LineIndex::from_source_text(&source);
///
/// // First line, first column
/// assert_eq!(index.offset(OneIndexed::from_zero_indexed(0), OneIndexed::from_zero_indexed(0), source), TextSize::new(0));
/// // First line, first character, points at the BOM
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(0),
/// character_offset: OneIndexed::from_zero_indexed(0)
/// },
/// &source,
/// PositionEncoding::Utf32,
/// ),
/// TextSize::new(0)
/// );
///
/// // First line, after the BOM
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(0),
/// character_offset: OneIndexed::from_zero_indexed(1)
/// },
/// &source,
/// PositionEncoding::Utf32,
/// ),
/// TextSize::new(3)
/// );
///
/// // second line, 7th character, after emoji, UTF32
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(1),
/// character_offset: OneIndexed::from_zero_indexed(7)
/// },
/// &source,
/// PositionEncoding::Utf32,
/// ),
/// TextSize::new(20)
/// );
///
/// // Second line, 7th character, after emoji, UTF 16
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(1),
/// character_offset: OneIndexed::from_zero_indexed(7)
/// },
/// &source,
/// PositionEncoding::Utf16,
/// ),
/// TextSize::new(20)
/// );
///
/// // Third line, 2nd column, after emoji
/// assert_eq!(index.offset(OneIndexed::from_zero_indexed(2), OneIndexed::from_zero_indexed(1), source), TextSize::new(20));
///
/// // Offset past the end of the second line
/// assert_eq!(index.offset(OneIndexed::from_zero_indexed(1), OneIndexed::from_zero_indexed(10), source), TextSize::new(19));
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(1),
/// character_offset: OneIndexed::from_zero_indexed(10)
/// },
/// &source,
/// PositionEncoding::Utf32,
/// ),
/// TextSize::new(22)
/// );
///
/// // Offset past the end of the file
/// assert_eq!(index.offset(OneIndexed::from_zero_indexed(3), OneIndexed::from_zero_indexed(0), source), TextSize::new(24));
/// assert_eq!(
/// index.offset(
/// SourceLocation {
/// line: OneIndexed::from_zero_indexed(3),
/// character_offset: OneIndexed::from_zero_indexed(0)
/// },
/// &source,
/// PositionEncoding::Utf32,
/// ),
/// TextSize::new(27)
/// );
/// ```
///
pub fn offset(&self, line: OneIndexed, column: OneIndexed, contents: &str) -> TextSize {
pub fn offset(
&self,
position: SourceLocation,
text: &str,
position_encoding: PositionEncoding,
) -> TextSize {
// If start-of-line position after last line
if line.to_zero_indexed() > self.line_starts().len() {
return contents.text_len();
if position.line.to_zero_indexed() > self.line_starts().len() {
return text.text_len();
}
let line_range = self.line_range(line, contents);
let line_range = self.line_range(position.line, text);
match self.kind() {
IndexKind::Ascii => {
line_range.start()
+ TextSize::try_from(column.to_zero_indexed())
.unwrap_or(line_range.len())
.clamp(TextSize::new(0), line_range.len())
}
IndexKind::Utf8 => {
let rest = &contents[line_range];
let column_offset: TextSize = rest
let character_offset = position.character_offset.to_zero_indexed();
let character_byte_offset = if self.is_ascii() {
TextSize::try_from(character_offset).unwrap()
} else {
let line = &text[line_range];
match position_encoding {
PositionEncoding::Utf8 => {
TextSize::try_from(position.character_offset.to_zero_indexed()).unwrap()
}
PositionEncoding::Utf16 => {
let mut byte_offset = TextSize::new(0);
let mut utf16_code_unit_offset = 0;
for c in line.chars() {
if utf16_code_unit_offset >= character_offset {
break;
}
// Count characters encoded as two 16 bit words as 2 characters.
byte_offset += c.text_len();
utf16_code_unit_offset += c.len_utf16();
}
byte_offset
}
PositionEncoding::Utf32 => line
.chars()
.take(column.to_zero_indexed())
.take(position.character_offset.to_zero_indexed())
.map(ruff_text_size::TextLen::text_len)
.sum();
line_range.start() + column_offset
.sum(),
}
}
};
line_range.start() + character_byte_offset.clamp(TextSize::new(0), line_range.len())
}
/// Returns the [byte offsets](TextSize) for every line
@ -430,12 +643,26 @@ impl FromStr for OneIndexed {
}
}
#[derive(Default, Copy, Clone, Debug)]
pub enum PositionEncoding {
/// Character offsets count the number of bytes from the start of the line.
#[default]
Utf8,
/// Character offsets count the number of UTF-16 code units from the start of the line.
Utf16,
/// Character offsets count the number of UTF-32 code points units (the same as number of characters in Rust)
/// from the start of the line.
Utf32,
}
#[cfg(test)]
mod tests {
use ruff_text_size::TextSize;
use crate::line_index::LineIndex;
use crate::{OneIndexed, SourceLocation};
use crate::{LineColumn, OneIndexed};
#[test]
fn ascii_index() {
@ -466,30 +693,30 @@ mod tests {
let index = LineIndex::from_source_text(contents);
// First row.
let loc = index.source_location(TextSize::from(2), contents);
let loc = index.line_column(TextSize::from(2), contents);
assert_eq!(
loc,
SourceLocation {
row: OneIndexed::from_zero_indexed(0),
LineColumn {
line: OneIndexed::from_zero_indexed(0),
column: OneIndexed::from_zero_indexed(2)
}
);
// Second row.
let loc = index.source_location(TextSize::from(6), contents);
let loc = index.line_column(TextSize::from(6), contents);
assert_eq!(
loc,
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(0)
}
);
let loc = index.source_location(TextSize::from(11), contents);
let loc = index.line_column(TextSize::from(11), contents);
assert_eq!(
loc,
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(5)
}
);
@ -502,23 +729,23 @@ mod tests {
assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(6)]);
assert_eq!(
index.source_location(TextSize::from(4), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(0),
index.line_column(TextSize::from(4), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(0),
column: OneIndexed::from_zero_indexed(4)
}
);
assert_eq!(
index.source_location(TextSize::from(6), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
index.line_column(TextSize::from(6), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(0)
}
);
assert_eq!(
index.source_location(TextSize::from(7), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
index.line_column(TextSize::from(7), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(1)
}
);
@ -531,23 +758,23 @@ mod tests {
assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(7)]);
assert_eq!(
index.source_location(TextSize::from(4), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(0),
index.line_column(TextSize::from(4), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(0),
column: OneIndexed::from_zero_indexed(4)
}
);
assert_eq!(
index.source_location(TextSize::from(7), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
index.line_column(TextSize::from(7), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(0)
}
);
assert_eq!(
index.source_location(TextSize::from(8), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
index.line_column(TextSize::from(8), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(1)
}
);
@ -598,23 +825,23 @@ mod tests {
// Second '
assert_eq!(
index.source_location(TextSize::from(9), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(0),
index.line_column(TextSize::from(9), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(0),
column: OneIndexed::from_zero_indexed(6)
}
);
assert_eq!(
index.source_location(TextSize::from(11), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
index.line_column(TextSize::from(11), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(0)
}
);
assert_eq!(
index.source_location(TextSize::from(12), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
index.line_column(TextSize::from(12), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(1)
}
);
@ -632,23 +859,23 @@ mod tests {
// Second '
assert_eq!(
index.source_location(TextSize::from(9), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(0),
index.line_column(TextSize::from(9), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(0),
column: OneIndexed::from_zero_indexed(6)
}
);
assert_eq!(
index.source_location(TextSize::from(12), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
index.line_column(TextSize::from(12), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(0)
}
);
assert_eq!(
index.source_location(TextSize::from(13), contents),
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
index.line_column(TextSize::from(13), contents),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(1)
}
);
@ -664,49 +891,49 @@ mod tests {
);
// First row.
let loc = index.source_location(TextSize::from(0), contents);
let loc = index.line_column(TextSize::from(0), contents);
assert_eq!(
loc,
SourceLocation {
row: OneIndexed::from_zero_indexed(0),
LineColumn {
line: OneIndexed::from_zero_indexed(0),
column: OneIndexed::from_zero_indexed(0)
}
);
let loc = index.source_location(TextSize::from(5), contents);
let loc = index.line_column(TextSize::from(5), contents);
assert_eq!(
loc,
SourceLocation {
row: OneIndexed::from_zero_indexed(0),
LineColumn {
line: OneIndexed::from_zero_indexed(0),
column: OneIndexed::from_zero_indexed(5)
}
);
let loc = index.source_location(TextSize::from(8), contents);
let loc = index.line_column(TextSize::from(8), contents);
assert_eq!(
loc,
SourceLocation {
row: OneIndexed::from_zero_indexed(0),
LineColumn {
line: OneIndexed::from_zero_indexed(0),
column: OneIndexed::from_zero_indexed(6)
}
);
// Second row.
let loc = index.source_location(TextSize::from(10), contents);
let loc = index.line_column(TextSize::from(10), contents);
assert_eq!(
loc,
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(0)
}
);
// One-past-the-end.
let loc = index.source_location(TextSize::from(15), contents);
let loc = index.line_column(TextSize::from(15), contents);
assert_eq!(
loc,
SourceLocation {
row: OneIndexed::from_zero_indexed(1),
LineColumn {
line: OneIndexed::from_zero_indexed(1),
column: OneIndexed::from_zero_indexed(5)
}
);