Replace row/column based Location with byte-offsets. (#3931)

This commit is contained in:
Micha Reiser 2023-04-26 20:11:02 +02:00 committed by GitHub
parent ee91598835
commit cab65b25da
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
418 changed files with 6203 additions and 7040 deletions

View file

@ -1,98 +1,135 @@
//! Struct used to index source code, to enable efficient lookup of tokens that
//! are omitted from the AST (e.g., commented lines).
use rustpython_parser::ast::Location;
use crate::source_code::Locator;
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::lexer::LexResult;
use rustpython_parser::Tok;
use crate::types::Range;
pub struct Indexer {
commented_lines: Vec<usize>,
continuation_lines: Vec<usize>,
string_ranges: Vec<Range>,
/// Stores the ranges of comments sorted by [`TextRange::start`] in increasing order. No two ranges are overlapping.
comment_ranges: Vec<TextRange>,
/// Stores the start offset of continuation lines.
continuation_lines: Vec<TextSize>,
/// The range of all triple quoted strings in the source document. The ranges are sorted by their
/// [`TextRange::start`] position in increasing order. No two ranges are overlapping.
triple_quoted_string_ranges: Vec<TextRange>,
}
impl Indexer {
/// Return a slice of all lines that include a comment.
pub fn commented_lines(&self) -> &[usize] {
&self.commented_lines
}
pub fn from_tokens(tokens: &[LexResult], locator: &Locator) -> Self {
assert!(TextSize::try_from(locator.contents().len()).is_ok());
/// Return a slice of all lines that end with a continuation (backslash).
pub fn continuation_lines(&self) -> &[usize] {
&self.continuation_lines
}
/// Return a slice of all ranges that include a triple-quoted string.
pub fn string_ranges(&self) -> &[Range] {
&self.string_ranges
}
}
impl From<&[LexResult]> for Indexer {
fn from(lxr: &[LexResult]) -> Self {
let mut commented_lines = Vec::new();
let mut continuation_lines = Vec::new();
let mut string_ranges = Vec::new();
let mut prev: Option<(&Location, &Tok, &Location)> = None;
for (start, tok, end) in lxr.iter().flatten() {
// Track the previous token and the offset at which it ended.
let mut prev_end = TextSize::default();
let mut prev_token: Option<&Tok> = None;
let mut line_start = TextSize::default();
for (tok, range) in tokens.iter().flatten() {
let trivia = &locator.contents()[TextRange::new(prev_end, range.start())];
// Get the trivia between the previous and the current token and detect any newlines.
// This is necessary because `RustPython` doesn't emit [`Tok::Newline`] tokens
// between any two tokens that form a continuation nor multiple newlines in a row.
// That's why we have to extract the newlines "manually".
for (index, text) in trivia.match_indices(['\n', '\r']) {
if text == "\r" && trivia.as_bytes().get(index + 1) == Some(&b'\n') {
continue;
}
// Newlines after a comment or new-line never form a continuation.
if !matches!(
prev_token,
Some(Tok::Newline | Tok::NonLogicalNewline | Tok::Comment(..)) | None
) {
continuation_lines.push(line_start);
}
// SAFETY: Safe because of the len assertion at the top of the function.
#[allow(clippy::cast_possible_truncation)]
{
line_start = prev_end + TextSize::new((index + 1) as u32);
}
}
match tok {
Tok::Comment(..) => commented_lines.push(start.row()),
Tok::Comment(..) => {
commented_lines.push(*range);
}
Tok::Newline | Tok::NonLogicalNewline => {
line_start = range.end();
}
Tok::String {
triple_quoted: true,
..
} => string_ranges.push(Range::new(*start, *end)),
_ => (),
} => string_ranges.push(*range),
_ => {}
}
if let Some((.., prev_tok, prev_end)) = prev {
if !matches!(
prev_tok,
Tok::Newline | Tok::NonLogicalNewline | Tok::Comment(..)
) {
for line in prev_end.row()..start.row() {
continuation_lines.push(line);
}
}
}
prev = Some((start, tok, end));
prev_token = Some(tok);
prev_end = range.end();
}
Self {
commented_lines,
comment_ranges: commented_lines,
continuation_lines,
string_ranges,
triple_quoted_string_ranges: string_ranges,
}
}
/// Returns the byte-offset ranges of the comments recorded in this index.
///
/// The ranges are stored sorted by [`TextRange::start`] in increasing order and do not overlap.
pub fn comment_ranges(&self) -> &[TextRange] {
&self.comment_ranges
}
/// Returns the start offset of every line that ends with a continuation (backslash).
///
/// Each entry is the byte offset of the first character of such a line, not the offset of the
/// backslash itself.
///
/// NOTE(review): the offsets appear to be recorded in increasing order while the tokens are
/// scanned front to back; `is_continuation` binary-searches this list and depends on that.
pub fn continuation_line_starts(&self) -> &[TextSize] {
&self.continuation_lines
}
/// Returns the byte-offset ranges of all triple-quoted strings in the source document.
///
/// The ranges are sorted by [`TextRange::start`] in increasing order. No two ranges are
/// overlapping.
pub fn triple_quoted_string_ranges(&self) -> &[TextRange] {
&self.triple_quoted_string_ranges
}
/// Returns `true` if `offset` resolves to a line that is recorded as a continuation line.
///
/// `offset` is first mapped to a line start via `locator.line_start`, and that start is then
/// looked up among the stored continuation line starts.
///
/// NOTE(review): presumably `locator` must be built from the same source the index was created
/// from, otherwise the computed line start cannot match the stored offsets — confirm with callers.
pub fn is_continuation(&self, offset: TextSize, locator: &Locator) -> bool {
let line_start = locator.line_start(offset);
// `binary_search` only gives a meaningful answer on a sorted slice, so this relies on the
// continuation line starts being stored in increasing order.
self.continuation_lines.binary_search(&line_start).is_ok()
}
}
#[cfg(test)]
mod tests {
use rustpython_parser::ast::Location;
use ruff_text_size::{TextRange, TextSize};
use rustpython_parser::lexer::LexResult;
use rustpython_parser::{lexer, Mode};
use crate::source_code::Indexer;
use crate::types::Range;
use crate::source_code::{Indexer, Locator};
#[test]
fn continuation() {
let contents = r#"x = 1"#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer: Indexer = lxr.as_slice().into();
assert_eq!(indexer.continuation_lines(), Vec::<usize>::new().as_slice());
let indexer = Indexer::from_tokens(&lxr, &Locator::new(contents));
assert_eq!(indexer.continuation_line_starts(), &[]);
let contents = r#"
# Hello, world!
# Hello, world!
x = 1
y = 2
"#
"#
.trim();
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer: Indexer = lxr.as_slice().into();
assert_eq!(indexer.continuation_lines(), Vec::<usize>::new().as_slice());
let indexer = Indexer::from_tokens(&lxr, &Locator::new(contents));
assert_eq!(indexer.continuation_line_starts(), &[]);
let contents = r#"
x = \
@ -111,8 +148,20 @@ if True:
"#
.trim();
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer: Indexer = lxr.as_slice().into();
assert_eq!(indexer.continuation_lines(), [1, 5, 6, 11]);
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.continuation_line_starts(),
[
// row 1
TextSize::from(0),
// row 5
TextSize::from(22),
// row 6
TextSize::from(32),
// row 11
TextSize::from(71),
]
);
let contents = r#"
x = 1; import sys
@ -131,16 +180,24 @@ import os
"#
.trim();
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer: Indexer = lxr.as_slice().into();
assert_eq!(indexer.continuation_lines(), [9, 12]);
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.continuation_line_starts(),
[
// row 9
TextSize::from(84),
// row 12
TextSize::from(116)
]
);
}
#[test]
fn string_ranges() {
let contents = r#""this is a single-quoted string""#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer: Indexer = lxr.as_slice().into();
assert_eq!(indexer.string_ranges(), &vec![]);
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(indexer.triple_quoted_string_ranges(), []);
let contents = r#"
"""
@ -148,10 +205,10 @@ import os
"""
"#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer: Indexer = lxr.as_slice().into();
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.string_ranges(),
&vec![Range::new(Location::new(2, 12), Location::new(4, 15))]
indexer.triple_quoted_string_ranges(),
[TextRange::new(TextSize::from(13), TextSize::from(71))]
);
let contents = r#"
@ -160,10 +217,10 @@ import os
"""
"#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer: Indexer = lxr.as_slice().into();
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.string_ranges(),
&vec![Range::new(Location::new(2, 12), Location::new(4, 15))]
indexer.triple_quoted_string_ranges(),
[TextRange::new(TextSize::from(13), TextSize::from(107))]
);
let contents = r#"
@ -177,12 +234,12 @@ import os
"""
"#;
let lxr: Vec<LexResult> = lexer::lex(contents, Mode::Module).collect();
let indexer: Indexer = lxr.as_slice().into();
let indexer = Indexer::from_tokens(lxr.as_slice(), &Locator::new(contents));
assert_eq!(
indexer.string_ranges(),
&vec![
Range::new(Location::new(2, 12), Location::new(5, 15)),
Range::new(Location::new(6, 12), Location::new(9, 15))
indexer.triple_quoted_string_ranges(),
&[
TextRange::new(TextSize::from(13), TextSize::from(85)),
TextRange::new(TextSize::from(98), TextSize::from(161))
]
);
}