Remove parser dependency from ruff-python-ast (#6096)

This commit is contained in:
Micha Reiser 2023-07-26 17:47:22 +02:00 committed by GitHub
parent 99127243f4
commit 2cf00fee96
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
658 changed files with 1714 additions and 1546 deletions

View file

@ -0,0 +1,240 @@
use rustpython_ast::text_size::TextSize;
use rustpython_ast::{CmpOp, Expr, Mod, ModModule, Ranged, Suite};
use rustpython_parser as parser;
use rustpython_parser::lexer::LexResult;
use rustpython_parser::text_size::TextRange;
use rustpython_parser::{lexer, Mode, ParseError, Tok};
pub mod token_kind;
pub mod typing;
/// Collect tokens up to and including the first error.
///
/// The returned vector therefore contains only `Ok` results, except
/// possibly a single trailing `Err` at which lexing stopped.
pub fn tokenize(contents: &str) -> Vec<LexResult> {
    // `scan` carries a "have we already emitted an error?" flag: once an
    // error has been yielded, the next call returns `None`, which terminates
    // collection. This keeps the error itself in the output, matching the
    // contract above.
    lexer::lex(contents, Mode::Module)
        .scan(false, |seen_error, result| {
            if *seen_error {
                None
            } else {
                *seen_error = result.is_err();
                Some(result)
            }
        })
        .collect()
}
/// Parse a full Python program from its tokens.
///
/// Returns the module body as a [`Suite`], or a [`ParseError`] if the token
/// stream does not form a valid module.
pub fn parse_program_tokens(
    lxr: Vec<LexResult>,
    source_path: &str,
) -> Result<Suite, ParseError> {
    // `Mode::Module` guarantees the parser produces a `Mod::Module`; any
    // other variant is unreachable.
    parser::parse_tokens(lxr, Mode::Module, source_path).map(|top| match top {
        Mod::Module(ModModule { body, .. }) => body,
        _ => unreachable!("Mode::Module always produces a Mod::Module"),
    })
}
/// Return the [`TextRange`] of the first `Tok::Colon` token within `range`,
/// or `None` if no colon is lexed there.
///
/// Token ranges are absolute (relative to `source`), since lexing starts at
/// `range.start()`.
pub fn first_colon_range(range: TextRange, source: &str) -> Option<TextRange> {
    let contents = &source[range];
    // `flatten` discards lex errors; we only care about successfully lexed
    // tokens.
    for (tok, tok_range) in lexer::lex_starts_at(contents, Mode::Module, range.start()).flatten() {
        if tok.is_colon() {
            return Some(tok_range);
        }
    }
    None
}
/// Extract all [`CmpOp`] operators from an expression snippet, with appropriate
/// ranges.
///
/// `RustPython` doesn't include line and column information on [`CmpOp`] nodes.
/// `CPython` doesn't either. This method iterates over the token stream and
/// re-identifies [`CmpOp`] nodes, annotating them with valid ranges.
pub fn locate_cmp_ops(expr: &Expr, source: &str) -> Vec<LocatedCmpOp> {
// If `Expr` is a multi-line expression, we need to parenthesize it to
// ensure that it's lexed correctly.
let contents = &source[expr.range()];
let parenthesized_contents = format!("({contents})");
let mut tok_iter = lexer::lex(&parenthesized_contents, Mode::Expression)
.flatten()
.skip(1)
.map(|(tok, range)| (tok, range - TextSize::from(1)))
.filter(|(tok, _)| !matches!(tok, Tok::NonLogicalNewline | Tok::Comment(_)))
.peekable();
let mut ops: Vec<LocatedCmpOp> = vec![];
let mut count = 0u32;
loop {
let Some((tok, range)) = tok_iter.next() else {
break;
};
if matches!(tok, Tok::Lpar) {
count = count.saturating_add(1);
continue;
} else if matches!(tok, Tok::Rpar) {
count = count.saturating_sub(1);
continue;
}
if count == 0 {
match tok {
Tok::Not => {
if let Some((_, next_range)) =
tok_iter.next_if(|(tok, _)| matches!(tok, Tok::In))
{
ops.push(LocatedCmpOp::new(
TextRange::new(range.start(), next_range.end()),
CmpOp::NotIn,
));
}
}
Tok::In => {
ops.push(LocatedCmpOp::new(range, CmpOp::In));
}
Tok::Is => {
let op = if let Some((_, next_range)) =
tok_iter.next_if(|(tok, _)| matches!(tok, Tok::Not))
{
LocatedCmpOp::new(
TextRange::new(range.start(), next_range.end()),
CmpOp::IsNot,
)
} else {
LocatedCmpOp::new(range, CmpOp::Is)
};
ops.push(op);
}
Tok::NotEqual => {
ops.push(LocatedCmpOp::new(range, CmpOp::NotEq));
}
Tok::EqEqual => {
ops.push(LocatedCmpOp::new(range, CmpOp::Eq));
}
Tok::GreaterEqual => {
ops.push(LocatedCmpOp::new(range, CmpOp::GtE));
}
Tok::Greater => {
ops.push(LocatedCmpOp::new(range, CmpOp::Gt));
}
Tok::LessEqual => {
ops.push(LocatedCmpOp::new(range, CmpOp::LtE));
}
Tok::Less => {
ops.push(LocatedCmpOp::new(range, CmpOp::Lt));
}
_ => {}
}
}
}
ops
}
/// A [`CmpOp`] annotated with the [`TextRange`] at which it occurs in the
/// source, as recovered by [`locate_cmp_ops`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LocatedCmpOp {
    /// The range of the operator token(s) — for two-word operators such as
    /// `is not`, the range spans both words.
    pub range: TextRange,
    /// The comparison operator itself.
    pub op: CmpOp,
}

impl LocatedCmpOp {
    /// Construct a [`LocatedCmpOp`] from anything convertible into a
    /// [`TextRange`] (e.g. a `Range<TextSize>`).
    fn new<T: Into<TextRange>>(range: T, op: CmpOp) -> Self {
        Self {
            range: range.into(),
            op,
        }
    }
}
#[cfg(test)]
mod tests {
    use crate::{first_colon_range, locate_cmp_ops, LocatedCmpOp};
    use anyhow::Result;
    use ruff_text_size::TextSize;
    use rustpython_ast::text_size::{TextLen, TextRange};
    use rustpython_ast::CmpOp;
    use rustpython_ast::Expr;
    use rustpython_parser::Parse;

    #[test]
    fn extract_first_colon_range() {
        let contents = "with a: pass";
        let range = first_colon_range(
            TextRange::new(TextSize::from(0), contents.text_len()),
            contents,
        )
        .unwrap();
        assert_eq!(&contents[range], ":");
        assert_eq!(range, TextRange::new(TextSize::from(6), TextSize::from(7)));
    }

    #[test]
    fn extract_cmp_op_location() -> Result<()> {
        // Assert that `contents` contains exactly one comparison operator,
        // `op`, spanning byte offsets `start..end`.
        fn check(contents: &str, start: u32, end: u32, op: CmpOp) -> Result<()> {
            let expr = Expr::parse(contents, "<filename>")?;
            assert_eq!(
                locate_cmp_ops(&expr, contents),
                vec![LocatedCmpOp::new(
                    TextSize::from(start)..TextSize::from(end),
                    op
                )]
            );
            Ok(())
        }

        check("x == 1", 2, 4, CmpOp::Eq)?;
        check("x != 1", 2, 4, CmpOp::NotEq)?;
        check("x is 1", 2, 4, CmpOp::Is)?;
        check("x is not 1", 2, 8, CmpOp::IsNot)?;
        check("x in 1", 2, 4, CmpOp::In)?;
        check("x not in 1", 2, 8, CmpOp::NotIn)?;
        // Nested comparisons inside parentheses are ignored: only the
        // top-level `!=` is reported.
        check("x != (1 is not 2)", 2, 4, CmpOp::NotEq)?;
        Ok(())
    }
}

View file

@ -0,0 +1,448 @@
use rustpython_parser::Tok;
// TODO move to ruff_python_parser?
/// A lightweight, payload-free mirror of [`Tok`](rustpython_parser::Tok).
///
/// Unlike `Tok`, every variant here is `Copy` and carries no owned data
/// (names, strings, numbers are dropped), which makes `TokenKind` cheap to
/// store and compare. Convert from a `Tok` via [`TokenKind::from_token`] or
/// the `From<&Tok>` impl.
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub enum TokenKind {
    /// Token value for a name, commonly known as an identifier.
    Name,
    /// Token value for an integer.
    Int,
    /// Token value for a floating point number.
    Float,
    /// Token value for a complex number.
    Complex,
    /// Token value for a string.
    String,
    /// Token value for a Jupyter magic command.
    MagicCommand,
    /// Token value for a comment. These are filtered out of the token stream prior to parsing.
    Comment,
    /// Token value for a newline.
    Newline,
    /// Token value for a newline that is not a logical line break. These are filtered out of
    /// the token stream prior to parsing.
    NonLogicalNewline,
    /// Token value for an indent.
    Indent,
    /// Token value for a dedent.
    Dedent,
    /// Token value marking the end of the input.
    EndOfFile,
    /// Token value for a left parenthesis `(`.
    Lpar,
    /// Token value for a right parenthesis `)`.
    Rpar,
    /// Token value for a left square bracket `[`.
    Lsqb,
    /// Token value for a right square bracket `]`.
    Rsqb,
    /// Token value for a colon `:`.
    Colon,
    /// Token value for a comma `,`.
    Comma,
    /// Token value for a semicolon `;`.
    Semi,
    /// Token value for plus `+`.
    Plus,
    /// Token value for minus `-`.
    Minus,
    /// Token value for star `*`.
    Star,
    /// Token value for slash `/`.
    Slash,
    /// Token value for vertical bar `|`.
    Vbar,
    /// Token value for ampersand `&`.
    Amper,
    /// Token value for less than `<`.
    Less,
    /// Token value for greater than `>`.
    Greater,
    /// Token value for equal `=`.
    Equal,
    /// Token value for dot `.`.
    Dot,
    /// Token value for percent `%`.
    Percent,
    /// Token value for left bracket `{`.
    Lbrace,
    /// Token value for right bracket `}`.
    Rbrace,
    /// Token value for double equal `==`.
    EqEqual,
    /// Token value for not equal `!=`.
    NotEqual,
    /// Token value for less than or equal `<=`.
    LessEqual,
    /// Token value for greater than or equal `>=`.
    GreaterEqual,
    /// Token value for tilde `~`.
    Tilde,
    /// Token value for caret `^`.
    CircumFlex,
    /// Token value for left shift `<<`.
    LeftShift,
    /// Token value for right shift `>>`.
    RightShift,
    /// Token value for double star `**`.
    DoubleStar,
    /// Token value for double star equal `**=`.
    DoubleStarEqual,
    /// Token value for plus equal `+=`.
    PlusEqual,
    /// Token value for minus equal `-=`.
    MinusEqual,
    /// Token value for star equal `*=`.
    StarEqual,
    /// Token value for slash equal `/=`.
    SlashEqual,
    /// Token value for percent equal `%=`.
    PercentEqual,
    /// Token value for ampersand equal `&=`.
    AmperEqual,
    /// Token value for vertical bar equal `|=`.
    VbarEqual,
    /// Token value for caret equal `^=`.
    CircumflexEqual,
    /// Token value for left shift equal `<<=`.
    LeftShiftEqual,
    /// Token value for right shift equal `>>=`.
    RightShiftEqual,
    /// Token value for double slash `//`.
    DoubleSlash,
    /// Token value for double slash equal `//=`.
    DoubleSlashEqual,
    /// Token value for colon equal `:=`.
    ColonEqual,
    /// Token value for at `@`.
    At,
    /// Token value for at equal `@=`.
    AtEqual,
    /// Token value for arrow `->`.
    Rarrow,
    /// Token value for ellipsis `...`.
    Ellipsis,
    // Self documenting.
    // Keywords (alphabetically):
    False,
    None,
    True,
    And,
    As,
    Assert,
    Async,
    Await,
    Break,
    Class,
    Continue,
    Def,
    Del,
    Elif,
    Else,
    Except,
    Finally,
    For,
    From,
    Global,
    If,
    Import,
    In,
    Is,
    Lambda,
    Nonlocal,
    Not,
    Or,
    Pass,
    Raise,
    Return,
    Try,
    While,
    /// Soft keyword `match` (PEP 634): only a keyword at the start of a
    /// `match` statement.
    Match,
    /// Soft keyword `type` (PEP 695): only a keyword at the start of a
    /// `type` alias statement.
    Type,
    /// Soft keyword `case` (PEP 634): only a keyword within a `match` block.
    Case,
    With,
    Yield,
    // RustPython specific.
    StartModule,
    StartInteractive,
    StartExpression,
}
impl TokenKind {
#[inline]
pub const fn is_newline(&self) -> bool {
matches!(self, TokenKind::Newline | TokenKind::NonLogicalNewline)
}
#[inline]
pub const fn is_unary(&self) -> bool {
matches!(self, TokenKind::Plus | TokenKind::Minus)
}
#[inline]
pub const fn is_keyword(&self) -> bool {
matches!(
self,
TokenKind::False
| TokenKind::True
| TokenKind::None
| TokenKind::And
| TokenKind::As
| TokenKind::Assert
| TokenKind::Await
| TokenKind::Break
| TokenKind::Class
| TokenKind::Continue
| TokenKind::Def
| TokenKind::Del
| TokenKind::Elif
| TokenKind::Else
| TokenKind::Except
| TokenKind::Finally
| TokenKind::For
| TokenKind::From
| TokenKind::Global
| TokenKind::If
| TokenKind::Import
| TokenKind::In
| TokenKind::Is
| TokenKind::Lambda
| TokenKind::Nonlocal
| TokenKind::Not
| TokenKind::Or
| TokenKind::Pass
| TokenKind::Raise
| TokenKind::Return
| TokenKind::Try
| TokenKind::While
| TokenKind::With
| TokenKind::Yield
)
}
#[inline]
pub const fn is_operator(&self) -> bool {
matches!(
self,
TokenKind::Lpar
| TokenKind::Rpar
| TokenKind::Lsqb
| TokenKind::Rsqb
| TokenKind::Comma
| TokenKind::Semi
| TokenKind::Plus
| TokenKind::Minus
| TokenKind::Star
| TokenKind::Slash
| TokenKind::Vbar
| TokenKind::Amper
| TokenKind::Less
| TokenKind::Greater
| TokenKind::Equal
| TokenKind::Dot
| TokenKind::Percent
| TokenKind::Lbrace
| TokenKind::Rbrace
| TokenKind::EqEqual
| TokenKind::NotEqual
| TokenKind::LessEqual
| TokenKind::GreaterEqual
| TokenKind::Tilde
| TokenKind::CircumFlex
| TokenKind::LeftShift
| TokenKind::RightShift
| TokenKind::DoubleStar
| TokenKind::PlusEqual
| TokenKind::MinusEqual
| TokenKind::StarEqual
| TokenKind::SlashEqual
| TokenKind::PercentEqual
| TokenKind::AmperEqual
| TokenKind::VbarEqual
| TokenKind::CircumflexEqual
| TokenKind::LeftShiftEqual
| TokenKind::RightShiftEqual
| TokenKind::DoubleStarEqual
| TokenKind::DoubleSlash
| TokenKind::DoubleSlashEqual
| TokenKind::At
| TokenKind::AtEqual
| TokenKind::Rarrow
| TokenKind::Ellipsis
| TokenKind::ColonEqual
| TokenKind::Colon
| TokenKind::And
| TokenKind::Or
| TokenKind::Not
| TokenKind::In
| TokenKind::Is
)
}
#[inline]
pub const fn is_singleton(&self) -> bool {
matches!(self, TokenKind::False | TokenKind::True | TokenKind::None)
}
#[inline]
pub const fn is_trivia(&self) -> bool {
matches!(
self,
TokenKind::Newline
| TokenKind::Indent
| TokenKind::Dedent
| TokenKind::NonLogicalNewline
| TokenKind::Comment
)
}
#[inline]
pub const fn is_arithmetic(&self) -> bool {
matches!(
self,
TokenKind::DoubleStar
| TokenKind::Star
| TokenKind::Plus
| TokenKind::Minus
| TokenKind::Slash
| TokenKind::DoubleSlash
| TokenKind::At
)
}
#[inline]
pub const fn is_bitwise_or_shift(&self) -> bool {
matches!(
self,
TokenKind::LeftShift
| TokenKind::LeftShiftEqual
| TokenKind::RightShift
| TokenKind::RightShiftEqual
| TokenKind::Amper
| TokenKind::AmperEqual
| TokenKind::Vbar
| TokenKind::VbarEqual
| TokenKind::CircumFlex
| TokenKind::CircumflexEqual
| TokenKind::Tilde
)
}
#[inline]
pub const fn is_soft_keyword(&self) -> bool {
matches!(self, TokenKind::Match | TokenKind::Case)
}
pub const fn from_token(token: &Tok) -> Self {
match token {
Tok::Name { .. } => TokenKind::Name,
Tok::Int { .. } => TokenKind::Int,
Tok::Float { .. } => TokenKind::Float,
Tok::Complex { .. } => TokenKind::Complex,
Tok::String { .. } => TokenKind::String,
Tok::MagicCommand { .. } => TokenKind::MagicCommand,
Tok::Comment(_) => TokenKind::Comment,
Tok::Newline => TokenKind::Newline,
Tok::NonLogicalNewline => TokenKind::NonLogicalNewline,
Tok::Indent => TokenKind::Indent,
Tok::Dedent => TokenKind::Dedent,
Tok::EndOfFile => TokenKind::EndOfFile,
Tok::Lpar => TokenKind::Lpar,
Tok::Rpar => TokenKind::Rpar,
Tok::Lsqb => TokenKind::Lsqb,
Tok::Rsqb => TokenKind::Rsqb,
Tok::Colon => TokenKind::Colon,
Tok::Comma => TokenKind::Comma,
Tok::Semi => TokenKind::Semi,
Tok::Plus => TokenKind::Plus,
Tok::Minus => TokenKind::Minus,
Tok::Star => TokenKind::Star,
Tok::Slash => TokenKind::Slash,
Tok::Vbar => TokenKind::Vbar,
Tok::Amper => TokenKind::Amper,
Tok::Less => TokenKind::Less,
Tok::Greater => TokenKind::Greater,
Tok::Equal => TokenKind::Equal,
Tok::Dot => TokenKind::Dot,
Tok::Percent => TokenKind::Percent,
Tok::Lbrace => TokenKind::Lbrace,
Tok::Rbrace => TokenKind::Rbrace,
Tok::EqEqual => TokenKind::EqEqual,
Tok::NotEqual => TokenKind::NotEqual,
Tok::LessEqual => TokenKind::LessEqual,
Tok::GreaterEqual => TokenKind::GreaterEqual,
Tok::Tilde => TokenKind::Tilde,
Tok::CircumFlex => TokenKind::CircumFlex,
Tok::LeftShift => TokenKind::LeftShift,
Tok::RightShift => TokenKind::RightShift,
Tok::DoubleStar => TokenKind::DoubleStar,
Tok::DoubleStarEqual => TokenKind::DoubleStarEqual,
Tok::PlusEqual => TokenKind::PlusEqual,
Tok::MinusEqual => TokenKind::MinusEqual,
Tok::StarEqual => TokenKind::StarEqual,
Tok::SlashEqual => TokenKind::SlashEqual,
Tok::PercentEqual => TokenKind::PercentEqual,
Tok::AmperEqual => TokenKind::AmperEqual,
Tok::VbarEqual => TokenKind::VbarEqual,
Tok::CircumflexEqual => TokenKind::CircumflexEqual,
Tok::LeftShiftEqual => TokenKind::LeftShiftEqual,
Tok::RightShiftEqual => TokenKind::RightShiftEqual,
Tok::DoubleSlash => TokenKind::DoubleSlash,
Tok::DoubleSlashEqual => TokenKind::DoubleSlashEqual,
Tok::ColonEqual => TokenKind::ColonEqual,
Tok::At => TokenKind::At,
Tok::AtEqual => TokenKind::AtEqual,
Tok::Rarrow => TokenKind::Rarrow,
Tok::Ellipsis => TokenKind::Ellipsis,
Tok::False => TokenKind::False,
Tok::None => TokenKind::None,
Tok::True => TokenKind::True,
Tok::And => TokenKind::And,
Tok::As => TokenKind::As,
Tok::Assert => TokenKind::Assert,
Tok::Async => TokenKind::Async,
Tok::Await => TokenKind::Await,
Tok::Break => TokenKind::Break,
Tok::Class => TokenKind::Class,
Tok::Continue => TokenKind::Continue,
Tok::Def => TokenKind::Def,
Tok::Del => TokenKind::Del,
Tok::Elif => TokenKind::Elif,
Tok::Else => TokenKind::Else,
Tok::Except => TokenKind::Except,
Tok::Finally => TokenKind::Finally,
Tok::For => TokenKind::For,
Tok::From => TokenKind::From,
Tok::Global => TokenKind::Global,
Tok::If => TokenKind::If,
Tok::Import => TokenKind::Import,
Tok::In => TokenKind::In,
Tok::Is => TokenKind::Is,
Tok::Lambda => TokenKind::Lambda,
Tok::Nonlocal => TokenKind::Nonlocal,
Tok::Not => TokenKind::Not,
Tok::Or => TokenKind::Or,
Tok::Pass => TokenKind::Pass,
Tok::Raise => TokenKind::Raise,
Tok::Return => TokenKind::Return,
Tok::Try => TokenKind::Try,
Tok::While => TokenKind::While,
Tok::Match => TokenKind::Match,
Tok::Case => TokenKind::Case,
Tok::Type => TokenKind::Type,
Tok::With => TokenKind::With,
Tok::Yield => TokenKind::Yield,
Tok::StartModule => TokenKind::StartModule,
Tok::StartInteractive => TokenKind::StartInteractive,
Tok::StartExpression => TokenKind::StartExpression,
}
}
}
/// Delegates to [`TokenKind::from_token`], enabling `TokenKind::from(&tok)`
/// and `(&tok).into()` conversions from borrowed tokens.
impl From<&Tok> for TokenKind {
    fn from(value: &Tok) -> Self {
        Self::from_token(value)
    }
}

View file

@ -0,0 +1,48 @@
use anyhow::Result;
use ruff_python_ast::relocate::relocate_expr;
use ruff_python_ast::str;
use ruff_text_size::{TextLen, TextRange};
use rustpython_ast::Expr;
use rustpython_parser::Parse;
/// Whether a string-literal type annotation could be located precisely.
// `Debug`/`PartialEq`/`Eq` added per convention for small public enums;
// `Copy`/`Clone` were already derived, so this is backward compatible.
#[derive(is_macro::Is, Copy, Clone, Debug, PartialEq, Eq)]
pub enum AnnotationKind {
    /// The annotation is defined as part of a simple string literal,
    /// e.g. `x: "List[int]" = []`. Annotations within simple literals
    /// can be accurately located. For example, we can underline specific
    /// expressions within the annotation and apply automatic fixes, which is
    /// not possible for complex string literals.
    Simple,
    /// The annotation is defined as part of a complex string literal, such as
    /// a literal containing an implicit concatenation or escaped characters,
    /// e.g. `x: "List" "[int]" = []`. These are comparatively rare, but valid.
    Complex,
}
/// Parse a type annotation from a string.
///
/// `value` is the contents of the string literal, while `range` is the span
/// of the whole literal within `source`. Returns the parsed expression
/// together with the [`AnnotationKind`] describing how precisely its ranges
/// map back to `source`.
pub fn parse_type_annotation(
    value: &str,
    range: TextRange,
    source: &str,
) -> Result<(Expr, AnnotationKind)> {
    let expression = &source[range];

    // The annotation is considered "simple" if and only if the raw
    // representation (e.g., `List[int]` within `"List[int]"`) exactly matches
    // the parsed representation. This isn't the case, e.g., for implicit
    // concatenations, or for annotations that contain escaped quotes.
    if str::raw_contents(expression) == Some(value) {
        // Parse at an offset just past the opening quote, so that the ranges
        // on the resulting expression point directly into `source`.
        let leading_quote = str::leading_quote(expression).unwrap();
        let start = range.start() + leading_quote.text_len();
        let expr = Expr::parse_starts_at(value, "<filename>", start)?;
        Ok((expr, AnnotationKind::Simple))
    } else {
        // Otherwise, consider this a "complex" annotation: parse it at offset
        // zero and relocate the resulting ranges onto the literal's span.
        let mut parsed = Expr::parse(value, "<filename>")?;
        relocate_expr(&mut parsed, range);
        Ok((parsed, AnnotationKind::Complex))
    }
}