Add support for PEP 701 (#7376)

## Summary

This PR adds support for PEP 701 in Ruff. It is a rollup of several
individual PRs, which were split out to keep the logic separate and the
code reviews manageable. Refer to each pull request for a detailed
description of its change.

Refer to the PR description for the list of pull requests included in this rollup.
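
For context, PEP 701 lifts f-strings into the formal grammar: the lexer now emits dedicated `FStringStart`, `FStringMiddle`, and `FStringEnd` tokens instead of one opaque string token, which is what makes quote reuse and arbitrary nesting possible. A minimal sketch of observing the new token stream, assuming the crate's public `lex` entry point and `Mode` (the same ones used by the tests in the diff below):

```rust
// Sketch: print the tokens the new lexer produces for an f-string.
use ruff_python_parser::{lex, Mode};

fn main() {
    // Roughly: FStringStart, FStringMiddle ("hello "), Lbrace, Name,
    // Exclamation, Name, Colon, FStringMiddle (">10"), Rbrace, FStringEnd,
    // plus a trailing Newline.
    for result in lex(r#"f"hello {name!r:>10}""#, Mode::Module) {
        let (tok, range) = result.expect("valid f-string should lex");
        println!("{range:?}: {tok:?}");
    }
}
```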

## Test Plan

### Formatter ecosystem checks

An explanation for the change in the ecosystem check results:
https://github.com/astral-sh/ruff/pull/7597#issue-1908878183

#### `main`

```
| project      | similarity index  | total files       | changed files     |
|--------------|------------------:|------------------:|------------------:|
| cpython      |           0.76083 |              1789 |              1631 |
| django       |           0.99983 |              2760 |                36 |
| transformers |           0.99963 |              2587 |               319 |
| twine        |           1.00000 |                33 |                 0 |
| typeshed     |           0.99983 |              3496 |                18 |
| warehouse    |           0.99967 |               648 |                15 |
| zulip        |           0.99972 |              1437 |                21 |
```

#### `dhruv/pep-701`

```
| project      | similarity index  | total files       | changed files     |
|--------------|------------------:|------------------:|------------------:|
| cpython      |           0.76051 |              1789 |              1632 |
| django       |           0.99983 |              2760 |                36 |
| transformers |           0.99963 |              2587 |               319 |
| twine        |           1.00000 |                33 |                 0 |
| typeshed     |           0.99983 |              3496 |                18 |
| warehouse    |           0.99967 |               648 |                15 |
| zulip        |           0.99972 |              1437 |                21 |
```
Dhruv Manilawala committed on 2023-09-29 08:25:39 +05:30
commit e62e245c61 (parent 78b8741352)
115 changed files with 44780 additions and 31370 deletions

@@ -37,6 +37,7 @@ use ruff_python_ast::{Int, IpyEscapeKind};
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::lexer::cursor::{Cursor, EOF_CHAR};
use crate::lexer::fstring::{FStringContext, FStringContextFlags, FStrings};
use crate::lexer::indentation::{Indentation, Indentations};
use crate::{
soft_keywords::SoftKeywordTransformer,
@@ -46,6 +47,7 @@ use crate::{
};
mod cursor;
mod fstring;
mod indentation;
/// A lexer for Python source code.
@@ -62,6 +64,8 @@ pub struct Lexer<'source> {
pending_indentation: Option<Indentation>,
// Lexer mode.
mode: Mode,
// F-string contexts.
fstrings: FStrings,
}
/// Contains a Token along with its `range`.
@@ -154,6 +158,7 @@ impl<'source> Lexer<'source> {
source: input,
cursor: Cursor::new(input),
mode,
fstrings: FStrings::default(),
};
// TODO: Handle possible mismatch between BOM and explicit encoding declaration.
// spell-checker:ignore feff
@@ -165,16 +170,24 @@ impl<'source> Lexer<'source> {
/// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
fn lex_identifier(&mut self, first: char) -> Result<Tok, LexicalError> {
// Detect potential string like rb'' b'' f'' u'' r''
- match self.cursor.first() {
- quote @ ('\'' | '"') => {
+ match (first, self.cursor.first()) {
+ ('f' | 'F', quote @ ('\'' | '"')) => {
self.cursor.bump();
return Ok(self.lex_fstring_start(quote, false));
}
('r' | 'R', 'f' | 'F') | ('f' | 'F', 'r' | 'R') if is_quote(self.cursor.second()) => {
self.cursor.bump();
let quote = self.cursor.bump().unwrap();
return Ok(self.lex_fstring_start(quote, true));
}
(_, quote @ ('\'' | '"')) => {
if let Ok(string_kind) = StringKind::try_from(first) {
self.cursor.bump();
return self.lex_string(string_kind, quote);
}
}
- second @ ('f' | 'F' | 'r' | 'R' | 'b' | 'B') if is_quote(self.cursor.second()) => {
+ (_, second @ ('r' | 'R' | 'b' | 'B')) if is_quote(self.cursor.second()) => {
self.cursor.bump();
if let Ok(string_kind) = StringKind::try_from([first, second]) {
let quote = self.cursor.bump().unwrap();
return self.lex_string(string_kind, quote);
@@ -509,6 +522,148 @@ impl<'source> Lexer<'source> {
}
}
/// Lex an f-string start token.
fn lex_fstring_start(&mut self, quote: char, is_raw_string: bool) -> Tok {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), quote);
let mut flags = FStringContextFlags::empty();
if quote == '"' {
flags |= FStringContextFlags::DOUBLE;
}
if is_raw_string {
flags |= FStringContextFlags::RAW;
}
if self.cursor.eat_char2(quote, quote) {
flags |= FStringContextFlags::TRIPLE;
}
self.fstrings.push(FStringContext::new(flags, self.nesting));
Tok::FStringStart
}
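
The `FStringContext` and `FStringContextFlags` used above come from the new `lexer::fstring` module, which this diff only imports. A plausible sketch of their shape, assuming the `bitflags` crate; everything beyond the `DOUBLE`/`TRIPLE`/`RAW` constants and the `new` signature is an assumption:

```rust
// Hypothetical reconstruction of the per-f-string state pushed by
// `lex_fstring_start`; the real definitions are not part of this hunk.
bitflags::bitflags! {
    #[derive(Debug, Clone, Copy)]
    pub struct FStringContextFlags: u8 {
        /// The f-string uses double quotes (`"`) rather than single quotes.
        const DOUBLE = 1 << 0;
        /// The f-string is triple-quoted (`"""` or `'''`).
        const TRIPLE = 1 << 1;
        /// The f-string has a raw prefix (`rf`/`fr`), so escapes stay literal.
        const RAW = 1 << 2;
    }
}

pub struct FStringContext {
    flags: FStringContextFlags,
    /// The lexer's parenthesis nesting depth when the f-string started; used
    /// later to decide which `{`/`}`/`:` tokens belong to this f-string.
    nesting: u32,
}

impl FStringContext {
    pub fn new(flags: FStringContextFlags, nesting: u32) -> Self {
        Self { flags, nesting }
    }

    pub fn quote_char(&self) -> char {
        if self.flags.contains(FStringContextFlags::DOUBLE) {
            '"'
        } else {
            '\''
        }
    }

    pub fn is_triple_quoted(&self) -> bool {
        self.flags.contains(FStringContextFlags::TRIPLE)
    }
}

fn main() {
    let ctx = FStringContext::new(FStringContextFlags::DOUBLE, 0);
    assert_eq!(ctx.quote_char(), '"');
    assert!(!ctx.is_triple_quoted());
}
```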
/// Lex an f-string middle or end token.
fn lex_fstring_middle_or_end(&mut self) -> Result<Option<Tok>, LexicalError> {
// SAFETY: Safe because the function is only called when `self.fstrings` is not empty.
let fstring = self.fstrings.current().unwrap();
self.cursor.start_token();
// Check if we're at the end of the f-string.
if fstring.is_triple_quoted() {
let quote_char = fstring.quote_char();
if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
return Ok(Some(Tok::FStringEnd));
}
} else if self.cursor.eat_char(fstring.quote_char()) {
return Ok(Some(Tok::FStringEnd));
}
// We have to decode `{{` and `}}` into `{` and `}` respectively. As an
// optimization, we only allocate a new string if we find any escaped curly
// braces; otherwise, this string will remain empty and we'll use a source
// slice instead.
let mut normalized = String::new();
// Tracks the last offset of the token value that has been written to `normalized`.
let mut last_offset = self.offset();
let mut in_named_unicode = false;
loop {
match self.cursor.first() {
// The condition is to differentiate between the `NUL` (`\0`) character
// in the source code and the one returned by `self.cursor.first()` when
// we reach the end of the source code.
EOF_CHAR if self.cursor.is_eof() => {
let error = if fstring.is_triple_quoted() {
FStringErrorType::UnterminatedTripleQuotedString
} else {
FStringErrorType::UnterminatedString
};
return Err(LexicalError {
error: LexicalErrorType::FStringError(error),
location: self.offset(),
});
}
'\n' if !fstring.is_triple_quoted() => {
return Err(LexicalError {
error: LexicalErrorType::FStringError(FStringErrorType::UnterminatedString),
location: self.offset(),
});
}
'\\' => {
self.cursor.bump(); // '\'
if matches!(self.cursor.first(), '{' | '}') {
// Don't consume `{` or `}` as we want them to be emitted as tokens.
// They will be handled in the next iteration.
continue;
} else if !fstring.is_raw_string() {
if self.cursor.eat_char2('N', '{') {
in_named_unicode = true;
continue;
}
}
// Consume the escaped character.
self.cursor.bump();
}
quote @ ('\'' | '"') if quote == fstring.quote_char() => {
if let Some(triple_quotes) = fstring.triple_quotes() {
if self.cursor.rest().starts_with(triple_quotes) {
break;
}
self.cursor.bump();
} else {
break;
}
}
'{' => {
if self.cursor.second() == '{' {
self.cursor.bump();
normalized
.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
self.cursor.bump(); // Skip the second `{`
last_offset = self.offset();
} else {
break;
}
}
'}' => {
if in_named_unicode {
in_named_unicode = false;
self.cursor.bump();
} else if self.cursor.second() == '}'
&& !fstring.is_in_format_spec(self.nesting)
{
self.cursor.bump();
normalized
.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
self.cursor.bump(); // Skip the second `}`
last_offset = self.offset();
} else {
break;
}
}
_ => {
self.cursor.bump();
}
}
}
let range = self.token_range();
if range.is_empty() {
return Ok(None);
}
let value = if normalized.is_empty() {
self.source[range].to_string()
} else {
normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]);
normalized
};
Ok(Some(Tok::FStringMiddle {
value,
is_raw: fstring.is_raw_string(),
}))
}
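
The allocation strategy in `lex_fstring_middle_or_end` is worth calling out: doubled braces are rare, so the common path hands back a plain source slice. A standalone sketch of the same idea (a hypothetical helper, not part of this diff):

```rust
use std::borrow::Cow;

/// Hypothetical helper mirroring the strategy above: borrow the source slice
/// when no doubled braces occur; allocate only when we must rewrite `{{` and
/// `}}` down to `{` and `}`.
fn normalize_fstring_middle(raw: &str) -> Cow<'_, str> {
    if !raw.contains("{{") && !raw.contains("}}") {
        return Cow::Borrowed(raw);
    }
    let mut out = String::with_capacity(raw.len());
    let mut chars = raw.chars().peekable();
    while let Some(c) = chars.next() {
        out.push(c);
        if (c == '{' || c == '}') && chars.peek() == Some(&c) {
            chars.next(); // drop the second brace of the escaped pair
        }
    }
    Cow::Owned(out)
}

fn main() {
    assert!(matches!(normalize_fstring_middle("no escapes"), Cow::Borrowed(_)));
    assert_eq!(normalize_fstring_middle("{{a}}"), "{a}");
}
```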
/// Lex a string literal.
fn lex_string(&mut self, kind: StringKind, quote: char) -> Result<Tok, LexicalError> {
#[cfg(debug_assertions)]
@@ -530,6 +685,19 @@ impl<'source> Lexer<'source> {
}
}
Some('\r' | '\n') if !triple_quoted => {
if let Some(fstring) = self.fstrings.current() {
// When we are in an f-string, check whether the initial quote
// matches the f-string's quote; if it does, this must be a
// missing '}' token, so raise the proper error.
if fstring.quote_char() == quote && !fstring.is_triple_quoted() {
return Err(LexicalError {
error: LexicalErrorType::FStringError(
FStringErrorType::UnclosedLbrace,
),
location: self.offset() - fstring.quote_size(),
});
}
}
return Err(LexicalError {
error: LexicalErrorType::OtherError(
"EOL while scanning string literal".to_owned(),
@@ -549,6 +717,21 @@ impl<'source> Lexer<'source> {
Some(_) => {}
None => {
if let Some(fstring) = self.fstrings.current() {
// When we are in an f-string, check whether the initial quote
// matches the f-string's quote; if it does, this must be a
// missing '}' token, so raise the proper error.
if fstring.quote_char() == quote
&& fstring.is_triple_quoted() == triple_quoted
{
return Err(LexicalError {
error: LexicalErrorType::FStringError(
FStringErrorType::UnclosedLbrace,
),
location: self.offset() - fstring.quote_size(),
});
}
}
return Err(LexicalError {
error: if triple_quoted {
LexicalErrorType::Eof
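
Both string-lexing error paths above turn an unterminated string whose quote matches the enclosing f-string into a missing-`}` error, which is the more useful diagnostic. A hedged illustration, mirroring the `lex_fstring_error` helper in the tests below:

```rust
// Sketch: the inner `"` in `f"{"` is never closed, and because it matches
// the enclosing f-string's quote, the error points at the unclosed `{`.
use ruff_python_parser::{lex, Mode};

fn main() {
    let err = lex("f\"{\"\n", Mode::Module)
        .find_map(Result::err)
        .expect("expected a lexical error");
    // Expected: LexicalErrorType::FStringError(FStringErrorType::UnclosedLbrace)
    println!("{err:?}");
}
```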
@@ -572,8 +755,28 @@ impl<'source> Lexer<'source> {
// This is the main entry point. Call this function to retrieve the next token.
// This function is used by the iterator implementation.
pub fn next_token(&mut self) -> LexResult {
if let Some(fstring) = self.fstrings.current() {
if !fstring.is_in_expression(self.nesting) {
match self.lex_fstring_middle_or_end() {
Ok(Some(tok)) => {
if tok == Tok::FStringEnd {
self.fstrings.pop();
}
return Ok((tok, self.token_range()));
}
Err(e) => {
// This is to prevent an infinite loop in which the lexer
// continuously returns an error token because the f-string
// remains on the stack.
self.fstrings.pop();
return Err(e);
}
_ => {}
}
}
}
// Return dedent tokens until the current indentation level matches the indentation of the next token.
- if let Some(indentation) = self.pending_indentation.take() {
+ else if let Some(indentation) = self.pending_indentation.take() {
match self.indentations.current().try_compare(indentation) {
Ok(Ordering::Greater) => {
self.pending_indentation = Some(indentation);
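
`next_token` now consults a stack of f-string contexts before anything else, so nested f-strings resume correctly and an error pops the offending context instead of looping. The `FStrings` type is defined in the new `fstring` module; a sketch of its assumed shape (with a placeholder `FStringContext`):

```rust
// Assumed shape of the `FStrings` stack consulted at the top of `next_token`;
// `FStringContext` here stands in for the real per-f-string state.
struct FStringContext; // placeholder for quote/raw/triple flags and nesting

#[derive(Default)]
struct FStrings(Vec<FStringContext>);

impl FStrings {
    fn push(&mut self, context: FStringContext) {
        self.0.push(context);
    }
    // Popping on both `FStringEnd` and on error keeps the lexer from looping
    // forever on a context that can never make progress.
    fn pop(&mut self) -> Option<FStringContext> {
        self.0.pop()
    }
    fn current(&self) -> Option<&FStringContext> {
        self.0.last()
    }
    fn current_mut(&mut self) -> Option<&mut FStringContext> {
        self.0.last_mut()
    }
}

fn main() {
    let mut fstrings = FStrings::default();
    fstrings.push(FStringContext); // outer f-string
    fstrings.push(FStringContext); // nested f-string
    assert!(fstrings.current().is_some());
    fstrings.pop(); // inner FStringEnd: resume lexing the outer f-string
}
```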
@@ -894,10 +1097,7 @@ impl<'source> Lexer<'source> {
if self.cursor.eat_char('=') {
Tok::NotEqual
} else {
- return Err(LexicalError {
- error: LexicalErrorType::UnrecognizedToken { tok: '!' },
- location: self.token_start(),
- });
+ Tok::Exclamation
}
}
'~' => Tok::Tilde,
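
A bare `!` is no longer a lexing error: conversions such as `!s`/`!r`/`!a` now cross the lexer as real tokens, so `!` maps to the new `Tok::Exclamation` (while `!=` still lexes as `NotEqual`). A quick check, assuming `Tok` is re-exported alongside `lex`:

```rust
use ruff_python_parser::{lex, Mode, Tok};

fn main() {
    let tokens: Vec<Tok> = lex(r#"f"{x!r}""#, Mode::Module)
        .map(|result| result.map(|(tok, _)| tok))
        .collect::<Result<Vec<_>, _>>()
        .expect("should lex without error");
    // The conversion `!r` surfaces as Exclamation followed by a Name token.
    assert!(tokens.contains(&Tok::Exclamation));
}
```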
@@ -922,11 +1122,26 @@ impl<'source> Lexer<'source> {
Tok::Lbrace
}
'}' => {
if let Some(fstring) = self.fstrings.current_mut() {
if fstring.nesting() == self.nesting {
return Err(LexicalError {
error: LexicalErrorType::FStringError(FStringErrorType::SingleRbrace),
location: self.token_start(),
});
}
fstring.try_end_format_spec(self.nesting);
}
self.nesting = self.nesting.saturating_sub(1);
Tok::Rbrace
}
':' => {
- if self.cursor.eat_char('=') {
+ if self
+ .fstrings
+ .current_mut()
+ .is_some_and(|fstring| fstring.try_start_format_spec(self.nesting))
+ {
+ Tok::Colon
+ } else if self.cursor.eat_char('=') {
Tok::ColonEqual
} else {
Tok::Colon
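
The `:` arm defers to the current f-string context to decide whether a colon starts a format spec, so slice and lambda colons inside `{...}` are left untouched. A simplified sketch of the assumed bookkeeping (field names and method bodies are illustrative; the real logic lives in the `fstring` module):

```rust
// Illustrative only: a `:` starts a format spec when the lexer sits directly
// inside the interpolation's braces, i.e. exactly one nesting level above
// where the f-string began. In `f"{d[1:2]}"` the colon sits at depth +2
// (inside `[`), so the slice colon is left alone.
struct FStringContext {
    /// Nesting depth recorded when `FStringStart` was lexed.
    nesting: u32,
    /// Whether a `{...:spec}` format spec is currently open.
    in_format_spec: bool,
}

impl FStringContext {
    fn try_start_format_spec(&mut self, current_nesting: u32) -> bool {
        if !self.in_format_spec && current_nesting == self.nesting + 1 {
            self.in_format_spec = true;
            true
        } else {
            false
        }
    }

    fn try_end_format_spec(&mut self, current_nesting: u32) {
        // Called on `}` before the nesting level is decremented above.
        if self.in_format_spec && current_nesting == self.nesting + 1 {
            self.in_format_spec = false;
        }
    }
}

fn main() {
    let mut top = FStringContext { nesting: 0, in_format_spec: false };
    assert!(top.try_start_format_spec(1)); // `:` directly inside `{...}`

    let mut sliced = FStringContext { nesting: 0, in_format_spec: false };
    assert!(!sliced.try_start_format_spec(2)); // `:` inside `d[1:2]`
}
```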
@@ -1743,4 +1958,191 @@ def f(arg=%timeit a = b):
.collect();
assert_debug_snapshot!(tokens);
}
#[test]
fn test_empty_fstrings() {
let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_prefix() {
let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring() {
let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_parentheses() {
let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}} ""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_escape() {
let source = r#"f"\{x:\"\{x}} \"\"\
end""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_escape_braces() {
let source = r"f'\{foo}' f'\\{foo}' f'\{{foo}}' f'\\{{foo}}'";
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_escape_raw() {
let source = r#"rf"\{x:\"\{x}} \"\"\
end""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_named_unicode() {
let source = r#"f"\N{BULLET} normal \Nope \N""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_named_unicode_raw() {
let source = r#"rf"\N{BULLET} normal""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_named_expression() {
let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_format_spec() {
let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}}""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_conversion() {
let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_nested() {
let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_expression_multiline() {
let source = r#"f"first {
x
*
y
} second""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_multiline() {
let source = r#"f"""
hello
world
""" f'''
world
hello
''' f"some {f"""multiline
allowed {x}"""} string""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_comments() {
let source = r#"f"""
# not a comment { # comment {
x
} # not a comment
""""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_ipy_escape_command() {
let source = r#"f"foo {!pwd} bar""#;
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_lambda_expression() {
let source = r#"
f"{lambda x:{x}}"
f"{(lambda x:{x})}"
"#
.trim();
assert_debug_snapshot!(lex_source(source));
}
#[test]
fn test_fstring_with_nul_char() {
let source = r"f'\0'";
assert_debug_snapshot!(lex_source(source));
}
fn lex_fstring_error(source: &str) -> FStringErrorType {
match lex(source, Mode::Module).find_map(std::result::Result::err) {
Some(err) => match err.error {
LexicalErrorType::FStringError(error) => error,
_ => panic!("Expected FStringError: {err:?}"),
},
_ => panic!("Expected at least one FStringError"),
}
}
#[test]
fn test_fstring_error() {
use FStringErrorType::{
SingleRbrace, UnclosedLbrace, UnterminatedString, UnterminatedTripleQuotedString,
};
assert_eq!(lex_fstring_error("f'}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'foo}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\u007b}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{a:b}}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{3:}}>10}'"), SingleRbrace);
assert_eq!(lex_fstring_error(r"f'\{foo}\}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{'"), UnclosedLbrace);
assert_eq!(lex_fstring_error("f'{foo!r'"), UnclosedLbrace);
assert_eq!(lex_fstring_error("f'{foo='"), UnclosedLbrace);
assert_eq!(
lex_fstring_error(
r#"f"{"
"#
),
UnclosedLbrace
);
assert_eq!(lex_fstring_error(r#"f"""{""""#), UnclosedLbrace);
assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString);
assert_eq!(lex_fstring_error(r#"f'"#), UnterminatedString);
assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString);
assert_eq!(lex_fstring_error(r#"f'''"#), UnterminatedTripleQuotedString);
assert_eq!(
lex_fstring_error(r#"f"""""#),
UnterminatedTripleQuotedString
);
assert_eq!(
lex_fstring_error(r#"f""""""#),
UnterminatedTripleQuotedString
);
}
}