Implement template strings (#17851)

This PR implements template strings (t-strings) in the parser and
formatter for Ruff.

Minimal changes necessary to compile were made in other parts of the code (e.g. ty, the linter, etc.). These will be covered properly in follow-up PRs.
This commit is contained in:
Dylan 2025-05-30 15:00:56 -05:00 committed by GitHub
parent ad024f9a09
commit 9bbf4987e8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
261 changed files with 18023 additions and 1802 deletions

View file

@ -18,15 +18,17 @@ use ruff_python_trivia::is_python_whitespace;
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::Mode;
use crate::error::{FStringErrorType, LexicalError, LexicalErrorType};
use crate::error::{InterpolatedStringErrorType, LexicalError, LexicalErrorType};
use crate::lexer::cursor::{Cursor, EOF_CHAR};
use crate::lexer::fstring::{FStringContext, FStrings, FStringsCheckpoint};
use crate::lexer::indentation::{Indentation, Indentations, IndentationsCheckpoint};
use crate::lexer::interpolated_string::{
InterpolatedStringContext, InterpolatedStrings, InterpolatedStringsCheckpoint,
};
use crate::token::{TokenFlags, TokenKind, TokenValue};
mod cursor;
mod fstring;
mod indentation;
mod interpolated_string;
const BOM: char = '\u{feff}';
@ -65,8 +67,8 @@ pub struct Lexer<'src> {
/// Lexer mode.
mode: Mode,
/// F-string contexts.
fstrings: FStrings,
/// F-string and t-string contexts.
interpolated_strings: InterpolatedStrings,
/// Errors encountered while lexing.
errors: Vec<LexicalError>,
@ -102,7 +104,7 @@ impl<'src> Lexer<'src> {
indentations: Indentations::default(),
pending_indentation: None,
mode,
fstrings: FStrings::default(),
interpolated_strings: InterpolatedStrings::default(),
errors: Vec::new(),
};
@ -162,11 +164,11 @@ impl<'src> Lexer<'src> {
}
fn lex_token(&mut self) -> TokenKind {
if let Some(fstring) = self.fstrings.current() {
if !fstring.is_in_expression(self.nesting) {
if let Some(token) = self.lex_fstring_middle_or_end() {
if matches!(token, TokenKind::FStringEnd) {
self.fstrings.pop();
if let Some(interpolated_string) = self.interpolated_strings.current() {
if !interpolated_string.is_in_interpolation(self.nesting) {
if let Some(token) = self.lex_interpolated_string_middle_or_end() {
if token.is_interpolated_string_end() {
self.interpolated_strings.pop();
}
return token;
}
@ -506,23 +508,26 @@ impl<'src> Lexer<'src> {
TokenKind::Lbrace
}
'}' => {
if let Some(fstring) = self.fstrings.current_mut() {
if fstring.nesting() == self.nesting {
return self.push_error(LexicalError::new(
LexicalErrorType::FStringError(FStringErrorType::SingleRbrace),
self.token_range(),
));
if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
if interpolated_string.nesting() == self.nesting {
let error_type = LexicalErrorType::from_interpolated_string_error(
InterpolatedStringErrorType::SingleRbrace,
interpolated_string.kind(),
);
return self.push_error(LexicalError::new(error_type, self.token_range()));
}
fstring.try_end_format_spec(self.nesting);
interpolated_string.try_end_format_spec(self.nesting);
}
self.nesting = self.nesting.saturating_sub(1);
TokenKind::Rbrace
}
':' => {
if self
.fstrings
.interpolated_strings
.current_mut()
.is_some_and(|fstring| fstring.try_start_format_spec(self.nesting))
.is_some_and(|interpolated_string| {
interpolated_string.try_start_format_spec(self.nesting)
})
{
TokenKind::Colon
} else if self.cursor.eat_char('=') {
@ -573,8 +578,8 @@ impl<'src> Lexer<'src> {
self.state = State::AfterNewline;
TokenKind::Newline
} else {
if let Some(fstring) = self.fstrings.current_mut() {
fstring.try_end_format_spec(self.nesting);
if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
interpolated_string.try_end_format_spec(self.nesting);
}
TokenKind::NonLogicalNewline
};
@ -586,8 +591,8 @@ impl<'src> Lexer<'src> {
self.state = State::AfterNewline;
TokenKind::Newline
} else {
if let Some(fstring) = self.fstrings.current_mut() {
fstring.try_end_format_spec(self.nesting);
if let Some(interpolated_string) = self.interpolated_strings.current_mut() {
interpolated_string.try_end_format_spec(self.nesting);
}
TokenKind::NonLogicalNewline
};
@ -610,7 +615,7 @@ impl<'src> Lexer<'src> {
/// Lex an identifier. Also used for keywords and string/bytes literals with a prefix.
fn lex_identifier(&mut self, first: char) -> TokenKind {
// Detect potential string like rb'' b'' f'' u'' r''
// Detect potential string like rb'' b'' f'' t'' u'' r''
let quote = match (first, self.cursor.first()) {
(_, quote @ ('\'' | '"')) => self.try_single_char_prefix(first).then(|| {
self.cursor.bump();
@ -627,8 +632,10 @@ impl<'src> Lexer<'src> {
};
if let Some(quote) = quote {
if self.current_flags.is_f_string() {
return self.lex_fstring_start(quote);
if self.current_flags.is_interpolated_string() {
if let Some(kind) = self.lex_interpolated_string_start(quote) {
return kind;
}
}
return self.lex_string(quote);
@ -711,6 +718,7 @@ impl<'src> Lexer<'src> {
fn try_single_char_prefix(&mut self, first: char) -> bool {
match first {
'f' | 'F' => self.current_flags |= TokenFlags::F_STRING,
't' | 'T' => self.current_flags |= TokenFlags::T_STRING,
'u' | 'U' => self.current_flags |= TokenFlags::UNICODE_STRING,
'b' | 'B' => self.current_flags |= TokenFlags::BYTE_STRING,
'r' => self.current_flags |= TokenFlags::RAW_STRING_LOWERCASE,
@ -730,6 +738,12 @@ impl<'src> Lexer<'src> {
['R', 'f' | 'F'] | ['f' | 'F', 'R'] => {
self.current_flags |= TokenFlags::F_STRING | TokenFlags::RAW_STRING_UPPERCASE;
}
['r', 't' | 'T'] | ['t' | 'T', 'r'] => {
self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_LOWERCASE;
}
['R', 't' | 'T'] | ['t' | 'T', 'R'] => {
self.current_flags |= TokenFlags::T_STRING | TokenFlags::RAW_STRING_UPPERCASE;
}
['r', 'b' | 'B'] | ['b' | 'B', 'r'] => {
self.current_flags |= TokenFlags::BYTE_STRING | TokenFlags::RAW_STRING_LOWERCASE;
}
@ -741,8 +755,8 @@ impl<'src> Lexer<'src> {
true
}
/// Lex a f-string start token.
fn lex_fstring_start(&mut self, quote: char) -> TokenKind {
/// Lex a f-string or t-string start token if positioned at the start of an f-string or t-string.
fn lex_interpolated_string_start(&mut self, quote: char) -> Option<TokenKind> {
#[cfg(debug_assertions)]
debug_assert_eq!(self.cursor.previous(), quote);
@ -754,27 +768,31 @@ impl<'src> Lexer<'src> {
self.current_flags |= TokenFlags::TRIPLE_QUOTED_STRING;
}
self.fstrings
.push(FStringContext::new(self.current_flags, self.nesting));
let ftcontext = InterpolatedStringContext::new(self.current_flags, self.nesting)?;
TokenKind::FStringStart
let kind = ftcontext.kind();
self.interpolated_strings.push(ftcontext);
Some(kind.start_token())
}
/// Lex a f-string middle or end token.
fn lex_fstring_middle_or_end(&mut self) -> Option<TokenKind> {
/// Lex an f-string or t-string middle or end token.
fn lex_interpolated_string_middle_or_end(&mut self) -> Option<TokenKind> {
// SAFETY: Safe because the function is only called when `self.fstrings` is not empty.
let fstring = self.fstrings.current().unwrap();
let interpolated_string = self.interpolated_strings.current().unwrap();
let string_kind = interpolated_string.kind();
// Check if we're at the end of the f-string.
if fstring.is_triple_quoted() {
let quote_char = fstring.quote_char();
if interpolated_string.is_triple_quoted() {
let quote_char = interpolated_string.quote_char();
if self.cursor.eat_char3(quote_char, quote_char, quote_char) {
self.current_flags = fstring.flags();
return Some(TokenKind::FStringEnd);
self.current_flags = interpolated_string.flags();
return Some(string_kind.end_token());
}
} else if self.cursor.eat_char(fstring.quote_char()) {
self.current_flags = fstring.flags();
return Some(TokenKind::FStringEnd);
} else if self.cursor.eat_char(interpolated_string.quote_char()) {
self.current_flags = interpolated_string.flags();
return Some(string_kind.end_token());
}
// We have to decode `{{` and `}}` into `{` and `}` respectively. As an
@ -786,7 +804,7 @@ impl<'src> Lexer<'src> {
let mut last_offset = self.offset();
// This isn't going to change for the duration of the loop.
let in_format_spec = fstring.is_in_format_spec(self.nesting);
let in_format_spec = interpolated_string.is_in_format_spec(self.nesting);
let mut in_named_unicode = false;
@ -796,18 +814,18 @@ impl<'src> Lexer<'src> {
// in the source code and the one returned by `self.cursor.first()` when
// we reach the end of the source code.
EOF_CHAR if self.cursor.is_eof() => {
let error = if fstring.is_triple_quoted() {
FStringErrorType::UnterminatedTripleQuotedString
let error = if interpolated_string.is_triple_quoted() {
InterpolatedStringErrorType::UnterminatedTripleQuotedString
} else {
FStringErrorType::UnterminatedString
InterpolatedStringErrorType::UnterminatedString
};
self.fstrings.pop();
self.interpolated_strings.pop();
return Some(self.push_error(LexicalError::new(
LexicalErrorType::FStringError(error),
LexicalErrorType::from_interpolated_string_error(error, string_kind),
self.token_range(),
)));
}
'\n' | '\r' if !fstring.is_triple_quoted() => {
'\n' | '\r' if !interpolated_string.is_triple_quoted() => {
// If we encounter a newline while we're in a format spec, then
// we stop here and let the lexer emit the newline token.
//
@ -815,9 +833,12 @@ impl<'src> Lexer<'src> {
if in_format_spec {
break;
}
self.fstrings.pop();
self.interpolated_strings.pop();
return Some(self.push_error(LexicalError::new(
LexicalErrorType::FStringError(FStringErrorType::UnterminatedString),
LexicalErrorType::from_interpolated_string_error(
InterpolatedStringErrorType::UnterminatedString,
string_kind,
),
self.token_range(),
)));
}
@ -827,7 +848,7 @@ impl<'src> Lexer<'src> {
// Don't consume `{` or `}` as we want them to be emitted as tokens.
// They will be handled in the next iteration.
continue;
} else if !fstring.is_raw_string() {
} else if !interpolated_string.is_raw_string() {
if self.cursor.eat_char2('N', '{') {
in_named_unicode = true;
continue;
@ -840,8 +861,8 @@ impl<'src> Lexer<'src> {
self.cursor.bump();
}
}
quote @ ('\'' | '"') if quote == fstring.quote_char() => {
if let Some(triple_quotes) = fstring.triple_quotes() {
quote @ ('\'' | '"') if quote == interpolated_string.quote_char() => {
if let Some(triple_quotes) = interpolated_string.triple_quotes() {
if self.cursor.rest().starts_with(triple_quotes) {
break;
}
@ -892,10 +913,10 @@ impl<'src> Lexer<'src> {
normalized
};
self.current_value = TokenValue::FStringMiddle(value.into_boxed_str());
self.current_value = TokenValue::InterpolatedStringMiddle(value.into_boxed_str());
self.current_flags = fstring.flags();
Some(TokenKind::FStringMiddle)
self.current_flags = interpolated_string.flags();
Some(string_kind.middle_token())
}
/// Lex a string literal.
@ -1403,9 +1424,9 @@ impl<'src> Lexer<'src> {
// i.e., it recovered from an unclosed parenthesis (`(`, `[`, or `{`).
self.nesting -= 1;
// The lexer can't be moved back for a triple-quoted f-string because the newlines are
// part of the f-string itself, so there is no newline token to be emitted.
if self.current_flags.is_triple_quoted_fstring() {
// The lexer can't be moved back for a triple-quoted f/t-string because the newlines are
// part of the f/t-string itself, so there is no newline token to be emitted.
if self.current_flags.is_triple_quoted_interpolated_string() {
return false;
}
@ -1478,7 +1499,7 @@ impl<'src> Lexer<'src> {
nesting: self.nesting,
indentations_checkpoint: self.indentations.checkpoint(),
pending_indentation: self.pending_indentation,
fstrings_checkpoint: self.fstrings.checkpoint(),
interpolated_strings_checkpoint: self.interpolated_strings.checkpoint(),
errors_position: self.errors.len(),
}
}
@ -1495,7 +1516,7 @@ impl<'src> Lexer<'src> {
nesting,
indentations_checkpoint,
pending_indentation,
fstrings_checkpoint,
interpolated_strings_checkpoint,
errors_position,
} = checkpoint;
@ -1512,7 +1533,8 @@ impl<'src> Lexer<'src> {
self.nesting = nesting;
self.indentations.rewind(indentations_checkpoint);
self.pending_indentation = pending_indentation;
self.fstrings.rewind(fstrings_checkpoint);
self.interpolated_strings
.rewind(interpolated_strings_checkpoint);
self.errors.truncate(errors_position);
}
@ -1531,7 +1553,7 @@ pub(crate) struct LexerCheckpoint {
nesting: u32,
indentations_checkpoint: IndentationsCheckpoint,
pending_indentation: Option<Indentation>,
fstrings_checkpoint: FStringsCheckpoint,
interpolated_strings_checkpoint: InterpolatedStringsCheckpoint,
errors_position: usize,
}
@ -2450,6 +2472,190 @@ f"{(lambda x:{x})}"
assert_snapshot!(lex_source(source));
}
#[test]
fn test_empty_tstrings() {
    // Empty t-strings in every quote style (single, double, triple),
    // interleaved with plain empty string literals; lexer output is
    // pinned by an insta snapshot.
    let source = r#"t"" "" t"" t'' '' t"""""" t''''''"#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_prefix() {
    // Prefix spellings that mark a t-string, including raw combinations.
    // NOTE(review): `t"" t""` and `rt"" rt""` repeat the same lowercase
    // spelling — the f-string analogue exercises uppercase variants
    // (`F""`, `rF""`); confirm whether `T""`/`rT""` were intended here.
    let source = r#"t"" t"" rt"" rt"" Rt"" Rt"" tr"" Tr"" tR"" TR"""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring() {
    // Real interpolations mixed with `{{`/`}}` brace escapes.
    let source = r#"t"normal {foo} {{another}} {bar} {{{three}}}""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_parentheses() {
    // Empty interpolations `{}` and every nesting of doubled-brace
    // escapes around them.
    let source = r#"t"{}" t"{{}}" t" {}" t"{{{}}}" t"{{{{}}}}" t" {} {{}} {{{}}} {{{{}}}} ""#;
    assert_snapshot!(lex_source(source));
}
/// Lexes a single-quoted t-string containing a backslash immediately
/// followed by the given end-of-line sequence (a line continuation).
fn tstring_single_quote_escape_eol(eol: &str) -> LexerOutput {
    let source = format!(r"t'text \{eol} more text'");
    lex_source(&source)
}

#[test]
fn test_tstring_single_quote_escape_unix_eol() {
    // `\` + `\n` continuation inside a single-quoted t-string.
    assert_snapshot!(tstring_single_quote_escape_eol(UNIX_EOL));
}

#[test]
fn test_tstring_single_quote_escape_mac_eol() {
    // `\` + `\r` continuation inside a single-quoted t-string.
    assert_snapshot!(tstring_single_quote_escape_eol(MAC_EOL));
}

#[test]
fn test_tstring_single_quote_escape_windows_eol() {
    // `\` + `\r\n` continuation inside a single-quoted t-string.
    assert_snapshot!(tstring_single_quote_escape_eol(WINDOWS_EOL));
}
#[test]
fn test_tstring_escape() {
    // Backslash escapes in a non-raw t-string: escaped braces, escaped
    // quotes inside a format spec, and a trailing line continuation.
    // (The second line of the raw string is deliberately unindented —
    // leading whitespace would become part of the source under test.)
    let source = r#"t"\{x:\"\{x}} \"\"\
end""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_escape_braces() {
    // How `\{` and `\\{` interact with interpolation-opening braces.
    let source = r"t'\{foo}' t'\\{foo}' t'\{{foo}}' t'\\{{foo}}'";
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_escape_raw() {
    // Same input as `test_tstring_escape` but with the `rt` prefix:
    // backslashes are literal in a raw t-string.
    let source = r#"rt"\{x:\"\{x}} \"\"\
end""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_named_unicode() {
    // `\N{...}` named-unicode escapes must not be lexed as an
    // interpolation; `\Nope` and a bare `\N` are lexed as-is.
    let source = r#"t"\N{BULLET} normal \Nope \N""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_named_unicode_raw() {
    // In a raw t-string `\N` is not an escape, so `{BULLET}` should be
    // lexed as an interpolation.
    let source = r#"rt"\N{BULLET} normal""#;
    assert_snapshot!(lex_source(source));
}
#[test]
fn test_tstring_with_named_expression() {
    // Walrus-like `:=` inside interpolations: only parenthesized /
    // bracketed forms are named expressions; `{x:=10}` is a format spec.
    let source = r#"t"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_with_format_spec() {
    // Format specs: empty, with `=` debug and conversion, and with
    // nested interpolations inside the spec itself.
    let source = r#"t"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}} {x:{{1}.pop()}}""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_with_multiline_format_spec() {
    // The last t-string is invalid syntactically but we should still lex it.
    // Note that the `b` is a `Name` token and not a `TStringMiddle` token.
    // (Raw-string lines are unindented on purpose: the newlines and column
    // positions are part of the source under test.)
    let source = r"t'''__{
x:d
}__'''
t'''__{
x:a
b
c
}__'''
t'__{
x:d
}__'
t'__{
x:a
b
}__'
";
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_conversion() {
    // `!s` / `!r` conversions, including after `=` debug syntax; `!r`
    // inside a format spec or escaped braces is not a conversion.
    let source = r#"t"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#;
    assert_snapshot!(lex_source(source));
}
#[test]
fn test_tstring_nested() {
    // t-strings nested inside t-string interpolations, multiple levels
    // deep and with mixed quote characters.
    let source = r#"t"foo {t"bar {x + t"{wow}"}"} baz" t'foo {t'bar'} some {t"another"}'"#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_expression_multiline() {
    // An interpolation may span multiple lines even in a single-quoted
    // t-string, because the newlines are inside the expression part.
    let source = r#"t"first {
x
*
y
} second""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_multiline() {
    // Triple-quoted t-strings spanning lines, plus a triple-quoted
    // t-string nested inside a single-quoted one's interpolation.
    let source = r#"t"""
hello
world
""" t'''
world
hello
''' t"some {t"""multiline
allowed {x}"""} string""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_comments() {
    // `#` starts a comment only inside an interpolation, never in the
    // literal middle portion of the t-string.
    let source = r#"t"""
# not a comment { # comment {
x
} # not a comment
""""#;
    assert_snapshot!(lex_source(source));
}
#[test]
fn test_tstring_with_ipy_escape_command() {
    // An IPython escape command (`!pwd`) inside an interpolation; the
    // lexer should not treat it specially here (Module mode).
    let source = r#"t"foo {!pwd} bar""#;
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_with_lambda_expression() {
    // A lambda's `:` must not be confused with a format-spec `:`;
    // unparenthesized lambda is invalid but should still lex.
    let source = r#"
t"{lambda x:{x}}"
t"{(lambda x:{x})}"
"#
    .trim();
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_tstring_with_nul_char() {
    // A `\0` escape in the middle portion must not terminate lexing.
    let source = r"t'\0'";
    assert_snapshot!(lex_source(source));
}

#[test]
fn test_nested_t_and_fstring() {
    // f-strings and t-strings nested inside each other's interpolations.
    let source = r#"t"foo {f"bar {x + t"{wow}"}"} baz" f'foo {t'bar'!r} some {f"another"}'"#;
    assert_snapshot!(lex_source(source));
}
#[test]
fn test_match_softkeyword_in_notebook() {
let source = r"match foo:
@ -2458,7 +2664,7 @@ f"{(lambda x:{x})}"
assert_snapshot!(lex_jupyter_source(source));
}
fn lex_fstring_error(source: &str) -> FStringErrorType {
fn lex_fstring_error(source: &str) -> InterpolatedStringErrorType {
let output = lex(source, Mode::Module, TextSize::default());
match output
.errors
@ -2474,7 +2680,9 @@ f"{(lambda x:{x})}"
#[test]
fn test_fstring_error() {
use FStringErrorType::{SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString};
use InterpolatedStringErrorType::{
SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString,
};
assert_eq!(lex_fstring_error("f'}'"), SingleRbrace);
assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace);
@ -2499,4 +2707,48 @@ f"{(lambda x:{x})}"
UnterminatedTripleQuotedString
);
}
/// Lexes `source` in module mode and returns the payload of the first
/// lexical error, which must be a t-string error; panics otherwise.
fn lex_tstring_error(source: &str) -> InterpolatedStringErrorType {
    let output = lex(source, Mode::Module, TextSize::default());
    let first_error = output
        .errors
        .into_iter()
        .next()
        .expect("lexer should give at least one error");
    match first_error.into_error() {
        LexicalErrorType::TStringError(error) => error,
        err => panic!("Expected TStringError: {err:?}"),
    }
}
#[test]
fn test_tstring_error() {
    // Mirrors `test_fstring_error`: the same error cases must be reported
    // for t-strings, just tagged as `TStringError` by `lex_tstring_error`.
    use InterpolatedStringErrorType::{
        SingleRbrace, UnterminatedString, UnterminatedTripleQuotedString,
    };

    // A bare `}` outside an interpolation is an error, even after doubled
    // braces, a `\u007b` escape, or a closed format spec.
    assert_eq!(lex_tstring_error("t'}'"), SingleRbrace);
    assert_eq!(lex_tstring_error("t'{{}'"), SingleRbrace);
    assert_eq!(lex_tstring_error("t'{{}}}'"), SingleRbrace);
    assert_eq!(lex_tstring_error("t'foo}'"), SingleRbrace);
    assert_eq!(lex_tstring_error(r"t'\u007b}'"), SingleRbrace);
    assert_eq!(lex_tstring_error("t'{a:b}}'"), SingleRbrace);
    assert_eq!(lex_tstring_error("t'{3:}}>10}'"), SingleRbrace);
    assert_eq!(lex_tstring_error(r"t'\{foo}\}'"), SingleRbrace);
    // Unterminated strings: single- and triple-quoted forms are reported
    // as distinct error variants.
    assert_eq!(lex_tstring_error(r#"t""#), UnterminatedString);
    assert_eq!(lex_tstring_error(r"t'"), UnterminatedString);
    assert_eq!(lex_tstring_error(r#"t""""#), UnterminatedTripleQuotedString);
    assert_eq!(lex_tstring_error(r"t'''"), UnterminatedTripleQuotedString);
    assert_eq!(
        lex_tstring_error(r#"t"""""#),
        UnterminatedTripleQuotedString
    );
    assert_eq!(
        lex_tstring_error(r#"t""""""#),
        UnterminatedTripleQuotedString
    );
}
}