mirror of
https://github.com/astral-sh/ruff.git
synced 2025-09-29 05:15:12 +00:00

This PR modifies our AST so that nodes for string literals, bytes literals and f-strings all retain the following information:

- The quoting style used (double or single quotes)
- Whether the string is triple-quoted or not
- Whether the string is raw or not

This PR is a followup to #10256. Like with that PR, this PR does not, in itself, fix any bugs. However, it means that we will have the necessary information to preserve quoting style and rawness of strings in the `ExprGenerator` in a followup PR, which will allow us to provide a fix for https://github.com/astral-sh/ruff/issues/7799.

The information is recorded on the AST nodes using a bitflag field on each node, similarly to how we recorded the information on `Tok::String`, `Tok::FStringStart` and `Tok::FStringMiddle` tokens in #10298. Rather than reusing the bitflag I used for the tokens, however, I decided to create a custom bitflag for each AST node. Using different bitflags for each node allows us to make invalid states unrepresentable: it is valid to set a `u` prefix on a string literal, but not on a bytes literal or an f-string. It also allows us to have better debug representations for each AST node modified in this PR.
243 lines
5.6 KiB
Rust
243 lines
5.6 KiB
Rust
use aho_corasick::{AhoCorasick, AhoCorasickKind, Anchored, Input, MatchKind, StartKind};
|
|
use once_cell::sync::Lazy;
|
|
|
|
use ruff_text_size::{TextLen, TextRange};
|
|
|
|
#[derive(Debug, Default, Copy, Clone, Hash, PartialEq, Eq, is_macro::Is)]
|
|
pub enum QuoteStyle {
|
|
/// E.g. '
|
|
Single,
|
|
/// E.g. "
|
|
#[default]
|
|
Double,
|
|
}
|
|
|
|
impl QuoteStyle {
|
|
pub const fn as_char(self) -> char {
|
|
match self {
|
|
Self::Single => '\'',
|
|
Self::Double => '"',
|
|
}
|
|
}
|
|
|
|
#[must_use]
|
|
pub const fn opposite(self) -> Self {
|
|
match self {
|
|
Self::Single => Self::Double,
|
|
Self::Double => Self::Single,
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Every way a triple-quoted (non-bytes) string literal can open: an optional `r`, `u`, `f`,
/// or `fr` prefix (`ur` and `uf` are invalid) in any order and any casing, followed by
/// either `"""` or `'''`.
///
/// See: <https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals>
#[rustfmt::skip]
const TRIPLE_QUOTE_STR_PREFIXES: &[&str] = &[
    // Raw f-strings, double quotes.
    "FR\"\"\"", "Fr\"\"\"", "fR\"\"\"", "fr\"\"\"",
    "RF\"\"\"", "Rf\"\"\"", "rF\"\"\"", "rf\"\"\"",
    // Raw f-strings, single quotes.
    "FR'''", "Fr'''", "fR'''", "fr'''",
    "RF'''", "Rf'''", "rF'''", "rf'''",
    // Raw strings.
    "R\"\"\"", "r\"\"\"", "R'''", "r'''",
    // F-strings.
    "F\"\"\"", "f\"\"\"", "F'''", "f'''",
    // Explicit unicode strings.
    "U\"\"\"", "u\"\"\"", "U'''", "u'''",
    // No prefix at all.
    "\"\"\"", "'''",
];
|
|
|
|
/// Every way a single-quoted (non-bytes, non-triple-quoted) string literal can open: an
/// optional `r`, `u`, `f`, or `fr` prefix in any order and any casing, followed by either
/// `"` or `'`.
#[rustfmt::skip]
const SINGLE_QUOTE_STR_PREFIXES: &[&str] = &[
    // Raw f-strings, double quote.
    "FR\"", "Fr\"", "fR\"", "fr\"",
    "RF\"", "Rf\"", "rF\"", "rf\"",
    // Raw f-strings, single quote.
    "FR'", "Fr'", "fR'", "fr'",
    "RF'", "Rf'", "rF'", "rf'",
    // Raw strings.
    "R\"", "r\"", "R'", "r'",
    // F-strings.
    "F\"", "f\"", "F'", "f'",
    // Explicit unicode strings.
    "U\"", "u\"", "U'", "u'",
    // No prefix at all.
    "\"", "'",
];
|
|
|
|
/// Every way a triple-quoted bytes literal can open: a `b` or `rb` prefix in any order and
/// any casing, followed by either `"""` or `'''`.
///
/// See: <https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals>
#[rustfmt::skip]
pub const TRIPLE_QUOTE_BYTE_PREFIXES: &[&str] = &[
    // Raw bytes, double quotes.
    "BR\"\"\"", "Br\"\"\"", "bR\"\"\"", "br\"\"\"",
    "RB\"\"\"", "Rb\"\"\"", "rB\"\"\"", "rb\"\"\"",
    // Raw bytes, single quotes.
    "BR'''", "Br'''", "bR'''", "br'''",
    "RB'''", "Rb'''", "rB'''", "rb'''",
    // Plain bytes.
    "B\"\"\"", "b\"\"\"", "B'''", "b'''",
];
|
|
|
|
/// Every way a single-quoted (non-triple-quoted) bytes literal can open: a `b` or `rb`
/// prefix in any order and any casing, followed by either `"` or `'`.
#[rustfmt::skip]
pub const SINGLE_QUOTE_BYTE_PREFIXES: &[&str] = &[
    // Raw bytes, double quote.
    "BR\"", "Br\"", "bR\"", "br\"",
    "RB\"", "Rb\"", "rB\"", "rb\"",
    // Raw bytes, single quote.
    "BR'", "Br'", "bR'", "br'",
    "RB'", "Rb'", "rB'", "rb'",
    // Plain bytes.
    "B\"", "b\"", "B'", "b'",
];
|
|
|
|
/// Strip the leading and trailing quotes from a string.
|
|
/// Assumes that the string is a valid string literal, but does not verify that the string
|
|
/// is a "simple" string literal (i.e., that it does not contain any implicit concatenations).
|
|
pub fn raw_contents(contents: &str) -> Option<&str> {
|
|
let range = raw_contents_range(contents)?;
|
|
|
|
Some(&contents[range])
|
|
}
|
|
|
|
pub fn raw_contents_range(contents: &str) -> Option<TextRange> {
|
|
let leading_quote_str = leading_quote(contents)?;
|
|
let trailing_quote_str = trailing_quote(contents)?;
|
|
|
|
Some(TextRange::new(
|
|
leading_quote_str.text_len(),
|
|
contents.text_len() - trailing_quote_str.text_len(),
|
|
))
|
|
}
|
|
|
|
/// An [`AhoCorasick`] matcher for string and byte literal prefixes.
|
|
static PREFIX_MATCHER: Lazy<AhoCorasick> = Lazy::new(|| {
|
|
AhoCorasick::builder()
|
|
.start_kind(StartKind::Anchored)
|
|
.match_kind(MatchKind::LeftmostLongest)
|
|
.kind(Some(AhoCorasickKind::DFA))
|
|
.build(
|
|
TRIPLE_QUOTE_STR_PREFIXES
|
|
.iter()
|
|
.chain(TRIPLE_QUOTE_BYTE_PREFIXES)
|
|
.chain(SINGLE_QUOTE_STR_PREFIXES)
|
|
.chain(SINGLE_QUOTE_BYTE_PREFIXES),
|
|
)
|
|
.unwrap()
|
|
});
|
|
|
|
/// Return the leading quote for a string or byte literal (e.g., `"""`).
|
|
pub fn leading_quote(content: &str) -> Option<&str> {
|
|
let mat = PREFIX_MATCHER.find(Input::new(content).anchored(Anchored::Yes))?;
|
|
Some(&content[mat.start()..mat.end()])
|
|
}
|
|
|
|
/// Return the closing quote of a string or bytes literal (e.g., `"""`).
///
/// Returns `None` when `content` does not end with a quote character.
pub fn trailing_quote(content: &str) -> Option<&str> {
    // Triple quotes come first so that they win over the single-character quote they
    // end with.
    const CLOSING_QUOTES: [&str; 4] = ["'''", "\"\"\"", "'", "\""];

    CLOSING_QUOTES
        .iter()
        .copied()
        .find(|closing| content.ends_with(*closing))
}
|
|
|
|
/// Return `true` if the string is a triple-quote string or byte prefix.
|
|
pub fn is_triple_quote(content: &str) -> bool {
|
|
TRIPLE_QUOTE_STR_PREFIXES.contains(&content) || TRIPLE_QUOTE_BYTE_PREFIXES.contains(&content)
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::{
        SINGLE_QUOTE_BYTE_PREFIXES, SINGLE_QUOTE_STR_PREFIXES, TRIPLE_QUOTE_BYTE_PREFIXES,
        TRIPLE_QUOTE_STR_PREFIXES,
    };

    /// Walks the prefix tables in matcher order (triple-quoted first) and checks that no
    /// prefix starts with any prefix listed before it.
    #[test]
    fn prefix_uniqueness() {
        let prefixes: Vec<_> = TRIPLE_QUOTE_STR_PREFIXES
            .iter()
            .chain(TRIPLE_QUOTE_BYTE_PREFIXES)
            .chain(SINGLE_QUOTE_STR_PREFIXES)
            .chain(SINGLE_QUOTE_BYTE_PREFIXES)
            .collect();

        for (position, prefix_i) in prefixes.iter().enumerate() {
            // Only compare against the prefixes that appear earlier in the list.
            for prefix_j in &prefixes[..position] {
                assert!(
                    !prefix_i.starts_with(*prefix_j),
                    "Prefixes are not unique: {prefix_i} starts with {prefix_j}",
                );
            }
        }
    }
}
|