Basic string formatting

## Summary This PR implements formatting for non-f-string strings that do not use implicit concatenation. Docstring formatting is out of the scope of this PR. ## Test Plan I added a few tests for simple string literals. ## Performance Ouch. This is hitting performance somewhat hard. This is probably because we now iterate each string a couple of times: 1. To detect if it is an implicit string continuation 2. To detect if the string contains any newlines 3. To detect the preferred quote 4. To normalize the string Edit: I integrated the detection of newlines into the preferred quote detection so that we only iterate the string three times. We can probably do better by merging the implicit string continuation with the quote detection and newline detection by iterating till the end of the string part and returning the offset. We then use our simple tokenizer to skip over any comments or whitespace until we find the first non-trivia token. From there we keep doing this in a loop until we reach the end of the string. I'll leave this improvement for later.
2025-09-14 06:15:13 +00:00 · 2023-06-23 09:46:05 +02:00 · 2023-06-23 09:46:05 +02:00 · c52aa8f065
commit c52aa8f065
parent 3e12bdff45
46 changed files with 1278 additions and 1086 deletions
--- a/crates/ruff_python_formatter/src/expression/expr_constant.rs
+++ b/crates/ruff_python_formatter/src/expression/expr_constant.rs
@ -2,6 +2,7 @@ use crate::comments::Comments;
 use crate::expression::parentheses::{
    default_expression_needs_parentheses, NeedsParentheses, Parentheses, Parenthesize,
 };
+use crate::expression::string::FormatString;
 use crate::prelude::*;
 use crate::{not_yet_implemented_custom_text, verbatim_text, FormatNodeRule};
 use ruff_formatter::write;
@ -28,9 +29,7 @@ impl FormatNodeRule<ExprConstant> for FormatExprConstant {
            Constant::Int(_) | Constant::Float(_) | Constant::Complex { .. } => {
                write!(f, [verbatim_text(item)])
            }
-            Constant::Str(_) => {
-                not_yet_implemented_custom_text(r#""NOT_YET_IMPLEMENTED_STRING""#).fmt(f)
-            }
+            Constant::Str(_) => FormatString::new(item).fmt(f),
            Constant::Bytes(_) => {
                not_yet_implemented_custom_text(r#"b"NOT_YET_IMPLEMENTED_BYTE_STRING""#).fmt(f)
            }
--- a/crates/ruff_python_formatter/src/expression/mod.rs
+++ b/crates/ruff_python_formatter/src/expression/mod.rs
@ -37,6 +37,7 @@ pub(crate) mod expr_unary_op;
 pub(crate) mod expr_yield;
 pub(crate) mod expr_yield_from;
 pub(crate) mod parentheses;
+mod string;

 #[derive(Default)]
 pub struct FormatExpr {
--- a/crates/ruff_python_formatter/src/expression/string.rs
+++ b/crates/ruff_python_formatter/src/expression/string.rs
@ -0,0 +1,318 @@
+use crate::prelude::*;
+use crate::{not_yet_implemented_custom_text, QuoteStyle};
+use bitflags::bitflags;
+use ruff_formatter::{write, FormatError};
+use ruff_python_ast::str::is_implicit_concatenation;
+use ruff_text_size::{TextLen, TextRange, TextSize};
+use rustpython_parser::ast::{ExprConstant, Ranged};
+use std::borrow::Cow;
+
+/// Formats a string constant expression, identified by its range in the source text.
+pub(super) struct FormatString {
+    string_range: TextRange,
+}
+
+impl FormatString {
+    /// Creates the formatter for `constant`.
+    ///
+    /// `constant` is expected to hold a string value; this is only verified by a debug assertion.
+    pub(super) fn new(constant: &ExprConstant) -> Self {
+        debug_assert!(constant.value.is_str());
+
+        let string_range = constant.range();
+        Self { string_range }
+    }
+}
+
+impl Format<PyFormatContext<'_>> for FormatString {
+    /// Formats the string as a single part, or emits a placeholder when the source text
+    /// is an implicit concatenation (not supported yet).
+    fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
+        let source = f.context().locator().slice(self.string_range);
+
+        // The common case: a single string literal that can be formatted as one part.
+        if !is_implicit_concatenation(source) {
+            return FormatStringPart::new(self.string_range).fmt(f);
+        }
+
+        not_yet_implemented_custom_text(r#""NOT_YET_IMPLEMENTED" "IMPLICIT_CONCATENATION""#).fmt(f)
+    }
+}
+
+/// Formats one string literal (prefix, quotes and content) located at `part_range`.
+struct FormatStringPart {
+    part_range: TextRange,
+}
+
+impl FormatStringPart {
+    /// Creates the formatter for the string literal covering `range` in the source text.
+    const fn new(range: TextRange) -> Self {
+        FormatStringPart { part_range: range }
+    }
+}
+
+impl Format<PyFormatContext<'_>> for FormatStringPart {
+    /// Writes the string literal with its normalized prefix and the preferred quotes.
+    ///
+    /// Regular strings get their quote escapes normalized for the chosen quote style.
+    /// Raw strings (`r`/`R` prefix) are never rewritten: a backslash is a literal character
+    /// in a raw string, so adding or removing one would change the string's value. Their quotes
+    /// are only changed when the content contains no occurrence of the new quote character.
+    ///
+    /// Returns `FormatError::SyntaxError` if the text after the prefix does not start with a quote.
+    fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
+        let string_content = f.context().locator().slice(self.part_range);
+
+        let prefix = StringPrefix::parse(string_content);
+        let after_prefix = &string_content[usize::from(prefix.text_len())..];
+
+        let quotes = StringQuotes::parse(after_prefix).ok_or(FormatError::SyntaxError)?;
+        // Range of the content between the quotes, relative to the start of this part.
+        let relative_raw_content_range = TextRange::new(
+            prefix.text_len() + quotes.text_len(),
+            string_content.text_len() - quotes.text_len(),
+        );
+        // The same range expressed as absolute source offsets.
+        let raw_content_range = relative_raw_content_range + self.part_range.start();
+
+        let raw_content = &string_content[relative_raw_content_range];
+        let is_raw = prefix.intersects(StringPrefix::RAW | StringPrefix::RAW_UPPER);
+
+        let (candidate_quotes, contains_newlines) = preferred_quotes(raw_content, quotes);
+        // Escapes can't be adjusted in raw strings, so switching quotes is only safe when the
+        // content doesn't contain the candidate quote character at all.
+        let chosen_quotes = if is_raw && raw_content.contains(candidate_quotes.style.as_char()) {
+            quotes
+        } else {
+            candidate_quotes
+        };
+
+        write!(f, [prefix, chosen_quotes])?;
+
+        if is_raw {
+            // Raw string content is always emitted verbatim.
+            source_text_slice(raw_content_range, contains_newlines).fmt(f)?;
+        } else {
+            match normalize_quotes(raw_content, chosen_quotes) {
+                // Already normalized: reuse the source text without copying.
+                Cow::Borrowed(_) => {
+                    source_text_slice(raw_content_range, contains_newlines).fmt(f)?;
+                }
+                Cow::Owned(normalized) => {
+                    dynamic_text(&normalized, Some(raw_content_range.start())).fmt(f)?;
+                }
+            }
+        }
+
+        chosen_quotes.fmt(f)
+    }
+}
+
+bitflags! {
+    /// The set of prefix characters in front of a string literal's opening quote.
+    ///
+    /// The lower and upper case raw prefixes are tracked as separate flags so that the
+    /// original casing can be retained when formatting.
+    #[derive(Copy, Clone, Debug)]
+    struct StringPrefix: u8 {
+        /// `u"test"` or `U"test"`
+        const UNICODE   = 0b0000_0001;
+        /// `r"test"`
+        const RAW       = 0b0000_0010;
+        /// `R"test"`
+        const RAW_UPPER = 0b0000_0100;
+        /// `b"test"` or `B"test"`
+        const BYTE      = 0b0000_1000;
+        /// `f"test"` or `F"test"`
+        const F_STRING  = 0b0001_0000;
+    }
+}
+
+impl StringPrefix {
+    /// Collects the prefix flags from the characters preceding the opening quote of `input`.
+    ///
+    /// Scanning stops at the first `'` or `"`. Panics if any other non-prefix character is found
+    /// before a quote.
+    fn parse(input: &str) -> StringPrefix {
+        let mut flags = StringPrefix::empty();
+
+        for character in input.chars() {
+            // The opening quote terminates the prefix.
+            if matches!(character, '\'' | '"') {
+                break;
+            }
+
+            flags |= match character {
+                'r' => StringPrefix::RAW,
+                'R' => StringPrefix::RAW_UPPER,
+                'b' | 'B' => StringPrefix::BYTE,
+                'f' | 'F' => StringPrefix::F_STRING,
+                'u' | 'U' => StringPrefix::UNICODE,
+                c => {
+                    unreachable!(
+                        "Unexpected character '{c}' terminating the prefix of a string literal"
+                    );
+                }
+            };
+        }
+
+        flags
+    }
+
+    /// Length of the prefix in the source text, computed as the number of set flags.
+    const fn text_len(self) -> TextSize {
+        let prefix_chars = self.bits().count_ones();
+        TextSize::new(prefix_chars)
+    }
+}
+
+impl Format<PyFormatContext<'_>> for StringPrefix {
+    /// Writes the normalized prefix: the raw marker first (original casing), then `b`, then `f`.
+    fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
+        // Retain the casing for the raw prefix:
+        // https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#r-strings-and-r-strings
+        let raw_marker = if self.contains(StringPrefix::RAW) {
+            Some("r")
+        } else if self.contains(StringPrefix::RAW_UPPER) {
+            Some("R")
+        } else {
+            None
+        };
+
+        if let Some(marker) = raw_marker {
+            text(marker).fmt(f)?;
+        }
+
+        // The byte and f-string prefixes are always written in lower case.
+        for (flag, marker) in [(StringPrefix::BYTE, "b"), (StringPrefix::F_STRING, "f")] {
+            if self.contains(flag) {
+                text(marker).fmt(f)?;
+            }
+        }
+
+        // The unicode prefix `u` is intentionally dropped because it is meaningless in Python 3+.
+
+        Ok(())
+    }
+}
+
+/// Detects the preferred quotes for `input`, the string content between the quotes.
+/// * single quoted strings: The preferred quote style is the one that requires fewer escape sequences.
+/// * triple quoted strings: Use double quotes unless the string contains a sequence of `"""`
+///   or ends with a `"`, because either would terminate a `"""`-quoted string early.
+///
+/// Also reports whether `input` contains any `\n` or `\r` character. The returned quotes keep
+/// the `triple` setting of `quotes`; only the style may change.
+fn preferred_quotes(input: &str, quotes: StringQuotes) -> (StringQuotes, ContainsNewlines) {
+    let mut contains_newlines = ContainsNewlines::No;
+
+    let preferred_style = if quotes.triple {
+        let mut use_single_quotes = false;
+        let mut chars = input.chars().peekable();
+
+        while let Some(c) = chars.next() {
+            match c {
+                '\n' | '\r' => contains_newlines = ContainsNewlines::Yes,
+                '\\' => {
+                    // Skip escaped quotes and backslashes so that they aren't counted as quotes
+                    // or mistaken for the start of another escape.
+                    if matches!(chars.peek(), Some('"' | '\\')) {
+                        chars.next();
+                    }
+                }
+                '"' => {
+                    match chars.peek().copied() {
+                        Some('"') => {
+                            // `""`
+                            chars.next();
+
+                            match chars.peek().copied() {
+                                Some('"') => {
+                                    // `"""`
+                                    chars.next();
+                                    use_single_quotes = true;
+                                }
+                                None => {
+                                    // The string ends with `""` (e.g. `'''a""'''`). Both quotes
+                                    // have been consumed, so the trailing quote must be detected
+                                    // here; the next loop iteration won't see it.
+                                    use_single_quotes = true;
+                                }
+                                Some(_) => {
+                                    // Two quotes followed by another character, this is ok
+                                }
+                            }
+                        }
+                        Some(_) => {
+                            // Single quote, this is ok
+                        }
+                        None => {
+                            // Trailing quote at the end of the string
+                            use_single_quotes = true;
+                        }
+                    }
+                }
+                _ => continue,
+            }
+        }
+
+        if use_single_quotes {
+            QuoteStyle::Single
+        } else {
+            QuoteStyle::Double
+        }
+    } else {
+        let mut single_quotes = 0u32;
+        let mut double_quotes = 0u32;
+
+        for c in input.chars() {
+            match c {
+                '\'' => {
+                    single_quotes += 1;
+                }
+
+                '"' => {
+                    double_quotes += 1;
+                }
+
+                '\n' | '\r' => {
+                    contains_newlines = ContainsNewlines::Yes;
+                }
+
+                _ => continue,
+            }
+        }
+
+        // Prefer double quotes on a tie.
+        if double_quotes > single_quotes {
+            QuoteStyle::Single
+        } else {
+            QuoteStyle::Double
+        }
+    };
+
+    (
+        StringQuotes {
+            triple: quotes.triple,
+            style: preferred_style,
+        },
+        contains_newlines,
+    )
+}
+
+/// The quotes surrounding a string literal: the quote character style and whether it is tripled.
+#[derive(Copy, Clone, Debug)]
+struct StringQuotes {
+    triple: bool,
+    style: QuoteStyle,
+}
+
+impl StringQuotes {
+    /// Parses the opening quotes at the start of `input`.
+    ///
+    /// Returns `None` if `input` is empty or its first character can't be converted to a
+    /// `QuoteStyle`.
+    fn parse(input: &str) -> Option<StringQuotes> {
+        let mut rest = input.chars();
+
+        let opening = rest.next()?;
+        let style = QuoteStyle::try_from(opening).ok()?;
+
+        // Triple quoted if the next two characters repeat the opening quote.
+        let triple = matches!(
+            (rest.next(), rest.next()),
+            (Some(second), Some(third)) if second == opening && third == opening
+        );
+
+        Some(StringQuotes { triple, style })
+    }
+
+    /// Length of the quotes on one side of the string: 3 for triple quotes, 1 otherwise.
+    const fn text_len(self) -> TextSize {
+        TextSize::new(if self.triple { 3 } else { 1 })
+    }
+}
+
+impl Format<PyFormatContext<'_>> for StringQuotes {
+    /// Writes the quote characters for this style, tripled when `triple` is set.
+    fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
+        let quote_text = match self.style {
+            QuoteStyle::Single => {
+                if self.triple {
+                    "'''"
+                } else {
+                    "'"
+                }
+            }
+            QuoteStyle::Double => {
+                if self.triple {
+                    "\"\"\""
+                } else {
+                    "\""
+                }
+            }
+        };
+
+        text(quote_text).fmt(f)
+    }
+}
+
+/// Rewrites the quote escapes of `input` so that it can be surrounded by `quotes`:
+/// occurrences of the preferred quote get escaped and escapes in front of the opposite quote
+/// are dropped.
+///
+/// Triple quoted strings are returned unchanged. The result only allocates if `input` had to
+/// be changed.
+fn normalize_quotes(input: &str, quotes: StringQuotes) -> Cow<str> {
+    if quotes.triple {
+        return Cow::Borrowed(input);
+    }
+
+    let needs_escape = quotes.style.as_char();
+    let needs_no_escape = quotes.style.opposite().as_char();
+
+    // Holds the rewritten string; stays empty for as long as `input` needs no change.
+    let mut rewritten = String::new();
+    // Offset into `input` up to which the text has been transferred to `rewritten`.
+    // Every rewrite moves it past `0`, so `0` means that nothing was changed.
+    let mut copied_up_to = 0;
+
+    let mut cursor = input.char_indices();
+
+    while let Some((offset, current)) = cursor.next() {
+        match current {
+            '\\' => {
+                // Always consume the escaped character so that it isn't inspected on its own.
+                let escapes_opposite_quote =
+                    matches!(cursor.next(), Some((_, escaped)) if escaped == needs_no_escape);
+
+                if escapes_opposite_quote {
+                    // Drop the backslash: copy everything before it and resume at the quote.
+                    rewritten.push_str(&input[copied_up_to..offset]);
+                    copied_up_to = offset + '\\'.len_utf8();
+                }
+            }
+            quote if quote == needs_escape => {
+                // An unescaped preferred quote: copy the text before it and add the escape.
+                rewritten.push_str(&input[copied_up_to..offset]);
+                rewritten.push('\\');
+                rewritten.push(quote);
+                copied_up_to = offset + needs_escape.len_utf8();
+            }
+            _ => {}
+        }
+    }
+
+    if copied_up_to == 0 {
+        Cow::Borrowed(input)
+    } else {
+        rewritten.push_str(&input[copied_up_to..]);
+        Cow::Owned(rewritten)
+    }
+}