Basic string formatting

<!--
Thank you for contributing to Ruff! To help us out with reviewing, please consider the following:

- Does this pull request include a summary of the change? (See below.)
- Does this pull request include a descriptive title?
- Does this pull request include references to any relevant issues?
-->

## Summary

This PR implements formatting for non-f-string strings that do not use implicit concatenation.

Docstring formatting is out of the scope of this PR.

<!-- What's the purpose of the change? What does it do, and why? -->

## Test Plan

I added a few tests for simple string literals. 

## Performance

Ouch. This is hitting performance somewhat hard. This is probably because we now iterate each string a couple of times:

1. To detect if it is an implicit string continuation
2. To detect if the string contains any new lines
3. To detect the preferred quote
4. To normalize the string

Edit: I integrated the detection of newlines into the preferred quote detection so that we only iterate the string three times.
We can probably do better by merging the implicit string continuation check with the quote detection and newline detection: iterate until the end of the string part and return the offset, then use our simple tokenizer to skip over any comments or whitespace until we find the first non-trivia token. From there, we keep doing this in a loop until we reach the end of the string. I'll leave this improvement for later.
This commit is contained in:
Micha Reiser 2023-06-23 09:46:05 +02:00 committed by GitHub
parent 3e12bdff45
commit c52aa8f065
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
46 changed files with 1278 additions and 1086 deletions

View file

@ -2,6 +2,7 @@ use crate::comments::Comments;
use crate::expression::parentheses::{
default_expression_needs_parentheses, NeedsParentheses, Parentheses, Parenthesize,
};
use crate::expression::string::FormatString;
use crate::prelude::*;
use crate::{not_yet_implemented_custom_text, verbatim_text, FormatNodeRule};
use ruff_formatter::write;
@ -28,9 +29,7 @@ impl FormatNodeRule<ExprConstant> for FormatExprConstant {
Constant::Int(_) | Constant::Float(_) | Constant::Complex { .. } => {
write!(f, [verbatim_text(item)])
}
Constant::Str(_) => {
not_yet_implemented_custom_text(r#""NOT_YET_IMPLEMENTED_STRING""#).fmt(f)
}
Constant::Str(_) => FormatString::new(item).fmt(f),
Constant::Bytes(_) => {
not_yet_implemented_custom_text(r#"b"NOT_YET_IMPLEMENTED_BYTE_STRING""#).fmt(f)
}

View file

@ -37,6 +37,7 @@ pub(crate) mod expr_unary_op;
pub(crate) mod expr_yield;
pub(crate) mod expr_yield_from;
pub(crate) mod parentheses;
mod string;
#[derive(Default)]
pub struct FormatExpr {

View file

@ -0,0 +1,318 @@
use crate::prelude::*;
use crate::{not_yet_implemented_custom_text, QuoteStyle};
use bitflags::bitflags;
use ruff_formatter::{write, FormatError};
use ruff_python_ast::str::is_implicit_concatenation;
use ruff_text_size::{TextLen, TextRange, TextSize};
use rustpython_parser::ast::{ExprConstant, Ranged};
use std::borrow::Cow;
/// Formats a string constant expression, located through its range in the source text.
pub(super) struct FormatString {
    /// Range of the whole string expression in the source.
    string_range: TextRange,
}

impl FormatString {
    /// Creates the formatter for `constant`, remembering only its source range.
    ///
    /// Debug builds assert that `constant` actually holds a string value.
    pub(super) fn new(constant: &ExprConstant) -> Self {
        debug_assert!(constant.value.is_str());

        let string_range = constant.range();
        Self { string_range }
    }
}
impl Format<PyFormatContext<'_>> for FormatString {
    /// Writes the string expression.
    ///
    /// A string without implicit concatenation is delegated to [`FormatStringPart`];
    /// an implicitly concatenated string is emitted as a fixed placeholder text.
    fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
        let source = f.context().locator().slice(self.string_range);

        if !is_implicit_concatenation(source) {
            return FormatStringPart::new(self.string_range).fmt(f);
        }

        not_yet_implemented_custom_text(r#""NOT_YET_IMPLEMENTED" "IMPLICIT_CONCATENATION""#)
            .fmt(f)
    }
}
/// Formats one string literal part (prefix, quotes and content) found at `part_range`.
struct FormatStringPart {
    /// Range of the literal, including its prefix and quotes, in the source.
    part_range: TextRange,
}

impl FormatStringPart {
    /// Creates a formatter for the string part covering `range`.
    const fn new(range: TextRange) -> Self {
        Self { part_range: range }
    }
}
impl Format<PyFormatContext<'_>> for FormatStringPart {
    /// Writes the string part as prefix, opening quotes, content and closing quotes.
    ///
    /// Regular strings get the preferred quote style and have their quote escapes
    /// normalized accordingly. Raw strings are never normalized because a backslash in
    /// a raw literal is part of the string's value; removing or adding one would change
    /// the program. A raw string that contains any quote character additionally keeps
    /// its original quotes so that the verbatim content stays valid.
    ///
    /// # Errors
    /// Returns [`FormatError::SyntaxError`] if no quote character follows the prefix.
    fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
        let string_content = f.context().locator().slice(self.part_range);

        let prefix = StringPrefix::parse(string_content);
        let after_prefix = &string_content[usize::from(prefix.text_len())..];

        let quotes = StringQuotes::parse(after_prefix).ok_or(FormatError::SyntaxError)?;

        // Range of the text between the quotes, relative to the start of this part.
        let relative_raw_content_range = TextRange::new(
            prefix.text_len() + quotes.text_len(),
            string_content.text_len() - quotes.text_len(),
        );
        // The same range expressed as absolute source offsets.
        let raw_content_range = relative_raw_content_range + self.part_range.start();
        let raw_content = &string_content[relative_raw_content_range];

        let is_raw =
            prefix.contains(StringPrefix::RAW) || prefix.contains(StringPrefix::RAW_UPPER);

        // Always run the detection: it also reports whether the content contains newlines.
        let (detected_quotes, contains_newlines) = preferred_quotes(raw_content, quotes);

        // Switching the quotes of a raw string is only known to be safe when the content
        // holds no quote character at all; otherwise keep what the author wrote.
        let has_quote_chars = raw_content.contains(|c: char| matches!(c, '\'' | '"'));
        let output_quotes = if is_raw && has_quote_chars {
            quotes
        } else {
            detected_quotes
        };

        write!(f, [prefix, output_quotes])?;

        let normalized = if is_raw {
            Cow::Borrowed(raw_content)
        } else {
            normalize_quotes(raw_content, output_quotes)
        };

        match normalized {
            // Content is unchanged: emit it as a slice of the source.
            Cow::Borrowed(_) => {
                source_text_slice(raw_content_range, contains_newlines).fmt(f)?;
            }
            // Content was rewritten: emit the new text, positioned at the original content start.
            Cow::Owned(normalized) => {
                dynamic_text(&normalized, Some(raw_content_range.start())).fmt(f)?;
            }
        }

        output_quotes.fmt(f)
    }
}
bitflags! {
    /// Set of prefix characters found in front of a string literal's opening quote.
    #[derive(Copy, Clone, Debug)]
    struct StringPrefix: u8 {
        /// `u"test"` or `U"test"`
        const UNICODE = 0b0000_0001;
        /// `r"test"`
        const RAW = 0b0000_0010;
        /// `R"test"`
        const RAW_UPPER = 0b0000_0100;
        /// `b"test"` or `B"test"`
        const BYTE = 0b0000_1000;
        /// `f"test"` or `F"test"`
        const F_STRING = 0b0001_0000;
    }
}
impl StringPrefix {
    /// Collects the prefix flags from the characters of `input` that precede the first quote.
    ///
    /// Scanning stops at the first `'` or `"`, or at the end of `input`.
    ///
    /// # Panics
    /// Panics if a character that is neither a known prefix letter nor a quote is encountered
    /// before the first quote.
    fn parse(input: &str) -> StringPrefix {
        let mut flags = StringPrefix::empty();

        for ch in input.chars() {
            match ch {
                // The opening quote ends the prefix.
                '\'' | '"' => break,
                'u' | 'U' => flags |= StringPrefix::UNICODE,
                'f' | 'F' => flags |= StringPrefix::F_STRING,
                'b' | 'B' => flags |= StringPrefix::BYTE,
                'r' => flags |= StringPrefix::RAW,
                'R' => flags |= StringPrefix::RAW_UPPER,
                c => {
                    unreachable!(
                        "Unexpected character '{c}' terminating the prefix of a string literal"
                    );
                }
            }
        }

        flags
    }

    /// Length of the prefix in the source text, computed as the number of flags set.
    ///
    /// NOTE(review): this equals the real prefix length only if every prefix letter sets a
    /// distinct flag, i.e. no flag is produced twice by the same prefix — confirm.
    const fn text_len(self) -> TextSize {
        let flag_count = self.bits().count_ones();
        TextSize::new(flag_count)
    }
}
impl Format<PyFormatContext<'_>> for StringPrefix {
    /// Writes the prefix in the order raw marker, `b`, `f`.
    ///
    /// The unicode prefix `u` is dropped because it is meaningless in Python 3+.
    fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
        // Retain the casing for the raw prefix:
        // https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#r-strings-and-r-strings
        let raw_marker = if self.contains(StringPrefix::RAW) {
            Some("r")
        } else if self.contains(StringPrefix::RAW_UPPER) {
            Some("R")
        } else {
            None
        };

        if let Some(marker) = raw_marker {
            text(marker).fmt(f)?;
        }

        if self.contains(StringPrefix::BYTE) {
            text("b").fmt(f)?;
        }

        if self.contains(StringPrefix::F_STRING) {
            text("f").fmt(f)?;
        }

        Ok(())
    }
}
/// Detects the preferred quotes for `input`, the content between the quotes of a string literal.
///
/// * single quoted strings: The preferred quote style is the one that requires fewer escape
///   sequences; double quotes win a tie.
/// * triple quoted strings: Use double quotes unless the content contains an unescaped `"""`
///   sequence or ends with an unescaped `"`, because either would terminate (or merge with) the
///   closing `"""` delimiter.
///
/// Returns the quotes to use (the `triple` flag is carried over from `quotes` unchanged) together
/// with whether `input` contains a `\n` or `\r` character.
fn preferred_quotes(input: &str, quotes: StringQuotes) -> (StringQuotes, ContainsNewlines) {
    let mut contains_newlines = ContainsNewlines::No;

    let preferred_style = if quotes.triple {
        let mut use_single_quotes = false;
        let mut chars = input.chars().peekable();

        // Keep scanning even after `use_single_quotes` is set: the newline detection
        // needs to see the whole content.
        while let Some(c) = chars.next() {
            match c {
                '\n' | '\r' => contains_newlines = ContainsNewlines::Yes,
                '\\' => {
                    // Skip an escaped quote or backslash so it is not mistaken for a delimiter.
                    if matches!(chars.peek(), Some('"' | '\\')) {
                        chars.next();
                    }
                }
                '"' => match chars.peek().copied() {
                    Some('"') => {
                        // `""`
                        chars.next();

                        match chars.peek().copied() {
                            Some('"') => {
                                // `"""`
                                chars.next();
                                use_single_quotes = true;
                            }
                            Some(_) => {
                                // Two quotes followed by something else, this is ok
                            }
                            None => {
                                // The content ends with `""`. Both quotes are consumed at this
                                // point, so this is the only chance to notice that the content
                                // ends with a double quote.
                                use_single_quotes = true;
                            }
                        }
                    }
                    Some(_) => {
                        // A lone double quote inside the content, this is ok
                    }
                    None => {
                        // Trailing quote at the end of the content
                        use_single_quotes = true;
                    }
                },
                _ => {}
            }
        }

        if use_single_quotes {
            QuoteStyle::Single
        } else {
            QuoteStyle::Double
        }
    } else {
        let mut single_quotes = 0u32;
        let mut double_quotes = 0u32;

        for c in input.chars() {
            match c {
                '\'' => single_quotes += 1,
                '"' => double_quotes += 1,
                '\n' | '\r' => contains_newlines = ContainsNewlines::Yes,
                _ => {}
            }
        }

        if double_quotes > single_quotes {
            QuoteStyle::Single
        } else {
            QuoteStyle::Double
        }
    };

    (
        StringQuotes {
            triple: quotes.triple,
            style: preferred_style,
        },
        contains_newlines,
    )
}
/// The quotes delimiting a string literal.
#[derive(Copy, Clone, Debug)]
struct StringQuotes {
    /// `true` if the literal is delimited by three quote characters instead of one.
    triple: bool,
    /// Which quote character is used.
    style: QuoteStyle,
}

impl StringQuotes {
    /// Reads the opening quotes at the start of `input`.
    ///
    /// Returns `None` if `input` is empty or its first character is not a quote character.
    /// The quotes are considered triple quotes when the first character is immediately
    /// followed by two more occurrences of itself.
    fn parse(input: &str) -> Option<StringQuotes> {
        let mut rest = input.chars();

        let opening = rest.next()?;
        let style = QuoteStyle::try_from(opening).ok()?;

        let repeats = rest.take(2).filter(|&c| c == opening).count();

        Some(Self {
            triple: repeats == 2,
            style,
        })
    }

    /// Length of the quotes on one side of the literal: three for triple quotes, one otherwise.
    const fn text_len(self) -> TextSize {
        let len = if self.triple { 3 } else { 1 };
        TextSize::new(len)
    }
}
impl Format<PyFormatContext<'_>> for StringQuotes {
    /// Writes the quote characters: one or three single or double quotes.
    fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
        let quotes = match self.style {
            QuoteStyle::Single => {
                if self.triple {
                    "'''"
                } else {
                    "'"
                }
            }
            QuoteStyle::Double => {
                if self.triple {
                    "\"\"\""
                } else {
                    "\""
                }
            }
        };

        text(quotes).fmt(f)
    }
}
/// Rewrites the quote escapes of `input` so that it can be wrapped in `quotes`.
///
/// Unescaped occurrences of the quote character of `quotes` get a backslash prepended, and the
/// backslash in front of the opposite quote character is removed. Triple quoted strings are
/// returned unchanged.
///
/// Returns `Cow::Borrowed` when nothing had to be rewritten and `Cow::Owned` otherwise.
fn normalize_quotes(input: &str, quotes: StringQuotes) -> Cow<str> {
    if quotes.triple {
        return Cow::Borrowed(input);
    }

    let quote_to_escape = quotes.style.as_char();
    let quote_to_unescape = quotes.style.opposite().as_char();

    // Only allocated into once the first rewrite is necessary.
    let mut rewritten = String::new();
    // Byte offset up to which `input` has been copied into `rewritten`.
    // Every rewrite advances it past offset 0, so `0` means "nothing changed".
    let mut copied_up_to = 0;

    let mut iter = input.char_indices();
    while let Some((position, current)) = iter.next() {
        match current {
            '\\' => {
                // The character after a backslash is always consumed together with it.
                let escaped = iter.next().map(|(_, ch)| ch);
                if escaped == Some(quote_to_unescape) {
                    // Drop the backslash: copy everything before it and resume at the quote.
                    rewritten.push_str(&input[copied_up_to..position]);
                    copied_up_to = position + '\\'.len_utf8();
                }
            }
            ch if ch == quote_to_escape => {
                // Insert a backslash in front of the quote.
                rewritten.push_str(&input[copied_up_to..position]);
                rewritten.push('\\');
                rewritten.push(ch);
                copied_up_to = position + quote_to_escape.len_utf8();
            }
            _ => {}
        }
    }

    if copied_up_to == 0 {
        return Cow::Borrowed(input);
    }

    rewritten.push_str(&input[copied_up_to..]);
    Cow::Owned(rewritten)
}