Format docstrings (#6452)

**Summary** Implement docstring formatting

**Test Plan** Matches black's `docstring.py` fixture exactly, added some
new cases for what is hard to debug with black and with what black
doesn't cover.

similarity index:

main:
zulip: 0.99702
django: 0.99784
warehouse: 0.99585
build: 0.75623
transformers: 0.99469
cpython: 0.75989
typeshed: 0.74853

this branch:

zulip: 0.99702
django: 0.99784
warehouse: 0.99585
build: 0.75623
transformers: 0.99464
cpython: 0.75517
typeshed: 0.74853

The regression in transformers is actually an improvement in a file they
don't format with black (they run `black examples tests src utils
setup.py conftest.py`, the difference is in hubconf.py). cpython doesn't
use black.

Closes #6196
This commit is contained in:
konsti 2023-08-14 14:28:58 +02:00 committed by GitHub
parent 910dbbd9b6
commit 01eceaf0dc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 1064 additions and 1245 deletions

View file

@ -1,16 +1,16 @@
use std::borrow::Cow;
use bitflags::bitflags;
use ruff_formatter::{format_args, write, FormatError};
use ruff_python_ast::node::AnyNodeRef;
use ruff_python_ast::str::is_implicit_concatenation;
use ruff_python_ast::{self as ast, ExprConstant, ExprFString, Ranged};
use ruff_python_parser::lexer::{lex_starts_at, LexicalError, LexicalErrorType};
use ruff_python_parser::{Mode, Tok};
use ruff_source_file::Locator;
use ruff_text_size::{TextLen, TextRange, TextSize};
use ruff_formatter::{format_args, write, FormatError};
use ruff_python_ast::str::is_implicit_concatenation;
use crate::comments::{leading_comments, trailing_comments};
use crate::expression::parentheses::{
in_parentheses_only_group, in_parentheses_only_soft_line_break_or_space,
@ -78,7 +78,7 @@ pub(super) struct FormatString<'a> {
pub enum StringLayout {
#[default]
Default,
DocString,
ImplicitConcatenatedBinaryLeftSide,
}
@ -109,10 +109,25 @@ impl<'a> Format<PyFormatContext<'_>> for FormatString<'a> {
if is_implicit_concatenation(string_content) {
in_parentheses_only_group(&FormatStringContinuation::new(self.string)).fmt(f)
} else {
FormatStringPart::new(string_range, self.string.quoting(&f.context().locator()))
.fmt(f)
FormatStringPart::new(
string_range,
self.string.quoting(&f.context().locator()),
&f.context().locator(),
f.options().quote_style(),
)
.fmt(f)
}
}
StringLayout::DocString => {
let string_part = FormatStringPart::new(
self.string.range(),
// f-strings can't be docstrings
Quoting::CanChange,
&f.context().locator(),
f.options().quote_style(),
);
format_docstring(&string_part, f)
}
StringLayout::ImplicitConcatenatedBinaryLeftSide => {
FormatStringContinuation::new(self.string).fmt(f)
}
@ -137,6 +152,7 @@ impl Format<PyFormatContext<'_>> for FormatStringContinuation<'_> {
fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> {
let comments = f.context().comments().clone();
let locator = f.context().locator();
let quote_style = f.options().quote_style();
let mut dangling_comments = comments.dangling_comments(self.string);
let string_range = self.string.range();
@ -213,7 +229,12 @@ impl Format<PyFormatContext<'_>> for FormatStringContinuation<'_> {
joiner.entry(&format_args![
line_suffix_boundary(),
leading_comments(leading_part_comments),
FormatStringPart::new(token_range, self.string.quoting(&locator)),
FormatStringPart::new(
token_range,
self.string.quoting(&locator),
&locator,
quote_style,
),
trailing_comments(trailing_part_comments)
]);
@ -235,63 +256,67 @@ impl Format<PyFormatContext<'_>> for FormatStringContinuation<'_> {
}
struct FormatStringPart {
part_range: TextRange,
quoting: Quoting,
prefix: StringPrefix,
preferred_quotes: StringQuotes,
range: TextRange,
is_raw_string: bool,
}
impl FormatStringPart {
const fn new(range: TextRange, quoting: Quoting) -> Self {
fn new(range: TextRange, quoting: Quoting, locator: &Locator, quote_style: QuoteStyle) -> Self {
let string_content = locator.slice(range);
let prefix = StringPrefix::parse(string_content);
let after_prefix = &string_content[usize::from(prefix.text_len())..];
let quotes =
StringQuotes::parse(after_prefix).expect("Didn't find string quotes after prefix");
let relative_raw_content_range = TextRange::new(
prefix.text_len() + quotes.text_len(),
string_content.text_len() - quotes.text_len(),
);
let raw_content_range = relative_raw_content_range + range.start();
let raw_content = &string_content[relative_raw_content_range];
let is_raw_string = prefix.is_raw_string();
let preferred_quotes = match quoting {
Quoting::Preserve => quotes,
Quoting::CanChange => {
if is_raw_string {
preferred_quotes_raw(raw_content, quotes, quote_style)
} else {
preferred_quotes(raw_content, quotes, quote_style)
}
}
};
Self {
part_range: range,
quoting,
prefix,
range: raw_content_range,
preferred_quotes,
is_raw_string,
}
}
}
impl Format<PyFormatContext<'_>> for FormatStringPart {
fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> {
let string_content = f.context().locator().slice(self.part_range);
let prefix = StringPrefix::parse(string_content);
let after_prefix = &string_content[usize::from(prefix.text_len())..];
let quotes = StringQuotes::parse(after_prefix).ok_or(FormatError::syntax_error(
"Didn't find string quotes after prefix",
))?;
let relative_raw_content_range = TextRange::new(
prefix.text_len() + quotes.text_len(),
string_content.text_len() - quotes.text_len(),
let (normalized, contains_newlines) = normalize_string(
f.context().locator().slice(self.range),
self.preferred_quotes,
self.is_raw_string,
);
let raw_content_range = relative_raw_content_range + self.part_range.start();
let raw_content = &string_content[relative_raw_content_range];
let is_raw_string = prefix.is_raw_string();
let preferred_quotes = match self.quoting {
Quoting::Preserve => quotes,
Quoting::CanChange => {
if is_raw_string {
preferred_quotes_raw(raw_content, quotes, f.options().quote_style())
} else {
preferred_quotes(raw_content, quotes, f.options().quote_style())
}
}
};
write!(f, [prefix, preferred_quotes])?;
let (normalized, contains_newlines) =
normalize_string(raw_content, preferred_quotes, is_raw_string);
write!(f, [self.prefix, self.preferred_quotes])?;
match normalized {
Cow::Borrowed(_) => {
source_text_slice(raw_content_range, contains_newlines).fmt(f)?;
source_text_slice(self.range, contains_newlines).fmt(f)?;
}
Cow::Owned(normalized) => {
dynamic_text(&normalized, Some(raw_content_range.start())).fmt(f)?;
dynamic_text(&normalized, Some(self.range.start())).fmt(f)?;
}
}
preferred_quotes.fmt(f)
self.preferred_quotes.fmt(f)
}
}
@ -639,3 +664,311 @@ fn normalize_string(
(normalized, newlines)
}
/// For docstring indentation, black counts spaces as 1 and tabs by increasing the indentation up
/// to the next multiple of 8. This is effectively a port of
/// [`str.expandtabs`](https://docs.python.org/3/library/stdtypes.html#str.expandtabs),
/// which black [calls with the default tab width of 8](https://github.com/psf/black/blob/c36e468794f9256d5e922c399240d49782ba04f1/src/black/strings.py#L61)
fn count_indentation_like_black(line: &str) -> TextSize {
let tab_width: u32 = 8;
let mut indentation = TextSize::default();
for char in line.chars() {
if char == '\t' {
// Pad to the next multiple of tab_width
indentation += TextSize::from(tab_width - (indentation.to_u32().rem_euclid(tab_width)));
} else if char.is_whitespace() {
indentation += char.text_len();
} else {
return indentation;
}
}
indentation
}
/// Format a docstring by trimming whitespace and adjusting the indentation.
///
/// Summary of changes we make:
/// * Normalize the string like all other strings
/// * Ignore docstring that have an escaped newline
/// * Trim all trailing whitespace, except for a chaperone space that avoids quotes or backslashes
/// in the last line.
/// * Trim leading whitespace on the first line, again except for a chaperone space
/// * If there is only content in the first line and after that only whitespace, collapse the
/// docstring into one line
/// * Adjust the indentation (see below)
///
/// # Docstring indentation
///
/// Unlike any other string, like black we change the indentation of docstring lines.
///
/// We want to preserve the indentation inside the docstring relative to the suite statement/block
/// indent that the docstring statement is in, but also want to apply the change of the outer
/// indentation in the docstring, e.g.
/// ```python
/// def sparkle_sky():
/// """Make a pretty sparkly sky.
/// * * ✨ *. .
/// * * ✨ .
/// . * . ✨ * . .
/// """
/// ```
/// should become
/// ```python
/// def sparkle_sky():
/// """Make a pretty sparkly sky.
/// * * ✨ *. .
/// * * ✨ .
/// . * . ✨ * . .
/// """
/// ```
/// We can't compute the full indentation here since we don't know what the block indent of
/// the doc comment will be yet and which we can only have added by formatting each line
/// separately with a hard line break. This means we need to strip shared indentation from
/// docstring while preserving the in-docstring bigger-than-suite-statement indentation. Example:
/// ```python
/// def f():
/// """first line
/// line a
/// line b
/// """
/// ```
/// The docstring indentation is 2, the block indents will change this to 4 (but we can't
/// determine this at this point). The indentation of line a is 2, so we trim ` line a`
/// to `line a`. For line b it's 5, so we trim it to `line b` and pad with 5-2=3 spaces to
/// ` line b`. The closing quotes, being on their own line, are stripped get only the
/// default indentation. Fully formatted:
/// ```python
/// def f():
/// """first line
/// line a
/// line b
/// """
/// ```
///
/// Tabs are counted by padding them to the next multiple of 8 according to
/// [`str.expandtabs`](https://docs.python.org/3/library/stdtypes.html#str.expandtabs). When
/// we see indentation that contains a tab or any other none ascii-space whitespace we rewrite the
/// string.
///
/// Additionally, if any line in the docstring has less indentation than the docstring
/// (effectively a negative indentation wrt. to the current level), we pad all lines to the
/// level of the docstring with spaces.
/// ```python
/// def f():
/// """first line
/// line a
/// line b
/// line c
/// """
/// ```
/// Here line a is 3 columns negatively indented, so we pad all lines by an extra 3 spaces:
/// ```python
/// def f():
/// """first line
/// line a
/// line b
/// line c
/// """
/// ```
fn format_docstring(string_part: &FormatStringPart, f: &mut PyFormatter) -> FormatResult<()> {
let locator = f.context().locator();
// Black doesn't change the indentation of docstrings that contain an escaped newline
if locator.slice(string_part.range).contains("\\\n") {
return string_part.fmt(f);
}
let (normalized, _) = normalize_string(
locator.slice(string_part.range),
string_part.preferred_quotes,
string_part.is_raw_string,
);
// is_borrowed is unstable :/
let already_normalized = matches!(normalized, Cow::Borrowed(_));
let mut lines = normalized.lines().peekable();
// Start the string
write!(
f,
[
source_position(string_part.range.start()),
string_part.prefix,
string_part.preferred_quotes
]
)?;
// We track where in the source docstring we are (in source code byte offsets)
let mut offset = string_part.range.start();
// The first line directly after the opening quotes has different rules than the rest, mainly
// that we remove all leading whitespace as there's no indentation
let first = lines.next().unwrap_or_default();
// Black trims whitespace using [`str.strip()`](https://docs.python.org/3/library/stdtypes.html#str.strip)
// https://github.com/psf/black/blob/b4dca26c7d93f930bbd5a7b552807370b60d4298/src/black/strings.py#L77-L85
// So we use the unicode whitespace definition through `trim_{start,end}` instead of the python
// tokenizer whitespace definition in `trim_whitespace_{start,end}`.
let trim_end = first.trim_end();
let trim_both = trim_end.trim_start();
// Edge case: The first line is `""" "content`, so we need to insert chaperone space that keep
// inner quotes and closing quotes from getting to close to avoid `""""content`
if trim_both.starts_with(string_part.preferred_quotes.style.as_char()) {
space().fmt(f)?;
}
if !trim_end.is_empty() {
// For the first line of the docstring we strip the leading and trailing whitespace, e.g.
// `""" content ` to `"""content`
let leading_whitespace = trim_end.text_len() - trim_both.text_len();
let trimmed_line_range =
TextRange::at(offset, trim_end.text_len()).add_start(leading_whitespace);
if already_normalized {
source_text_slice(trimmed_line_range, ContainsNewlines::No).fmt(f)?;
} else {
dynamic_text(trim_both, Some(trimmed_line_range.start())).fmt(f)?;
}
}
offset += first.text_len();
// Check if we have a single line (or empty) docstring
if normalized[first.len()..].trim().is_empty() {
// For `"""\n"""` or other whitespace between the quotes, black keeps a single whitespace,
// but `""""""` doesn't get one inserted.
if needs_chaperone_space(string_part, trim_end)
|| (trim_end.is_empty() && !normalized.is_empty())
{
space().fmt(f)?;
}
string_part.preferred_quotes.fmt(f)?;
return Ok(());
}
hard_line_break().fmt(f)?;
// We know that the normalized string has \n line endings
offset += "\n".text_len();
// If some line of the docstring is less indented than the function body, we pad all lines to
// align it with the docstring statement. Conversely, if all lines are over-indented, we strip
// the extra indentation. We call this stripped indentation since it's relative to the block
// indent printer-made indentation.
let stripped_indentation = lines
.clone()
// We don't want to count whitespace-only lines as miss-indented
.filter(|line| !line.trim().is_empty())
.map(count_indentation_like_black)
.min()
.unwrap_or_default();
while let Some(line) = lines.next() {
let is_last = lines.peek().is_none();
format_docstring_line(
line,
is_last,
offset,
stripped_indentation,
already_normalized,
f,
)?;
// We know that the normalized string has \n line endings
offset += line.text_len() + "\n".text_len();
}
// Same special case in the last line as for the first line
let trim_end = normalized
.as_ref()
.trim_end_matches(|c: char| c.is_whitespace() && c != '\n');
if needs_chaperone_space(string_part, trim_end) {
space().fmt(f)?;
}
write!(
f,
[
string_part.preferred_quotes,
source_position(string_part.range.end())
]
)
}
/// If the last line of the docstring is `content" """` or `content\ """`, we need a chaperone space
/// that avoids `content""""` and `content\"""`. This does only applies to un-escaped backslashes,
/// so `content\\ """` doesn't need a space while `content\\\ """` does.
fn needs_chaperone_space(string_part: &FormatStringPart, trim_end: &str) -> bool {
trim_end.ends_with(string_part.preferred_quotes.style.as_char())
|| trim_end.chars().rev().take_while(|c| *c == '\\').count() % 2 == 1
}
/// Format a docstring line that is not the first line
fn format_docstring_line(
line: &str,
is_last: bool,
offset: TextSize,
stripped_indentation: TextSize,
already_normalized: bool,
f: &mut PyFormatter,
) -> FormatResult<()> {
let trim_end = line.trim_end();
if trim_end.is_empty() {
return if is_last {
// If the doc string ends with ` """`, the last line is ` `, but we don't want to
// insert an empty line (but close the docstring)
Ok(())
} else {
empty_line().fmt(f)
};
}
let tab_or_non_ascii_space = trim_end
.chars()
.take_while(|c| c.is_whitespace())
.any(|c| c != ' ');
if tab_or_non_ascii_space {
// We strip the indentation that is shared with the docstring statement, unless a line
// was indented less than the docstring statement, in which we strip only this much
// indentation to implicitly pad all lines by the difference, or all lines were
// overindented, in which case we strip the additional whitespace (see example in
// [`format_docstring`] doc comment). We then prepend the in-docstring indentation to the
// string.
let indent_len = count_indentation_like_black(trim_end) - stripped_indentation;
let in_docstring_indent = " ".repeat(indent_len.to_usize()) + trim_end.trim_start();
dynamic_text(&in_docstring_indent, Some(offset)).fmt(f)?;
} else {
// Take the string with the trailing whitespace removed, then also skip the leading
// whitespace
let trimmed_line_range =
TextRange::at(offset, trim_end.text_len()).add_start(stripped_indentation);
if already_normalized {
source_text_slice(trimmed_line_range, ContainsNewlines::No).fmt(f)?;
} else {
// All indents are ascii spaces, so the slicing is correct
dynamic_text(
&trim_end[stripped_indentation.to_usize()..],
Some(trimmed_line_range.start()),
)
.fmt(f)?;
}
}
// We handled the case that the closing quotes are on their own line above (the last line is
// empty except for whitespace). If they are on the same line as content, we don't insert a line
// break.
if !is_last {
hard_line_break().fmt(f)?;
}
Ok(())
}
#[cfg(test)]
mod tests {
use crate::expression::string::count_indentation_like_black;
#[test]
fn test_indentation_like_black() {
assert_eq!(count_indentation_like_black("\t \t \t").to_u32(), 24);
assert_eq!(count_indentation_like_black("\t \t").to_u32(), 24);
assert_eq!(count_indentation_like_black("\t\t\t").to_u32(), 24);
assert_eq!(count_indentation_like_black(" ").to_u32(), 4);
}
}

View file

@ -1,9 +1,15 @@
use ruff_formatter::{write, FormatOwnedWithRule, FormatRefWithRule, FormatRuleWithOptions};
use ruff_python_ast::helpers::is_compound_statement;
use ruff_python_ast::{self as ast, Constant, Expr, Ranged, Stmt, Suite};
use ruff_python_ast::str::is_implicit_concatenation;
use ruff_python_ast::{self as ast, Expr, Ranged, Stmt, Suite};
use ruff_python_ast::{Constant, ExprConstant};
use ruff_python_trivia::{lines_after_ignoring_trivia, lines_before};
use ruff_source_file::Locator;
use crate::comments::{leading_comments, trailing_comments};
use crate::context::{NodeLevel, WithNodeLevel};
use crate::expression::expr_constant::ExprConstantLayout;
use crate::expression::string::StringLayout;
use crate::prelude::*;
/// Level at which the [`Suite`] appears in the source code.
@ -71,44 +77,76 @@ impl FormatRule<Suite, PyFormatContext<'_>> for FormatSuite {
}
write!(f, [first.format()])?;
}
SuiteKind::Class if is_docstring(first) => {
if !comments.has_leading_comments(first) && lines_before(first.start(), source) > 1
{
// Allow up to one empty line before a class docstring, e.g., this is
// stable formatting:
SuiteKind::Function => {
if let Some(constant) = get_docstring(first, &f.context().locator()) {
write!(
f,
[
// We format the expression, but the statement carries the comments
leading_comments(comments.leading_comments(first)),
constant
.format()
.with_options(ExprConstantLayout::String(StringLayout::DocString)),
trailing_comments(comments.trailing_comments(first)),
]
)?;
} else {
write!(f, [first.format()])?;
}
}
SuiteKind::Class => {
if let Some(constant) = get_docstring(first, &f.context().locator()) {
if !comments.has_leading_comments(first)
&& lines_before(first.start(), source) > 1
{
// Allow up to one empty line before a class docstring
// ```python
// class Test:
//
// """Docstring"""
// ```
write!(f, [empty_line()])?;
}
write!(
f,
[
// We format the expression, but the statement carries the comments
leading_comments(comments.leading_comments(first)),
constant
.format()
.with_options(ExprConstantLayout::String(StringLayout::DocString)),
trailing_comments(comments.trailing_comments(first)),
]
)?;
// Enforce an empty line after a class docstring
// ```python
// class Test:
// """Docstring"""
//
// ...
//
//
// class Test:
//
// """Docstring"""
//
// ...
// ```
write!(f, [empty_line()])?;
}
write!(f, [first.format()])?;
// Enforce an empty line after a class docstring, e.g., these are both stable
// formatting:
// ```python
// class Test:
// """Docstring"""
//
// ...
//
//
// class Test:
//
// """Docstring"""
//
// ...
// ```
if let Some(second) = iter.next() {
// Format the subsequent statement immediately. This rule takes precedence
// over the rules in the loop below (and most of them won't apply anyway,
// e.g., we know the first statement isn't an import).
write!(f, [empty_line(), second.format()])?;
last = second;
// Unlike black, we add the newline also after single quoted docstrings
if let Some(second) = iter.next() {
// Format the subsequent statement immediately. This rule takes precedence
// over the rules in the loop below (and most of them won't apply anyway,
// e.g., we know the first statement isn't an import).
write!(f, [empty_line(), second.format()])?;
last = second;
}
} else {
// No docstring, use normal formatting
write!(f, [first.format()])?;
}
}
_ => {
SuiteKind::TopLevel => {
write!(f, [first.format()])?;
}
}
@ -218,18 +256,27 @@ const fn is_import_definition(stmt: &Stmt) -> bool {
matches!(stmt, Stmt::Import(_) | Stmt::ImportFrom(_))
}
fn is_docstring(stmt: &Stmt) -> bool {
/// Checks if the statement is a simple string that can be formatted as a docstring
fn get_docstring<'a>(stmt: &'a Stmt, locator: &Locator) -> Option<&'a ExprConstant> {
let Stmt::Expr(ast::StmtExpr { value, .. }) = stmt else {
return false;
return None;
};
matches!(
value.as_ref(),
Expr::Constant(ast::ExprConstant {
value: Constant::Str(..),
..
})
)
let Expr::Constant(constant) = value.as_ref() else {
return None;
};
if let ExprConstant {
value: Constant::Str(..),
range,
..
} = constant
{
if is_implicit_concatenation(locator.slice(*range)) {
return None;
}
return Some(constant);
}
None
}
impl FormatRuleWithOptions<Suite, PyFormatContext<'_>> for FormatSuite {
@ -260,7 +307,6 @@ impl<'ast> IntoFormat<PyFormatContext<'ast>> for Suite {
#[cfg(test)]
mod tests {
use ruff_formatter::format;
use ruff_python_parser::parse_suite;
use crate::comments::Comments;