mirror of
https://github.com/astral-sh/ruff.git
synced 2025-09-26 11:59:10 +00:00
split string module (#9987)
This commit is contained in:
parent
bb8d2034e2
commit
fe79798c12
4 changed files with 847 additions and 813 deletions
212
crates/ruff_python_formatter/src/string/any.rs
Normal file
212
crates/ruff_python_formatter/src/string/any.rs
Normal file
|
@ -0,0 +1,212 @@
|
||||||
|
use std::iter::FusedIterator;
|
||||||
|
|
||||||
|
use memchr::memchr2;
|
||||||
|
|
||||||
|
use ruff_python_ast::{
|
||||||
|
self as ast, AnyNodeRef, Expr, ExprBytesLiteral, ExprFString, ExprStringLiteral, ExpressionRef,
|
||||||
|
StringLiteral,
|
||||||
|
};
|
||||||
|
use ruff_source_file::Locator;
|
||||||
|
use ruff_text_size::{Ranged, TextLen, TextRange};
|
||||||
|
|
||||||
|
use crate::expression::expr_f_string::f_string_quoting;
|
||||||
|
use crate::other::f_string::FormatFString;
|
||||||
|
use crate::other::string_literal::{FormatStringLiteral, StringLiteralKind};
|
||||||
|
use crate::prelude::*;
|
||||||
|
use crate::string::{Quoting, StringPrefix, StringQuotes};
|
||||||
|
|
||||||
|
/// Represents any kind of string expression. This could be either a string,
|
||||||
|
/// bytes or f-string.
|
||||||
|
#[derive(Copy, Clone, Debug)]
|
||||||
|
pub(crate) enum AnyString<'a> {
|
||||||
|
String(&'a ExprStringLiteral),
|
||||||
|
Bytes(&'a ExprBytesLiteral),
|
||||||
|
FString(&'a ExprFString),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> AnyString<'a> {
|
||||||
|
/// Creates a new [`AnyString`] from the given [`Expr`].
|
||||||
|
///
|
||||||
|
/// Returns `None` if the expression is not either a string, bytes or f-string.
|
||||||
|
pub(crate) fn from_expression(expression: &'a Expr) -> Option<AnyString<'a>> {
|
||||||
|
match expression {
|
||||||
|
Expr::StringLiteral(string) => Some(AnyString::String(string)),
|
||||||
|
Expr::BytesLiteral(bytes) => Some(AnyString::Bytes(bytes)),
|
||||||
|
Expr::FString(fstring) => Some(AnyString::FString(fstring)),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` if the string is implicitly concatenated.
|
||||||
|
pub(crate) fn is_implicit_concatenated(self) -> bool {
|
||||||
|
match self {
|
||||||
|
Self::String(ExprStringLiteral { value, .. }) => value.is_implicit_concatenated(),
|
||||||
|
Self::Bytes(ExprBytesLiteral { value, .. }) => value.is_implicit_concatenated(),
|
||||||
|
Self::FString(ExprFString { value, .. }) => value.is_implicit_concatenated(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the quoting to be used for this string.
|
||||||
|
pub(super) fn quoting(self, locator: &Locator<'_>) -> Quoting {
|
||||||
|
match self {
|
||||||
|
Self::String(_) | Self::Bytes(_) => Quoting::CanChange,
|
||||||
|
Self::FString(f_string) => f_string_quoting(f_string, locator),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a vector of all the [`AnyStringPart`] of this string.
|
||||||
|
pub(super) fn parts(self, quoting: Quoting) -> AnyStringPartsIter<'a> {
|
||||||
|
match self {
|
||||||
|
Self::String(ExprStringLiteral { value, .. }) => {
|
||||||
|
AnyStringPartsIter::String(value.iter())
|
||||||
|
}
|
||||||
|
Self::Bytes(ExprBytesLiteral { value, .. }) => AnyStringPartsIter::Bytes(value.iter()),
|
||||||
|
Self::FString(ExprFString { value, .. }) => {
|
||||||
|
AnyStringPartsIter::FString(value.iter(), quoting)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn is_multiline(self, source: &str) -> bool {
|
||||||
|
match self {
|
||||||
|
AnyString::String(_) | AnyString::Bytes(_) => {
|
||||||
|
let contents = &source[self.range()];
|
||||||
|
let prefix = StringPrefix::parse(contents);
|
||||||
|
let quotes = StringQuotes::parse(
|
||||||
|
&contents[TextRange::new(prefix.text_len(), contents.text_len())],
|
||||||
|
);
|
||||||
|
|
||||||
|
quotes.is_some_and(StringQuotes::is_triple)
|
||||||
|
&& memchr2(b'\n', b'\r', contents.as_bytes()).is_some()
|
||||||
|
}
|
||||||
|
AnyString::FString(fstring) => {
|
||||||
|
memchr2(b'\n', b'\r', source[fstring.range].as_bytes()).is_some()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Ranged for AnyString<'_> {
|
||||||
|
fn range(&self) -> TextRange {
|
||||||
|
match self {
|
||||||
|
Self::String(expr) => expr.range(),
|
||||||
|
Self::Bytes(expr) => expr.range(),
|
||||||
|
Self::FString(expr) => expr.range(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> From<&AnyString<'a>> for AnyNodeRef<'a> {
|
||||||
|
fn from(value: &AnyString<'a>) -> Self {
|
||||||
|
match value {
|
||||||
|
AnyString::String(expr) => AnyNodeRef::ExprStringLiteral(expr),
|
||||||
|
AnyString::Bytes(expr) => AnyNodeRef::ExprBytesLiteral(expr),
|
||||||
|
AnyString::FString(expr) => AnyNodeRef::ExprFString(expr),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> From<AnyString<'a>> for AnyNodeRef<'a> {
    /// By-value convenience conversion that delegates to the by-reference
    /// `From<&AnyString>` implementation.
    fn from(value: AnyString<'a>) -> Self {
        Self::from(&value)
    }
}
|
||||||
|
|
||||||
|
impl<'a> From<&AnyString<'a>> for ExpressionRef<'a> {
|
||||||
|
fn from(value: &AnyString<'a>) -> Self {
|
||||||
|
match value {
|
||||||
|
AnyString::String(expr) => ExpressionRef::StringLiteral(expr),
|
||||||
|
AnyString::Bytes(expr) => ExpressionRef::BytesLiteral(expr),
|
||||||
|
AnyString::FString(expr) => ExpressionRef::FString(expr),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(super) enum AnyStringPartsIter<'a> {
|
||||||
|
String(std::slice::Iter<'a, StringLiteral>),
|
||||||
|
Bytes(std::slice::Iter<'a, ast::BytesLiteral>),
|
||||||
|
FString(std::slice::Iter<'a, ast::FStringPart>, Quoting),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> Iterator for AnyStringPartsIter<'a> {
|
||||||
|
type Item = AnyStringPart<'a>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
let part = match self {
|
||||||
|
Self::String(inner) => {
|
||||||
|
let part = inner.next()?;
|
||||||
|
AnyStringPart::String {
|
||||||
|
part,
|
||||||
|
layout: StringLiteralKind::String,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Self::Bytes(inner) => AnyStringPart::Bytes(inner.next()?),
|
||||||
|
Self::FString(inner, quoting) => {
|
||||||
|
let part = inner.next()?;
|
||||||
|
match part {
|
||||||
|
ast::FStringPart::Literal(string_literal) => AnyStringPart::String {
|
||||||
|
part: string_literal,
|
||||||
|
layout: StringLiteralKind::InImplicitlyConcatenatedFString(*quoting),
|
||||||
|
},
|
||||||
|
ast::FStringPart::FString(f_string) => AnyStringPart::FString {
|
||||||
|
part: f_string,
|
||||||
|
quoting: *quoting,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
Some(part)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Every variant wraps a `std::slice::Iter`, which is itself fused, and `next` only
// returns `None` by propagating the wrapped iterator's `None`.
impl FusedIterator for AnyStringPartsIter<'_> {}
|
||||||
|
|
||||||
|
/// Represents any kind of string which is part of an implicitly concatenated
|
||||||
|
/// string. This could be either a string, bytes or f-string.
|
||||||
|
///
|
||||||
|
/// This is constructed from the [`AnyString::parts`] method on [`AnyString`].
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub(super) enum AnyStringPart<'a> {
|
||||||
|
String {
|
||||||
|
part: &'a ast::StringLiteral,
|
||||||
|
layout: StringLiteralKind,
|
||||||
|
},
|
||||||
|
Bytes(&'a ast::BytesLiteral),
|
||||||
|
FString {
|
||||||
|
part: &'a ast::FString,
|
||||||
|
quoting: Quoting,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> From<&AnyStringPart<'a>> for AnyNodeRef<'a> {
|
||||||
|
fn from(value: &AnyStringPart<'a>) -> Self {
|
||||||
|
match value {
|
||||||
|
AnyStringPart::String { part, .. } => AnyNodeRef::StringLiteral(part),
|
||||||
|
AnyStringPart::Bytes(part) => AnyNodeRef::BytesLiteral(part),
|
||||||
|
AnyStringPart::FString { part, .. } => AnyNodeRef::FString(part),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Ranged for AnyStringPart<'_> {
|
||||||
|
fn range(&self) -> TextRange {
|
||||||
|
match self {
|
||||||
|
Self::String { part, .. } => part.range(),
|
||||||
|
Self::Bytes(part) => part.range(),
|
||||||
|
Self::FString { part, .. } => part.range(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Format<PyFormatContext<'_>> for AnyStringPart<'_> {
|
||||||
|
fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> {
|
||||||
|
match self {
|
||||||
|
AnyStringPart::String { part, layout } => {
|
||||||
|
FormatStringLiteral::new(part, *layout).fmt(f)
|
||||||
|
}
|
||||||
|
AnyStringPart::Bytes(bytes_literal) => bytes_literal.format().fmt(f),
|
||||||
|
AnyStringPart::FString { part, quoting } => FormatFString::new(part, *quoting).fmt(f),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -109,7 +109,7 @@ use super::{NormalizedString, QuoteChar};
|
||||||
/// `indent-width * spaces` to tabs because doing so could break ASCII art and other docstrings
|
/// `indent-width * spaces` to tabs because doing so could break ASCII art and other docstrings
|
||||||
/// that use spaces for alignment.
|
/// that use spaces for alignment.
|
||||||
pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> FormatResult<()> {
|
pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> FormatResult<()> {
|
||||||
let docstring = &normalized.text;
|
let docstring = &normalized.text();
|
||||||
|
|
||||||
// Black doesn't change the indentation of docstrings that contain an escaped newline
|
// Black doesn't change the indentation of docstrings that contain an escaped newline
|
||||||
if contains_unescaped_newline(docstring) {
|
if contains_unescaped_newline(docstring) {
|
||||||
|
@ -125,7 +125,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form
|
||||||
let mut lines = docstring.split('\n').peekable();
|
let mut lines = docstring.split('\n').peekable();
|
||||||
|
|
||||||
// Start the string
|
// Start the string
|
||||||
write!(f, [normalized.prefix, normalized.quotes])?;
|
write!(f, [normalized.prefix(), normalized.quotes()])?;
|
||||||
// We track where in the source docstring we are (in source code byte offsets)
|
// We track where in the source docstring we are (in source code byte offsets)
|
||||||
let mut offset = normalized.start();
|
let mut offset = normalized.start();
|
||||||
|
|
||||||
|
@ -141,7 +141,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form
|
||||||
|
|
||||||
// Edge case: The first line is `""" "content`, so we need to insert a chaperone space that keeps
|
// Edge case: The first line is `""" "content`, so we need to insert a chaperone space that keeps
|
||||||
// inner quotes and closing quotes from getting too close, to avoid `""""content`
|
// inner quotes and closing quotes from getting too close, to avoid `""""content`
|
||||||
if trim_both.starts_with(normalized.quotes.quote_char.as_char()) {
|
if trim_both.starts_with(normalized.quotes().quote_char.as_char()) {
|
||||||
space().fmt(f)?;
|
space().fmt(f)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -168,7 +168,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form
|
||||||
{
|
{
|
||||||
space().fmt(f)?;
|
space().fmt(f)?;
|
||||||
}
|
}
|
||||||
normalized.quotes.fmt(f)?;
|
normalized.quotes().fmt(f)?;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -194,7 +194,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form
|
||||||
offset,
|
offset,
|
||||||
stripped_indentation,
|
stripped_indentation,
|
||||||
already_normalized,
|
already_normalized,
|
||||||
quote_char: normalized.quotes.quote_char,
|
quote_char: normalized.quotes().quote_char,
|
||||||
code_example: CodeExample::default(),
|
code_example: CodeExample::default(),
|
||||||
}
|
}
|
||||||
.add_iter(lines)?;
|
.add_iter(lines)?;
|
||||||
|
@ -207,7 +207,7 @@ pub(crate) fn format(normalized: &NormalizedString, f: &mut PyFormatter) -> Form
|
||||||
space().fmt(f)?;
|
space().fmt(f)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
write!(f, [normalized.quotes])
|
write!(f, [normalized.quotes()])
|
||||||
}
|
}
|
||||||
|
|
||||||
fn contains_unescaped_newline(haystack: &str) -> bool {
|
fn contains_unescaped_newline(haystack: &str) -> bool {
|
||||||
|
@ -1569,7 +1569,7 @@ fn docstring_format_source(
|
||||||
/// that avoids `content""""` and `content\"""`. This only applies to un-escaped backslashes,
|
/// that avoids `content""""` and `content\"""`. This only applies to un-escaped backslashes,
|
||||||
/// so `content\\ """` doesn't need a space while `content\\\ """` does.
|
/// so `content\\ """` doesn't need a space while `content\\\ """` does.
|
||||||
fn needs_chaperone_space(normalized: &NormalizedString, trim_end: &str) -> bool {
|
fn needs_chaperone_space(normalized: &NormalizedString, trim_end: &str) -> bool {
|
||||||
trim_end.ends_with(normalized.quotes.quote_char.as_char())
|
trim_end.ends_with(normalized.quotes().quote_char.as_char())
|
||||||
|| trim_end.chars().rev().take_while(|c| *c == '\\').count() % 2 == 1
|
|| trim_end.chars().rev().take_while(|c| *c == '\\').count() % 2 == 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,27 +1,19 @@
|
||||||
use std::borrow::Cow;
|
|
||||||
use std::iter::FusedIterator;
|
|
||||||
|
|
||||||
use bitflags::bitflags;
|
use bitflags::bitflags;
|
||||||
use memchr::memchr2;
|
|
||||||
|
|
||||||
use ruff_formatter::{format_args, write};
|
pub(crate) use any::AnyString;
|
||||||
use ruff_python_ast::{
|
pub(crate) use normalize::{NormalizedString, StringNormalizer};
|
||||||
self as ast, Expr, ExprBytesLiteral, ExprFString, ExprStringLiteral, ExpressionRef,
|
use ruff_formatter::format_args;
|
||||||
};
|
|
||||||
use ruff_python_ast::{AnyNodeRef, StringLiteral};
|
|
||||||
use ruff_source_file::Locator;
|
use ruff_source_file::Locator;
|
||||||
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
use ruff_text_size::{TextLen, TextRange, TextSize};
|
||||||
|
|
||||||
use crate::comments::{leading_comments, trailing_comments};
|
use crate::comments::{leading_comments, trailing_comments};
|
||||||
use crate::expression::expr_f_string::f_string_quoting;
|
|
||||||
use crate::expression::parentheses::in_parentheses_only_soft_line_break_or_space;
|
use crate::expression::parentheses::in_parentheses_only_soft_line_break_or_space;
|
||||||
use crate::other::f_string::FormatFString;
|
|
||||||
use crate::other::string_literal::{FormatStringLiteral, StringLiteralKind};
|
|
||||||
use crate::prelude::*;
|
use crate::prelude::*;
|
||||||
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
|
|
||||||
use crate::QuoteStyle;
|
use crate::QuoteStyle;
|
||||||
|
|
||||||
|
mod any;
|
||||||
pub(crate) mod docstring;
|
pub(crate) mod docstring;
|
||||||
|
mod normalize;
|
||||||
|
|
||||||
#[derive(Copy, Clone, Debug, Default)]
|
#[derive(Copy, Clone, Debug, Default)]
|
||||||
pub(crate) enum Quoting {
|
pub(crate) enum Quoting {
|
||||||
|
@ -30,202 +22,6 @@ pub(crate) enum Quoting {
|
||||||
Preserve,
|
Preserve,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Represents any kind of string expression. This could be either a string,
|
|
||||||
/// bytes or f-string.
|
|
||||||
#[derive(Copy, Clone, Debug)]
|
|
||||||
pub(crate) enum AnyString<'a> {
|
|
||||||
String(&'a ExprStringLiteral),
|
|
||||||
Bytes(&'a ExprBytesLiteral),
|
|
||||||
FString(&'a ExprFString),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> AnyString<'a> {
|
|
||||||
/// Creates a new [`AnyString`] from the given [`Expr`].
|
|
||||||
///
|
|
||||||
/// Returns `None` if the expression is not either a string, bytes or f-string.
|
|
||||||
pub(crate) fn from_expression(expression: &'a Expr) -> Option<AnyString<'a>> {
|
|
||||||
match expression {
|
|
||||||
Expr::StringLiteral(string) => Some(AnyString::String(string)),
|
|
||||||
Expr::BytesLiteral(bytes) => Some(AnyString::Bytes(bytes)),
|
|
||||||
Expr::FString(fstring) => Some(AnyString::FString(fstring)),
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns `true` if the string is implicitly concatenated.
|
|
||||||
pub(crate) fn is_implicit_concatenated(self) -> bool {
|
|
||||||
match self {
|
|
||||||
Self::String(ExprStringLiteral { value, .. }) => value.is_implicit_concatenated(),
|
|
||||||
Self::Bytes(ExprBytesLiteral { value, .. }) => value.is_implicit_concatenated(),
|
|
||||||
Self::FString(ExprFString { value, .. }) => value.is_implicit_concatenated(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the quoting to be used for this string.
|
|
||||||
fn quoting(self, locator: &Locator<'_>) -> Quoting {
|
|
||||||
match self {
|
|
||||||
Self::String(_) | Self::Bytes(_) => Quoting::CanChange,
|
|
||||||
Self::FString(f_string) => f_string_quoting(f_string, locator),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns a vector of all the [`AnyStringPart`] of this string.
|
|
||||||
fn parts(self, quoting: Quoting) -> AnyStringPartsIter<'a> {
|
|
||||||
match self {
|
|
||||||
Self::String(ExprStringLiteral { value, .. }) => {
|
|
||||||
AnyStringPartsIter::String(value.iter())
|
|
||||||
}
|
|
||||||
Self::Bytes(ExprBytesLiteral { value, .. }) => AnyStringPartsIter::Bytes(value.iter()),
|
|
||||||
Self::FString(ExprFString { value, .. }) => {
|
|
||||||
AnyStringPartsIter::FString(value.iter(), quoting)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn is_multiline(self, source: &str) -> bool {
|
|
||||||
match self {
|
|
||||||
AnyString::String(_) | AnyString::Bytes(_) => {
|
|
||||||
let contents = &source[self.range()];
|
|
||||||
let prefix = StringPrefix::parse(contents);
|
|
||||||
let quotes = StringQuotes::parse(
|
|
||||||
&contents[TextRange::new(prefix.text_len(), contents.text_len())],
|
|
||||||
);
|
|
||||||
|
|
||||||
quotes.is_some_and(StringQuotes::is_triple)
|
|
||||||
&& memchr2(b'\n', b'\r', contents.as_bytes()).is_some()
|
|
||||||
}
|
|
||||||
AnyString::FString(fstring) => {
|
|
||||||
memchr2(b'\n', b'\r', source[fstring.range].as_bytes()).is_some()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Ranged for AnyString<'_> {
|
|
||||||
fn range(&self) -> TextRange {
|
|
||||||
match self {
|
|
||||||
Self::String(expr) => expr.range(),
|
|
||||||
Self::Bytes(expr) => expr.range(),
|
|
||||||
Self::FString(expr) => expr.range(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> From<&AnyString<'a>> for AnyNodeRef<'a> {
|
|
||||||
fn from(value: &AnyString<'a>) -> Self {
|
|
||||||
match value {
|
|
||||||
AnyString::String(expr) => AnyNodeRef::ExprStringLiteral(expr),
|
|
||||||
AnyString::Bytes(expr) => AnyNodeRef::ExprBytesLiteral(expr),
|
|
||||||
AnyString::FString(expr) => AnyNodeRef::ExprFString(expr),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> From<AnyString<'a>> for AnyNodeRef<'a> {
|
|
||||||
fn from(value: AnyString<'a>) -> Self {
|
|
||||||
AnyNodeRef::from(&value)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> From<&AnyString<'a>> for ExpressionRef<'a> {
|
|
||||||
fn from(value: &AnyString<'a>) -> Self {
|
|
||||||
match value {
|
|
||||||
AnyString::String(expr) => ExpressionRef::StringLiteral(expr),
|
|
||||||
AnyString::Bytes(expr) => ExpressionRef::BytesLiteral(expr),
|
|
||||||
AnyString::FString(expr) => ExpressionRef::FString(expr),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
enum AnyStringPartsIter<'a> {
|
|
||||||
String(std::slice::Iter<'a, StringLiteral>),
|
|
||||||
Bytes(std::slice::Iter<'a, ast::BytesLiteral>),
|
|
||||||
FString(std::slice::Iter<'a, ast::FStringPart>, Quoting),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> Iterator for AnyStringPartsIter<'a> {
|
|
||||||
type Item = AnyStringPart<'a>;
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
let part = match self {
|
|
||||||
Self::String(inner) => {
|
|
||||||
let part = inner.next()?;
|
|
||||||
AnyStringPart::String {
|
|
||||||
part,
|
|
||||||
layout: StringLiteralKind::String,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Self::Bytes(inner) => AnyStringPart::Bytes(inner.next()?),
|
|
||||||
Self::FString(inner, quoting) => {
|
|
||||||
let part = inner.next()?;
|
|
||||||
match part {
|
|
||||||
ast::FStringPart::Literal(string_literal) => AnyStringPart::String {
|
|
||||||
part: string_literal,
|
|
||||||
layout: StringLiteralKind::InImplicitlyConcatenatedFString(*quoting),
|
|
||||||
},
|
|
||||||
ast::FStringPart::FString(f_string) => AnyStringPart::FString {
|
|
||||||
part: f_string,
|
|
||||||
quoting: *quoting,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
Some(part)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FusedIterator for AnyStringPartsIter<'_> {}
|
|
||||||
|
|
||||||
/// Represents any kind of string which is part of an implicitly concatenated
|
|
||||||
/// string. This could be either a string, bytes or f-string.
|
|
||||||
///
|
|
||||||
/// This is constructed from the [`AnyString::parts`] method on [`AnyString`].
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
enum AnyStringPart<'a> {
|
|
||||||
String {
|
|
||||||
part: &'a ast::StringLiteral,
|
|
||||||
layout: StringLiteralKind,
|
|
||||||
},
|
|
||||||
Bytes(&'a ast::BytesLiteral),
|
|
||||||
FString {
|
|
||||||
part: &'a ast::FString,
|
|
||||||
quoting: Quoting,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> From<&AnyStringPart<'a>> for AnyNodeRef<'a> {
|
|
||||||
fn from(value: &AnyStringPart<'a>) -> Self {
|
|
||||||
match value {
|
|
||||||
AnyStringPart::String { part, .. } => AnyNodeRef::StringLiteral(part),
|
|
||||||
AnyStringPart::Bytes(part) => AnyNodeRef::BytesLiteral(part),
|
|
||||||
AnyStringPart::FString { part, .. } => AnyNodeRef::FString(part),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Ranged for AnyStringPart<'_> {
|
|
||||||
fn range(&self) -> TextRange {
|
|
||||||
match self {
|
|
||||||
Self::String { part, .. } => part.range(),
|
|
||||||
Self::Bytes(part) => part.range(),
|
|
||||||
Self::FString { part, .. } => part.range(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Format<PyFormatContext<'_>> for AnyStringPart<'_> {
|
|
||||||
fn fmt(&self, f: &mut PyFormatter) -> FormatResult<()> {
|
|
||||||
match self {
|
|
||||||
AnyStringPart::String { part, layout } => {
|
|
||||||
FormatStringLiteral::new(part, *layout).fmt(f)
|
|
||||||
}
|
|
||||||
AnyStringPart::Bytes(bytes_literal) => bytes_literal.format().fmt(f),
|
|
||||||
AnyStringPart::FString { part, quoting } => FormatFString::new(part, *quoting).fmt(f),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Formats any implicitly concatenated string. This could be any valid combination
|
/// Formats any implicitly concatenated string. This could be any valid combination
|
||||||
/// of string, bytes or f-string literals.
|
/// of string, bytes or f-string literals.
|
||||||
pub(crate) struct FormatStringContinuation<'a> {
|
pub(crate) struct FormatStringContinuation<'a> {
|
||||||
|
@ -308,167 +104,6 @@ impl StringPart {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) struct StringNormalizer {
|
|
||||||
quoting: Quoting,
|
|
||||||
preferred_quote_style: QuoteStyle,
|
|
||||||
parent_docstring_quote_char: Option<QuoteChar>,
|
|
||||||
normalize_hex: bool,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl StringNormalizer {
|
|
||||||
pub(crate) fn from_context(context: &PyFormatContext<'_>) -> Self {
|
|
||||||
Self {
|
|
||||||
quoting: Quoting::default(),
|
|
||||||
preferred_quote_style: QuoteStyle::default(),
|
|
||||||
parent_docstring_quote_char: context.docstring(),
|
|
||||||
normalize_hex: is_hex_codes_in_unicode_sequences_enabled(context),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn with_preferred_quote_style(mut self, quote_style: QuoteStyle) -> Self {
|
|
||||||
self.preferred_quote_style = quote_style;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn with_quoting(mut self, quoting: Quoting) -> Self {
|
|
||||||
self.quoting = quoting;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Computes the strings preferred quotes.
|
|
||||||
pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> StringQuotes {
|
|
||||||
// Per PEP 8, always prefer double quotes for triple-quoted strings.
|
|
||||||
// Except when using quote-style-preserve.
|
|
||||||
let preferred_style = if string.quotes().triple {
|
|
||||||
// ... unless we're formatting a code snippet inside a docstring,
|
|
||||||
// then we specifically want to invert our quote style to avoid
|
|
||||||
// writing out invalid Python.
|
|
||||||
//
|
|
||||||
// It's worth pointing out that we can actually wind up being
|
|
||||||
// somewhat out of sync with PEP8 in this case. Consider this
|
|
||||||
// example:
|
|
||||||
//
|
|
||||||
// def foo():
|
|
||||||
// '''
|
|
||||||
// Something.
|
|
||||||
//
|
|
||||||
// >>> """tricksy"""
|
|
||||||
// '''
|
|
||||||
// pass
|
|
||||||
//
|
|
||||||
// Ideally, this would be reformatted as:
|
|
||||||
//
|
|
||||||
// def foo():
|
|
||||||
// """
|
|
||||||
// Something.
|
|
||||||
//
|
|
||||||
// >>> '''tricksy'''
|
|
||||||
// """
|
|
||||||
// pass
|
|
||||||
//
|
|
||||||
// But the logic here results in the original quoting being
|
|
||||||
// preserved. This is because the quoting style of the outer
|
|
||||||
// docstring is determined, in part, by looking at its contents. In
|
|
||||||
// this case, it notices that it contains a `"""` and thus infers
|
|
||||||
// that using `'''` would overall read better because it avoids
|
|
||||||
// the need to escape the interior `"""`. Except... in this case,
|
|
||||||
// the `"""` is actually part of a code snippet that could get
|
|
||||||
// reformatted to using a different quoting style itself.
|
|
||||||
//
|
|
||||||
// Fixing this would, I believe, require some fairly seismic
|
|
||||||
// changes to how formatting strings works. Namely, we would need
|
|
||||||
// to look for code snippets before normalizing the docstring, and
|
|
||||||
// then figure out the quoting style more holistically by looking
|
|
||||||
// at the various kinds of quotes used in the code snippets and
|
|
||||||
// what reformatting them might look like.
|
|
||||||
//
|
|
||||||
// Overall this is a bit of a corner case and just inverting the
|
|
||||||
// style from what the parent ultimately decided upon works, even
|
|
||||||
// if it doesn't have perfect alignment with PEP8.
|
|
||||||
if let Some(quote) = self.parent_docstring_quote_char {
|
|
||||||
QuoteStyle::from(quote.invert())
|
|
||||||
} else if self.preferred_quote_style.is_preserve() {
|
|
||||||
QuoteStyle::Preserve
|
|
||||||
} else {
|
|
||||||
QuoteStyle::Double
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
self.preferred_quote_style
|
|
||||||
};
|
|
||||||
|
|
||||||
match self.quoting {
|
|
||||||
Quoting::Preserve => string.quotes(),
|
|
||||||
Quoting::CanChange => {
|
|
||||||
if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) {
|
|
||||||
let raw_content = locator.slice(string.content_range());
|
|
||||||
if string.prefix().is_raw_string() {
|
|
||||||
choose_quotes_for_raw_string(raw_content, string.quotes(), preferred_quote)
|
|
||||||
} else {
|
|
||||||
choose_quotes_impl(raw_content, string.quotes(), preferred_quote)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
string.quotes()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Computes the strings preferred quotes and normalizes its content.
|
|
||||||
pub(crate) fn normalize<'a>(
|
|
||||||
&self,
|
|
||||||
string: &StringPart,
|
|
||||||
locator: &'a Locator,
|
|
||||||
) -> NormalizedString<'a> {
|
|
||||||
let raw_content = locator.slice(string.content_range());
|
|
||||||
|
|
||||||
let quotes = self.choose_quotes(string, locator);
|
|
||||||
|
|
||||||
let normalized = normalize_string(raw_content, quotes, string.prefix(), self.normalize_hex);
|
|
||||||
|
|
||||||
NormalizedString {
|
|
||||||
prefix: string.prefix(),
|
|
||||||
content_range: string.content_range(),
|
|
||||||
text: normalized,
|
|
||||||
quotes,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub(crate) struct NormalizedString<'a> {
|
|
||||||
prefix: StringPrefix,
|
|
||||||
|
|
||||||
/// The quotes of the normalized string (preferred quotes)
|
|
||||||
quotes: StringQuotes,
|
|
||||||
|
|
||||||
/// The range of the string's content in the source (minus prefix and quotes).
|
|
||||||
content_range: TextRange,
|
|
||||||
|
|
||||||
/// The normalized text
|
|
||||||
text: Cow<'a, str>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Ranged for NormalizedString<'_> {
|
|
||||||
fn range(&self) -> TextRange {
|
|
||||||
self.content_range
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Format<PyFormatContext<'_>> for NormalizedString<'_> {
|
|
||||||
fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
|
|
||||||
write!(f, [self.prefix, self.quotes])?;
|
|
||||||
match &self.text {
|
|
||||||
Cow::Borrowed(_) => {
|
|
||||||
source_text_slice(self.range()).fmt(f)?;
|
|
||||||
}
|
|
||||||
Cow::Owned(normalized) => {
|
|
||||||
text(normalized).fmt(f)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self.quotes.fmt(f)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
bitflags! {
|
bitflags! {
|
||||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||||
pub(crate) struct StringPrefix: u8 {
|
pub(crate) struct StringPrefix: u8 {
|
||||||
|
@ -549,175 +184,6 @@ impl Format<PyFormatContext<'_>> for StringPrefix {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Choose the appropriate quote style for a raw string.
|
|
||||||
///
|
|
||||||
/// The preferred quote style is chosen unless the string contains unescaped quotes of the
|
|
||||||
/// preferred style. For example, `r"foo"` is chosen over `r'foo'` if the preferred quote
|
|
||||||
/// style is double quotes.
|
|
||||||
fn choose_quotes_for_raw_string(
|
|
||||||
input: &str,
|
|
||||||
quotes: StringQuotes,
|
|
||||||
preferred_quote: QuoteChar,
|
|
||||||
) -> StringQuotes {
|
|
||||||
let preferred_quote_char = preferred_quote.as_char();
|
|
||||||
let mut chars = input.chars().peekable();
|
|
||||||
let contains_unescaped_configured_quotes = loop {
|
|
||||||
match chars.next() {
|
|
||||||
Some('\\') => {
|
|
||||||
// Ignore escaped characters
|
|
||||||
chars.next();
|
|
||||||
}
|
|
||||||
// `"` or `'`
|
|
||||||
Some(c) if c == preferred_quote_char => {
|
|
||||||
if !quotes.triple {
|
|
||||||
break true;
|
|
||||||
}
|
|
||||||
|
|
||||||
match chars.peek() {
|
|
||||||
// We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser
|
|
||||||
// about where the closing triple quotes start
|
|
||||||
None => break true,
|
|
||||||
Some(next) if *next == preferred_quote_char => {
|
|
||||||
// `""` or `''`
|
|
||||||
chars.next();
|
|
||||||
|
|
||||||
// We can't turn `r'''""'''` into `r""""""""`, nor can we have
|
|
||||||
// `"""` or `'''` respectively inside the string
|
|
||||||
if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) {
|
|
||||||
break true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => {}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some(_) => continue,
|
|
||||||
None => break false,
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
StringQuotes {
|
|
||||||
triple: quotes.triple,
|
|
||||||
quote_char: if contains_unescaped_configured_quotes {
|
|
||||||
quotes.quote_char
|
|
||||||
} else {
|
|
||||||
preferred_quote
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Choose the appropriate quote style for a string.
|
|
||||||
///
|
|
||||||
/// For single quoted strings, the preferred quote style is used, unless the alternative quote style
|
|
||||||
/// would require fewer escapes.
|
|
||||||
///
|
|
||||||
/// For triple quoted strings, the preferred quote style is always used, unless the string contains
|
|
||||||
/// a triplet of the quote character (e.g., if double quotes are preferred, double quotes will be
|
|
||||||
/// used unless the string contains `"""`).
|
|
||||||
fn choose_quotes_impl(
|
|
||||||
input: &str,
|
|
||||||
quotes: StringQuotes,
|
|
||||||
preferred_quote: QuoteChar,
|
|
||||||
) -> StringQuotes {
|
|
||||||
let quote = if quotes.triple {
|
|
||||||
// True if the string contains a triple quote sequence of the configured quote style.
|
|
||||||
let mut uses_triple_quotes = false;
|
|
||||||
let mut chars = input.chars().peekable();
|
|
||||||
|
|
||||||
while let Some(c) = chars.next() {
|
|
||||||
let preferred_quote_char = preferred_quote.as_char();
|
|
||||||
match c {
|
|
||||||
'\\' => {
|
|
||||||
if matches!(chars.peek(), Some('"' | '\\')) {
|
|
||||||
chars.next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// `"` or `'`
|
|
||||||
c if c == preferred_quote_char => {
|
|
||||||
match chars.peek().copied() {
|
|
||||||
Some(c) if c == preferred_quote_char => {
|
|
||||||
// `""` or `''`
|
|
||||||
chars.next();
|
|
||||||
|
|
||||||
match chars.peek().copied() {
|
|
||||||
Some(c) if c == preferred_quote_char => {
|
|
||||||
// `"""` or `'''`
|
|
||||||
chars.next();
|
|
||||||
uses_triple_quotes = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
Some(_) => {}
|
|
||||||
None => {
|
|
||||||
// Handle `''' ""'''`. At this point we have consumed both
|
|
||||||
// double quotes, so on the next iteration the iterator is empty
|
|
||||||
// and we'd miss the string ending with a preferred quote
|
|
||||||
uses_triple_quotes = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some(_) => {
|
|
||||||
// A single quote char, this is ok
|
|
||||||
}
|
|
||||||
None => {
|
|
||||||
// Trailing quote at the end of the comment
|
|
||||||
uses_triple_quotes = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => continue,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if uses_triple_quotes {
|
|
||||||
// String contains a triple quote sequence of the configured quote style.
|
|
||||||
// Keep the existing quote style.
|
|
||||||
quotes.quote_char
|
|
||||||
} else {
|
|
||||||
preferred_quote
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
let mut single_quotes = 0u32;
|
|
||||||
let mut double_quotes = 0u32;
|
|
||||||
|
|
||||||
for c in input.chars() {
|
|
||||||
match c {
|
|
||||||
'\'' => {
|
|
||||||
single_quotes += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
'"' => {
|
|
||||||
double_quotes += 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
_ => continue,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
match preferred_quote {
|
|
||||||
QuoteChar::Single => {
|
|
||||||
if single_quotes > double_quotes {
|
|
||||||
QuoteChar::Double
|
|
||||||
} else {
|
|
||||||
QuoteChar::Single
|
|
||||||
}
|
|
||||||
}
|
|
||||||
QuoteChar::Double => {
|
|
||||||
if double_quotes > single_quotes {
|
|
||||||
QuoteChar::Single
|
|
||||||
} else {
|
|
||||||
QuoteChar::Double
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
StringQuotes {
|
|
||||||
triple: quotes.triple,
|
|
||||||
quote_char: quote,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Copy, Clone, Debug)]
|
#[derive(Copy, Clone, Debug)]
|
||||||
pub(crate) struct StringQuotes {
|
pub(crate) struct StringQuotes {
|
||||||
triple: bool,
|
triple: bool,
|
||||||
|
@ -821,269 +287,3 @@ impl TryFrom<char> for QuoteChar {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Adds the necessary quote escapes and removes unnecessary escape sequences when quoting `input`
|
|
||||||
/// with the provided [`StringQuotes`] style.
|
|
||||||
///
|
|
||||||
/// Returns the normalized string and whether it contains new lines.
|
|
||||||
pub(crate) fn normalize_string(
|
|
||||||
input: &str,
|
|
||||||
quotes: StringQuotes,
|
|
||||||
prefix: StringPrefix,
|
|
||||||
normalize_hex: bool,
|
|
||||||
) -> Cow<str> {
|
|
||||||
// The normalized string if `input` is not yet normalized.
|
|
||||||
// `output` must remain empty if `input` is already normalized.
|
|
||||||
let mut output = String::new();
|
|
||||||
// Tracks the last index of `input` that has been written to `output`.
|
|
||||||
// If `last_index` is `0` at the end, then the input is already normalized and can be returned as is.
|
|
||||||
let mut last_index = 0;
|
|
||||||
|
|
||||||
let quote = quotes.quote_char;
|
|
||||||
let preferred_quote = quote.as_char();
|
|
||||||
let opposite_quote = quote.invert().as_char();
|
|
||||||
|
|
||||||
let mut chars = input.char_indices().peekable();
|
|
||||||
|
|
||||||
let is_raw = prefix.is_raw_string();
|
|
||||||
let is_fstring = prefix.is_fstring();
|
|
||||||
let mut formatted_value_nesting = 0u32;
|
|
||||||
|
|
||||||
while let Some((index, c)) = chars.next() {
|
|
||||||
if is_fstring && matches!(c, '{' | '}') {
|
|
||||||
if chars.peek().copied().is_some_and(|(_, next)| next == c) {
|
|
||||||
// Skip over the second character of the double braces
|
|
||||||
chars.next();
|
|
||||||
} else if c == '{' {
|
|
||||||
formatted_value_nesting += 1;
|
|
||||||
} else {
|
|
||||||
// Safe to assume that `c == '}'` here because of the matched pattern above
|
|
||||||
formatted_value_nesting = formatted_value_nesting.saturating_sub(1);
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if c == '\r' {
|
|
||||||
output.push_str(&input[last_index..index]);
|
|
||||||
|
|
||||||
// Skip over the '\r' character, keep the `\n`
|
|
||||||
if chars.peek().copied().is_some_and(|(_, next)| next == '\n') {
|
|
||||||
chars.next();
|
|
||||||
}
|
|
||||||
// Replace the `\r` with a `\n`
|
|
||||||
else {
|
|
||||||
output.push('\n');
|
|
||||||
}
|
|
||||||
|
|
||||||
last_index = index + '\r'.len_utf8();
|
|
||||||
} else if !is_raw {
|
|
||||||
if c == '\\' {
|
|
||||||
if let Some((_, next)) = chars.clone().next() {
|
|
||||||
if next == '\\' {
|
|
||||||
// Skip over escaped backslashes
|
|
||||||
chars.next();
|
|
||||||
} else if normalize_hex {
|
|
||||||
if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
|
|
||||||
.and_then(|escape| {
|
|
||||||
escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..])
|
|
||||||
})
|
|
||||||
{
|
|
||||||
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
|
|
||||||
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
|
|
||||||
let escape_start_offset = index + escape_start_len;
|
|
||||||
if let Cow::Owned(normalised) = &normalised {
|
|
||||||
output.push_str(&input[last_index..escape_start_offset]);
|
|
||||||
output.push_str(normalised);
|
|
||||||
last_index = escape_start_offset + normalised.len();
|
|
||||||
};
|
|
||||||
|
|
||||||
// Move the `chars` iterator passed the escape sequence.
|
|
||||||
// Simply reassigning `chars` doesn't work because the indices` would
|
|
||||||
// then be off.
|
|
||||||
for _ in 0..next.len_utf8() + normalised.len() {
|
|
||||||
chars.next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !quotes.triple {
|
|
||||||
#[allow(clippy::if_same_then_else)]
|
|
||||||
if next == opposite_quote && formatted_value_nesting == 0 {
|
|
||||||
// Remove the escape by ending before the backslash and starting again with the quote
|
|
||||||
chars.next();
|
|
||||||
output.push_str(&input[last_index..index]);
|
|
||||||
last_index = index + '\\'.len_utf8();
|
|
||||||
} else if next == preferred_quote {
|
|
||||||
// Quote is already escaped, skip over it.
|
|
||||||
chars.next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 {
|
|
||||||
// Escape the quote
|
|
||||||
output.push_str(&input[last_index..index]);
|
|
||||||
output.push('\\');
|
|
||||||
output.push(c);
|
|
||||||
last_index = index + preferred_quote.len_utf8();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let normalized = if last_index == 0 {
|
|
||||||
Cow::Borrowed(input)
|
|
||||||
} else {
|
|
||||||
output.push_str(&input[last_index..]);
|
|
||||||
Cow::Owned(output)
|
|
||||||
};
|
|
||||||
|
|
||||||
normalized
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The kinds of escape sequences whose spelling gets normalized.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
    /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
    /// The payload is the number of hex digits.
    Hex(usize),

    /// An escaped unicode name (`\N{name}`)
    CharacterName,
}

impl UnicodeEscape {
    /// Classifies the character following a backslash.
    ///
    /// `x` is always recognised; `u`, `U` and `N` only if `allow_unicode` is `true`.
    /// Every other character yields `None`.
    fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
        match first {
            'x' => Some(UnicodeEscape::Hex(2)),
            'u' if allow_unicode => Some(UnicodeEscape::Hex(4)),
            'U' if allow_unicode => Some(UnicodeEscape::Hex(8)),
            'N' if allow_unicode => Some(UnicodeEscape::CharacterName),
            _ => None,
        }
    }

    /// Normalises the part of an escape sequence that follows `\x`, `\u`, `\U` or `\N`.
    ///
    /// * `\u`, `\U` and `\x`: the hex digits `A-F` are lower-cased.
    /// * `\N`: the letters of the name are upper-cased.
    ///
    /// `input` must start right after the escape kind character. The returned text covers
    /// exactly the escape's payload (the hex digits, or `{name}` including both braces). It is
    /// borrowed from `input` if the payload is already normalized. Returns `None` if `input`
    /// does not start with a well-formed payload.
    fn normalize(self, input: &str) -> Option<Cow<str>> {
        match self {
            UnicodeEscape::Hex(len) => {
                // Fails when fewer than `len` bytes are left or when the cut would split a
                // multi-byte character, neither of which can be a run of `len` hex digits.
                let digits = input.get(..len)?;

                if !digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
                    // not a valid escape sequence
                    return None;
                }

                Some(if digits.bytes().any(|byte| byte.is_ascii_uppercase()) {
                    Cow::Owned(digits.to_ascii_lowercase())
                } else {
                    Cow::Borrowed(digits)
                })
            }
            UnicodeEscape::CharacterName => {
                let after_brace = input.strip_prefix('{')?;
                // No closing brace: unterminated escape sequence, don't normalise it.
                let name_len = after_brace.find('}')?;
                let name = &after_brace[..name_len];

                // Name must be at least two characters long.
                if name_len < 2 {
                    return None;
                }

                let is_name_char = |byte: u8| {
                    matches!(byte, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b' ' | b'-')
                };
                if !name.bytes().all(is_name_char) {
                    // Seems like an invalid escape sequence, don't normalise it.
                    return None;
                }

                // `{` + name + `}`
                let escape = &input[..'{'.len_utf8() + name_len + '}'.len_utf8()];

                Some(if name.bytes().any(|byte| byte.is_ascii_lowercase()) {
                    Cow::Owned(escape.to_ascii_uppercase())
                } else {
                    Cow::Borrowed(escape)
                })
            }
        }
    }
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use crate::string::{normalize_string, QuoteChar, StringPrefix, StringQuotes, UnicodeEscape};
    use std::borrow::Cow;

    /// A `\U` escape spans eight hex digits; upper-case digits must come back lower-cased.
    #[test]
    fn normalize_32_escape() {
        let escape = UnicodeEscape::new('U', true).unwrap();
        let expected = Some(Cow::Owned("0001f60e".to_string()));

        assert_eq!(expected, escape.normalize("0001F60E"));
    }

    /// `\x` escapes are also normalized inside byte strings when hex normalization is enabled.
    #[test]
    fn normalize_hex_in_byte_string() {
        let quotes = StringQuotes {
            triple: false,
            quote_char: QuoteChar::Double,
        };
        let source = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A";

        let normalized = normalize_string(source, quotes, StringPrefix::BYTE, true);

        assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized);
    }
}
|
|
||||||
|
|
622
crates/ruff_python_formatter/src/string/normalize.rs
Normal file
622
crates/ruff_python_formatter/src/string/normalize.rs
Normal file
|
@ -0,0 +1,622 @@
|
||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
use ruff_source_file::Locator;
|
||||||
|
use ruff_text_size::{Ranged, TextRange};
|
||||||
|
|
||||||
|
use crate::prelude::*;
|
||||||
|
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
|
||||||
|
use crate::string::{QuoteChar, Quoting, StringPart, StringPrefix, StringQuotes};
|
||||||
|
use crate::QuoteStyle;
|
||||||
|
|
||||||
|
pub(crate) struct StringNormalizer {
|
||||||
|
quoting: Quoting,
|
||||||
|
preferred_quote_style: QuoteStyle,
|
||||||
|
parent_docstring_quote_char: Option<QuoteChar>,
|
||||||
|
normalize_hex: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StringNormalizer {
|
||||||
|
pub(crate) fn from_context(context: &PyFormatContext<'_>) -> Self {
|
||||||
|
Self {
|
||||||
|
quoting: Quoting::default(),
|
||||||
|
preferred_quote_style: QuoteStyle::default(),
|
||||||
|
parent_docstring_quote_char: context.docstring(),
|
||||||
|
normalize_hex: is_hex_codes_in_unicode_sequences_enabled(context),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn with_preferred_quote_style(mut self, quote_style: QuoteStyle) -> Self {
|
||||||
|
self.preferred_quote_style = quote_style;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn with_quoting(mut self, quoting: Quoting) -> Self {
|
||||||
|
self.quoting = quoting;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Computes the strings preferred quotes.
|
||||||
|
pub(crate) fn choose_quotes(&self, string: &StringPart, locator: &Locator) -> StringQuotes {
|
||||||
|
// Per PEP 8, always prefer double quotes for triple-quoted strings.
|
||||||
|
// Except when using quote-style-preserve.
|
||||||
|
let preferred_style = if string.quotes().triple {
|
||||||
|
// ... unless we're formatting a code snippet inside a docstring,
|
||||||
|
// then we specifically want to invert our quote style to avoid
|
||||||
|
// writing out invalid Python.
|
||||||
|
//
|
||||||
|
// It's worth pointing out that we can actually wind up being
|
||||||
|
// somewhat out of sync with PEP8 in this case. Consider this
|
||||||
|
// example:
|
||||||
|
//
|
||||||
|
// def foo():
|
||||||
|
// '''
|
||||||
|
// Something.
|
||||||
|
//
|
||||||
|
// >>> """tricksy"""
|
||||||
|
// '''
|
||||||
|
// pass
|
||||||
|
//
|
||||||
|
// Ideally, this would be reformatted as:
|
||||||
|
//
|
||||||
|
// def foo():
|
||||||
|
// """
|
||||||
|
// Something.
|
||||||
|
//
|
||||||
|
// >>> '''tricksy'''
|
||||||
|
// """
|
||||||
|
// pass
|
||||||
|
//
|
||||||
|
// But the logic here results in the original quoting being
|
||||||
|
// preserved. This is because the quoting style of the outer
|
||||||
|
// docstring is determined, in part, by looking at its contents. In
|
||||||
|
// this case, it notices that it contains a `"""` and thus infers
|
||||||
|
// that using `'''` would overall read better because it avoids
|
||||||
|
// the need to escape the interior `"""`. Except... in this case,
|
||||||
|
// the `"""` is actually part of a code snippet that could get
|
||||||
|
// reformatted to using a different quoting style itself.
|
||||||
|
//
|
||||||
|
// Fixing this would, I believe, require some fairly seismic
|
||||||
|
// changes to how formatting strings works. Namely, we would need
|
||||||
|
// to look for code snippets before normalizing the docstring, and
|
||||||
|
// then figure out the quoting style more holistically by looking
|
||||||
|
// at the various kinds of quotes used in the code snippets and
|
||||||
|
// what reformatting them might look like.
|
||||||
|
//
|
||||||
|
// Overall this is a bit of a corner case and just inverting the
|
||||||
|
// style from what the parent ultimately decided upon works, even
|
||||||
|
// if it doesn't have perfect alignment with PEP8.
|
||||||
|
if let Some(quote) = self.parent_docstring_quote_char {
|
||||||
|
QuoteStyle::from(quote.invert())
|
||||||
|
} else if self.preferred_quote_style.is_preserve() {
|
||||||
|
QuoteStyle::Preserve
|
||||||
|
} else {
|
||||||
|
QuoteStyle::Double
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
self.preferred_quote_style
|
||||||
|
};
|
||||||
|
|
||||||
|
match self.quoting {
|
||||||
|
Quoting::Preserve => string.quotes(),
|
||||||
|
Quoting::CanChange => {
|
||||||
|
if let Some(preferred_quote) = QuoteChar::from_style(preferred_style) {
|
||||||
|
let raw_content = locator.slice(string.content_range());
|
||||||
|
if string.prefix().is_raw_string() {
|
||||||
|
choose_quotes_for_raw_string(raw_content, string.quotes(), preferred_quote)
|
||||||
|
} else {
|
||||||
|
choose_quotes_impl(raw_content, string.quotes(), preferred_quote)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
string.quotes()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Computes the strings preferred quotes and normalizes its content.
|
||||||
|
pub(crate) fn normalize<'a>(
|
||||||
|
&self,
|
||||||
|
string: &StringPart,
|
||||||
|
locator: &'a Locator,
|
||||||
|
) -> NormalizedString<'a> {
|
||||||
|
let raw_content = locator.slice(string.content_range());
|
||||||
|
|
||||||
|
let quotes = self.choose_quotes(string, locator);
|
||||||
|
|
||||||
|
let normalized = normalize_string(raw_content, quotes, string.prefix(), self.normalize_hex);
|
||||||
|
|
||||||
|
NormalizedString {
|
||||||
|
prefix: string.prefix(),
|
||||||
|
content_range: string.content_range(),
|
||||||
|
text: normalized,
|
||||||
|
quotes,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub(crate) struct NormalizedString<'a> {
|
||||||
|
prefix: crate::string::StringPrefix,
|
||||||
|
|
||||||
|
/// The quotes of the normalized string (preferred quotes)
|
||||||
|
quotes: StringQuotes,
|
||||||
|
|
||||||
|
/// The range of the string's content in the source (minus prefix and quotes).
|
||||||
|
content_range: TextRange,
|
||||||
|
|
||||||
|
/// The normalized text
|
||||||
|
text: Cow<'a, str>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> NormalizedString<'a> {
|
||||||
|
pub(crate) fn text(&self) -> &Cow<'a, str> {
|
||||||
|
&self.text
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn quotes(&self) -> StringQuotes {
|
||||||
|
self.quotes
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn prefix(&self) -> StringPrefix {
|
||||||
|
self.prefix
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Ranged for NormalizedString<'_> {
|
||||||
|
fn range(&self) -> TextRange {
|
||||||
|
self.content_range
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Format<PyFormatContext<'_>> for NormalizedString<'_> {
|
||||||
|
fn fmt(&self, f: &mut Formatter<PyFormatContext<'_>>) -> FormatResult<()> {
|
||||||
|
ruff_formatter::write!(f, [self.prefix, self.quotes])?;
|
||||||
|
match &self.text {
|
||||||
|
Cow::Borrowed(_) => {
|
||||||
|
source_text_slice(self.range()).fmt(f)?;
|
||||||
|
}
|
||||||
|
Cow::Owned(normalized) => {
|
||||||
|
text(normalized).fmt(f)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.quotes.fmt(f)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Choose the appropriate quote style for a raw string.
|
||||||
|
///
|
||||||
|
/// The preferred quote style is chosen unless the string contains unescaped quotes of the
|
||||||
|
/// preferred style. For example, `r"foo"` is chosen over `r'foo'` if the preferred quote
|
||||||
|
/// style is double quotes.
|
||||||
|
fn choose_quotes_for_raw_string(
|
||||||
|
input: &str,
|
||||||
|
quotes: StringQuotes,
|
||||||
|
preferred_quote: QuoteChar,
|
||||||
|
) -> StringQuotes {
|
||||||
|
let preferred_quote_char = preferred_quote.as_char();
|
||||||
|
let mut chars = input.chars().peekable();
|
||||||
|
let contains_unescaped_configured_quotes = loop {
|
||||||
|
match chars.next() {
|
||||||
|
Some('\\') => {
|
||||||
|
// Ignore escaped characters
|
||||||
|
chars.next();
|
||||||
|
}
|
||||||
|
// `"` or `'`
|
||||||
|
Some(c) if c == preferred_quote_char => {
|
||||||
|
if !quotes.triple {
|
||||||
|
break true;
|
||||||
|
}
|
||||||
|
|
||||||
|
match chars.peek() {
|
||||||
|
// We can't turn `r'''\""'''` into `r"""\"""""`, this would confuse the parser
|
||||||
|
// about where the closing triple quotes start
|
||||||
|
None => break true,
|
||||||
|
Some(next) if *next == preferred_quote_char => {
|
||||||
|
// `""` or `''`
|
||||||
|
chars.next();
|
||||||
|
|
||||||
|
// We can't turn `r'''""'''` into `r""""""""`, nor can we have
|
||||||
|
// `"""` or `'''` respectively inside the string
|
||||||
|
if chars.peek().is_none() || chars.peek() == Some(&preferred_quote_char) {
|
||||||
|
break true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(_) => continue,
|
||||||
|
None => break false,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
StringQuotes {
|
||||||
|
triple: quotes.triple,
|
||||||
|
quote_char: if contains_unescaped_configured_quotes {
|
||||||
|
quotes.quote_char
|
||||||
|
} else {
|
||||||
|
preferred_quote
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Choose the appropriate quote style for a string.
|
||||||
|
///
|
||||||
|
/// For single quoted strings, the preferred quote style is used, unless the alternative quote style
|
||||||
|
/// would require fewer escapes.
|
||||||
|
///
|
||||||
|
/// For triple quoted strings, the preferred quote style is always used, unless the string contains
|
||||||
|
/// a triplet of the quote character (e.g., if double quotes are preferred, double quotes will be
|
||||||
|
/// used unless the string contains `"""`).
|
||||||
|
fn choose_quotes_impl(
|
||||||
|
input: &str,
|
||||||
|
quotes: StringQuotes,
|
||||||
|
preferred_quote: QuoteChar,
|
||||||
|
) -> StringQuotes {
|
||||||
|
let quote = if quotes.triple {
|
||||||
|
// True if the string contains a triple quote sequence of the configured quote style.
|
||||||
|
let mut uses_triple_quotes = false;
|
||||||
|
let mut chars = input.chars().peekable();
|
||||||
|
|
||||||
|
while let Some(c) = chars.next() {
|
||||||
|
let preferred_quote_char = preferred_quote.as_char();
|
||||||
|
match c {
|
||||||
|
'\\' => {
|
||||||
|
if matches!(chars.peek(), Some('"' | '\\')) {
|
||||||
|
chars.next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// `"` or `'`
|
||||||
|
c if c == preferred_quote_char => {
|
||||||
|
match chars.peek().copied() {
|
||||||
|
Some(c) if c == preferred_quote_char => {
|
||||||
|
// `""` or `''`
|
||||||
|
chars.next();
|
||||||
|
|
||||||
|
match chars.peek().copied() {
|
||||||
|
Some(c) if c == preferred_quote_char => {
|
||||||
|
// `"""` or `'''`
|
||||||
|
chars.next();
|
||||||
|
uses_triple_quotes = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Some(_) => {}
|
||||||
|
None => {
|
||||||
|
// Handle `''' ""'''`. At this point we have consumed both
|
||||||
|
// double quotes, so on the next iteration the iterator is empty
|
||||||
|
// and we'd miss the string ending with a preferred quote
|
||||||
|
uses_triple_quotes = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(_) => {
|
||||||
|
// A single quote char, this is ok
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// Trailing quote at the end of the comment
|
||||||
|
uses_triple_quotes = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if uses_triple_quotes {
|
||||||
|
// String contains a triple quote sequence of the configured quote style.
|
||||||
|
// Keep the existing quote style.
|
||||||
|
quotes.quote_char
|
||||||
|
} else {
|
||||||
|
preferred_quote
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let mut single_quotes = 0u32;
|
||||||
|
let mut double_quotes = 0u32;
|
||||||
|
|
||||||
|
for c in input.chars() {
|
||||||
|
match c {
|
||||||
|
'\'' => {
|
||||||
|
single_quotes += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
'"' => {
|
||||||
|
double_quotes += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
_ => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match preferred_quote {
|
||||||
|
QuoteChar::Single => {
|
||||||
|
if single_quotes > double_quotes {
|
||||||
|
QuoteChar::Double
|
||||||
|
} else {
|
||||||
|
QuoteChar::Single
|
||||||
|
}
|
||||||
|
}
|
||||||
|
QuoteChar::Double => {
|
||||||
|
if double_quotes > single_quotes {
|
||||||
|
QuoteChar::Single
|
||||||
|
} else {
|
||||||
|
QuoteChar::Double
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
StringQuotes {
|
||||||
|
triple: quotes.triple,
|
||||||
|
quote_char: quote,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds the necessary quote escapes and removes unnecessary escape sequences when quoting `input`
|
||||||
|
/// with the provided [`StringQuotes`] style.
|
||||||
|
///
|
||||||
|
/// Returns the normalized string and whether it contains new lines.
|
||||||
|
pub(crate) fn normalize_string(
|
||||||
|
input: &str,
|
||||||
|
quotes: StringQuotes,
|
||||||
|
prefix: StringPrefix,
|
||||||
|
normalize_hex: bool,
|
||||||
|
) -> Cow<str> {
|
||||||
|
// The normalized string if `input` is not yet normalized.
|
||||||
|
// `output` must remain empty if `input` is already normalized.
|
||||||
|
let mut output = String::new();
|
||||||
|
// Tracks the last index of `input` that has been written to `output`.
|
||||||
|
// If `last_index` is `0` at the end, then the input is already normalized and can be returned as is.
|
||||||
|
let mut last_index = 0;
|
||||||
|
|
||||||
|
let quote = quotes.quote_char;
|
||||||
|
let preferred_quote = quote.as_char();
|
||||||
|
let opposite_quote = quote.invert().as_char();
|
||||||
|
|
||||||
|
let mut chars = input.char_indices().peekable();
|
||||||
|
|
||||||
|
let is_raw = prefix.is_raw_string();
|
||||||
|
let is_fstring = prefix.is_fstring();
|
||||||
|
let mut formatted_value_nesting = 0u32;
|
||||||
|
|
||||||
|
while let Some((index, c)) = chars.next() {
|
||||||
|
if is_fstring && matches!(c, '{' | '}') {
|
||||||
|
if chars.peek().copied().is_some_and(|(_, next)| next == c) {
|
||||||
|
// Skip over the second character of the double braces
|
||||||
|
chars.next();
|
||||||
|
} else if c == '{' {
|
||||||
|
formatted_value_nesting += 1;
|
||||||
|
} else {
|
||||||
|
// Safe to assume that `c == '}'` here because of the matched pattern above
|
||||||
|
formatted_value_nesting = formatted_value_nesting.saturating_sub(1);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if c == '\r' {
|
||||||
|
output.push_str(&input[last_index..index]);
|
||||||
|
|
||||||
|
// Skip over the '\r' character, keep the `\n`
|
||||||
|
if chars.peek().copied().is_some_and(|(_, next)| next == '\n') {
|
||||||
|
chars.next();
|
||||||
|
}
|
||||||
|
// Replace the `\r` with a `\n`
|
||||||
|
else {
|
||||||
|
output.push('\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
last_index = index + '\r'.len_utf8();
|
||||||
|
} else if !is_raw {
|
||||||
|
if c == '\\' {
|
||||||
|
if let Some((_, next)) = chars.clone().next() {
|
||||||
|
if next == '\\' {
|
||||||
|
// Skip over escaped backslashes
|
||||||
|
chars.next();
|
||||||
|
} else if normalize_hex {
|
||||||
|
if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
|
||||||
|
.and_then(|escape| {
|
||||||
|
escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..])
|
||||||
|
})
|
||||||
|
{
|
||||||
|
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
|
||||||
|
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
|
||||||
|
let escape_start_offset = index + escape_start_len;
|
||||||
|
if let Cow::Owned(normalised) = &normalised {
|
||||||
|
output.push_str(&input[last_index..escape_start_offset]);
|
||||||
|
output.push_str(normalised);
|
||||||
|
last_index = escape_start_offset + normalised.len();
|
||||||
|
};
|
||||||
|
|
||||||
|
// Move the `chars` iterator passed the escape sequence.
|
||||||
|
// Simply reassigning `chars` doesn't work because the indices` would
|
||||||
|
// then be off.
|
||||||
|
for _ in 0..next.len_utf8() + normalised.len() {
|
||||||
|
chars.next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !quotes.triple {
|
||||||
|
#[allow(clippy::if_same_then_else)]
|
||||||
|
if next == opposite_quote && formatted_value_nesting == 0 {
|
||||||
|
// Remove the escape by ending before the backslash and starting again with the quote
|
||||||
|
chars.next();
|
||||||
|
output.push_str(&input[last_index..index]);
|
||||||
|
last_index = index + '\\'.len_utf8();
|
||||||
|
} else if next == preferred_quote {
|
||||||
|
// Quote is already escaped, skip over it.
|
||||||
|
chars.next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 {
|
||||||
|
// Escape the quote
|
||||||
|
output.push_str(&input[last_index..index]);
|
||||||
|
output.push('\\');
|
||||||
|
output.push(c);
|
||||||
|
last_index = index + preferred_quote.len_utf8();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let normalized = if last_index == 0 {
|
||||||
|
Cow::Borrowed(input)
|
||||||
|
} else {
|
||||||
|
output.push_str(&input[last_index..]);
|
||||||
|
Cow::Owned(output)
|
||||||
|
};
|
||||||
|
|
||||||
|
normalized
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The kinds of escape sequences whose spelling gets normalized.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
    /// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
    /// The payload is the number of hex digits.
    Hex(usize),

    /// An escaped unicode name (`\N{name}`)
    CharacterName,
}

impl UnicodeEscape {
    /// Classifies the character following a backslash.
    ///
    /// `x` is always recognised; `u`, `U` and `N` only if `allow_unicode` is `true`.
    /// Every other character yields `None`.
    fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
        match first {
            'x' => Some(UnicodeEscape::Hex(2)),
            'u' if allow_unicode => Some(UnicodeEscape::Hex(4)),
            'U' if allow_unicode => Some(UnicodeEscape::Hex(8)),
            'N' if allow_unicode => Some(UnicodeEscape::CharacterName),
            _ => None,
        }
    }

    /// Normalises the part of an escape sequence that follows `\x`, `\u`, `\U` or `\N`.
    ///
    /// * `\u`, `\U` and `\x`: the hex digits `A-F` are lower-cased.
    /// * `\N`: the letters of the name are upper-cased.
    ///
    /// `input` must start right after the escape kind character. The returned text covers
    /// exactly the escape's payload (the hex digits, or `{name}` including both braces). It is
    /// borrowed from `input` if the payload is already normalized. Returns `None` if `input`
    /// does not start with a well-formed payload.
    fn normalize(self, input: &str) -> Option<Cow<str>> {
        match self {
            UnicodeEscape::Hex(len) => {
                // Fails when fewer than `len` bytes are left or when the cut would split a
                // multi-byte character, neither of which can be a run of `len` hex digits.
                let digits = input.get(..len)?;

                if !digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
                    // not a valid escape sequence
                    return None;
                }

                Some(if digits.bytes().any(|byte| byte.is_ascii_uppercase()) {
                    Cow::Owned(digits.to_ascii_lowercase())
                } else {
                    Cow::Borrowed(digits)
                })
            }
            UnicodeEscape::CharacterName => {
                let after_brace = input.strip_prefix('{')?;
                // No closing brace: unterminated escape sequence, don't normalise it.
                let name_len = after_brace.find('}')?;
                let name = &after_brace[..name_len];

                // Name must be at least two characters long.
                if name_len < 2 {
                    return None;
                }

                let is_name_char = |byte: u8| {
                    matches!(byte, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b' ' | b'-')
                };
                if !name.bytes().all(is_name_char) {
                    // Seems like an invalid escape sequence, don't normalise it.
                    return None;
                }

                // `{` + name + `}`
                let escape = &input[..'{'.len_utf8() + name_len + '}'.len_utf8()];

                Some(if name.bytes().any(|byte| byte.is_ascii_lowercase()) {
                    Cow::Owned(escape.to_ascii_uppercase())
                } else {
                    Cow::Borrowed(escape)
                })
            }
        }
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use std::borrow::Cow;

    use crate::string::{QuoteChar, StringPrefix, StringQuotes};

    use super::{normalize_string, UnicodeEscape};

    /// Upper case hex digits of an eight digit `\U` escape are lowered.
    #[test]
    fn normalize_32_escape() {
        let escape_sequence = UnicodeEscape::new('U', true).unwrap();

        assert_eq!(
            Some(Cow::Owned("0001f60e".to_string())),
            escape_sequence.normalize("0001F60E")
        );
    }

    /// Letters inside a `\N{..}` escape are upper-cased; an already normalised name is
    /// returned borrowed and an unterminated name is rejected.
    #[test]
    fn normalize_character_name_escape() {
        let escape_sequence = UnicodeEscape::new('N', true).unwrap();

        assert_eq!(
            Some(Cow::Owned("{MONKEY}".to_string())),
            escape_sequence.normalize("{monkey}")
        );
        assert!(matches!(
            escape_sequence.normalize("{MONKEY} rest"),
            Some(Cow::Borrowed("{MONKEY}"))
        ));
        assert_eq!(None, escape_sequence.normalize("{monkey"));
    }

    /// `\x` escapes in a byte string get their hex digits lowered.
    #[test]
    fn normalize_hex_in_byte_string() {
        let input = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A";

        let normalized = normalize_string(
            input,
            StringQuotes {
                triple: false,
                quote_char: QuoteChar::Double,
            },
            StringPrefix::BYTE,
            true,
        );

        assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized);
    }
}
|
Loading…
Add table
Add a link
Reference in a new issue