mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-03 02:12:22 +00:00
Normalise Hex and unicode escape sequences in string (#9280)
This commit is contained in:
parent
c716acc7a6
commit
5d4825b60f
10 changed files with 263 additions and 113 deletions
|
@ -118,3 +118,5 @@ test_particular = [
|
|||
b'c'
|
||||
)
|
||||
}
|
||||
|
||||
b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
|
||||
|
|
|
@ -133,3 +133,8 @@ x = (b"""aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa""" b"""bbbbbbbbbbbbbbbbbbbbbbbbbbb
|
|||
|
||||
# https://github.com/astral-sh/ruff/issues/7460
|
||||
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
|
||||
|
||||
a = f"""\x1F"""
|
||||
a = """\x1F"""
|
||||
a = """\\x1F"""
|
||||
a = """\\\x1F"""
|
||||
|
|
|
@ -2,6 +2,7 @@ use ruff_python_ast::BytesLiteral;
|
|||
use ruff_text_size::Ranged;
|
||||
|
||||
use crate::prelude::*;
|
||||
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
|
||||
use crate::string::{Quoting, StringPart};
|
||||
|
||||
#[derive(Default)]
|
||||
|
@ -17,6 +18,7 @@ impl FormatNodeRule<BytesLiteral> for FormatBytesLiteral {
|
|||
&locator,
|
||||
f.options().quote_style(),
|
||||
f.context().docstring(),
|
||||
is_hex_codes_in_unicode_sequences_enabled(f.context()),
|
||||
)
|
||||
.fmt(f)
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ use ruff_python_ast::FString;
|
|||
use ruff_text_size::Ranged;
|
||||
|
||||
use crate::prelude::*;
|
||||
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
|
||||
use crate::string::{Quoting, StringPart};
|
||||
|
||||
/// Formats an f-string which is part of a larger f-string expression.
|
||||
|
@ -31,6 +32,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
|
|||
&locator,
|
||||
f.options().quote_style(),
|
||||
f.context().docstring(),
|
||||
is_hex_codes_in_unicode_sequences_enabled(f.context()),
|
||||
)
|
||||
.fmt(f);
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ use ruff_python_ast::StringLiteral;
|
|||
use ruff_text_size::Ranged;
|
||||
|
||||
use crate::prelude::*;
|
||||
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
|
||||
use crate::string::{docstring, Quoting, StringPart};
|
||||
use crate::QuoteStyle;
|
||||
|
||||
|
@ -61,6 +62,7 @@ impl Format<PyFormatContext<'_>> for FormatStringLiteral<'_> {
|
|||
&locator,
|
||||
quote_style,
|
||||
f.context().docstring(),
|
||||
is_hex_codes_in_unicode_sequences_enabled(f.context()),
|
||||
);
|
||||
|
||||
if self.layout.is_docstring() {
|
||||
|
|
|
@ -57,3 +57,8 @@ pub(crate) const fn is_module_docstring_newlines_enabled(context: &PyFormatConte
|
|||
pub(crate) const fn is_dummy_implementations_enabled(context: &PyFormatContext) -> bool {
|
||||
context.is_preview()
|
||||
}
|
||||
|
||||
/// Returns `true` if the [`hex_codes_in_unicode_sequences`](https://github.com/psf/black/pull/2916) preview style is enabled.
|
||||
pub(crate) const fn is_hex_codes_in_unicode_sequences_enabled(context: &PyFormatContext) -> bool {
|
||||
context.is_preview()
|
||||
}
|
||||
|
|
|
@ -253,6 +253,7 @@ impl StringPart {
|
|||
locator: &'a Locator,
|
||||
configured_style: QuoteStyle,
|
||||
parent_docstring_quote_char: Option<QuoteChar>,
|
||||
normalize_hex: bool,
|
||||
) -> NormalizedString<'a> {
|
||||
// Per PEP 8, always prefer double quotes for triple-quoted strings.
|
||||
let preferred_style = if self.quotes.triple {
|
||||
|
@ -310,7 +311,7 @@ impl StringPart {
|
|||
configured_style
|
||||
};
|
||||
|
||||
let raw_content = locator.slice(self.content_range);
|
||||
let raw_content = &locator.slice(self.content_range);
|
||||
|
||||
let quotes = match quoting {
|
||||
Quoting::Preserve => self.quotes,
|
||||
|
@ -327,7 +328,7 @@ impl StringPart {
|
|||
}
|
||||
};
|
||||
|
||||
let normalized = normalize_string(locator.slice(self.content_range), quotes, self.prefix);
|
||||
let normalized = normalize_string(raw_content, quotes, self.prefix, normalize_hex);
|
||||
|
||||
NormalizedString {
|
||||
prefix: self.prefix,
|
||||
|
@ -423,6 +424,10 @@ impl StringPrefix {
|
|||
pub(super) const fn is_fstring(self) -> bool {
|
||||
self.contains(StringPrefix::F_STRING)
|
||||
}
|
||||
|
||||
pub(super) const fn is_byte(self) -> bool {
|
||||
self.contains(StringPrefix::BYTE)
|
||||
}
|
||||
}
|
||||
|
||||
impl Format<PyFormatContext<'_>> for StringPrefix {
|
||||
|
@ -722,7 +727,12 @@ impl TryFrom<char> for QuoteChar {
|
|||
/// with the provided [`StringQuotes`] style.
|
||||
///
|
||||
/// Returns the normalized string.
|
||||
fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> Cow<str> {
|
||||
fn normalize_string(
|
||||
input: &str,
|
||||
quotes: StringQuotes,
|
||||
prefix: StringPrefix,
|
||||
normalize_hex: bool,
|
||||
) -> Cow<str> {
|
||||
// The normalized string if `input` is not yet normalized.
|
||||
// `output` must remain empty if `input` is already normalized.
|
||||
let mut output = String::new();
|
||||
|
@ -766,9 +776,37 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
|
|||
}
|
||||
|
||||
last_index = index + '\r'.len_utf8();
|
||||
} else if !quotes.triple && !is_raw {
|
||||
} else if !is_raw {
|
||||
if c == '\\' {
|
||||
if let Some((_, next)) = chars.peek().copied() {
|
||||
if let Some((_, next)) = chars.clone().next() {
|
||||
if next == '\\' {
|
||||
// Skip over escaped backslashes
|
||||
chars.next();
|
||||
} else if normalize_hex {
|
||||
if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
|
||||
.and_then(|escape| {
|
||||
escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..])
|
||||
})
|
||||
{
|
||||
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
|
||||
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
|
||||
let escape_start_offset = index + escape_start_len;
|
||||
if let Cow::Owned(normalised) = &normalised {
|
||||
output.push_str(&input[last_index..escape_start_offset]);
|
||||
output.push_str(normalised);
|
||||
last_index = escape_start_offset + normalised.len();
|
||||
};
|
||||
|
||||
// Move the `chars` iterator past the escape sequence.
|
||||
// Simply reassigning `chars` doesn't work because the indices would
|
||||
// then be off.
|
||||
for _ in 0..next.len_utf8() + normalised.len() {
|
||||
chars.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !quotes.triple {
|
||||
#[allow(clippy::if_same_then_else)]
|
||||
if next == opposite_quote && formatted_value_nesting == 0 {
|
||||
// Remove the escape by ending before the backslash and starting again with the quote
|
||||
|
@ -778,12 +816,10 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
|
|||
} else if next == preferred_quote {
|
||||
// Quote is already escaped, skip over it.
|
||||
chars.next();
|
||||
} else if next == '\\' {
|
||||
// Skip over escaped backslashes
|
||||
chars.next();
|
||||
}
|
||||
}
|
||||
} else if c == preferred_quote && formatted_value_nesting == 0 {
|
||||
}
|
||||
} else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 {
|
||||
// Escape the quote
|
||||
output.push_str(&input[last_index..index]);
|
||||
output.push('\\');
|
||||
|
@ -802,3 +838,153 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
|
|||
|
||||
normalized
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
enum UnicodeEscape {
|
||||
/// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
|
||||
Hex(usize),
|
||||
|
||||
/// An escaped unicode name (`\N{name}`)
|
||||
CharacterName,
|
||||
}
|
||||
|
||||
impl UnicodeEscape {
|
||||
fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
|
||||
Some(match first {
|
||||
'x' => UnicodeEscape::Hex(2),
|
||||
'u' if allow_unicode => UnicodeEscape::Hex(4),
|
||||
'U' if allow_unicode => UnicodeEscape::Hex(8),
|
||||
'N' if allow_unicode => UnicodeEscape::CharacterName,
|
||||
_ => return None,
|
||||
})
|
||||
}
|
||||
|
||||
/// Normalises `\u..`, `\U..`, `\x..` and `\N{..}` escape sequences to:
|
||||
///
|
||||
/// * `\u`, `\U` and `\x`: To use lower case for the characters `a-f`.
|
||||
/// * `\N`: To use uppercase letters
|
||||
fn normalize(self, input: &str) -> Option<Cow<str>> {
|
||||
let mut normalised = String::new();
|
||||
|
||||
let len = match self {
|
||||
UnicodeEscape::Hex(len) => {
|
||||
// It's not a valid escape sequence if the input string has fewer characters
|
||||
// left than required by the escape sequence.
|
||||
if input.len() < len {
|
||||
return None;
|
||||
}
|
||||
|
||||
for (index, c) in input.char_indices().take(len) {
|
||||
match c {
|
||||
'0'..='9' | 'a'..='f' => {
|
||||
if !normalised.is_empty() {
|
||||
normalised.push(c);
|
||||
}
|
||||
}
|
||||
'A'..='F' => {
|
||||
if normalised.is_empty() {
|
||||
normalised.reserve(len);
|
||||
normalised.push_str(&input[..index]);
|
||||
normalised.push(c.to_ascii_lowercase());
|
||||
} else {
|
||||
normalised.push(c.to_ascii_lowercase());
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// not a valid escape sequence
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
len
|
||||
}
|
||||
UnicodeEscape::CharacterName => {
|
||||
let mut char_indices = input.char_indices();
|
||||
|
||||
if !matches!(char_indices.next(), Some((_, '{'))) {
|
||||
return None;
|
||||
}
|
||||
|
||||
loop {
|
||||
if let Some((index, c)) = char_indices.next() {
|
||||
match c {
|
||||
'}' => {
|
||||
if !normalised.is_empty() {
|
||||
normalised.push('}');
|
||||
}
|
||||
|
||||
// Name must be at least two characters long.
|
||||
if index < 3 {
|
||||
return None;
|
||||
}
|
||||
|
||||
break index + '}'.len_utf8();
|
||||
}
|
||||
'0'..='9' | 'A'..='Z' | ' ' | '-' => {
|
||||
if !normalised.is_empty() {
|
||||
normalised.push(c);
|
||||
}
|
||||
}
|
||||
'a'..='z' => {
|
||||
if normalised.is_empty() {
|
||||
normalised.reserve(c.len_utf8() + '}'.len_utf8());
|
||||
normalised.push_str(&input[..index]);
|
||||
normalised.push(c.to_ascii_uppercase());
|
||||
} else {
|
||||
normalised.push(c.to_ascii_uppercase());
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// Seems like an invalid escape sequence, don't normalise it.
|
||||
return None;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Unterminated escape sequence, don't normalise it.
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Some(if normalised.is_empty() {
|
||||
Cow::Borrowed(&input[..len])
|
||||
} else {
|
||||
Cow::Owned(normalised)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::string::{normalize_string, QuoteChar, StringPrefix, StringQuotes, UnicodeEscape};
|
||||
use std::borrow::Cow;
|
||||
|
||||
#[test]
|
||||
fn normalize_32_escape() {
|
||||
let escape_sequence = UnicodeEscape::new('U', true).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
Some(Cow::Owned("0001f60e".to_string())),
|
||||
escape_sequence.normalize("0001F60E")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalize_hex_in_byte_string() {
|
||||
let input = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A";
|
||||
|
||||
let normalized = normalize_string(
|
||||
input,
|
||||
StringQuotes {
|
||||
triple: false,
|
||||
quote_char: QuoteChar::Double,
|
||||
},
|
||||
StringPrefix::BYTE,
|
||||
true,
|
||||
);
|
||||
|
||||
assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,97 +0,0 @@
|
|||
---
|
||||
source: crates/ruff_python_formatter/tests/fixtures.rs
|
||||
input_file: crates/ruff_python_formatter/resources/test/fixtures/black/cases/preview_format_unicode_escape_seq.py
|
||||
---
|
||||
## Input
|
||||
|
||||
```python
|
||||
x = "\x1F"
|
||||
x = "\\x1B"
|
||||
x = "\\\x1B"
|
||||
x = "\U0001F60E"
|
||||
x = "\u0001F60E"
|
||||
x = r"\u0001F60E"
|
||||
x = "don't format me"
|
||||
x = "\xA3"
|
||||
x = "\u2717"
|
||||
x = "\uFaCe"
|
||||
x = "\N{ox}\N{OX}"
|
||||
x = "\N{lAtIn smaLL letteR x}"
|
||||
x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
|
||||
x = b"\x1Fdon't byte"
|
||||
x = rb"\x1Fdon't format"
|
||||
```
|
||||
|
||||
## Black Differences
|
||||
|
||||
```diff
|
||||
--- Black
|
||||
+++ Ruff
|
||||
@@ -1,15 +1,15 @@
|
||||
-x = "\x1f"
|
||||
+x = "\x1F"
|
||||
x = "\\x1B"
|
||||
-x = "\\\x1b"
|
||||
-x = "\U0001f60e"
|
||||
+x = "\\\x1B"
|
||||
+x = "\U0001F60E"
|
||||
x = "\u0001F60E"
|
||||
x = r"\u0001F60E"
|
||||
x = "don't format me"
|
||||
-x = "\xa3"
|
||||
+x = "\xA3"
|
||||
x = "\u2717"
|
||||
-x = "\uface"
|
||||
-x = "\N{OX}\N{OX}"
|
||||
-x = "\N{LATIN SMALL LETTER X}"
|
||||
-x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
|
||||
-x = b"\x1fdon't byte"
|
||||
+x = "\uFaCe"
|
||||
+x = "\N{ox}\N{OX}"
|
||||
+x = "\N{lAtIn smaLL letteR x}"
|
||||
+x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
|
||||
+x = b"\x1Fdon't byte"
|
||||
x = rb"\x1Fdon't format"
|
||||
```
|
||||
|
||||
## Ruff Output
|
||||
|
||||
```python
|
||||
x = "\x1F"
|
||||
x = "\\x1B"
|
||||
x = "\\\x1B"
|
||||
x = "\U0001F60E"
|
||||
x = "\u0001F60E"
|
||||
x = r"\u0001F60E"
|
||||
x = "don't format me"
|
||||
x = "\xA3"
|
||||
x = "\u2717"
|
||||
x = "\uFaCe"
|
||||
x = "\N{ox}\N{OX}"
|
||||
x = "\N{lAtIn smaLL letteR x}"
|
||||
x = "\N{CYRILLIC small LETTER BYELORUSSIAN-UKRAINIAN I}"
|
||||
x = b"\x1Fdon't byte"
|
||||
x = rb"\x1Fdon't format"
|
||||
```
|
||||
|
||||
## Black Output
|
||||
|
||||
```python
|
||||
x = "\x1f"
|
||||
x = "\\x1B"
|
||||
x = "\\\x1b"
|
||||
x = "\U0001f60e"
|
||||
x = "\u0001F60E"
|
||||
x = r"\u0001F60E"
|
||||
x = "don't format me"
|
||||
x = "\xa3"
|
||||
x = "\u2717"
|
||||
x = "\uface"
|
||||
x = "\N{OX}\N{OX}"
|
||||
x = "\N{LATIN SMALL LETTER X}"
|
||||
x = "\N{CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I}"
|
||||
x = b"\x1fdon't byte"
|
||||
x = rb"\x1Fdon't format"
|
||||
```
|
||||
|
||||
|
|
@ -124,6 +124,8 @@ test_particular = [
|
|||
b'c'
|
||||
)
|
||||
}
|
||||
|
||||
b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
|
||||
```
|
||||
|
||||
## Outputs
|
||||
|
@ -277,6 +279,8 @@ test_particular = [
|
|||
|
||||
# Parenthesized string continuation with messed up indentation
|
||||
{"key": ([], b"a" b"b" b"c")}
|
||||
|
||||
b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
|
||||
```
|
||||
|
||||
|
||||
|
@ -430,6 +434,8 @@ test_particular = [
|
|||
|
||||
# Parenthesized string continuation with messed up indentation
|
||||
{'key': ([], b'a' b'b' b'c')}
|
||||
|
||||
b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
|
||||
```
|
||||
|
||||
|
||||
|
|
|
@ -139,6 +139,11 @@ x = (b"""aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa""" b"""bbbbbbbbbbbbbbbbbbbbbbbbbbb
|
|||
|
||||
# https://github.com/astral-sh/ruff/issues/7460
|
||||
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
|
||||
|
||||
a = f"""\x1F"""
|
||||
a = """\x1F"""
|
||||
a = """\\x1F"""
|
||||
a = """\\\x1F"""
|
||||
```
|
||||
|
||||
## Outputs
|
||||
|
@ -316,6 +321,11 @@ x = (
|
|||
|
||||
# https://github.com/astral-sh/ruff/issues/7460
|
||||
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
|
||||
|
||||
a = f"""\x1F"""
|
||||
a = """\x1F"""
|
||||
a = """\\x1F"""
|
||||
a = """\\\x1F"""
|
||||
```
|
||||
|
||||
|
||||
|
@ -329,6 +339,17 @@ trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
|
|||
'" test'
|
||||
|
||||
'" test'
|
||||
@@ -158,7 +159,7 @@
|
||||
# https://github.com/astral-sh/ruff/issues/7460
|
||||
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
|
||||
|
||||
-a = f"""\x1F"""
|
||||
-a = """\x1F"""
|
||||
+a = f"""\x1f"""
|
||||
+a = """\x1f"""
|
||||
a = """\\x1F"""
|
||||
-a = """\\\x1F"""
|
||||
+a = """\\\x1f"""
|
||||
```
|
||||
|
||||
|
||||
|
@ -506,6 +527,11 @@ x = (
|
|||
|
||||
# https://github.com/astral-sh/ruff/issues/7460
|
||||
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
|
||||
|
||||
a = f"""\x1F"""
|
||||
a = """\x1F"""
|
||||
a = """\\x1F"""
|
||||
a = """\\\x1F"""
|
||||
```
|
||||
|
||||
|
||||
|
@ -519,6 +545,17 @@ trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
|
|||
'" test'
|
||||
|
||||
'" test'
|
||||
@@ -158,7 +159,7 @@
|
||||
# https://github.com/astral-sh/ruff/issues/7460
|
||||
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']
|
||||
|
||||
-a = f"""\x1F"""
|
||||
-a = """\x1F"""
|
||||
+a = f"""\x1f"""
|
||||
+a = """\x1f"""
|
||||
a = """\\x1F"""
|
||||
-a = """\\\x1F"""
|
||||
+a = """\\\x1f"""
|
||||
```
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue