Remove unnecessary string cloning from the parser (#9884)

Closes https://github.com/astral-sh/ruff/issues/9869.
This commit is contained in:
Charlie Marsh 2024-02-09 16:03:27 -05:00 committed by GitHub
parent 7ca515c0aa
commit 6f0e4ad332
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 227 additions and 119 deletions

13
Cargo.lock generated
View file

@ -217,12 +217,12 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]]
name = "bstr"
version = "1.6.2"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a"
checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc"
dependencies = [
"memchr",
"regex-automata 0.3.9",
"regex-automata 0.4.3",
"serde",
]
@ -1921,12 +1921,6 @@ dependencies = [
"regex-syntax 0.6.29",
]
[[package]]
name = "regex-automata"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
[[package]]
name = "regex-automata"
version = "0.4.3"
@ -2342,6 +2336,7 @@ version = "0.0.0"
dependencies = [
"anyhow",
"bitflags 2.4.1",
"bstr",
"insta",
"is-macro",
"itertools 0.12.1",

View file

@ -19,6 +19,7 @@ argfile = { version = "0.1.6" }
assert_cmd = { version = "2.0.13" }
bincode = { version = "1.3.3" }
bitflags = { version = "2.4.1" }
bstr = { version = "1.9.0" }
cachedir = { version = "0.3.1" }
chrono = { version = "0.4.33", default-features = false, features = ["clock"] }
clap = { version = "4.4.18", features = ["derive"] }

View file

@ -40,7 +40,9 @@ impl Violation for HardcodedBindAllInterfaces {
pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) {
let is_bind_all_interface = match string {
StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0",
StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => value == "0.0.0.0",
StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => {
&**value == "0.0.0.0"
}
StringLike::BytesLiteral(_) => return,
};

View file

@ -15,7 +15,7 @@ fn to_f_string_expression_element(inner: &Expr) -> ast::FStringElement {
/// Convert a string to a [`ast::FStringElement::Literal`].
pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement {
ast::FStringElement::Literal(ast::FStringLiteralElement {
value: s.to_owned(),
value: s.to_string().into_boxed_str(),
range: TextRange::default(),
})
}
@ -53,7 +53,7 @@ pub(super) fn to_f_string_element(expr: &Expr) -> Option<ast::FStringElement> {
match expr {
Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => {
Some(ast::FStringElement::Literal(ast::FStringLiteralElement {
value: value.to_string(),
value: value.to_string().into_boxed_str(),
range: *range,
}))
}

View file

@ -644,7 +644,7 @@ pub struct ComparableBytesLiteral<'a> {
impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> {
fn from(bytes_literal: &'a ast::BytesLiteral) -> Self {
Self {
value: bytes_literal.value.as_slice(),
value: &bytes_literal.value,
}
}
}

View file

@ -949,7 +949,7 @@ impl Ranged for FStringExpressionElement {
#[derive(Clone, Debug, PartialEq)]
pub struct FStringLiteralElement {
pub range: TextRange,
pub value: String,
pub value: Box<str>,
}
impl Ranged for FStringLiteralElement {
@ -962,7 +962,7 @@ impl Deref for FStringLiteralElement {
type Target = str;
fn deref(&self) -> &Self::Target {
self.value.as_str()
&self.value
}
}
@ -1607,7 +1607,7 @@ impl Default for BytesLiteralValueInner {
#[derive(Clone, Debug, Default, PartialEq)]
pub struct BytesLiteral {
pub range: TextRange,
pub value: Vec<u8>,
pub value: Box<[u8]>,
}
impl Ranged for BytesLiteral {
@ -1620,7 +1620,7 @@ impl Deref for BytesLiteral {
type Target = [u8];
fn deref(&self) -> &Self::Target {
self.value.as_slice()
&self.value
}
}

View file

@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" }
anyhow = { workspace = true }
bitflags = { workspace = true }
bstr = { workspace = true }
is-macro = { workspace = true }
itertools = { workspace = true }
lalrpop-util = { workspace = true, default-features = false }
memchr = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
rustc-hash = { workspace = true }
static_assertions = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
[dev-dependencies]
insta = { workspace = true }

View file

@ -119,10 +119,10 @@ pub use token::{StringKind, Tok, TokenKind};
use crate::lexer::LexResult;
mod function;
// Skip flattening lexer to distinguish from full ruff_python_parser
mod context;
mod function;
mod invalid;
// Skip flattening lexer to distinguish from full ruff_python_parser
pub mod lexer;
mod parser;
mod soft_keywords;

View file

@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = {
StringLiteral: StringType = {
<location:@L> <string:string> <end_location:@R> =>? {
let (source, kind, triple_quoted) = string;
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
}
};
@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = {
FStringReplacementField,
<location:@L> <fstring_middle:fstring_middle> <end_location:@R> =>? {
let (source, is_raw, _) = fstring_middle;
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
}
};

View file

@ -1,5 +1,5 @@
// auto-generated: "lalrpop 0.20.0"
// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c
// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
use crate::{
@ -36369,7 +36369,7 @@ fn __action217<
{
{
let (source, kind, triple_quoted) = string;
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
}
}
@ -36419,7 +36419,7 @@ fn __action220<
{
{
let (source, is_raw, _) = fstring_middle;
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
}
}

View file

@ -1,7 +1,9 @@
//! Parsing of string literals, bytes literals, and implicit string concatenation.
use bstr::ByteSlice;
use ruff_python_ast::{self as ast, Expr};
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
use ruff_text_size::{Ranged, TextRange, TextSize};
use crate::lexer::{LexicalError, LexicalErrorType};
use crate::token::{StringKind, Tok};
@ -32,34 +34,40 @@ impl From<StringType> for Expr {
}
}
struct StringParser<'a> {
rest: &'a str,
/// The outcome of parsing one backslash escape sequence in `parse_escaped_char`.
enum EscapedChar {
/// The escape sequence resolved to this character; callers push it as-is.
Literal(char),
/// The escape sequence fell through to the catch-all arm of `parse_escaped_char`;
/// callers push a backslash followed by this character.
Escape(char),
}
struct StringParser {
source: Box<str>,
cursor: usize,
kind: StringKind,
location: TextSize,
offset: TextSize,
range: TextRange,
}
impl<'a> StringParser<'a> {
fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self {
impl StringParser {
fn new(source: Box<str>, kind: StringKind, offset: TextSize, range: TextRange) -> Self {
Self {
rest: source,
source,
cursor: 0,
kind,
location: start,
offset,
range,
}
}
#[inline]
fn skip_bytes(&mut self, bytes: usize) -> &'a str {
let skipped_str = &self.rest[..bytes];
self.rest = &self.rest[bytes..];
self.location += skipped_str.text_len();
fn skip_bytes(&mut self, bytes: usize) -> &str {
let skipped_str = &self.source[self.cursor..self.cursor + bytes];
self.cursor += bytes;
skipped_str
}
#[inline]
fn get_pos(&self) -> TextSize {
self.location
self.offset + TextSize::try_from(self.cursor).unwrap()
}
/// Returns the next byte in the string, if there is one.
@ -69,25 +77,23 @@ impl<'a> StringParser<'a> {
/// When the next byte is a part of a multi-byte character.
#[inline]
fn next_byte(&mut self) -> Option<u8> {
self.rest.as_bytes().first().map(|&byte| {
self.rest = &self.rest[1..];
self.location += TextSize::new(1);
self.source[self.cursor..].as_bytes().first().map(|&byte| {
self.cursor += 1;
byte
})
}
#[inline]
fn next_char(&mut self) -> Option<char> {
self.rest.chars().next().map(|c| {
self.rest = &self.rest[c.len_utf8()..];
self.location += c.text_len();
self.source[self.cursor..].chars().next().map(|c| {
self.cursor += c.len_utf8();
c
})
}
#[inline]
fn peek_byte(&self) -> Option<u8> {
self.rest.as_bytes().first().copied()
self.source[self.cursor..].as_bytes().first().copied()
}
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
@ -135,7 +141,7 @@ impl<'a> StringParser<'a> {
};
let start_pos = self.get_pos();
let Some(close_idx) = self.rest.find('}') else {
let Some(close_idx) = self.source[self.cursor..].find('}') else {
return Err(LexicalError::new(
LexicalErrorType::StringError,
self.get_pos(),
@ -149,7 +155,8 @@ impl<'a> StringParser<'a> {
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
}
fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
/// Parse an escaped character, returning the new character.
fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
let Some(first_char) = self.next_char() else {
return Err(LexicalError::new(
LexicalErrorType::StringError,
@ -174,13 +181,13 @@ impl<'a> StringParser<'a> {
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
// Special cases where the escape sequence is not a single character
'\n' => return Ok(()),
'\n' => return Ok(None),
'\r' => {
if self.peek_byte() == Some(b'\n') {
self.next_byte();
}
return Ok(());
return Ok(None);
}
_ => {
if self.kind.is_any_bytes() && !first_char.is_ascii() {
@ -194,21 +201,42 @@ impl<'a> StringParser<'a> {
));
}
string.push('\\');
first_char
return Ok(Some(EscapedChar::Escape(first_char)));
}
};
string.push(new_char);
Ok(())
Ok(Some(EscapedChar::Literal(new_char)))
}
fn parse_fstring_middle(&mut self) -> Result<ast::FStringElement, LexicalError> {
let mut value = String::with_capacity(self.rest.len());
while let Some(ch) = self.next_char() {
match ch {
fn parse_fstring_middle(mut self) -> Result<ast::FStringElement, LexicalError> {
// Fast-path: if the f-string doesn't contain any escape sequences or curly braces, return the literal.
let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else {
return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
value: self.source,
range: self.range,
}));
};
let mut value = String::with_capacity(self.source.len());
loop {
// Add the characters before the escape sequence (or curly brace) to the string.
let before_with_slash_or_brace = self.skip_bytes(index + 1);
let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
value.push_str(before);
// Add the escaped character to the string.
match &self.source.as_bytes()[self.cursor - 1] {
// If there are any curly braces inside a `FStringMiddle` token,
// then they were escaped (i.e. `{{` or `}}`). This means that
// we need to increase the location by 2 instead of 1.
b'{' => {
self.offset += TextSize::from(1);
value.push('{');
}
b'}' => {
self.offset += TextSize::from(1);
value.push('}');
}
// We can encounter a `\` as the last character in a `FStringMiddle`
// token which is valid in this context. For example,
//
@ -229,71 +257,152 @@ impl<'a> StringParser<'a> {
// This is still an invalid escape sequence, but we don't want to
// raise a syntax error as is done by the CPython parser. It might
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
self.parse_escaped_char(&mut value)?;
}
// If there are any curly braces inside a `FStringMiddle` token,
// then they were escaped (i.e. `{{` or `}}`). This means that
// we need to increase the location by 2 instead of 1.
ch @ ('{' | '}') => {
self.location += ch.text_len();
value.push(ch);
}
ch => value.push(ch),
b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
match self.parse_escaped_char()? {
None => {}
Some(EscapedChar::Literal(c)) => value.push(c),
Some(EscapedChar::Escape(c)) => {
value.push('\\');
value.push(c);
}
}
}
ch => {
value.push(char::from(*ch));
}
}
let Some(next_index) =
memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes())
else {
// Add the rest of the string to the value.
let rest = &self.source[self.cursor..];
value.push_str(rest);
break;
};
index = next_index;
}
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
value,
value: value.into_boxed_str(),
range: self.range,
}))
}
fn parse_bytes(&mut self) -> Result<StringType, LexicalError> {
let mut content = String::with_capacity(self.rest.len());
while let Some(ch) = self.next_char() {
match ch {
'\\' if !self.kind.is_raw() => {
self.parse_escaped_char(&mut content)?;
}
ch => {
if !ch.is_ascii() {
fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
return Err(LexicalError::new(
LexicalErrorType::OtherError(
"bytes can only contain ASCII literal characters"
.to_string()
.into_boxed_str(),
),
self.get_pos(),
self.offset + TextSize::try_from(index).unwrap(),
));
}
content.push(ch);
if self.kind.is_raw() {
// For raw strings, no escaping is necessary.
return Ok(StringType::Bytes(ast::BytesLiteral {
value: self.source.into_boxed_bytes(),
range: self.range,
}));
}
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
// If the string doesn't contain any escape sequences, return the owned string.
return Ok(StringType::Bytes(ast::BytesLiteral {
value: self.source.into_boxed_bytes(),
range: self.range,
}));
};
// If the string contains escape sequences, we need to parse them.
let mut value = Vec::with_capacity(self.source.len());
loop {
// Add the characters before the escape sequence to the string.
let before_with_slash = self.skip_bytes(escape + 1);
let before = &before_with_slash[..before_with_slash.len() - 1];
value.extend_from_slice(before.as_bytes());
// Add the escaped character to the string.
match self.parse_escaped_char()? {
None => {}
Some(EscapedChar::Literal(c)) => value.push(c as u8),
Some(EscapedChar::Escape(c)) => {
value.push(b'\\');
value.push(c as u8);
}
}
let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes())
else {
// Add the rest of the string to the value.
let rest = &self.source[self.cursor..];
value.extend_from_slice(rest.as_bytes());
break;
};
// Update the position of the next escape sequence.
escape = next_escape;
}
Ok(StringType::Bytes(ast::BytesLiteral {
value: content.chars().map(|c| c as u8).collect::<Vec<u8>>(),
value: value.into_boxed_slice(),
range: self.range,
}))
}
fn parse_string(&mut self) -> Result<StringType, LexicalError> {
let mut value = String::with_capacity(self.rest.len());
fn parse_string(mut self) -> Result<StringType, LexicalError> {
if self.kind.is_raw() {
value.push_str(self.skip_bytes(self.rest.len()));
} else {
// For raw strings, no escaping is necessary.
return Ok(StringType::Str(ast::StringLiteral {
value: self.source,
unicode: self.kind.is_unicode(),
range: self.range,
}));
}
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
// If the string doesn't contain any escape sequences, return the owned string.
return Ok(StringType::Str(ast::StringLiteral {
value: self.source,
unicode: self.kind.is_unicode(),
range: self.range,
}));
};
// If the string contains escape sequences, we need to parse them.
let mut value = String::with_capacity(self.source.len());
loop {
let Some(escape_idx) = self.rest.find('\\') else {
value.push_str(self.skip_bytes(self.rest.len()));
// Add the characters before the escape sequence to the string.
let before_with_slash = self.skip_bytes(escape + 1);
let before = &before_with_slash[..before_with_slash.len() - 1];
value.push_str(before);
// Add the escaped character to the string.
match self.parse_escaped_char()? {
None => {}
Some(EscapedChar::Literal(c)) => value.push(c),
Some(EscapedChar::Escape(c)) => {
value.push('\\');
value.push(c);
}
}
let Some(next_escape) = self.source[self.cursor..].find('\\') else {
// Add the rest of the string to the value.
let rest = &self.source[self.cursor..];
value.push_str(rest);
break;
};
let before_with_slash = self.skip_bytes(escape_idx + 1);
let before = &before_with_slash[..before_with_slash.len() - 1];
// Update the position of the next escape sequence.
escape = next_escape;
}
value.push_str(before);
self.parse_escaped_char(&mut value)?;
}
}
Ok(StringType::Str(ast::StringLiteral {
value: value.into_boxed_str(),
unicode: self.kind.is_unicode(),
@ -301,7 +410,7 @@ impl<'a> StringParser<'a> {
}))
}
fn parse(&mut self) -> Result<StringType, LexicalError> {
fn parse(self) -> Result<StringType, LexicalError> {
if self.kind.is_any_bytes() {
self.parse_bytes()
} else {
@ -311,7 +420,7 @@ impl<'a> StringParser<'a> {
}
pub(crate) fn parse_string_literal(
source: &str,
source: Box<str>,
kind: StringKind,
triple_quoted: bool,
range: TextRange,
@ -327,7 +436,7 @@ pub(crate) fn parse_string_literal(
}
pub(crate) fn parse_fstring_literal_element(
source: &str,
source: Box<str>,
is_raw: bool,
range: TextRange,
) -> Result<ast::FStringElement, LexicalError> {
@ -360,7 +469,7 @@ pub(crate) fn concatenated_strings(
if has_bytes && byte_literal_count < strings.len() {
return Err(LexicalError::new(
LexicalErrorType::OtherError(
"cannot mix bytes and nonbytes literals"
"cannot mix bytes and non-bytes literals"
.to_string()
.into_boxed_str(),
),