Remove unnecessary string cloning from the parser (#9884)

Closes https://github.com/astral-sh/ruff/issues/9869.
This commit is contained in:
Charlie Marsh 2024-02-09 16:03:27 -05:00 committed by GitHub
parent 7ca515c0aa
commit 6f0e4ad332
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 227 additions and 119 deletions

View file

@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" }
anyhow = { workspace = true }
bitflags = { workspace = true }
bstr = { workspace = true }
is-macro = { workspace = true }
itertools = { workspace = true }
lalrpop-util = { workspace = true, default-features = false }
memchr = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
rustc-hash = { workspace = true }
static_assertions = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
[dev-dependencies]
insta = { workspace = true }

View file

@ -119,10 +119,10 @@ pub use token::{StringKind, Tok, TokenKind};
use crate::lexer::LexResult;
mod function;
// Skip flattening lexer to distinguish from full ruff_python_parser
mod context;
mod function;
mod invalid;
// Skip flattening lexer to distinguish from full ruff_python_parser
pub mod lexer;
mod parser;
mod soft_keywords;

View file

@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = {
StringLiteral: StringType = {
<location:@L> <string:string> <end_location:@R> =>? {
let (source, kind, triple_quoted) = string;
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
}
};
@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = {
FStringReplacementField,
<location:@L> <fstring_middle:fstring_middle> <end_location:@R> =>? {
let (source, is_raw, _) = fstring_middle;
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
}
};

View file

@ -1,5 +1,5 @@
// auto-generated: "lalrpop 0.20.0"
// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c
// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
use crate::{
@ -36369,7 +36369,7 @@ fn __action217<
{
{
let (source, kind, triple_quoted) = string;
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
}
}
@ -36419,7 +36419,7 @@ fn __action220<
{
{
let (source, is_raw, _) = fstring_middle;
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
}
}

View file

@ -1,7 +1,9 @@
//! Parsing of string literals, bytes literals, and implicit string concatenation.
use bstr::ByteSlice;
use ruff_python_ast::{self as ast, Expr};
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
use ruff_text_size::{Ranged, TextRange, TextSize};
use crate::lexer::{LexicalError, LexicalErrorType};
use crate::token::{StringKind, Tok};
@ -32,34 +34,40 @@ impl From<StringType> for Expr {
}
}
struct StringParser<'a> {
rest: &'a str,
/// Outcome of decoding the character that follows a backslash.
enum EscapedChar {
/// A recognized escape sequence, already decoded to the character it denotes.
/// Callers append this character as-is.
Literal(char),
/// An unrecognized escape: the character that followed the backslash.
/// Callers emit a backslash and then this character, so the sequence is
/// kept verbatim in the output.
Escape(char),
}
struct StringParser {
source: Box<str>,
cursor: usize,
kind: StringKind,
location: TextSize,
offset: TextSize,
range: TextRange,
}
impl<'a> StringParser<'a> {
fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self {
impl StringParser {
fn new(source: Box<str>, kind: StringKind, offset: TextSize, range: TextRange) -> Self {
Self {
rest: source,
source,
cursor: 0,
kind,
location: start,
offset,
range,
}
}
#[inline]
fn skip_bytes(&mut self, bytes: usize) -> &'a str {
let skipped_str = &self.rest[..bytes];
self.rest = &self.rest[bytes..];
self.location += skipped_str.text_len();
fn skip_bytes(&mut self, bytes: usize) -> &str {
let skipped_str = &self.source[self.cursor..self.cursor + bytes];
self.cursor += bytes;
skipped_str
}
#[inline]
fn get_pos(&self) -> TextSize {
self.location
self.offset + TextSize::try_from(self.cursor).unwrap()
}
/// Returns the next byte in the string, if there is one.
@ -69,25 +77,23 @@ impl<'a> StringParser<'a> {
/// When the next byte is a part of a multi-byte character.
#[inline]
fn next_byte(&mut self) -> Option<u8> {
self.rest.as_bytes().first().map(|&byte| {
self.rest = &self.rest[1..];
self.location += TextSize::new(1);
self.source[self.cursor..].as_bytes().first().map(|&byte| {
self.cursor += 1;
byte
})
}
#[inline]
fn next_char(&mut self) -> Option<char> {
self.rest.chars().next().map(|c| {
self.rest = &self.rest[c.len_utf8()..];
self.location += c.text_len();
self.source[self.cursor..].chars().next().map(|c| {
self.cursor += c.len_utf8();
c
})
}
#[inline]
fn peek_byte(&self) -> Option<u8> {
self.rest.as_bytes().first().copied()
self.source[self.cursor..].as_bytes().first().copied()
}
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
@ -135,7 +141,7 @@ impl<'a> StringParser<'a> {
};
let start_pos = self.get_pos();
let Some(close_idx) = self.rest.find('}') else {
let Some(close_idx) = self.source[self.cursor..].find('}') else {
return Err(LexicalError::new(
LexicalErrorType::StringError,
self.get_pos(),
@ -149,7 +155,8 @@ impl<'a> StringParser<'a> {
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
}
fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
/// Parse an escaped character, returning the new character.
fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
let Some(first_char) = self.next_char() else {
return Err(LexicalError::new(
LexicalErrorType::StringError,
@ -174,13 +181,13 @@ impl<'a> StringParser<'a> {
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
// Special cases where the escape sequence is not a single character
'\n' => return Ok(()),
'\n' => return Ok(None),
'\r' => {
if self.peek_byte() == Some(b'\n') {
self.next_byte();
}
return Ok(());
return Ok(None);
}
_ => {
if self.kind.is_any_bytes() && !first_char.is_ascii() {
@ -194,21 +201,42 @@ impl<'a> StringParser<'a> {
));
}
string.push('\\');
first_char
return Ok(Some(EscapedChar::Escape(first_char)));
}
};
string.push(new_char);
Ok(())
Ok(Some(EscapedChar::Literal(new_char)))
}
fn parse_fstring_middle(&mut self) -> Result<ast::FStringElement, LexicalError> {
let mut value = String::with_capacity(self.rest.len());
while let Some(ch) = self.next_char() {
match ch {
fn parse_fstring_middle(mut self) -> Result<ast::FStringElement, LexicalError> {
// Fast-path: if the f-string doesn't contain any escape sequences, return the literal.
let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else {
return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
value: self.source,
range: self.range,
}));
};
let mut value = String::with_capacity(self.source.len());
loop {
// Add the characters before the escape sequence (or curly brace) to the string.
let before_with_slash_or_brace = self.skip_bytes(index + 1);
let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
value.push_str(before);
// Add the escaped character to the string.
match &self.source.as_bytes()[self.cursor - 1] {
// If there are any curly braces inside a `FStringMiddle` token,
// then they were escaped (i.e. `{{` or `}}`). This means that
// we need to increase the location by 2 instead of 1.
b'{' => {
self.offset += TextSize::from(1);
value.push('{');
}
b'}' => {
self.offset += TextSize::from(1);
value.push('}');
}
// We can encounter a `\` as the last character in a `FStringMiddle`
// token which is valid in this context. For example,
//
@ -229,71 +257,152 @@ impl<'a> StringParser<'a> {
// This is still an invalid escape sequence, but we don't want to
// raise a syntax error as is done by the CPython parser. It might
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
self.parse_escaped_char(&mut value)?;
}
// If there are any curly braces inside a `FStringMiddle` token,
// then they were escaped (i.e. `{{` or `}}`). This means that
// we need to increase the location by 2 instead of 1.
ch @ ('{' | '}') => {
self.location += ch.text_len();
value.push(ch);
}
ch => value.push(ch),
}
}
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
value,
range: self.range,
}))
}
fn parse_bytes(&mut self) -> Result<StringType, LexicalError> {
let mut content = String::with_capacity(self.rest.len());
while let Some(ch) = self.next_char() {
match ch {
'\\' if !self.kind.is_raw() => {
self.parse_escaped_char(&mut content)?;
b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
match self.parse_escaped_char()? {
None => {}
Some(EscapedChar::Literal(c)) => value.push(c),
Some(EscapedChar::Escape(c)) => {
value.push('\\');
value.push(c);
}
}
}
ch => {
if !ch.is_ascii() {
return Err(LexicalError::new(
LexicalErrorType::OtherError(
"bytes can only contain ASCII literal characters"
.to_string()
.into_boxed_str(),
),
self.get_pos(),
));
}
content.push(ch);
value.push(char::from(*ch));
}
}
let Some(next_index) =
memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes())
else {
// Add the rest of the string to the value.
let rest = &self.source[self.cursor..];
value.push_str(rest);
break;
};
index = next_index;
}
Ok(StringType::Bytes(ast::BytesLiteral {
value: content.chars().map(|c| c as u8).collect::<Vec<u8>>(),
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
value: value.into_boxed_str(),
range: self.range,
}))
}
fn parse_string(&mut self) -> Result<StringType, LexicalError> {
let mut value = String::with_capacity(self.rest.len());
if self.kind.is_raw() {
value.push_str(self.skip_bytes(self.rest.len()));
} else {
loop {
let Some(escape_idx) = self.rest.find('\\') else {
value.push_str(self.skip_bytes(self.rest.len()));
break;
};
let before_with_slash = self.skip_bytes(escape_idx + 1);
let before = &before_with_slash[..before_with_slash.len() - 1];
value.push_str(before);
self.parse_escaped_char(&mut value)?;
}
/// Parses the contents of a bytes literal into a `StringType::Bytes`, consuming the parser.
///
/// When the literal is raw, or contains no backslash at all, the owned source
/// buffer is converted directly into the result. Otherwise a new byte buffer is
/// built with every escape sequence processed.
///
/// # Errors
///
/// Returns a `LexicalError` if the source contains a non-ASCII byte, or if
/// `parse_escaped_char` rejects an escape sequence.
fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
// Reject non-ASCII content up front; the reported position is that of the
// first offending byte, relative to `self.offset`.
if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
return Err(LexicalError::new(
LexicalErrorType::OtherError(
"bytes can only contain ASCII literal characters"
.to_string()
.into_boxed_str(),
),
self.offset + TextSize::try_from(index).unwrap(),
));
}
if self.kind.is_raw() {
// For raw strings, no escaping is necessary.
return Ok(StringType::Bytes(ast::BytesLiteral {
value: self.source.into_boxed_bytes(),
range: self.range,
}));
}
// `escape` is the index of the next backslash, measured from the cursor
// (which is still 0 here, so it is also an absolute index).
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
// If the string doesn't contain any escape sequences, return the owned string.
return Ok(StringType::Bytes(ast::BytesLiteral {
value: self.source.into_boxed_bytes(),
range: self.range,
}));
};
// If the string contains escape sequences, we need to parse them.
let mut value = Vec::with_capacity(self.source.len());
loop {
// Add the characters before the escape sequence to the string.
// `skip_bytes` consumes up to and including the backslash; the backslash
// itself is then trimmed off before copying.
let before_with_slash = self.skip_bytes(escape + 1);
let before = &before_with_slash[..before_with_slash.len() - 1];
value.extend_from_slice(before.as_bytes());
// Add the escaped character to the string.
match self.parse_escaped_char()? {
// Nothing to emit (e.g. a backslash followed by a line break).
None => {}
// NOTE(review): `c as u8` truncates any char above 0xFF; presumably
// `parse_escaped_char` only yields byte-range values for bytes
// literals — confirm.
Some(EscapedChar::Literal(c)) => value.push(c as u8),
// Unrecognized escape: keep the backslash and the character verbatim.
// `c` was read from the source, which the check above proved ASCII.
Some(EscapedChar::Escape(c)) => {
value.push(b'\\');
value.push(c as u8);
}
}
// Search only the unconsumed tail, so the result is cursor-relative.
let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes())
else {
// Add the rest of the string to the value.
let rest = &self.source[self.cursor..];
value.extend_from_slice(rest.as_bytes());
break;
};
// Update the position of the next escape sequence.
escape = next_escape;
}
Ok(StringType::Bytes(ast::BytesLiteral {
value: value.into_boxed_slice(),
range: self.range,
}))
}
fn parse_string(mut self) -> Result<StringType, LexicalError> {
if self.kind.is_raw() {
// For raw strings, no escaping is necessary.
return Ok(StringType::Str(ast::StringLiteral {
value: self.source,
unicode: self.kind.is_unicode(),
range: self.range,
}));
}
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
// If the string doesn't contain any escape sequences, return the owned string.
return Ok(StringType::Str(ast::StringLiteral {
value: self.source,
unicode: self.kind.is_unicode(),
range: self.range,
}));
};
// If the string contains escape sequences, we need to parse them.
let mut value = String::with_capacity(self.source.len());
loop {
// Add the characters before the escape sequence to the string.
let before_with_slash = self.skip_bytes(escape + 1);
let before = &before_with_slash[..before_with_slash.len() - 1];
value.push_str(before);
// Add the escaped character to the string.
match self.parse_escaped_char()? {
None => {}
Some(EscapedChar::Literal(c)) => value.push(c),
Some(EscapedChar::Escape(c)) => {
value.push('\\');
value.push(c);
}
}
let Some(next_escape) = self.source[self.cursor..].find('\\') else {
// Add the rest of the string to the value.
let rest = &self.source[self.cursor..];
value.push_str(rest);
break;
};
// Update the position of the next escape sequence.
escape = next_escape;
}
Ok(StringType::Str(ast::StringLiteral {
value: value.into_boxed_str(),
unicode: self.kind.is_unicode(),
@ -301,7 +410,7 @@ impl<'a> StringParser<'a> {
}))
}
fn parse(&mut self) -> Result<StringType, LexicalError> {
fn parse(self) -> Result<StringType, LexicalError> {
if self.kind.is_any_bytes() {
self.parse_bytes()
} else {
@ -311,7 +420,7 @@ impl<'a> StringParser<'a> {
}
pub(crate) fn parse_string_literal(
source: &str,
source: Box<str>,
kind: StringKind,
triple_quoted: bool,
range: TextRange,
@ -327,7 +436,7 @@ pub(crate) fn parse_string_literal(
}
pub(crate) fn parse_fstring_literal_element(
source: &str,
source: Box<str>,
is_raw: bool,
range: TextRange,
) -> Result<ast::FStringElement, LexicalError> {
@ -360,7 +469,7 @@ pub(crate) fn concatenated_strings(
if has_bytes && byte_literal_count < strings.len() {
return Err(LexicalError::new(
LexicalErrorType::OtherError(
"cannot mix bytes and nonbytes literals"
"cannot mix bytes and non-bytes literals"
.to_string()
.into_boxed_str(),
),