mirror of
https://github.com/astral-sh/ruff.git
synced 2025-07-12 07:35:07 +00:00
Remove unnecessary string cloning from the parser (#9884)
Closes https://github.com/astral-sh/ruff/issues/9869.
This commit is contained in:
parent
7ca515c0aa
commit
6f0e4ad332
11 changed files with 227 additions and 119 deletions
|
@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" }
|
|||
|
||||
anyhow = { workspace = true }
|
||||
bitflags = { workspace = true }
|
||||
bstr = { workspace = true }
|
||||
is-macro = { workspace = true }
|
||||
itertools = { workspace = true }
|
||||
lalrpop-util = { workspace = true, default-features = false }
|
||||
memchr = { workspace = true }
|
||||
unicode-ident = { workspace = true }
|
||||
unicode_names2 = { workspace = true }
|
||||
rustc-hash = { workspace = true }
|
||||
static_assertions = { workspace = true }
|
||||
unicode-ident = { workspace = true }
|
||||
unicode_names2 = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
insta = { workspace = true }
|
||||
|
|
|
@ -119,10 +119,10 @@ pub use token::{StringKind, Tok, TokenKind};
|
|||
|
||||
use crate::lexer::LexResult;
|
||||
|
||||
mod function;
|
||||
// Skip flattening lexer to distinguish from full ruff_python_parser
|
||||
mod context;
|
||||
mod function;
|
||||
mod invalid;
|
||||
// Skip flattening lexer to distinguish from full ruff_python_parser
|
||||
pub mod lexer;
|
||||
mod parser;
|
||||
mod soft_keywords;
|
||||
|
|
|
@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = {
|
|||
StringLiteral: StringType = {
|
||||
<location:@L> <string:string> <end_location:@R> =>? {
|
||||
let (source, kind, triple_quoted) = string;
|
||||
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
|
||||
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = {
|
|||
FStringReplacementField,
|
||||
<location:@L> <fstring_middle:fstring_middle> <end_location:@R> =>? {
|
||||
let (source, is_raw, _) = fstring_middle;
|
||||
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
|
||||
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
// auto-generated: "lalrpop 0.20.0"
|
||||
// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c
|
||||
// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d
|
||||
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
||||
use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
|
||||
use crate::{
|
||||
|
@ -36369,7 +36369,7 @@ fn __action217<
|
|||
{
|
||||
{
|
||||
let (source, kind, triple_quoted) = string;
|
||||
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
|
||||
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -36419,7 +36419,7 @@ fn __action220<
|
|||
{
|
||||
{
|
||||
let (source, is_raw, _) = fstring_middle;
|
||||
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
|
||||
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
//! Parsing of string literals, bytes literals, and implicit string concatenation.
|
||||
|
||||
use bstr::ByteSlice;
|
||||
|
||||
use ruff_python_ast::{self as ast, Expr};
|
||||
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
||||
use ruff_text_size::{Ranged, TextRange, TextSize};
|
||||
|
||||
use crate::lexer::{LexicalError, LexicalErrorType};
|
||||
use crate::token::{StringKind, Tok};
|
||||
|
@ -32,34 +34,40 @@ impl From<StringType> for Expr {
|
|||
}
|
||||
}
|
||||
|
||||
struct StringParser<'a> {
|
||||
rest: &'a str,
|
||||
enum EscapedChar {
|
||||
Literal(char),
|
||||
Escape(char),
|
||||
}
|
||||
|
||||
struct StringParser {
|
||||
source: Box<str>,
|
||||
cursor: usize,
|
||||
kind: StringKind,
|
||||
location: TextSize,
|
||||
offset: TextSize,
|
||||
range: TextRange,
|
||||
}
|
||||
|
||||
impl<'a> StringParser<'a> {
|
||||
fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self {
|
||||
impl StringParser {
|
||||
fn new(source: Box<str>, kind: StringKind, offset: TextSize, range: TextRange) -> Self {
|
||||
Self {
|
||||
rest: source,
|
||||
source,
|
||||
cursor: 0,
|
||||
kind,
|
||||
location: start,
|
||||
offset,
|
||||
range,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn skip_bytes(&mut self, bytes: usize) -> &'a str {
|
||||
let skipped_str = &self.rest[..bytes];
|
||||
self.rest = &self.rest[bytes..];
|
||||
self.location += skipped_str.text_len();
|
||||
fn skip_bytes(&mut self, bytes: usize) -> &str {
|
||||
let skipped_str = &self.source[self.cursor..self.cursor + bytes];
|
||||
self.cursor += bytes;
|
||||
skipped_str
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_pos(&self) -> TextSize {
|
||||
self.location
|
||||
self.offset + TextSize::try_from(self.cursor).unwrap()
|
||||
}
|
||||
|
||||
/// Returns the next byte in the string, if there is one.
|
||||
|
@ -69,25 +77,23 @@ impl<'a> StringParser<'a> {
|
|||
/// When the next byte is a part of a multi-byte character.
|
||||
#[inline]
|
||||
fn next_byte(&mut self) -> Option<u8> {
|
||||
self.rest.as_bytes().first().map(|&byte| {
|
||||
self.rest = &self.rest[1..];
|
||||
self.location += TextSize::new(1);
|
||||
self.source[self.cursor..].as_bytes().first().map(|&byte| {
|
||||
self.cursor += 1;
|
||||
byte
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_char(&mut self) -> Option<char> {
|
||||
self.rest.chars().next().map(|c| {
|
||||
self.rest = &self.rest[c.len_utf8()..];
|
||||
self.location += c.text_len();
|
||||
self.source[self.cursor..].chars().next().map(|c| {
|
||||
self.cursor += c.len_utf8();
|
||||
c
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn peek_byte(&self) -> Option<u8> {
|
||||
self.rest.as_bytes().first().copied()
|
||||
self.source[self.cursor..].as_bytes().first().copied()
|
||||
}
|
||||
|
||||
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
|
||||
|
@ -135,7 +141,7 @@ impl<'a> StringParser<'a> {
|
|||
};
|
||||
|
||||
let start_pos = self.get_pos();
|
||||
let Some(close_idx) = self.rest.find('}') else {
|
||||
let Some(close_idx) = self.source[self.cursor..].find('}') else {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::StringError,
|
||||
self.get_pos(),
|
||||
|
@ -149,7 +155,8 @@ impl<'a> StringParser<'a> {
|
|||
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
|
||||
}
|
||||
|
||||
fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
|
||||
/// Parse an escaped character, returning the new character.
|
||||
fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
|
||||
let Some(first_char) = self.next_char() else {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::StringError,
|
||||
|
@ -174,13 +181,13 @@ impl<'a> StringParser<'a> {
|
|||
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
|
||||
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
|
||||
// Special cases where the escape sequence is not a single character
|
||||
'\n' => return Ok(()),
|
||||
'\n' => return Ok(None),
|
||||
'\r' => {
|
||||
if self.peek_byte() == Some(b'\n') {
|
||||
self.next_byte();
|
||||
}
|
||||
|
||||
return Ok(());
|
||||
return Ok(None);
|
||||
}
|
||||
_ => {
|
||||
if self.kind.is_any_bytes() && !first_char.is_ascii() {
|
||||
|
@ -194,21 +201,42 @@ impl<'a> StringParser<'a> {
|
|||
));
|
||||
}
|
||||
|
||||
string.push('\\');
|
||||
|
||||
first_char
|
||||
return Ok(Some(EscapedChar::Escape(first_char)));
|
||||
}
|
||||
};
|
||||
|
||||
string.push(new_char);
|
||||
|
||||
Ok(())
|
||||
Ok(Some(EscapedChar::Literal(new_char)))
|
||||
}
|
||||
|
||||
fn parse_fstring_middle(&mut self) -> Result<ast::FStringElement, LexicalError> {
|
||||
let mut value = String::with_capacity(self.rest.len());
|
||||
while let Some(ch) = self.next_char() {
|
||||
match ch {
|
||||
fn parse_fstring_middle(mut self) -> Result<ast::FStringElement, LexicalError> {
|
||||
// Fast-path: if the f-string doesn't contain any escape sequences, return the literal.
|
||||
let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else {
|
||||
return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||
value: self.source,
|
||||
range: self.range,
|
||||
}));
|
||||
};
|
||||
|
||||
let mut value = String::with_capacity(self.source.len());
|
||||
loop {
|
||||
// Add the characters before the escape sequence (or curly brace) to the string.
|
||||
let before_with_slash_or_brace = self.skip_bytes(index + 1);
|
||||
let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
|
||||
value.push_str(before);
|
||||
|
||||
// Add the escaped character to the string.
|
||||
match &self.source.as_bytes()[self.cursor - 1] {
|
||||
// If there are any curly braces inside a `FStringMiddle` token,
|
||||
// then they were escaped (i.e. `{{` or `}}`). This means that
|
||||
// we need to increase the location by 2 instead of 1.
|
||||
b'{' => {
|
||||
self.offset += TextSize::from(1);
|
||||
value.push('{');
|
||||
}
|
||||
b'}' => {
|
||||
self.offset += TextSize::from(1);
|
||||
value.push('}');
|
||||
}
|
||||
// We can encounter a `\` as the last character in a `FStringMiddle`
|
||||
// token which is valid in this context. For example,
|
||||
//
|
||||
|
@ -229,71 +257,152 @@ impl<'a> StringParser<'a> {
|
|||
// This is still an invalid escape sequence, but we don't want to
|
||||
// raise a syntax error as is done by the CPython parser. It might
|
||||
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
|
||||
'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
|
||||
self.parse_escaped_char(&mut value)?;
|
||||
}
|
||||
// If there are any curly braces inside a `FStringMiddle` token,
|
||||
// then they were escaped (i.e. `{{` or `}}`). This means that
|
||||
// we need to increase the location by 2 instead of 1.
|
||||
ch @ ('{' | '}') => {
|
||||
self.location += ch.text_len();
|
||||
value.push(ch);
|
||||
}
|
||||
ch => value.push(ch),
|
||||
}
|
||||
}
|
||||
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||
value,
|
||||
range: self.range,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_bytes(&mut self) -> Result<StringType, LexicalError> {
|
||||
let mut content = String::with_capacity(self.rest.len());
|
||||
while let Some(ch) = self.next_char() {
|
||||
match ch {
|
||||
'\\' if !self.kind.is_raw() => {
|
||||
self.parse_escaped_char(&mut content)?;
|
||||
b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
|
||||
match self.parse_escaped_char()? {
|
||||
None => {}
|
||||
Some(EscapedChar::Literal(c)) => value.push(c),
|
||||
Some(EscapedChar::Escape(c)) => {
|
||||
value.push('\\');
|
||||
value.push(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
ch => {
|
||||
if !ch.is_ascii() {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::OtherError(
|
||||
"bytes can only contain ASCII literal characters"
|
||||
.to_string()
|
||||
.into_boxed_str(),
|
||||
),
|
||||
self.get_pos(),
|
||||
));
|
||||
}
|
||||
content.push(ch);
|
||||
value.push(char::from(*ch));
|
||||
}
|
||||
}
|
||||
|
||||
let Some(next_index) =
|
||||
memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes())
|
||||
else {
|
||||
// Add the rest of the string to the value.
|
||||
let rest = &self.source[self.cursor..];
|
||||
value.push_str(rest);
|
||||
break;
|
||||
};
|
||||
|
||||
index = next_index;
|
||||
}
|
||||
Ok(StringType::Bytes(ast::BytesLiteral {
|
||||
value: content.chars().map(|c| c as u8).collect::<Vec<u8>>(),
|
||||
|
||||
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||
value: value.into_boxed_str(),
|
||||
range: self.range,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_string(&mut self) -> Result<StringType, LexicalError> {
|
||||
let mut value = String::with_capacity(self.rest.len());
|
||||
if self.kind.is_raw() {
|
||||
value.push_str(self.skip_bytes(self.rest.len()));
|
||||
} else {
|
||||
loop {
|
||||
let Some(escape_idx) = self.rest.find('\\') else {
|
||||
value.push_str(self.skip_bytes(self.rest.len()));
|
||||
break;
|
||||
};
|
||||
|
||||
let before_with_slash = self.skip_bytes(escape_idx + 1);
|
||||
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||
|
||||
value.push_str(before);
|
||||
self.parse_escaped_char(&mut value)?;
|
||||
}
|
||||
fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
|
||||
if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::OtherError(
|
||||
"bytes can only contain ASCII literal characters"
|
||||
.to_string()
|
||||
.into_boxed_str(),
|
||||
),
|
||||
self.offset + TextSize::try_from(index).unwrap(),
|
||||
));
|
||||
}
|
||||
|
||||
if self.kind.is_raw() {
|
||||
// For raw strings, no escaping is necessary.
|
||||
return Ok(StringType::Bytes(ast::BytesLiteral {
|
||||
value: self.source.into_boxed_bytes(),
|
||||
range: self.range,
|
||||
}));
|
||||
}
|
||||
|
||||
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
|
||||
// If the string doesn't contain any escape sequences, return the owned string.
|
||||
return Ok(StringType::Bytes(ast::BytesLiteral {
|
||||
value: self.source.into_boxed_bytes(),
|
||||
range: self.range,
|
||||
}));
|
||||
};
|
||||
|
||||
// If the string contains escape sequences, we need to parse them.
|
||||
let mut value = Vec::with_capacity(self.source.len());
|
||||
loop {
|
||||
// Add the characters before the escape sequence to the string.
|
||||
let before_with_slash = self.skip_bytes(escape + 1);
|
||||
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||
value.extend_from_slice(before.as_bytes());
|
||||
|
||||
// Add the escaped character to the string.
|
||||
match self.parse_escaped_char()? {
|
||||
None => {}
|
||||
Some(EscapedChar::Literal(c)) => value.push(c as u8),
|
||||
Some(EscapedChar::Escape(c)) => {
|
||||
value.push(b'\\');
|
||||
value.push(c as u8);
|
||||
}
|
||||
}
|
||||
|
||||
let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes())
|
||||
else {
|
||||
// Add the rest of the string to the value.
|
||||
let rest = &self.source[self.cursor..];
|
||||
value.extend_from_slice(rest.as_bytes());
|
||||
break;
|
||||
};
|
||||
|
||||
// Update the position of the next escape sequence.
|
||||
escape = next_escape;
|
||||
}
|
||||
|
||||
Ok(StringType::Bytes(ast::BytesLiteral {
|
||||
value: value.into_boxed_slice(),
|
||||
range: self.range,
|
||||
}))
|
||||
}
|
||||
|
||||
fn parse_string(mut self) -> Result<StringType, LexicalError> {
|
||||
if self.kind.is_raw() {
|
||||
// For raw strings, no escaping is necessary.
|
||||
return Ok(StringType::Str(ast::StringLiteral {
|
||||
value: self.source,
|
||||
unicode: self.kind.is_unicode(),
|
||||
range: self.range,
|
||||
}));
|
||||
}
|
||||
|
||||
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
|
||||
// If the string doesn't contain any escape sequences, return the owned string.
|
||||
return Ok(StringType::Str(ast::StringLiteral {
|
||||
value: self.source,
|
||||
unicode: self.kind.is_unicode(),
|
||||
range: self.range,
|
||||
}));
|
||||
};
|
||||
|
||||
// If the string contains escape sequences, we need to parse them.
|
||||
let mut value = String::with_capacity(self.source.len());
|
||||
|
||||
loop {
|
||||
// Add the characters before the escape sequence to the string.
|
||||
let before_with_slash = self.skip_bytes(escape + 1);
|
||||
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||
value.push_str(before);
|
||||
|
||||
// Add the escaped character to the string.
|
||||
match self.parse_escaped_char()? {
|
||||
None => {}
|
||||
Some(EscapedChar::Literal(c)) => value.push(c),
|
||||
Some(EscapedChar::Escape(c)) => {
|
||||
value.push('\\');
|
||||
value.push(c);
|
||||
}
|
||||
}
|
||||
|
||||
let Some(next_escape) = self.source[self.cursor..].find('\\') else {
|
||||
// Add the rest of the string to the value.
|
||||
let rest = &self.source[self.cursor..];
|
||||
value.push_str(rest);
|
||||
break;
|
||||
};
|
||||
|
||||
// Update the position of the next escape sequence.
|
||||
escape = next_escape;
|
||||
}
|
||||
|
||||
Ok(StringType::Str(ast::StringLiteral {
|
||||
value: value.into_boxed_str(),
|
||||
unicode: self.kind.is_unicode(),
|
||||
|
@ -301,7 +410,7 @@ impl<'a> StringParser<'a> {
|
|||
}))
|
||||
}
|
||||
|
||||
fn parse(&mut self) -> Result<StringType, LexicalError> {
|
||||
fn parse(self) -> Result<StringType, LexicalError> {
|
||||
if self.kind.is_any_bytes() {
|
||||
self.parse_bytes()
|
||||
} else {
|
||||
|
@ -311,7 +420,7 @@ impl<'a> StringParser<'a> {
|
|||
}
|
||||
|
||||
pub(crate) fn parse_string_literal(
|
||||
source: &str,
|
||||
source: Box<str>,
|
||||
kind: StringKind,
|
||||
triple_quoted: bool,
|
||||
range: TextRange,
|
||||
|
@ -327,7 +436,7 @@ pub(crate) fn parse_string_literal(
|
|||
}
|
||||
|
||||
pub(crate) fn parse_fstring_literal_element(
|
||||
source: &str,
|
||||
source: Box<str>,
|
||||
is_raw: bool,
|
||||
range: TextRange,
|
||||
) -> Result<ast::FStringElement, LexicalError> {
|
||||
|
@ -360,7 +469,7 @@ pub(crate) fn concatenated_strings(
|
|||
if has_bytes && byte_literal_count < strings.len() {
|
||||
return Err(LexicalError::new(
|
||||
LexicalErrorType::OtherError(
|
||||
"cannot mix bytes and nonbytes literals"
|
||||
"cannot mix bytes and non-bytes literals"
|
||||
.to_string()
|
||||
.into_boxed_str(),
|
||||
),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue