mirror of
https://github.com/astral-sh/ruff.git
synced 2025-08-31 07:37:38 +00:00
Remove unnecessary string cloning from the parser (#9884)
Closes https://github.com/astral-sh/ruff/issues/9869.
This commit is contained in:
parent
7ca515c0aa
commit
6f0e4ad332
11 changed files with 227 additions and 119 deletions
13
Cargo.lock
generated
13
Cargo.lock
generated
|
@ -217,12 +217,12 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bstr"
|
name = "bstr"
|
||||||
version = "1.6.2"
|
version = "1.9.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a"
|
checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"memchr",
|
"memchr",
|
||||||
"regex-automata 0.3.9",
|
"regex-automata 0.4.3",
|
||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -1921,12 +1921,6 @@ dependencies = [
|
||||||
"regex-syntax 0.6.29",
|
"regex-syntax 0.6.29",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "regex-automata"
|
|
||||||
version = "0.3.9"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex-automata"
|
name = "regex-automata"
|
||||||
version = "0.4.3"
|
version = "0.4.3"
|
||||||
|
@ -2342,6 +2336,7 @@ version = "0.0.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bitflags 2.4.1",
|
"bitflags 2.4.1",
|
||||||
|
"bstr",
|
||||||
"insta",
|
"insta",
|
||||||
"is-macro",
|
"is-macro",
|
||||||
"itertools 0.12.1",
|
"itertools 0.12.1",
|
||||||
|
|
|
@ -19,6 +19,7 @@ argfile = { version = "0.1.6" }
|
||||||
assert_cmd = { version = "2.0.13" }
|
assert_cmd = { version = "2.0.13" }
|
||||||
bincode = { version = "1.3.3" }
|
bincode = { version = "1.3.3" }
|
||||||
bitflags = { version = "2.4.1" }
|
bitflags = { version = "2.4.1" }
|
||||||
|
bstr = { version = "1.9.0" }
|
||||||
cachedir = { version = "0.3.1" }
|
cachedir = { version = "0.3.1" }
|
||||||
chrono = { version = "0.4.33", default-features = false, features = ["clock"] }
|
chrono = { version = "0.4.33", default-features = false, features = ["clock"] }
|
||||||
clap = { version = "4.4.18", features = ["derive"] }
|
clap = { version = "4.4.18", features = ["derive"] }
|
||||||
|
|
|
@ -40,7 +40,9 @@ impl Violation for HardcodedBindAllInterfaces {
|
||||||
pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) {
|
pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) {
|
||||||
let is_bind_all_interface = match string {
|
let is_bind_all_interface = match string {
|
||||||
StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0",
|
StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0",
|
||||||
StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => value == "0.0.0.0",
|
StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => {
|
||||||
|
&**value == "0.0.0.0"
|
||||||
|
}
|
||||||
StringLike::BytesLiteral(_) => return,
|
StringLike::BytesLiteral(_) => return,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ fn to_f_string_expression_element(inner: &Expr) -> ast::FStringElement {
|
||||||
/// Convert a string to a [`ast::FStringElement::Literal`].
|
/// Convert a string to a [`ast::FStringElement::Literal`].
|
||||||
pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement {
|
pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement {
|
||||||
ast::FStringElement::Literal(ast::FStringLiteralElement {
|
ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||||
value: s.to_owned(),
|
value: s.to_string().into_boxed_str(),
|
||||||
range: TextRange::default(),
|
range: TextRange::default(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -53,7 +53,7 @@ pub(super) fn to_f_string_element(expr: &Expr) -> Option<ast::FStringElement> {
|
||||||
match expr {
|
match expr {
|
||||||
Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => {
|
Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => {
|
||||||
Some(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
Some(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||||
value: value.to_string(),
|
value: value.to_string().into_boxed_str(),
|
||||||
range: *range,
|
range: *range,
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
|
@ -644,7 +644,7 @@ pub struct ComparableBytesLiteral<'a> {
|
||||||
impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> {
|
impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> {
|
||||||
fn from(bytes_literal: &'a ast::BytesLiteral) -> Self {
|
fn from(bytes_literal: &'a ast::BytesLiteral) -> Self {
|
||||||
Self {
|
Self {
|
||||||
value: bytes_literal.value.as_slice(),
|
value: &bytes_literal.value,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -949,7 +949,7 @@ impl Ranged for FStringExpressionElement {
|
||||||
#[derive(Clone, Debug, PartialEq)]
|
#[derive(Clone, Debug, PartialEq)]
|
||||||
pub struct FStringLiteralElement {
|
pub struct FStringLiteralElement {
|
||||||
pub range: TextRange,
|
pub range: TextRange,
|
||||||
pub value: String,
|
pub value: Box<str>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Ranged for FStringLiteralElement {
|
impl Ranged for FStringLiteralElement {
|
||||||
|
@ -962,7 +962,7 @@ impl Deref for FStringLiteralElement {
|
||||||
type Target = str;
|
type Target = str;
|
||||||
|
|
||||||
fn deref(&self) -> &Self::Target {
|
fn deref(&self) -> &Self::Target {
|
||||||
self.value.as_str()
|
&self.value
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1607,7 +1607,7 @@ impl Default for BytesLiteralValueInner {
|
||||||
#[derive(Clone, Debug, Default, PartialEq)]
|
#[derive(Clone, Debug, Default, PartialEq)]
|
||||||
pub struct BytesLiteral {
|
pub struct BytesLiteral {
|
||||||
pub range: TextRange,
|
pub range: TextRange,
|
||||||
pub value: Vec<u8>,
|
pub value: Box<[u8]>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Ranged for BytesLiteral {
|
impl Ranged for BytesLiteral {
|
||||||
|
@ -1620,7 +1620,7 @@ impl Deref for BytesLiteral {
|
||||||
type Target = [u8];
|
type Target = [u8];
|
||||||
|
|
||||||
fn deref(&self) -> &Self::Target {
|
fn deref(&self) -> &Self::Target {
|
||||||
self.value.as_slice()
|
&self.value
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" }
|
||||||
|
|
||||||
anyhow = { workspace = true }
|
anyhow = { workspace = true }
|
||||||
bitflags = { workspace = true }
|
bitflags = { workspace = true }
|
||||||
|
bstr = { workspace = true }
|
||||||
is-macro = { workspace = true }
|
is-macro = { workspace = true }
|
||||||
itertools = { workspace = true }
|
itertools = { workspace = true }
|
||||||
lalrpop-util = { workspace = true, default-features = false }
|
lalrpop-util = { workspace = true, default-features = false }
|
||||||
memchr = { workspace = true }
|
memchr = { workspace = true }
|
||||||
unicode-ident = { workspace = true }
|
|
||||||
unicode_names2 = { workspace = true }
|
|
||||||
rustc-hash = { workspace = true }
|
rustc-hash = { workspace = true }
|
||||||
static_assertions = { workspace = true }
|
static_assertions = { workspace = true }
|
||||||
|
unicode-ident = { workspace = true }
|
||||||
|
unicode_names2 = { workspace = true }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
insta = { workspace = true }
|
insta = { workspace = true }
|
||||||
|
|
|
@ -119,10 +119,10 @@ pub use token::{StringKind, Tok, TokenKind};
|
||||||
|
|
||||||
use crate::lexer::LexResult;
|
use crate::lexer::LexResult;
|
||||||
|
|
||||||
mod function;
|
|
||||||
// Skip flattening lexer to distinguish from full ruff_python_parser
|
|
||||||
mod context;
|
mod context;
|
||||||
|
mod function;
|
||||||
mod invalid;
|
mod invalid;
|
||||||
|
// Skip flattening lexer to distinguish from full ruff_python_parser
|
||||||
pub mod lexer;
|
pub mod lexer;
|
||||||
mod parser;
|
mod parser;
|
||||||
mod soft_keywords;
|
mod soft_keywords;
|
||||||
|
|
|
@ -1616,7 +1616,7 @@ StringLiteralOrFString: StringType = {
|
||||||
StringLiteral: StringType = {
|
StringLiteral: StringType = {
|
||||||
<location:@L> <string:string> <end_location:@R> =>? {
|
<location:@L> <string:string> <end_location:@R> =>? {
|
||||||
let (source, kind, triple_quoted) = string;
|
let (source, kind, triple_quoted) = string;
|
||||||
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
|
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1633,7 +1633,7 @@ FStringMiddlePattern: ast::FStringElement = {
|
||||||
FStringReplacementField,
|
FStringReplacementField,
|
||||||
<location:@L> <fstring_middle:fstring_middle> <end_location:@R> =>? {
|
<location:@L> <fstring_middle:fstring_middle> <end_location:@R> =>? {
|
||||||
let (source, is_raw, _) = fstring_middle;
|
let (source, is_raw, _) = fstring_middle;
|
||||||
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
|
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
// auto-generated: "lalrpop 0.20.0"
|
// auto-generated: "lalrpop 0.20.0"
|
||||||
// sha3: 02c60b5c591440061dda68775005d87a203b5448c205120bda1566a62fc2147c
|
// sha3: d38cc0f2252a58db42d3bd63a102b537865992b3cf51d402cdb4828f48989c9d
|
||||||
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
||||||
use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
|
use ruff_python_ast::{self as ast, Int, IpyEscapeKind};
|
||||||
use crate::{
|
use crate::{
|
||||||
|
@ -36369,7 +36369,7 @@ fn __action217<
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
let (source, kind, triple_quoted) = string;
|
let (source, kind, triple_quoted) = string;
|
||||||
Ok(parse_string_literal(&source, kind, triple_quoted, (location..end_location).into())?)
|
Ok(parse_string_literal(source, kind, triple_quoted, (location..end_location).into())?)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -36419,7 +36419,7 @@ fn __action220<
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
let (source, is_raw, _) = fstring_middle;
|
let (source, is_raw, _) = fstring_middle;
|
||||||
Ok(parse_fstring_literal_element(&source, is_raw, (location..end_location).into())?)
|
Ok(parse_fstring_literal_element(source, is_raw, (location..end_location).into())?)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
//! Parsing of string literals, bytes literals, and implicit string concatenation.
|
//! Parsing of string literals, bytes literals, and implicit string concatenation.
|
||||||
|
|
||||||
|
use bstr::ByteSlice;
|
||||||
|
|
||||||
use ruff_python_ast::{self as ast, Expr};
|
use ruff_python_ast::{self as ast, Expr};
|
||||||
use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
|
use ruff_text_size::{Ranged, TextRange, TextSize};
|
||||||
|
|
||||||
use crate::lexer::{LexicalError, LexicalErrorType};
|
use crate::lexer::{LexicalError, LexicalErrorType};
|
||||||
use crate::token::{StringKind, Tok};
|
use crate::token::{StringKind, Tok};
|
||||||
|
@ -32,34 +34,40 @@ impl From<StringType> for Expr {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct StringParser<'a> {
|
enum EscapedChar {
|
||||||
rest: &'a str,
|
Literal(char),
|
||||||
|
Escape(char),
|
||||||
|
}
|
||||||
|
|
||||||
|
struct StringParser {
|
||||||
|
source: Box<str>,
|
||||||
|
cursor: usize,
|
||||||
kind: StringKind,
|
kind: StringKind,
|
||||||
location: TextSize,
|
offset: TextSize,
|
||||||
range: TextRange,
|
range: TextRange,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> StringParser<'a> {
|
impl StringParser {
|
||||||
fn new(source: &'a str, kind: StringKind, start: TextSize, range: TextRange) -> Self {
|
fn new(source: Box<str>, kind: StringKind, offset: TextSize, range: TextRange) -> Self {
|
||||||
Self {
|
Self {
|
||||||
rest: source,
|
source,
|
||||||
|
cursor: 0,
|
||||||
kind,
|
kind,
|
||||||
location: start,
|
offset,
|
||||||
range,
|
range,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn skip_bytes(&mut self, bytes: usize) -> &'a str {
|
fn skip_bytes(&mut self, bytes: usize) -> &str {
|
||||||
let skipped_str = &self.rest[..bytes];
|
let skipped_str = &self.source[self.cursor..self.cursor + bytes];
|
||||||
self.rest = &self.rest[bytes..];
|
self.cursor += bytes;
|
||||||
self.location += skipped_str.text_len();
|
|
||||||
skipped_str
|
skipped_str
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn get_pos(&self) -> TextSize {
|
fn get_pos(&self) -> TextSize {
|
||||||
self.location
|
self.offset + TextSize::try_from(self.cursor).unwrap()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the next byte in the string, if there is one.
|
/// Returns the next byte in the string, if there is one.
|
||||||
|
@ -69,25 +77,23 @@ impl<'a> StringParser<'a> {
|
||||||
/// When the next byte is a part of a multi-byte character.
|
/// When the next byte is a part of a multi-byte character.
|
||||||
#[inline]
|
#[inline]
|
||||||
fn next_byte(&mut self) -> Option<u8> {
|
fn next_byte(&mut self) -> Option<u8> {
|
||||||
self.rest.as_bytes().first().map(|&byte| {
|
self.source[self.cursor..].as_bytes().first().map(|&byte| {
|
||||||
self.rest = &self.rest[1..];
|
self.cursor += 1;
|
||||||
self.location += TextSize::new(1);
|
|
||||||
byte
|
byte
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn next_char(&mut self) -> Option<char> {
|
fn next_char(&mut self) -> Option<char> {
|
||||||
self.rest.chars().next().map(|c| {
|
self.source[self.cursor..].chars().next().map(|c| {
|
||||||
self.rest = &self.rest[c.len_utf8()..];
|
self.cursor += c.len_utf8();
|
||||||
self.location += c.text_len();
|
|
||||||
c
|
c
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn peek_byte(&self) -> Option<u8> {
|
fn peek_byte(&self) -> Option<u8> {
|
||||||
self.rest.as_bytes().first().copied()
|
self.source[self.cursor..].as_bytes().first().copied()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
|
fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
|
||||||
|
@ -135,7 +141,7 @@ impl<'a> StringParser<'a> {
|
||||||
};
|
};
|
||||||
|
|
||||||
let start_pos = self.get_pos();
|
let start_pos = self.get_pos();
|
||||||
let Some(close_idx) = self.rest.find('}') else {
|
let Some(close_idx) = self.source[self.cursor..].find('}') else {
|
||||||
return Err(LexicalError::new(
|
return Err(LexicalError::new(
|
||||||
LexicalErrorType::StringError,
|
LexicalErrorType::StringError,
|
||||||
self.get_pos(),
|
self.get_pos(),
|
||||||
|
@ -149,7 +155,8 @@ impl<'a> StringParser<'a> {
|
||||||
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
|
.ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
|
/// Parse an escaped character, returning the new character.
|
||||||
|
fn parse_escaped_char(&mut self) -> Result<Option<EscapedChar>, LexicalError> {
|
||||||
let Some(first_char) = self.next_char() else {
|
let Some(first_char) = self.next_char() else {
|
||||||
return Err(LexicalError::new(
|
return Err(LexicalError::new(
|
||||||
LexicalErrorType::StringError,
|
LexicalErrorType::StringError,
|
||||||
|
@ -174,13 +181,13 @@ impl<'a> StringParser<'a> {
|
||||||
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
|
'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
|
||||||
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
|
'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
|
||||||
// Special cases where the escape sequence is not a single character
|
// Special cases where the escape sequence is not a single character
|
||||||
'\n' => return Ok(()),
|
'\n' => return Ok(None),
|
||||||
'\r' => {
|
'\r' => {
|
||||||
if self.peek_byte() == Some(b'\n') {
|
if self.peek_byte() == Some(b'\n') {
|
||||||
self.next_byte();
|
self.next_byte();
|
||||||
}
|
}
|
||||||
|
|
||||||
return Ok(());
|
return Ok(None);
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
if self.kind.is_any_bytes() && !first_char.is_ascii() {
|
if self.kind.is_any_bytes() && !first_char.is_ascii() {
|
||||||
|
@ -194,21 +201,42 @@ impl<'a> StringParser<'a> {
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
string.push('\\');
|
return Ok(Some(EscapedChar::Escape(first_char)));
|
||||||
|
|
||||||
first_char
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
string.push(new_char);
|
Ok(Some(EscapedChar::Literal(new_char)))
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_fstring_middle(&mut self) -> Result<ast::FStringElement, LexicalError> {
|
fn parse_fstring_middle(mut self) -> Result<ast::FStringElement, LexicalError> {
|
||||||
let mut value = String::with_capacity(self.rest.len());
|
// Fast-path: if the f-string doesn't contain any escape sequences, return the literal.
|
||||||
while let Some(ch) = self.next_char() {
|
let Some(mut index) = memchr::memchr3(b'{', b'}', b'\\', self.source.as_bytes()) else {
|
||||||
match ch {
|
return Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||||
|
value: self.source,
|
||||||
|
range: self.range,
|
||||||
|
}));
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut value = String::with_capacity(self.source.len());
|
||||||
|
loop {
|
||||||
|
// Add the characters before the escape sequence (or curly brace) to the string.
|
||||||
|
let before_with_slash_or_brace = self.skip_bytes(index + 1);
|
||||||
|
let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
|
||||||
|
value.push_str(before);
|
||||||
|
|
||||||
|
// Add the escaped character to the string.
|
||||||
|
match &self.source.as_bytes()[self.cursor - 1] {
|
||||||
|
// If there are any curly braces inside a `FStringMiddle` token,
|
||||||
|
// then they were escaped (i.e. `{{` or `}}`). This means that
|
||||||
|
// we need increase the location by 2 instead of 1.
|
||||||
|
b'{' => {
|
||||||
|
self.offset += TextSize::from(1);
|
||||||
|
value.push('{');
|
||||||
|
}
|
||||||
|
b'}' => {
|
||||||
|
self.offset += TextSize::from(1);
|
||||||
|
value.push('}');
|
||||||
|
}
|
||||||
// We can encounter a `\` as the last character in a `FStringMiddle`
|
// We can encounter a `\` as the last character in a `FStringMiddle`
|
||||||
// token which is valid in this context. For example,
|
// token which is valid in this context. For example,
|
||||||
//
|
//
|
||||||
|
@ -229,71 +257,152 @@ impl<'a> StringParser<'a> {
|
||||||
// This is still an invalid escape sequence, but we don't want to
|
// This is still an invalid escape sequence, but we don't want to
|
||||||
// raise a syntax error as is done by the CPython parser. It might
|
// raise a syntax error as is done by the CPython parser. It might
|
||||||
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
|
// be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
|
||||||
'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
|
b'\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
|
||||||
self.parse_escaped_char(&mut value)?;
|
match self.parse_escaped_char()? {
|
||||||
}
|
None => {}
|
||||||
// If there are any curly braces inside a `FStringMiddle` token,
|
Some(EscapedChar::Literal(c)) => value.push(c),
|
||||||
// then they were escaped (i.e. `{{` or `}}`). This means that
|
Some(EscapedChar::Escape(c)) => {
|
||||||
// we need increase the location by 2 instead of 1.
|
value.push('\\');
|
||||||
ch @ ('{' | '}') => {
|
value.push(c);
|
||||||
self.location += ch.text_len();
|
}
|
||||||
value.push(ch);
|
}
|
||||||
}
|
|
||||||
ch => value.push(ch),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
|
||||||
value,
|
|
||||||
range: self.range,
|
|
||||||
}))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_bytes(&mut self) -> Result<StringType, LexicalError> {
|
|
||||||
let mut content = String::with_capacity(self.rest.len());
|
|
||||||
while let Some(ch) = self.next_char() {
|
|
||||||
match ch {
|
|
||||||
'\\' if !self.kind.is_raw() => {
|
|
||||||
self.parse_escaped_char(&mut content)?;
|
|
||||||
}
|
}
|
||||||
ch => {
|
ch => {
|
||||||
if !ch.is_ascii() {
|
value.push(char::from(*ch));
|
||||||
return Err(LexicalError::new(
|
|
||||||
LexicalErrorType::OtherError(
|
|
||||||
"bytes can only contain ASCII literal characters"
|
|
||||||
.to_string()
|
|
||||||
.into_boxed_str(),
|
|
||||||
),
|
|
||||||
self.get_pos(),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
content.push(ch);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let Some(next_index) =
|
||||||
|
memchr::memchr3(b'{', b'}', b'\\', self.source[self.cursor..].as_bytes())
|
||||||
|
else {
|
||||||
|
// Add the rest of the string to the value.
|
||||||
|
let rest = &self.source[self.cursor..];
|
||||||
|
value.push_str(rest);
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
index = next_index;
|
||||||
}
|
}
|
||||||
Ok(StringType::Bytes(ast::BytesLiteral {
|
|
||||||
value: content.chars().map(|c| c as u8).collect::<Vec<u8>>(),
|
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
|
||||||
|
value: value.into_boxed_str(),
|
||||||
range: self.range,
|
range: self.range,
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_string(&mut self) -> Result<StringType, LexicalError> {
|
fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
|
||||||
let mut value = String::with_capacity(self.rest.len());
|
if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
|
||||||
if self.kind.is_raw() {
|
return Err(LexicalError::new(
|
||||||
value.push_str(self.skip_bytes(self.rest.len()));
|
LexicalErrorType::OtherError(
|
||||||
} else {
|
"bytes can only contain ASCII literal characters"
|
||||||
loop {
|
.to_string()
|
||||||
let Some(escape_idx) = self.rest.find('\\') else {
|
.into_boxed_str(),
|
||||||
value.push_str(self.skip_bytes(self.rest.len()));
|
),
|
||||||
break;
|
self.offset + TextSize::try_from(index).unwrap(),
|
||||||
};
|
));
|
||||||
|
|
||||||
let before_with_slash = self.skip_bytes(escape_idx + 1);
|
|
||||||
let before = &before_with_slash[..before_with_slash.len() - 1];
|
|
||||||
|
|
||||||
value.push_str(before);
|
|
||||||
self.parse_escaped_char(&mut value)?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if self.kind.is_raw() {
|
||||||
|
// For raw strings, no escaping is necessary.
|
||||||
|
return Ok(StringType::Bytes(ast::BytesLiteral {
|
||||||
|
value: self.source.into_boxed_bytes(),
|
||||||
|
range: self.range,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
|
||||||
|
// If the string doesn't contain any escape sequences, return the owned string.
|
||||||
|
return Ok(StringType::Bytes(ast::BytesLiteral {
|
||||||
|
value: self.source.into_boxed_bytes(),
|
||||||
|
range: self.range,
|
||||||
|
}));
|
||||||
|
};
|
||||||
|
|
||||||
|
// If the string contains escape sequences, we need to parse them.
|
||||||
|
let mut value = Vec::with_capacity(self.source.len());
|
||||||
|
loop {
|
||||||
|
// Add the characters before the escape sequence to the string.
|
||||||
|
let before_with_slash = self.skip_bytes(escape + 1);
|
||||||
|
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||||
|
value.extend_from_slice(before.as_bytes());
|
||||||
|
|
||||||
|
// Add the escaped character to the string.
|
||||||
|
match self.parse_escaped_char()? {
|
||||||
|
None => {}
|
||||||
|
Some(EscapedChar::Literal(c)) => value.push(c as u8),
|
||||||
|
Some(EscapedChar::Escape(c)) => {
|
||||||
|
value.push(b'\\');
|
||||||
|
value.push(c as u8);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(next_escape) = memchr::memchr(b'\\', self.source[self.cursor..].as_bytes())
|
||||||
|
else {
|
||||||
|
// Add the rest of the string to the value.
|
||||||
|
let rest = &self.source[self.cursor..];
|
||||||
|
value.extend_from_slice(rest.as_bytes());
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Update the position of the next escape sequence.
|
||||||
|
escape = next_escape;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(StringType::Bytes(ast::BytesLiteral {
|
||||||
|
value: value.into_boxed_slice(),
|
||||||
|
range: self.range,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_string(mut self) -> Result<StringType, LexicalError> {
|
||||||
|
if self.kind.is_raw() {
|
||||||
|
// For raw strings, no escaping is necessary.
|
||||||
|
return Ok(StringType::Str(ast::StringLiteral {
|
||||||
|
value: self.source,
|
||||||
|
unicode: self.kind.is_unicode(),
|
||||||
|
range: self.range,
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
|
||||||
|
// If the string doesn't contain any escape sequences, return the owned string.
|
||||||
|
return Ok(StringType::Str(ast::StringLiteral {
|
||||||
|
value: self.source,
|
||||||
|
unicode: self.kind.is_unicode(),
|
||||||
|
range: self.range,
|
||||||
|
}));
|
||||||
|
};
|
||||||
|
|
||||||
|
// If the string contains escape sequences, we need to parse them.
|
||||||
|
let mut value = String::with_capacity(self.source.len());
|
||||||
|
|
||||||
|
loop {
|
||||||
|
// Add the characters before the escape sequence to the string.
|
||||||
|
let before_with_slash = self.skip_bytes(escape + 1);
|
||||||
|
let before = &before_with_slash[..before_with_slash.len() - 1];
|
||||||
|
value.push_str(before);
|
||||||
|
|
||||||
|
// Add the escaped character to the string.
|
||||||
|
match self.parse_escaped_char()? {
|
||||||
|
None => {}
|
||||||
|
Some(EscapedChar::Literal(c)) => value.push(c),
|
||||||
|
Some(EscapedChar::Escape(c)) => {
|
||||||
|
value.push('\\');
|
||||||
|
value.push(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(next_escape) = self.source[self.cursor..].find('\\') else {
|
||||||
|
// Add the rest of the string to the value.
|
||||||
|
let rest = &self.source[self.cursor..];
|
||||||
|
value.push_str(rest);
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Update the position of the next escape sequence.
|
||||||
|
escape = next_escape;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(StringType::Str(ast::StringLiteral {
|
Ok(StringType::Str(ast::StringLiteral {
|
||||||
value: value.into_boxed_str(),
|
value: value.into_boxed_str(),
|
||||||
unicode: self.kind.is_unicode(),
|
unicode: self.kind.is_unicode(),
|
||||||
|
@ -301,7 +410,7 @@ impl<'a> StringParser<'a> {
|
||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse(&mut self) -> Result<StringType, LexicalError> {
|
fn parse(self) -> Result<StringType, LexicalError> {
|
||||||
if self.kind.is_any_bytes() {
|
if self.kind.is_any_bytes() {
|
||||||
self.parse_bytes()
|
self.parse_bytes()
|
||||||
} else {
|
} else {
|
||||||
|
@ -311,7 +420,7 @@ impl<'a> StringParser<'a> {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn parse_string_literal(
|
pub(crate) fn parse_string_literal(
|
||||||
source: &str,
|
source: Box<str>,
|
||||||
kind: StringKind,
|
kind: StringKind,
|
||||||
triple_quoted: bool,
|
triple_quoted: bool,
|
||||||
range: TextRange,
|
range: TextRange,
|
||||||
|
@ -327,7 +436,7 @@ pub(crate) fn parse_string_literal(
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn parse_fstring_literal_element(
|
pub(crate) fn parse_fstring_literal_element(
|
||||||
source: &str,
|
source: Box<str>,
|
||||||
is_raw: bool,
|
is_raw: bool,
|
||||||
range: TextRange,
|
range: TextRange,
|
||||||
) -> Result<ast::FStringElement, LexicalError> {
|
) -> Result<ast::FStringElement, LexicalError> {
|
||||||
|
@ -360,7 +469,7 @@ pub(crate) fn concatenated_strings(
|
||||||
if has_bytes && byte_literal_count < strings.len() {
|
if has_bytes && byte_literal_count < strings.len() {
|
||||||
return Err(LexicalError::new(
|
return Err(LexicalError::new(
|
||||||
LexicalErrorType::OtherError(
|
LexicalErrorType::OtherError(
|
||||||
"cannot mix bytes and nonbytes literals"
|
"cannot mix bytes and non-bytes literals"
|
||||||
.to_string()
|
.to_string()
|
||||||
.into_boxed_str(),
|
.into_boxed_str(),
|
||||||
),
|
),
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue