Mirror of https://github.com/apache/datafusion-sqlparser-rs.git, synced 2025-12-23 11:12:51 +00:00
Reduce string copies cow (#2075)
Some checks failed
Rust / codestyle (push) Has been cancelled
Rust / lint (push) Has been cancelled
Rust / benchmark-lint (push) Has been cancelled
Rust / compile (push) Has been cancelled
Rust / docs (push) Has been cancelled
Rust / test (beta) (push) Has been cancelled
Rust / compile-no-std (push) Has been cancelled
Rust / test (nightly) (push) Has been cancelled
Rust / test (stable) (push) Has been cancelled
This commit is contained in:
parent c8acf9f52d
commit b098976cab
1 changed file with 423 additions and 116 deletions
src/tokenizer.rs | 539 (+423, -116)
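What the change does, in one picture: instead of pushing characters into freshly allocated `String`s, the tokenizer records byte offsets into the source and hands back `&str` slices wrapped in `Cow`, allocating only when escape processing forces a rewrite. The sketch below is a minimal, self-contained illustration of that borrow-or-own pattern; the function name and shape are ours for illustration, not code from this commit:

```rust
use std::borrow::Cow;

/// Scan a single-quoted literal starting at `start` in `src`, where a doubled
/// quote ('') escapes a quote. Returns the literal's content, borrowing from
/// `src` when no escapes are present and allocating only when they are.
fn scan_single_quoted(src: &str, start: usize) -> Option<Cow<'_, str>> {
    let bytes = src.as_bytes();
    if bytes.get(start) != Some(&b'\'') {
        return None;
    }
    let mut i = start + 1;
    let mut has_escapes = false;
    while i < bytes.len() {
        if bytes[i] == b'\'' {
            if bytes.get(i + 1) == Some(&b'\'') {
                has_escapes = true; // doubled quote: keep scanning
                i += 2;
            } else {
                let content = &src[start + 1..i];
                return Some(if has_escapes {
                    Cow::Owned(content.replace("''", "'")) // second pass only when needed
                } else {
                    Cow::Borrowed(content) // zero-copy fast path
                });
            }
        } else {
            i += 1;
        }
    }
    None // unterminated literal
}

fn main() {
    // Fast path: no escapes, no allocation for the content itself.
    assert!(matches!(scan_single_quoted("'abc'", 0), Some(Cow::Borrowed("abc"))));
    // Slow path: the doubled quote forces one owned, unescaped copy.
    assert_eq!(scan_single_quoted("'it''s'", 0).unwrap(), "it's");
}
```

The payoff is that the common case (no escapes) does no per-string allocation at all, while the worst case pays one extra scan.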
@@ -23,12 +23,15 @@
 #[cfg(not(feature = "std"))]
 use alloc::{
-    borrow::ToOwned,
+    borrow::{Cow, ToOwned},
     format,
     string::{String, ToString},
     vec,
     vec::Vec,
 };
+#[cfg(feature = "std")]
+use std::borrow::Cow;
+
 use core::iter::Peekable;
 use core::num::NonZeroU8;
 use core::str::Chars;
@@ -934,7 +937,7 @@ impl<'a> Tokenizer<'a> {
         chars: &mut State<'a>,
     ) -> Result<Option<Token>, TokenizerError> {
         chars.next(); // consume the first char
-        let word = self.tokenize_word(consumed_byte_len, chars);
+        let word = self.tokenize_word(consumed_byte_len, chars)?;

         // TODO: implement parsing of exponent here
         if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
@@ -1008,7 +1011,7 @@ impl<'a> Tokenizer<'a> {
             }
             _ => {
                 // regular identifier starting with an "b" or "B"
-                let s = self.tokenize_word(b.len_utf8(), chars);
+                let s = self.tokenize_word(b.len_utf8(), chars)?;
                 Ok(Some(Token::make_word(&s, None)))
             }
         }
@@ -1035,7 +1038,7 @@ impl<'a> Tokenizer<'a> {
             ),
             _ => {
                 // regular identifier starting with an "r" or "R"
-                let s = self.tokenize_word(b.len_utf8(), chars);
+                let s = self.tokenize_word(b.len_utf8(), chars)?;
                 Ok(Some(Token::make_word(&s, None)))
             }
         }
@@ -1054,7 +1057,7 @@ impl<'a> Tokenizer<'a> {
             }
             _ => {
                 // regular identifier starting with an "N"
-                let s = self.tokenize_word(n.len_utf8(), chars);
+                let s = self.tokenize_word(n.len_utf8(), chars)?;
                 Ok(Some(Token::make_word(&s, None)))
             }
         }
@@ -1071,7 +1074,7 @@ impl<'a> Tokenizer<'a> {
             }
             _ => {
                 // regular identifier starting with an "E" or "e"
-                let s = self.tokenize_word(x.len_utf8(), chars);
+                let s = self.tokenize_word(x.len_utf8(), chars)?;
                 Ok(Some(Token::make_word(&s, None)))
             }
         }
@@ -1090,7 +1093,7 @@ impl<'a> Tokenizer<'a> {
                 }
             }
             // regular identifier starting with an "U" or "u"
-            let s = self.tokenize_word(x.len_utf8(), chars);
+            let s = self.tokenize_word(x.len_utf8(), chars)?;
             Ok(Some(Token::make_word(&s, None)))
         }
         // The spec only allows an uppercase 'X' to introduce a hex
@@ -1105,7 +1108,7 @@ impl<'a> Tokenizer<'a> {
             }
             _ => {
                 // regular identifier starting with an "X"
-                let s = self.tokenize_word(x.len_utf8(), chars);
+                let s = self.tokenize_word(x.len_utf8(), chars)?;
                 Ok(Some(Token::make_word(&s, None)))
             }
         }
@@ -1351,7 +1354,7 @@ impl<'a> Tokenizer<'a> {

                 if is_comment {
                     chars.next(); // consume second '-'
-                    let comment = self.tokenize_single_line_comment(chars);
+                    let comment = self.tokenize_single_line_comment(chars)?;
                     return Ok(Some(Token::Whitespace(
                         Whitespace::SingleLineComment {
                             prefix: "--".to_owned(),
@@ -1382,7 +1385,7 @@ impl<'a> Tokenizer<'a> {
                 }
                 Some('/') if dialect_of!(self is SnowflakeDialect) => {
                     chars.next(); // consume the second '/', starting a snowflake single-line comment
-                    let comment = self.tokenize_single_line_comment(chars);
+                    let comment = self.tokenize_single_line_comment(chars)?;
                     Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                         prefix: "//".to_owned(),
                         comment,
@@ -1588,7 +1591,7 @@ impl<'a> Tokenizer<'a> {
             '#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
             {
                 chars.next(); // consume the '#', starting a snowflake single-line comment
-                let comment = self.tokenize_single_line_comment(chars);
+                let comment = self.tokenize_single_line_comment(chars)?;
                 Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                     prefix: "#".to_owned(),
                     comment,
@@ -1783,80 +1786,133 @@ impl<'a> Tokenizer<'a> {
     }

     /// Tokenize dollar preceded value (i.e: a string/placeholder)
-    fn tokenize_dollar_preceded_value(&self, chars: &mut State) -> Result<Token, TokenizerError> {
-        let mut s = String::new();
-        let mut value = String::new();
-
-        chars.next();
-
-        // If the dialect does not support dollar-quoted strings, then `$$` is rather a placeholder.
-        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
-            chars.next();
-
-            let mut is_terminated = false;
-            let mut prev: Option<char> = None;
-
-            while let Some(&ch) = chars.peek() {
-                if prev == Some('$') {
-                    if ch == '$' {
-                        chars.next();
-                        is_terminated = true;
-                        break;
-                    } else {
-                        s.push('$');
-                        s.push(ch);
-                    }
-                } else if ch != '$' {
-                    s.push(ch);
-                }
-
-                prev = Some(ch);
-                chars.next();
-            }
-
-            return if chars.peek().is_none() && !is_terminated {
-                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
-            } else {
-                Ok(Token::DollarQuotedString(DollarQuotedString {
-                    value: s,
-                    tag: None,
-                }))
-            };
-        } else {
-            value.push_str(&peeking_take_while(chars, |ch| {
-                ch.is_alphanumeric()
-                    || ch == '_'
-                    // Allow $ as a placeholder character if the dialect supports it
-                    || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
-            }));
-
-            // If the dialect does not support dollar-quoted strings, don't look for the end delimiter.
-            if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
-                chars.next();
-
-                let mut temp = String::new();
-                let end_delimiter = format!("${value}$");
-
-                loop {
-                    match chars.next() {
-                        Some(ch) => {
-                            temp.push(ch);
-
-                            if temp.ends_with(&end_delimiter) {
-                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
-                                    s.push_str(temp);
-                                }
-                                break;
-                            }
-                        }
-                        None => {
-                            if temp.ends_with(&end_delimiter) {
-                                if let Some(temp) = temp.strip_suffix(&end_delimiter) {
-                                    s.push_str(temp);
-                                }
-                                break;
-                            }
-
+    fn tokenize_dollar_preceded_value(
+        &self,
+        chars: &mut State<'a>,
+    ) -> Result<Token, TokenizerError> {
+        let starting_loc = chars.location();
+
+        // Validate we're at a $ before consuming
+        if chars.peek() != Some(&'$') {
+            return self.tokenizer_error(starting_loc, "Expected $ character");
+        }
+        chars.next(); // consume first $
+
+        // Case 1: $$text$$ (untagged dollar-quoted string)
+        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
+            chars.next();
+            let (value, tag) = self.tokenize_dollar_quoted_string_borrowed(chars, None)?;
+            return Ok(Token::DollarQuotedString(DollarQuotedString {
+                value: value.into_owned(),
+                tag: tag.map(|t| t.into_owned()),
+            }));
+        }
+
+        // If it's not $$ we have 2 options :
+        // Case 2: $tag$text$tag$ (tagged dollar-quoted string) if dialect supports it
+        // Case 3: $placeholder (e.g., $1, $name)
+        let tag_start = chars.byte_pos;
+        let _tag_slice = peeking_take_while_ref(chars, |ch| {
+            ch.is_alphanumeric()
+                || ch == '_'
+                || matches!(ch, '$' if self.dialect.supports_dollar_placeholder())
+        });
+        let tag_end = chars.byte_pos;
+
+        // Case 2: $tag$text$tag$ (tagged dollar-quoted string)
+        if matches!(chars.peek(), Some('$')) && !self.dialect.supports_dollar_placeholder() {
+            let tag_value = self.safe_slice(chars.source, tag_start, tag_end, starting_loc)?;
+            let (value, tag) =
+                self.tokenize_dollar_quoted_string_borrowed(chars, Some(tag_value))?;
+            return Ok(Token::DollarQuotedString(DollarQuotedString {
+                value: value.into_owned(),
+                tag: tag.map(|t| t.into_owned()),
+            }));
+        }
+
+        // Case 3: $placeholder (e.g., $1, $name)
+        let tag_value = self.safe_slice(chars.source, tag_start, tag_end, starting_loc)?;
+        Ok(Token::Placeholder(format!("${}", tag_value)))
+    }
+
+    /// Tokenize a dollar-quoted string ($$text$$ or $tag$text$tag$), returning borrowed slices.
+    /// tag_prefix: None for $$, Some("tag") for $tag$
+    /// Returns (value: Cow<'a, str>, tag: Option<Cow<'a, str>>)
+    fn tokenize_dollar_quoted_string_borrowed(
+        &self,
+        chars: &mut State<'a>,
+        tag_prefix: Option<&'a str>,
+    ) -> Result<(Cow<'a, str>, Option<Cow<'a, str>>), TokenizerError> {
+        let starting_loc = chars.location();
+
+        // Validate we're at a $ before consuming
+        if chars.peek() != Some(&'$') {
+            return self.tokenizer_error(starting_loc, "Expected $ for dollar-quoted string");
+        }
+        chars.next(); // consume $ after tag (or second $ for $$)
+        let content_start = chars.byte_pos;
+
+        match tag_prefix {
+            None => {
+                // Case: $$text$$
+                let mut prev: Option<char> = None;
+
+                while let Some(&ch) = chars.peek() {
+                    if prev == Some('$') && ch == '$' {
+                        chars.next(); // consume final $
+                        // content_end is before the first $ of $$
+                        let content_end = chars.byte_pos - 2;
+                        let value = self.safe_slice(
+                            chars.source,
+                            content_start,
+                            content_end,
+                            starting_loc,
+                        )?;
+                        return Ok((Cow::Borrowed(value), None));
+                    }
+
+                    prev = Some(ch);
+                    chars.next();
+                }
+
+                self.tokenizer_error(chars.location(), "Unterminated dollar-quoted string")
+            }
+            Some(tag) => {
+                // Case: $tag$text$tag$
+                let end_delimiter = format!("${}$", tag);
+
+                // Scan for the end delimiter
+                let buffer_start = content_start;
+                loop {
+                    match chars.next() {
+                        Some(_) => {
+                            let current_pos = chars.byte_pos;
+                            let buffer = self.safe_slice(
+                                chars.source,
+                                buffer_start,
+                                current_pos,
+                                starting_loc,
+                            )?;
+
+                            if buffer.ends_with(&end_delimiter) {
+                                // Found the end delimiter
+                                let content_end = current_pos - end_delimiter.len();
+                                let value = self.safe_slice(
+                                    chars.source,
+                                    content_start,
+                                    content_end,
+                                    starting_loc,
+                                )?;
+                                return Ok((
+                                    Cow::Borrowed(value),
+                                    if tag.is_empty() {
+                                        None
+                                    } else {
+                                        Some(Cow::Borrowed(tag))
+                                    },
+                                ));
+                            }
+                        }
+                        None => {
                             return self.tokenizer_error(
                                 chars.location(),
                                 "Unterminated dollar-quoted, expected $",
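Context for Case 2 above: a tagged dollar-quoted string only ends at an exact repeat of its `$tag$` delimiter. A compact way to state the rule (hypothetical helper using `str::find` for brevity, where the tokenizer instead streams char by char and checks `ends_with` on the bytes consumed so far):

```rust
// Hypothetical helper, not the committed method: locate the body of a
// $tag$ ... $tag$ literal by searching for the exact delimiter.
fn dollar_quoted_body<'a>(src: &'a str, tag: &str) -> Option<&'a str> {
    let delim = format!("${tag}$");
    let body_start = src.find(&delim)? + delim.len();
    let body_len = src[body_start..].find(&delim)?; // same delimiter closes it
    Some(&src[body_start..body_start + body_len])
}

fn main() {
    let sql = "$fn$ body with 'quotes' $fn$";
    assert_eq!(dollar_quoted_body(sql, "fn"), Some(" body with 'quotes' "));
}
```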
@@ -1864,15 +1920,23 @@ impl<'a> Tokenizer<'a> {
                             );
                         }
                     }
                 }
-            } else {
-                return Ok(Token::Placeholder(String::from("$") + &value));
-            }
-        }
-
-        Ok(Token::DollarQuotedString(DollarQuotedString {
-            value: s,
-            tag: if value.is_empty() { None } else { Some(value) },
-        }))
+            }
+        }
+    }
+
+    /// Helper function to safely slice a string with bounds validation
+    fn safe_slice<'b>(
+        &self,
+        source: &'b str,
+        start: usize,
+        end: usize,
+        error_loc: Location,
+    ) -> Result<&'b str, TokenizerError> {
+        // Validate slice bounds
+        if end < start || end > source.len() {
+            return self.tokenizer_error(error_loc, "Invalid string slice bounds");
+        }
+        Ok(&source[start..end])
     }

     fn tokenizer_error<R>(
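A note on why slicing goes through a helper: raw `&source[start..end]` panics when the range is out of bounds, and `safe_slice` converts that into a `TokenizerError` instead. A standalone analogue with a simplified signature (no `Location`, plain `String` error; and like any `str` slicing it still assumes the offsets fall on character boundaries, which the tokenizer's `byte_pos` bookkeeping maintains):

```rust
// Standalone analogue of the safe_slice guard above (simplified signature;
// the real method lives on Tokenizer and carries a Location).
fn safe_slice(source: &str, start: usize, end: usize) -> Result<&str, String> {
    if end < start || end > source.len() {
        return Err(format!(
            "invalid slice bounds {start}..{end} for source of length {}",
            source.len()
        ));
    }
    Ok(&source[start..end])
}

fn main() {
    assert_eq!(safe_slice("SELECT 1", 0, 6), Ok("SELECT"));
    // An unguarded `&"SELECT 1"[0..99]` would panic at runtime instead.
    assert!(safe_slice("SELECT 1", 0, 99).is_err());
}
```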
@@ -1887,63 +1951,90 @@ impl<'a> Tokenizer<'a> {
     }

     // Consume characters until newline
-    fn tokenize_single_line_comment(&self, chars: &mut State) -> String {
-        let mut comment = peeking_take_while(chars, |ch| match ch {
+    fn tokenize_single_line_comment(
+        &self,
+        chars: &mut State<'a>,
+    ) -> Result<String, TokenizerError> {
+        Ok(self
+            .tokenize_single_line_comment_borrowed(chars)?
+            .to_string())
+    }
+
+    /// Tokenize a single-line comment, returning a borrowed slice.
+    /// Returns a slice that includes the terminating newline character.
+    fn tokenize_single_line_comment_borrowed(
+        &self,
+        chars: &mut State<'a>,
+    ) -> Result<&'a str, TokenizerError> {
+        let start_pos = chars.byte_pos;
+        let error_loc = chars.location();
+
+        // Consume until newline
+        peeking_take_while_ref(chars, |ch| match ch {
             '\n' => false, // Always stop at \n
             '\r' if dialect_of!(self is PostgreSqlDialect) => false, // Stop at \r for Postgres
             _ => true, // Keep consuming for other characters
         });

         // Consume the newline character
         if let Some(ch) = chars.next() {
             assert!(ch == '\n' || ch == '\r');
-            comment.push(ch);
         }

-        comment
+        // Return slice including the newline
+        self.safe_slice(chars.source, start_pos, chars.byte_pos, error_loc)
     }

     /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
     /// `consumed_byte_len` is the byte length of the consumed character(s).
-    fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
+    fn tokenize_word(
+        &self,
+        consumed_byte_len: usize,
+        chars: &mut State<'a>,
+    ) -> Result<String, TokenizerError> {
+        let error_loc = chars.location();
+
         // Overflow check: ensure we can safely subtract
         if consumed_byte_len > chars.byte_pos {
-            return String::new();
+            return self.tokenizer_error(error_loc, "Invalid byte position in tokenize_word");
         }

         // Calculate where the first character started
         let first_char_byte_pos = chars.byte_pos - consumed_byte_len;

         // Use the zero-copy version and convert to String
-        self.tokenize_word_borrowed(first_char_byte_pos, chars)
-            .to_string()
+        Ok(self
+            .tokenize_word_borrowed(first_char_byte_pos, chars)?
+            .to_string())
     }

     /// Tokenize an identifier or keyword, returning a borrowed slice when possible.
     /// The first character position must be provided (before it was consumed).
     /// Returns a slice with the same lifetime as the State's source.
-    fn tokenize_word_borrowed(&self, first_char_byte_pos: usize, chars: &mut State<'a>) -> &'a str {
+    fn tokenize_word_borrowed(
+        &self,
+        first_char_byte_pos: usize,
+        chars: &mut State<'a>,
+    ) -> Result<&'a str, TokenizerError> {
+        let error_loc = chars.location();
+
         // Consume the rest of the word
         peeking_take_while_ref(chars, |ch| self.dialect.is_identifier_part(ch));

-        // Boundary check: ensure first_char_byte_pos is valid
-        if first_char_byte_pos > chars.byte_pos || first_char_byte_pos > chars.source.len() {
-            return "";
-        }
-
-        // Return a slice from the first char to the current position
-        &chars.source[first_char_byte_pos..chars.byte_pos]
+        // Return a slice from the first char to the current position using safe_slice
+        self.safe_slice(chars.source, first_char_byte_pos, chars.byte_pos, error_loc)
     }

     /// Read a quoted identifier
     fn tokenize_quoted_identifier(
         &self,
         quote_start: char,
-        chars: &mut State,
+        chars: &mut State<'a>,
     ) -> Result<String, TokenizerError> {
+        let error_loc = chars.location();
         chars.next(); // consume the opening quote
         let quote_end = Word::matching_end_quote(quote_start);
-        let (s, last_char) = self.parse_quoted_ident(chars, quote_end);
+        let (s, last_char) = self.parse_quoted_ident(chars, quote_end)?;

         if last_char == Some(quote_end) {
             Ok(s)
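The borrowed word scan is pure offset bookkeeping: remember where the word started, advance over identifier characters, then take one slice. A minimal standalone sketch (not the crate's API, and with a simplified identifier rule) showing why positions must be tracked in bytes via `len_utf8`, which is exactly what `State`'s `byte_pos` does:

```rust
// Sketch of the zero-copy word scan: no per-character String pushes,
// just a start offset, a byte-accurate end offset, and one slice.
fn scan_identifier(src: &str, first_char_byte_pos: usize) -> &str {
    let mut end = first_char_byte_pos;
    for ch in src[first_char_byte_pos..].chars() {
        if ch.is_alphanumeric() || ch == '_' {
            end += ch.len_utf8(); // track byte position, not char count
        } else {
            break;
        }
    }
    &src[first_char_byte_pos..end]
}

fn main() {
    let sql = "SELECT tötal_1 FROM t";
    assert_eq!(scan_identifier(sql, 7), "tötal_1"); // multi-byte 'ö' handled by len_utf8
}
```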
@@ -2152,9 +2243,21 @@ impl<'a> Tokenizer<'a> {

     fn tokenize_multiline_comment(
         &self,
-        chars: &mut State,
+        chars: &mut State<'a>,
     ) -> Result<Option<Token>, TokenizerError> {
-        let mut s = String::new();
+        let s = self.tokenize_multiline_comment_borrowed(chars)?;
+        Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(
+            s.to_string(),
+        ))))
+    }
+
+    /// Tokenize a multi-line comment, returning a borrowed slice.
+    /// Returns a slice that excludes the opening `/*` (already consumed) and the final closing `*/`.
+    fn tokenize_multiline_comment_borrowed(
+        &self,
+        chars: &mut State<'a>,
+    ) -> Result<&'a str, TokenizerError> {
+        let start_pos = chars.byte_pos;
         let mut nested = 1;
         let supports_nested_comments = self.dialect.supports_nested_comments();

@@ -2162,24 +2265,22 @@ impl<'a> Tokenizer<'a> {
             match chars.next() {
                 Some('/') if matches!(chars.peek(), Some('*')) && supports_nested_comments => {
                     chars.next(); // consume the '*'
-                    s.push('/');
-                    s.push('*');
                     nested += 1;
                 }
                 Some('*') if matches!(chars.peek(), Some('/')) => {
                     chars.next(); // consume the '/'
                     nested -= 1;
                     if nested == 0 {
-                        break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
+                        // We've consumed the final */, so exclude it from the slice
+                        let end_pos = chars.byte_pos - 2; // Subtract 2 bytes for '*' and '/'
+                        return self.safe_slice(chars.source, start_pos, end_pos, chars.location());
                     }
-                    s.push('*');
-                    s.push('/');
                 }
-                Some(ch) => {
-                    s.push(ch);
+                Some(_) => {
+                    // Just consume the character, don't need to push to string
                 }
                 None => {
-                    break self.tokenizer_error(
+                    return self.tokenizer_error(
                         chars.location(),
                         "Unexpected EOF while in a multi-line comment",
                     );
@@ -2188,27 +2289,71 @@ impl<'a> Tokenizer<'a> {
         }
     }

-    fn parse_quoted_ident(&self, chars: &mut State, quote_end: char) -> (String, Option<char>) {
-        let mut last_char = None;
-        let mut s = String::new();
-        while let Some(ch) = chars.next() {
-            if ch == quote_end {
-                if chars.peek() == Some(&quote_end) {
-                    chars.next();
-                    s.push(ch);
-                    if !self.unescape {
-                        // In no-escape mode, the given query has to be saved completely
-                        s.push(ch);
-                    }
-                } else {
-                    last_char = Some(quote_end);
-                    break;
-                }
-            } else {
-                s.push(ch);
-            }
-        }
-        (s, last_char)
+    fn parse_quoted_ident(
+        &self,
+        chars: &mut State<'a>,
+        quote_end: char,
+    ) -> Result<(String, Option<char>), TokenizerError> {
+        let (cow, last_char) = self.parse_quoted_ident_borrowed(chars, quote_end)?;
+        Ok((cow.into_owned(), last_char))
+    }
+
+    /// Parse quoted identifier, returning borrowed slice when possible.
+    /// Returns `(Cow<'a, str>, Option<char>)` where the `Option<char>` is the closing quote.
+    fn parse_quoted_ident_borrowed(
+        &self,
+        chars: &mut State<'a>,
+        quote_end: char,
+    ) -> Result<(Cow<'a, str>, Option<char>), TokenizerError> {
+        let content_start = chars.byte_pos;
+        let mut has_doubled_quotes = false;
+        let mut last_char = None;
+
+        // Scan to find the end and detect doubled quotes
+        while let Some(ch) = chars.next() {
+            if ch == quote_end {
+                if chars.peek() == Some(&quote_end) {
+                    has_doubled_quotes = true;
+                    chars.next(); // consume the second quote
+                } else {
+                    last_char = Some(quote_end);
+                    break;
+                }
+            }
+        }
+
+        let content_end = if last_char.is_some() {
+            chars.byte_pos - 1 // exclude the closing quote
+        } else {
+            chars.byte_pos
+        };
+
+        let content =
+            self.safe_slice(chars.source, content_start, content_end, chars.location())?;
+
+        // If no doubled quotes, we can always borrow
+        if !has_doubled_quotes {
+            return Ok((Cow::Borrowed(content), last_char));
+        }
+
+        // If unescape=false, keep the content as-is (with doubled quotes)
+        if !self.unescape {
+            return Ok((Cow::Borrowed(content), last_char));
+        }
+
+        // Need to unescape: process doubled quotes
+        let mut result = String::new();
+        let mut chars_iter = content.chars();
+
+        while let Some(ch) = chars_iter.next() {
+            result.push(ch);
+            if ch == quote_end {
+                // This is the first of a doubled quote, skip the second one
+                chars_iter.next();
+            }
+        }
+
+        Ok((Cow::Owned(result), last_char))
     }

     #[allow(clippy::unnecessary_wraps)]
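The `has_doubled_quotes` flag is what keeps the common case allocation-free: only identifiers that actually contain an escaped quote pay for the owned rewrite. Schematically (illustrative helper, not the tokenizer's private method):

```rust
use std::borrow::Cow;

// Collapse doubled closing quotes ("" -> ") only when any are present,
// so the common case stays a borrowed slice.
fn collapse_doubled(content: &str, quote: char) -> Cow<'_, str> {
    let doubled = format!("{quote}{quote}");
    if !content.contains(&doubled) {
        return Cow::Borrowed(content); // fast path: zero copies
    }
    Cow::Owned(content.replace(&doubled, &quote.to_string()))
}

fn main() {
    assert!(matches!(collapse_doubled("col", '"'), Cow::Borrowed(_)));
    assert_eq!(collapse_doubled(r#"say ""hi"""#, '"'), r#"say "hi""#);
}
```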
@@ -2304,7 +2449,78 @@ fn peeking_next_take_while(
 }

 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
-    Unescape::new(chars).unescape()
+    borrow_or_unescape_single_quoted_string(chars, true).map(|cow| cow.into_owned())
 }

+/// Scans a single-quoted string and returns either a borrowed slice or an unescaped owned string.
+///
+/// Strategy: Scan once to find the end and detect escape sequences.
+/// - If no escapes exist (or unescape=false), return [Cow::Borrowed]
+/// - If escapes exist and unescape=true, reprocess using existing [Unescape] logic
+fn borrow_or_unescape_single_quoted_string<'a>(
+    chars: &mut State<'a>,
+    unescape: bool,
+) -> Option<Cow<'a, str>> {
+    let content_start = chars.byte_pos;
+
+    // Validate we're at an opening quote before consuming
+    if chars.peek() != Some(&'\'') {
+        return None;
+    }
+    chars.next(); // consume opening '
+
+    // Scan to find end and check for escape sequences
+    let mut has_escapes = false;
+
+    loop {
+        match chars.next() {
+            Some('\'') => {
+                // Check for doubled single quote (escape)
+                if chars.peek() == Some(&'\'') {
+                    has_escapes = true;
+                    chars.next(); // consume the second '
+                } else {
+                    // End of string found (including closing ')
+                    let content_end = chars.byte_pos;
+                    let full_content = &chars.source[content_start..content_end];
+
+                    // If no unescaping needed, return borrowed (without quotes)
+                    if !unescape || !has_escapes {
+                        // Strip opening and closing quotes
+                        // Safety: full_content includes opening and closing quotes (at least 2 chars)
+                        if full_content.len() < 2 {
+                            return None;
+                        }
+                        return Some(Cow::Borrowed(&full_content[1..full_content.len() - 1]));
+                    }
+
+                    // Need to unescape - reprocess using existing logic
+                    // Create a temporary State from the content
+                    let mut temp_state = State {
+                        peekable: full_content.chars().peekable(),
+                        source: full_content,
+                        line: 0,
+                        col: 0,
+                        byte_pos: 0,
+                    };
+
+                    return Unescape::new(&mut temp_state).unescape().map(Cow::Owned);
+                }
+            }
+            Some('\\') => {
+                has_escapes = true;
+                // Skip next character (it's escaped)
+                chars.next();
+            }
+            Some(_) => {
+                // Regular character, continue scanning
+            }
+            None => {
+                // Unexpected EOF
+                return None;
+            }
+        }
+    }
+}
+
 struct Unescape<'a: 'b, 'b> {
@@ -2452,8 +2668,98 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
 }

 fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
+    borrow_or_unescape_unicode_single_quoted_string(chars, true).map(|cow| cow.into_owned())
+}
+
+/// Scans a unicode-escaped single-quoted string and returns either a borrowed slice or an unescaped owned string.
+///
+/// Strategy: Scan once to find the end and detect escape sequences.
+/// - If no escapes exist (or unescape=false), return [Cow::Borrowed]
+/// - If escapes exist and unescape=true, reprocess with unicode escaping logic
+fn borrow_or_unescape_unicode_single_quoted_string<'a>(
+    chars: &mut State<'a>,
+    unescape: bool,
+) -> Result<Cow<'a, str>, TokenizerError> {
+    let content_start = chars.byte_pos;
+    let error_loc = chars.location();
+
+    // Validate we're at an opening quote before consuming
+    if chars.peek() != Some(&'\'') {
+        return Err(TokenizerError {
+            message: "Expected opening quote for unicode string literal".to_string(),
+            location: error_loc,
+        });
+    }
+    chars.next(); // consume the opening quote
+
+    // Scan to find end and check for escape sequences
+    let mut has_escapes = false;
+
+    loop {
+        match chars.next() {
+            Some('\'') => {
+                // Check for doubled single quote (escape)
+                if chars.peek() == Some(&'\'') {
+                    has_escapes = true;
+                    chars.next(); // consume the second '
+                } else {
+                    // End of string found (including closing ')
+                    let content_end = chars.byte_pos;
+                    let full_content = &chars.source[content_start..content_end];
+
+                    // If no unescaping needed, return borrowed (without quotes)
+                    if !unescape || !has_escapes {
+                        // Strip opening and closing quotes
+                        // Safety: full_content includes opening and closing quotes (at least 2 chars)
+                        if full_content.len() < 2 {
+                            return Err(TokenizerError {
+                                message: "Invalid unicode string literal".to_string(),
+                                location: error_loc,
+                            });
+                        }
+                        return Ok(Cow::Borrowed(&full_content[1..full_content.len() - 1]));
+                    }
+
+                    // Need to unescape - reprocess with unicode logic
+                    // Create a temporary State from the content
+                    let mut temp_state = State {
+                        peekable: full_content.chars().peekable(),
+                        source: full_content,
+                        line: 0,
+                        col: 0,
+                        byte_pos: 0,
+                    };
+
+                    return process_unicode_string_with_escapes(&mut temp_state, error_loc)
+                        .map(Cow::Owned);
+                }
+            }
+            Some('\\') => {
+                has_escapes = true;
+                // Skip next character (it's escaped or part of unicode sequence)
+                chars.next();
+            }
+            Some(_) => {
+                // Regular character, continue scanning
+            }
+            None => {
+                return Err(TokenizerError {
+                    message: "Unterminated unicode encoded string literal".to_string(),
+                    location: error_loc,
+                });
+            }
+        }
+    }
+}
+
+/// Process a unicode-escaped string using the original unescape logic
+fn process_unicode_string_with_escapes(
+    chars: &mut State<'_>,
+    error_loc: Location,
+) -> Result<String, TokenizerError> {
     let mut unescaped = String::new();
     chars.next(); // consume the opening quote

     while let Some(c) = chars.next() {
         match c {
             '\'' => {
@@ -2480,9 +2786,10 @@ fn unescape_unicode_single_quoted_string(chars: &mut State<'_>) -> Result<String, TokenizerError> {
             }
         }
     }

     Err(TokenizerError {
         message: "Unterminated unicode encoded string literal".to_string(),
-        location: chars.location(),
+        location: error_loc,
     })
 }
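None of this changes the crate's public surface: `Tokenizer::tokenize` still returns tokens that own their strings, so the savings are confined to the intermediate copies. A quick check against the public API (a sketch assuming a current `sqlparser` build with PostgreSQL-style dollar quoting):

```rust
use sqlparser::dialect::PostgreSqlDialect;
use sqlparser::tokenizer::{Token, Tokenizer};

fn main() {
    let sql = "SELECT $tag$hello$tag$, 'it''s'";
    let tokens = Tokenizer::new(&PostgreSqlDialect {}, sql).tokenize().unwrap();
    // Dollar-quoted body and tag still come back owned, as before the change.
    assert!(tokens.iter().any(|t| matches!(
        t,
        Token::DollarQuotedString(s) if s.value == "hello" && s.tag.as_deref() == Some("tag")
    )));
    // The doubled quote in 'it''s' is unescaped on the owned path.
    assert!(tokens.iter().any(|t| matches!(
        t,
        Token::SingleQuotedString(s) if s == "it's"
    )));
}
```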