ruff/crates/ruff_python_trivia/src/tokenizer.rs
Charlie Marsh a3d4f08f29
Add general support for parenthesized comments on expressions (#6485)
## Summary

This PR adds support for parenthesized comments. A parenthesized comment
is a comment that appears within parentheses, but not within the range
of the expression enclosed by those parentheses. For example, the comment
here is a parenthesized comment:

```python
if (
    # comment
    True
):
    ...
```

The parentheses enclose the `True`, but the range of `True` doesn’t
include the `# comment`.

There are at least two problems associated with parenthesized comments:
(1) associating the comment with the correct (i.e., enclosed) node; and
(2) formatting the comment correctly, once it has been associated with
the enclosed node.

The solution proposed here for (1) is to search for parentheses between
the preceding and following nodes, and to use the open and close parentheses
to break ties, rather than always assigning the comment to the preceding node.
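
As a rough illustration of the tie-breaking (a hypothetical snippet, not taken from the test suite): a comment that follows the open parenthesis becomes a leading comment of the enclosed node, while a comment that precedes the close parenthesis becomes a trailing comment of that node.

```python
x = (
    # leading comment on `1` (follows the open parenthesis)
    1
    # trailing comment on `1` (precedes the close parenthesis)
)
```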

For (2), we handle these special parenthesized comments in `FormatExpr`.
The biggest risk with this approach is that we miss some code path that
force-disables parenthesization (by passing in `Parentheses::Never`).
I've audited all usages of that enum and added additional handling and
test coverage for such cases.
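
To make the risk concrete, here's an illustrative shape of such a case (whether `del` in particular goes through a `Parentheses::Never` path is an assumption made for the sake of the example): the formatter would prefer to drop the parentheses around `a`, but the comment inside them still needs to be attached and formatted correctly.

```python
del (
    # comment
    a
)
```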

Closes https://github.com/astral-sh/ruff/issues/6390.

## Test Plan

`cargo test` with new cases.

Before:

| project      | similarity index |
|--------------|------------------|
| build        | 0.75623          |
| cpython      | 0.75472          |
| django       | 0.99804          |
| transformers | 0.99618          |
| typeshed     | 0.74233          |
| warehouse    | 0.99601          |
| zulip        | 0.99727          |

After:

| project      | similarity index |
|--------------|------------------|
| build        | 0.75623          |
| cpython      | 0.75472          |
| django       | 0.99804          |
| transformers | 0.99618          |
| typeshed     | 0.74237          |
| warehouse    | 0.99601          |
| zulip        | 0.99727          |
2023-08-15 18:59:18 +00:00

use memchr::{memchr2, memchr3, memrchr3_iter};
use ruff_text_size::{TextLen, TextRange, TextSize};
use unic_ucd_ident::{is_xid_continue, is_xid_start};
use crate::{is_python_whitespace, Cursor};
/// Returns the first non-trivia token at or after `offset` in `code`.
///
/// The search skips over any whitespace and comments.
///
/// Returns `Some` with the token if the source after `offset` contains any non-trivia token.
///
/// Returns `None` if the source after `offset` is empty or only contains trivia (whitespace or comments).
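///
/// For example, in `"  # comment\n)"`, starting at offset `0`, the first non-trivia token
/// is the closing parenthesis.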
pub fn first_non_trivia_token(offset: TextSize, code: &str) -> Option<SimpleToken> {
SimpleTokenizer::starts_at(offset, code)
.skip_trivia()
.next()
}
/// Returns the number of newlines between `offset` and the first non-whitespace character preceding `offset` in the source code.
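///
/// For example, in `"a = 20\n\nb = 10"`, calling this with the offset of `b` returns `2`:
/// there are two newlines between `b` and the nearest preceding non-whitespace character.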
pub fn lines_before(offset: TextSize, code: &str) -> u32 {
let mut cursor = Cursor::new(&code[TextRange::up_to(offset)]);
let mut newlines = 0u32;
while let Some(c) = cursor.bump_back() {
match c {
'\n' => {
cursor.eat_char_back('\r');
newlines += 1;
}
'\r' => {
newlines += 1;
}
c if is_python_whitespace(c) => {
continue;
}
_ => {
break;
}
}
}
newlines
}
/// Returns the number of newlines between `offset` and the first non-whitespace character following `offset`.
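///
/// For example, in `"a = 20\n\nb = 10"`, calling this with the offset just after `20`
/// returns `2`.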
pub fn lines_after(offset: TextSize, code: &str) -> u32 {
let mut cursor = Cursor::new(&code[offset.to_usize()..]);
let mut newlines = 0u32;
while let Some(c) = cursor.bump() {
match c {
'\n' => {
newlines += 1;
}
'\r' => {
cursor.eat_char('\n');
newlines += 1;
}
c if is_python_whitespace(c) => {
continue;
}
_ => {
break;
}
}
}
newlines
}
/// Counts the newlines after `offset`, ignoring any trailing trivia (whitespace and comments)
/// on the same line as `offset`.
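///
/// For example, in `"a = 20  # comment\n\nb = 10"`, calling this with the offset just after
/// `20` returns `2`: the trailing comment is skipped before the newlines are counted.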
#[allow(clippy::cast_possible_truncation)]
pub fn lines_after_ignoring_trivia(offset: TextSize, code: &str) -> u32 {
// SAFETY: We don't support files greater than 4GB, so casting to u32 is safe.
SimpleTokenizer::starts_at(offset, code)
.skip_while(|token| token.kind != SimpleTokenKind::Newline && token.kind.is_trivia())
.take_while(|token| {
token.kind == SimpleTokenKind::Newline || token.kind == SimpleTokenKind::Whitespace
})
.filter(|token| token.kind == SimpleTokenKind::Newline)
.count() as u32
}
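// Checks if the character c is a valid start character for an identifier as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers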
fn is_identifier_start(c: char) -> bool {
c.is_ascii_alphabetic() || c == '_' || is_non_ascii_identifier_start(c)
}
// Checks if the character c is a valid continuation character as described
// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
fn is_identifier_continuation(c: char) -> bool {
if c.is_ascii() {
matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
} else {
is_xid_continue(c)
}
}
fn is_non_ascii_identifier_start(c: char) -> bool {
is_xid_start(c)
}
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct SimpleToken {
pub kind: SimpleTokenKind,
pub range: TextRange,
}
impl SimpleToken {
pub const fn kind(&self) -> SimpleTokenKind {
self.kind
}
#[allow(unused)]
pub const fn range(&self) -> TextRange {
self.range
}
pub const fn start(&self) -> TextSize {
self.range.start()
}
pub const fn end(&self) -> TextSize {
self.range.end()
}
}
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum SimpleTokenKind {
/// A comment, not including the trailing new line.
Comment,
/// Sequence of ' ' or '\t'
Whitespace,
/// Start or end of the file
EndOfFile,
/// `\\`
Continuation,
/// `\n` or `\r` or `\r\n`
Newline,
/// `(`
LParen,
/// `)`
RParen,
/// `{`
LBrace,
/// `}`
RBrace,
/// `[`
LBracket,
/// `]`
RBracket,
/// `,`
Comma,
/// `:`
Colon,
/// `;`
Semi,
/// `/`
Slash,
/// `*`
Star,
/// `.`
Dot,
/// `+`
Plus,
/// `-`
Minus,
/// `=`
Equals,
/// `>`
Greater,
/// `<`
Less,
/// `%`
Percent,
/// `&`
Ampersand,
/// `^`
Circumflex,
/// `|`
Vbar,
/// `@`
At,
/// `~`
Tilde,
/// `and`
And,
/// `as`
As,
/// `assert`
Assert,
/// `async`
Async,
/// `await`
Await,
/// `break`
Break,
/// `class`
Class,
/// `continue`
Continue,
/// `def`
Def,
/// `del`
Del,
/// `elif`
Elif,
/// `else`
Else,
/// `except`
Except,
/// `finally`
Finally,
/// `for`
For,
/// `from`
From,
/// `global`
Global,
/// `if`
If,
/// `import`
Import,
/// `in`
In,
/// `is`
Is,
/// `lambda`
Lambda,
/// `nonlocal`
Nonlocal,
/// `not`
Not,
/// `or`
Or,
/// `pass`
Pass,
/// `raise`
Raise,
/// `return`
Return,
/// `try`
Try,
/// `while`
While,
/// `match`
Match,
/// `type`
Type,
/// `case`
Case,
/// `with`
With,
/// `yield`
Yield,
/// Any other non-trivia token.
Other,
/// Returned for each character after [`SimpleTokenKind::Other`] has been returned once.
Bogus,
}
impl SimpleTokenKind {
const fn from_non_trivia_char(c: char) -> SimpleTokenKind {
match c {
'(' => SimpleTokenKind::LParen,
')' => SimpleTokenKind::RParen,
'[' => SimpleTokenKind::LBracket,
']' => SimpleTokenKind::RBracket,
'{' => SimpleTokenKind::LBrace,
'}' => SimpleTokenKind::RBrace,
',' => SimpleTokenKind::Comma,
':' => SimpleTokenKind::Colon,
';' => SimpleTokenKind::Semi,
'/' => SimpleTokenKind::Slash,
'*' => SimpleTokenKind::Star,
'.' => SimpleTokenKind::Dot,
'+' => SimpleTokenKind::Plus,
'-' => SimpleTokenKind::Minus,
'=' => SimpleTokenKind::Equals,
'>' => SimpleTokenKind::Greater,
'<' => SimpleTokenKind::Less,
'%' => SimpleTokenKind::Percent,
'&' => SimpleTokenKind::Ampersand,
'^' => SimpleTokenKind::Circumflex,
'|' => SimpleTokenKind::Vbar,
'@' => SimpleTokenKind::At,
'~' => SimpleTokenKind::Tilde,
_ => SimpleTokenKind::Other,
}
}
const fn is_trivia(self) -> bool {
matches!(
self,
SimpleTokenKind::Whitespace
| SimpleTokenKind::Newline
| SimpleTokenKind::Comment
| SimpleTokenKind::Continuation
)
}
}
/// Simple zero-allocation tokenizer for tokenizing trivia (and some tokens).
///
/// The tokenizer must start at an offset that is trivia (e.g. not inside of a multiline string).
///
/// The tokenizer doesn't guarantee any correctness after it has returned a [`SimpleTokenKind::Other`]. That's why it
/// will return [`SimpleTokenKind::Bogus`] for every following character until it reaches the end of the file.
pub struct SimpleTokenizer<'a> {
offset: TextSize,
back_offset: TextSize,
/// `true` when it is known that the current `back` line contains no comment.
back_line_has_no_comment: bool,
bogus: bool,
source: &'a str,
cursor: Cursor<'a>,
}
impl<'a> SimpleTokenizer<'a> {
pub fn new(source: &'a str, range: TextRange) -> Self {
Self {
offset: range.start(),
back_offset: range.end(),
back_line_has_no_comment: false,
bogus: false,
source,
cursor: Cursor::new(&source[range]),
}
}
pub fn starts_at(offset: TextSize, source: &'a str) -> Self {
let range = TextRange::new(offset, source.text_len());
Self::new(source, range)
}
/// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`.
///
/// Consider using [`SimpleTokenizer::up_to_without_back_comment`] if you intend to lex backwards.
pub fn up_to(offset: TextSize, source: &'a str) -> Self {
Self::new(source, TextRange::up_to(offset))
}
/// Creates a tokenizer that lexes tokens from the start of `source` up to `offset`, and informs
/// the lexer that the line at `offset` contains no comments. This can significantly speed up backwards lexing
/// because the lexer doesn't need to scan for comments.
pub fn up_to_without_back_comment(offset: TextSize, source: &'a str) -> Self {
let mut tokenizer = Self::up_to(offset, source);
tokenizer.back_line_has_no_comment = true;
tokenizer
}
fn to_keyword_or_other(&self, range: TextRange) -> SimpleTokenKind {
let source = &self.source[range];
match source {
"and" => SimpleTokenKind::And,
"as" => SimpleTokenKind::As,
"assert" => SimpleTokenKind::Assert,
"async" => SimpleTokenKind::Async,
"await" => SimpleTokenKind::Await,
"break" => SimpleTokenKind::Break,
"class" => SimpleTokenKind::Class,
"continue" => SimpleTokenKind::Continue,
"def" => SimpleTokenKind::Def,
"del" => SimpleTokenKind::Del,
"elif" => SimpleTokenKind::Elif,
"else" => SimpleTokenKind::Else,
"except" => SimpleTokenKind::Except,
"finally" => SimpleTokenKind::Finally,
"for" => SimpleTokenKind::For,
"from" => SimpleTokenKind::From,
"global" => SimpleTokenKind::Global,
"if" => SimpleTokenKind::If,
"import" => SimpleTokenKind::Import,
"in" => SimpleTokenKind::In,
"is" => SimpleTokenKind::Is,
"lambda" => SimpleTokenKind::Lambda,
"nonlocal" => SimpleTokenKind::Nonlocal,
"not" => SimpleTokenKind::Not,
"or" => SimpleTokenKind::Or,
"pass" => SimpleTokenKind::Pass,
"raise" => SimpleTokenKind::Raise,
"return" => SimpleTokenKind::Return,
"try" => SimpleTokenKind::Try,
"while" => SimpleTokenKind::While,
"match" => SimpleTokenKind::Match, // Match is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
"type" => SimpleTokenKind::Type, // Type is a soft keyword that depends on the context but we can always lex it as a keyword and leave it to the caller (parser) to decide if it should be handled as an identifier or keyword.
"case" => SimpleTokenKind::Case,
"with" => SimpleTokenKind::With,
"yield" => SimpleTokenKind::Yield,
_ => SimpleTokenKind::Other, // Potentially an identifier, but only if it isn't a string prefix. We can ignore this for now https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
}
}
fn next_token(&mut self) -> SimpleToken {
self.cursor.start_token();
let Some(first) = self.cursor.bump() else {
return SimpleToken {
kind: SimpleTokenKind::EndOfFile,
range: TextRange::empty(self.offset),
};
};
if self.bogus {
let token = SimpleToken {
kind: SimpleTokenKind::Bogus,
range: TextRange::at(self.offset, first.text_len()),
};
self.offset += first.text_len();
return token;
}
let kind = match first {
' ' | '\t' => {
self.cursor.eat_while(|c| matches!(c, ' ' | '\t'));
SimpleTokenKind::Whitespace
}
'\n' => SimpleTokenKind::Newline,
'\r' => {
self.cursor.eat_char('\n');
SimpleTokenKind::Newline
}
'#' => {
self.cursor.eat_while(|c| !matches!(c, '\n' | '\r'));
SimpleTokenKind::Comment
}
'\\' => SimpleTokenKind::Continuation,
c => {
let kind = if is_identifier_start(c) {
self.cursor.eat_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.offset, token_len);
self.to_keyword_or_other(range)
} else {
SimpleTokenKind::from_non_trivia_char(c)
};
if kind == SimpleTokenKind::Other {
self.bogus = true;
}
kind
}
};
let token_len = self.cursor.token_len();
let token = SimpleToken {
kind,
range: TextRange::at(self.offset, token_len),
};
self.offset += token_len;
token
}
/// Returns the next token from the back. Prefer iterating forwards. Iterating backwards is significantly more expensive
/// because it needs to check if the line has any comments when encountering any non-trivia token.
pub fn next_token_back(&mut self) -> SimpleToken {
self.cursor.start_token();
let Some(last) = self.cursor.bump_back() else {
return SimpleToken {
kind: SimpleTokenKind::EndOfFile,
range: TextRange::empty(self.back_offset),
};
};
if self.bogus {
let token = SimpleToken {
kind: SimpleTokenKind::Bogus,
range: TextRange::at(self.back_offset - last.text_len(), last.text_len()),
};
self.back_offset -= last.text_len();
return token;
}
let kind = match last {
// This may not be 100% correct because it will lex out trailing whitespace from a comment
// as whitespace rather than as part of the comment token. This shouldn't matter for what we use the lexer for.
' ' | '\t' => {
self.cursor.eat_back_while(|c| matches!(c, ' ' | '\t'));
SimpleTokenKind::Whitespace
}
'\r' => {
self.back_line_has_no_comment = false;
SimpleTokenKind::Newline
}
'\n' => {
self.back_line_has_no_comment = false;
self.cursor.eat_char_back('\r');
SimpleTokenKind::Newline
}
// Empty comment (could also be a comment nested in another comment, but this shouldn't matter for what we use the lexer for)
'#' => SimpleTokenKind::Comment,
// For all other tokens, test if the character isn't part of a comment.
c => {
// Skip the check for a preceding comment if it has already been performed.
let comment_length = if self.back_line_has_no_comment {
None
} else {
let bytes = self.cursor.chars().as_str().as_bytes();
let mut potential_comment_starts: smallvec::SmallVec<[TextSize; 2]> =
smallvec::SmallVec::new();
// Find the start of the line, or any potential comments.
for index in memrchr3_iter(b'\n', b'\r', b'#', bytes) {
if bytes[index] == b'#' {
// Potentially a comment, but not guaranteed
// SAFETY: Safe, because ruff only supports files up to 4GB
potential_comment_starts.push(TextSize::try_from(index).unwrap());
} else {
break;
}
}
// No comments
if potential_comment_starts.is_empty() {
None
} else {
// The line contains at least one `#` token. The `#` can indicate the start of a
// comment, meaning the current token is commented out, or it is a regular `#` inside of a string.
self.comment_from_hash_positions(&potential_comment_starts)
}
};
// From here on it is guaranteed that this line has no other comment.
self.back_line_has_no_comment = true;
if let Some(comment_length) = comment_length {
// It is a comment; bump back over the rest of its characters.
for _ in 0..usize::from(comment_length) {
self.cursor.bump_back().unwrap();
}
SimpleTokenKind::Comment
} else if c == '\\' {
SimpleTokenKind::Continuation
} else {
let kind = if is_identifier_continuation(c) {
// if we only have identifier continuations but no start (e.g. 555) we
// don't want to consume the chars, so in that case, we want to rewind the
// cursor to here
let savepoint = self.cursor.clone();
self.cursor.eat_back_while(is_identifier_continuation);
let token_len = self.cursor.token_len();
let range = TextRange::at(self.back_offset - token_len, token_len);
if self.source[range]
.chars()
.next()
.is_some_and(is_identifier_start)
{
self.to_keyword_or_other(range)
} else {
self.cursor = savepoint;
SimpleTokenKind::Other
}
} else {
SimpleTokenKind::from_non_trivia_char(c)
};
if kind == SimpleTokenKind::Other {
self.bogus = true;
}
kind
}
}
};
let token_len = self.cursor.token_len();
let start = self.back_offset - token_len;
let token = SimpleToken {
kind,
range: TextRange::at(start, token_len),
};
self.back_offset = start;
token
}
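/// Returns an iterator over the remaining tokens, skipping all trivia
/// (whitespace, newlines, comments, and continuations).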
pub fn skip_trivia(self) -> impl Iterator<Item = SimpleToken> + DoubleEndedIterator + 'a {
self.filter(|t| !t.kind().is_trivia())
}
/// Given the positions of `#` tokens on a line, test if any `#` is the start of a comment and, if so, return the
/// length of the comment.
///
/// The challenge is that `#` tokens can also appear inside of strings:
///
/// ```python
/// ' #not a comment'
/// ```
///
/// This looks innocent, but is the `'` really the start of a new string, or could it be the closing delimiter
/// of a string that was started earlier, as in:
///
/// ```python
/// ' a string\
/// ` # a comment '
/// ```
///
/// The only way to reliably tell whether the `#` is a comment when the candidate comment contains a quote char is
/// to forward-lex all strings and comments and test if there's any unclosed string literal. If so, then
/// the hash cannot be a comment.
fn comment_from_hash_positions(&self, hash_positions: &[TextSize]) -> Option<TextSize> {
// Iterate over the `#` positions from the start to the end of the line.
// This is necessary to correctly support `a # comment # comment`.
for possible_start in hash_positions.iter().rev() {
let comment_bytes =
self.source[TextRange::new(*possible_start, self.back_offset)].as_bytes();
// Test if the comment contains any quotes. If so, then it's possible that the `#` token isn't
// the start of a comment, but instead part of a string:
// ```python
// a + 'a string # not a comment'
// a + '''a string
// # not a comment'''
// ```
match memchr2(b'\'', b'"', comment_bytes) {
// Most comments don't contain quotes, and most strings don't contain comments.
// For these it's safe to assume that they are comments.
None => return Some(self.cursor.chars().as_str().text_len() - possible_start),
// Now it gets complicated... There's no good way to know whether this is a string or not.
// It is necessary to lex all strings and comments from the start to know if it is one or the other.
Some(_) => {
if find_unterminated_string_kind(
&self.cursor.chars().as_str()[TextRange::up_to(*possible_start)],
)
.is_none()
{
// There's no unterminated string at the comment's start position. This *must*
// be a comment.
return Some(self.cursor.chars().as_str().text_len() - possible_start);
}
// This is a hash inside of a string: `'test # not a comment'`. Continue with the next potential comment on the line.
}
}
}
None
}
}
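/// Lexes `input` from the start, skipping over comments and terminated string literals, and
/// returns the [`StringKind`] of the first unterminated string literal, or `None` if every
/// string literal in `input` is terminated.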
fn find_unterminated_string_kind(input: &str) -> Option<StringKind> {
let mut rest = input;
while let Some(comment_or_string_start) = memchr3(b'#', b'\'', b'\"', rest.as_bytes()) {
let c = rest.as_bytes()[comment_or_string_start] as char;
let after = &rest[comment_or_string_start + 1..];
if c == '#' {
let comment_end = memchr2(b'\n', b'\r', after.as_bytes()).unwrap_or(after.len());
rest = &after[comment_end..];
} else {
let mut cursor = Cursor::new(after);
let quote_kind = if c == '\'' {
QuoteKind::Single
} else {
QuoteKind::Double
};
let string_kind = if cursor.eat_char(quote_kind.as_char()) {
// `''` or `""`
if cursor.eat_char(quote_kind.as_char()) {
// `'''` or `"""`
StringKind::Triple(quote_kind)
} else {
// empty string literal, nothing more to lex
rest = cursor.chars().as_str();
continue;
}
} else {
StringKind::Single(quote_kind)
};
if !is_string_terminated(string_kind, &mut cursor) {
return Some(string_kind);
}
rest = cursor.chars().as_str();
}
}
None
}
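/// Consumes characters from `cursor` (positioned just after the opening quote(s)) and returns
/// `true` if a matching closing quote is found: one quote for single-quoted strings, three for
/// triple-quoted strings. Returns `false` if the line ends first (for single-quoted strings) or
/// the input ends before the string is closed.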
fn is_string_terminated(kind: StringKind, cursor: &mut Cursor) -> bool {
let quote_char = kind.quote_kind().as_char();
while let Some(c) = cursor.bump() {
match c {
'\n' | '\r' if kind.is_single() => {
// Reached the end of the line without a closing quote; this is an unterminated string literal.
return false;
}
'\\' => {
// Skip over escaped quotes that match this string's quote, or double-escaped backslashes
if cursor.eat_char(quote_char) || cursor.eat_char('\\') {
continue;
}
// Eat over line continuation
cursor.eat_char('\r');
cursor.eat_char('\n');
}
c if c == quote_char => {
if kind.is_single() || (cursor.eat_char(quote_char) && cursor.eat_char(quote_char))
{
return true;
}
}
_ => {
// continue
}
}
}
// Reached end without a closing quote
false
}
impl Iterator for SimpleTokenizer<'_> {
type Item = SimpleToken;
fn next(&mut self) -> Option<Self::Item> {
let token = self.next_token();
if token.kind == SimpleTokenKind::EndOfFile {
None
} else {
Some(token)
}
}
}
impl DoubleEndedIterator for SimpleTokenizer<'_> {
fn next_back(&mut self) -> Option<Self::Item> {
let token = self.next_token_back();
if token.kind == SimpleTokenKind::EndOfFile {
None
} else {
Some(token)
}
}
}
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum StringKind {
/// `'...'` or `"..."`
Single(QuoteKind),
/// `'''...'''` or `"""..."""`
Triple(QuoteKind),
}
impl StringKind {
const fn quote_kind(self) -> QuoteKind {
match self {
StringKind::Single(kind) => kind,
StringKind::Triple(kind) => kind,
}
}
const fn is_single(self) -> bool {
matches!(self, StringKind::Single(_))
}
}
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum QuoteKind {
/// `'`
Single,
/// `"`
Double,
}
impl QuoteKind {
const fn as_char(self) -> char {
match self {
QuoteKind::Single => '\'',
QuoteKind::Double => '"',
}
}
}
#[cfg(test)]
mod tests {
use insta::assert_debug_snapshot;
use ruff_text_size::{TextLen, TextRange, TextSize};
use crate::tokenizer::{lines_after, lines_before, SimpleToken, SimpleTokenizer};
struct TokenizationTestCase {
source: &'static str,
range: TextRange,
tokens: Vec<SimpleToken>,
}
impl TokenizationTestCase {
fn assert_reverse_tokenization(&self) {
let mut backwards = self.tokenize_reverse();
// Re-reverse to get the tokens in forward order.
backwards.reverse();
assert_eq!(&backwards, &self.tokens);
}
fn tokenize_reverse(&self) -> Vec<SimpleToken> {
SimpleTokenizer::new(self.source, self.range)
.rev()
.collect()
}
fn tokens(&self) -> &[SimpleToken] {
&self.tokens
}
}
fn tokenize_range(source: &'static str, range: TextRange) -> TokenizationTestCase {
let tokens: Vec<_> = SimpleTokenizer::new(source, range).collect();
TokenizationTestCase {
source,
range,
tokens,
}
}
fn tokenize(source: &'static str) -> TokenizationTestCase {
tokenize_range(source, TextRange::new(TextSize::new(0), source.text_len()))
}
#[test]
fn tokenize_trivia() {
let source = "# comment\n # comment";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_parentheses() {
let source = "([{}])";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_comma() {
let source = ",,,,";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_continuation() {
let source = "( \\\n )";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_characters() {
let source = "-> *= (~=)";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tricky_unicode() {
let source = "មុ";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn identifier_ending_in_non_start_char() {
let source = "i5";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn ignore_word_with_only_id_continuing_chars() {
let source = "555";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
// note: not reversible: [other, bogus, bogus] vs [bogus, bogus, other]
}
#[test]
fn tokenize_multichar() {
let source = "if in else match";
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_substring() {
let source = "('some string') # comment";
let test_case =
tokenize_range(source, TextRange::new(TextSize::new(14), source.text_len()));
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_slash() {
let source = r#" # trailing positional comment
# Positional arguments only after here
,/"#;
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
test_case.assert_reverse_tokenization();
}
#[test]
fn tokenize_bogus() {
let source = r#"# leading comment
"a string"
a = (10)"#;
let test_case = tokenize(source);
assert_debug_snapshot!(test_case.tokens());
assert_debug_snapshot!("Reverse", test_case.tokenize_reverse());
}
#[test]
fn single_quoted_multiline_string_containing_comment() {
let test_case = tokenize(
r"'This string contains a hash looking like a comment\
# This is not a comment'",
);
assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn single_quoted_multiline_string_implicit_concatenation() {
let test_case = tokenize(
r#"'This string contains a hash looking like a comment\
# This is' "not_a_comment""#,
);
assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn triple_quoted_multiline_string_containing_comment() {
let test_case = tokenize(
r#"'''This string contains a hash looking like a comment
# This is not a comment'''"#,
);
assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn comment_containing_triple_quoted_string() {
let test_case = tokenize("'''leading string''' # a comment '''not a string'''");
assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn comment_containing_single_quoted_string() {
let test_case = tokenize("'leading string' # a comment 'not a string'");
assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn string_followed_by_multiple_comments() {
let test_case =
tokenize(r#"'a string # containing a hash " # and another hash ' # finally a comment"#);
assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn string_with_escaped_quote() {
let test_case = tokenize(r"'a string \' # containing a hash ' # finally a comment");
assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn string_with_double_escaped_backslash() {
let test_case = tokenize(r"'a string \\' # a comment '");
assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn empty_string_literal() {
let test_case = tokenize(r#"'' # a comment '"#);
assert_debug_snapshot!(test_case.tokenize_reverse());
}
#[test]
fn lines_before_empty_string() {
assert_eq!(lines_before(TextSize::new(0), ""), 0);
}
#[test]
fn lines_before_in_the_middle_of_a_line() {
assert_eq!(lines_before(TextSize::new(4), "a = 20"), 0);
}
#[test]
fn lines_before_on_a_new_line() {
assert_eq!(lines_before(TextSize::new(7), "a = 20\nb = 10"), 1);
}
#[test]
fn lines_before_multiple_leading_newlines() {
assert_eq!(lines_before(TextSize::new(9), "a = 20\n\r\nb = 10"), 2);
}
#[test]
fn lines_before_with_comment_offset() {
assert_eq!(lines_before(TextSize::new(8), "a = 20\n# a comment"), 0);
}
#[test]
fn lines_before_with_trailing_comment() {
assert_eq!(
lines_before(TextSize::new(22), "a = 20 # some comment\nb = 10"),
1
);
}
#[test]
fn lines_before_with_comment_only_line() {
assert_eq!(
lines_before(TextSize::new(22), "a = 20\n# some comment\nb = 10"),
1
);
}
#[test]
fn lines_after_empty_string() {
assert_eq!(lines_after(TextSize::new(0), ""), 0);
}
#[test]
fn lines_after_in_the_middle_of_a_line() {
assert_eq!(lines_after(TextSize::new(4), "a = 20"), 0);
}
#[test]
fn lines_after_before_a_new_line() {
assert_eq!(lines_after(TextSize::new(6), "a = 20\nb = 10"), 1);
}
#[test]
fn lines_after_multiple_newlines() {
assert_eq!(lines_after(TextSize::new(6), "a = 20\n\r\nb = 10"), 2);
}
#[test]
fn lines_after_before_comment_offset() {
assert_eq!(lines_after(TextSize::new(7), "a = 20 # a comment\n"), 0);
}
#[test]
fn lines_after_with_comment_only_line() {
assert_eq!(
lines_after(TextSize::new(6), "a = 20\n# some comment\nb = 10"),
1
);
}
}