From f1f31324d03204fc74321f433b51a4a246360884 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Sun, 19 Feb 2023 22:37:00 -0500 Subject: [PATCH] Use multipeek --- parser/src/parser.rs | 4 +- parser/src/soft_keywords.rs | 175 +++++++++++++++++++----------------- 2 files changed, 94 insertions(+), 85 deletions(-) diff --git a/parser/src/parser.rs b/parser/src/parser.rs index d8f09cd..f4c00f0 100644 --- a/parser/src/parser.rs +++ b/parser/src/parser.rs @@ -14,11 +14,11 @@ use crate::lexer::{LexResult, Tok}; pub use crate::mode::Mode; +use crate::soft_keywords::SoftKeywordTransformer; use crate::{ast, error::ParseError, lexer, python}; use ast::Location; use itertools::Itertools; use std::iter; -use crate::soft_keywords::soft_keywords; /// Parse a full Python program usually consisting of multiple lines. /// @@ -190,7 +190,7 @@ pub fn parse_tokens( .chain(lxr) .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline)); python::TopParser::new() - .parse(soft_keywords(tokenizer, mode).into_iter()) + .parse(SoftKeywordTransformer::new(tokenizer, mode).into_iter()) .map_err(|e| crate::error::parse_error_from_lalrpop(e, source_path)) } diff --git a/parser/src/soft_keywords.rs b/parser/src/soft_keywords.rs index 5a96929..b2c16f9 100644 --- a/parser/src/soft_keywords.rs +++ b/parser/src/soft_keywords.rs @@ -1,20 +1,10 @@ +use itertools::{Itertools, MultiPeek}; + use crate::lexer::{LexResult, Tok}; pub use crate::mode::Mode; -/// Collect all tokens from a token stream in a vector. -fn collect_tokens(tokenizer: impl IntoIterator) -> Vec { - let mut tokens: Vec = vec![]; - for tok in tokenizer { - let is_err = tok.is_err(); - tokens.push(tok); - if is_err { - break; - } - } - tokens -} - -/// Modify a token stream to accommodate soft keywords (namely, `match` and `case`). +/// An [`Iterator`] that transforms a token stream to accommodate soft keywords (namely, `match` +/// and `case`). 
/// /// [PEP 634](https://www.python.org/dev/peps/pep-0634/) introduced the `match` and `case` keywords /// as soft keywords, meaning that they can be used as identifiers (e.g., variable names) in certain @@ -25,93 +15,112 @@ fn collect_tokens(tokenizer: impl IntoIterator) -> Vec, - mode: Mode, -) -> Vec { - let mut tokenizer: Vec = collect_tokens(tokenizer); - let mut start_of_line = matches!(mode, Mode::Module | Mode::Interactive); - for i in 0..tokenizer.len() { - // If the token is a `match` or `case` token, check if it's used as an identifier. - // We assume every `match` or `case` is an identifier unless both of the following - // conditions are met: - // 1. The token is at the start of a logical line. - // 2. The logical line contains a top-level colon (that is, a colon that is not nested - // inside a parenthesized expression, list, or dictionary). - // 3. The top-level colon is not the immediate sibling of a `match` or `case` token. - // (This is to avoid treating `match` and `case` as identifiers when annotated with - // type hints.) - if tokenizer[i] - .as_ref() - .map_or(false, |(_, tok, _)| matches!(tok, Tok::Match | Tok::Case)) - { - let is_identifier = { - if !start_of_line { - // If the `match` or `case` token is not at the start of a line, it's definitely - // an identifier. - true +pub struct SoftKeywordTransformer +where + I: Iterator, +{ + pub underlying: MultiPeek, + pub start_of_line: bool, +} + +impl SoftKeywordTransformer +where + I: Iterator, +{ + pub fn new(tokenizer: I, mode: Mode) -> Self { + Self { + underlying: tokenizer.multipeek(), + start_of_line: matches!(mode, Mode::Interactive | Mode::Module), + } + } +} + +impl Iterator for SoftKeywordTransformer +where + I: Iterator, +{ + type Item = LexResult; + + #[inline] + fn next(&mut self) -> Option { + let mut next = self.underlying.next(); + if let Some(Ok((start, tok, end))) = next.as_ref() { + // If the token is a `match` or `case` token, check if it's used as an identifier. 
+ // We assume every `match` or `case` is an identifier unless both of the following + // conditions are met: + // 1. The token is at the start of a logical line. + // 2. The logical line contains a top-level colon (that is, a colon that is not nested + // inside a parenthesized expression, list, or dictionary). + // 3. The top-level colon is not the immediate sibling of a `match` or `case` token. + // (This is to avoid treating `match` and `case` as identifiers when annotated with + // type hints.) + if matches!(tok, Tok::Match | Tok::Case) { + if !self.start_of_line { + next = Some(Ok(( + *start, + Tok::Name { + name: if matches!(tok, Tok::Match) { + "match".to_string() + } else { + "case".to_string() + }, + }, + *end, + ))); } else { - // - let mut seen_colon = false; - let mut first = true; let mut par_count = 0; let mut sqb_count = 0; let mut brace_count = 0; - for (_, tok, _) in tokenizer.iter().skip(i + 1).flatten() { + let mut first = true; + let mut seen_colon = false; + while let Some(Ok((_, tok, _))) = self.underlying.peek() { match tok { Tok::Newline => break, Tok::Colon if par_count == 0 && sqb_count == 0 && brace_count == 0 => { if !first { seen_colon = true; } - break; - } - Tok::Lpar => { - par_count += 1; - } - Tok::Rpar => { - par_count -= 1; - } - Tok::Lsqb => { - sqb_count += 1; - } - Tok::Rsqb => { - sqb_count -= 1; - } - Tok::Lbrace => { - brace_count += 1; - } - Tok::Rbrace => { - brace_count -= 1; } + Tok::Lpar => par_count += 1, + Tok::Rpar => par_count -= 1, + Tok::Lsqb => sqb_count += 1, + Tok::Rsqb => sqb_count -= 1, + Tok::Lbrace => brace_count += 1, + Tok::Rbrace => brace_count -= 1, _ => {} } first = false; } - !seen_colon - } - }; - if is_identifier { - if let Ok((_, tok, _)) = &mut tokenizer[i] { - if let Tok::Match = tok { - *tok = Tok::Name { - name: "match".to_string(), - }; - } else if let Tok::Case = tok { - *tok = Tok::Name { - name: "case".to_string(), - }; + if !seen_colon { + next = Some(Ok(( + *start, + Tok::Name { + name: 
if matches!(tok, Tok::Match) { + "match".to_string() + } else { + "case".to_string() + }, + }, + *end, + ))); } } } } - start_of_line = tokenizer[i].as_ref().map_or(false, |(_, tok, _)| { - matches!( - tok, - Tok::StartModule | Tok::StartInteractive | Tok::Newline | Tok::Indent | Tok::Dedent - ) - }); - } - tokenizer + self.start_of_line = next.as_ref().map_or(false, |lex_result| { + lex_result.as_ref().map_or(false, |(_, tok, _)| { + matches!( + tok, + Tok::StartModule + | Tok::StartInteractive + | Tok::Newline + | Tok::Indent + | Tok::Dedent + ) + }) + }); + + next + } }