Use multipeek

Charlie Marsh 2023-02-19 22:37:00 -05:00
parent 8649bf6f8f
commit f1f31324d0
2 changed files with 94 additions and 85 deletions

parser.rs

@@ -14,11 +14,11 @@
 use crate::lexer::{LexResult, Tok};
 pub use crate::mode::Mode;
+use crate::soft_keywords::SoftKeywordTransformer;
 use crate::{ast, error::ParseError, lexer, python};
 use ast::Location;
 use itertools::Itertools;
 use std::iter;
-use crate::soft_keywords::soft_keywords;
 
 /// Parse a full Python program usually consisting of multiple lines.
 ///
@@ -190,7 +190,7 @@ pub fn parse_tokens(
         .chain(lxr)
         .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
 
     python::TopParser::new()
-        .parse(soft_keywords(tokenizer, mode).into_iter())
+        .parse(SoftKeywordTransformer::new(tokenizer, mode).into_iter())
         .map_err(|e| crate::error::parse_error_from_lalrpop(e, source_path))
 }
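
With this change, the parser consumes tokens lazily: the old `soft_keywords` pass collected the entire token stream into a `Vec` up front, while `SoftKeywordTransformer` rewrites tokens on the fly as the parser pulls them. The lookahead that forced the eager collection is now served by itertools' `MultiPeek`, where repeated `peek()` calls walk forward through buffered items without consuming anything. A minimal standalone sketch of that behavior (not part of this commit):

use itertools::Itertools;

fn main() {
    let mut iter = [1, 2, 3].into_iter().multipeek();

    // Each successive peek() advances an internal cursor, buffering
    // items from the underlying iterator without consuming them.
    assert_eq!(iter.peek(), Some(&1));
    assert_eq!(iter.peek(), Some(&2));

    // next() still yields the first element and resets the peek cursor.
    assert_eq!(iter.next(), Some(1));
    assert_eq!(iter.peek(), Some(&2));
}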

soft_keywords.rs

@@ -1,20 +1,10 @@
+use itertools::{Itertools, MultiPeek};
+
 use crate::lexer::{LexResult, Tok};
 pub use crate::mode::Mode;
 
-/// Collect all tokens from a token stream in a vector.
-fn collect_tokens(tokenizer: impl IntoIterator<Item = LexResult>) -> Vec<LexResult> {
-    let mut tokens: Vec<LexResult> = vec![];
-    for tok in tokenizer {
-        let is_err = tok.is_err();
-        tokens.push(tok);
-        if is_err {
-            break;
-        }
-    }
-    tokens
-}
-
-/// Modify a token stream to accommodate soft keywords (namely, `match` and `case`).
+/// An [`Iterator`] that transforms a token stream to accommodate soft keywords (namely, `match`
+/// and `case`).
 ///
 /// [PEP 634](https://www.python.org/dev/peps/pep-0634/) introduced the `match` and `case` keywords
 /// as soft keywords, meaning that they can be used as identifiers (e.g., variable names) in certain
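
The decision rule in the doc comment (a `match`/`case` at the start of a logical line, followed by a top-level colon that is not its immediate sibling, is a keyword; anything else is an identifier) can be distilled into a self-contained sketch. The `Token` enum and `is_match_statement` helper below are illustrative stand-ins for this crate's types, not its API:

/// Illustrative stand-in for the lexer's token type.
#[derive(Debug)]
enum Token {
    Ident, // any identifier-like token
    Colon,
    Lpar,
    Rpar,
    Newline,
}

/// Scan the rest of the logical line (the tokens after `match`) for a
/// colon outside all brackets, mirroring the transformer's peek loop.
/// Only parentheses are tracked here, for brevity.
fn is_match_statement(rest: &[Token]) -> bool {
    let mut par_count = 0;
    let mut first = true;
    for tok in rest {
        match tok {
            Token::Newline => break,
            // A colon that immediately follows `match` (first == true) is
            // an annotation (`match: int = 1`), not a match statement.
            Token::Colon if par_count == 0 => return !first,
            Token::Lpar => par_count += 1,
            Token::Rpar => par_count -= 1,
            _ => {}
        }
        first = false;
    }
    false
}

fn main() {
    // `match x:` -> keyword (top-level colon after a subject expression).
    assert!(is_match_statement(&[Token::Ident, Token::Colon]));
    // `match(x)` then newline -> identifier (a call, no top-level colon).
    assert!(!is_match_statement(&[Token::Lpar, Token::Ident, Token::Rpar, Token::Newline]));
    // `match: int` -> identifier (the colon is an immediate sibling).
    assert!(!is_match_statement(&[Token::Colon, Token::Ident]));
}
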
@@ -25,93 +15,112 @@ fn collect_tokens(tokenizer: impl IntoIterator<Item = LexResult>) -> Vec<LexResu
 ///
 /// Handling soft keywords in this intermediary pass allows us to simplify both the lexer and
 /// parser, as neither of them need to be aware of soft keywords.
-pub fn soft_keywords(
-    tokenizer: impl IntoIterator<Item = LexResult>,
-    mode: Mode,
-) -> Vec<LexResult> {
-    let mut tokenizer: Vec<LexResult> = collect_tokens(tokenizer);
-    let mut start_of_line = matches!(mode, Mode::Module | Mode::Interactive);
-    for i in 0..tokenizer.len() {
-        // If the token is a `match` or `case` token, check if it's used as an identifier.
-        // We assume every `match` or `case` is an identifier unless both of the following
-        // conditions are met:
-        // 1. The token is at the start of a logical line.
-        // 2. The logical line contains a top-level colon (that is, a colon that is not nested
-        //    inside a parenthesized expression, list, or dictionary).
-        // 3. The top-level colon is not the immediate sibling of a `match` or `case` token.
-        //    (This is to avoid treating `match` and `case` as identifiers when annotated with
-        //    type hints.)
-        if tokenizer[i]
-            .as_ref()
-            .map_or(false, |(_, tok, _)| matches!(tok, Tok::Match | Tok::Case))
-        {
-            let is_identifier = {
-                if !start_of_line {
-                    // If the `match` or `case` token is not at the start of a line, it's definitely
-                    // an identifier.
-                    true
+pub struct SoftKeywordTransformer<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    pub underlying: MultiPeek<I>,
+    pub start_of_line: bool,
+}
+
+impl<I> SoftKeywordTransformer<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    pub fn new(tokenizer: I, mode: Mode) -> Self {
+        Self {
+            underlying: tokenizer.multipeek(),
+            start_of_line: matches!(mode, Mode::Interactive | Mode::Module),
+        }
+    }
+}
+
+impl<I> Iterator for SoftKeywordTransformer<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    type Item = LexResult;
+
+    #[inline]
+    fn next(&mut self) -> Option<LexResult> {
+        let mut next = self.underlying.next();
+        if let Some(Ok((start, tok, end))) = next.as_ref() {
+            // If the token is a `match` or `case` token, check if it's used as an identifier.
+            // We assume every `match` or `case` is an identifier unless both of the following
+            // conditions are met:
+            // 1. The token is at the start of a logical line.
+            // 2. The logical line contains a top-level colon (that is, a colon that is not nested
+            //    inside a parenthesized expression, list, or dictionary).
+            // 3. The top-level colon is not the immediate sibling of a `match` or `case` token.
+            //    (This is to avoid treating `match` and `case` as identifiers when annotated with
+            //    type hints.)
+            if matches!(tok, Tok::Match | Tok::Case) {
+                if !self.start_of_line {
+                    next = Some(Ok((
+                        *start,
+                        Tok::Name {
+                            name: if matches!(tok, Tok::Match) {
+                                "match".to_string()
+                            } else {
+                                "case".to_string()
+                            },
+                        },
+                        *end,
+                    )));
                 } else {
-                    //
-                    let mut seen_colon = false;
-                    let mut first = true;
                     let mut par_count = 0;
                     let mut sqb_count = 0;
                     let mut brace_count = 0;
-                    for (_, tok, _) in tokenizer.iter().skip(i + 1).flatten() {
+                    let mut first = true;
+                    let mut seen_colon = false;
+                    while let Some(Ok((_, tok, _))) = self.underlying.peek() {
                         match tok {
                             Tok::Newline => break,
                             Tok::Colon if par_count == 0 && sqb_count == 0 && brace_count == 0 => {
                                 if !first {
                                     seen_colon = true;
                                 }
                                 break;
                             }
-                            Tok::Lpar => {
-                                par_count += 1;
-                            }
-                            Tok::Rpar => {
-                                par_count -= 1;
-                            }
-                            Tok::Lsqb => {
-                                sqb_count += 1;
-                            }
-                            Tok::Rsqb => {
-                                sqb_count -= 1;
-                            }
-                            Tok::Lbrace => {
-                                brace_count += 1;
-                            }
-                            Tok::Rbrace => {
-                                brace_count -= 1;
-                            }
+                            Tok::Lpar => par_count += 1,
+                            Tok::Rpar => par_count -= 1,
+                            Tok::Lsqb => sqb_count += 1,
+                            Tok::Rsqb => sqb_count -= 1,
+                            Tok::Lbrace => brace_count += 1,
+                            Tok::Rbrace => brace_count -= 1,
                             _ => {}
                         }
                         first = false;
                     }
-                    !seen_colon
-                }
-            };
-            if is_identifier {
-                if let Ok((_, tok, _)) = &mut tokenizer[i] {
-                    if let Tok::Match = tok {
-                        *tok = Tok::Name {
-                            name: "match".to_string(),
-                        };
-                    } else if let Tok::Case = tok {
-                        *tok = Tok::Name {
-                            name: "case".to_string(),
-                        };
+                    if !seen_colon {
+                        next = Some(Ok((
+                            *start,
+                            Tok::Name {
+                                name: if matches!(tok, Tok::Match) {
+                                    "match".to_string()
+                                } else {
+                                    "case".to_string()
+                                },
+                            },
+                            *end,
+                        )));
                     }
                 }
             }
         }
-        start_of_line = tokenizer[i].as_ref().map_or(false, |(_, tok, _)| {
-            matches!(
-                tok,
-                Tok::StartModule | Tok::StartInteractive | Tok::Newline | Tok::Indent | Tok::Dedent
-            )
-        });
-    }
-    tokenizer
+        self.start_of_line = next.as_ref().map_or(false, |lex_result| {
+            lex_result.as_ref().map_or(false, |(_, tok, _)| {
+                matches!(
+                    tok,
+                    Tok::StartModule
+                        | Tok::StartInteractive
+                        | Tok::Newline
+                        | Tok::Indent
+                        | Tok::Dedent
+                )
+            })
+        });
+        next
+    }
 }
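
Since the transformer is just an iterator adapter, wiring it up is the single constructor call shown in `parse_tokens` above. A hypothetical crate-internal sketch (assuming a lexer entry point along the lines of `lexer::make_tokenizer`, which this diff does not show):

use crate::lexer::make_tokenizer; // assumed entry point, not part of this diff
use crate::mode::Mode;
use crate::soft_keywords::SoftKeywordTransformer;

fn dump_tokens(source: &str) {
    // Wrap the raw token stream; `match`/`case` used as identifiers come
    // out as Tok::Name rather than Tok::Match/Tok::Case.
    let tokens = SoftKeywordTransformer::new(make_tokenizer(source), Mode::Module);
    for result in tokens {
        match result {
            Ok((start, tok, end)) => println!("{start:?}..{end:?} {tok:?}"),
            Err(err) => eprintln!("lex error: {err:?}"),
        }
    }
}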