Approximate tokens len (#9546)

Micha Reiser 2024-01-19 17:39:37 +01:00 committed by GitHub
parent b3a6f0ce81
commit 47ad7b4500
5 changed files with 38 additions and 13 deletions

@@ -78,14 +78,14 @@
//! These tokens can be directly fed into the `ruff_python_parser` to generate an AST:
//!
//! ```
-//! use ruff_python_parser::{lexer::lex, Mode, parse_tokens};
+//! use ruff_python_parser::{Mode, parse_tokens, tokenize_all};
//!
//! let python_source = r#"
//! def is_odd(i):
//!     return bool(i & 1)
//! "#;
-//! let tokens = lex(python_source, Mode::Module);
-//! let ast = parse_tokens(tokens.collect(), python_source, Mode::Module);
+//! let tokens = tokenize_all(python_source, Mode::Module);
+//! let ast = parse_tokens(tokens, python_source, Mode::Module);
//!
//! assert!(ast.is_ok());
//! ```
@@ -133,7 +133,7 @@ pub mod typing;
/// Collect tokens up to and including the first error.
pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
-    let mut tokens: Vec<LexResult> = vec![];
+    let mut tokens: Vec<LexResult> = allocate_tokens_vec(contents);
    for tok in lexer::lex(contents, mode) {
        let is_err = tok.is_err();
        tokens.push(tok);
@@ -141,9 +141,35 @@ pub fn tokenize(contents: &str, mode: Mode) -> Vec<LexResult> {
            break;
        }
    }
    tokens
}

+/// Tokenizes all of `contents`.
+///
+/// It differs from [`tokenize`] in that it doesn't stop after the first `Err`.
+pub fn tokenize_all(contents: &str, mode: Mode) -> Vec<LexResult> {
+    let mut tokens = allocate_tokens_vec(contents);
+    for token in lexer::lex(contents, mode) {
+        tokens.push(token);
+    }
+    tokens
+}
+
+/// Allocates a [`Vec`] with an approximated capacity to fit all tokens
+/// of `contents`.
+///
+/// See [#9546](https://github.com/astral-sh/ruff/pull/9546) for a more detailed explanation.
+pub fn allocate_tokens_vec(contents: &str) -> Vec<LexResult> {
+    Vec::with_capacity(approximate_tokens_lower_bound(contents))
+}
+
+/// Approximates the number of tokens when lexing `contents`.
+fn approximate_tokens_lower_bound(contents: &str) -> usize {
+    contents.len().saturating_mul(15) / 100
+}
+
/// Parse a full Python program from its tokens.
pub fn parse_program_tokens(
    tokens: Vec<LexResult>,
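
The new `approximate_tokens_lower_bound` is the core of this change: the token `Vec` starts with capacity for roughly 15% of the source's byte length instead of growing from empty. A minimal standalone sketch of the arithmetic, with `u64` standing in for `LexResult` purely for illustration:

```rust
/// Mirrors the heuristic from the diff above: reserve ~15% of the source's
/// byte length as the initial token capacity.
fn approximate_tokens_lower_bound(contents: &str) -> usize {
    contents.len().saturating_mul(15) / 100
}

fn main() {
    // 100 copies of a 6-byte line -> 600 bytes of Python source.
    let source = "x = 1\n".repeat(100);
    assert_eq!(source.len(), 600);

    // 600 * 15 / 100 = 90 slots reserved up front. The actual token count of
    // this source is larger, which is fine: the estimate is only a starting
    // capacity, and the Vec reallocates as usual if it is exceeded.
    assert_eq!(approximate_tokens_lower_bound(&source), 90);

    // In the crate this would be a Vec<LexResult>; u64 stands in here.
    let tokens: Vec<u64> = Vec::with_capacity(approximate_tokens_lower_bound(&source));
    assert!(tokens.capacity() >= 90);
}
```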

@@ -31,7 +31,7 @@ use crate::{
    lexer::{self, LexicalError, LexicalErrorType},
    python,
    token::Tok,
-    Mode,
+    tokenize_all, Mode,
};
/// Parse a full Python program usually consisting of multiple lines.
@@ -55,8 +55,7 @@ use crate::{
/// assert!(program.is_ok());
/// ```
pub fn parse_program(source: &str) -> Result<ModModule, ParseError> {
-    let lexer = lex(source, Mode::Module);
-    match parse_tokens(lexer.collect(), source, Mode::Module)? {
+    match parse_tokens(tokenize_all(source, Mode::Module), source, Mode::Module)? {
        Mod::Module(m) => Ok(m),
        Mod::Expression(_) => unreachable!("Mode::Module doesn't return other variant"),
    }
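
With the import in place, `parse_program` now lexes through `tokenize_all` rather than collecting the lexer iterator by hand, so it also benefits from the pre-sized vector. A sketch of how the entry points fit together after this change, assuming `parse_program`, `parse_tokens`, `tokenize_all`, and `Mode` are all re-exported from the crate root as the module docs in the first file suggest:

```rust
use ruff_python_parser::{parse_program, parse_tokens, tokenize_all, Mode};

fn main() {
    let source = "def is_odd(i):\n    return bool(i & 1)\n";

    // High level: parse_program lexes via tokenize_all internally.
    let module = parse_program(source).expect("valid Python should parse");
    assert_eq!(module.body.len(), 1);

    // The same pipeline spelled out: build the token vector first, then parse.
    let tokens = tokenize_all(source, Mode::Module);
    let ast = parse_tokens(tokens, source, Mode::Module);
    assert!(ast.is_ok());
}
```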