Document parser crate.

This commit is contained in:
Dimitris Fasarakis Hilliard 2023-02-07 21:42:15 +02:00
parent e7f14ab9b8
commit 07918f0a9a
6 changed files with 429 additions and 86 deletions

View file

@@ -1,6 +1,6 @@
use rustpython_ast::{Expr, ExprContext, ExprKind};
pub fn set_context(expr: Expr, ctx: ExprContext) -> Expr {
pub(crate) fn set_context(expr: Expr, ctx: ExprContext) -> Expr {
match expr.node {
ExprKind::Name { id, .. } => Expr {
node: ExprKind::Name { id, ctx },

View file

@@ -1,40 +1,71 @@
//! Define internal parse error types
//! The goal is to provide a matching and a safe error API, masking errors from LALR
//! Error types for the parser.
//!
//! These types are used to represent errors that occur during lexing and parsing and are
//! returned by the `parse_*` functions in the [parser] module and the iterator in the
//! [lexer] implementation.
//!
//! [parser]: crate::parser
//! [lexer]: crate::lexer
// Define internal parse error types.
// The goal is to provide a matching and a safe error API, masking errors from LALR
use crate::{ast::Location, token::Tok};
use lalrpop_util::ParseError as LalrpopError;
use std::fmt;
/// Represents an error during lexical scanning.
/// Represents an error during lexing.
#[derive(Debug, PartialEq)]
pub struct LexicalError {
/// The type of error that occurred.
pub error: LexicalErrorType,
/// The location of the error.
pub location: Location,
}
impl LexicalError {
/// Creates a new `LexicalError` with the given error type and location.
pub fn new(error: LexicalErrorType, location: Location) -> Self {
Self { error, location }
}
}
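A quick sketch of how these two pieces fit together, pairing an error kind with the `Location` where it occurred (module paths are assumed from the crate-level docs, which link `[error]: crate::error`):

```rust
use rustpython_parser::ast::Location;
use rustpython_parser::error::{LexicalError, LexicalErrorType};

// Report an unrecognized character at line 1, column 8.
let err = LexicalError::new(
    LexicalErrorType::UnrecognizedToken { tok: '$' },
    Location::new(1, 8),
);
assert_eq!(err.error, LexicalErrorType::UnrecognizedToken { tok: '$' });
```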
/// Represents the different types of errors that can occur during lexing.
#[derive(Debug, PartialEq)]
pub enum LexicalErrorType {
// TODO: Can probably be removed, the places it is used seem to be able
// to use the `UnicodeError` variant instead.
#[doc(hidden)]
StringError,
// TODO: Should take a start/end position to report.
/// Decoding of a unicode escape sequence in a string literal failed.
UnicodeError,
/// The nesting of brackets/braces/parentheses is not balanced.
NestingError,
/// The indentation is not consistent.
IndentationError,
/// Inconsistent use of tabs and spaces.
TabError,
/// Encountered a tab after a space.
TabsAfterSpaces,
/// A non-default argument follows a default argument.
DefaultArgumentError,
/// A duplicate argument was found in a function definition.
DuplicateArgumentError(String),
/// A positional argument follows a keyword argument.
PositionalArgumentError,
/// An iterable argument unpacking `*args` follows keyword argument unpacking `**kwargs`.
UnpackedArgumentError,
/// A keyword argument was repeated.
DuplicateKeywordArgumentError(String),
/// An unrecognized token was encountered.
UnrecognizedToken { tok: char },
/// An f-string error containing the [`FStringErrorType`].
FStringError(FStringErrorType),
/// An unexpected character was encountered after a line continuation.
LineContinuationError,
/// An unexpected end of file was encountered.
Eof,
/// An unexpected error occurred.
OtherError(String),
}
@@ -85,13 +116,17 @@ impl fmt::Display for LexicalErrorType {
}
// TODO: consolidate these with ParseError
/// An error that occurred during parsing of an f-string.
#[derive(Debug, PartialEq)]
pub struct FStringError {
/// The type of error that occurred.
pub error: FStringErrorType,
/// The location of the error.
pub location: Location,
}
impl FStringError {
/// Creates a new `FStringError` with the given error type and location.
pub fn new(error: FStringErrorType, location: Location) -> Self {
Self { error, location }
}
@@ -106,19 +141,33 @@ impl From<FStringError> for LexicalError {
}
}
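Since f-string problems surface during lexing, `FStringError` converts into `LexicalError` through the `From` impl above; a minimal sketch, assuming the conversion wraps the inner error in the `LexicalErrorType::FStringError` variant:

```rust
use rustpython_parser::ast::Location;
use rustpython_parser::error::{
    FStringError, FStringErrorType, LexicalError, LexicalErrorType,
};

let err = FStringError::new(FStringErrorType::UnclosedLbrace, Location::new(2, 3));
let lex_err: LexicalError = err.into();
assert_eq!(
    lex_err.error,
    LexicalErrorType::FStringError(FStringErrorType::UnclosedLbrace)
);
```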
/// Represents the different types of errors that can occur during parsing of an f-string.
#[derive(Debug, PartialEq)]
pub enum FStringErrorType {
/// Expected a right brace after an opened left brace.
UnclosedLbrace,
/// Expected a left brace after an ending right brace.
UnopenedRbrace,
/// Expected a right brace after a conversion flag.
ExpectedRbrace,
/// An error occurred while parsing an f-string expression.
InvalidExpression(Box<ParseErrorType>),
/// An invalid conversion flag was encountered.
InvalidConversionFlag,
/// An empty expression was encountered.
EmptyExpression,
/// An opening delimiter was not closed properly.
MismatchedDelimiter(char, char),
/// Too many nested expressions in an f-string.
ExpressionNestedTooDeeply,
/// The f-string expression cannot include the given character.
ExpressionCannotInclude(char),
/// A single right brace was encountered.
SingleRbrace,
/// A closing delimiter was not opened properly.
Unmatched(char),
// TODO: Test this case.
/// Unterminated string.
UnterminatedString,
}
@@ -167,9 +216,10 @@ impl From<FStringError> for LalrpopError<Location, Tok, LexicalError> {
}
}
/// Represents an error during parsing
/// Represents an error during parsing.
pub type ParseError = rustpython_compiler_core::BaseError<ParseErrorType>;
/// Represents the different types of errors that can occur during parsing.
#[derive(Debug, PartialEq, thiserror::Error)]
pub enum ParseErrorType {
/// Parser encountered an unexpected end of input
@@ -180,11 +230,12 @@ pub enum ParseErrorType {
InvalidToken,
/// Parser encountered an unexpected token
UnrecognizedToken(Tok, Option<String>),
/// Maps to `User` type from `lalrpop-util`
// Maps to `User` type from `lalrpop-util`
/// Parser encountered an error during lexing.
Lexical(LexicalErrorType),
}
/// Convert `lalrpop_util::ParseError` to our internal type
// Convert `lalrpop_util::ParseError` to our internal type
pub(crate) fn parse_error_from_lalrpop(
err: LalrpopError<Location, Tok, LexicalError>,
source_path: &str,
@@ -258,6 +309,7 @@ impl fmt::Display for ParseErrorType {
}
impl ParseErrorType {
/// Returns true if the error is an indentation error.
pub fn is_indentation_error(&self) -> bool {
match self {
ParseErrorType::Lexical(LexicalErrorType::IndentationError) => true,
@@ -267,6 +319,8 @@ impl ParseErrorType {
_ => false,
}
}
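These predicates can be checked directly against a `ParseErrorType` value; a small sketch using the `Lexical` variant documented above (`is_tab_error`, shown next, follows the same pattern):

```rust
use rustpython_parser::error::{LexicalErrorType, ParseErrorType};

let err = ParseErrorType::Lexical(LexicalErrorType::IndentationError);
assert!(err.is_indentation_error());
assert!(!err.is_tab_error());
```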
/// Returns true if the error is a tab error.
pub fn is_tab_error(&self) -> bool {
matches!(
self,

View file

@@ -1,19 +1,119 @@
//! This crate can be used to parse python sourcecode into a so
//! called AST (abstract syntax tree).
//! This crate can be used to parse Python source code into an Abstract
//! Syntax Tree.
//!
//! The stages involved in this process are lexical analysis and
//! parsing. The lexical analysis splits the sourcecode into
//! tokens, and the parsing transforms those tokens into an AST.
//! ## Overview:
//!
//! For example, one could do this:
//! The process by which source code is parsed into an AST can be broken down
//! into two general stages: [lexical analysis] and [parsing].
//!
//! During lexical analysis, the source code is converted into a stream of lexical
//! tokens that represent the smallest meaningful units of the language. For example,
//! the source code `print("Hello world")` would _roughly_ be converted into the following
//! stream of tokens:
//!
//! ```text
//! Name("print"), LeftParen, String("Hello world"), RightParen
//! ```
//!
//! These tokens are then consumed by the parser, which matches them against a set of
//! grammar rules to verify that the source code is syntactically valid and to construct
//! an AST that represents the source code.
//!
//! During parsing, the parser consumes the tokens generated by the lexer and constructs
//! a tree representation of the source code. The tree is made up of nodes that represent
//! the different syntactic constructs of the language. If the source code is syntactically
//! invalid, parsing fails and an error is returned. After a successful parse, the AST can
//! be used to perform further analysis on the source code. Continuing with the example
//! above, the AST generated by the parser would _roughly_ look something like this:
//!
//! ```text
//! node: Expr {
//! value: {
//! node: Call {
//! func: {
//! node: Name {
//! id: "print",
//! ctx: Load,
//! },
//! },
//! args: [
//! node: Constant {
//! value: Str("Hello world"),
//! kind: None,
//! },
//! ],
//! keywords: [],
//! },
//! },
//! },
//! ```
//!
//! Note: The tokens/ASTs shown above are not the exact tokens/ASTs generated by the lexer and parser.
//!
//! ## Source code layout:
//!
//! The functionality of this crate is split into several modules:
//!
//! - [token]: This module contains the definition of the tokens that are generated by the lexer.
//! - [lexer]: This module contains the lexer and is responsible for generating the tokens.
//! - [parser]: This module contains an interface to the parser and is responsible for generating the AST.
//!   - Functions and strings have special parsing requirements that are handled in additional files.
//! - [mode]: This module contains the definition of the different modes that the parser can be in.
//! - [error]: This module contains the definition of the errors that can be returned by the parser.
//!
//! # Examples
//!
//! For example, to get a stream of tokens from a given string, one could do this:
//!
//! ```
//! use rustpython_parser::{parser, ast};
//! use rustpython_parser::lexer::make_tokenizer;
//!
//! let python_source = "print('Hello world')";
//! let python_ast = parser::parse_expression(python_source, "<embedded>").unwrap();
//! let python_source = r#"
//! def is_odd(i):
//!     return bool(i & 1)
//! "#;
//! let mut tokens = make_tokenizer(python_source);
//! assert!(tokens.all(|t| t.is_ok()));
//! ```
//!
//! These tokens can be directly fed into the parser to generate an AST:
//!
//! ```
//! use rustpython_parser::parser::{parse_tokens, Mode};
//! use rustpython_parser::lexer::make_tokenizer;
//!
//! let python_source = r#"
//! def is_odd(i):
//!     return bool(i & 1)
//! "#;
//! let tokens = make_tokenizer(python_source);
//! let ast = parse_tokens(tokens, Mode::Module, "<embedded>");
//!
//! assert!(ast.is_ok());
//! ```
//!
//! Alternatively, you can use one of the other `parse_*` functions to parse a string directly without using a specific
//! mode or tokenizing the source beforehand:
//!
//! ```
//! use rustpython_parser::parser::parse_program;
//!
//! let python_source = r#"
//! def is_odd(i):
//!     return bool(i & 1)
//! "#;
//! let ast = parse_program(python_source, "<embedded>");
//!
//! assert!(ast.is_ok());
//! ```
//!
//! [lexical analysis]: https://en.wikipedia.org/wiki/Lexical_analysis
//! [parsing]: https://en.wikipedia.org/wiki/Parsing
//! [token]: crate::token
//! [lexer]: crate::lexer
//! [parser]: crate::parser
//! [mode]: crate::mode
//! [error]: crate::error
#![doc(html_logo_url = "https://raw.githubusercontent.com/RustPython/RustPython/main/logo.png")]
#![doc(html_root_url = "https://docs.rs/rustpython-parser/")]

View file

@@ -1,9 +1,14 @@
//! Control over the different modes by which a source file can be parsed.
use crate::token::Tok;
/// The mode argument specifies in what way code must be parsed.
#[derive(Clone, Copy)]
pub enum Mode {
/// The code consists of a sequence of statements.
Module,
/// The code consists of a sequence of interactive statements.
Interactive,
/// The code consists of a single expression.
Expression,
}
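The `FromStr` implementation in the next hunk lets a mode be selected at runtime from a string; a minimal sketch (the accepted spellings, such as `"eval"`, are an assumption based on CPython's compile modes):

```rust
use rustpython_parser::parser::Mode;

// Hypothetical runtime mode selection, e.g. from a CLI flag.
assert!("eval".parse::<Mode>().is_ok());
assert!("no-such-mode".parse::<Mode>().is_err());
```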
@@ -39,6 +44,7 @@ impl std::str::FromStr for Mode {
}
}
/// Returned when a given mode is not valid.
#[derive(Debug)]
pub struct ModeParseError {
_priv: (),

View file

@@ -1,9 +1,16 @@
//! Python parsing.
//! Contains the interface to the Python parser.
//!
//! Use this module to parse python code into an AST.
//! There are three ways to parse python code. You could
//! parse a whole program, a single statement, or a single
//! expression.
//! Functions in this module can be used to parse Python code into an [Abstract Syntax Tree]
//! (AST) that is then transformed into bytecode.
//!
//! There are three ways to parse Python code corresponding to the different [`Mode`]s
//! defined in the [`mode`] module.
//!
//! All functions return a [`Result`](std::result::Result) containing the parsed AST or
//! a [`ParseError`] if parsing failed.
//!
//! [Abstract Syntax Tree]: https://en.wikipedia.org/wiki/Abstract_syntax_tree
//! [`Mode`]: crate::mode
use crate::lexer::{LexResult, Tok};
pub use crate::mode::Mode;
@@ -12,13 +19,26 @@ use ast::Location;
use itertools::Itertools;
use std::iter;
/*
* Parse python code.
* Grammar may be inspired by antlr grammar for python:
* https://github.com/antlr/grammars-v4/tree/master/python3
*/
/// Parse a full python program, containing usually multiple lines.
/// Parse a full Python program usually consisting of multiple lines.
///
/// This is a convenience function that can be used to parse a full Python program without having to
/// specify the [`Mode`] or the location. It is probably what you want to use most of the time.
///
/// # Example
///
/// For example, parsing a simple function definition and a call to that function:
///
/// ```
/// use rustpython_parser::parser;
/// let source = r#"
/// def foo():
///     return 42
///
/// print(foo())
/// "#;
/// let program = parser::parse_program(source, "<embedded>");
/// assert!(program.is_ok());
/// ```
pub fn parse_program(source: &str, source_path: &str) -> Result<ast::Suite, ParseError> {
parse(source, Mode::Module, source_path).map(|top| match top {
ast::Mod::Module { body, .. } => body,
@@ -26,49 +46,44 @@ pub fn parse_program(source: &str, source_path: &str) -> Result<ast::Suite, Pars
})
}
/// Parses a python expression
/// Parses a single Python expression.
///
/// This convenience function can be used to parse a single expression without having to
/// specify the Mode or the location.
///
/// # Example
/// ```
///
/// For example, parsing a single expression denoting the addition of two numbers:
///
/// ```
/// extern crate num_bigint;
/// use rustpython_parser::{parser, ast};
/// let expr = parser::parse_expression("1 + 2", "<embedded>").unwrap();
/// let expr = parser::parse_expression("1 + 2", "<embedded>");
///
/// assert_eq!(
/// expr,
/// ast::Expr {
/// location: ast::Location::new(1, 0),
/// end_location: Some(ast::Location::new(1, 5)),
/// custom: (),
/// node: ast::ExprKind::BinOp {
/// left: Box::new(ast::Expr {
/// location: ast::Location::new(1, 0),
/// end_location: Some(ast::Location::new(1, 1)),
/// custom: (),
/// node: ast::ExprKind::Constant {
/// value: ast::Constant::Int(1.into()),
/// kind: None,
/// }
/// }),
/// op: ast::Operator::Add,
/// right: Box::new(ast::Expr {
/// location: ast::Location::new(1, 4),
/// end_location: Some(ast::Location::new(1, 5)),
/// custom: (),
/// node: ast::ExprKind::Constant {
/// value: ast::Constant::Int(2.into()),
/// kind: None,
/// }
/// })
/// }
/// },
/// );
/// assert!(expr.is_ok());
///
/// ```
pub fn parse_expression(source: &str, path: &str) -> Result<ast::Expr, ParseError> {
parse_expression_located(source, path, Location::new(1, 0))
}
/// Parses a Python expression from a given location.
///
/// This function allows you to specify the location of the expression in the source code; other than
/// that, it behaves exactly like [`parse_expression`].
///
/// # Example
///
/// Parsing a single expression denoting the addition of two numbers, but this time specifying a different,
/// somewhat silly, location:
///
/// ```
/// use rustpython_parser::parser::parse_expression_located;
/// use rustpython_parser::ast::Location;
///
/// let expr = parse_expression_located("1 + 2", "<embedded>", Location::new(5, 20));
/// assert!(expr.is_ok());
/// ```
pub fn parse_expression_located(
source: &str,
path: &str,
@@ -80,12 +95,64 @@ pub fn parse_expression_located(
})
}
// Parse a given source code
/// Parse the given Python source code using the specified [`Mode`].
///
/// This function is the most general function to parse Python code. Based on the [`Mode`] supplied,
/// it can be used to parse a single expression, a full Python program or an interactive expression.
///
/// # Example
///
/// If we want to parse a simple expression, we can use the [`Mode::Expression`] mode during
/// parsing:
///
/// ```
/// use rustpython_parser::parser::{parse, Mode};
///
/// let expr = parse("1 + 2", Mode::Expression, "<embedded>");
/// assert!(expr.is_ok());
/// ```
///
/// Alternatively, we can parse a full Python program consisting of multiple lines:
///
/// ```
/// use rustpython_parser::parser::{parse, Mode};
///
/// let source = r#"
/// class Greeter:
///
///     def greet(self):
///         print("Hello, world!")
/// "#;
/// let program = parse(source, Mode::Module, "<embedded>");
/// assert!(program.is_ok());
/// ```
pub fn parse(source: &str, mode: Mode, source_path: &str) -> Result<ast::Mod, ParseError> {
parse_located(source, mode, source_path, Location::new(1, 0))
}
// Parse a given source code from a given location
/// Parse the given Python source code using the specified [`Mode`] and [`Location`].
///
/// This function allows you to specify the location of the source code; other than
/// that, it behaves exactly like [`parse`].
///
/// # Example
///
/// ```
/// use rustpython_parser::parser::{parse_located, Mode};
/// use rustpython_parser::ast::Location;
///
/// let source = r#"
/// def fib(i):
///     a, b = 0, 1
///     for _ in range(i):
///         a, b = b, a + b
///     return a
///
/// print(fib(42))
/// "#;
/// let program = parse_located(source, Mode::Module, "<embedded>", Location::new(1, 0));
/// assert!(program.is_ok());
/// ```
pub fn parse_located(
source: &str,
mode: Mode,
@@ -96,7 +163,22 @@ pub fn parse_located(
parse_tokens(lxr, mode, source_path)
}
// Parse a given token iterator.
/// Parse an iterator of [`LexResult`]s using the specified [`Mode`].
///
/// This could allow you to perform some preprocessing on the tokens before parsing them.
///
/// # Example
///
/// As an example, instead of parsing a string, we can parse a list of tokens after we generate
/// them using the [`lexer::make_tokenizer`] function:
///
/// ```
/// use rustpython_parser::parser::{parse_tokens, Mode};
/// use rustpython_parser::lexer::make_tokenizer;
///
/// let expr = parse_tokens(make_tokenizer("1 + 2"), Mode::Expression, "<embedded>");
/// assert!(expr.is_ok());
/// ```
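///
/// Building on that, one could drop `Comment` tokens before handing the stream to the
/// parser; a sketch, assuming each `LexResult` carries a `(start, token, end)` triple:
///
/// ```
/// use rustpython_parser::parser::{parse_tokens, Mode};
/// use rustpython_parser::lexer::make_tokenizer;
/// use rustpython_parser::token::Tok;
///
/// let tokens = make_tokenizer("x = 1  # a comment")
///     .filter(|t| !matches!(t, Ok((_, Tok::Comment(_), _))));
/// assert!(parse_tokens(tokens, Mode::Module, "<embedded>").is_ok());
/// ```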
pub fn parse_tokens(
lxr: impl IntoIterator<Item = LexResult>,
mode: Mode,
@@ -328,4 +410,13 @@ with (0 as a, 1 as b,): pass
let parse_ast = parse_expression(r#"{"a": "b", **c, "d": "e"}"#, "<test>").unwrap();
insta::assert_debug_snapshot!(parse_ast);
}
#[test]
fn test_modes() {
let source = "a[0][1][2][3][4]";
assert!(parse(&source, Mode::Expression, "<embedded>").is_ok());
assert!(parse(&source, Mode::Module, "<embedded>").is_ok());
assert!(parse(&source, Mode::Interactive, "<embedded>").is_ok());
}
}

View file

@@ -1,86 +1,154 @@
//! Different token definitions.
//! Loosely based on token.h from CPython source:
//! Token type for Python source code, created by the lexer and consumed by the parser.
//!
//! This module defines the tokens that the lexer recognizes. The tokens are
//! loosely based on the token definitions found in the [CPython source].
//!
//! [CPython source]: https://github.com/python/cpython/blob/dfc2e065a2e71011017077e549cd2f9bf4944c54/Include/internal/pycore_token.h
use num_bigint::BigInt;
use std::fmt;
/// Python source code can be tokenized in a sequence of these tokens.
/// The set of tokens into which Python source code can be tokenized.
#[derive(Clone, Debug, PartialEq)]
pub enum Tok {
/// Token value for a name, commonly known as an identifier.
Name {
/// The name value.
name: String,
},
/// Token value for an integer.
Int {
/// The integer value.
value: BigInt,
},
/// Token value for a floating point number.
Float {
/// The float value.
value: f64,
},
/// Token value for a complex number.
Complex {
/// The real part of the complex number.
real: f64,
/// The imaginary part of the complex number.
imag: f64,
},
/// Token value for a string.
String {
/// The string value.
value: String,
/// The kind of string.
kind: StringKind,
/// Whether the string is triple quoted.
triple_quoted: bool,
},
Newline,
NonLogicalNewline,
Indent,
Dedent,
StartModule,
StartInteractive,
StartExpression,
EndOfFile,
Lpar,
Rpar,
Lsqb,
Rsqb,
Colon,
Comma,
/// Token value for a comment. These are filtered out of the token stream prior to parsing.
Comment(String),
/// Token value for a newline.
Newline,
/// Token value for a newline that is not a logical line break. These are filtered out of
/// the token stream prior to parsing.
NonLogicalNewline,
/// Token value for an indent.
Indent,
/// Token value for a dedent.
Dedent,
EndOfFile,
/// Token value for a left parenthesis `(`.
Lpar,
/// Token value for a right parenthesis `)`.
Rpar,
/// Token value for a left square bracket `[`.
Lsqb,
/// Token value for a right square bracket `]`.
Rsqb,
/// Token value for a colon `:`.
Colon,
/// Token value for a comma `,`.
Comma,
/// Token value for a semicolon `;`.
Semi,
/// Token value for plus `+`.
Plus,
/// Token value for minus `-`.
Minus,
/// Token value for star `*`.
Star,
/// Token value for slash `/`.
Slash,
Vbar, // '|'
Amper, // '&'
/// Token value for vertical bar `|`.
Vbar,
/// Token value for ampersand `&`.
Amper,
/// Token value for less than `<`.
Less,
/// Token value for greater than `>`.
Greater,
/// Token value for equal `=`.
Equal,
/// Token value for dot `.`.
Dot,
/// Token value for percent `%`.
Percent,
/// Token value for left brace `{`.
Lbrace,
/// Token value for right brace `}`.
Rbrace,
/// Token value for double equal `==`.
EqEqual,
/// Token value for not equal `!=`.
NotEqual,
/// Token value for less than or equal `<=`.
LessEqual,
/// Token value for greater than or equal `>=`.
GreaterEqual,
/// Token value for tilde `~`.
Tilde,
/// Token value for caret `^`.
CircumFlex,
/// Token value for left shift `<<`.
LeftShift,
/// Token value for right shift `>>`.
RightShift,
/// Token value for double star `**`.
DoubleStar,
DoubleStarEqual, // '**='
/// Token value for double star equal `**=`.
DoubleStarEqual,
/// Token value for plus equal `+=`.
PlusEqual,
/// Token value for minus equal `-=`.
MinusEqual,
/// Token value for star equal `*=`.
StarEqual,
/// Token value for slash equal `/=`.
SlashEqual,
/// Token value for percent equal `%=`.
PercentEqual,
AmperEqual, // '&='
/// Token value for ampersand equal `&=`.
AmperEqual,
/// Token value for vertical bar equal `|=`.
VbarEqual,
CircumflexEqual, // '^='
/// Token value for caret equal `^=`.
CircumflexEqual,
/// Token value for left shift equal `<<=`.
LeftShiftEqual,
/// Token value for right shift equal `>>=`.
RightShiftEqual,
DoubleSlash, // '//'
/// Token value for double slash `//`.
DoubleSlash,
/// Token value for double slash equal `//=`.
DoubleSlashEqual,
/// Token value for colon equal `:=`.
ColonEqual,
/// Token value for at `@`.
At,
/// Token value for at equal `@=`.
AtEqual,
/// Token value for arrow `->`.
Rarrow,
/// Token value for ellipsis `...`.
Ellipsis,
// Self documenting.
// Keywords (alphabetically):
False,
None,
@@ -118,6 +186,11 @@ pub enum Tok {
While,
With,
Yield,
// RustPython specific.
StartModule,
StartInteractive,
StartExpression,
}
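With the variants documented, here is a small sketch of scanning a statement for particular tokens (assuming the lexer yields `(start, token, end)` triples, as in the parser examples):

```rust
use rustpython_parser::lexer::make_tokenizer;
use rustpython_parser::token::Tok;

// Keep only the token values, dropping the location info.
let tokens: Vec<Tok> = make_tokenizer("x += 1")
    .map(|t| t.unwrap().1)
    .collect();
assert!(tokens.contains(&Tok::PlusEqual));
assert!(tokens.iter().any(|t| matches!(t, Tok::Name { .. })));
```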
impl fmt::Display for Tok {
@@ -231,14 +304,25 @@ impl fmt::Display for Tok {
}
}
/// The kind of string literal as described in the [String and Bytes literals]
/// section of the Python reference.
///
/// [String and Bytes literals]: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
#[derive(PartialEq, Eq, Debug, Clone)]
pub enum StringKind {
/// A normal string literal with no prefix.
String,
/// An f-string literal, with an `f` or `F` prefix.
FString,
/// A byte string literal, with a `b` or `B` prefix.
Bytes,
/// A raw string literal, with an `r` or `R` prefix.
RawString,
/// A raw f-string literal, with a `rf`/`fr` or `rF`/`Fr` or `Rf`/`fR` or `RF`/`FR` prefix.
RawFString,
/// A raw byte string literal, with a `rb`/`br` or `rB`/`Br` or `Rb`/`bR` or `RB`/`BR` prefix.
RawBytes,
/// A unicode string literal, with a `u` or `U` prefix.
Unicode,
}
@@ -286,25 +370,33 @@ impl fmt::Display for StringKind {
}
impl StringKind {
/// Returns true if the string is a raw string, i.e. one of
/// [`StringKind::RawString`] or [`StringKind::RawFString`] or [`StringKind::RawBytes`].
pub fn is_raw(&self) -> bool {
use StringKind::{RawBytes, RawFString, RawString};
matches!(self, RawString | RawFString | RawBytes)
}
/// Returns true if the string is an f-string, i.e. one of
/// [`StringKind::FString`] or [`StringKind::RawFString`].
pub fn is_fstring(&self) -> bool {
use StringKind::{FString, RawFString};
matches!(self, FString | RawFString)
}
/// Returns true if the string is a byte string, i.e. one of
/// [`StringKind::Bytes`] or [`StringKind::RawBytes`].
pub fn is_bytes(&self) -> bool {
use StringKind::{Bytes, RawBytes};
matches!(self, Bytes | RawBytes)
}
/// Returns true if the string is a unicode string, i.e. [`StringKind::Unicode`].
pub fn is_unicode(&self) -> bool {
matches!(self, StringKind::Unicode)
}
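A quick sketch exercising these predicates, following the `matches!` arms above:

```rust
use rustpython_parser::token::StringKind;

assert!(StringKind::RawFString.is_raw());
assert!(StringKind::RawFString.is_fstring());
assert!(!StringKind::RawFString.is_bytes());
assert!(StringKind::Unicode.is_unicode());
```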
/// Returns the number of characters in the prefix.
pub fn prefix_len(&self) -> usize {
use StringKind::*;
match self {