gh-130587: Add hand-written docs for non-OP tokens (GH-130588)

Co-authored-by: Blaise Pabon <blaise@gmail.com>
Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
Petr Viktorin 2025-03-19 16:42:11 +01:00 committed by GitHub
parent b8367e7cf3
commit 4bced29a74
4 changed files with 293 additions and 258 deletions

Doc/library/token-list.inc

@@ -1,230 +1,104 @@
.. Auto-generated by Tools/build/generate_token.py
.. list-table::
   :align: left
   :header-rows: 1

   * - Token
     - Value

   * - .. data:: LPAR
     - ``"("``

   * - .. data:: RPAR
     - ``")"``

   * - .. data:: LSQB
     - ``"["``

   * - .. data:: RSQB
     - ``"]"``

   * - .. data:: COLON
     - ``":"``

   * - .. data:: COMMA
     - ``","``

   * - .. data:: SEMI
     - ``";"``

   * - .. data:: PLUS
     - ``"+"``

   * - .. data:: MINUS
     - ``"-"``

   * - .. data:: STAR
     - ``"*"``

   * - .. data:: SLASH
     - ``"/"``

   * - .. data:: VBAR
     - ``"|"``

   * - .. data:: AMPER
     - ``"&"``

   * - .. data:: LESS
     - ``"<"``

   * - .. data:: GREATER
     - ``">"``

   * - .. data:: EQUAL
     - ``"="``

   * - .. data:: DOT
     - ``"."``

   * - .. data:: PERCENT
     - ``"%"``

   * - .. data:: LBRACE
     - ``"{"``

   * - .. data:: RBRACE
     - ``"}"``

   * - .. data:: EQEQUAL
     - ``"=="``

   * - .. data:: NOTEQUAL
     - ``"!="``

   * - .. data:: LESSEQUAL
     - ``"<="``

   * - .. data:: GREATEREQUAL
     - ``">="``

   * - .. data:: TILDE
     - ``"~"``

   * - .. data:: CIRCUMFLEX
     - ``"^"``

   * - .. data:: LEFTSHIFT
     - ``"<<"``

   * - .. data:: RIGHTSHIFT
     - ``">>"``

   * - .. data:: DOUBLESTAR
     - ``"**"``

   * - .. data:: PLUSEQUAL
     - ``"+="``

   * - .. data:: MINEQUAL
     - ``"-="``

   * - .. data:: STAREQUAL
     - ``"*="``

   * - .. data:: SLASHEQUAL
     - ``"/="``

   * - .. data:: PERCENTEQUAL
     - ``"%="``

   * - .. data:: AMPEREQUAL
     - ``"&="``

   * - .. data:: VBAREQUAL
     - ``"|="``

   * - .. data:: CIRCUMFLEXEQUAL
     - ``"^="``

   * - .. data:: LEFTSHIFTEQUAL
     - ``"<<="``

   * - .. data:: RIGHTSHIFTEQUAL
     - ``">>="``

   * - .. data:: DOUBLESTAREQUAL
     - ``"**="``

   * - .. data:: DOUBLESLASH
     - ``"//"``

   * - .. data:: DOUBLESLASHEQUAL
     - ``"//="``

   * - .. data:: AT
     - ``"@"``

   * - .. data:: ATEQUAL
     - ``"@="``

   * - .. data:: RARROW
     - ``"->"``

   * - .. data:: ELLIPSIS
     - ``"..."``

   * - .. data:: COLONEQUAL
     - ``":="``

   * - .. data:: EXCLAMATION
     - ``"!"``

Doc/library/token.rst

@@ -19,6 +19,10 @@ change between Python versions.
The module also provides a mapping from numeric codes to names and some
functions. The functions mirror definitions in the Python C header files.
Note that a token's value may depend on tokenizer options. For example, a
``"+"`` token may be reported as either :data:`PLUS` or :data:`OP`, or
a ``"match"`` token may be either :data:`NAME` or :data:`SOFT_KEYWORD`.
.. data:: tok_name
@@ -44,25 +48,93 @@ functions. The functions mirror definitions in the Python C header files.
The token constants are:
.. data:: NAME

   Token value that indicates an :ref:`identifier <identifiers>`.
   Note that keywords are also initially tokenized as ``NAME`` tokens.

.. data:: NUMBER

   Token value that indicates a :ref:`numeric literal <numbers>`.

.. data:: STRING

   Token value that indicates a :ref:`string or byte literal <strings>`,
   excluding :ref:`formatted string literals <f-strings>`.
   The token string is not interpreted:
   it includes the surrounding quotation marks and the prefix (if given);
   backslashes are included literally, without processing escape sequences.
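
A short sketch showing the uninterpreted token string, prefix and quotes
included::

   import io
   import tokenize

   src = 's = r"\\d+"\n'
   for tok in tokenize.generate_tokens(io.StringIO(src).readline):
       if tok.type == tokenize.STRING:
           print(tok.string)  # r"\d+" -- prefix, quotes and backslash intact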
.. data:: OP

   A generic token value that indicates an
   :ref:`operator <operators>` or :ref:`delimiter <delimiters>`.

   .. impl-detail::

      This value is only reported by the :mod:`tokenize` module.
      Internally, the tokenizer uses
      :ref:`exact token types <token_operators_delimiters>` instead.

.. data:: COMMENT

   Token value used to indicate a comment.
   The parser ignores :data:`!COMMENT` tokens.

.. data:: NEWLINE

   Token value that indicates the end of a :ref:`logical line <logical-lines>`.

.. data:: NL

   Token value used to indicate a non-terminating newline.
   :data:`!NL` tokens are generated when a logical line of code is continued
   over multiple physical lines. The parser ignores :data:`!NL` tokens.
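
A sketch that shows the difference: a line continued inside parentheses
produces :data:`!NL`, while the end of the statement produces
:data:`NEWLINE`::

   import io
   import token
   import tokenize

   src = "total = (1 +\n         2)\n"
   for tok in tokenize.generate_tokens(io.StringIO(src).readline):
       if tok.type in (token.NEWLINE, token.NL):
           print(token.tok_name[tok.type], tok.start)
   # NL (1, 12)
   # NEWLINE (2, 11)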
.. data:: INDENT

   Token value used at the beginning of a :ref:`logical line <logical-lines>`
   to indicate the start of an :ref:`indented block <indentation>`.

.. data:: DEDENT

   Token value used at the beginning of a :ref:`logical line <logical-lines>`
   to indicate the end of an :ref:`indented block <indentation>`.
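
For instance, a sketch of where these appear in a token stream::

   import io
   import token
   import tokenize

   src = "if x:\n    y = 1\n"
   print([token.tok_name[t.type]
          for t in tokenize.generate_tokens(io.StringIO(src).readline)])
   # ['NAME', 'NAME', 'OP', 'NEWLINE', 'INDENT', 'NAME', 'OP', 'NUMBER',
   #  'NEWLINE', 'DEDENT', 'ENDMARKER']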
.. data:: FSTRING_START

   Token value used to indicate the beginning of an
   :ref:`f-string literal <f-strings>`.

   .. impl-detail::

      The token string includes the prefix and the opening quote(s), but none
      of the contents of the literal.

.. data:: FSTRING_MIDDLE

   Token value used for literal text inside an :ref:`f-string literal <f-strings>`,
   including format specifications.

   .. impl-detail::

      Replacement fields (that is, the non-literal parts of f-strings) use
      the same tokens as other expressions, and are delimited by
      :data:`LBRACE`, :data:`RBRACE`, :data:`EXCLAMATION` and :data:`COLON`
      tokens.

.. data:: FSTRING_END

   Token value used to indicate the end of an :ref:`f-string <f-strings>`.

   .. impl-detail::

      The token string contains the closing quote(s).
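
On Python 3.12 and later, :mod:`tokenize` emits these token types as well;
a rough sketch (the exact split of the literal text may vary)::

   import io
   import token
   import tokenize

   src = 'f"pi is {pi:.2f}!"\n'
   for tok in tokenize.generate_tokens(io.StringIO(src).readline):
       print(token.tok_name[tok.type], repr(tok.string))
   # FSTRING_START, FSTRING_MIDDLE and FSTRING_END bracket the literal text,
   # while the replacement field uses ordinary tokens ({, NAME, :, }).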
.. data:: ENDMARKER

   Token value that indicates the end of input.
   Used in :ref:`top-level grammar rules <top-level>`.
.. data:: ENCODING
@@ -70,14 +142,63 @@ the :mod:`tokenize` module.
   into text. The first token returned by :func:`tokenize.tokenize` will
   always be an ``ENCODING`` token.

   .. impl-detail::

      This token type isn't used by the C tokenizer but is needed for
      the :mod:`tokenize` module.
The following token types are not produced by the :mod:`tokenize` module,
and are defined for special uses in the tokenizer or parser:
.. data:: TYPE_IGNORE

   Token value indicating that a ``type: ignore`` comment was recognized.
   Such tokens are produced instead of regular :data:`COMMENT` tokens only
   with the :data:`~ast.PyCF_TYPE_COMMENTS` flag.

.. data:: TYPE_COMMENT

   Token value indicating that a type comment was recognized.
   Such tokens are produced instead of regular :data:`COMMENT` tokens only
   with the :data:`~ast.PyCF_TYPE_COMMENTS` flag.
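
The flag is most conveniently reached through :func:`ast.parse`; a minimal
sketch::

   import ast

   tree = ast.parse("x = []  # type: list[int]\n", type_comments=True)
   print(tree.body[0].type_comment)  # list[int]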
.. data:: SOFT_KEYWORD

   Token value indicating a :ref:`soft keyword <soft-keywords>`.

   The tokenizer never produces this value.
   To check for a soft keyword, pass a :data:`NAME` token's string to
   :func:`keyword.issoftkeyword`.
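
A sketch of that check::

   import io
   import keyword
   import token
   import tokenize

   src = "match point:\n    case _:\n        pass\n"
   for tok in tokenize.generate_tokens(io.StringIO(src).readline):
       if tok.type == token.NAME and keyword.issoftkeyword(tok.string):
           print("soft keyword:", tok.string)
   # soft keyword: match
   # soft keyword: case
   # soft keyword: _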
.. data:: ERRORTOKEN

   Token value used to indicate wrong input.

   The :mod:`tokenize` module generally indicates errors by
   raising exceptions instead of emitting this token.
   It can also emit tokens such as :data:`OP` or :data:`NAME` with strings that
   are later rejected by the parser.
.. _token_operators_delimiters:
The remaining tokens represent specific :ref:`operators <operators>` and
:ref:`delimiters <delimiters>`.
(The :mod:`tokenize` module reports these as :data:`OP`; see ``exact_type``
in the :mod:`tokenize` documentation for details.)
.. include:: token-list.inc
The following non-token constants are provided:
.. data:: N_TOKENS

   The number of token types defined in this module.
.. NT_OFFSET is deliberately undocumented; if you need it you should be
reading the source
.. data:: EXACT_TOKEN_TYPES
@@ -102,6 +223,9 @@ the :mod:`tokenize` module.
   to support parsing older Python versions for :func:`ast.parse` with
   ``feature_version`` set to 6 or lower).

.. versionchanged:: 3.12
   Added :data:`EXCLAMATION`.

.. versionchanged:: 3.13
   Removed :data:`!AWAIT` and :data:`!ASYNC` tokens again.

Doc/reference/toplevel_components.rst

@@ -69,7 +69,7 @@ All input read from non-interactive files has the same form:

.. grammar-snippet::
   :group: python-grammar

   file_input: (NEWLINE | `statement`)* ENDMARKER
This syntax is used in the following situations:
@@ -90,7 +90,7 @@ Input in interactive mode is parsed using the following grammar:

.. grammar-snippet::
   :group: python-grammar

   interactive_input: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE | ENDMARKER
Note that a (top-level) compound statement must be followed by a blank line in
interactive mode; this is needed to help the parser detect the end of the input.
@@ -107,5 +107,7 @@ Expression input
:func:`eval` is used for expression input. It ignores leading whitespace. The
string argument to :func:`eval` must have the following form:
.. grammar-snippet::
   :group: python-grammar

   eval_input: `expression_list` NEWLINE* ENDMARKER
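
For example, leading whitespace and trailing newlines are both accepted::

   print(eval("  1 + 2\n\n"))  # 3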

Tools/build/generate_token.py

@@ -1,10 +1,17 @@
#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
# make_rst:
#     Doc/library/token-list.inc
#     Doc/library/token.rst      (checked, not generated)
# make_h:
#     Include/token.h
# make_c:
#     Parser/token.c
# make_py:
#     Lib/token.py
import re
SCRIPT_NAME = 'Tools/build/generate_token.py'
@@ -199,23 +206,51 @@ def make_c(infile, outfile='Parser/token.c'):
token_inc_template = f"""\
.. {AUTO_GENERATED_BY_SCRIPT}

.. list-table::
   :align: left
   :header-rows: 1

   * - Token
     - Value
%s
"""
def make_rst(infile, outfile='Doc/library/token-list.inc',
             rstfile='Doc/library/token.rst'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    tok_to_string = {value: s for s, value in string_to_tok.items()}

    # Tokens with a fixed string (operators and delimiters) go into the
    # generated table; all others must be documented by hand in token.rst.
    needs_handwritten_doc = set()
    names = []
    for value, name in enumerate(tok_names):
        if value in tok_to_string:
            names.append('')
            assert name.isupper()
            names.append(f'   * - .. data:: {name}')
            names.append(f'     - ``"{tok_to_string[value]}"``')
        else:
            needs_handwritten_doc.add(name)

    # Collect the tokens that token.rst actually documents.
    has_handwritten_doc = set()
    with open(rstfile) as fileobj:
        tokendef_re = re.compile(r'.. data:: ([0-9A-Z_]+)\s*')
        for line in fileobj:
            if match := tokendef_re.fullmatch(line):
                has_handwritten_doc.add(match[1])

    # Exclude non-token constants in token.py
    has_handwritten_doc -= {'N_TOKENS', 'NT_OFFSET', 'EXACT_TOKEN_TYPES'}

    if needs_handwritten_doc != has_handwritten_doc:
        message_parts = [f'ERROR: {rstfile} does not document all tokens!']
        undocumented = needs_handwritten_doc - has_handwritten_doc
        extra = has_handwritten_doc - needs_handwritten_doc
        if undocumented:
            message_parts.append(f'Undocumented tokens: {undocumented}')
        if extra:
            message_parts.append(f'Documented nonexistent tokens: {extra}')
        exit('\n'.join(message_parts))

    if update_file(outfile, token_inc_template % '\n'.join(names)):
        print("%s regenerated from %s" % (outfile, infile))