gh-130587: Add hand-written docs for non-OP tokens (GH-130588)

Co-authored-by: Blaise Pabon <blaise@gmail.com>
Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
Petr Viktorin 2025-03-19 16:42:11 +01:00 committed by GitHub
parent b8367e7cf3
commit 4bced29a74
4 changed files with 293 additions and 258 deletions

Doc/library/token-list.inc

@@ -1,230 +1,104 @@
.. Auto-generated by Tools/build/generate_token.py
.. list-table::
   :align: left
   :header-rows: 1

   * - Token
     - Value

   * - .. data:: LPAR
     - ``"("``

   * - .. data:: RPAR
     - ``")"``

   * - .. data:: LSQB
     - ``"["``

   * - .. data:: RSQB
     - ``"]"``

   * - .. data:: COLON
     - ``":"``

   * - .. data:: COMMA
     - ``","``

   * - .. data:: SEMI
     - ``";"``

   * - .. data:: PLUS
     - ``"+"``

   * - .. data:: MINUS
     - ``"-"``

   * - .. data:: STAR
     - ``"*"``

   * - .. data:: SLASH
     - ``"/"``

   * - .. data:: VBAR
     - ``"|"``

   * - .. data:: AMPER
     - ``"&"``

   * - .. data:: LESS
     - ``"<"``

   * - .. data:: GREATER
     - ``">"``

   * - .. data:: EQUAL
     - ``"="``

   * - .. data:: DOT
     - ``"."``

   * - .. data:: PERCENT
     - ``"%"``

   * - .. data:: LBRACE
     - ``"{"``

   * - .. data:: RBRACE
     - ``"}"``

   * - .. data:: EQEQUAL
     - ``"=="``

   * - .. data:: NOTEQUAL
     - ``"!="``

   * - .. data:: LESSEQUAL
     - ``"<="``

   * - .. data:: GREATEREQUAL
     - ``">="``

   * - .. data:: TILDE
     - ``"~"``

   * - .. data:: CIRCUMFLEX
     - ``"^"``

   * - .. data:: LEFTSHIFT
     - ``"<<"``

   * - .. data:: RIGHTSHIFT
     - ``">>"``

   * - .. data:: DOUBLESTAR
     - ``"**"``

   * - .. data:: PLUSEQUAL
     - ``"+="``

   * - .. data:: MINEQUAL
     - ``"-="``

   * - .. data:: STAREQUAL
     - ``"*="``

   * - .. data:: SLASHEQUAL
     - ``"/="``

   * - .. data:: PERCENTEQUAL
     - ``"%="``

   * - .. data:: AMPEREQUAL
     - ``"&="``

   * - .. data:: VBAREQUAL
     - ``"|="``

   * - .. data:: CIRCUMFLEXEQUAL
     - ``"^="``

   * - .. data:: LEFTSHIFTEQUAL
     - ``"<<="``

   * - .. data:: RIGHTSHIFTEQUAL
     - ``">>="``

   * - .. data:: DOUBLESTAREQUAL
     - ``"**="``

   * - .. data:: DOUBLESLASH
     - ``"//"``

   * - .. data:: DOUBLESLASHEQUAL
     - ``"//="``

   * - .. data:: AT
     - ``"@"``

   * - .. data:: ATEQUAL
     - ``"@="``

   * - .. data:: RARROW
     - ``"->"``

   * - .. data:: ELLIPSIS
     - ``"..."``

   * - .. data:: COLONEQUAL
     - ``":="``

   * - .. data:: EXCLAMATION
     - ``"!"``

Doc/library/token.rst

@@ -19,6 +19,10 @@ change between Python versions.
The module also provides a mapping from numeric codes to names and some
functions. The functions mirror definitions in the Python C header files.
Note that a token's value may depend on tokenizer options. For example, a
``"+"`` token may be reported as either :data:`PLUS` or :data:`OP`, or
a ``"match"`` token may be either :data:`NAME` or :data:`SOFT_KEYWORD`.
.. data:: tok_name
@@ -44,25 +48,93 @@ functions. The functions mirror definitions in the Python C header files.
The token constants are:
.. data:: NAME

   Token value that indicates an :ref:`identifier <identifiers>`.
   Note that keywords are also initially tokenized as ``NAME`` tokens.

.. data:: NUMBER

   Token value that indicates a :ref:`numeric literal <numbers>`.

.. data:: STRING

   Token value that indicates a :ref:`string or byte literal <strings>`,
   excluding :ref:`formatted string literals <f-strings>`.
   The token string is not interpreted:
   it includes the surrounding quotation marks and the prefix (if given);
   backslashes are included literally, without processing escape sequences.
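
A short sketch showing the uninterpreted token string, prefix and quotes
included::

   import io
   import tokenize

   src = 's = r"\\d+"\n'
   for tok in tokenize.generate_tokens(io.StringIO(src).readline):
       if tok.type == tokenize.STRING:
           print(tok.string)  # r"\d+" -- prefix, quotes and backslash intact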
.. data:: OP

   A generic token value that indicates an
   :ref:`operator <operators>` or :ref:`delimiter <delimiters>`.

   .. impl-detail::

      This value is only reported by the :mod:`tokenize` module.
      Internally, the tokenizer uses
      :ref:`exact token types <token_operators_delimiters>` instead.

.. data:: COMMENT

   Token value used to indicate a comment.
   The parser ignores :data:`!COMMENT` tokens.

.. data:: NEWLINE

   Token value that indicates the end of a :ref:`logical line <logical-lines>`.

.. data:: NL

   Token value used to indicate a non-terminating newline.
   :data:`!NL` tokens are generated when a logical line of code is continued
   over multiple physical lines. The parser ignores :data:`!NL` tokens.
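
A sketch that shows the difference: a line continued inside parentheses
produces :data:`!NL`, while the end of the statement produces
:data:`NEWLINE`::

   import io
   import token
   import tokenize

   src = "total = (1 +\n         2)\n"
   for tok in tokenize.generate_tokens(io.StringIO(src).readline):
       if tok.type in (token.NEWLINE, token.NL):
           print(token.tok_name[tok.type], tok.start)
   # NL (1, 12)
   # NEWLINE (2, 11)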
.. data:: INDENT

   Token value used at the beginning of a :ref:`logical line <logical-lines>`
   to indicate the start of an :ref:`indented block <indentation>`.

.. data:: DEDENT

   Token value used at the beginning of a :ref:`logical line <logical-lines>`
   to indicate the end of an :ref:`indented block <indentation>`.
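
For instance, a sketch of where these appear in a token stream::

   import io
   import token
   import tokenize

   src = "if x:\n    y = 1\n"
   print([token.tok_name[t.type]
          for t in tokenize.generate_tokens(io.StringIO(src).readline)])
   # ['NAME', 'NAME', 'OP', 'NEWLINE', 'INDENT', 'NAME', 'OP', 'NUMBER',
   #  'NEWLINE', 'DEDENT', 'ENDMARKER']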
.. data:: FSTRING_START

   Token value used to indicate the beginning of an
   :ref:`f-string literal <f-strings>`.

   .. impl-detail::

      The token string includes the prefix and the opening quote(s), but none
      of the contents of the literal.

.. data:: FSTRING_MIDDLE

   Token value used for literal text inside an :ref:`f-string literal <f-strings>`,
   including format specifications.

   .. impl-detail::

      Replacement fields (that is, the non-literal parts of f-strings) use
      the same tokens as other expressions, and are delimited by
      :data:`LBRACE`, :data:`RBRACE`, :data:`EXCLAMATION` and :data:`COLON`
      tokens.

.. data:: FSTRING_END

   Token value used to indicate the end of an :ref:`f-string <f-strings>`.

   .. impl-detail::

      The token string contains the closing quote(s).
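
On Python 3.12 and later, :mod:`tokenize` emits these token types as well;
a rough sketch (the exact split of the literal text may vary)::

   import io
   import token
   import tokenize

   src = 'f"pi is {pi:.2f}!"\n'
   for tok in tokenize.generate_tokens(io.StringIO(src).readline):
       print(token.tok_name[tok.type], repr(tok.string))
   # FSTRING_START, FSTRING_MIDDLE and FSTRING_END bracket the literal text,
   # while the replacement field uses ordinary tokens ({, NAME, :, }).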
.. data:: ENDMARKER

   Token value that indicates the end of input.
   Used in :ref:`top-level grammar rules <top-level>`.
.. data:: ENCODING
@@ -70,14 +142,63 @@ the :mod:`tokenize` module.
   into text. The first token returned by :func:`tokenize.tokenize` will
   always be an ``ENCODING`` token.

   .. impl-detail::

      This token type isn't used by the C tokenizer but is needed for
      the :mod:`tokenize` module.
The following token types are not produced by the :mod:`tokenize` module,
and are defined for special uses in the tokenizer or parser:
.. data:: TYPE_IGNORE

   Token value indicating that a ``type: ignore`` comment was recognized.
   Such tokens are produced instead of regular :data:`COMMENT` tokens only
   with the :data:`~ast.PyCF_TYPE_COMMENTS` flag.

.. data:: TYPE_COMMENT

   Token value indicating that a type comment was recognized.
   Such tokens are produced instead of regular :data:`COMMENT` tokens only
   with the :data:`~ast.PyCF_TYPE_COMMENTS` flag.
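
The flag is most conveniently reached through :func:`ast.parse`; a minimal
sketch::

   import ast

   tree = ast.parse("x = []  # type: list[int]\n", type_comments=True)
   print(tree.body[0].type_comment)  # list[int]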
.. data:: SOFT_KEYWORD

   Token value indicating a :ref:`soft keyword <soft-keywords>`.

   The tokenizer never produces this value.
   To check for a soft keyword, pass a :data:`NAME` token's string to
   :func:`keyword.issoftkeyword`.
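
A sketch of that check::

   import io
   import keyword
   import token
   import tokenize

   src = "match point:\n    case _:\n        pass\n"
   for tok in tokenize.generate_tokens(io.StringIO(src).readline):
       if tok.type == token.NAME and keyword.issoftkeyword(tok.string):
           print("soft keyword:", tok.string)
   # soft keyword: match
   # soft keyword: case
   # soft keyword: _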
.. data:: ERRORTOKEN

   Token value used to indicate wrong input.

   The :mod:`tokenize` module generally indicates errors by
   raising exceptions instead of emitting this token.
   It can also emit tokens such as :data:`OP` or :data:`NAME` with strings that
   are later rejected by the parser.
.. _token_operators_delimiters:
The remaining tokens represent specific :ref:`operators <operators>` and
:ref:`delimiters <delimiters>`.
(The :mod:`tokenize` module reports these as :data:`OP`; see ``exact_type``
in the :mod:`tokenize` documentation for details.)
.. include:: token-list.inc
The following non-token constants are provided:
.. data:: N_TOKENS

   The number of token types defined in this module.
.. NT_OFFSET is deliberately undocumented; if you need it you should be
reading the source
.. data:: EXACT_TOKEN_TYPES
@@ -102,6 +223,9 @@ the :mod:`tokenize` module.
   to support parsing older Python versions for :func:`ast.parse` with
   ``feature_version`` set to 6 or lower).

.. versionchanged:: 3.12
   Added :data:`EXCLAMATION`.

.. versionchanged:: 3.13
   Removed :data:`!AWAIT` and :data:`!ASYNC` tokens again.

Doc/reference/toplevel_components.rst

@@ -69,7 +69,7 @@ All input read from non-interactive files has the same form:

.. grammar-snippet::
   :group: python-grammar

   file_input: (NEWLINE | `statement`)* ENDMARKER
This syntax is used in the following situations:
@@ -90,7 +90,7 @@ Input in interactive mode is parsed using the following grammar:

.. grammar-snippet::
   :group: python-grammar

   interactive_input: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE | ENDMARKER
Note that a (top-level) compound statement must be followed by a blank line in
interactive mode; this is needed to help the parser detect the end of the input.
@@ -107,5 +107,7 @@ Expression input
:func:`eval` is used for expression input. It ignores leading whitespace. The
string argument to :func:`eval` must have the following form:
.. grammar-snippet::
   :group: python-grammar

   eval_input: `expression_list` NEWLINE* ENDMARKER
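
For example, leading whitespace and trailing newlines are both accepted::

   print(eval("  1 + 2\n\n"))  # 3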

Tools/build/generate_token.py

@@ -1,10 +1,17 @@
#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
# make_rst:
#     Doc/library/token-list.inc
#     Doc/library/token.rst      (checked, not generated)
# make_h:
#     Include/token.h
# make_c:
#     Parser/token.c
# make_py:
#     Lib/token.py
import re
SCRIPT_NAME = 'Tools/build/generate_token.py'
@@ -199,23 +206,51 @@ def make_c(infile, outfile='Parser/token.c'):
token_inc_template = f"""\
.. {AUTO_GENERATED_BY_SCRIPT}

.. list-table::
   :align: left
   :header-rows: 1

   * - Token
     - Value
%s
"""
def make_rst(infile, outfile='Doc/library/token-list.inc',
             rstfile='Doc/library/token.rst'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    tok_to_string = {value: s for s, value in string_to_tok.items()}

    # Tokens with a fixed string (operators and delimiters) go into the
    # generated table; all others must be documented by hand in token.rst.
    needs_handwritten_doc = set()
    names = []
    for value, name in enumerate(tok_names):
        if value in tok_to_string:
            names.append('')
            assert name.isupper()
            names.append(f'   * - .. data:: {name}')
            names.append(f'     - ``"{tok_to_string[value]}"``')
        else:
            needs_handwritten_doc.add(name)

    # Collect the tokens that token.rst actually documents.
    has_handwritten_doc = set()
    with open(rstfile) as fileobj:
        tokendef_re = re.compile(r'.. data:: ([0-9A-Z_]+)\s*')
        for line in fileobj:
            if match := tokendef_re.fullmatch(line):
                has_handwritten_doc.add(match[1])

    # Exclude non-token constants in token.py
    has_handwritten_doc -= {'N_TOKENS', 'NT_OFFSET', 'EXACT_TOKEN_TYPES'}

    if needs_handwritten_doc != has_handwritten_doc:
        message_parts = [f'ERROR: {rstfile} does not document all tokens!']
        undocumented = needs_handwritten_doc - has_handwritten_doc
        extra = has_handwritten_doc - needs_handwritten_doc
        if undocumented:
            message_parts.append(f'Undocumented tokens: {undocumented}')
        if extra:
            message_parts.append(f'Documented nonexistent tokens: {extra}')
        exit('\n'.join(message_parts))

    if update_file(outfile, token_inc_template % '\n'.join(names)):
        print("%s regenerated from %s" % (outfile, infile))