gh-134675: Add t-string prefixes to tokenizer module, lexical analysis doc, and add a test to make sure we catch this error in the future. (#134734)

* Add t-string prefixes to _all_string_prefixes, and add a test to make sure we catch this error in the future.
* Update lexical analysis docs for t-string prefixes.
2025-07-07 19:35:27 +00:00 · 2025-05-26 13:49:39 -04:00 · 2025-05-26 13:49:39 -04:00 · 08c78e02fa
commit 08c78e02fa
parent c60f39ada6
3 changed files with 59 additions and 2 deletions
--- a/Doc/reference/lexical_analysis.rst
+++ b/Doc/reference/lexical_analysis.rst
@@ -489,8 +489,9 @@ String literals are described by the following lexical definitions:
 .. productionlist:: python-grammar
   stringliteral: [`stringprefix`](`shortstring` | `longstring`)
-   stringprefix: "r" | "u" | "R" | "U" | "f" | "F"
+   stringprefix: "r" | "u" | "R" | "U" | "f" | "F" | "t" | "T"
               : | "fr" | "Fr" | "fR" | "FR" | "rf" | "rF" | "Rf" | "RF"
               : | "tr" | "Tr" | "tR" | "TR" | "rt" | "rT" | "Rt" | "RT"
   shortstring: "'" `shortstringitem`* "'" | '"' `shortstringitem`* '"'
   longstring: "'''" `longstringitem`* "'''" | '"""' `longstringitem`* '"""'
   shortstringitem: `shortstringchar` | `stringescapeseq`
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1,6 +1,8 @@
 import contextlib
 import itertools
 import os
 import re
 import string
 import tempfile
 import token
 import tokenize
@@ -3238,5 +3240,59 @@ class CommandLineTest(unittest.TestCase):
            self.check_output(source, expect, flag)
 class StringPrefixTest(unittest.TestCase):
    """Check tokenize.StringPrefix against the prefixes eval() accepts."""

    @staticmethod
    def _compiler_accepted_prefixes(alphabet=string.ascii_lowercase):
        """Return the set of string prefixes that eval() accepts.

        Every combination of distinct letters from *alphabet* is tried
        in every order and every upper/lower-case spelling, followed by
        an empty string literal.  A candidate counts as a valid prefix
        when evaluating it does not raise SyntaxError.

        Prefix lengths are tried from 0 upwards until a length yields
        no valid prefix.  This will miss the case where for example
        there are no valid 3 character prefixes, but there are valid
        4 character prefixes.  That seems extremely unlikely.

        The empty prefix is always part of the result, because the
        search starts at length 0.
        """
        valid_prefixes = set()
        for length in itertools.count():
            found_at_this_length = False
            for letters in itertools.combinations(alphabet, length):
                for ordering in itertools.permutations(letters):
                    for cased in itertools.product(
                        *[(c, c.upper()) for c in ordering]
                    ):
                        prefix = ''.join(cased)
                        if prefix == "not":
                            # 'not' can never be a string prefix,
                            # because it's a valid expression: not ""
                            continue
                        try:
                            eval(f'{prefix}""')
                        except SyntaxError:
                            continue
                        # No syntax error, so this is a valid string
                        # prefix.
                        valid_prefixes.add(prefix)
                        found_at_this_length = True
            if not found_at_this_length:
                break
        return valid_prefixes

    def test_prefixes(self):
        """tokenize.StringPrefix lists exactly the prefixes eval() accepts."""
        # Get the list of defined string prefixes.  I don't see an
        # obvious documented way of doing this, but probably the best
        # thing is to split apart tokenize.StringPrefix.
        # Make sure StringPrefix begins and ends in parens.
        self.assertEqual(tokenize.StringPrefix[0], '(')
        self.assertEqual(tokenize.StringPrefix[-1], ')')
        # Then split apart everything else by '|'.
        defined_prefixes = set(tokenize.StringPrefix[1:-1].split('|'))
        # Now compute the actual string prefixes by evaluating every
        # candidate.  The empty prefix is included on both sides,
        # since StringPrefix includes the empty prefix.
        valid_prefixes = self._compiler_accepted_prefixes()
        self.assertEqual(defined_prefixes, valid_prefixes)
 # Run this module's tests when the file is executed directly as a script.
 if __name__ == "__main__":
    unittest.main()
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -86,7 +86,7 @@ def _all_string_prefixes():
    # The valid string prefixes. Only contain the lower case versions,
    #  and don't contain any permutations (include 'fr', but not
    #  'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
+    _valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes: