gh-134675: Add t-string prefixes to tokenizer module, lexical analysis doc, and add a test to make sure we catch this error in the future. (#134734)

* Add t-string prefixes to _all_string_prefixes, and add a test to make sure we catch this error in the future.

* Update lexical analysis docs for t-string prefixes.
This commit is contained in:
Eric V. Smith 2025-05-26 13:49:39 -04:00 committed by GitHub
parent c60f39ada6
commit 08c78e02fa
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 59 additions and 2 deletions

View file

@@ -489,8 +489,9 @@ String literals are described by the following lexical definitions:
.. productionlist:: python-grammar
stringliteral: [`stringprefix`](`shortstring` | `longstring`)
stringprefix: "r" | "u" | "R" | "U" | "f" | "F"
stringprefix: "r" | "u" | "R" | "U" | "f" | "F" | "t" | "T"
: | "fr" | "Fr" | "fR" | "FR" | "rf" | "rF" | "Rf" | "RF"
: | "tr" | "Tr" | "tR" | "TR" | "rt" | "rT" | "Rt" | "RT"
shortstring: "'" `shortstringitem`* "'" | '"' `shortstringitem`* '"'
longstring: "'''" `longstringitem`* "'''" | '"""' `longstringitem`* '"""'
shortstringitem: `shortstringchar` | `stringescapeseq`

View file

@@ -1,6 +1,8 @@
import contextlib
import itertools
import os
import re
import string
import tempfile
import token
import tokenize
@@ -3238,5 +3240,59 @@ class CommandLineTest(unittest.TestCase):
self.check_output(source, expect, flag)
class StringPrefixTest(unittest.TestCase):
    """Check that tokenize.StringPrefix agrees with the parser.

    tokenize.StringPrefix is a hand-maintained regex group of the valid
    string-literal prefixes; if a new prefix (e.g. t-strings) is added to
    the grammar but not to tokenize, this test catches the mismatch.
    """

    def test_prefixes(self):
        # Get the list of defined string prefixes.  There is no obvious
        # documented way of doing this, so the best available approach is
        # to split apart tokenize.StringPrefix, which has the shape
        # "(p1|p2|...)".
        self.assertEqual(tokenize.StringPrefix[0], '(')
        self.assertEqual(tokenize.StringPrefix[-1], ')')
        defined_prefixes = set(tokenize.StringPrefix[1:-1].split('|'))

        # Now compute the actual string prefixes, by eval-ing every
        # candidate prefix followed by an empty string: the candidate is a
        # valid prefix iff that compiles.
        #
        # Try all prefix lengths until we find a length that has zero
        # valid prefixes.  This would miss the case where, for example,
        # there are no valid 3 character prefixes but there are valid
        # 4 character prefixes.  That seems extremely unlikely.
        #
        # Note that the empty prefix is included, because length starts
        # at 0.  That's expected, since StringPrefix includes the empty
        # prefix.
        valid_prefixes = set()
        for length in itertools.count():
            num_at_this_length = 0
            # itertools.permutations(iterable, r) yields every ordering of
            # `length` distinct lowercase letters, which is exactly the
            # combinations-of-letters expanded through all of their
            # orderings; then expand each letter to both cases.
            for letters in itertools.permutations(string.ascii_lowercase, length):
                for cased in itertools.product(*[(c, c.upper()) for c in letters]):
                    p = ''.join(cased)
                    if p == "not":
                        # 'not' can never be a string prefix,
                        # because it's a valid expression: not ""
                        continue
                    try:
                        eval(f'{p}""')
                    except SyntaxError:
                        continue
                    # No syntax error, so p is a valid string prefix.
                    valid_prefixes.add(p)
                    num_at_this_length += 1
            if num_at_this_length == 0:
                break
        self.assertEqual(defined_prefixes, valid_prefixes)
if __name__ == "__main__":
unittest.main()

View file

@@ -86,7 +86,7 @@ def _all_string_prefixes():
# The valid string prefixes. Only contain the lower case versions,
# and don't contain any permutations (include 'fr', but not
# 'rf'). The various permutations will be generated.
_valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
_valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr']
# if we add binary f-strings, add: ['fb', 'fbr']
result = {''}
for prefix in _valid_string_prefixes: