Mirror of https://github.com/python/cpython.git
[3.12] gh-105549: Tokenize separately NUMBER and NAME tokens and allow 0-prefixed literals (GH-105555) (#105602)
Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
parent 411366ccdb
commit ae6e002f5a
3 changed files with 45 additions and 3 deletions
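A quick sketch of what this change means for users of the pure-Python tokenize module (illustrative only, not part of the commit): source that cannot compile but can still be lexed, such as 2sin(x), now yields separate NUMBER and NAME tokens instead of failing.

import io
import tokenize

# "2sin(x)" is not valid Python, but it is lexable: the tokenizer now
# emits NUMBER '2' followed by NAME 'sin' rather than erroring out.
for tok in tokenize.generate_tokens(io.StringIO("2sin(x)").readline):
    print(tok)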
Lib/test/test_tokenize.py

@@ -284,7 +284,12 @@ def k(x):
                 # this won't work with compound complex inputs
                 continue
             self.assertEqual(number_token(lit), lit)
+        # Valid cases with extra underscores in the tokenize module
+        # See gh-105549 for context
+        extra_valid_cases = {"0_7", "09_99"}
         for lit in INVALID_UNDERSCORE_LITERALS:
+            if lit in extra_valid_cases:
+                continue
             try:
                 number_token(lit)
             except TokenError:
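In practice the extra_valid_cases carve-out means literals like 0_7 are now lexable even though they remain invalid Python. A minimal sketch (illustrative, not part of the commit):

import io
import tokenize

# tokenize no longer raises on "0_7"; it comes back as one NUMBER token.
toks = list(tokenize.generate_tokens(io.StringIO("0_7").readline))
print(toks[0])

# compile() still rejects the same literal.
try:
    compile("0_7", "<demo>", "eval")
except SyntaxError as exc:
    print(exc)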
@@ -1873,6 +1878,34 @@ class TestRoundtrip(TestCase):
         self.check_roundtrip(code)
 
 
+class InvalidPythonTests(TestCase):
+    def test_number_followed_by_name(self):
+        # See issue #gh-105549
+        source = "2sin(x)"
+        expected_tokens = [
+            TokenInfo(type=token.NUMBER, string='2', start=(1, 0), end=(1, 1), line='2sin(x)'),
+            TokenInfo(type=token.NAME, string='sin', start=(1, 1), end=(1, 4), line='2sin(x)'),
+            TokenInfo(type=token.OP, string='(', start=(1, 4), end=(1, 5), line='2sin(x)'),
+            TokenInfo(type=token.NAME, string='x', start=(1, 5), end=(1, 6), line='2sin(x)'),
+            TokenInfo(type=token.OP, string=')', start=(1, 6), end=(1, 7), line='2sin(x)'),
+            TokenInfo(type=token.NEWLINE, string='', start=(1, 7), end=(1, 8), line='2sin(x)'),
+            TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+
+        tokens = list(generate_tokens(StringIO(source).readline))
+        self.assertEqual(tokens, expected_tokens)
+
+    def test_number_starting_with_zero(self):
+        source = "01234"
+        expected_tokens = [
+            TokenInfo(type=token.NUMBER, string='01234', start=(1, 0), end=(1, 5), line='01234'),
+            TokenInfo(type=token.NEWLINE, string='', start=(1, 5), end=(1, 6), line='01234'),
+            TokenInfo(type=token.ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+
+        tokens = list(generate_tokens(StringIO(source).readline))
+        self.assertEqual(tokens, expected_tokens)
+
 class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
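The test_number_starting_with_zero case can also be checked interactively; a minimal sketch (illustrative, not part of the commit) showing that a 0-prefixed literal now lexes as a single NUMBER token and survives an untokenize round-trip:

import io
import tokenize

# "01234" was rejected by the new C-based tokenizer in early 3.12 betas;
# with this change it lexes as a single NUMBER token.
toks = list(tokenize.generate_tokens(io.StringIO("01234").readline))
print(toks[0])   # TokenInfo(type=NUMBER, string='01234', ...)
print(tokenize.untokenize(toks))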
Misc/NEWS.d entry (new file)

@@ -0,0 +1,2 @@
+Tokenize separately `NUMBER` and `NAME` tokens that are not ambiguous. Patch
+by Pablo Galindo
Parser/tokenizer.c

@@ -1600,8 +1600,12 @@ lookahead(struct tok_state *tok, const char *test)
 }
 
 static int
-verify_end_of_number(struct tok_state *tok, int c, const char *kind)
-{
+verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
+    if (tok->tok_extra_tokens) {
+        // When we are parsing extra tokens, we don't want to emit warnings
+        // about invalid literals, because we want to be a bit more liberal.
+        return 1;
+    }
     /* Emit a deprecation warning only if the numeric literal is immediately
      * followed by one of keywords which can occur after a numeric literal
      * in valid code: "and", "else", "for", "if", "in", "is" and "or".
@@ -1659,6 +1663,9 @@ verify_end_of_number(struct tok_state *tok, int c, const char *kind)
 static int
 verify_identifier(struct tok_state *tok)
 {
+    if (tok->tok_extra_tokens) {
+        return 1;
+    }
     PyObject *s;
     if (tok->decoding_erred)
         return 0;
@@ -2318,7 +2325,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
         else if (c == 'j' || c == 'J') {
             goto imaginary;
         }
-        else if (nonzero) {
+        else if (nonzero && !tok->tok_extra_tokens) {
             /* Old-style octal: now disallowed. */
             tok_backup(tok, c);
             return MAKE_TOKEN(syntaxerror_known_range(
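The !tok->tok_extra_tokens guard is what makes this permissive only for the tokenize module: regular compilation still takes the old-style-octal error path. A minimal sketch of the resulting split behavior (illustrative, not part of the commit):

import io
import tokenize

# Extra-tokens mode (used by the tokenize module) skips the old-style
# octal check, so "0777" lexes as a single NUMBER token.
print(next(tokenize.generate_tokens(io.StringIO("0777").readline)))

# The compiler still rejects it via the syntaxerror_known_range() path.
try:
    compile("0777", "<demo>", "eval")
except SyntaxError as exc:
    print(exc)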