Mirror of https://github.com/python/cpython.git (synced 2025-08-22 17:55:18 +00:00)

gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (#105070)

parent 2ea34cfb3a
commit 9216e69a87

7 changed files with 276 additions and 98 deletions
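The change below reworks the C tokenizer entry points so that, instead of being handed the whole source as a single string, they pull input line by line from a readline-like callable (the model that tokenize() and generate_tokens() have always exposed at the Python level). A minimal sketch of the resulting interface, based on the Lib/tokenize.py and test changes in this diff; io.StringIO/io.BytesIO here merely stand in for any object with a readline method:

    import io
    from tokenize import generate_tokens, tokenize, _generate_tokens_from_c_tokenizer

    source = "def f(x):\n    return x + 1\n"

    # Public entry points are unchanged: str lines for generate_tokens(),
    # bytes lines (with encoding detection) for tokenize().
    list(generate_tokens(io.StringIO(source).readline))
    list(tokenize(io.BytesIO(source.encode()).readline))

    # The private helper now also takes a readline callable instead of the
    # full source string, plus optional encoding= and extra_tokens= keywords.
    list(_generate_tokens_from_c_tokenizer(io.StringIO(source).readline))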
@@ -2203,7 +2203,7 @@ def _signature_strip_non_python_syntax(signature):
         add(string)
         if (string == ','):
             add(' ')
-    clean_signature = ''.join(text).strip()
+    clean_signature = ''.join(text).strip().replace("\n", "")
     return clean_signature, self_parameter


@@ -1,6 +1,6 @@
 from test import support
 from test.support import os_helper
-from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
+from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
@@ -51,6 +51,25 @@ class TokenizeTest(TestCase):
                          [" ENCODING 'utf-8' (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())

+    def test_invalid_readline(self):
+        def gen():
+            yield "sdfosdg"
+            yield "sdfosdg"
+        with self.assertRaises(TypeError):
+            list(tokenize(gen().__next__))
+
+        def gen():
+            yield b"sdfosdg"
+            yield b"sdfosdg"
+        with self.assertRaises(TypeError):
+            list(generate_tokens(gen().__next__))
+
+        def gen():
+            yield "sdfosdg"
+            1/0
+        with self.assertRaises(ZeroDivisionError):
+            list(generate_tokens(gen().__next__))
+
     def test_implicit_newline(self):
         # Make sure that the tokenizer puts in an implicit NEWLINE
         # when the input lacks a trailing new line.
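The new test_invalid_readline test pins down the type contract: the callable passed to tokenize() must return bytes, the one passed to generate_tokens() must return str, and any exception raised inside the callable propagates to the caller. A small sketch of conforming generator-based callables (the variable names are illustrative only):

    from tokenize import tokenize, generate_tokens

    def byte_lines():
        yield b"x = 1\n"   # tokenize() expects bytes from its readline callable
        yield b""          # an empty line (or StopIteration) ends the input

    def str_lines():
        yield "x = 1\n"    # generate_tokens() expects str
        yield ""

    print(list(tokenize(byte_lines().__next__))[1])        # NAME token for 'x'
    print(list(generate_tokens(str_lines().__next__))[0])  # NAME token for 'x'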
@@ -1154,7 +1173,8 @@ class TestTokenizerAdheresToPep0263(TestCase):

     def _testFile(self, filename):
         path = os.path.join(os.path.dirname(__file__), filename)
-        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
+        with open(path, 'rb') as f:
+            TestRoundtrip.check_roundtrip(self, f)

     def test_utf8_coding_cookie_and_no_utf8_bom(self):
         f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
@@ -1199,7 +1219,8 @@ class Test_Tokenize(TestCase):
             yield b''

         # skip the initial encoding token and the end tokens
-        tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
+        tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
+                                                        extra_tokens=True))[:-2]
         expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1468,13 +1489,13 @@ class TestTokenize(TestCase):
         def mock_detect_encoding(readline):
             return encoding, [b'first', b'second']

-        def mock__tokenize(readline, encoding):
+        def mock__tokenize(readline, encoding, **kwargs):
             nonlocal encoding_used
             encoding_used = encoding
             out = []
             while True:
                 try:
-                    next_line = next(readline)
+                    next_line = readline()
                 except StopIteration:
                     return out
                 if next_line:
@@ -1491,16 +1512,16 @@ class TestTokenize(TestCase):
             return str(counter).encode()

         orig_detect_encoding = tokenize_module.detect_encoding
-        orig__tokenize = tokenize_module._tokenize
+        orig_c_token = tokenize_module._generate_tokens_from_c_tokenizer
         tokenize_module.detect_encoding = mock_detect_encoding
-        tokenize_module._tokenize = mock__tokenize
+        tokenize_module._generate_tokens_from_c_tokenizer = mock__tokenize
         try:
             results = tokenize(mock_readline)
             self.assertEqual(list(results)[1:],
                              [b'first', b'second', b'1', b'2', b'3', b'4'])
         finally:
             tokenize_module.detect_encoding = orig_detect_encoding
-            tokenize_module._tokenize = orig__tokenize
+            tokenize_module._generate_tokens_from_c_tokenizer = orig_c_token

         self.assertEqual(encoding_used, encoding)

@@ -1827,12 +1848,33 @@ class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
+        f = StringIO(s)
         with self.subTest(source=s):
             result = stringify_tokens_from_source(
-                _generate_tokens_from_c_tokenizer(s), s
+                _generate_tokens_from_c_tokenizer(f.readline), s
             )
             self.assertEqual(result, expected.rstrip().splitlines())

+    def test_encoding(self):
+        def readline(encoding):
+            yield "1+1".encode(encoding)
+
+        expected = [
+            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
+            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
+            TokenInfo(type=NEWLINE, string='\n', start=(1, 3), end=(1, 4), line='1+1\n'),
+            TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+        for encoding in ["utf-8", "latin-1", "utf-16"]:
+            with self.subTest(encoding=encoding):
+                tokens = list(_generate_tokens_from_c_tokenizer(
+                    readline(encoding).__next__,
+                    extra_tokens=True,
+                    encoding=encoding,
+                ))
+                self.assertEqual(tokens, expected)
+
     def test_int(self):

         self.check_tokenize('0xff <= 255', """\
@@ -2668,43 +2710,44 @@ async def f():

     def test_invalid_syntax(self):
         def get_tokens(string):
-            return list(_generate_tokens_from_c_tokenizer(string))
-
-        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
-        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
-        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
-
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "\xa0")
-        self.assertRaises(SyntaxError, get_tokens, "€")
-
-        self.assertRaises(SyntaxError, get_tokens, "0b12")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
-        self.assertRaises(SyntaxError, get_tokens, "0b2")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_")
-        self.assertRaises(SyntaxError, get_tokens, "0b")
-        self.assertRaises(SyntaxError, get_tokens, "0o18")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
-        self.assertRaises(SyntaxError, get_tokens, "0o8")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_")
-        self.assertRaises(SyntaxError, get_tokens, "0o")
-        self.assertRaises(SyntaxError, get_tokens, "0x1_")
-        self.assertRaises(SyntaxError, get_tokens, "0x")
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "012")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
-        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
-
-        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
-        self.assertRaises(SyntaxError, get_tokens, "]")
+            the_string = StringIO(string)
+            return list(_generate_tokens_from_c_tokenizer(the_string.readline))
+
+        for case in [
+            "(1+2]",
+            "(1+2}",
+            "{1+2]",
+            "1_",
+            "1.2_",
+            "1e2_",
+            "1e+",
+
+            "\xa0",
+            "€",
+
+            "0b12",
+            "0b1_2",
+            "0b2",
+            "0b1_",
+            "0b",
+            "0o18",
+            "0o1_8",
+            "0o8",
+            "0o1_",
+            "0o",
+            "0x1_",
+            "0x",
+            "1_",
+            "012",
+            "1.2_",
+            "1e2_",
+            "1e+",
+            "'sdfsdf",
+            "'''sdfsdf''",
+            "("*1000+"a"+")"*1000,
+            "]",
+        ]:
+            with self.subTest(case=case):
+                self.assertRaises(SyntaxError, get_tokens, case)

     def test_max_indent(self):
         MAXINDENT = 100
@@ -2715,20 +2758,24 @@ async def f():
             return source

         valid = generate_source(MAXINDENT - 1)
-        tokens = list(_generate_tokens_from_c_tokenizer(valid))
+        the_input = StringIO(valid)
+        tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
         self.assertEqual(tokens[-2].type, DEDENT)
         self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "<string>", "exec")

         invalid = generate_source(MAXINDENT)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
+        the_input = StringIO(invalid)
+        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )

     def test_continuation_lines_indentation(self):
         def get_tokens(string):
-            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+            the_string = StringIO(string)
+            return [(kind, string) for (kind, string, *_)
+                    in _generate_tokens_from_c_tokenizer(the_string.readline)]

         code = dedent("""
             def fib(n):
@@ -34,6 +34,7 @@ import re
 import sys
 from token import *
 from token import EXACT_TOKEN_TYPES
+import _tokenize

 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -443,12 +444,7 @@ def tokenize(readline):
         # BOM will already have been stripped.
         encoding = "utf-8"
     yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    yield from _tokenize(rl_gen, encoding)
-
-def _tokenize(rl_gen, encoding):
-    source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
-        yield token
+    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)

 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
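After this hunk, tokenize() feeds the byte-oriented readline (wrapped together with the lines already consumed by detect_encoding()) directly to the C tokenizer along with the detected encoding, instead of decoding and joining the whole input up front. Typical usage is unchanged; a short sketch reading from a binary file (the filename is only a placeholder):

    import tokenize

    # Binary mode matters: tokenize() detects the coding cookie / BOM itself.
    with open("example_module.py", "rb") as f:
        for tok in tokenize.tokenize(f.readline):
            print(tok.type, tok.string)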
@@ -456,16 +452,7 @@ def generate_tokens(readline):
     This has the same API as tokenize(), except that it expects the *readline*
     callable to return str objects instead of bytes.
     """
-    def _gen():
-        while True:
-            try:
-                line = readline()
-            except StopIteration:
-                return
-            if not line:
-                return
-            yield line.encode()
-    return _tokenize(_gen(), 'utf-8')
+    return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)

 def main():
     import argparse
@@ -502,9 +489,9 @@ def main():
         tokens = list(tokenize(f.readline))
     else:
         filename = "<stdin>"
-        tokens = _tokenize(
+        tokens = _generate_tokens_from_c_tokenizer(
             (x.encode('utf-8') for x in iter(sys.stdin.readline, "")
-             ), "utf-8")
+             ), "utf-8", extra_tokens=True)


     # Output the tokenization
@@ -531,10 +518,13 @@ def main():
         perror("unexpected error: %s" % err)
         raise

-def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
-    import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
+    if encoding is None:
+        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+    else:
+        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+    for info in it:
         yield TokenInfo._make(info)

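_generate_tokens_from_c_tokenizer now treats its first argument as a readline-style callable (the parameter keeps its old name, source) and only forwards encoding to TokenizerIter when one is supplied. A sketch of the two calling modes, mirroring test_encoding above; this is a private helper, so the exact signature may change:

    import io
    from tokenize import _generate_tokens_from_c_tokenizer

    # No encoding: the callable must return str lines.
    str_readline = io.StringIO("1+1").readline
    list(_generate_tokens_from_c_tokenizer(str_readline, extra_tokens=True))

    # With encoding: the callable must return bytes lines, which the
    # tokenizer decodes before tokenizing.
    bytes_readline = io.BytesIO("1+1".encode("latin-1")).readline
    list(_generate_tokens_from_c_tokenizer(bytes_readline, encoding="latin-1",
                                           extra_tokens=True))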
@@ -103,6 +103,7 @@ tok_new(void)
     tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
+    tok->readline = NULL;
     tok->type_comments = 0;
     tok->async_hacks = 0;
     tok->async_def = 0;
@@ -139,8 +140,9 @@ static char *
 error_ret(struct tok_state *tok) /* XXX */
 {
     tok->decoding_erred = 1;
-    if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
+    if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */
         PyMem_Free(tok->buf);
+    }
     tok->buf = tok->cur = tok->inp = NULL;
     tok->start = NULL;
     tok->end = NULL;
@@ -900,6 +902,33 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
     return tok;
 }

+struct tok_state *
+_PyTokenizer_FromReadline(PyObject* readline, const char* enc,
+                          int exec_input, int preserve_crlf)
+{
+    struct tok_state *tok = tok_new();
+    if (tok == NULL)
+        return NULL;
+    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
+        _PyTokenizer_Free(tok);
+        return NULL;
+    }
+    tok->cur = tok->inp = tok->buf;
+    tok->end = tok->buf + BUFSIZ;
+    tok->fp = NULL;
+    if (enc != NULL) {
+        tok->encoding = new_string(enc, strlen(enc), tok);
+        if (!tok->encoding) {
+            _PyTokenizer_Free(tok);
+            return NULL;
+        }
+    }
+    tok->decoding_state = STATE_NORMAL;
+    Py_INCREF(readline);
+    tok->readline = readline;
+    return tok;
+}
+
 /* Set up tokenizer for UTF-8 string */

 struct tok_state *
@@ -969,8 +998,9 @@ _PyTokenizer_Free(struct tok_state *tok)
     }
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->readline);
     Py_XDECREF(tok->filename);
-    if (tok->fp != NULL && tok->buf != NULL) {
+    if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
         PyMem_Free(tok->buf);
     }
     if (tok->input) {
@@ -1021,6 +1051,71 @@ tok_readline_raw(struct tok_state *tok)
     return 1;
 }

+static int
+tok_readline_string(struct tok_state* tok) {
+    PyObject* line = NULL;
+    PyObject* raw_line = PyObject_CallNoArgs(tok->readline);
+    if (raw_line == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+            PyErr_Clear();
+            return 1;
+        }
+        error_ret(tok);
+        goto error;
+    }
+    if(tok->encoding != NULL) {
+        if (!PyBytes_Check(raw_line)) {
+            PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object");
+            error_ret(tok);
+            goto error;
+        }
+        line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line),
+                                tok->encoding, "replace");
+        Py_CLEAR(raw_line);
+        if (line == NULL) {
+            error_ret(tok);
+            goto error;
+        }
+    } else {
+        if(!PyUnicode_Check(raw_line)) {
+            PyErr_Format(PyExc_TypeError, "readline() returned a non-string object");
+            error_ret(tok);
+            goto error;
+        }
+        line = raw_line;
+        raw_line = NULL;
+    }
+    Py_ssize_t buflen;
+    const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
+    if (buf == NULL) {
+        error_ret(tok);
+        goto error;
+    }
+
+    // Make room for the null terminator *and* potentially
+    // an extra newline character that we may need to artificially
+    // add.
+    size_t buffer_size = buflen + 2;
+    if (!tok_reserve_buf(tok, buffer_size)) {
+        goto error;
+    }
+    memcpy(tok->inp, buf, buflen);
+    tok->inp += buflen;
+    *tok->inp = '\0';
+
+    if (tok->start == NULL) {
+        tok->buf = tok->cur;
+    }
+    tok->line_start = tok->cur;
+
+    Py_DECREF(line);
+    return 1;
+error:
+    Py_XDECREF(raw_line);
+    Py_XDECREF(line);
+    return 0;
+}
+
 static int
 tok_underflow_string(struct tok_state *tok) {
     char *end = strchr(tok->inp, '\n');
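tok_readline_string defines the callable's contract: it is invoked with no arguments once per line, must return bytes when an encoding was supplied and str otherwise, and signals end of input either by raising StopIteration or by returning an empty line. A hedged sketch of a tiny adapter built on that contract; readline_from() is a hypothetical helper for illustration, not part of this commit:

    from tokenize import generate_tokens

    def readline_from(lines):
        # Hypothetical adapter: expose any iterable of source lines as a
        # readline-style callable; StopIteration from the underlying
        # iterator is what ends tokenization.
        return iter(lines).__next__

    chunks = ["x = [\n", "    1, 2,\n", "]\n"]
    for tok in generate_tokens(readline_from(chunks)):
        print(tok)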
@@ -1195,6 +1290,38 @@ tok_underflow_file(struct tok_state *tok) {
     return tok->done == E_OK;
 }

+static int
+tok_underflow_readline(struct tok_state* tok) {
+    assert(tok->decoding_state == STATE_NORMAL);
+    assert(tok->fp == NULL && tok->input == NULL && tok->decoding_readline == NULL);
+    if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
+        tok->cur = tok->inp = tok->buf;
+    }
+    if (!tok_readline_string(tok)) {
+        return 0;
+    }
+    if (tok->inp == tok->cur) {
+        tok->done = E_EOF;
+        return 0;
+    }
+    if (tok->inp[-1] != '\n') {
+        assert(tok->inp + 1 < tok->end);
+        /* Last line does not end in \n, fake one */
+        *tok->inp++ = '\n';
+        *tok->inp = '\0';
+    }
+
+    ADVANCE_LINENO();
+    /* The default encoding is UTF-8, so make sure we don't have any
+       non-UTF-8 sequences in it. */
+    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
+        error_ret(tok);
+        return 0;
+    }
+    assert(tok->done == E_OK);
+    return tok->done == E_OK;
+}
+
 #if defined(Py_DEBUG)
 static void
 print_escape(FILE *f, const char *s, Py_ssize_t size)
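tok_underflow_readline appends an artificial '\n' when the last line produced by the callable does not end in one, so token positions and the implicit NEWLINE token behave as if the newline were present (this matches the line='1+1\n' values asserted in test_encoding above). Observed from the Python side, assuming the public generate_tokens() wrapper:

    import io
    from tokenize import generate_tokens, NEWLINE

    toks = list(generate_tokens(io.StringIO("1+1").readline))  # no trailing newline
    nl = next(t for t in toks if t.type == NEWLINE)
    print(repr(nl.line))   # '1+1\n' -- the faked newline shows up in the line text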
@@ -1238,7 +1365,10 @@ tok_nextc(struct tok_state *tok)
         if (tok->done != E_OK) {
             return EOF;
         }
-        if (tok->fp == NULL) {
+        if (tok->readline) {
+            rc = tok_underflow_readline(tok);
+        }
+        else if (tok->fp == NULL) {
             rc = tok_underflow_string(tok);
         }
         else if (tok->prompt != NULL) {
@@ -109,6 +109,7 @@ struct tok_state {
                                   expression (cf. issue 16806) */
     PyObject *decoding_readline; /* open(...).readline */
     PyObject *decoding_buffer;
+    PyObject *readline;          /* readline() function */
     const char* enc;             /* Encoding for the current str. */
     char* str;                   /* Source string being tokenized (if tokenizing from a string)*/
     char* input;                 /* Tokenizer's newline translated copy of the string. */
@@ -137,6 +138,7 @@ struct tok_state {

 extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                                const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);
@@ -37,15 +37,17 @@ typedef struct
 @classmethod
 _tokenizer.tokenizeriter.__new__ as tokenizeriter_new

-    source: str
+    readline: object
+    /
     *
     extra_tokens: bool
+    encoding: str(c_default="NULL") = 'utf-8'
 [clinic start generated code]*/

 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
-                       int extra_tokens)
-/*[clinic end generated code: output=f6f9d8b4beec8106 input=90dc5b6a5df180c2]*/
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+                       int extra_tokens, const char *encoding)
+/*[clinic end generated code: output=7501a1211683ce16 input=f7dddf8a613ae8bd]*/
 {
     tokenizeriterobject *self = (tokenizeriterobject *)type->tp_alloc(type, 0);
     if (self == NULL) {
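At the C level, TokenizerIter now takes the readline callable as a positional-only argument, with keyword-only extra_tokens and an optional encoding. A sketch of driving the private _tokenize extension module directly (normally you would go through tokenize()/generate_tokens() instead):

    import io
    import _tokenize   # private C extension wrapped by Lib/tokenize.py

    it = _tokenize.TokenizerIter(io.StringIO("a + b\n").readline,
                                 extra_tokens=True)
    for info in it:
        print(info)    # raw tuples; tokenize.py wraps them with TokenInfo._make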
@@ -55,7 +57,7 @@ tokenizeriter_new_impl(PyTypeObject *type, const char *source,
     if (filename == NULL) {
         return NULL;
     }
-    self->tok = _PyTokenizer_FromUTF8(source, 1, 1);
+    self->tok = _PyTokenizer_FromReadline(readline, encoding, 1, 1);
     if (self->tok == NULL) {
         Py_DECREF(filename);
         return NULL;
Python/clinic/Python-tokenize.c.h (generated) — 49 lines changed
@@ -9,8 +9,8 @@ preserve


 static PyObject *
-tokenizeriter_new_impl(PyTypeObject *type, const char *source,
-                       int extra_tokens);
+tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
+                       int extra_tokens, const char *encoding);

 static PyObject *
 tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
@@ -25,7 +25,7 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
         PyObject *ob_item[NUM_KEYWORDS];
     } _kwtuple = {
         .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
-        .ob_item = { &_Py_ID(source), &_Py_ID(extra_tokens), },
+        .ob_item = { &_Py_ID(extra_tokens), &_Py_ID(encoding), },
     };
     #undef NUM_KEYWORDS
     #define KWTUPLE (&_kwtuple.ob_base.ob_base)
@@ -34,43 +34,50 @@ tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
     #  define KWTUPLE NULL
     #endif  // !Py_BUILD_CORE

-    static const char * const _keywords[] = {"source", "extra_tokens", NULL};
+    static const char * const _keywords[] = {"", "extra_tokens", "encoding", NULL};
     static _PyArg_Parser _parser = {
         .keywords = _keywords,
         .fname = "tokenizeriter",
         .kwtuple = KWTUPLE,
     };
     #undef KWTUPLE
-    PyObject *argsbuf[2];
+    PyObject *argsbuf[3];
     PyObject * const *fastargs;
     Py_ssize_t nargs = PyTuple_GET_SIZE(args);
-    const char *source;
+    Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 2;
+    PyObject *readline;
     int extra_tokens;
+    const char *encoding = NULL;

     fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 1, argsbuf);
     if (!fastargs) {
         goto exit;
     }
-    if (!PyUnicode_Check(fastargs[0])) {
-        _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
-        goto exit;
-    }
-    Py_ssize_t source_length;
-    source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
-    if (source == NULL) {
-        goto exit;
-    }
-    if (strlen(source) != (size_t)source_length) {
-        PyErr_SetString(PyExc_ValueError, "embedded null character");
-        goto exit;
-    }
+    readline = fastargs[0];
     extra_tokens = PyObject_IsTrue(fastargs[1]);
     if (extra_tokens < 0) {
         goto exit;
     }
-    return_value = tokenizeriter_new_impl(type, source, extra_tokens);
+    if (!noptargs) {
+        goto skip_optional_kwonly;
+    }
+    if (!PyUnicode_Check(fastargs[2])) {
+        _PyArg_BadArgument("tokenizeriter", "argument 'encoding'", "str", fastargs[2]);
+        goto exit;
+    }
+    Py_ssize_t encoding_length;
+    encoding = PyUnicode_AsUTF8AndSize(fastargs[2], &encoding_length);
+    if (encoding == NULL) {
+        goto exit;
+    }
+    if (strlen(encoding) != (size_t)encoding_length) {
+        PyErr_SetString(PyExc_ValueError, "embedded null character");
+        goto exit;
+    }
+skip_optional_kwonly:
+    return_value = tokenizeriter_new_impl(type, readline, extra_tokens, encoding);

 exit:
     return return_value;
 }
-/*[clinic end generated code: output=940b564c67f6e0e2 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=48be65a2808bdfa6 input=a9049054013a1b77]*/