mirror of https://github.com/python/cpython.git
synced 2025-08-03 08:34:29 +00:00
gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (#105070)
This commit is contained in:
parent
2ea34cfb3a
commit
9216e69a87
7 changed files with 276 additions and 98 deletions
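The readline-like callable named in the title is any zero-argument callable that returns the next line of source on each call and signals exhaustion by raising StopIteration, which is the contract enforced by tok_readline_string() in the diff below. As an illustration only, not part of this commit, a minimal such callable written against the C API might look like the following; LineState, source, and my_readline are hypothetical names:

#include <Python.h>
#include <string.h>

/* Hypothetical line source: a NUL-terminated buffer plus a cursor. */
typedef struct {
    const char *text;
    size_t pos;
} LineState;

static LineState source = { "x = 1\nprint(x)\n", 0 };

/* Zero-argument callable: each call returns the next line as a str and
   raises StopIteration once the buffer is exhausted, which is the kind of
   object tok_readline_string() below accepts when no encoding is set. */
static PyObject *
my_readline(PyObject *Py_UNUSED(self), PyObject *Py_UNUSED(args))
{
    const char *start = source.text + source.pos;
    if (*start == '\0') {
        PyErr_SetNone(PyExc_StopIteration);
        return NULL;
    }
    const char *nl = strchr(start, '\n');
    size_t len = nl ? (size_t)(nl - start) + 1 : strlen(start);
    source.pos += len;
    return PyUnicode_FromStringAndSize(start, (Py_ssize_t)len);
}

static PyMethodDef readline_def = {
    "my_readline", my_readline, METH_NOARGS, "Return the next source line."
};

/* PyCFunction_New(&readline_def, NULL) yields the callable object that can
   then be handed to _PyTokenizer_FromReadline() introduced in this commit. */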
@@ -103,6 +103,7 @@ tok_new(void)
     tok->filename = NULL;
     tok->decoding_readline = NULL;
     tok->decoding_buffer = NULL;
+    tok->readline = NULL;
     tok->type_comments = 0;
     tok->async_hacks = 0;
     tok->async_def = 0;
@@ -139,8 +140,9 @@ static char *
 error_ret(struct tok_state *tok) /* XXX */
 {
     tok->decoding_erred = 1;
-    if (tok->fp != NULL && tok->buf != NULL) /* see _PyTokenizer_Free */
+    if ((tok->fp != NULL || tok->readline != NULL) && tok->buf != NULL) {/* see _PyTokenizer_Free */
         PyMem_Free(tok->buf);
+    }
     tok->buf = tok->cur = tok->inp = NULL;
     tok->start = NULL;
     tok->end = NULL;
@@ -900,6 +902,33 @@ _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
     return tok;
 }
 
+struct tok_state *
+_PyTokenizer_FromReadline(PyObject* readline, const char* enc,
+                          int exec_input, int preserve_crlf)
+{
+    struct tok_state *tok = tok_new();
+    if (tok == NULL)
+        return NULL;
+    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
+        _PyTokenizer_Free(tok);
+        return NULL;
+    }
+    tok->cur = tok->inp = tok->buf;
+    tok->end = tok->buf + BUFSIZ;
+    tok->fp = NULL;
+    if (enc != NULL) {
+        tok->encoding = new_string(enc, strlen(enc), tok);
+        if (!tok->encoding) {
+            _PyTokenizer_Free(tok);
+            return NULL;
+        }
+    }
+    tok->decoding_state = STATE_NORMAL;
+    Py_INCREF(readline);
+    tok->readline = readline;
+    return tok;
+}
+
 /* Set up tokenizer for UTF-8 string */
 
 struct tok_state *
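A hedged sketch of how the new constructor might be driven from C follows; this is a private API, so the include path and build setup are assumptions, and tokenize_from_stringio is a hypothetical helper. The readline argument here is the bound readline method of an io.StringIO object, and the tokenizer takes its own reference to it (the Py_INCREF above), so the caller can drop its reference right away:

#include <Python.h>
#include "tokenizer.h"   /* internal CPython header; exact path is an assumption */

/* Assumes an already-initialized interpreter. */
static int
tokenize_from_stringio(const char *source)
{
    /* Build io.StringIO(source) and take its bound readline method. */
    PyObject *io = PyImport_ImportModule("io");
    if (io == NULL) {
        return -1;
    }
    PyObject *stream = PyObject_CallMethod(io, "StringIO", "s", source);
    Py_DECREF(io);
    if (stream == NULL) {
        return -1;
    }
    PyObject *readline = PyObject_GetAttrString(stream, "readline");
    Py_DECREF(stream);
    if (readline == NULL) {
        return -1;
    }

    /* enc == NULL means the callable must return str objects (see
       tok_readline_string); the trailing flags mirror the exec_input and
       preserve_crlf parameters of the other constructors. */
    struct tok_state *tok = _PyTokenizer_FromReadline(readline, NULL, 1, 0);
    Py_DECREF(readline);   /* the tokenizer holds its own reference */
    if (tok == NULL) {
        return -1;
    }

    /* ... drive the tokenizer here ... */

    _PyTokenizer_Free(tok);
    return 0;
}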
@@ -969,8 +998,9 @@ _PyTokenizer_Free(struct tok_state *tok)
     }
     Py_XDECREF(tok->decoding_readline);
     Py_XDECREF(tok->decoding_buffer);
+    Py_XDECREF(tok->readline);
     Py_XDECREF(tok->filename);
-    if (tok->fp != NULL && tok->buf != NULL) {
+    if ((tok->readline != NULL || tok->fp != NULL ) && tok->buf != NULL) {
         PyMem_Free(tok->buf);
     }
     if (tok->input) {
@@ -1021,6 +1051,71 @@ tok_readline_raw(struct tok_state *tok)
     return 1;
 }
 
+static int
+tok_readline_string(struct tok_state* tok) {
+    PyObject* line = NULL;
+    PyObject* raw_line = PyObject_CallNoArgs(tok->readline);
+    if (raw_line == NULL) {
+        if (PyErr_ExceptionMatches(PyExc_StopIteration)) {
+            PyErr_Clear();
+            return 1;
+        }
+        error_ret(tok);
+        goto error;
+    }
+    if(tok->encoding != NULL) {
+        if (!PyBytes_Check(raw_line)) {
+            PyErr_Format(PyExc_TypeError, "readline() returned a non-bytes object");
+            error_ret(tok);
+            goto error;
+        }
+        line = PyUnicode_Decode(PyBytes_AS_STRING(raw_line), PyBytes_GET_SIZE(raw_line),
+                                tok->encoding, "replace");
+        Py_CLEAR(raw_line);
+        if (line == NULL) {
+            error_ret(tok);
+            goto error;
+        }
+    } else {
+        if(!PyUnicode_Check(raw_line)) {
+            PyErr_Format(PyExc_TypeError, "readline() returned a non-string object");
+            error_ret(tok);
+            goto error;
+        }
+        line = raw_line;
+        raw_line = NULL;
+    }
+    Py_ssize_t buflen;
+    const char* buf = PyUnicode_AsUTF8AndSize(line, &buflen);
+    if (buf == NULL) {
+        error_ret(tok);
+        goto error;
+    }
+
+    // Make room for the null terminator *and* potentially
+    // an extra newline character that we may need to artificially
+    // add.
+    size_t buffer_size = buflen + 2;
+    if (!tok_reserve_buf(tok, buffer_size)) {
+        goto error;
+    }
+    memcpy(tok->inp, buf, buflen);
+    tok->inp += buflen;
+    *tok->inp = '\0';
+
+    if (tok->start == NULL) {
+        tok->buf = tok->cur;
+    }
+    tok->line_start = tok->cur;
+
+    Py_DECREF(line);
+    return 1;
+error:
+    Py_XDECREF(raw_line);
+    Py_XDECREF(line);
+    return 0;
+}
+
 static int
 tok_underflow_string(struct tok_state *tok) {
     char *end = strchr(tok->inp, '\n');
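The type checks above define the callable's contract: when an encoding is passed to _PyTokenizer_FromReadline(), every call must return bytes, which are decoded here with that encoding and the "replace" error handler; with no encoding, every call must return str. A small illustrative sketch, not part of the commit, of building either flavour of callable from the io module (make_readline is a hypothetical name):

#include <Python.h>

/* Illustrative helper: returns a new reference to a readline callable built
   from the io module, or NULL with an exception set. With want_bytes != 0
   the callable yields bytes objects (pair it with a non-NULL enc argument to
   _PyTokenizer_FromReadline()); with want_bytes == 0 it yields str objects
   (pair it with enc == NULL). */
static PyObject *
make_readline(const char *source, int want_bytes)
{
    PyObject *io = PyImport_ImportModule("io");
    if (io == NULL) {
        return NULL;
    }
    PyObject *stream = want_bytes
        ? PyObject_CallMethod(io, "BytesIO", "y", source)
        : PyObject_CallMethod(io, "StringIO", "s", source);
    Py_DECREF(io);
    if (stream == NULL) {
        return NULL;
    }
    PyObject *readline = PyObject_GetAttrString(stream, "readline");
    Py_DECREF(stream);
    return readline;
}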
@@ -1195,6 +1290,38 @@ tok_underflow_file(struct tok_state *tok) {
     return tok->done == E_OK;
 }
 
+static int
+tok_underflow_readline(struct tok_state* tok) {
+    assert(tok->decoding_state == STATE_NORMAL);
+    assert(tok->fp == NULL && tok->input == NULL && tok->decoding_readline == NULL);
+    if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
+        tok->cur = tok->inp = tok->buf;
+    }
+    if (!tok_readline_string(tok)) {
+        return 0;
+    }
+    if (tok->inp == tok->cur) {
+        tok->done = E_EOF;
+        return 0;
+    }
+    if (tok->inp[-1] != '\n') {
+        assert(tok->inp + 1 < tok->end);
+        /* Last line does not end in \n, fake one */
+        *tok->inp++ = '\n';
+        *tok->inp = '\0';
+    }
+
+    ADVANCE_LINENO();
+    /* The default encoding is UTF-8, so make sure we don't have any
+       non-UTF-8 sequences in it. */
+    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
+        error_ret(tok);
+        return 0;
+    }
+    assert(tok->done == E_OK);
+    return tok->done == E_OK;
+}
+
 #if defined(Py_DEBUG)
 static void
 print_escape(FILE *f, const char *s, Py_ssize_t size)
@@ -1238,7 +1365,10 @@ tok_nextc(struct tok_state *tok)
         if (tok->done != E_OK) {
             return EOF;
         }
-        if (tok->fp == NULL) {
+        if (tok->readline) {
+            rc = tok_underflow_readline(tok);
+        }
+        else if (tok->fp == NULL) {
             rc = tok_underflow_string(tok);
         }
         else if (tok->prompt != NULL) {
@@ -109,6 +109,7 @@ struct tok_state {
                                         expression (cf. issue 16806) */
     PyObject *decoding_readline; /* open(...).readline */
     PyObject *decoding_buffer;
+    PyObject *readline;     /* readline() function */
     const char* enc;        /* Encoding for the current str. */
     char* str;              /* Source string being tokenized (if tokenizing from a string)*/
     char* input;            /* Tokenizer's newline translated copy of the string. */
@@ -137,6 +138,7 @@ struct tok_state {
 
 extern struct tok_state *_PyTokenizer_FromString(const char *, int, int);
 extern struct tok_state *_PyTokenizer_FromUTF8(const char *, int, int);
+extern struct tok_state *_PyTokenizer_FromReadline(PyObject*, const char*, int, int);
 extern struct tok_state *_PyTokenizer_FromFile(FILE *, const char*,
                                                const char *, const char *);
 extern void _PyTokenizer_Free(struct tok_state *);