Add tests for the C tokenizer and expose it as a private module (GH-27924)

This commit is contained in:
Pablo Galindo Salgado 2021-08-24 17:50:05 +01:00 committed by GitHub
parent 9ed523159c
commit a24676bedc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 1114 additions and 5 deletions

195
Python/Python-tokenize.c Normal file
View file

@@ -0,0 +1,195 @@
#include "Python.h"
#include "../Parser/tokenizer.h"
static struct PyModuleDef _tokenizemodule;
// Per-module state. Holds the heap-allocated TokenizerIter type object so
// the module works with multi-phase init (state is per module instance,
// see m_size in _tokenizemodule below).
typedef struct {
    PyTypeObject* TokenizerIter;
} tokenize_state;
/* Return the tokenize_state attached to `module`. */
static tokenize_state*
get_tokenize_state(PyObject* module)
{
    void* state = PyModule_GetState(module);
    return (tokenize_state*)state;
}
// Resolve the module state from a type object created with
// PyType_FromModuleAndSpec — used by the Argument Clinic glue below.
#define _tokenize_get_state_by_type(type) \
    get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))
#include "clinic/Python-tokenize.c.h"
/*[clinic input]
module _tokenizer
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
// Instance layout for TokenizerIter: wraps one C tokenizer state for a
// single source string. `tok` is owned and freed in tokenizeriter_dealloc.
typedef struct {
    PyObject_HEAD
    struct tok_state* tok;
} tokenizeriterobject;
/*[clinic input]
@classmethod
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
source: str
[clinic start generated code]*/
/* Create a TokenizerIter over `source` (the text to tokenize, received as
 * a NUL-free UTF-8 buffer from the clinic wrapper). Returns a new
 * reference, or NULL with an exception set on failure. */
static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
{
    tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
    if (self == NULL) {
        return NULL;
    }
    PyObject* filename = PyUnicode_FromString("<string>");
    if (filename == NULL) {
        // Fix: the half-constructed iterator used to leak on this path.
        Py_DECREF(self);
        return NULL;
    }
    self->tok = PyTokenizer_FromUTF8(source, 1);
    if (self->tok == NULL) {
        // Fix: both `filename` and the iterator used to leak on this path.
        Py_DECREF(filename);
        Py_DECREF(self);
        return NULL;
    }
    // The tok_state takes ownership of the filename reference; it is
    // released by PyTokenizer_Free in tokenizeriter_dealloc.
    self->tok->filename = filename;
    return (PyObject*)self;
}
/* tp_iternext: pull the next token from the embedded tokenizer and return
 * it as a 7-tuple:
 *   (string, type, lineno, end_lineno, col_offset, end_col_offset, line)
 * Returns NULL with StopIteration at end of input, or NULL with the
 * tokenizer's own exception on a tokenization error. */
static PyObject*
tokenizeriter_next(tokenizeriterobject* it)
{
    const char* start;
    const char* end;
    int type = PyTokenizer_Get(it->tok, &start, &end);
    // If the tokenizer already raised (e.g. SyntaxError), propagate it.
    if (type == ERRORTOKEN && PyErr_Occurred()) {
        return NULL;
    }
    // ERRORTOKEN without a pending exception, or ENDMARKER, ends iteration.
    if (type == ERRORTOKEN || type == ENDMARKER) {
        PyErr_SetString(PyExc_StopIteration, "EOF");
        return NULL;
    }
    PyObject* str = NULL;
    // Some tokens carry no text span; represent them as the empty string.
    if (start == NULL || end == NULL) {
        str = PyUnicode_FromString("");
    }
    else {
        str = PyUnicode_FromStringAndSize(start, end - start);
    }
    if (str == NULL) {
        return NULL;
    }
    // The buffered source seen so far (tok->buf..tok->inp), decoded with
    // "replace" so invalid UTF-8 cannot make this fail spuriously.
    Py_ssize_t size = it->tok->inp - it->tok->buf;
    PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
    if (line == NULL) {
        Py_DECREF(str);
        return NULL;
    }
    // Multi-line strings begin on an earlier line than the tokenizer's
    // current one, so their start position comes from the recorded
    // multi_line_start / first_lineno instead.
    const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
    int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
    int end_lineno = it->tok->lineno;
    int col_offset = -1;    // -1 means "offset unavailable"
    int end_col_offset = -1;
    if (start != NULL && start >= line_start) {
        col_offset = (int)(start - line_start);
    }
    if (end != NULL && end >= it->tok->line_start) {
        end_col_offset = (int)(end - it->tok->line_start);
    }
    // "N" transfers ownership of str and line to the tuple without an
    // extra incref. NOTE(review): per the C API docs, if Py_BuildValue
    // itself fails the "N" references can leak — confirm acceptable here.
    return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
}
/* tp_dealloc for the heap-allocated iterator type: release the tokenizer
 * state, free the instance, then drop the reference the instance holds on
 * its heap type (required for types made via PyType_FromModuleAndSpec). */
static void
tokenizeriter_dealloc(tokenizeriterobject* it)
{
    // Capture the type first: tp_free invalidates `it`.
    PyTypeObject* heap_type = Py_TYPE(it);
    PyTokenizer_Free(it->tok);
    heap_type->tp_free(it);
    Py_DECREF(heap_type);
}
// Slot table for the TokenizerIter heap type.
static PyType_Slot tokenizeriter_slots[] = {
    {Py_tp_new, tokenizeriter_new},             // clinic-generated wrapper
    {Py_tp_dealloc, tokenizeriter_dealloc},
    {Py_tp_getattro, PyObject_GenericGetAttr},
    {Py_tp_iter, PyObject_SelfIter},            // iter(it) is it
    {Py_tp_iternext, tokenizeriter_next},
    {0, NULL},
};
// Spec for creating TokenizerIter via PyType_FromModuleAndSpec.
// NOTE(review): the clinic block above declares module "_tokenizer" while
// the type/module name here is "_tokenize" — confirm which is intended.
static PyType_Spec tokenizeriter_spec = {
    .name = "_tokenize.TokenizerIter",
    .basicsize = sizeof(tokenizeriterobject),
    .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
    .slots = tokenizeriter_slots,
};
/* Py_mod_exec: create the TokenizerIter type, stash it in the module
 * state, and expose it as a module attribute. Returns 0 on success,
 * -1 with an exception set on failure. */
static int
tokenizemodule_exec(PyObject* m)
{
    tokenize_state* state = get_tokenize_state(m);
    if (state == NULL) {
        return -1;
    }
    PyObject* iter_type = PyType_FromModuleAndSpec(m, &tokenizeriter_spec, NULL);
    if (iter_type == NULL) {
        return -1;
    }
    // The module state keeps a strong reference (visited in traverse,
    // dropped in clear).
    state->TokenizerIter = (PyTypeObject*)iter_type;
    if (PyModule_AddType(m, state->TokenizerIter) < 0) {
        return -1;
    }
    return 0;
}
// The module exposes no functions of its own; TokenizerIter is added in
// tokenizemodule_exec.
static PyMethodDef tokenize_methods[] = {
    {NULL, NULL, 0, NULL} /* Sentinel */
};
// Multi-phase initialization slots: run tokenizemodule_exec after the
// module object is created.
static PyModuleDef_Slot tokenizemodule_slots[] = {
    {Py_mod_exec, tokenizemodule_exec},
    {0, NULL}
};
/* GC traverse: report the owned TokenizerIter type to the collector. */
static int
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
{
    tokenize_state *st = get_tokenize_state(m);
    Py_VISIT(st->TokenizerIter);
    return 0;
}
/* GC clear: drop the strong reference to the TokenizerIter type. */
static int
tokenizemodule_clear(PyObject *m)
{
    tokenize_state *st = get_tokenize_state(m);
    Py_CLEAR(st->TokenizerIter);
    return 0;
}
// m_free handler: delegate to the clear function so the type reference is
// released when the module is deallocated.
static void
tokenizemodule_free(void *m)
{
    tokenizemodule_clear((PyObject *)m);
}
// Module definition for the private _tokenize extension module.
static struct PyModuleDef _tokenizemodule = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_tokenize",
    .m_size = sizeof(tokenize_state),   // per-module state, enables multi-phase init
    .m_slots = tokenizemodule_slots,
    .m_methods = tokenize_methods,
    .m_traverse = tokenizemodule_traverse,
    .m_clear = tokenizemodule_clear,
    .m_free = tokenizemodule_free,
};
/* Module entry point (multi-phase init): return the definition; the real
 * setup happens later in tokenizemodule_exec. */
PyMODINIT_FUNC
PyInit__tokenize(void)
{
    return PyModuleDef_Init(&_tokenizemodule);
}

41
Python/clinic/Python-tokenize.c.h generated Normal file
View file

@@ -0,0 +1,41 @@
/*[clinic input]
preserve
[clinic start generated code]*/
/* Forward declaration of the clinic implementation function. */
static PyObject *
tokenizeriter_new_impl(PyTypeObject *type, const char *source);

/* Argument Clinic-generated tp_new wrapper: parses the single required
 * "source" argument (positional or keyword) as a str with no embedded
 * NUL characters, then forwards its UTF-8 buffer to
 * tokenizeriter_new_impl. Do not edit by hand — regenerate with
 * Argument Clinic. */
static PyObject *
tokenizeriter_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
    PyObject *return_value = NULL;
    static const char * const _keywords[] = {"source", NULL};
    static _PyArg_Parser _parser = {NULL, _keywords, "tokenizeriter", 0};
    PyObject *argsbuf[1];
    PyObject * const *fastargs;
    Py_ssize_t nargs = PyTuple_GET_SIZE(args);
    const char *source;

    fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, 1, 1, 0, argsbuf);
    if (!fastargs) {
        goto exit;
    }
    if (!PyUnicode_Check(fastargs[0])) {
        _PyArg_BadArgument("tokenizeriter", "argument 'source'", "str", fastargs[0]);
        goto exit;
    }
    Py_ssize_t source_length;
    source = PyUnicode_AsUTF8AndSize(fastargs[0], &source_length);
    if (source == NULL) {
        goto exit;
    }
    // A length mismatch means the str contains an embedded NUL, which the
    // char* interface cannot represent.
    if (strlen(source) != (size_t)source_length) {
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        goto exit;
    }
    return_value = tokenizeriter_new_impl(type, source);

exit:
    return return_value;
}
/*[clinic end generated code: output=dfcd64774e01bfe6 input=a9049054013a1b77]*/

View file

@@ -80,6 +80,7 @@ static const char* _Py_stdlib_module_names[] = {
"_thread",
"_threading_local",
"_tkinter",
"_tokenize",
"_tracemalloc",
"_typing",
"_uuid",