mirror of
https://github.com/python/cpython.git
synced 2025-07-07 19:35:27 +00:00
Add tests for the C tokenizer and expose it as a private module (GH-27924)
This commit is contained in:
parent
9ed523159c
commit
a24676bedc
9 changed files with 1114 additions and 5 deletions
195
Python/Python-tokenize.c
Normal file
195
Python/Python-tokenize.c
Normal file
|
@ -0,0 +1,195 @@
|
|||
#include "Python.h"
|
||||
#include "../Parser/tokenizer.h"
|
||||
|
||||
static struct PyModuleDef _tokenizemodule;
|
||||
|
||||
typedef struct {
|
||||
PyTypeObject* TokenizerIter;
|
||||
} tokenize_state;
|
||||
|
||||
static tokenize_state*
|
||||
get_tokenize_state(PyObject* module)
|
||||
{
|
||||
return (tokenize_state*)PyModule_GetState(module);
|
||||
}
|
||||
|
||||
#define _tokenize_get_state_by_type(type) \
|
||||
get_tokenize_state(_PyType_GetModuleByDef(type, &_tokenizemodule))
|
||||
|
||||
#include "clinic/Python-tokenize.c.h"
|
||||
|
||||
/*[clinic input]
|
||||
module _tokenizer
|
||||
class _tokenizer.tokenizeriter "tokenizeriterobject *" "_tokenize_get_state_by_type(type)->TokenizerIter"
|
||||
[clinic start generated code]*/
|
||||
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=96d98ee2fef7a8bc]*/
|
||||
|
||||
typedef struct {
|
||||
PyObject_HEAD
|
||||
struct tok_state* tok;
|
||||
} tokenizeriterobject;
|
||||
|
||||
/*[clinic input]
|
||||
@classmethod
|
||||
_tokenizer.tokenizeriter.__new__ as tokenizeriter_new
|
||||
|
||||
source: str
|
||||
[clinic start generated code]*/
|
||||
|
||||
static PyObject *
|
||||
tokenizeriter_new_impl(PyTypeObject *type, const char *source)
|
||||
/*[clinic end generated code: output=7fd9f46cf9263cbb input=4384b368407375c6]*/
|
||||
{
|
||||
tokenizeriterobject* self = (tokenizeriterobject*)type->tp_alloc(type, 0);
|
||||
if (self == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
PyObject* filename = PyUnicode_FromString("<string>");
|
||||
if (filename == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
self->tok = PyTokenizer_FromUTF8(source, 1);
|
||||
if (self->tok == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
self->tok->filename = filename;
|
||||
return (PyObject*)self;
|
||||
}
|
||||
|
||||
static PyObject*
|
||||
tokenizeriter_next(tokenizeriterobject* it)
|
||||
{
|
||||
const char* start;
|
||||
const char* end;
|
||||
int type = PyTokenizer_Get(it->tok, &start, &end);
|
||||
if (type == ERRORTOKEN && PyErr_Occurred()) {
|
||||
return NULL;
|
||||
}
|
||||
if (type == ERRORTOKEN || type == ENDMARKER) {
|
||||
PyErr_SetString(PyExc_StopIteration, "EOF");
|
||||
return NULL;
|
||||
}
|
||||
PyObject* str = NULL;
|
||||
if (start == NULL || end == NULL) {
|
||||
str = PyUnicode_FromString("");
|
||||
} else {
|
||||
str = PyUnicode_FromStringAndSize(start, end - start);
|
||||
}
|
||||
if (str == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Py_ssize_t size = it->tok->inp - it->tok->buf;
|
||||
PyObject* line = PyUnicode_DecodeUTF8(it->tok->buf, size, "replace");
|
||||
if (line == NULL) {
|
||||
Py_DECREF(str);
|
||||
return NULL;
|
||||
}
|
||||
const char* line_start = type == STRING ? it->tok->multi_line_start : it->tok->line_start;
|
||||
int lineno = type == STRING ? it->tok->first_lineno : it->tok->lineno;
|
||||
int end_lineno = it->tok->lineno;
|
||||
int col_offset = -1;
|
||||
int end_col_offset = -1;
|
||||
if (start != NULL && start >= line_start) {
|
||||
col_offset = (int)(start - line_start);
|
||||
}
|
||||
if (end != NULL && end >= it->tok->line_start) {
|
||||
end_col_offset = (int)(end - it->tok->line_start);
|
||||
}
|
||||
|
||||
return Py_BuildValue("(NiiiiiN)", str, type, lineno, end_lineno, col_offset, end_col_offset, line);
|
||||
}
|
||||
|
||||
static void
|
||||
tokenizeriter_dealloc(tokenizeriterobject* it)
|
||||
{
|
||||
PyTypeObject* tp = Py_TYPE(it);
|
||||
PyTokenizer_Free(it->tok);
|
||||
tp->tp_free(it);
|
||||
Py_DECREF(tp);
|
||||
}
|
||||
|
||||
static PyType_Slot tokenizeriter_slots[] = {
|
||||
{Py_tp_new, tokenizeriter_new},
|
||||
{Py_tp_dealloc, tokenizeriter_dealloc},
|
||||
{Py_tp_getattro, PyObject_GenericGetAttr},
|
||||
{Py_tp_iter, PyObject_SelfIter},
|
||||
{Py_tp_iternext, tokenizeriter_next},
|
||||
{0, NULL},
|
||||
};
|
||||
|
||||
static PyType_Spec tokenizeriter_spec = {
|
||||
.name = "_tokenize.TokenizerIter",
|
||||
.basicsize = sizeof(tokenizeriterobject),
|
||||
.flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_IMMUTABLETYPE),
|
||||
.slots = tokenizeriter_slots,
|
||||
};
|
||||
|
||||
|
||||
static int
|
||||
tokenizemodule_exec(PyObject* m)
|
||||
{
|
||||
tokenize_state* state = get_tokenize_state(m);
|
||||
if (state == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
state->TokenizerIter = (PyTypeObject *)PyType_FromModuleAndSpec(
|
||||
m, &tokenizeriter_spec, NULL);
|
||||
if (state->TokenizerIter == NULL) {
|
||||
return -1;
|
||||
}
|
||||
if (PyModule_AddType(m, state->TokenizerIter) < 0) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static PyMethodDef tokenize_methods[] = {
|
||||
{NULL, NULL, 0, NULL} /* Sentinel */
|
||||
};
|
||||
|
||||
static PyModuleDef_Slot tokenizemodule_slots[] = {
|
||||
{Py_mod_exec, tokenizemodule_exec},
|
||||
{0, NULL}
|
||||
};
|
||||
|
||||
static int
|
||||
tokenizemodule_traverse(PyObject *m, visitproc visit, void *arg)
|
||||
{
|
||||
tokenize_state *state = get_tokenize_state(m);
|
||||
Py_VISIT(state->TokenizerIter);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
tokenizemodule_clear(PyObject *m)
|
||||
{
|
||||
tokenize_state *state = get_tokenize_state(m);
|
||||
Py_CLEAR(state->TokenizerIter);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
tokenizemodule_free(void *m)
|
||||
{
|
||||
tokenizemodule_clear((PyObject *)m);
|
||||
}
|
||||
|
||||
static struct PyModuleDef _tokenizemodule = {
|
||||
PyModuleDef_HEAD_INIT,
|
||||
.m_name = "_tokenize",
|
||||
.m_size = sizeof(tokenize_state),
|
||||
.m_slots = tokenizemodule_slots,
|
||||
.m_methods = tokenize_methods,
|
||||
.m_traverse = tokenizemodule_traverse,
|
||||
.m_clear = tokenizemodule_clear,
|
||||
.m_free = tokenizemodule_free,
|
||||
};
|
||||
|
||||
PyMODINIT_FUNC
|
||||
PyInit__tokenize(void)
|
||||
{
|
||||
return PyModuleDef_Init(&_tokenizemodule);
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue