gh-102856: Initial implementation of PEP 701 (#102855)

Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
Co-authored-by: Batuhan Taskaya <isidentical@gmail.com>
Co-authored-by: Marta Gómez Macías <mgmacias@google.com>
Co-authored-by: sunmy2019 <59365878+sunmy2019@users.noreply.github.com>
Pablo Galindo Salgado 2023-04-19 17:18:16 +01:00 committed by GitHub
parent a6b07b5a34
commit 1ef61cf71a
27 changed files with 8859 additions and 6573 deletions


@ -1,6 +1,7 @@
#include <Python.h>
#include "pegen.h"
#include "tokenizer.h"
#include "string_parser.h"
#include "pycore_runtime.h" // _PyRuntime
@ -853,96 +854,6 @@ _PyPegen_seq_delete_starred_exprs(Parser *p, asdl_seq *kwargs)
return new_seq;
}
expr_ty
_PyPegen_concatenate_strings(Parser *p, asdl_seq *strings)
{
Py_ssize_t len = asdl_seq_LEN(strings);
assert(len > 0);
Token *first = asdl_seq_GET_UNTYPED(strings, 0);
Token *last = asdl_seq_GET_UNTYPED(strings, len - 1);
int bytesmode = 0;
PyObject *bytes_str = NULL;
FstringParser state;
_PyPegen_FstringParser_Init(&state);
for (Py_ssize_t i = 0; i < len; i++) {
Token *t = asdl_seq_GET_UNTYPED(strings, i);
int this_bytesmode;
int this_rawmode;
PyObject *s;
const char *fstr;
Py_ssize_t fstrlen = -1;
if (_PyPegen_parsestr(p, &this_bytesmode, &this_rawmode, &s, &fstr, &fstrlen, t) != 0) {
goto error;
}
/* Check that we are not mixing bytes with unicode. */
if (i != 0 && bytesmode != this_bytesmode) {
RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals");
Py_XDECREF(s);
goto error;
}
bytesmode = this_bytesmode;
if (fstr != NULL) {
assert(s == NULL && !bytesmode);
int result = _PyPegen_FstringParser_ConcatFstring(p, &state, &fstr, fstr + fstrlen,
this_rawmode, 0, first, t, last);
if (result < 0) {
goto error;
}
}
else {
/* String or byte string. */
assert(s != NULL && fstr == NULL);
assert(bytesmode ? PyBytes_CheckExact(s) : PyUnicode_CheckExact(s));
if (bytesmode) {
if (i == 0) {
bytes_str = s;
}
else {
PyBytes_ConcatAndDel(&bytes_str, s);
if (!bytes_str) {
goto error;
}
}
}
else {
/* This is a regular string. Concatenate it. */
if (_PyPegen_FstringParser_ConcatAndDel(&state, s) < 0) {
goto error;
}
}
}
}
if (bytesmode) {
if (_PyArena_AddPyObject(p->arena, bytes_str) < 0) {
goto error;
}
return _PyAST_Constant(bytes_str, NULL, first->lineno,
first->col_offset, last->end_lineno,
last->end_col_offset, p->arena);
}
return _PyPegen_FstringParser_Finish(p, &state, first, last);
error:
Py_XDECREF(bytes_str);
_PyPegen_FstringParser_Dealloc(&state);
if (PyErr_Occurred()) {
_Pypegen_raise_decode_error(p);
}
return NULL;
}
expr_ty
_PyPegen_ensure_imaginary(Parser *p, expr_ty exp)
{
@ -1054,6 +965,18 @@ _PyPegen_check_legacy_stmt(Parser *p, expr_ty name) {
return 0;
}
expr_ty
_PyPegen_check_fstring_conversion(Parser *p, Token* symbol, expr_ty conv) {
if (symbol->lineno != conv->lineno || symbol->end_col_offset != conv->col_offset) {
return RAISE_SYNTAX_ERROR_KNOWN_RANGE(
symbol, conv,
"f-string: conversion type must come right after the exclamanation mark"
);
}
return conv;
}
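/* Illustrative example of the location check above: in f"{x!r}" the 'r'
starts exactly where the '!' ends, so it is accepted, while f"{x! r}"
(whitespace after the '!') or a conversion character on a different line
triggers the syntax error. */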
const char *
_PyPegen_get_expr_name(expr_ty e)
{
@ -1271,3 +1194,439 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq
"Generator expression must be parenthesized"
);
}
// Fstring stuff
static expr_ty
decode_fstring_buffer(Parser *p, int lineno, int col_offset, int end_lineno,
int end_col_offset)
{
tokenizer_mode *tok_mode = &(p->tok->tok_mode_stack[p->tok->tok_mode_stack_index]);
assert(tok_mode->last_expr_buffer != NULL);
assert(tok_mode->last_expr_size >= 0 && tok_mode->last_expr_end >= 0);
PyObject *res = PyUnicode_DecodeUTF8(
tok_mode->last_expr_buffer,
tok_mode->last_expr_size - tok_mode->last_expr_end,
NULL
);
if (!res || _PyArena_AddPyObject(p->arena, res) < 0) {
Py_XDECREF(res);
return NULL;
}
return _PyAST_Constant(res, NULL, lineno, col_offset, end_lineno, end_col_offset, p->arena);
}
static expr_ty
_PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant) {
assert(PyUnicode_CheckExact(constant->v.Constant.value));
const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
if (bstr == NULL) {
return NULL;
}
size_t len;
if (strcmp(bstr, "{{") == 0 || strcmp(bstr, "}}") == 0) {
len = 1;
} else {
len = strlen(bstr);
}
is_raw = is_raw || strchr(bstr, '\\') == NULL;
PyObject *str = _PyPegen_decode_string(p, is_raw, bstr, len, NULL);
if (str == NULL) {
_Pypegen_raise_decode_error(p);
return NULL;
}
if (_PyArena_AddPyObject(p->arena, str) < 0) {
Py_DECREF(str);
return NULL;
}
return _PyAST_Constant(str, NULL, constant->lineno, constant->col_offset,
constant->end_lineno, constant->end_col_offset,
p->arena);
}
static asdl_expr_seq *
unpack_top_level_joined_strs(Parser *p, asdl_expr_seq *raw_expressions)
{
/* The parser might put multiple f-string values into an individual
* JoinedStr node at the top level due to things like f-string debugging
* expressions. This function flattens those and promotes them to the
* upper level. It only simplifies the AST; the compiler already takes
* care of the regular output, so this is not necessary if you are not
* going to expose the output AST at the Python level. */
Py_ssize_t i, req_size, raw_size;
req_size = raw_size = asdl_seq_LEN(raw_expressions);
expr_ty expr;
for (i = 0; i < raw_size; i++) {
expr = asdl_seq_GET(raw_expressions, i);
if (expr->kind == JoinedStr_kind) {
req_size += asdl_seq_LEN(expr->v.JoinedStr.values) - 1;
}
}
asdl_expr_seq *expressions = _Py_asdl_expr_seq_new(req_size, p->arena);
Py_ssize_t raw_index, req_index = 0;
for (raw_index = 0; raw_index < raw_size; raw_index++) {
expr = asdl_seq_GET(raw_expressions, raw_index);
if (expr->kind == JoinedStr_kind) {
asdl_expr_seq *values = expr->v.JoinedStr.values;
for (Py_ssize_t n = 0; n < asdl_seq_LEN(values); n++) {
asdl_seq_SET(expressions, req_index, asdl_seq_GET(values, n));
req_index++;
}
} else {
asdl_seq_SET(expressions, req_index, expr);
req_index++;
}
}
return expressions;
}
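/* Flattening sketch: the debug form f"{a=}" is parsed (roughly) into a
nested JoinedStr([Constant('a='), FormattedValue(Name('a'), 'r')]) by
_PyPegen_formatted_value below; unpack_top_level_joined_strs promotes
those two values into the top-level sequence so the final result is a
single flat JoinedStr. */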
expr_ty
_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b) {
asdl_expr_seq *expr = unpack_top_level_joined_strs(p, raw_expressions);
Py_ssize_t n_items = asdl_seq_LEN(expr);
const char* quote_str = PyBytes_AsString(a->bytes);
if (quote_str == NULL) {
return NULL;
}
int is_raw = strpbrk(quote_str, "rR") != NULL;
asdl_expr_seq *seq = _Py_asdl_expr_seq_new(n_items, p->arena);
if (seq == NULL) {
return NULL;
}
Py_ssize_t index = 0;
for (Py_ssize_t i = 0; i < n_items; i++) {
expr_ty item = asdl_seq_GET(expr, i);
if (item->kind == Constant_kind) {
item = _PyPegen_decode_fstring_part(p, is_raw, item);
if (item == NULL) {
return NULL;
}
/* The tokenizer emits string parts even when the underlying string
might become an empty value (e.g. FSTRING_MIDDLE with the value \\n),
so we need to check for them and simplify them away here. */
if (PyUnicode_CheckExact(item->v.Constant.value)
&& PyUnicode_GET_LENGTH(item->v.Constant.value) == 0) {
continue;
}
}
asdl_seq_SET(seq, index++, item);
}
asdl_expr_seq *resized_exprs;
if (index != n_items) {
resized_exprs = _Py_asdl_expr_seq_new(index, p->arena);
if (resized_exprs == NULL) {
return NULL;
}
for (Py_ssize_t i = 0; i < index; i++) {
asdl_seq_SET(resized_exprs, i, asdl_seq_GET(seq, i));
}
}
else {
resized_exprs = seq;
}
return _PyAST_JoinedStr(resized_exprs, a->lineno, a->col_offset,
b->end_lineno, b->end_col_offset,
p->arena);
}
expr_ty _PyPegen_constant_from_token(Parser* p, Token* tok) {
char* bstr = PyBytes_AsString(tok->bytes);
if (bstr == NULL) {
return NULL;
}
PyObject* str = PyUnicode_FromString(bstr);
if (str == NULL) {
return NULL;
}
if (_PyArena_AddPyObject(p->arena, str) < 0) {
Py_DECREF(str);
return NULL;
}
return _PyAST_Constant(str, NULL, tok->lineno, tok->col_offset,
tok->end_lineno, tok->end_col_offset,
p->arena);
}
expr_ty _PyPegen_constant_from_string(Parser* p, Token* tok) {
char* the_str = PyBytes_AsString(tok->bytes);
if (the_str == NULL) {
return NULL;
}
PyObject *s = _PyPegen_parse_string(p, tok);
if (s == NULL) {
_Pypegen_raise_decode_error(p);
return NULL;
}
if (_PyArena_AddPyObject(p->arena, s) < 0) {
Py_DECREF(s);
return NULL;
}
PyObject *kind = NULL;
if (the_str && the_str[0] == 'u') {
kind = _PyPegen_new_identifier(p, "u");
if (kind == NULL) {
return NULL;
}
}
return _PyAST_Constant(s, kind, tok->lineno, tok->col_offset, tok->end_lineno, tok->end_col_offset, p->arena);
}
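/* Kind example: u"abc" yields (roughly) Constant('abc', kind='u'),
while plain "abc" leaves kind as NULL; 'u' is the only string prefix
preserved in the AST. */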
expr_ty _PyPegen_formatted_value(Parser *p, expr_ty expression, Token *debug, expr_ty conversion,
expr_ty format, int lineno, int col_offset, int end_lineno, int end_col_offset,
PyArena *arena) {
int conversion_val = -1;
if (conversion != NULL) {
assert(conversion->kind == Name_kind);
Py_UCS4 first = PyUnicode_READ_CHAR(conversion->v.Name.id, 0);
if (PyUnicode_GET_LENGTH(conversion->v.Name.id) > 1 ||
!(first == 's' || first == 'r' || first == 'a')) {
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(conversion,
"f-string: invalid conversion character %R: expected 's', 'r', or 'a'",
conversion->v.Name.id);
return NULL;
}
conversion_val = Py_SAFE_DOWNCAST(first, Py_UCS4, int);
}
else if (debug && !format) {
/* If no conversion is specified, use !r for debug expressions */
conversion_val = (int)'r';
}
expr_ty formatted_value = _PyAST_FormattedValue(
expression, conversion_val, format,
lineno, col_offset, end_lineno,
end_col_offset, arena
);
if (debug) {
/* Find the non whitespace token after the "=" */
int debug_end_line, debug_end_offset;
if (conversion) {
debug_end_line = conversion->lineno;
debug_end_offset = conversion->col_offset;
}
else if (format) {
debug_end_line = format->lineno;
debug_end_offset = format->col_offset + 1; // HACK: ??
}
else {
debug_end_line = end_lineno;
debug_end_offset = end_col_offset;
}
expr_ty debug_text = decode_fstring_buffer(p, lineno, col_offset + 1,
debug_end_line, debug_end_offset - 1);
if (!debug_text) {
return NULL;
}
asdl_expr_seq *values = _Py_asdl_expr_seq_new(2, arena);
asdl_seq_SET(values, 0, debug_text);
asdl_seq_SET(values, 1, formatted_value);
return _PyAST_JoinedStr(values, lineno, col_offset, debug_end_line, debug_end_offset, p->arena);
}
else {
return formatted_value;
}
}
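/* Debug-expression sketch: for f"{1+1=}" this returns (roughly)
JoinedStr([Constant('1+1='), FormattedValue(BinOp(...), conversion='r')]),
where the '1+1=' text is recovered from the tokenizer's saved expression
buffer via decode_fstring_buffer(). */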
expr_ty
_PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *strings,
int lineno, int col_offset, int end_lineno,
int end_col_offset, PyArena *arena)
{
Py_ssize_t len = asdl_seq_LEN(strings);
assert(len > 0);
int f_string_found = 0;
int unicode_string_found = 0;
int bytes_found = 0;
Py_ssize_t i = 0;
Py_ssize_t n_flattened_elements = 0;
for (i = 0; i < len; i++) {
expr_ty elem = asdl_seq_GET(strings, i);
if (elem->kind == Constant_kind) {
if (PyBytes_CheckExact(elem->v.Constant.value)) {
bytes_found = 1;
} else {
unicode_string_found = 1;
}
n_flattened_elements++;
} else {
n_flattened_elements += asdl_seq_LEN(elem->v.JoinedStr.values);
f_string_found = 1;
}
}
if ((unicode_string_found || f_string_found) && bytes_found) {
RAISE_SYNTAX_ERROR("cannot mix bytes and nonbytes literals");
return NULL;
}
if (bytes_found) {
PyObject* res = PyBytes_FromString("");
/* Bytes literals never get a kind, but just for consistency
since they are represented as Constant nodes, we'll mirror
the same behavior as unicode strings for determining the
kind. */
PyObject* kind = asdl_seq_GET(strings, 0)->v.Constant.kind;
for (i = 0; i < len; i++) {
expr_ty elem = asdl_seq_GET(strings, i);
PyBytes_Concat(&res, elem->v.Constant.value);
}
if (!res || _PyArena_AddPyObject(arena, res) < 0) {
Py_XDECREF(res);
return NULL;
}
return _PyAST_Constant(res, kind, lineno, col_offset, end_lineno, end_col_offset, p->arena);
}
if (!f_string_found && len == 1) {
return asdl_seq_GET(strings, 0);
}
asdl_expr_seq* flattened = _Py_asdl_expr_seq_new(n_flattened_elements, p->arena);
if (flattened == NULL) {
return NULL;
}
/* build flattened list */
Py_ssize_t current_pos = 0;
Py_ssize_t j = 0;
for (i = 0; i < len; i++) {
expr_ty elem = asdl_seq_GET(strings, i);
if (elem->kind == Constant_kind) {
asdl_seq_SET(flattened, current_pos++, elem);
} else {
for (j = 0; j < asdl_seq_LEN(elem->v.JoinedStr.values); j++) {
expr_ty subvalue = asdl_seq_GET(elem->v.JoinedStr.values, j);
if (subvalue == NULL) {
return NULL;
}
asdl_seq_SET(flattened, current_pos++, subvalue);
}
}
}
/* calculate folded element count */
Py_ssize_t n_elements = 0;
int prev_is_constant = 0;
for (i = 0; i < n_flattened_elements; i++) {
expr_ty elem = asdl_seq_GET(flattened, i);
/* The concatenation of a FormattedValue and an empty Constant should
lead to the FormattedValue itself. Thus, we will not take any empty
constants into account, just as in `_PyPegen_joined_str` */
if (f_string_found && elem->kind == Constant_kind &&
PyUnicode_CheckExact(elem->v.Constant.value) &&
PyUnicode_GET_LENGTH(elem->v.Constant.value) == 0)
continue;
if (!prev_is_constant || elem->kind != Constant_kind) {
n_elements++;
}
prev_is_constant = elem->kind == Constant_kind;
}
asdl_expr_seq* values = _Py_asdl_expr_seq_new(n_elements, p->arena);
if (values == NULL) {
return NULL;
}
/* build folded list */
_PyUnicodeWriter writer;
current_pos = 0;
for (i = 0; i < n_flattened_elements; i++) {
expr_ty elem = asdl_seq_GET(flattened, i);
/* if the current element and the following one are constants,
fold them together with all subsequent constants */
if (elem->kind == Constant_kind) {
if (i + 1 < n_flattened_elements &&
asdl_seq_GET(flattened, i + 1)->kind == Constant_kind) {
expr_ty first_elem = elem;
/* When a string is getting concatenated, the kind of the string
is determined by the first string in the concatenation
sequence.
u"abc" "def" -> u"abcdef"
"abc" u"abc" -> "abcabc" */
PyObject *kind = elem->v.Constant.kind;
_PyUnicodeWriter_Init(&writer);
expr_ty last_elem = elem;
for (j = i; j < n_flattened_elements; j++) {
expr_ty current_elem = asdl_seq_GET(flattened, j);
if (current_elem->kind == Constant_kind) {
if (_PyUnicodeWriter_WriteStr(
&writer, current_elem->v.Constant.value)) {
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
}
last_elem = current_elem;
} else {
break;
}
}
i = j - 1;
PyObject *concat_str = _PyUnicodeWriter_Finish(&writer);
if (concat_str == NULL) {
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
}
if (_PyArena_AddPyObject(p->arena, concat_str) < 0) {
Py_DECREF(concat_str);
return NULL;
}
elem = _PyAST_Constant(concat_str, kind, first_elem->lineno,
first_elem->col_offset,
last_elem->end_lineno,
last_elem->end_col_offset, p->arena);
if (elem == NULL) {
return NULL;
}
}
/* Drop all empty constant strings */
if (f_string_found &&
PyUnicode_CheckExact(elem->v.Constant.value) &&
PyUnicode_GET_LENGTH(elem->v.Constant.value) == 0) {
continue;
}
}
asdl_seq_SET(values, current_pos++, elem);
}
if (!f_string_found) {
assert(n_elements == 1);
expr_ty elem = asdl_seq_GET(values, 0);
assert(elem->kind == Constant_kind);
return elem;
}
assert(current_pos == n_elements);
return _PyAST_JoinedStr(values, lineno, col_offset, end_lineno, end_col_offset, p->arena);
}
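/* Folding sketch: "ab" f"{x}" "cd" "ef" first flattens to
[Constant('ab'), FormattedValue(x), Constant('cd'), Constant('ef')],
and the two trailing constants are then folded by the writer loop above,
giving JoinedStr([Constant('ab'), FormattedValue(x), Constant('cdef')]). */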

Parser/parser.c (generated)

File diff suppressed because it is too large


@ -359,7 +359,7 @@ _PyPegen_expect_token(Parser *p, int type)
}
Token *t = p->tokens[p->mark];
if (t->type != type) {
return NULL;
}
p->mark += 1;
return t;


@ -138,6 +138,7 @@ void* _PyPegen_expect_forced_result(Parser *p, void* result, const char* expecte
Token *_PyPegen_expect_forced_token(Parser *p, int type, const char* expected);
expr_ty _PyPegen_expect_soft_keyword(Parser *p, const char *keyword);
expr_ty _PyPegen_soft_keyword_token(Parser *p);
expr_ty _PyPegen_fstring_middle_token(Parser* p);
Token *_PyPegen_get_last_nonnwhitespace_token(Parser *);
int _PyPegen_fill_token(Parser *p);
expr_ty _PyPegen_name_token(Parser *p);
@ -155,7 +156,7 @@ typedef enum {
int _Pypegen_raise_decode_error(Parser *p);
void _PyPegen_raise_tokenizer_init_error(PyObject *filename);
int _Pypegen_tokenizer_error(Parser *p);
void *_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...);
void *_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...);
void *_PyPegen_raise_error_known_location(Parser *p, PyObject *errtype,
Py_ssize_t lineno, Py_ssize_t col_offset,
Py_ssize_t end_lineno, Py_ssize_t end_col_offset,
@ -175,8 +176,9 @@ RAISE_ERROR_KNOWN_LOCATION(Parser *p, PyObject *errtype,
va_end(va);
return NULL;
}
#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, msg, ##__VA_ARGS__)
#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, msg, ##__VA_ARGS__)
#define RAISE_SYNTAX_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 0, msg, ##__VA_ARGS__)
#define RAISE_INDENTATION_ERROR(msg, ...) _PyPegen_raise_error(p, PyExc_IndentationError, 0, msg, ##__VA_ARGS__)
#define RAISE_SYNTAX_ERROR_ON_NEXT_TOKEN(msg, ...) _PyPegen_raise_error(p, PyExc_SyntaxError, 1, msg, ##__VA_ARGS__)
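/* Usage note: the new use_mark argument makes the error point at
p->tokens[p->mark] (the token about to be consumed) rather than
p->tokens[p->fill - 1] (the last token filled), which lets f-string
rules report errors at the offending token inside the expression part. */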
#define RAISE_SYNTAX_ERROR_KNOWN_RANGE(a, b, msg, ...) \
RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, (a)->lineno, (a)->col_offset, (b)->end_lineno, (b)->end_col_offset, msg, ##__VA_ARGS__)
#define RAISE_SYNTAX_ERROR_KNOWN_LOCATION(a, msg, ...) \
@ -308,6 +310,7 @@ StarEtc *_PyPegen_star_etc(Parser *, arg_ty, asdl_seq *, arg_ty);
arguments_ty _PyPegen_make_arguments(Parser *, asdl_arg_seq *, SlashWithDefault *,
asdl_arg_seq *, asdl_seq *, StarEtc *);
arguments_ty _PyPegen_empty_arguments(Parser *);
expr_ty _PyPegen_formatted_value(Parser *, expr_ty, Token *, expr_ty, expr_ty, int, int, int, int, PyArena *);
AugOperator *_PyPegen_augoperator(Parser*, operator_ty type);
stmt_ty _PyPegen_function_def_decorators(Parser *, asdl_expr_seq *, stmt_ty);
stmt_ty _PyPegen_class_def_decorators(Parser *, asdl_expr_seq *, stmt_ty);
@ -317,12 +320,16 @@ asdl_keyword_seq *_PyPegen_seq_delete_starred_exprs(Parser *, asdl_seq *);
expr_ty _PyPegen_collect_call_seqs(Parser *, asdl_expr_seq *, asdl_seq *,
int lineno, int col_offset, int end_lineno,
int end_col_offset, PyArena *arena);
expr_ty _PyPegen_concatenate_strings(Parser *p, asdl_seq *);
expr_ty _PyPegen_constant_from_token(Parser* p, Token* tok);
expr_ty _PyPegen_constant_from_string(Parser* p, Token* tok);
expr_ty _PyPegen_concatenate_strings(Parser *p, asdl_expr_seq *, int, int, int, int, PyArena *);
expr_ty _PyPegen_FetchRawForm(Parser *p, int, int, int, int);
expr_ty _PyPegen_ensure_imaginary(Parser *p, expr_ty);
expr_ty _PyPegen_ensure_real(Parser *p, expr_ty);
asdl_seq *_PyPegen_join_sequences(Parser *, asdl_seq *, asdl_seq *);
int _PyPegen_check_barry_as_flufl(Parser *, Token *);
int _PyPegen_check_legacy_stmt(Parser *p, expr_ty t);
expr_ty _PyPegen_check_fstring_conversion(Parser *p, Token *, expr_ty t);
mod_ty _PyPegen_make_module(Parser *, asdl_stmt_seq *);
void *_PyPegen_arguments_parsing_error(Parser *, expr_ty);
expr_ty _PyPegen_get_last_comprehension_item(comprehension_ty comprehension);
@ -338,6 +345,9 @@ void *_PyPegen_run_parser(Parser *);
mod_ty _PyPegen_run_parser_from_string(const char *, int, PyObject *, PyCompilerFlags *, PyArena *);
asdl_stmt_seq *_PyPegen_interactive_exit(Parser *);
// TODO: move to the correct place in this file
expr_ty _PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* expr, Token*b);
// Generated function in parse.c - function definition in python.gram
void *_PyPegen_parse(Parser *);


@ -192,7 +192,10 @@ _PyPegen_tokenize_full_source_to_check_for_errors(Parser *p) {
exit:
if (PyErr_Occurred()) {
// If we're in an f-string, we want the syntax error in the expression part
// to propagate, so that tokenizer errors (like expecting '}') that happen afterwards
// do not swallow it.
if (PyErr_Occurred() && p->tok->tok_mode_stack_index <= 0) {
Py_XDECREF(value);
Py_XDECREF(type);
Py_XDECREF(traceback);
@ -205,7 +208,7 @@ exit:
// PARSER ERRORS
void *
_PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
_PyPegen_raise_error(Parser *p, PyObject *errtype, int use_mark, const char *errmsg, ...)
{
if (p->fill == 0) {
va_list va;
@ -214,8 +217,13 @@ _PyPegen_raise_error(Parser *p, PyObject *errtype, const char *errmsg, ...)
va_end(va);
return NULL;
}
Token *t = p->known_err_token != NULL ? p->known_err_token : p->tokens[p->fill - 1];
if (use_mark && p->mark == p->fill && _PyPegen_fill_token(p) < 0) {
p->error_indicator = 1;
return NULL;
}
Token *t = p->known_err_token != NULL
? p->known_err_token
: p->tokens[use_mark ? p->mark : p->fill - 1];
Py_ssize_t col_offset;
Py_ssize_t end_col_offset = -1;
if (t->col_offset == -1) {

File diff suppressed because it is too large


@ -5,42 +5,7 @@
#include <pycore_ast.h>
#include "pegen.h"
#define EXPRLIST_N_CACHED 64
typedef struct {
/* Incrementally build an array of expr_ty, to be used in an
asdl_seq. Cache some small but reasonably sized number of
expr_ty's, and then after that start dynamically allocating,
doubling the number allocated each time. Note that the f-string
f'{0}a{1}' contains 3 expr_ty's: 2 FormattedValue's, and one
Constant for the literal 'a'. So you add expr_ty's about twice as
fast as you add expressions in an f-string. */
Py_ssize_t allocated; /* Number we've allocated. */
Py_ssize_t size; /* Number we've used. */
expr_ty *p; /* Pointer to the memory we're actually
using. Will point to 'data' until we
start dynamically allocating. */
expr_ty data[EXPRLIST_N_CACHED];
} ExprList;
/* The FstringParser is designed to add a mix of strings and
f-strings, and concat them together as needed. Ultimately, it
generates an expr_ty. */
typedef struct {
PyObject *last_str;
ExprList expr_list;
int fmode;
} FstringParser;
void _PyPegen_FstringParser_Init(FstringParser *);
int _PyPegen_parsestr(Parser *, int *, int *, PyObject **,
const char **, Py_ssize_t *, Token *);
int _PyPegen_FstringParser_ConcatFstring(Parser *, FstringParser *, const char **,
const char *, int, int, Token *, Token *,
Token *);
int _PyPegen_FstringParser_ConcatAndDel(FstringParser *, PyObject *);
expr_ty _PyPegen_FstringParser_Finish(Parser *, FstringParser *, Token *, Token *);
void _PyPegen_FstringParser_Dealloc(FstringParser *);
PyObject *_PyPegen_parse_string(Parser *, Token *);
PyObject *_PyPegen_decode_string(Parser *, int, const char *, size_t, Token *);
#endif

Parser/token.c (generated)

@ -60,12 +60,16 @@ const char * const _PyParser_TokenNames[] = {
"RARROW",
"ELLIPSIS",
"COLONEQUAL",
"EXCLAMATION",
"OP",
"AWAIT",
"ASYNC",
"TYPE_IGNORE",
"TYPE_COMMENT",
"SOFT_KEYWORD",
"FSTRING_START",
"FSTRING_MIDDLE",
"FSTRING_END",
"<ERRORTOKEN>",
"<COMMENT>",
"<NL>",
@ -79,6 +83,7 @@ int
_PyToken_OneChar(int c1)
{
switch (c1) {
case '!': return EXCLAMATION;
case '%': return PERCENT;
case '&': return AMPER;
case '(': return LPAR;


@ -43,6 +43,28 @@
tok->lineno++; \
tok->col_offset = 0;
#ifdef Py_DEBUG
static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
assert(tok->tok_mode_stack_index >= 0);
assert(tok->tok_mode_stack_index < MAXLEVEL);
return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
}
static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
assert(tok->tok_mode_stack_index >= 0);
assert(tok->tok_mode_stack_index < MAXLEVEL);
return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
}
static inline int *TOK_GET_BRACKET_MARK(tokenizer_mode* mode) {
assert(mode->bracket_mark_index >= 0);
assert(mode->bracket_mark_index < MAX_EXPR_NESTING);
return &(mode->bracket_mark[mode->bracket_mark_index]);
}
#else
#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
#define TOK_GET_BRACKET_MARK(mode) (&(mode->bracket_mark[mode->bracket_mark_index]))
#endif
/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
@ -98,6 +120,9 @@ tok_new(void)
tok->interactive_underflow = IUNDERFLOW_NORMAL;
tok->str = NULL;
tok->report_warnings = 1;
tok->tok_mode_stack[0] = (tokenizer_mode){.kind = TOK_REGULAR_MODE, .f_string_quote = '\0', .f_string_quote_size = 0};
tok->tok_mode_stack_index = 0;
tok->tok_report_warnings = 1;
#ifdef Py_DEBUG
tok->debug = _Py_GetConfig()->parser_debug;
#endif
@ -346,6 +371,92 @@ tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
}
/* Traverse and update all f-string buffers with the value */
static void
update_fstring_buffers(struct tok_state *tok, char value, int regular, int multiline)
{
int index;
tokenizer_mode *mode;
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
mode = &(tok->tok_mode_stack[index]);
if (regular && mode->f_string_start != NULL) {
mode->f_string_start += value;
}
if (multiline && mode->f_string_multi_line_start != NULL) {
mode->f_string_multi_line_start += value;
}
}
}
static int
update_fstring_expr(struct tok_state *tok, char cur)
{
assert(tok->cur != NULL);
Py_ssize_t size = strlen(tok->cur);
tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
switch (cur) {
case '{':
if (tok_mode->last_expr_buffer != NULL) {
PyMem_Free(tok_mode->last_expr_buffer);
}
tok_mode->last_expr_buffer = PyMem_Malloc(size);
if (tok_mode->last_expr_buffer == NULL) {
tok->done = E_NOMEM;
return 0;
}
tok_mode->last_expr_size = size;
tok_mode->last_expr_end = -1;
strncpy(tok_mode->last_expr_buffer, tok->cur, size);
break;
case 0:
if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
return 1;
}
char *new_buffer = PyMem_Realloc(
tok_mode->last_expr_buffer,
tok_mode->last_expr_size + size
);
if (new_buffer == NULL) {
PyMem_Free(tok_mode->last_expr_buffer);
tok->done = E_NOMEM;
return 0;
}
tok_mode->last_expr_buffer = new_buffer;
strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
tok_mode->last_expr_size += size;
break;
case '}':
case '!':
case ':':
if (tok_mode->last_expr_end == -1) {
tok_mode->last_expr_end = strlen(tok->start);
}
break;
}
return 1;
}
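/* Buffer lifecycle sketch: while tokenizing f"{a + b:>10}", the '{'
case snapshots the remaining input into last_expr_buffer, the 0
(line refill) case appends freshly read input for expressions that
span multiple lines, and ':', '!' or '}' record last_expr_end so that
only the "a + b" part is later decoded for the debug text. */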
static void
free_fstring_expressions(struct tok_state *tok)
{
int index;
tokenizer_mode *mode;
for (index = tok->tok_mode_stack_index; index >= 0; --index) {
mode = &(tok->tok_mode_stack[index]);
if (mode->last_expr_buffer != NULL) {
PyMem_Free(mode->last_expr_buffer);
mode->last_expr_buffer = NULL;
mode->last_expr_size = 0;
mode->last_expr_end = -1;
}
}
}
/* Read a line of text from TOK into S, using the stream in TOK.
Return NULL on failure, else S.
@ -372,6 +483,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
update_fstring_buffers(tok, -*tok->buf, /*regular=*/1, /*multiline=*/1);
newbuf = (char *)PyMem_Realloc(newbuf, newsize);
if (newbuf == NULL) {
tok->done = E_NOMEM;
@ -384,6 +496,7 @@ tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
tok->start = start < 0 ? NULL : tok->buf + start;
tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
update_fstring_buffers(tok, *tok->buf, /*regular=*/1, /*multiline=*/1);
}
return 1;
}
@ -838,6 +951,7 @@ _PyTokenizer_Free(struct tok_state *tok)
if (tok->interactive_src_start != NULL) {
PyMem_Free(tok->interactive_src_start);
}
free_fstring_expressions(tok);
PyMem_Free(tok);
}
@ -854,6 +968,9 @@ tok_readline_raw(struct tok_state *tok)
if (line == NULL) {
return 1;
}
if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
return 0;
}
if (tok->fp_interactive &&
tok_concatenate_interactive_new_line(tok, line) == -1) {
return 0;
@ -941,6 +1058,7 @@ tok_underflow_interactive(struct tok_state *tok) {
}
else if (tok->start != NULL) {
Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
update_fstring_buffers(tok, -*tok->buf, /*regular=*/0, /*multiline=*/1);
size_t size = strlen(newtok);
ADVANCE_LINENO();
if (!tok_reserve_buf(tok, size + 1)) {
@ -953,6 +1071,7 @@ tok_underflow_interactive(struct tok_state *tok) {
PyMem_Free(newtok);
tok->inp += size;
tok->multi_line_start = tok->buf + cur_multi_line_start;
update_fstring_buffers(tok, *tok->buf, /*regular=*/0, /*multiline=*/1);
}
else {
ADVANCE_LINENO();
@ -969,6 +1088,10 @@ tok_underflow_interactive(struct tok_state *tok) {
}
return 0;
}
if (tok->tok_mode_stack_index && !update_fstring_expr(tok, 0)) {
return 0;
}
return 1;
}
@ -1073,7 +1196,7 @@ tok_nextc(struct tok_state *tok)
return Py_CHARMASK(*tok->cur++); /* Fast path */
}
if (tok->done != E_OK) {
return EOF;
}
if (tok->fp == NULL) {
rc = tok_underflow_string(tok);
@ -1115,7 +1238,7 @@ tok_backup(struct tok_state *tok, int c)
if (--tok->cur < tok->buf) {
Py_FatalError("tokenizer beginning of buffer");
}
if ((int)(unsigned char)*tok->cur != c) {
if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) {
Py_FatalError("tok_backup: wrong character");
}
tok->col_offset--;
@ -1172,6 +1295,7 @@ error:
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
// These errors are cleaned up on startup. TODO: fix it.
va_list vargs;
va_start(vargs, format);
int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
@ -1234,6 +1358,41 @@ error:
return -1;
}
static int
warn_invalid_escape_sequence(struct tok_state *tok, int first_invalid_escape_char)
{
if (!tok->tok_report_warnings) {
return 0;
}
PyObject *msg = PyUnicode_FromFormat(
"invalid escape sequence '\\%c'",
(char) first_invalid_escape_char
);
if (msg == NULL) {
return -1;
}
if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, tok->filename,
tok->lineno, NULL, NULL) < 0) {
Py_DECREF(msg);
if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
/* Replace the DeprecationWarning exception with a SyntaxError
to get a more accurate error report */
PyErr_Clear();
return syntaxerror(tok, "invalid escape sequence '\\%c'", (char) first_invalid_escape_char);
}
return -1;
}
Py_DECREF(msg);
return 0;
}
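/* Example: a non-raw f"\d" reaches this helper with
first_invalid_escape_char == 'd' and emits a DeprecationWarning
("invalid escape sequence '\d'"); if warnings are configured as errors,
it is converted into a SyntaxError for a more accurate report. */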
static int
lookahead(struct tok_state *tok, const char *test)
{
@ -1389,7 +1548,6 @@ tok_decimal_tail(struct tok_state *tok)
return c;
}
/* Get next token, after space stripping etc. */
static inline int
tok_continuation_line(struct tok_state *tok) {
@ -1427,7 +1585,12 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
{
assert((start == NULL && end == NULL) || (start != NULL && end != NULL));
token->level = tok->level;
token->lineno = type == STRING ? tok->first_lineno : tok->lineno;
if (ISSTRINGLIT(type)) {
token->lineno = tok->first_lineno;
}
else {
token->lineno = tok->lineno;
}
token->end_lineno = tok->lineno;
token->col_offset = token->end_col_offset = -1;
token->start = start;
@ -1441,7 +1604,7 @@ token_setup(struct tok_state *tok, struct token *token, int type, const char *st
}
static int
tok_get(struct tok_state *tok, struct token *token)
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
int c;
int blankline, nonascii;
@ -1602,6 +1765,11 @@ tok_get(struct tok_state *tok, struct token *token)
/* Skip comment, unless it's a type comment */
if (c == '#') {
if (tok->tok_mode_stack_index > 0) {
return MAKE_TOKEN(syntaxerror(tok, "f-string expression part cannot include '#'"));
}
const char *prefix, *p, *type_start;
int current_starting_col_offset;
@ -1703,6 +1871,9 @@ tok_get(struct tok_state *tok, struct token *token)
}
c = tok_nextc(tok);
if (c == '"' || c == '\'') {
if (saw_f) {
goto f_string_quote;
}
goto letter_quote;
}
}
@ -1748,7 +1919,9 @@ tok_get(struct tok_state *tok, struct token *token)
int ahead_tok_kind;
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
ahead_tok_kind = tok_get(&ahead_tok, &ahead_token);
ahead_tok_kind = tok_get_normal_mode(&ahead_tok,
current_tok,
&ahead_token);
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
@ -2003,6 +2176,67 @@ tok_get(struct tok_state *tok, struct token *token)
return MAKE_TOKEN(NUMBER);
}
f_string_quote:
if (((tolower(*tok->start) == 'f' || tolower(*tok->start) == 'r') && (c == '\'' || c == '"'))) {
int quote = c;
int quote_size = 1; /* 1 or 3 */
/* Nodes of type STRING, especially multi-line strings,
must be handled differently in order to get both
the starting line number and the column offset right.
(cf. issue 16806) */
tok->first_lineno = tok->lineno;
tok->multi_line_start = tok->line_start;
/* Find the quote size and start of string */
int after_quote = tok_nextc(tok);
if (after_quote == quote) {
int after_after_quote = tok_nextc(tok);
if (after_after_quote == quote) {
quote_size = 3;
}
else {
// TODO: Check this
tok_backup(tok, after_after_quote);
tok_backup(tok, after_quote);
}
}
if (after_quote != quote) {
tok_backup(tok, after_quote);
}
p_start = tok->start;
p_end = tok->cur;
tokenizer_mode *current_tok = TOK_NEXT_MODE(tok);
current_tok->kind = TOK_FSTRING_MODE;
current_tok->f_string_quote = quote;
current_tok->f_string_quote_size = quote_size;
current_tok->f_string_start = tok->start;
current_tok->f_string_multi_line_start = tok->line_start;
current_tok->last_expr_buffer = NULL;
current_tok->last_expr_size = 0;
current_tok->last_expr_end = -1;
switch (*tok->start) {
case 'F':
case 'f':
current_tok->f_string_raw = tolower(*(tok->start + 1)) == 'r';
break;
case 'R':
case 'r':
current_tok->f_string_raw = 1;
break;
default:
Py_UNREACHABLE();
}
current_tok->bracket_stack = 0;
current_tok->bracket_mark[0] = 0;
current_tok->bracket_mark_index = -1;
return MAKE_TOKEN(FSTRING_START);
}
letter_quote:
/* String */
if (c == '\'' || c == '"') {
@ -2047,6 +2281,20 @@ tok_get(struct tok_state *tok, struct token *token)
tok->line_start = tok->multi_line_start;
int start = tok->lineno;
tok->lineno = tok->first_lineno;
if (tok->tok_mode_stack_index > 0) {
/* When we are in an f-string, before raising the
* unterminated string literal error, check whether
* the initial quote matches the f-string's quote;
* if it does, this must be a missing '}' token,
* so raise the proper error */
tokenizer_mode *current_tok = TOK_GET_MODE(tok);
if (current_tok->f_string_quote == quote &&
current_tok->f_string_quote_size == quote_size) {
return MAKE_TOKEN(syntaxerror(tok, "f-string: expecting '}'", start));
}
}
if (quote_size == 3) {
syntaxerror(tok, "unterminated triple-quoted string literal"
" (detected at line %d)", start);
@ -2089,6 +2337,27 @@ tok_get(struct tok_state *tok, struct token *token)
goto again; /* Read next line */
}
/* Punctuation character */
int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
if (is_punctuation && tok->tok_mode_stack_index > 0 && current_tok->bracket_mark_index >= 0) {
int mark = *TOK_GET_BRACKET_MARK(current_tok);
/* This code block is executed before bracket_stack is incremented
* by the `{` case, so to make sure we are at nesting level 0 we need
* to adjust it manually */
int cursor = current_tok->bracket_stack - (c != '{');
if (cursor == 0 && !update_fstring_expr(tok, c)) {
return MAKE_TOKEN(ENDMARKER);
}
if (c == ':' && cursor == mark) {
current_tok->kind = TOK_FSTRING_MODE;
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(_PyToken_OneChar(c));
}
}
/* Check for two-character token */
{
int c2 = tok_nextc(tok);
@ -2121,11 +2390,18 @@ tok_get(struct tok_state *tok, struct token *token)
tok->parenlinenostack[tok->level] = tok->lineno;
tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
tok->level++;
if (tok->tok_mode_stack_index > 0) {
current_tok->bracket_stack++;
}
break;
case ')':
case ']':
case '}':
if (!tok->level) {
if (tok->tok_mode_stack_index > 0 && !current_tok->bracket_stack && c == '}') {
return MAKE_TOKEN(syntaxerror(tok, "f-string: single '}' is not allowed"));
}
return MAKE_TOKEN(syntaxerror(tok, "unmatched '%c'", c));
}
tok->level--;
@ -2134,6 +2410,18 @@ tok_get(struct tok_state *tok, struct token *token)
(opening == '[' && c == ']') ||
(opening == '{' && c == '}')))
{
/* If the opening bracket belongs to an f-string's expression
part (e.g. f"{)}") and the closing bracket is an arbitrary
nested expression, then instead of matching it with a different
syntactical construct, we throw an unmatched-parentheses
error. */
if (tok->tok_mode_stack_index > 0 && opening == '{') {
assert(current_tok->bracket_stack >= 0);
int previous_bracket = current_tok->bracket_stack - 1;
if (previous_bracket == *TOK_GET_BRACKET_MARK(current_tok)) {
return MAKE_TOKEN(syntaxerror(tok, "f-string: unmatched '%c'", c));
}
}
if (tok->parenlinenostack[tok->level] != tok->lineno) {
return MAKE_TOKEN(syntaxerror(tok,
"closing parenthesis '%c' does not match "
@ -2147,6 +2435,14 @@ tok_get(struct tok_state *tok, struct token *token)
c, opening));
}
}
if (tok->tok_mode_stack_index > 0) {
current_tok->bracket_stack--;
if (c == '}' && current_tok->bracket_stack == *TOK_GET_BRACKET_MARK(current_tok)) {
current_tok->bracket_mark_index--;
current_tok->kind = TOK_FSTRING_MODE;
}
}
break;
}
@ -2162,6 +2458,187 @@ tok_get(struct tok_state *tok, struct token *token)
return MAKE_TOKEN(_PyToken_OneChar(c));
}
static int
tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
const char *p_start = NULL;
const char *p_end = NULL;
int end_quote_size = 0;
int unicode_escape = 0;
tok->start = tok->cur;
tok->first_lineno = tok->lineno;
tok->starting_col_offset = tok->col_offset;
// If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
// before it.
int start_char = tok_nextc(tok);
int peek1 = tok_nextc(tok);
tok_backup(tok, peek1);
tok_backup(tok, start_char);
if ((start_char == '{' && peek1 != '{') || (start_char == '}' && peek1 != '}')) {
if (start_char == '{') {
current_tok->bracket_mark_index++;
if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) {
return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
}
*TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack;
}
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
return tok_get_normal_mode(tok, current_tok, token);
}
// Check if we are at the end of the string
for (int i = 0; i < current_tok->f_string_quote_size; i++) {
int quote = tok_nextc(tok);
if (quote != current_tok->f_string_quote) {
tok_backup(tok, quote);
goto f_string_middle;
}
}
if (current_tok->last_expr_buffer != NULL) {
PyMem_Free(current_tok->last_expr_buffer);
current_tok->last_expr_buffer = NULL;
current_tok->last_expr_size = 0;
current_tok->last_expr_end = -1;
}
p_start = tok->start;
p_end = tok->cur;
tok->tok_mode_stack_index--;
return MAKE_TOKEN(FSTRING_END);
f_string_middle:
while (end_quote_size != current_tok->f_string_quote_size) {
int c = tok_nextc(tok);
if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) {
assert(tok->multi_line_start != NULL);
// shift the tok_state's location into
// the start of string, and report the error
// from the initial quote character
tok->cur = (char *)current_tok->f_string_start;
tok->cur++;
tok->line_start = current_tok->f_string_multi_line_start;
int start = tok->lineno;
tok->lineno = tok->first_lineno;
if (current_tok->f_string_quote_size == 3) {
return MAKE_TOKEN(syntaxerror(tok,
"unterminated triple-quoted f-string literal"
" (detected at line %d)", start));
}
else {
return MAKE_TOKEN(syntaxerror(tok,
"unterminated f-string literal (detected at"
" line %d)", start));
}
}
if (c == current_tok->f_string_quote) {
end_quote_size += 1;
continue;
} else {
end_quote_size = 0;
}
int in_format_spec = current_tok->last_expr_end != -1 && current_tok->bracket_mark_index >= 0;
if (c == '{') {
int peek = tok_nextc(tok);
if (peek != '{' || in_format_spec) {
tok_backup(tok, peek);
tok_backup(tok, c);
current_tok->bracket_mark_index++;
if (current_tok->bracket_mark_index >= MAX_EXPR_NESTING) {
return MAKE_TOKEN(syntaxerror(tok, "f-string: expressions nested too deeply"));
}
*TOK_GET_BRACKET_MARK(current_tok) = current_tok->bracket_stack;
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
p_start = tok->start;
p_end = tok->cur;
} else {
p_start = tok->start;
p_end = tok->cur - 1;
}
return MAKE_TOKEN(FSTRING_MIDDLE);
} else if (c == '}') {
if (unicode_escape) {
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(FSTRING_MIDDLE);
}
int peek = tok_nextc(tok);
// The tokenizer can only be in the format spec if we have already completed the expression
// scanning (indicated by the end of the expression being set) and we are not at the top level
// of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
// brackets, we can bypass it here.
if (peek == '}' && !in_format_spec) {
p_start = tok->start;
p_end = tok->cur - 1;
} else {
tok_backup(tok, peek);
tok_backup(tok, c);
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
p_start = tok->start;
p_end = tok->cur;
}
return MAKE_TOKEN(FSTRING_MIDDLE);
} else if (c == '\\') {
int peek = tok_nextc(tok);
// Special case when the backslash is right before a curly
// brace. We have to back it up and return control to the
// loop for the next iteration.
if (peek == '{' || peek == '}') {
if (!current_tok->f_string_raw) {
if (warn_invalid_escape_sequence(tok, peek)) {
return MAKE_TOKEN(ERRORTOKEN);
}
}
tok_backup(tok, peek);
continue;
}
if (!current_tok->f_string_raw) {
if (peek == 'N') {
/* Handle named unicode escapes (\N{BULLET}) */
peek = tok_nextc(tok);
if (peek == '{') {
unicode_escape = 1;
} else {
tok_backup(tok, peek);
}
}
} /* else {
skip the escaped character
}*/
}
}
// Backup the f-string quotes to emit a final FSTRING_MIDDLE and
// add the quotes to the FSTRING_END in the next tokenizer iteration.
for (int i = 0; i < current_tok->f_string_quote_size; i++) {
tok_backup(tok, current_tok->f_string_quote);
}
p_start = tok->start;
p_end = tok->cur;
return MAKE_TOKEN(FSTRING_MIDDLE);
}
static int
tok_get(struct tok_state *tok, struct token *token)
{
tokenizer_mode *current_tok = TOK_GET_MODE(tok);
if (current_tok->kind == TOK_REGULAR_MODE) {
return tok_get_normal_mode(tok, current_tok, token);
} else {
return tok_get_fstring_mode(tok, current_tok, token);
}
}
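/* Resulting token stream sketch: with the mode dispatch above, f"a{b}c"
now tokenizes roughly as
FSTRING_START 'f"'  FSTRING_MIDDLE 'a'  LBRACE '{'  NAME 'b'
RBRACE '}'  FSTRING_MIDDLE 'c'  FSTRING_END '"'
instead of the single STRING token produced before this change. */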
int
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{


@ -33,6 +33,31 @@ struct token {
const char *start, *end;
};
enum tokenizer_mode_kind_t {
TOK_REGULAR_MODE,
TOK_FSTRING_MODE,
};
#define MAX_EXPR_NESTING 3
typedef struct _tokenizer_mode {
enum tokenizer_mode_kind_t kind;
int bracket_stack;
int bracket_mark[MAX_EXPR_NESTING];
int bracket_mark_index;
char f_string_quote;
int f_string_quote_size;
int f_string_raw;
const char* f_string_start;
const char* f_string_multi_line_start;
Py_ssize_t last_expr_size;
Py_ssize_t last_expr_end;
char* last_expr_buffer;
} tokenizer_mode;
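/* Mode-stack sketch: each (possibly nested) f-string pushes one
tokenizer_mode, so while tokenizing f"outer {f'inner {x}'}" the stack
holds the regular mode plus two TOK_FSTRING_MODE entries, and
bracket_mark[] records, per mode, the bracket_stack depth at which each
'{' opened an expression part; hence MAX_EXPR_NESTING limits nested
expression parts within one f-string, not nested f-strings. */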
/* Tokenizer state */
struct tok_state {
/* Input state; buf <= cur <= inp <= end */
@ -93,6 +118,10 @@ struct tok_state {
/* How to proceed when asked for a new token in interactive mode */
enum interactive_underflow_t interactive_underflow;
int report_warnings;
// TODO: Factor this into its own thing
tokenizer_mode tok_mode_stack[MAXLEVEL];
int tok_mode_stack_index;
int tok_report_warnings;
#ifdef Py_DEBUG
int debug;
#endif