Issue #24619: New approach for tokenizing async/await.

This commit fixes how one-line async-defs and defs are tracked
by tokenizer.  It allows to correctly parse invalid code such
as:

>>> async def f():
...     def g(): pass
...     async = 10

and valid code such as:

>>> async def f():
...     async def g(): pass
...     await z

As a consequence, is is now possible to have one-line
'async def foo(): await ..' functions:

>>> async def foo(): return await bar()
This commit is contained in:
Yury Selivanov 2015-07-22 13:33:45 +03:00
parent 80acc3ebbc
commit 8fb307cd65
13 changed files with 343 additions and 69 deletions

View file

@ -31,6 +31,12 @@
|| c == '_'\
|| (c >= 128))
/* The following DEFTYPE* flags are used in 'tok_state->deftypestack',
and should be removed in 3.7, when async/await are regular
keywords. */
#define DEFTYPE_ASYNC 1
#define DEFTYPE_HAS_NL 2
extern char *PyOS_Readline(FILE *, FILE *, const char *);
/* Return malloc'ed string including trailing \n;
empty malloc'ed string for EOF;
@ -130,6 +136,8 @@ tok_new(void)
tok->def = 0;
tok->defstack[0] = 0;
tok->deftypestack[0] = 0;
tok->def_async_behind = 0;
tok->def_in_async = 0;
tok->atbol = 1;
tok->pendin = 0;
@ -1436,7 +1444,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
tok->pendin++;
while (tok->def && tok->defstack[tok->def] >= tok->indent) {
if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
tok->def_in_async--;
assert(tok->def_in_async >= 0);
}
tok->def--;
assert(tok->def >= 0);
}
return DEDENT;
@ -1447,6 +1460,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
}
}
if (!blankline && tok->level == 0
&& tok->def && tok->deftypestack[tok->def] & DEFTYPE_HAS_NL
&& tok->defstack[tok->def] >= tok->indent)
{
/* The top function on the stack did have a NEWLINE
token, but didn't have an INDENT. That means that
it's a one-line function and it should now be removed
from the stack. */
if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
tok->def_in_async--;
assert(tok->def_in_async >= 0);
}
tok->def--;
assert(tok->def >= 0);
}
again:
tok->start = NULL;
/* Skip spaces */
@ -1501,59 +1530,58 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
tok_len = tok->cur - tok->start;
if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) {
if (tok->def && tok->deftypestack[tok->def] == 3) {
tok->deftypestack[tok->def] = 2;
/* The current token is 'def'. */
if (tok->def + 1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
return ERRORTOKEN;
}
else if (tok->defstack[tok->def] < tok->indent) {
/* We advance defs stack only when we see "def" *and*
the indentation level was increased relative to the
previous "def". */
if (tok->def + 1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
return ERRORTOKEN;
}
/* Advance defs stack. */
tok->def++;
tok->defstack[tok->def] = tok->indent;
tok->def++;
tok->defstack[tok->def] = tok->indent;
tok->deftypestack[tok->def] = 1;
if (tok->def_async_behind) {
/* The previous token was 'async'. */
tok->def_async_behind = 0;
tok->deftypestack[tok->def] = DEFTYPE_ASYNC;
tok->def_in_async++;
}
else {
/* This is a regular function (not async def). */
tok->deftypestack[tok->def] = 0;
}
}
else if (tok_len == 5) {
if (memcmp(tok->start, "async", 5) == 0) {
/* The current token is 'async'. */
memcpy(&ahead_tok, tok, sizeof(ahead_tok));
/* Try to look ahead one token. */
ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
&ahead_top_end);
if (ahead_tok_kind == NAME &&
ahead_tok.cur - ahead_tok.start == 3 &&
memcmp(ahead_tok.start, "def", 3) == 0) {
if (tok->def + 1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
return ERRORTOKEN;
}
tok->def++;
tok->defstack[tok->def] = tok->indent;
tok->deftypestack[tok->def] = 3;
if (ahead_tok_kind == NAME
&& ahead_tok.cur - ahead_tok.start == 3
&& memcmp(ahead_tok.start, "def", 3) == 0)
{
/* The next token is going to be 'def', so instead of
returning 'async' NAME token, we return ASYNC. */
tok->def_async_behind = 1;
return ASYNC;
}
else if (tok->def && tok->deftypestack[tok->def] == 2
&& tok->defstack[tok->def] < tok->indent) {
else if (tok->def_in_async)
{
/* We're inside an 'async def' function, so we treat
'async' token as ASYNC, instead of NAME. */
return ASYNC;
}
}
else if (memcmp(tok->start, "await", 5) == 0
&& tok->def && tok->deftypestack[tok->def] == 2
&& tok->defstack[tok->def] < tok->indent) {
else if (memcmp(tok->start, "await", 5) == 0 && tok->def_in_async)
{
/* We're inside an 'async def' function, so we treat
'await' token as AWAIT, instead of NAME. */
return AWAIT;
}
}
@ -1569,6 +1597,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0;
if (tok->def) {
/* Mark the top function on the stack that it had
at least one NEWLINE. That will help us to
distinguish one-line functions from functions
with multiple statements. */
tok->deftypestack[tok->def] |= DEFTYPE_HAS_NL;
}
return NEWLINE;
}

View file

@ -66,12 +66,21 @@ struct tok_state {
const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */
int defstack[MAXINDENT]; /* stack if funcs & indents where they
were defined */
int deftypestack[MAXINDENT]; /* stack of func types
(0 not func; 1: "def name";
2: "async def name") */
int def; /* Length of stack of func types */
/* `def*` fields are for parsing async/await in a backwards compatible
way. They should be removed in 3.7, when they will become
regular constants. See PEP 492 for more details. */
int defstack[MAXINDENT]; /* Stack of funcs & indents where they
were defined. */
int deftypestack[MAXINDENT]; /* Stack of func flags, see DEFTYPE_*
constants. */
int def; /* Length of stack of func types/flags. */
int def_async_behind; /* 1 if there was an 'async' token before
a 'def' token. */
int def_in_async; /* Counter of how deep 'async def's
are nested. If greater than 0,
we are somewhere in an 'async def'
body, so 'async' and 'await' should
be parsed as keywords.*/
};
extern struct tok_state *PyTokenizer_FromString(const char *, int);