Issue #24619: New approach for tokenizing async/await.

This commit fixes how one-line async-defs and defs are tracked
by the tokenizer.  It makes it possible to correctly parse invalid code such
as:

>>> async def f():
...     def g(): pass
...     async = 10

and valid code such as:

>>> async def f():
...     async def g(): pass
...     await z

As a consequence, it is now possible to have one-line
'async def foo(): await ..' functions:

>>> async def foo(): return await bar()
This commit is contained in:
Yury Selivanov 2015-07-22 13:33:45 +03:00
parent 80acc3ebbc
commit 8fb307cd65
13 changed files with 343 additions and 69 deletions

View file

@ -685,9 +685,7 @@ Execution of Python coroutines can be suspended and resumed at many points
(see :term:`coroutine`). In the body of a coroutine, any ``await`` and (see :term:`coroutine`). In the body of a coroutine, any ``await`` and
``async`` identifiers become reserved keywords; :keyword:`await` expressions, ``async`` identifiers become reserved keywords; :keyword:`await` expressions,
:keyword:`async for` and :keyword:`async with` can only be used in :keyword:`async for` and :keyword:`async with` can only be used in
coroutine bodies. However, to simplify the parser, these keywords cannot coroutine bodies.
be used on the same line as a function or coroutine (:keyword:`def`
statement) header.
Functions defined with ``async def`` syntax are always coroutine functions, Functions defined with ``async def`` syntax are always coroutine functions,
even if they do not contain ``await`` or ``async`` keywords. even if they do not contain ``await`` or ``async`` keywords.

View file

@ -369,6 +369,7 @@ def generate_tokens(readline):
# 'stashed' and 'ctx' are used for async/await parsing # 'stashed' and 'ctx' are used for async/await parsing
stashed = None stashed = None
ctx = [('sync', 0)] ctx = [('sync', 0)]
in_async = 0
while 1: # loop over lines in stream while 1: # loop over lines in stream
try: try:
@ -436,6 +437,14 @@ def generate_tokens(readline):
"unindent does not match any outer indentation level", "unindent does not match any outer indentation level",
("<tokenize>", lnum, pos, line)) ("<tokenize>", lnum, pos, line))
indents = indents[:-1] indents = indents[:-1]
cur_indent = indents[-1]
while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
if ctx[-1][0] == 'async':
in_async -= 1
assert in_async >= 0
ctx.pop()
yield (DEDENT, '', (lnum, pos), (lnum, pos), line) yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
else: # continued statement else: # continued statement
@ -499,7 +508,7 @@ def generate_tokens(readline):
yield (STRING, token, spos, epos, line) yield (STRING, token, spos, epos, line)
elif initial in namechars: # ordinary name elif initial in namechars: # ordinary name
if token in ('async', 'await'): if token in ('async', 'await'):
if ctx[-1][0] == 'async' and ctx[-1][1] < indents[-1]: if in_async:
yield (ASYNC if token == 'async' else AWAIT, yield (ASYNC if token == 'async' else AWAIT,
token, spos, epos, line) token, spos, epos, line)
continue continue
@ -515,6 +524,7 @@ def generate_tokens(readline):
and stashed[1] == 'async'): and stashed[1] == 'async'):
ctx.append(('async', indents[-1])) ctx.append(('async', indents[-1]))
in_async += 1
yield (ASYNC, stashed[1], yield (ASYNC, stashed[1],
stashed[2], stashed[3], stashed[2], stashed[3],

View file

@ -1,3 +1,2 @@
async def foo(): async def foo(a=await something()):
def foo(a=await something()): pass
pass

View file

@ -1,3 +1,2 @@
async def foo(): async def foo(a:await something()):
def foo(a:await something()): pass
pass

View file

@ -1,2 +1,2 @@
async def foo(): async def foo():
async def foo(): await something() await

View file

@ -1,2 +0,0 @@
async def foo():
await

View file

@ -67,11 +67,11 @@ def silence_coro_gc():
class AsyncBadSyntaxTest(unittest.TestCase): class AsyncBadSyntaxTest(unittest.TestCase):
def test_badsyntax_1(self): def test_badsyntax_1(self):
with self.assertRaisesRegex(SyntaxError, 'invalid syntax'): with self.assertRaisesRegex(SyntaxError, "'await' outside"):
import test.badsyntax_async1 import test.badsyntax_async1
def test_badsyntax_2(self): def test_badsyntax_2(self):
with self.assertRaisesRegex(SyntaxError, 'invalid syntax'): with self.assertRaisesRegex(SyntaxError, "'await' outside"):
import test.badsyntax_async2 import test.badsyntax_async2
def test_badsyntax_3(self): def test_badsyntax_3(self):
@ -103,10 +103,6 @@ class AsyncBadSyntaxTest(unittest.TestCase):
import test.badsyntax_async8 import test.badsyntax_async8
def test_badsyntax_9(self): def test_badsyntax_9(self):
with self.assertRaisesRegex(SyntaxError, 'invalid syntax'):
import test.badsyntax_async9
def test_badsyntax_10(self):
ns = {} ns = {}
for comp in {'(await a for a in b)', for comp in {'(await a for a in b)',
'[await a for a in b]', '[await a for a in b]',
@ -116,6 +112,221 @@ class AsyncBadSyntaxTest(unittest.TestCase):
with self.assertRaisesRegex(SyntaxError, 'await.*in comprehen'): with self.assertRaisesRegex(SyntaxError, 'await.*in comprehen'):
exec('async def f():\n\t{}'.format(comp), ns, ns) exec('async def f():\n\t{}'.format(comp), ns, ns)
def test_badsyntax_10(self):
# Tests for issue 24619
samples = [
"""async def foo():
def bar(): pass
await = 1
""",
"""async def foo():
def bar(): pass
await = 1
""",
"""async def foo():
def bar(): pass
if 1:
await = 1
""",
"""def foo():
async def bar(): pass
if 1:
await a
""",
"""def foo():
async def bar(): pass
await a
""",
"""def foo():
def baz(): pass
async def bar(): pass
await a
""",
"""def foo():
def baz(): pass
# 456
async def bar(): pass
# 123
await a
""",
"""async def foo():
def baz(): pass
# 456
async def bar(): pass
# 123
await = 2
""",
"""def foo():
def baz(): pass
async def bar(): pass
await a
""",
"""async def foo():
def baz(): pass
async def bar(): pass
await = 2
""",
"""async def foo():
def async(): pass
""",
"""async def foo():
def await(): pass
""",
"""async def foo():
def bar():
await
""",
"""async def foo():
return lambda async: await
""",
"""async def foo():
return lambda a: await
""",
"""async def foo(a: await b):
pass
""",
"""def baz():
async def foo(a: await b):
pass
""",
"""async def foo(async):
pass
""",
"""async def foo():
def bar():
def baz():
async = 1
""",
"""async def foo():
def bar():
def baz():
pass
async = 1
""",
"""def foo():
async def bar():
async def baz():
pass
def baz():
42
async = 1
""",
"""async def foo():
def bar():
def baz():
pass\nawait foo()
""",
"""def foo():
def bar():
async def baz():
pass\nawait foo()
""",
"""async def foo(await):
pass
""",
"""def foo():
async def bar(): pass
await a
""",
"""def foo():
async def bar():
pass\nawait a
"""]
ns = {}
for code in samples:
with self.subTest(code=code), self.assertRaises(SyntaxError):
exec(code, ns, ns)
def test_goodsyntax_1(self):
# Tests for issue 24619
def foo(await):
async def foo(): pass
async def foo():
pass
return await + 1
self.assertEqual(foo(10), 11)
def foo(await):
async def foo(): pass
async def foo(): pass
return await + 2
self.assertEqual(foo(20), 22)
def foo(await):
async def foo(): pass
async def foo(): pass
return await + 2
self.assertEqual(foo(20), 22)
def foo(await):
"""spam"""
async def foo(): \
pass
# 123
async def foo(): pass
# 456
return await + 2
self.assertEqual(foo(20), 22)
def foo(await):
def foo(): pass
def foo(): pass
async def bar(): return await_
await_ = await
try:
bar().send(None)
except StopIteration as ex:
return ex.args[0]
self.assertEqual(foo(42), 42)
async def f():
async def g(): pass
await z
self.assertTrue(inspect.iscoroutinefunction(f))
class TokenizerRegrTest(unittest.TestCase): class TokenizerRegrTest(unittest.TestCase):
@ -461,8 +672,7 @@ class CoroutineTest(unittest.TestCase):
class Awaitable: class Awaitable:
pass pass
async def foo(): async def foo(): return await Awaitable()
return (await Awaitable())
with self.assertRaisesRegex( with self.assertRaisesRegex(
TypeError, "object Awaitable can't be used in 'await' expression"): TypeError, "object Awaitable can't be used in 'await' expression"):

View file

@ -1051,10 +1051,7 @@ class GrammarTests(unittest.TestCase):
async def test(): async def test():
def sum(): def sum():
async = 1 pass
await = 41
return async + await
if 1: if 1:
await someobj() await someobj()

View file

@ -786,12 +786,12 @@ Async/await extension:
NAME 'def' (2, 2) (2, 5) NAME 'def' (2, 2) (2, 5)
NAME 'foo' (2, 6) (2, 9) NAME 'foo' (2, 6) (2, 9)
OP '(' (2, 9) (2, 10) OP '(' (2, 9) (2, 10)
NAME 'await' (2, 10) (2, 15) AWAIT 'await' (2, 10) (2, 15)
OP ')' (2, 15) (2, 16) OP ')' (2, 15) (2, 16)
OP ':' (2, 16) (2, 17) OP ':' (2, 16) (2, 17)
NEWLINE '\\n' (2, 17) (2, 18) NEWLINE '\\n' (2, 17) (2, 18)
INDENT ' ' (3, 0) (3, 4) INDENT ' ' (3, 0) (3, 4)
NAME 'await' (3, 4) (3, 9) AWAIT 'await' (3, 4) (3, 9)
OP '=' (3, 10) (3, 11) OP '=' (3, 10) (3, 11)
NUMBER '1' (3, 12) (3, 13) NUMBER '1' (3, 12) (3, 13)
NEWLINE '\\n' (3, 13) (3, 14) NEWLINE '\\n' (3, 13) (3, 14)
@ -829,6 +829,17 @@ Async/await extension:
OP ':' (2, 18) (2, 19) OP ':' (2, 18) (2, 19)
NAME 'pass' (2, 20) (2, 24) NAME 'pass' (2, 20) (2, 24)
DEDENT '' (3, 0) (3, 0) DEDENT '' (3, 0) (3, 0)
>>> dump_tokens('''async def foo(async): await''')
ENCODING 'utf-8' (0, 0) (0, 0)
ASYNC 'async' (1, 0) (1, 5)
NAME 'def' (1, 6) (1, 9)
NAME 'foo' (1, 10) (1, 13)
OP '(' (1, 13) (1, 14)
ASYNC 'async' (1, 14) (1, 19)
OP ')' (1, 19) (1, 20)
OP ':' (1, 20) (1, 21)
AWAIT 'await' (1, 22) (1, 27)
""" """
from test import support from test import support

View file

@ -501,6 +501,7 @@ def _tokenize(readline, encoding):
# 'stashed' and 'ctx' are used for async/await parsing # 'stashed' and 'ctx' are used for async/await parsing
stashed = None stashed = None
ctx = [('sync', 0)] ctx = [('sync', 0)]
in_async = 0
if encoding is not None: if encoding is not None:
if encoding == "utf-8-sig": if encoding == "utf-8-sig":
@ -580,6 +581,9 @@ def _tokenize(readline, encoding):
cur_indent = indents[-1] cur_indent = indents[-1]
while len(ctx) > 1 and ctx[-1][1] >= cur_indent: while len(ctx) > 1 and ctx[-1][1] >= cur_indent:
if ctx[-1][0] == 'async':
in_async -= 1
assert in_async >= 0
ctx.pop() ctx.pop()
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
@ -640,7 +644,7 @@ def _tokenize(readline, encoding):
yield TokenInfo(STRING, token, spos, epos, line) yield TokenInfo(STRING, token, spos, epos, line)
elif initial.isidentifier(): # ordinary name elif initial.isidentifier(): # ordinary name
if token in ('async', 'await'): if token in ('async', 'await'):
if ctx[-1][0] == 'async' and ctx[-1][1] < indents[-1]: if in_async:
yield TokenInfo( yield TokenInfo(
ASYNC if token == 'async' else AWAIT, ASYNC if token == 'async' else AWAIT,
token, spos, epos, line) token, spos, epos, line)
@ -657,6 +661,7 @@ def _tokenize(readline, encoding):
and stashed.string == 'async'): and stashed.string == 'async'):
ctx.append(('async', indents[-1])) ctx.append(('async', indents[-1]))
in_async += 1
yield TokenInfo(ASYNC, stashed.string, yield TokenInfo(ASYNC, stashed.string,
stashed.start, stashed.end, stashed.start, stashed.end,

View file

@ -19,6 +19,9 @@ Core and Builtins
- Issue #24407: Fix crash when dict is mutated while being updated. - Issue #24407: Fix crash when dict is mutated while being updated.
- Issue #24619: New approach for tokenizing async/await. As a consequence,
it is now possible to have one-line 'async def foo(): await ..' functions.
Library Library
------- -------

View file

@ -31,6 +31,12 @@
|| c == '_'\ || c == '_'\
|| (c >= 128)) || (c >= 128))
/* The following DEFTYPE* flags are used in 'tok_state->deftypestack',
and should be removed in 3.7, when async/await are regular
keywords. */
#define DEFTYPE_ASYNC 1
#define DEFTYPE_HAS_NL 2
extern char *PyOS_Readline(FILE *, FILE *, const char *); extern char *PyOS_Readline(FILE *, FILE *, const char *);
/* Return malloc'ed string including trailing \n; /* Return malloc'ed string including trailing \n;
empty malloc'ed string for EOF; empty malloc'ed string for EOF;
@ -130,6 +136,8 @@ tok_new(void)
tok->def = 0; tok->def = 0;
tok->defstack[0] = 0; tok->defstack[0] = 0;
tok->deftypestack[0] = 0; tok->deftypestack[0] = 0;
tok->def_async_behind = 0;
tok->def_in_async = 0;
tok->atbol = 1; tok->atbol = 1;
tok->pendin = 0; tok->pendin = 0;
@ -1436,7 +1444,12 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
tok->pendin++; tok->pendin++;
while (tok->def && tok->defstack[tok->def] >= tok->indent) { while (tok->def && tok->defstack[tok->def] >= tok->indent) {
if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
tok->def_in_async--;
assert(tok->def_in_async >= 0);
}
tok->def--; tok->def--;
assert(tok->def >= 0);
} }
return DEDENT; return DEDENT;
@ -1447,6 +1460,22 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
} }
} }
if (!blankline && tok->level == 0
&& tok->def && tok->deftypestack[tok->def] & DEFTYPE_HAS_NL
&& tok->defstack[tok->def] >= tok->indent)
{
/* The top function on the stack did have a NEWLINE
token, but didn't have an INDENT. That means that
it's a one-line function and it should now be removed
from the stack. */
if (tok->deftypestack[tok->def] & DEFTYPE_ASYNC) {
tok->def_in_async--;
assert(tok->def_in_async >= 0);
}
tok->def--;
assert(tok->def >= 0);
}
again: again:
tok->start = NULL; tok->start = NULL;
/* Skip spaces */ /* Skip spaces */
@ -1501,59 +1530,58 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
tok_len = tok->cur - tok->start; tok_len = tok->cur - tok->start;
if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) { if (tok_len == 3 && memcmp(tok->start, "def", 3) == 0) {
if (tok->def && tok->deftypestack[tok->def] == 3) { /* The current token is 'def'. */
tok->deftypestack[tok->def] = 2; if (tok->def + 1 >= MAXINDENT) {
tok->done = E_TOODEEP;
tok->cur = tok->inp;
return ERRORTOKEN;
} }
else if (tok->defstack[tok->def] < tok->indent) {
/* We advance defs stack only when we see "def" *and*
the indentation level was increased relative to the
previous "def". */
if (tok->def + 1 >= MAXINDENT) { /* Advance defs stack. */
tok->done = E_TOODEEP; tok->def++;
tok->cur = tok->inp; tok->defstack[tok->def] = tok->indent;
return ERRORTOKEN;
}
tok->def++; if (tok->def_async_behind) {
tok->defstack[tok->def] = tok->indent; /* The previous token was 'async'. */
tok->deftypestack[tok->def] = 1; tok->def_async_behind = 0;
tok->deftypestack[tok->def] = DEFTYPE_ASYNC;
tok->def_in_async++;
}
else {
/* This is a regular function (not async def). */
tok->deftypestack[tok->def] = 0;
} }
} }
else if (tok_len == 5) { else if (tok_len == 5) {
if (memcmp(tok->start, "async", 5) == 0) { if (memcmp(tok->start, "async", 5) == 0) {
/* The current token is 'async'. */
memcpy(&ahead_tok, tok, sizeof(ahead_tok)); memcpy(&ahead_tok, tok, sizeof(ahead_tok));
/* Try to look ahead one token. */
ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start, ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
&ahead_top_end); &ahead_top_end);
if (ahead_tok_kind == NAME && if (ahead_tok_kind == NAME
ahead_tok.cur - ahead_tok.start == 3 && && ahead_tok.cur - ahead_tok.start == 3
memcmp(ahead_tok.start, "def", 3) == 0) { && memcmp(ahead_tok.start, "def", 3) == 0)
{
if (tok->def + 1 >= MAXINDENT) { /* The next token is going to be 'def', so instead of
tok->done = E_TOODEEP; returning 'async' NAME token, we return ASYNC. */
tok->cur = tok->inp; tok->def_async_behind = 1;
return ERRORTOKEN;
}
tok->def++;
tok->defstack[tok->def] = tok->indent;
tok->deftypestack[tok->def] = 3;
return ASYNC; return ASYNC;
} }
else if (tok->def && tok->deftypestack[tok->def] == 2 else if (tok->def_in_async)
&& tok->defstack[tok->def] < tok->indent) { {
/* We're inside an 'async def' function, so we treat
'async' token as ASYNC, instead of NAME. */
return ASYNC; return ASYNC;
} }
} }
else if (memcmp(tok->start, "await", 5) == 0 else if (memcmp(tok->start, "await", 5) == 0 && tok->def_in_async)
&& tok->def && tok->deftypestack[tok->def] == 2 {
&& tok->defstack[tok->def] < tok->indent) { /* We're inside an 'async def' function, so we treat
'await' token as AWAIT, instead of NAME. */
return AWAIT; return AWAIT;
} }
} }
@ -1569,6 +1597,13 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
*p_start = tok->start; *p_start = tok->start;
*p_end = tok->cur - 1; /* Leave '\n' out of the string */ *p_end = tok->cur - 1; /* Leave '\n' out of the string */
tok->cont_line = 0; tok->cont_line = 0;
if (tok->def) {
/* Mark the top function on the stack that it had
at least one NEWLINE. That will help us to
distinguish one-line functions from functions
with multiple statements. */
tok->deftypestack[tok->def] |= DEFTYPE_HAS_NL;
}
return NEWLINE; return NEWLINE;
} }

View file

@ -66,12 +66,21 @@ struct tok_state {
const char* str; const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */ const char* input; /* Tokenizer's newline translated copy of the string. */
int defstack[MAXINDENT]; /* stack if funcs & indents where they /* `def*` fields are for parsing async/await in a backwards compatible
were defined */ way. They should be removed in 3.7, when they will become
int deftypestack[MAXINDENT]; /* stack of func types regular constants. See PEP 492 for more details. */
(0 not func; 1: "def name"; int defstack[MAXINDENT]; /* Stack of funcs & indents where they
2: "async def name") */ were defined. */
int def; /* Length of stack of func types */ int deftypestack[MAXINDENT]; /* Stack of func flags, see DEFTYPE_*
constants. */
int def; /* Length of stack of func types/flags. */
int def_async_behind; /* 1 if there was an 'async' token before
a 'def' token. */
int def_in_async; /* Counter of how deep 'async def's
are nested. If greater than 0,
we are somewhere in an 'async def'
body, so 'async' and 'await' should
be parsed as keywords.*/
}; };
extern struct tok_state *PyTokenizer_FromString(const char *, int); extern struct tok_state *PyTokenizer_FromString(const char *, int);