bpo-40688: Use the correct parser in the peg_generator scripts (GH-20235)

The scripts in `Tools/peg_generator/scripts` mostly assume that
`ast.parse` and `compile` use the old parser, since this was the
state of things, while we were developing them. They need to be
updated to always use the correct parser. `_peg_parser` is being
extended to support both parsing and compiling with both parsers.
(cherry picked from commit 9645930b5b)

Co-authored-by: Lysandros Nikolaou <lisandrosnik@gmail.com>
This commit is contained in:
Miss Islington (bot) 2020-05-25 13:11:36 -07:00 committed by GitHub
parent 318a18eb88
commit 3c6c86ab77
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 151 additions and 187 deletions

View file

@ -1,60 +1,9 @@
#include <Python.h> #include <Python.h>
#include "pegen_interface.h" #include "pegen_interface.h"
PyObject * static int
_Py_parse_file(PyObject *self, PyObject *args, PyObject *kwds) _mode_str_to_int(char *mode_str)
{ {
static char *keywords[] = {"file", "mode", NULL};
char *filename;
char *mode_str = "exec";
if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|s", keywords, &filename, &mode_str)) {
return NULL;
}
int mode;
if (strcmp(mode_str, "exec") == 0) {
mode = Py_file_input;
}
else if (strcmp(mode_str, "single") == 0) {
mode = Py_single_input;
}
else {
return PyErr_Format(PyExc_ValueError, "mode must be either 'exec' or 'single'");
}
PyArena *arena = PyArena_New();
if (arena == NULL) {
return NULL;
}
PyCompilerFlags flags = _PyCompilerFlags_INIT;
PyObject *result = NULL;
mod_ty res = PyPegen_ASTFromFilename(filename, mode, &flags, arena);
if (res == NULL) {
goto error;
}
result = PyAST_mod2obj(res);
error:
PyArena_Free(arena);
return result;
}
PyObject *
_Py_parse_string(PyObject *self, PyObject *args, PyObject *kwds)
{
static char *keywords[] = {"string", "mode", "oldparser", NULL};
char *the_string;
char *mode_str = "exec";
int oldparser = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|sp", keywords,
&the_string, &mode_str, &oldparser)) {
return NULL;
}
int mode; int mode;
if (strcmp(mode_str, "exec") == 0) { if (strcmp(mode_str, "exec") == 0) {
mode = Py_file_input; mode = Py_file_input;
@ -66,39 +15,119 @@ _Py_parse_string(PyObject *self, PyObject *args, PyObject *kwds)
mode = Py_single_input; mode = Py_single_input;
} }
else { else {
mode = -1;
}
return mode;
}
static mod_ty
_run_parser(char *str, char *filename, int mode, PyCompilerFlags *flags, PyArena *arena, int oldparser)
{
mod_ty mod;
if (!oldparser) {
mod = PyPegen_ASTFromString(str, filename, mode, flags, arena);
}
else {
mod = PyParser_ASTFromString(str, filename, mode, flags, arena);
}
return mod;
}
PyObject *
_Py_compile_string(PyObject *self, PyObject *args, PyObject *kwds)
{
static char *keywords[] = {"string", "filename", "mode", "oldparser", NULL};
char *the_string;
char *filename = "<string>";
char *mode_str = "exec";
int oldparser = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|ssp", keywords,
&the_string, &filename, &mode_str, &oldparser)) {
return NULL;
}
int mode = _mode_str_to_int(mode_str);
if (mode == -1) {
return PyErr_Format(PyExc_ValueError, "mode must be either 'exec' or 'eval' or 'single'"); return PyErr_Format(PyExc_ValueError, "mode must be either 'exec' or 'eval' or 'single'");
} }
PyCompilerFlags flags = _PyCompilerFlags_INIT;
flags.cf_flags = PyCF_IGNORE_COOKIE;
PyArena *arena = PyArena_New(); PyArena *arena = PyArena_New();
if (arena == NULL) { if (arena == NULL) {
return NULL; return NULL;
} }
PyObject *result = NULL; mod_ty mod = _run_parser(the_string, filename, mode, &flags, arena, oldparser);
if (mod == NULL) {
PyArena_Free(arena);
return NULL;
}
PyObject *filename_ob = PyUnicode_DecodeFSDefault(filename);
if (filename_ob == NULL) {
PyArena_Free(arena);
return NULL;
}
PyCodeObject *result = PyAST_CompileObject(mod, filename_ob, &flags, -1, arena);
Py_XDECREF(filename_ob);
PyArena_Free(arena);
return (PyObject *)result;
}
PyObject *
_Py_parse_string(PyObject *self, PyObject *args, PyObject *kwds)
{
static char *keywords[] = {"string", "filename", "mode", "oldparser", NULL};
char *the_string;
char *filename = "<string>";
char *mode_str = "exec";
int oldparser = 0;
if (!PyArg_ParseTupleAndKeywords(args, kwds, "s|ssp", keywords,
&the_string, &filename, &mode_str, &oldparser)) {
return NULL;
}
int mode = _mode_str_to_int(mode_str);
if (mode == -1) {
return PyErr_Format(PyExc_ValueError, "mode must be either 'exec' or 'eval' or 'single'");
}
PyCompilerFlags flags = _PyCompilerFlags_INIT; PyCompilerFlags flags = _PyCompilerFlags_INIT;
flags.cf_flags = PyCF_IGNORE_COOKIE; flags.cf_flags = PyCF_IGNORE_COOKIE;
mod_ty res; PyArena *arena = PyArena_New();
if (oldparser) { if (arena == NULL) {
res = PyParser_ASTFromString(the_string, "<string>", mode, &flags, arena); return NULL;
} }
else {
res = PyPegen_ASTFromString(the_string, "<string>", mode, &flags, arena);
}
if (res == NULL) {
goto error;
}
result = PyAST_mod2obj(res);
error: mod_ty mod = _run_parser(the_string, filename, mode, &flags, arena, oldparser);
if (mod == NULL) {
PyArena_Free(arena);
return NULL;
}
PyObject *result = PyAST_mod2obj(mod);
PyArena_Free(arena); PyArena_Free(arena);
return result; return result;
} }
static PyMethodDef ParseMethods[] = { static PyMethodDef ParseMethods[] = {
{"parse_file", (PyCFunction)(void (*)(void))_Py_parse_file, METH_VARARGS|METH_KEYWORDS, "Parse a file."}, {
{"parse_string", (PyCFunction)(void (*)(void))_Py_parse_string, METH_VARARGS|METH_KEYWORDS,"Parse a string."}, "parse_string",
(PyCFunction)(void (*)(void))_Py_parse_string,
METH_VARARGS|METH_KEYWORDS,
"Parse a string, return an AST."
},
{
"compile_string",
(PyCFunction)(void (*)(void))_Py_compile_string,
METH_VARARGS|METH_KEYWORDS,
"Compile a string, return a code object."
},
{NULL, NULL, 0, NULL} /* Sentinel */ {NULL, NULL, 0, NULL} /* Sentinel */
}; };

View file

@ -69,25 +69,22 @@ stats: peg_extension/parse.c data/xxl.py
time: time_compile time: time_compile
time_compile: venv peg_extension/parse.c data/xxl.py time_compile: venv data/xxl.py
$(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=xxl compile $(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=xxl compile
time_parse: venv peg_extension/parse.c data/xxl.py time_parse: venv data/xxl.py
$(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=xxl parse $(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=xxl parse
time_check: venv peg_extension/parse.c data/xxl.py time_old: time_old_compile
$(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=xxl check
time_stdlib: time_stdlib_compile time_old_compile: venv data/xxl.py
time_stdlib_compile: venv peg_extension/parse.c data/xxl.py
$(VENVPYTHON) scripts/benchmark.py --parser=cpython --target=xxl compile $(VENVPYTHON) scripts/benchmark.py --parser=cpython --target=xxl compile
time_stdlib_parse: venv peg_extension/parse.c data/xxl.py time_old_parse: venv data/xxl.py
$(VENVPYTHON) scripts/benchmark.py --parser=cpython --target=xxl parse $(VENVPYTHON) scripts/benchmark.py --parser=cpython --target=xxl parse
test_local: time_peg_dir: venv
$(PYTHON) scripts/test_parse_directory.py \ $(VENVPYTHON) scripts/test_parse_directory.py \
--grammar-file $(GRAMMAR) \ --grammar-file $(GRAMMAR) \
--tokens-file $(TOKENS) \ --tokens-file $(TOKENS) \
-d $(TESTDIR) \ -d $(TESTDIR) \
@ -96,8 +93,8 @@ test_local:
--exclude "*/failset/**" \ --exclude "*/failset/**" \
--exclude "*/failset/**/*" --exclude "*/failset/**/*"
test_global: $(CPYTHON) time_stdlib: $(CPYTHON) venv
$(PYTHON) scripts/test_parse_directory.py \ $(VENVPYTHON) scripts/test_parse_directory.py \
--grammar-file $(GRAMMAR) \ --grammar-file $(GRAMMAR) \
--tokens-file $(TOKENS) \ --tokens-file $(TOKENS) \
-d $(CPYTHON) \ -d $(CPYTHON) \
@ -113,9 +110,6 @@ mypy: regen-metaparser
format-python: format-python:
black pegen scripts black pegen scripts
bench: venv
$(VENVPYTHON) scripts/benchmark.py --parser=pegen --target=stdlib check
format: format-python format: format-python
find_max_nesting: find_max_nesting:

View file

@ -6,6 +6,8 @@ import sys
import os import os
from time import time from time import time
import _peg_parser
try: try:
import memory_profiler import memory_profiler
except ModuleNotFoundError: except ModuleNotFoundError:
@ -14,8 +16,6 @@ except ModuleNotFoundError:
sys.exit(1) sys.exit(1)
sys.path.insert(0, os.getcwd()) sys.path.insert(0, os.getcwd())
from peg_extension import parse
from pegen.build import build_c_parser_and_generator
from scripts.test_parse_directory import parse_directory from scripts.test_parse_directory import parse_directory
argparser = argparse.ArgumentParser( argparser = argparse.ArgumentParser(
@ -41,9 +41,6 @@ command_compile = subcommands.add_parser(
"compile", help="Benchmark parsing and compiling to bytecode" "compile", help="Benchmark parsing and compiling to bytecode"
) )
command_parse = subcommands.add_parser("parse", help="Benchmark parsing and generating an ast.AST") command_parse = subcommands.add_parser("parse", help="Benchmark parsing and generating an ast.AST")
command_check = subcommands.add_parser(
"check", help="Benchmark parsing and throwing the tree away"
)
def benchmark(func): def benchmark(func):
@ -66,22 +63,20 @@ def benchmark(func):
@benchmark @benchmark
def time_compile(source, parser): def time_compile(source, parser):
if parser == "cpython": if parser == "cpython":
return compile(source, os.path.join("data", "xxl.py"), "exec") return _peg_parser.compile_string(
source,
oldparser=True,
)
else: else:
return parse.parse_string(source, mode=2) return _peg_parser.compile_string(source)
@benchmark @benchmark
def time_parse(source, parser): def time_parse(source, parser):
if parser == "cpython": if parser == "cpython":
return ast.parse(source, os.path.join("data", "xxl.py"), "exec") return _peg_parser.parse_string(source, oldparser=True)
else: else:
return parse.parse_string(source, mode=1) return _peg_parser.parse_string(source)
@benchmark
def time_check(source):
return parse.parse_string(source, mode=0)
def run_benchmark_xxl(subcommand, parser, source): def run_benchmark_xxl(subcommand, parser, source):
@ -89,32 +84,20 @@ def run_benchmark_xxl(subcommand, parser, source):
time_compile(source, parser) time_compile(source, parser)
elif subcommand == "parse": elif subcommand == "parse":
time_parse(source, parser) time_parse(source, parser)
elif subcommand == "check":
time_check(source)
def run_benchmark_stdlib(subcommand, parser): def run_benchmark_stdlib(subcommand, parser):
modes = {"compile": 2, "parse": 1, "check": 0}
extension = None
if parser == "pegen":
extension = build_c_parser_and_generator(
"../../Grammar/python.gram",
"../../Grammar/Tokens",
"peg_extension/parse.c",
compile_extension=True,
skip_actions=False,
)
for _ in range(3): for _ in range(3):
parse_directory( parse_directory(
"../../Lib", "../../Lib",
"../../Grammar/python.gram", "../../Grammar/python.gram",
"../../Grammar/Tokens",
verbose=False, verbose=False,
excluded_files=["*/bad*", "*/lib2to3/tests/data/*",], excluded_files=["*/bad*", "*/lib2to3/tests/data/*",],
skip_actions=False, skip_actions=False,
tree_arg=0, tree_arg=0,
short=True, short=True,
extension=extension, mode=2 if subcommand == "compile" else 1,
mode=modes[subcommand],
parser=parser, parser=parser,
) )
@ -127,8 +110,6 @@ def main():
if subcommand is None: if subcommand is None:
argparser.error("A benchmark to run is required") argparser.error("A benchmark to run is required")
if subcommand == "check" and parser == "cpython":
argparser.error("Cannot use check target with the CPython parser")
if target == "xxl": if target == "xxl":
with open(os.path.join("data", "xxl.py"), "r") as f: with open(os.path.join("data", "xxl.py"), "r") as f:

View file

@ -30,6 +30,8 @@ import os
import sys import sys
import tempfile import tempfile
import _peg_parser
from typing import List from typing import List
sys.path.insert(0, os.getcwd()) sys.path.insert(0, os.getcwd())
@ -72,7 +74,7 @@ def diff_trees(a: ast.AST, b: ast.AST, verbose: bool = False) -> List[str]:
def show_parse(source: str, verbose: bool = False) -> str: def show_parse(source: str, verbose: bool = False) -> str:
tree = ast.parse(source) tree = _peg_parser.parse_string(source, oldparser=True)
return format_tree(tree, verbose).rstrip("\n") return format_tree(tree, verbose).rstrip("\n")
@ -90,17 +92,11 @@ def main() -> None:
sep = " " sep = " "
program = sep.join(args.program) program = sep.join(args.program)
if args.grammar_file: if args.grammar_file:
sys.path.insert(0, os.curdir) tree = _peg_parser.parse_string(program)
from pegen.build import build_parser_and_generator
build_parser_and_generator(args.grammar_file, "peg_parser/parse.c", compile_extension=True)
from pegen.parse import parse_string # type: ignore[import]
tree = parse_string(program, mode=1)
if args.diff: if args.diff:
a = tree a = tree
b = ast.parse(program) b = _peg_parser.parse_string(program, oldparser=True)
diff = diff_trees(a, b, args.verbose) diff = diff_trees(a, b, args.verbose)
if diff: if diff:
for line in diff: for line in diff:
@ -111,8 +107,8 @@ def main() -> None:
print(f"# Parsed using {args.grammar_file}") print(f"# Parsed using {args.grammar_file}")
print(format_tree(tree, args.verbose)) print(format_tree(tree, args.verbose))
else: else:
tree = ast.parse(program) tree = _peg_parser.parse_string(program, oldparser=True)
print("# Parse using ast.parse()") print("# Parse using the old parser")
print(format_tree(tree, args.verbose)) print(format_tree(tree, args.verbose))

View file

@ -6,13 +6,14 @@ import os
import sys import sys
import time import time
import traceback import traceback
import tokenize
import _peg_parser
from glob import glob from glob import glob
from pathlib import PurePath from pathlib import PurePath
from typing import List, Optional, Any from typing import List, Optional, Any
sys.path.insert(0, os.getcwd()) sys.path.insert(0, os.getcwd())
from pegen.build import build_c_parser_and_generator
from pegen.ast_dump import ast_dump from pegen.ast_dump import ast_dump
from pegen.testutil import print_memstats from pegen.testutil import print_memstats
from scripts import show_parse from scripts import show_parse
@ -83,7 +84,7 @@ def compare_trees(
actual_tree: ast.AST, file: str, verbose: bool, include_attributes: bool = False, actual_tree: ast.AST, file: str, verbose: bool, include_attributes: bool = False,
) -> int: ) -> int:
with open(file) as f: with open(file) as f:
expected_tree = ast.parse(f.read()) expected_tree = _peg_parser.parse_string(f.read(), oldparser=True)
expected_text = ast_dump(expected_tree, include_attributes=include_attributes) expected_text = ast_dump(expected_tree, include_attributes=include_attributes)
actual_text = ast_dump(actual_tree, include_attributes=include_attributes) actual_text = ast_dump(actual_tree, include_attributes=include_attributes)
@ -121,7 +122,6 @@ def parse_directory(
skip_actions: bool, skip_actions: bool,
tree_arg: int, tree_arg: int,
short: bool, short: bool,
extension: Any,
mode: int, mode: int,
parser: str, parser: str,
) -> int: ) -> int:
@ -137,47 +137,21 @@ def parse_directory(
if not os.path.exists(grammar_file): if not os.path.exists(grammar_file):
print(f"The specified grammar file, {grammar_file}, does not exist.", file=sys.stderr) print(f"The specified grammar file, {grammar_file}, does not exist.", file=sys.stderr)
return 1 return 1
try:
if not extension and parser == "pegen":
build_c_parser_and_generator(
grammar_file,
tokens_file,
"peg_extension/parse.c",
compile_extension=True,
skip_actions=skip_actions,
)
except Exception as err:
print(
f"{FAIL}The following error occurred when generating the parser. Please check your grammar file.\n{ENDC}",
file=sys.stderr,
)
traceback.print_exception(err.__class__, err, None)
return 1
else: else:
print( print(
"A grammar file or a tokens file was not provided - attempting to use existing parser from stdlib...\n" "A grammar file or a tokens file was not provided - attempting to use existing parser from stdlib...\n"
) )
if parser == "pegen": if tree_arg:
try: assert mode == 1, "Mode should be 1 (parse), when comparing the generated trees"
from peg_extension import parse # type: ignore
except Exception as e:
print(
"An existing parser was not found. Please run `make` or specify a grammar file with the `-g` flag.",
file=sys.stderr,
)
return 1
# For a given directory, traverse files and attempt to parse each one # For a given directory, traverse files and attempt to parse each one
# - Output success/failure for each file # - Output success/failure for each file
errors = 0 errors = 0
files = [] files = []
trees = {} # Trees to compare (after everything else is done) trees = {} # Trees to compare (after everything else is done)
total_seconds = 0
t0 = time.time()
for file in sorted(glob(f"{directory}/**/*.py", recursive=True)): for file in sorted(glob(f"{directory}/**/*.py", recursive=True)):
# Only attempt to parse Python files and files that are not excluded # Only attempt to parse Python files and files that are not excluded
should_exclude_file = False should_exclude_file = False
@ -187,25 +161,31 @@ def parse_directory(
break break
if not should_exclude_file: if not should_exclude_file:
with tokenize.open(file) as f:
source = f.read()
try: try:
if tree_arg: t0 = time.time()
mode = 1 if mode == 2:
if parser == "cpython": result = _peg_parser.compile_string(
with open(file, "r") as f: source,
source = f.read() filename=file,
if mode == 2: oldparser=parser == "cpython",
compile(source, file, "exec") )
elif mode == 1:
ast.parse(source, file, "exec")
else: else:
tree = parse.parse_file(file, mode=mode) result = _peg_parser.parse_string(
source,
filename=file,
oldparser=parser == "cpython"
)
t1 = time.time()
total_seconds += (t1 - t0)
if tree_arg: if tree_arg:
trees[file] = tree trees[file] = result
if not short: if not short:
report_status(succeeded=True, file=file, verbose=verbose) report_status(succeeded=True, file=file, verbose=verbose)
except Exception as error: except Exception as error:
try: try:
ast.parse(file) _peg_parser.parse_string(source, mode="exec", oldparser=True)
except Exception: except Exception:
if not short: if not short:
print(f"File {file} cannot be parsed by either pegen or the ast module.") print(f"File {file} cannot be parsed by either pegen or the ast module.")
@ -217,7 +197,6 @@ def parse_directory(
files.append(file) files.append(file)
t1 = time.time() t1 = time.time()
total_seconds = t1 - t0
total_files = len(files) total_files = len(files)
total_bytes = 0 total_bytes = 0
@ -238,13 +217,6 @@ def parse_directory(
f"or {total_bytes / total_seconds :,.0f} bytes/sec.", f"or {total_bytes / total_seconds :,.0f} bytes/sec.",
) )
if parser == "pegen":
# Dump memo stats to @data.
with open("@data", "w") as datafile:
for i, count in enumerate(parse.get_memo_stats()):
if count:
datafile.write(f"{i:4d} {count:9d}\n")
if short: if short:
print_memstats() print_memstats()
@ -275,6 +247,7 @@ def main() -> None:
skip_actions = args.skip_actions skip_actions = args.skip_actions
tree = args.tree tree = args.tree
short = args.short short = args.short
mode = 1 if args.tree else 2
sys.exit( sys.exit(
parse_directory( parse_directory(
directory, directory,
@ -285,8 +258,7 @@ def main() -> None:
skip_actions, skip_actions,
tree, tree,
short, short,
None, mode,
0,
"pegen", "pegen",
) )
) )

View file

@ -54,7 +54,7 @@ def find_dirname(package_name: str) -> str:
assert False # This is to fix mypy, should never be reached assert False # This is to fix mypy, should never be reached
def run_tests(dirname: str, tree: int, extension: Any) -> int: def run_tests(dirname: str, tree: int) -> int:
return test_parse_directory.parse_directory( return test_parse_directory.parse_directory(
dirname, dirname,
HERE / ".." / ".." / ".." / "Grammar" / "python.gram", HERE / ".." / ".." / ".." / "Grammar" / "python.gram",
@ -72,7 +72,6 @@ def run_tests(dirname: str, tree: int, extension: Any) -> int:
skip_actions=False, skip_actions=False,
tree_arg=tree, tree_arg=tree,
short=True, short=True,
extension=extension,
mode=1, mode=1,
parser="pegen", parser="pegen",
) )
@ -82,13 +81,6 @@ def main() -> None:
args = argparser.parse_args() args = argparser.parse_args()
tree = args.tree tree = args.tree
extension = build.build_c_parser_and_generator(
HERE / ".." / ".." / ".." / "Grammar" / "python.gram",
HERE / ".." / ".." / ".." / "Grammar" / "Tokens",
"peg_extension/parse.c",
compile_extension=True,
)
for package in get_packages(): for package in get_packages():
print(f"Extracting files from {package}... ", end="") print(f"Extracting files from {package}... ", end="")
try: try:
@ -100,7 +92,7 @@ def main() -> None:
print(f"Trying to parse all python files ... ") print(f"Trying to parse all python files ... ")
dirname = find_dirname(package) dirname = find_dirname(package)
status = run_tests(dirname, tree, extension) status = run_tests(dirname, tree)
if status == 0: if status == 0:
shutil.rmtree(dirname) shutil.rmtree(dirname)
else: else: