mirror of
https://github.com/python/cpython.git
synced 2025-08-04 08:59:19 +00:00
New batch from Fred
This commit is contained in:
parent
3a1fbb4c70
commit
4747887880
6 changed files with 1192 additions and 470 deletions
|
@ -14,12 +14,6 @@
|
||||||
\section{Built-in Module \sectcode{parser}}
|
\section{Built-in Module \sectcode{parser}}
|
||||||
\bimodindex{parser}
|
\bimodindex{parser}
|
||||||
|
|
||||||
|
|
||||||
% ==== 2. ====
|
|
||||||
% Give a short overview of what the module does.
|
|
||||||
% If it is platform specific, mention this.
|
|
||||||
% Mention other important restrictions or general operating principles.
|
|
||||||
|
|
||||||
The \code{parser} module provides an interface to Python's internal
|
The \code{parser} module provides an interface to Python's internal
|
||||||
parser and byte-code compiler. The primary purpose for this interface
|
parser and byte-code compiler. The primary purpose for this interface
|
||||||
is to allow Python code to edit the parse tree of a Python expression
|
is to allow Python code to edit the parse tree of a Python expression
|
||||||
|
@ -40,24 +34,37 @@ is created from a grammar specification defined in the file
|
||||||
trees stored in the ``AST objects'' created by this module are the
|
trees stored in the ``AST objects'' created by this module are the
|
||||||
actual output from the internal parser when created by the
|
actual output from the internal parser when created by the
|
||||||
\code{expr()} or \code{suite()} functions, described below. The AST
|
\code{expr()} or \code{suite()} functions, described below. The AST
|
||||||
objects created by \code{tuple2ast()} faithfully simulate those
|
objects created by \code{sequence2ast()} faithfully simulate those
|
||||||
structures.
|
structures. Be aware that the values of the sequences which are
|
||||||
|
considered ``correct'' will vary from one version of Python to another
|
||||||
|
as the formal grammar for the language is revised. However,
|
||||||
|
transporting code from one Python version to another as source text
|
||||||
|
will always allow correct parse trees to be created in the target
|
||||||
|
version, with the only restriction being that migrating to an older
|
||||||
|
version of the interpreter will not support more recent language
|
||||||
|
constructs. The parse trees are not typically compatible from one
|
||||||
|
version to another, whereas source code has always been
|
||||||
|
forward-compatible.
|
||||||
|
|
||||||
Each element of the tuples returned by \code{ast2tuple()} has a simple
|
Each element of the sequences returned by \code{ast2list()} or
|
||||||
form. Tuples representing non-terminal elements in the grammar always
|
\code{ast2tuple()} has a simple form. Sequences representing
|
||||||
have a length greater than one. The first element is an integer which
|
non-terminal elements in the grammar always have a length greater than
|
||||||
identifies a production in the grammar. These integers are given
|
one. The first element is an integer which identifies a production in
|
||||||
symbolic names in the C header file \code{Include/graminit.h} and the
|
the grammar. These integers are given symbolic names in the C header
|
||||||
Python module \code{Lib/symbol.py}. Each additional element of the
|
file \code{Include/graminit.h} and the Python module
|
||||||
tuple represents a component of the production as recognized in the
|
\code{Lib/symbol.py}. Each additional element of the sequence represents
|
||||||
input string: these are always tuples which have the same form as the
|
a component of the production as recognized in the input string: these
|
||||||
parent. An important aspect of this structure which should be noted
|
are always sequences which have the same form as the parent. An
|
||||||
is that keywords used to identify the parent node type, such as the
|
important aspect of this structure which should be noted is that
|
||||||
keyword \code{if} in an \emph{if\_stmt}, are included in the node tree
|
keywords used to identify the parent node type, such as the keyword
|
||||||
without any special treatment. For example, the \code{if} keyword is
|
\code{if} in an \emph{if\_stmt}, are included in the node tree without
|
||||||
|
any special treatment. For example, the \code{if} keyword is
|
||||||
represented by the tuple \code{(1, 'if')}, where \code{1} is the
|
represented by the tuple \code{(1, 'if')}, where \code{1} is the
|
||||||
numeric value associated with all \code{NAME} elements, including
|
numeric value associated with all \code{NAME} elements, including
|
||||||
variable and function names defined by the user.
|
variable and function names defined by the user. In an alternate form
|
||||||
|
returned when line number information is requested, the same token
|
||||||
|
might be represented as \code{(1, 'if', 12)}, where the \code{12}
|
||||||
|
represents the line number at which the terminal symbol was found.
|
||||||
|
|
||||||
Terminal elements are represented in much the same way, but without
|
Terminal elements are represented in much the same way, but without
|
||||||
any child elements and the addition of the source text which was
|
any child elements and the addition of the source text which was
|
||||||
|
@ -70,27 +77,47 @@ The AST objects are not actually required to support the functionality
|
||||||
of this module, but are provided for three purposes: to allow an
|
of this module, but are provided for three purposes: to allow an
|
||||||
application to amortize the cost of processing complex parse trees, to
|
application to amortize the cost of processing complex parse trees, to
|
||||||
provide a parse tree representation which conserves memory space when
|
provide a parse tree representation which conserves memory space when
|
||||||
compared to the Python tuple representation, and to ease the creation
|
compared to the Python list or tuple representation, and to ease the
|
||||||
of additional modules in C which manipulate parse trees. A simple
|
creation of additional modules in C which manipulate parse trees. A
|
||||||
``wrapper'' module may be created in Python to hide the use of AST
|
simple ``wrapper'' module may be created in Python to hide the use of
|
||||||
objects.
|
AST objects.
|
||||||
|
|
||||||
|
|
||||||
The \code{parser} module defines the following functions:
|
The \code{parser} module defines the following functions:
|
||||||
|
|
||||||
\renewcommand{\indexsubitem}{(in module parser)}
|
\renewcommand{\indexsubitem}{(in module parser)}
|
||||||
|
|
||||||
\begin{funcdesc}{ast2tuple}{ast}
|
\begin{funcdesc}{ast2list}{ast\optional{\, line\_info\code{ = 0}}}
|
||||||
This function accepts an AST object from the caller in
|
This function accepts an AST object from the caller in
|
||||||
\code{\var{ast}} and returns a Python tuple representing the
|
\code{\var{ast}} and returns a Python list representing the
|
||||||
equivelent parse tree. The resulting tuple representation can be used
|
equivalent parse tree. The resulting list representation can be used
|
||||||
for inspection or the creation of a new parse tree in tuple form.
|
for inspection or the creation of a new parse tree in list form.
|
||||||
This function does not fail so long as memory is available to build
|
This function does not fail so long as memory is available to build
|
||||||
the tuple representation.
|
the list representation. If a parse tree will only be used for
|
||||||
|
inspection, \code{ast2tuple()} should be used instead to reduce memory
|
||||||
|
consumption and fragmentation. When modifications are to be made to
|
||||||
|
the parse tree, this function is significantly faster than retrieving
|
||||||
|
a tuple representation and converting that to nested lists.
|
||||||
|
|
||||||
|
If the \code{line\_info} flag is given a true value, line number
|
||||||
|
information will be included for all terminal tokens as a third
|
||||||
|
element of the list representing the token. This information is
|
||||||
|
omitted if the flag is false or omitted.
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
|
\begin{funcdesc}{ast2tuple}{ast\optional{\, line\_info\code{ = 0}}}
|
||||||
|
This function accepts an AST object from the caller in
|
||||||
|
\code{\var{ast}} and returns a Python tuple representing the
|
||||||
|
equivalent parse tree. Other than returning a tuple instead of a
|
||||||
|
list, this function is identical to \code{ast2list()}.
|
||||||
|
|
||||||
\begin{funcdesc}{compileast}{ast\optional{\, filename \code{= '<ast>'}}}
|
If the \code{line\_info} flag is given a true value, line number
|
||||||
|
information will be included for all terminal tokens as a third
|
||||||
|
element of the tuple representing the token. This information is
|
||||||
|
omitted if the flag is false or omitted.
|
||||||
|
\end{funcdesc}
|
||||||
|
|
||||||
|
\begin{funcdesc}{compileast}{ast\optional{\, filename\code{ = '<ast>'}}}
|
||||||
The Python byte compiler can be invoked on an AST object to produce
|
The Python byte compiler can be invoked on an AST object to produce
|
||||||
code objects which can be used as part of an \code{exec} statement or
|
code objects which can be used as part of an \code{exec} statement or
|
||||||
a call to the built-in \code{eval()} function. This function provides
|
a call to the built-in \code{eval()} function. This function provides
|
||||||
|
@ -98,6 +125,16 @@ the interface to the compiler, passing the internal parse tree from
|
||||||
\code{\var{ast}} to the parser, using the source file name specified
|
\code{\var{ast}} to the parser, using the source file name specified
|
||||||
by the \code{\var{filename}} parameter. The default value supplied
|
by the \code{\var{filename}} parameter. The default value supplied
|
||||||
for \code{\var{filename}} indicates that the source was an AST object.
|
for \code{\var{filename}} indicates that the source was an AST object.
|
||||||
|
|
||||||
|
Compiling an AST object may result in exceptions related to
|
||||||
|
compilation; an example would be a \code{SyntaxError} caused by the
|
||||||
|
parse tree for \code{del f(0)}; this statement is considered legal
|
||||||
|
within the formal grammar for Python but is not a legal language
|
||||||
|
construct. The \code{SyntaxError} raised for this condition is
|
||||||
|
actually generated by the Python byte-compiler normally, which is why
|
||||||
|
it can be raised at this point by the \code{parser} module. Most
|
||||||
|
causes of compilation failure can be diagnosed programmatically by
|
||||||
|
inspection of the parse tree.
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
|
|
||||||
|
@ -138,19 +175,33 @@ thrown.
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
|
|
||||||
\begin{funcdesc}{tuple2ast}{tuple}
|
\begin{funcdesc}{sequence2ast}{sequence}
|
||||||
This function accepts a parse tree represented as a tuple and builds
|
This function accepts a parse tree represented as a sequence and
|
||||||
an internal representation if possible. If it can validate that the
|
builds an internal representation if possible. If it can validate
|
||||||
tree conforms to the Python syntax and all nodes are valid node types
|
that the tree conforms to the Python grammar and all nodes are valid
|
||||||
in the host version of Python, an AST object is created from the
|
node types in the host version of Python, an AST object is created
|
||||||
internal representation and returned to the called. If there is a
|
from the internal representation and returned to the caller. If there
|
||||||
problem creating the internal representation, or if the tree cannot be
|
is a problem creating the internal representation, or if the tree
|
||||||
validated, a \code{ParserError} exception is thrown. An AST object
|
cannot be validated, a \code{ParserError} exception is thrown. An AST
|
||||||
created this way should not be assumed to compile correctly; normal
|
object created this way should not be assumed to compile correctly;
|
||||||
exceptions thrown by compilation may still be initiated when the AST
|
normal exceptions thrown by compilation may still be initiated when
|
||||||
object is passed to \code{compileast()}. This will normally indicate
|
the AST object is passed to \code{compileast()}. This will normally
|
||||||
problems not related to syntax (such as a \code{MemoryError}
|
indicate problems not related to syntax (such as a \code{MemoryError}
|
||||||
exception).
|
exception), but may also be due to constructs such as the result of
|
||||||
|
parsing \code{del f(0)}, which escapes the Python parser but is
|
||||||
|
checked by the bytecode compiler.
|
||||||
|
|
||||||
|
Sequences representing terminal tokens may be represented as either
|
||||||
|
two-element lists of the form \code{(1, 'name')} or as three-element
|
||||||
|
lists of the form \code{(1, 'name', 56)}. If the third element is
|
||||||
|
present, it is assumed to be a valid line number. The line number
|
||||||
|
may be specified for any subset of the terminal symbols in the input
|
||||||
|
tree.
|
||||||
|
\end{funcdesc}
|
||||||
|
|
||||||
|
\begin{funcdesc}{tuple2ast}{sequence}
|
||||||
|
This is the same function as \code{sequence2ast}. This entry point is
|
||||||
|
maintained for backward compatibility.
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
|
|
||||||
|
@ -166,9 +217,9 @@ Exception raised when a failure occurs within the parser module. This
|
||||||
is generally produced for validation failures rather than the built in
|
is generally produced for validation failures rather than the built in
|
||||||
\code{SyntaxError} thrown during normal parsing.
|
\code{SyntaxError} thrown during normal parsing.
|
||||||
The exception argument is either a string describing the reason of the
|
The exception argument is either a string describing the reason of the
|
||||||
failure or a tuple containing a tuple causing the failure from a parse
|
failure or a tuple containing a sequence causing the failure from a parse
|
||||||
tree passed to \code{tuple2ast()} and an explanatory string. Calls to
|
tree passed to \code{sequence2ast()} and an explanatory string. Calls to
|
||||||
\code{tuple2ast()} need to be able to handle either type of exception,
|
\code{sequence2ast()} need to be able to handle either type of exception,
|
||||||
while calls to other functions in the module will only need to be
|
while calls to other functions in the module will only need to be
|
||||||
aware of the simple string values.
|
aware of the simple string values.
|
||||||
\end{excdesc}
|
\end{excdesc}
|
||||||
|
@ -182,9 +233,36 @@ exceptions carry all the meaning normally associated with them. Refer
|
||||||
to the descriptions of each function for detailed information.
|
to the descriptions of each function for detailed information.
|
||||||
|
|
||||||
|
|
||||||
|
\subsection{AST Objects}
|
||||||
|
|
||||||
|
AST objects (returned by \code{expr()}, \code{suite()}, and
|
||||||
|
\code{tuple2ast()}, described above) have no methods of their own.
|
||||||
|
Some of the functions defined which accept an AST object as their
|
||||||
|
first argument may change to object methods in the future.
|
||||||
|
|
||||||
|
Ordered and equality comparisons are supported between AST objects.
|
||||||
|
|
||||||
|
|
||||||
\subsection{Example}
|
\subsection{Example}
|
||||||
|
|
||||||
A simple example:
|
The \code{parser} module allows operations to be performed on the parse tree
|
||||||
|
of Python source code before the bytecode is generated, and provides
|
||||||
|
for inspection of the parse tree for information gathering purposes as
|
||||||
|
well. While many useful operations may take place between parsing and
|
||||||
|
bytecode generation, the simplest operation is to do nothing. For
|
||||||
|
this purpose, using the \code{parser} module to produce an
|
||||||
|
intermediate data structure is equivalent to the code
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
>>> code = compile('a + 5', 'file.py', 'eval')
|
||||||
|
>>> a = 5
|
||||||
|
>>> eval(code)
|
||||||
|
10
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
The equivalent operation using the \code{parser} module is somewhat
|
||||||
|
longer, and allows the intermediate internal parse tree to be retained
|
||||||
|
as an AST object:
|
||||||
|
|
||||||
\begin{verbatim}
|
\begin{verbatim}
|
||||||
>>> import parser
|
>>> import parser
|
||||||
|
@ -195,18 +273,187 @@ A simple example:
|
||||||
10
|
10
|
||||||
\end{verbatim}
|
\end{verbatim}
|
||||||
|
|
||||||
|
Some applications can benefit from access to the parse tree itself, and
|
||||||
|
can take advantage of the intermediate data structure provided by the
|
||||||
|
\code{parser} module. The remainder of this section of examples will
|
||||||
|
demonstrate how the intermediate data structure can provide access to
|
||||||
|
module documentation defined in docstrings without requiring that the
|
||||||
|
code being examined be imported into a running interpreter. This can
|
||||||
|
be very useful for performing analyses of untrusted code.
|
||||||
|
|
||||||
\subsection{AST Objects}
|
Generally, the example will demonstrate how the parse tree may be
|
||||||
|
traversed to distill interesting information. Two functions and a set
|
||||||
|
of classes are developed which provide programmatic access to high
|
||||||
|
level function and class definitions provided by a module. The
|
||||||
|
classes extract information from the parse tree and provide access to
|
||||||
|
the information at a useful semantic level, one function provides a
|
||||||
|
simple low-level pattern matching capability, and the other function
|
||||||
|
defines a high-level interface to the classes by handling file
|
||||||
|
operations on behalf of the caller. All source files mentioned here
|
||||||
|
which are not part of the Python installation are located in the
|
||||||
|
\file{Demo/parser} directory of the distribution.
|
||||||
|
|
||||||
AST objects (returned by \code{expr()}, \code{suite()}, and
|
To construct the upper-level extraction methods, we need to know what
|
||||||
\code{tuple2ast()}, described above) have no methods of their own.
|
the parse tree structure looks like and how much of it we actually
|
||||||
Some of the functions defined which accept an AST object as their
|
need to be concerned about. Python uses a moderately deep parse tree,
|
||||||
first argument may change to object methods in the future.
|
so there are a large number of intermediate nodes. It is important to
|
||||||
|
read and understand the formal grammar used by Python. This is
|
||||||
|
specified in the file \file{Grammar/Grammar} in the distribution.
|
||||||
|
Consider the simplest case of interest when searching for docstrings:
|
||||||
|
a module consisting of a docstring and nothing else:
|
||||||
|
|
||||||
Ordered and equality comparisons are supported between AST objects.
|
\begin{verbatim}
|
||||||
|
"""Some documentation.
|
||||||
|
"""
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
\renewcommand{\indexsubitem}{(ast method)}
|
Using the interpreter to take a look at the parse tree, we find a
|
||||||
|
bewildering mass of numbers and parentheses, with the documentation
|
||||||
|
buried deep in the nested tuples:
|
||||||
|
|
||||||
%\begin{funcdesc}{empty}{}
|
\begin{verbatim}
|
||||||
%Empty the can into the trash.
|
>>> import parser
|
||||||
%\end{funcdesc}
|
>>> import pprint
|
||||||
|
>>> ast = parser.suite(open('docstring.py').read())
|
||||||
|
>>> tup = parser.ast2tuple(ast)
|
||||||
|
>>> pprint.pprint(tup)
|
||||||
|
(257,
|
||||||
|
(264,
|
||||||
|
(265,
|
||||||
|
(266,
|
||||||
|
(267,
|
||||||
|
(307,
|
||||||
|
(287,
|
||||||
|
(288,
|
||||||
|
(289,
|
||||||
|
(290,
|
||||||
|
(292,
|
||||||
|
(293,
|
||||||
|
(294,
|
||||||
|
(295,
|
||||||
|
(296,
|
||||||
|
(297,
|
||||||
|
(298,
|
||||||
|
(299,
|
||||||
|
(300, (3, '"""Some documentation.\012"""'))))))))))))))))),
|
||||||
|
(4, ''))),
|
||||||
|
(4, ''),
|
||||||
|
(0, ''))
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
The numbers at the first element of each node in the tree are the node
|
||||||
|
types; they map directly to terminal and non-terminal symbols in the
|
||||||
|
grammar. Unfortunately, they are represented as integers in the
|
||||||
|
internal representation, and the Python structures generated do not
|
||||||
|
change that. However, the \code{symbol} and \code{token} modules
|
||||||
|
provide symbolic names for the node types and dictionaries which map
|
||||||
|
from the integers to the symbolic names for the node types.
|
||||||
|
|
||||||
|
In the output presented above, the outermost tuple contains four
|
||||||
|
elements: the integer \code{257} and three additional tuples. Node
|
||||||
|
type \code{257} has the symbolic name \code{file\_input}. Each of
|
||||||
|
these inner tuples contains an integer as the first element; these
|
||||||
|
integers, \code{264}, \code{4}, and \code{0}, represent the node types
|
||||||
|
\code{stmt}, \code{NEWLINE}, and \code{ENDMARKER}, respectively.
|
||||||
|
Note that these values may change depending on the version of Python
|
||||||
|
you are using; consult \file{symbol.py} and \file{token.py} for
|
||||||
|
details of the mapping. It should be fairly clear that the outermost
|
||||||
|
node is related primarily to the input source rather than the contents
|
||||||
|
of the file, and may be disregarded for the moment. The \code{stmt}
|
||||||
|
node is much more interesting. In particular, all docstrings are
|
||||||
|
found in subtrees which are formed exactly as this node is formed,
|
||||||
|
with the only difference being the string itself. The association
|
||||||
|
between the docstring in a similar tree and the defined entity (class,
|
||||||
|
function, or module) which it describes is given by the position of
|
||||||
|
the docstring subtree within the tree defining the described
|
||||||
|
structure.
|
||||||
|
|
||||||
|
By replacing the actual docstring with something to signify a variable
|
||||||
|
component of the tree, we allow a simple pattern matching approach to
|
||||||
|
be taken to checking any given subtree for equivalence to the general
|
||||||
|
pattern for docstrings. Since the example demonstrates information
|
||||||
|
extraction, we can safely require that the tree be in tuple form
|
||||||
|
rather than list form, allowing a simple variable representation to be
|
||||||
|
\code{['variable\_name']}. A simple recursive function can implement
|
||||||
|
the pattern matching, returning a boolean and a dictionary of variable
|
||||||
|
name to value mappings.
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
from types import ListType, TupleType
|
||||||
|
|
||||||
|
def match(pattern, data, vars=None):
|
||||||
|
if vars is None:
|
||||||
|
vars = {}
|
||||||
|
if type(pattern) is ListType:
|
||||||
|
vars[pattern[0]] = data
|
||||||
|
return 1, vars
|
||||||
|
if type(pattern) is not TupleType:
|
||||||
|
return (pattern == data), vars
|
||||||
|
if len(data) != len(pattern):
|
||||||
|
return 0, vars
|
||||||
|
for pattern, data in map(None, pattern, data):
|
||||||
|
same, vars = match(pattern, data, vars)
|
||||||
|
if not same:
|
||||||
|
break
|
||||||
|
return same, vars
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
Using this simple recursive pattern matching function and the symbolic
|
||||||
|
node types, the pattern for the candidate docstring subtrees becomes:
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
>>> DOCSTRING_STMT_PATTERN = (
|
||||||
|
... symbol.stmt,
|
||||||
|
... (symbol.simple_stmt,
|
||||||
|
... (symbol.small_stmt,
|
||||||
|
... (symbol.expr_stmt,
|
||||||
|
... (symbol.testlist,
|
||||||
|
... (symbol.test,
|
||||||
|
... (symbol.and_test,
|
||||||
|
... (symbol.not_test,
|
||||||
|
... (symbol.comparison,
|
||||||
|
... (symbol.expr,
|
||||||
|
... (symbol.xor_expr,
|
||||||
|
... (symbol.and_expr,
|
||||||
|
... (symbol.shift_expr,
|
||||||
|
... (symbol.arith_expr,
|
||||||
|
... (symbol.term,
|
||||||
|
... (symbol.factor,
|
||||||
|
... (symbol.power,
|
||||||
|
... (symbol.atom,
|
||||||
|
... (token.STRING, ['docstring'])
|
||||||
|
... )))))))))))))))),
|
||||||
|
... (token.NEWLINE, '')
|
||||||
|
... ))
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
Using the \code{match()} function with this pattern, extracting the
|
||||||
|
module docstring from the parse tree created previously is easy:
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
>>> found, vars = match(DOCSTRING_STMT_PATTERN, tup[1])
|
||||||
|
>>> found
|
||||||
|
1
|
||||||
|
>>> vars
|
||||||
|
{'docstring': '"""Some documentation.\012"""'}
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
Once specific data can be extracted from a location where it is
|
||||||
|
expected, the question of where information can be expected
|
||||||
|
needs to be answered. When dealing with docstrings, the answer is
|
||||||
|
fairly simple: the docstring is the first \code{stmt} node in a code
|
||||||
|
block (\code{file\_input} or \code{suite} node types). A module
|
||||||
|
consists of a single \code{file\_input} node, and class and function
|
||||||
|
definitions each contain exactly one \code{suite} node. Classes and
|
||||||
|
functions are readily identified as subtrees of code block nodes which
|
||||||
|
start with \code{(stmt, (compound\_stmt, (classdef, ...} or
|
||||||
|
\code{(stmt, (compound\_stmt, (funcdef, ...}. Note that these subtrees
|
||||||
|
cannot be matched by \code{match()} since it does not support multiple
|
||||||
|
sibling nodes to match without regard to number. A more elaborate
|
||||||
|
matching function could be used to overcome this limitation, but this
|
||||||
|
is sufficient for the example.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
%%
|
||||||
|
%% end of file
|
||||||
|
|
|
@ -14,12 +14,6 @@
|
||||||
\section{Built-in Module \sectcode{parser}}
|
\section{Built-in Module \sectcode{parser}}
|
||||||
\bimodindex{parser}
|
\bimodindex{parser}
|
||||||
|
|
||||||
|
|
||||||
% ==== 2. ====
|
|
||||||
% Give a short overview of what the module does.
|
|
||||||
% If it is platform specific, mention this.
|
|
||||||
% Mention other important restrictions or general operating principles.
|
|
||||||
|
|
||||||
The \code{parser} module provides an interface to Python's internal
|
The \code{parser} module provides an interface to Python's internal
|
||||||
parser and byte-code compiler. The primary purpose for this interface
|
parser and byte-code compiler. The primary purpose for this interface
|
||||||
is to allow Python code to edit the parse tree of a Python expression
|
is to allow Python code to edit the parse tree of a Python expression
|
||||||
|
@ -40,24 +34,37 @@ is created from a grammar specification defined in the file
|
||||||
trees stored in the ``AST objects'' created by this module are the
|
trees stored in the ``AST objects'' created by this module are the
|
||||||
actual output from the internal parser when created by the
|
actual output from the internal parser when created by the
|
||||||
\code{expr()} or \code{suite()} functions, described below. The AST
|
\code{expr()} or \code{suite()} functions, described below. The AST
|
||||||
objects created by \code{tuple2ast()} faithfully simulate those
|
objects created by \code{sequence2ast()} faithfully simulate those
|
||||||
structures.
|
structures. Be aware that the values of the sequences which are
|
||||||
|
considered ``correct'' will vary from one version of Python to another
|
||||||
|
as the formal grammar for the language is revised. However,
|
||||||
|
transporting code from one Python version to another as source text
|
||||||
|
will always allow correct parse trees to be created in the target
|
||||||
|
version, with the only restriction being that migrating to an older
|
||||||
|
version of the interpreter will not support more recent language
|
||||||
|
constructs. The parse trees are not typically compatible from one
|
||||||
|
version to another, whereas source code has always been
|
||||||
|
forward-compatible.
|
||||||
|
|
||||||
Each element of the tuples returned by \code{ast2tuple()} has a simple
|
Each element of the sequences returned by \code{ast2list()} or
|
||||||
form. Tuples representing non-terminal elements in the grammar always
|
\code{ast2tuple()} has a simple form. Sequences representing
|
||||||
have a length greater than one. The first element is an integer which
|
non-terminal elements in the grammar always have a length greater than
|
||||||
identifies a production in the grammar. These integers are given
|
one. The first element is an integer which identifies a production in
|
||||||
symbolic names in the C header file \code{Include/graminit.h} and the
|
the grammar. These integers are given symbolic names in the C header
|
||||||
Python module \code{Lib/symbol.py}. Each additional element of the
|
file \code{Include/graminit.h} and the Python module
|
||||||
tuple represents a component of the production as recognized in the
|
\code{Lib/symbol.py}. Each additional element of the sequence represents
|
||||||
input string: these are always tuples which have the same form as the
|
a component of the production as recognized in the input string: these
|
||||||
parent. An important aspect of this structure which should be noted
|
are always sequences which have the same form as the parent. An
|
||||||
is that keywords used to identify the parent node type, such as the
|
important aspect of this structure which should be noted is that
|
||||||
keyword \code{if} in an \emph{if\_stmt}, are included in the node tree
|
keywords used to identify the parent node type, such as the keyword
|
||||||
without any special treatment. For example, the \code{if} keyword is
|
\code{if} in an \emph{if\_stmt}, are included in the node tree without
|
||||||
|
any special treatment. For example, the \code{if} keyword is
|
||||||
represented by the tuple \code{(1, 'if')}, where \code{1} is the
|
represented by the tuple \code{(1, 'if')}, where \code{1} is the
|
||||||
numeric value associated with all \code{NAME} elements, including
|
numeric value associated with all \code{NAME} elements, including
|
||||||
variable and function names defined by the user.
|
variable and function names defined by the user. In an alternate form
|
||||||
|
returned when line number information is requested, the same token
|
||||||
|
might be represented as \code{(1, 'if', 12)}, where the \code{12}
|
||||||
|
represents the line number at which the terminal symbol was found.
|
||||||
|
|
||||||
Terminal elements are represented in much the same way, but without
|
Terminal elements are represented in much the same way, but without
|
||||||
any child elements and the addition of the source text which was
|
any child elements and the addition of the source text which was
|
||||||
|
@ -70,27 +77,47 @@ The AST objects are not actually required to support the functionality
|
||||||
of this module, but are provided for three purposes: to allow an
|
of this module, but are provided for three purposes: to allow an
|
||||||
application to amortize the cost of processing complex parse trees, to
|
application to amortize the cost of processing complex parse trees, to
|
||||||
provide a parse tree representation which conserves memory space when
|
provide a parse tree representation which conserves memory space when
|
||||||
compared to the Python tuple representation, and to ease the creation
|
compared to the Python list or tuple representation, and to ease the
|
||||||
of additional modules in C which manipulate parse trees. A simple
|
creation of additional modules in C which manipulate parse trees. A
|
||||||
``wrapper'' module may be created in Python to hide the use of AST
|
simple ``wrapper'' module may be created in Python to hide the use of
|
||||||
objects.
|
AST objects.
|
||||||
|
|
||||||
|
|
||||||
The \code{parser} module defines the following functions:
|
The \code{parser} module defines the following functions:
|
||||||
|
|
||||||
\renewcommand{\indexsubitem}{(in module parser)}
|
\renewcommand{\indexsubitem}{(in module parser)}
|
||||||
|
|
||||||
\begin{funcdesc}{ast2tuple}{ast}
|
\begin{funcdesc}{ast2list}{ast\optional{\, line\_info\code{ = 0}}}
|
||||||
This function accepts an AST object from the caller in
|
This function accepts an AST object from the caller in
|
||||||
\code{\var{ast}} and returns a Python tuple representing the
|
\code{\var{ast}} and returns a Python list representing the
|
||||||
equivelent parse tree. The resulting tuple representation can be used
|
equivalent parse tree. The resulting list representation can be used
|
||||||
for inspection or the creation of a new parse tree in tuple form.
|
for inspection or the creation of a new parse tree in list form.
|
||||||
This function does not fail so long as memory is available to build
|
This function does not fail so long as memory is available to build
|
||||||
the tuple representation.
|
the list representation. If a parse tree will only be used for
|
||||||
|
inspection, \code{ast2tuple()} should be used instead to reduce memory
|
||||||
|
consumption and fragmentation. When modifications are to be made to
|
||||||
|
the parse tree, this function is significantly faster than retrieving
|
||||||
|
a tuple representation and converting that to nested lists.
|
||||||
|
|
||||||
|
If the \code{line\_info} flag is given a true value, line number
|
||||||
|
information will be included for all terminal tokens as a third
|
||||||
|
element of the list representing the token. This information is
|
||||||
|
omitted if the flag is false or omitted.
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
|
\begin{funcdesc}{ast2tuple}{ast\optional{\, line\_info\code{ = 0}}}
|
||||||
|
This function accepts an AST object from the caller in
|
||||||
|
\code{\var{ast}} and returns a Python tuple representing the
|
||||||
|
equivalent parse tree. Other than returning a tuple instead of a
|
||||||
|
list, this function is identical to \code{ast2list()}.
|
||||||
|
|
||||||
\begin{funcdesc}{compileast}{ast\optional{\, filename \code{= '<ast>'}}}
|
If the \code{line\_info} flag is given a true value, line number
|
||||||
|
information will be included for all terminal tokens as a third
|
||||||
|
element of the tuple representing the token. This information is
|
||||||
|
omitted if the flag is false or omitted.
|
||||||
|
\end{funcdesc}
|
||||||
|
|
||||||
|
\begin{funcdesc}{compileast}{ast\optional{\, filename\code{ = '<ast>'}}}
|
||||||
The Python byte compiler can be invoked on an AST object to produce
|
The Python byte compiler can be invoked on an AST object to produce
|
||||||
code objects which can be used as part of an \code{exec} statement or
|
code objects which can be used as part of an \code{exec} statement or
|
||||||
a call to the built-in \code{eval()} function. This function provides
|
a call to the built-in \code{eval()} function. This function provides
|
||||||
|
@ -98,6 +125,16 @@ the interface to the compiler, passing the internal parse tree from
|
||||||
\code{\var{ast}} to the parser, using the source file name specified
|
\code{\var{ast}} to the parser, using the source file name specified
|
||||||
by the \code{\var{filename}} parameter. The default value supplied
|
by the \code{\var{filename}} parameter. The default value supplied
|
||||||
for \code{\var{filename}} indicates that the source was an AST object.
|
for \code{\var{filename}} indicates that the source was an AST object.
|
||||||
|
|
||||||
|
Compiling an AST object may result in exceptions related to
|
||||||
|
compilation; an example would be a \code{SyntaxError} caused by the
|
||||||
|
parse tree for \code{del f(0)}; this statement is considered legal
|
||||||
|
within the formal grammar for Python but is not a legal language
|
||||||
|
construct. The \code{SyntaxError} raised for this condition is
|
||||||
|
actually generated by the Python byte-compiler normally, which is why
|
||||||
|
it can be raised at this point by the \code{parser} module. Most
|
||||||
|
causes of compilation failure can be diagnosed programmatically by
|
||||||
|
inspection of the parse tree.
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
|
|
||||||
|
@ -138,19 +175,33 @@ thrown.
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
|
|
||||||
\begin{funcdesc}{tuple2ast}{tuple}
|
\begin{funcdesc}{sequence2ast}{sequence}
|
||||||
This function accepts a parse tree represented as a tuple and builds
|
This function accepts a parse tree represented as a sequence and
|
||||||
an internal representation if possible. If it can validate that the
|
builds an internal representation if possible. If it can validate
|
||||||
tree conforms to the Python syntax and all nodes are valid node types
|
that the tree conforms to the Python grammar and all nodes are valid
|
||||||
in the host version of Python, an AST object is created from the
|
node types in the host version of Python, an AST object is created
|
||||||
internal representation and returned to the called. If there is a
|
from the internal representation and returned to the caller. If there
|
||||||
problem creating the internal representation, or if the tree cannot be
|
is a problem creating the internal representation, or if the tree
|
||||||
validated, a \code{ParserError} exception is thrown. An AST object
|
cannot be validated, a \code{ParserError} exception is thrown. An AST
|
||||||
created this way should not be assumed to compile correctly; normal
|
object created this way should not be assumed to compile correctly;
|
||||||
exceptions thrown by compilation may still be initiated when the AST
|
normal exceptions thrown by compilation may still be initiated when
|
||||||
object is passed to \code{compileast()}. This will normally indicate
|
the AST object is passed to \code{compileast()}. This will normally
|
||||||
problems not related to syntax (such as a \code{MemoryError}
|
indicate problems not related to syntax (such as a \code{MemoryError}
|
||||||
exception).
|
exception), but may also be due to constructs such as the result of
|
||||||
|
parsing \code{del f(0)}, which escapes the Python parser but is
|
||||||
|
checked by the bytecode compiler.
|
||||||
|
|
||||||
|
Sequences representing terminal tokens may be represented as either
|
||||||
|
two-element lists of the form \code{(1, 'name')} or as three-element
|
||||||
|
lists of the form \code{(1, 'name', 56)}. If the third element is
|
||||||
|
present, it is assumed to be a valid line number. The line number
|
||||||
|
may be specified for any subset of the terminal symbols in the input
|
||||||
|
tree.
|
||||||
|
\end{funcdesc}
|
||||||
|
|
||||||
|
\begin{funcdesc}{tuple2ast}{sequence}
|
||||||
|
This is the same function as \code{sequence2ast}. This entry point is
|
||||||
|
maintained for backward compatibility.
|
||||||
\end{funcdesc}
|
\end{funcdesc}
|
||||||
|
|
||||||
|
|
||||||
|
@ -166,9 +217,9 @@ Exception raised when a failure occurs within the parser module. This
|
||||||
is generally produced for validation failures rather than the built in
|
is generally produced for validation failures rather than the built in
|
||||||
\code{SyntaxError} thrown during normal parsing.
|
\code{SyntaxError} thrown during normal parsing.
|
||||||
The exception argument is either a string describing the reason of the
|
The exception argument is either a string describing the reason of the
|
||||||
failure or a tuple containing a tuple causing the failure from a parse
|
failure or a tuple containing a sequence causing the failure from a parse
|
||||||
tree passed to \code{tuple2ast()} and an explanatory string. Calls to
|
tree passed to \code{sequence2ast()} and an explanatory string. Calls to
|
||||||
\code{tuple2ast()} need to be able to handle either type of exception,
|
\code{sequence2ast()} need to be able to handle either type of exception,
|
||||||
while calls to other functions in the module will only need to be
|
while calls to other functions in the module will only need to be
|
||||||
aware of the simple string values.
|
aware of the simple string values.
|
||||||
\end{excdesc}
|
\end{excdesc}
|
||||||
|
@ -182,9 +233,36 @@ exceptions carry all the meaning normally associated with them. Refer
|
||||||
to the descriptions of each function for detailed information.
|
to the descriptions of each function for detailed information.
|
||||||
|
|
||||||
|
|
||||||
|
\subsection{AST Objects}
|
||||||
|
|
||||||
|
AST objects (returned by \code{expr()}, \code{suite()}, and
|
||||||
|
\code{tuple2ast()}, described above) have no methods of their own.
|
||||||
|
Some of the functions defined which accept an AST object as their
|
||||||
|
first argument may change to object methods in the future.
|
||||||
|
|
||||||
|
Ordered and equality comparisons are supported between AST objects.
|
||||||
|
|
||||||
|
|
||||||
\subsection{Example}
|
\subsection{Example}
|
||||||
|
|
||||||
A simple example:
|
The parser module allows operations to be performed on the parse tree
|
||||||
|
of Python source code before the bytecode is generated, and provides
|
||||||
|
for inspection of the parse tree for information gathering purposes as
|
||||||
|
well. While many useful operations may take place between parsing and
|
||||||
|
bytecode generation, the simplest operation is to do nothing. For
|
||||||
|
this purpose, using the \code{parser} module to produce an
|
||||||
|
intermediate data structure is equivalent to the code
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
>>> code = compile('a + 5', 'file.py', 'eval')
|
||||||
|
>>> a = 5
|
||||||
|
>>> eval(code)
|
||||||
|
10
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
The equivalent operation using the \code{parser} module is somewhat
|
||||||
|
longer, and allows the intermediate internal parse tree to be retained
|
||||||
|
as an AST object:
|
||||||
|
|
||||||
\begin{verbatim}
|
\begin{verbatim}
|
||||||
>>> import parser
|
>>> import parser
|
||||||
|
@ -195,18 +273,187 @@ A simple example:
|
||||||
10
|
10
|
||||||
\end{verbatim}
|
\end{verbatim}
|
||||||
|
|
||||||
|
Some applications can benefit from access to the parse tree itself, and
|
||||||
|
can take advantage of the intermediate data structure provided by the
|
||||||
|
\code{parser} module. The remainder of this section of examples will
|
||||||
|
demonstrate how the intermediate data structure can provide access to
|
||||||
|
module documentation defined in docstrings without requiring that the
|
||||||
|
code being examined be imported into a running interpreter. This can
|
||||||
|
be very useful for performing analyses of untrusted code.
|
||||||
|
|
||||||
\subsection{AST Objects}
|
Generally, the example will demonstrate how the parse tree may be
|
||||||
|
traversed to distill interesting information. Two functions and a set
|
||||||
|
of classes are developed which provide programmatic access to high
|
||||||
|
level function and class definitions provided by a module. The
|
||||||
|
classes extract information from the parse tree and provide access to
|
||||||
|
the information at a useful semantic level, one function provides a
|
||||||
|
simple low-level pattern matching capability, and the other function
|
||||||
|
defines a high-level interface to the classes by handling file
|
||||||
|
operations on behalf of the caller. All source files mentioned here
|
||||||
|
which are not part of the Python installation are located in the
|
||||||
|
\file{Demo/parser} directory of the distribution.
|
||||||
|
|
||||||
AST objects (returned by \code{expr()}, \code{suite()}, and
|
To construct the upper-level extraction methods, we need to know what
|
||||||
\code{tuple2ast()}, described above) have no methods of their own.
|
the parse tree structure looks like and how much of it we actually
|
||||||
Some of the functions defined which accept an AST object as their
|
need to be concerned about. Python uses a moderately deep parse tree,
|
||||||
first argument may change to object methods in the future.
|
so there are a large number of intermediate nodes. It is important to
|
||||||
|
read and understand the formal grammar used by Python. This is
|
||||||
|
specified in the file \file{Grammar/Grammar} in the distribution.
|
||||||
|
Consider the simplest case of interest when searching for docstrings:
|
||||||
|
a module consisting of a docstring and nothing else:
|
||||||
|
|
||||||
Ordered and equality comparisons are supported between AST objects.
|
\begin{verbatim}
|
||||||
|
"""Some documentation.
|
||||||
|
"""
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
\renewcommand{\indexsubitem}{(ast method)}
|
Using the interpreter to take a look at the parse tree, we find a
|
||||||
|
bewildering mass of numbers and parentheses, with the documentation
|
||||||
|
buried deep in the nested tuples:
|
||||||
|
|
||||||
%\begin{funcdesc}{empty}{}
|
\begin{verbatim}
|
||||||
%Empty the can into the trash.
|
>>> import parser
|
||||||
%\end{funcdesc}
|
>>> import pprint
|
||||||
|
>>> ast = parser.suite(open('docstring.py').read())
|
||||||
|
>>> tup = parser.ast2tuple(ast)
|
||||||
|
>>> pprint.pprint(tup)
|
||||||
|
(257,
|
||||||
|
(264,
|
||||||
|
(265,
|
||||||
|
(266,
|
||||||
|
(267,
|
||||||
|
(307,
|
||||||
|
(287,
|
||||||
|
(288,
|
||||||
|
(289,
|
||||||
|
(290,
|
||||||
|
(292,
|
||||||
|
(293,
|
||||||
|
(294,
|
||||||
|
(295,
|
||||||
|
(296,
|
||||||
|
(297,
|
||||||
|
(298,
|
||||||
|
(299,
|
||||||
|
(300, (3, '"""Some documentation.\012"""'))))))))))))))))),
|
||||||
|
(4, ''))),
|
||||||
|
(4, ''),
|
||||||
|
(0, ''))
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
The numbers at the first element of each node in the tree are the node
|
||||||
|
types; they map directly to terminal and non-terminal symbols in the
|
||||||
|
grammar. Unfortunately, they are represented as integers in the
|
||||||
|
internal representation, and the Python structures generated do not
|
||||||
|
change that. However, the \code{symbol} and \code{token} modules
|
||||||
|
provide symbolic names for the node types and dictionaries which map
|
||||||
|
from the integers to the symbolic names for the node types.
|
||||||
|
|
||||||
|
In the output presented above, the outermost tuple contains four
|
||||||
|
elements: the integer \code{257} and three additional tuples. Node
|
||||||
|
type \code{257} has the symbolic name \code{file\_input}. Each of
|
||||||
|
these inner tuples contains an integer as the first element; these
|
||||||
|
integers, \code{264}, \code{4}, and \code{0}, represent the node types
|
||||||
|
\code{stmt}, \code{NEWLINE}, and \code{ENDMARKER}, respectively.
|
||||||
|
Note that these values may change depending on the version of Python
|
||||||
|
you are using; consult \file{symbol.py} and \file{token.py} for
|
||||||
|
details of the mapping. It should be fairly clear that the outermost
|
||||||
|
node is related primarily to the input source rather than the contents
|
||||||
|
of the file, and may be disregarded for the moment. The \code{stmt}
|
||||||
|
node is much more interesting. In particular, all docstrings are
|
||||||
|
found in subtrees which are formed exactly as this node is formed,
|
||||||
|
with the only difference being the string itself. The association
|
||||||
|
between the docstring in a similar tree and the defined entity (class,
|
||||||
|
function, or module) which it describes is given by the position of
|
||||||
|
the docstring subtree within the tree defining the described
|
||||||
|
structure.
|
||||||
|
|
||||||
|
By replacing the actual docstring with something to signify a variable
|
||||||
|
component of the tree, a simple pattern matching approach may
|
||||||
|
be taken to checking any given subtree for equivalence to the general
|
||||||
|
pattern for docstrings. Since the example demonstrates information
|
||||||
|
extraction, we can safely require that the tree be in tuple form
|
||||||
|
rather than list form, allowing a simple variable representation to be
|
||||||
|
\code{['variable\_name']}. A simple recursive function can implement
|
||||||
|
the pattern matching, returning a boolean and a dictionary of variable
|
||||||
|
name to value mappings.
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
from types import ListType, TupleType
|
||||||
|
|
||||||
|
def match(pattern, data, vars=None):
|
||||||
|
if vars is None:
|
||||||
|
vars = {}
|
||||||
|
if type(pattern) is ListType:
|
||||||
|
vars[pattern[0]] = data
|
||||||
|
return 1, vars
|
||||||
|
if type(pattern) is not TupleType:
|
||||||
|
return (pattern == data), vars
|
||||||
|
if len(data) != len(pattern):
|
||||||
|
return 0, vars
|
||||||
|
for pattern, data in map(None, pattern, data):
|
||||||
|
same, vars = match(pattern, data, vars)
|
||||||
|
if not same:
|
||||||
|
break
|
||||||
|
return same, vars
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
Using this simple recursive pattern matching function and the symbolic
|
||||||
|
node types, the pattern for the candidate docstring subtrees becomes:
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
>>> DOCSTRING_STMT_PATTERN = (
|
||||||
|
... symbol.stmt,
|
||||||
|
... (symbol.simple_stmt,
|
||||||
|
... (symbol.small_stmt,
|
||||||
|
... (symbol.expr_stmt,
|
||||||
|
... (symbol.testlist,
|
||||||
|
... (symbol.test,
|
||||||
|
... (symbol.and_test,
|
||||||
|
... (symbol.not_test,
|
||||||
|
... (symbol.comparison,
|
||||||
|
... (symbol.expr,
|
||||||
|
... (symbol.xor_expr,
|
||||||
|
... (symbol.and_expr,
|
||||||
|
... (symbol.shift_expr,
|
||||||
|
... (symbol.arith_expr,
|
||||||
|
... (symbol.term,
|
||||||
|
... (symbol.factor,
|
||||||
|
... (symbol.power,
|
||||||
|
... (symbol.atom,
|
||||||
|
... (token.STRING, ['docstring'])
|
||||||
|
... )))))))))))))))),
|
||||||
|
... (token.NEWLINE, '')
|
||||||
|
... ))
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
Using the \code{match()} function with this pattern, extracting the
|
||||||
|
module docstring from the parse tree created previously is easy:
|
||||||
|
|
||||||
|
\begin{verbatim}
|
||||||
|
>>> found, vars = match(DOCSTRING_STMT_PATTERN, tup[1])
|
||||||
|
>>> found
|
||||||
|
1
|
||||||
|
>>> vars
|
||||||
|
{'docstring': '"""Some documentation.\012"""'}
|
||||||
|
\end{verbatim}
|
||||||
|
|
||||||
|
Once specific data can be extracted from a location where it is
|
||||||
|
expected, the question of where information can be expected
|
||||||
|
needs to be answered. When dealing with docstrings, the answer is
|
||||||
|
fairly simple: the docstring is the first \code{stmt} node in a code
|
||||||
|
block (\code{file\_input} or \code{suite} node types). A module
|
||||||
|
consists of a single \code{file\_input} node, and class and function
|
||||||
|
definitions each contain exactly one \code{suite} node. Classes and
|
||||||
|
functions are readily identified as subtrees of code block nodes which
|
||||||
|
start with \code{(stmt, (compound\_stmt, (classdef, ...} or
|
||||||
|
\code{(stmt, (compound\_stmt, (funcdef, ...}. Note that these subtrees
|
||||||
|
cannot be matched by \code{match()} since it does not support multiple
|
||||||
|
sibling nodes to match without regard to number. A more elaborate
|
||||||
|
matching function could be used to overcome this limitation, but this
|
||||||
|
is sufficient for the example.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
%%
|
||||||
|
%% end of file
|
||||||
|
|
31
Lib/AST.py
31
Lib/AST.py
|
@ -1,13 +1,13 @@
|
||||||
"""Object-oriented interface to the parser module.
|
"""Object-oriented interface to the parser module.
|
||||||
|
|
||||||
This module exports three classes which together provide an interface
|
This module exports four classes which together provide an interface
|
||||||
to the parser module. Together, the three classes represent two ways
|
to the parser module. Together, the classes represent two ways
|
||||||
to create parsed representations of Python source and the two starting
|
to create parsed representations of Python source and the two starting
|
||||||
data types (source text and tuple representations). Each class
|
data types (source text and tuple representations). Each class
|
||||||
provides interfaces which are identical other than the constructors.
|
provides interfaces which are identical other than the constructors.
|
||||||
The constructors are described in detail in the documentation for each
|
The constructors are described in detail in the documentation for each
|
||||||
class and the remaining, shared portion of the interface is documented
|
class and the remaining, shared portion of the interface is documented
|
||||||
below. Briefly, the three classes provided are:
|
below. Briefly, the classes provided are:
|
||||||
|
|
||||||
AST
|
AST
|
||||||
Defines the primary interface to the AST objects and supports creation
|
Defines the primary interface to the AST objects and supports creation
|
||||||
|
@ -23,6 +23,9 @@ FileSuiteAST
|
||||||
Convenience subclass of the `SuiteAST' class; loads source text of the
|
Convenience subclass of the `SuiteAST' class; loads source text of the
|
||||||
suite from an external file.
|
suite from an external file.
|
||||||
|
|
||||||
|
Common Methods
|
||||||
|
--------------
|
||||||
|
|
||||||
Aside from the constructors, several methods are provided to allow
|
Aside from the constructors, several methods are provided to allow
|
||||||
access to the various interpretations of the parse tree and to check
|
access to the various interpretations of the parse tree and to check
|
||||||
conditions of the construct represented by the parse tree.
|
conditions of the construct represented by the parse tree.
|
||||||
|
@ -68,8 +71,8 @@ class AST:
|
||||||
This base class provides all of the query methods for subclass
|
This base class provides all of the query methods for subclass
|
||||||
objects defined in this module.
|
objects defined in this module.
|
||||||
"""
|
"""
|
||||||
_p = __import__('parser') # import internally to avoid
|
import parser # import internally to avoid
|
||||||
# namespace pollution at the
|
_p = parser # namespace pollution at the
|
||||||
# top level
|
# top level
|
||||||
_text = None
|
_text = None
|
||||||
_code = None
|
_code = None
|
||||||
|
@ -84,7 +87,8 @@ class AST:
|
||||||
The tuple tree to convert.
|
The tuple tree to convert.
|
||||||
|
|
||||||
The tuple-tree may represent either an expression or a suite; the
|
The tuple-tree may represent either an expression or a suite; the
|
||||||
type will be determined automatically.
|
type will be determined automatically. Line number information may
|
||||||
|
optionally be present for any subset of the terminal tokens.
|
||||||
"""
|
"""
|
||||||
if type(tuple) is not type(()):
|
if type(tuple) is not type(()):
|
||||||
raise TypeError, 'Base AST class requires tuple parameter.'
|
raise TypeError, 'Base AST class requires tuple parameter.'
|
||||||
|
@ -93,11 +97,24 @@ class AST:
|
||||||
self._ast = self._p.tuple2ast(tuple)
|
self._ast = self._p.tuple2ast(tuple)
|
||||||
self._type = (self._p.isexpr(self._ast) and 'expression') or 'suite'
|
self._type = (self._p.isexpr(self._ast) and 'expression') or 'suite'
|
||||||
|
|
||||||
def tuple(self):
|
def list(self, line_info = 0):
|
||||||
|
"""Returns a fresh list representing the parse tree.
|
||||||
|
|
||||||
|
line_info
|
||||||
|
If true, includes line number information for terminal tokens in
|
||||||
|
the output data structure,
|
||||||
|
"""
|
||||||
|
return self._p.ast2list(self._ast, line_info)
|
||||||
|
|
||||||
|
def tuple(self, line_info = 0):
|
||||||
"""Returns the tuple representing the parse tree.
|
"""Returns the tuple representing the parse tree.
|
||||||
|
|
||||||
|
line_info
|
||||||
|
If true, includes line number information for terminal tokens in
|
||||||
|
the output data structure,
|
||||||
"""
|
"""
|
||||||
if self._tupl is None:
|
if self._tupl is None:
|
||||||
self._tupl = self._p.ast2tuple(self._ast)
|
self._tupl = self._p.ast2tuple(self._ast, line_info)
|
||||||
return self._tupl
|
return self._tupl
|
||||||
|
|
||||||
def code(self):
|
def code(self):
|
||||||
|
|
|
@ -1,5 +1,18 @@
|
||||||
# Non-terminal symbols of Python grammar (from "graminit.h")
|
#! /usr/bin/env python
|
||||||
|
#
|
||||||
|
# Non-terminal symbols of Python grammar (from "graminit.h")
|
||||||
|
#
|
||||||
|
# This file is automatically generated; please don't muck it up!
|
||||||
|
#
|
||||||
|
# To update the symbols in this file, 'cd' to the top directory of
|
||||||
|
# the python source tree after building the interpreter and run:
|
||||||
|
#
|
||||||
|
# PYTHONPATH=Lib:Modules ./python Lib/symbol.py
|
||||||
|
#
|
||||||
|
# (this path allows the import of string.py, token.py, and regexmodule.so
|
||||||
|
# for a site with no installation in place)
|
||||||
|
|
||||||
|
#--start constants--
|
||||||
single_input = 256
|
single_input = 256
|
||||||
file_input = 257
|
file_input = 257
|
||||||
eval_input = 258
|
eval_input = 258
|
||||||
|
@ -23,39 +36,40 @@ raise_stmt = 275
|
||||||
import_stmt = 276
|
import_stmt = 276
|
||||||
dotted_name = 277
|
dotted_name = 277
|
||||||
global_stmt = 278
|
global_stmt = 278
|
||||||
access_stmt = 279
|
exec_stmt = 279
|
||||||
accesstype = 280
|
compound_stmt = 280
|
||||||
exec_stmt = 281
|
if_stmt = 281
|
||||||
compound_stmt = 282
|
while_stmt = 282
|
||||||
if_stmt = 283
|
for_stmt = 283
|
||||||
while_stmt = 284
|
try_stmt = 284
|
||||||
for_stmt = 285
|
except_clause = 285
|
||||||
try_stmt = 286
|
suite = 286
|
||||||
except_clause = 287
|
test = 287
|
||||||
suite = 288
|
and_test = 288
|
||||||
test = 289
|
not_test = 289
|
||||||
and_test = 290
|
comparison = 290
|
||||||
not_test = 291
|
comp_op = 291
|
||||||
comparison = 292
|
expr = 292
|
||||||
comp_op = 293
|
xor_expr = 293
|
||||||
expr = 294
|
and_expr = 294
|
||||||
xor_expr = 295
|
shift_expr = 295
|
||||||
and_expr = 296
|
arith_expr = 296
|
||||||
shift_expr = 297
|
term = 297
|
||||||
arith_expr = 298
|
factor = 298
|
||||||
term = 299
|
power = 299
|
||||||
factor = 300
|
atom = 300
|
||||||
power = 301
|
lambdef = 301
|
||||||
atom = 302
|
trailer = 302
|
||||||
lambdef = 303
|
subscriptlist = 303
|
||||||
trailer = 304
|
subscript = 304
|
||||||
subscript = 305
|
sliceop = 305
|
||||||
exprlist = 306
|
exprlist = 306
|
||||||
testlist = 307
|
testlist = 307
|
||||||
dictmaker = 308
|
dictmaker = 308
|
||||||
classdef = 309
|
classdef = 309
|
||||||
arglist = 310
|
arglist = 310
|
||||||
argument = 311
|
argument = 311
|
||||||
|
#--end constants--
|
||||||
|
|
||||||
names = dir()
|
names = dir()
|
||||||
sym_name = {}
|
sym_name = {}
|
||||||
|
@ -63,3 +77,17 @@ for name in names:
|
||||||
number = eval(name)
|
number = eval(name)
|
||||||
if type(number) is type(0):
|
if type(number) is type(0):
|
||||||
sym_name[number] = name
|
sym_name[number] = name
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
import sys
|
||||||
|
import token
|
||||||
|
if len(sys.argv) == 1:
|
||||||
|
sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"]
|
||||||
|
token.main()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
||||||
|
#
|
||||||
|
# end of file
|
||||||
|
|
79
Lib/token.py
79
Lib/token.py
|
@ -1,5 +1,18 @@
|
||||||
# Tokens (from "token.h")
|
#! /usr/bin/env python
|
||||||
|
#
|
||||||
|
# Tokens (from "token.h")
|
||||||
|
#
|
||||||
|
# This file is automatically generated; please don't muck it up!
|
||||||
|
#
|
||||||
|
# To update the symbols in this file, 'cd' to the top directory of
|
||||||
|
# the python source tree after building the interpreter and run:
|
||||||
|
#
|
||||||
|
# PYTHONPATH=./Lib ./python Lib/token.py
|
||||||
|
#
|
||||||
|
# (this path allows the import of string.py and regexmodule.so
|
||||||
|
# for a site with no installation in place)
|
||||||
|
|
||||||
|
#--start constants--
|
||||||
ENDMARKER = 0
|
ENDMARKER = 0
|
||||||
NAME = 1
|
NAME = 1
|
||||||
NUMBER = 2
|
NUMBER = 2
|
||||||
|
@ -39,6 +52,9 @@ RIGHTSHIFT = 35
|
||||||
DOUBLESTAR = 36
|
DOUBLESTAR = 36
|
||||||
OP = 37
|
OP = 37
|
||||||
ERRORTOKEN = 38
|
ERRORTOKEN = 38
|
||||||
|
N_TOKENS = 39
|
||||||
|
NT_OFFSET = 256
|
||||||
|
#--end constants--
|
||||||
|
|
||||||
names = dir()
|
names = dir()
|
||||||
tok_name = {}
|
tok_name = {}
|
||||||
|
@ -47,9 +63,6 @@ for name in names:
|
||||||
if type(number) is type(0):
|
if type(number) is type(0):
|
||||||
tok_name[number] = name
|
tok_name[number] = name
|
||||||
|
|
||||||
N_TOKENS = 39 # Number of tokens including ERRORTOKEN
|
|
||||||
NT_OFFSET = 256 # Start of non-terminal symbols
|
|
||||||
|
|
||||||
def ISTERMINAL(x):
|
def ISTERMINAL(x):
|
||||||
return x < NT_OFFSET
|
return x < NT_OFFSET
|
||||||
|
|
||||||
|
@ -58,3 +71,61 @@ def ISNONTERMINAL(x):
|
||||||
|
|
||||||
def ISEOF(x):
|
def ISEOF(x):
|
||||||
return x == ENDMARKER
|
return x == ENDMARKER
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
import regex
|
||||||
|
import string
|
||||||
|
import sys
|
||||||
|
args = sys.argv[1:]
|
||||||
|
inFileName = args and args[0] or "Include/token.h"
|
||||||
|
outFileName = "Lib/token.py"
|
||||||
|
if len(args) > 1:
|
||||||
|
outFileName = args[1]
|
||||||
|
try:
|
||||||
|
fp = open(inFileName)
|
||||||
|
except IOError, err:
|
||||||
|
sys.stdout.write("I/O error: %s\n" % str(err))
|
||||||
|
sys.exit(1)
|
||||||
|
lines = string.splitfields(fp.read(), "\n")
|
||||||
|
fp.close()
|
||||||
|
re = regex.compile(
|
||||||
|
"#define[ \t][ \t]*\([A-Z][A-Z_]*\)[ \t][ \t]*\([0-9][0-9]*\)",
|
||||||
|
regex.casefold)
|
||||||
|
tokens = {}
|
||||||
|
for line in lines:
|
||||||
|
if re.match(line) > -1:
|
||||||
|
name, val = re.group(1, 2)
|
||||||
|
val = string.atoi(val)
|
||||||
|
tokens[val] = name # reverse so we can sort them...
|
||||||
|
keys = tokens.keys()
|
||||||
|
keys.sort()
|
||||||
|
# load the output skeleton from the target:
|
||||||
|
try:
|
||||||
|
fp = open(outFileName)
|
||||||
|
except IOError, err:
|
||||||
|
sys.stderr.write("I/O error: %s\n" % str(err))
|
||||||
|
sys.exit(2)
|
||||||
|
format = string.splitfields(fp.read(), "\n")
|
||||||
|
fp.close()
|
||||||
|
try:
|
||||||
|
start = format.index("#--start constants--") + 1
|
||||||
|
end = format.index("#--end constants--")
|
||||||
|
except ValueError:
|
||||||
|
sys.stderr.write("target does not contain format markers")
|
||||||
|
sys.exit(3)
|
||||||
|
lines = []
|
||||||
|
for val in keys:
|
||||||
|
lines.append("%s = %d" % (tokens[val], val))
|
||||||
|
format[start:end] = lines
|
||||||
|
try:
|
||||||
|
fp = open(outFileName, 'w')
|
||||||
|
except IOError, err:
|
||||||
|
sys.stderr.write("I/O error: %s\n" % str(err))
|
||||||
|
sys.exit(4)
|
||||||
|
fp.write(string.joinfields(format, "\n"))
|
||||||
|
fp.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue