Rewritten using the tokenize module, which gives us a real tokenizer
rather than a number of approximating regular expressions.
Alas, it is 3-4 times slower.  Let that be a challenge for the
tokenize module.
This commit is contained in:
Guido van Rossum 2002-08-23 01:36:01 +00:00
parent fd372aa8e9
commit 040d7ca498

View file

@ -4,10 +4,11 @@ Parse enough of a Python file to recognize class and method
definitions and to find out the superclasses of a class. definitions and to find out the superclasses of a class.
The interface consists of a single function: The interface consists of a single function:
readmodule(module, path) readmodule_ex(module [, path[, inpackage]])
module is the name of a Python module, path is an optional list of module is the name of a Python module, path is an optional list of
directories where the module is to be searched. If present, path is directories where the module is to be searched. If present, path is
prepended to the system search path sys.path. prepended to the system search path sys.path. (inpackage is used
internally to search for a submodule of a package.)
The return value is a dictionary. The keys of the dictionary are The return value is a dictionary. The keys of the dictionary are
the names of the classes defined in the module (including classes the names of the classes defined in the module (including classes
that are defined via the from XXX import YYY construct). The values that are defined via the from XXX import YYY construct). The values
@ -28,12 +29,10 @@ string giving the name of the super class. Since import statements
are recognized and imported modules are scanned as well, this are recognized and imported modules are scanned as well, this
shouldn't happen often. shouldn't happen often.
XXX describe the Function class.
BUGS BUGS
- Continuation lines are not dealt with at all, except inside strings.
- Nested classes and functions can confuse it. - Nested classes and functions can confuse it.
- Code that doesn't pass tabnanny or python -t will confuse it, unless
you set the module TABWIDTH vrbl (default 8) to the correct tab width
for the file.
PACKAGE RELATED BUGS PACKAGE RELATED BUGS
- If you have a package and a module inside that or another package - If you have a package and a module inside that or another package
@ -52,69 +51,11 @@ PACKAGE RELATED BUGS
import sys import sys
import imp import imp
import re import tokenize # Python tokenizer
import string from token import NAME
__all__ = ["readmodule"] __all__ = ["readmodule"]
TABWIDTH = 8
_getnext = re.compile(r"""
(?P<String>
\""" [^"\\]* (?:
(?: \\. | "(?!"") )
[^"\\]*
)*
\"""
| ''' [^'\\]* (?:
(?: \\. | '(?!'') )
[^'\\]*
)*
'''
| " [^"\\\n]* (?: \\. [^"\\\n]*)* "
| ' [^'\\\n]* (?: \\. [^'\\\n]*)* '
)
| (?P<Method>
^
(?P<MethodIndent> [ \t]* )
def [ \t]+
(?P<MethodName> [a-zA-Z_] \w* )
[ \t]* \(
)
| (?P<Class>
^
(?P<ClassIndent> [ \t]* )
class [ \t]+
(?P<ClassName> [a-zA-Z_] \w* )
[ \t]*
(?P<ClassSupers> \( [^)\n]* \) )?
[ \t]* :
)
| (?P<Import>
^ import [ \t]+
(?P<ImportList> [^#;\n]+ )
)
| (?P<ImportFrom>
^ from [ \t]+
(?P<ImportFromPath>
[a-zA-Z_] \w*
(?:
[ \t]* \. [ \t]* [a-zA-Z_] \w*
)*
)
[ \t]+
import [ \t]+
(?P<ImportFromList> [^#;\n]+ )
)
""", re.VERBOSE | re.DOTALL | re.MULTILINE).search
_modules = {} # cache of modules we've seen _modules = {} # cache of modules we've seen
# each Python class is represented by an instance of this class # each Python class is represented by an instance of this class
@ -140,7 +81,7 @@ class Function(Class):
def _addmethod(self, name, lineno): def _addmethod(self, name, lineno):
assert 0, "Function._addmethod() shouldn't be called" assert 0, "Function._addmethod() shouldn't be called"
def readmodule(module, path=[], inpackage=0): def readmodule(module, path=[], inpackage=False):
'''Backwards compatible interface. '''Backwards compatible interface.
Like readmodule_ex() but strips Function objects from the Like readmodule_ex() but strips Function objects from the
@ -153,7 +94,7 @@ def readmodule(module, path=[], inpackage=0):
res[key] = value res[key] = value
return res return res
def readmodule_ex(module, path=[], inpackage=0): def readmodule_ex(module, path=[], inpackage=False):
'''Read a module file and return a dictionary of classes. '''Read a module file and return a dictionary of classes.
Search for MODULE in PATH and sys.path, read and parse the Search for MODULE in PATH and sys.path, read and parse the
@ -168,7 +109,7 @@ def readmodule_ex(module, path=[], inpackage=0):
package = module[:i].strip() package = module[:i].strip()
submodule = module[i+1:].strip() submodule = module[i+1:].strip()
parent = readmodule_ex(package, path, inpackage) parent = readmodule_ex(package, path, inpackage)
child = readmodule_ex(submodule, parent['__path__'], 1) child = readmodule_ex(submodule, parent['__path__'], True)
return child return child
if module in _modules: if module in _modules:
@ -204,129 +145,144 @@ def readmodule_ex(module, path=[], inpackage=0):
_modules[module] = dict _modules[module] = dict
classstack = [] # stack of (class, indent) pairs classstack = [] # stack of (class, indent) pairs
src = f.read()
f.close()
# To avoid having to stop the regexp at each newline, instead g = tokenize.generate_tokens(f.readline)
# when we need a line number we simply count the number of try:
# newlines in the string since the last time we did this; i.e., for tokentype, token, start, end, line in g:
# lineno += src.count('\n', last_lineno_pos, here) if token == 'def':
# last_lineno_pos = here lineno, thisindent = start
lineno, last_lineno_pos = 1, 0 tokentype, meth_name, start, end, line = g.next()
i = 0 if tokentype != NAME:
while 1: continue # Syntax error
m = _getnext(src, i) # close all classes indented at least as much
if not m: while classstack and \
break classstack[-1][1] >= thisindent:
start, i = m.span() del classstack[-1]
if classstack:
if m.start("Method") >= 0: # it's a class method
# found a method definition or function cur_class = classstack[-1][0]
thisindent = _indent(m.group("MethodIndent")) cur_class._addmethod(meth_name, lineno)
meth_name = m.group("MethodName") else:
lineno += src.count('\n', last_lineno_pos, start) # it's a function
last_lineno_pos = start dict[meth_name] = Function(module, meth_name, file, lineno)
# close all classes indented at least as much elif token == 'class':
while classstack and \ lineno, thisindent = start
classstack[-1][1] >= thisindent: tokentype, class_name, start, end, line = g.next()
del classstack[-1] if tokentype != NAME:
if classstack: continue # Syntax error
# it's a class method # close all classes indented at least as much
cur_class = classstack[-1][0] while classstack and \
cur_class._addmethod(meth_name, lineno) classstack[-1][1] >= thisindent:
else: del classstack[-1]
# it's a function # parse what follows the class name
f = Function(module, meth_name, tokentype, token, start, end, line = g.next()
file, lineno) inherit = None
dict[meth_name] = f if token == '(':
names = [] # List of superclasses
elif m.start("String") >= 0: # there's a list of superclasses
pass level = 1
super = [] # Tokens making up current superclass
elif m.start("Class") >= 0: while True:
# we found a class definition tokentype, token, start, end, line = g.next()
thisindent = _indent(m.group("ClassIndent")) if token in (')', ',') and level == 1:
# close all classes indented at least as much n = "".join(super)
while classstack and \ if n in dict:
classstack[-1][1] >= thisindent: # we know this super class
del classstack[-1] n = dict[n]
lineno += src.count('\n', last_lineno_pos, start) else:
last_lineno_pos = start c = n.split('.')
class_name = m.group("ClassName") if len(c) > 1:
inherit = m.group("ClassSupers") # super class is of the form
if inherit: # module.class: look in module for
# the class inherits from other classes # class
inherit = inherit[1:-1].strip() m = c[-2]
names = [] c = c[-1]
for n in inherit.split(','): if m in _modules:
n = n.strip() d = _modules[m]
if n in dict: if c in d:
# we know this super class n = d[c]
n = dict[n] names.append(n)
else: if token == '(':
c = n.split('.') level += 1
if len(c) > 1: elif token == ')':
# super class level -= 1
# is of the if level == 0:
# form module.class: break
# look in elif token == ',' and level == 1:
# module for class pass
m = c[-2] else:
c = c[-1] super.append(token)
if m in _modules: inherit = names
d = _modules[m] cur_class = Class(module, class_name, inherit, file, lineno)
if c in d: dict[class_name] = cur_class
n = d[c] classstack.append((cur_class, thisindent))
names.append(n) elif token == 'import' and start[1] == 0:
inherit = names modules = _getnamelist(g)
# remember this class for mod, mod2 in modules:
cur_class = Class(module, class_name, inherit, readmodule_ex(mod, path, inpackage)
file, lineno) elif token == 'from' and start[1] == 0:
dict[class_name] = cur_class mod, token = _getname(g)
classstack.append((cur_class, thisindent)) if not mod or token != "import":
continue
elif m.start("Import") >= 0: names = _getnamelist(g)
# import module
for n in m.group("ImportList").split(','):
n = n.strip()
try: try:
# recursively read the imported module # recursively read the imported module
d = readmodule_ex(n, path, inpackage) d = readmodule_ex(mod, path, inpackage)
except: except:
##print 'module', n, 'not found' continue
pass # add any classes that were defined in the imported module
# to our name space if they were mentioned in the list
elif m.start("ImportFrom") >= 0: for n, n2 in names:
# from module import stuff if n in d:
mod = m.group("ImportFromPath") dict[n2 or n] = d[n]
names = m.group("ImportFromList").split(',') elif n == '*':
try: # only add a name if not already there (to mimic
# recursively read the imported module # what Python does internally) also don't add
d = readmodule_ex(mod, path, inpackage) # names that start with _
except: for n in d:
##print 'module', mod, 'not found' if n[0] != '_' and not n in dict:
continue dict[n] = d[n]
# add any classes that were defined in the except StopIteration:
# imported module to our name space if they pass
# were mentioned in the list
for n in names:
n = n.strip()
if n in d:
dict[n] = d[n]
elif n == '*':
# only add a name if not
# already there (to mimic what
# Python does internally)
# also don't add names that
# start with _
for n in d:
if n[0] != '_' and \
not n in dict:
dict[n] = d[n]
else:
assert 0, "regexp _getnext found something unexpected"
f.close()
return dict return dict
def _indent(ws, _expandtabs=string.expandtabs): def _getnamelist(g):
return len(_expandtabs(ws, TABWIDTH)) # Helper to get a comma-separated list of dotted names plus 'as'
# clauses. Return a list of pairs (name, name2) where name2 is
# the 'as' name, or None if there is no 'as' clause.
names = []
while True:
name, token = _getname(g)
if not name:
break
if token == 'as':
name2, token = _getname(g)
else:
name2 = None
names.append((name, name2))
while token != "," and "\n" not in token:
tokentype, token, start, end, line = g.next()
if token != ",":
break
return names
def _getname(g):
# Helper to get a dotted name, return a pair (name, token) where
# name is the dotted name, or None if there was no dotted name,
# and token is the next input token.
parts = []
tokentype, token, start, end, line = g.next()
if tokentype != NAME and token != '*':
return (None, token)
parts.append(token)
while True:
tokentype, token, start, end, line = g.next()
if token != '.':
break
tokentype, token, start, end, line = g.next()
if tokentype != NAME:
break
parts.append(token)
return (".".join(parts), token)