Rewritten using the tokenize module, which gives us a real tokenizer
rather than a number of approximating regular expressions. Alas, it is
3-4 times slower. Let that be a challenge for the tokenize module.
2025-10-12 18:02:39 +00:00 · 2002-08-23 01:36:01 +00:00 · 2002-08-23 01:36:01 +00:00 · 040d7ca498
commit 040d7ca498
parent fd372aa8e9
1 changed files with 144 additions and 188 deletions
--- a/Lib/pyclbr.py
+++ b/Lib/pyclbr.py
@ -4,10 +4,11 @@ Parse enough of a Python file to recognize class and method
 definitions and to find out the superclasses of a class.
 The interface consists of a single function:
-        readmodule(module, path)
+        readmodule_ex(module [, path[, inpackage]])
 module is the name of a Python module, path is an optional list of
 directories where the module is to be searched.  If present, path is
-prepended to the system search path sys.path.
+prepended to the system search path sys.path.  (inpackage is used
 internally to search for a submodule of a package.)
 The return value is a dictionary.  The keys of the dictionary are
 the names of the classes defined in the module (including classes
 that are defined via the from XXX import YYY construct).  The values
@ -28,12 +29,10 @@ string giving the name of the super class.  Since import statements
 are recognized and imported modules are scanned as well, this
 shouldn't happen often.
 XXX describe the Function class.
 BUGS
 - Continuation lines are not dealt with at all, except inside strings.
 - Nested classes and functions can confuse it.
 - Code that doesn't pass tabnanny or python -t will confuse it, unless
  you set the module TABWIDTH vrbl (default 8) to the correct tab width
  for the file.
 PACKAGE RELATED BUGS
 - If you have a package and a module inside that or another package
@ -52,69 +51,11 @@ PACKAGE RELATED BUGS
 import sys
 import imp
-import re
+import tokenize # Python tokenizer
-import string
+from token import NAME
 __all__ = ["readmodule"]
 TABWIDTH = 8
 _getnext = re.compile(r"""
    (?P<String>
       \""" [^"\\]* (?:
                        (?: \\. | "(?!"") )
                        [^"\\]*
                    )*
       \"""
    |   ''' [^'\\]* (?:
                        (?: \\. | '(?!'') )
                        [^'\\]*
                    )*
        '''
    |   " [^"\\\n]* (?: \\. [^"\\\n]*)* "
    |   ' [^'\\\n]* (?: \\. [^'\\\n]*)* '
    )
 |   (?P<Method>
        ^
        (?P<MethodIndent> [ \t]* )
        def [ \t]+
        (?P<MethodName> [a-zA-Z_] \w* )
        [ \t]* \(
    )
 |   (?P<Class>
        ^
        (?P<ClassIndent> [ \t]* )
        class [ \t]+
        (?P<ClassName> [a-zA-Z_] \w* )
        [ \t]*
        (?P<ClassSupers> \( [^)\n]* \) )?
        [ \t]* :
    )
 |   (?P<Import>
        ^ import [ \t]+
        (?P<ImportList> [^#;\n]+ )
    )
 |   (?P<ImportFrom>
        ^ from [ \t]+
        (?P<ImportFromPath>
            [a-zA-Z_] \w*
            (?:
                [ \t]* \. [ \t]* [a-zA-Z_] \w*
            )*
        )
        [ \t]+
        import [ \t]+
        (?P<ImportFromList> [^#;\n]+ )
    )
 """, re.VERBOSE | re.DOTALL | re.MULTILINE).search
 _modules = {}                           # cache of modules we've seen
 # each Python class is represented by an instance of this class
@ -140,7 +81,7 @@ class Function(Class):
    def _addmethod(self, name, lineno):
        assert 0, "Function._addmethod() shouldn't be called"
-def readmodule(module, path=[], inpackage=0):
+def readmodule(module, path=[], inpackage=False):
    '''Backwards compatible interface.
    Like readmodule_ex() but strips Function objects from the
@ -153,7 +94,7 @@ def readmodule(module, path=[], inpackage=0):
            res[key] = value
    return res
-def readmodule_ex(module, path=[], inpackage=0):
+def readmodule_ex(module, path=[], inpackage=False):
    '''Read a module file and return a dictionary of classes.
    Search for MODULE in PATH and sys.path, read and parse the
@ -168,7 +109,7 @@ def readmodule_ex(module, path=[], inpackage=0):
        package = module[:i].strip()
        submodule = module[i+1:].strip()
        parent = readmodule_ex(package, path, inpackage)
-        child = readmodule_ex(submodule, parent['__path__'], 1)
+        child = readmodule_ex(submodule, parent['__path__'], True)
        return child
    if module in _modules:
@ -204,129 +145,144 @@ def readmodule_ex(module, path=[], inpackage=0):
    _modules[module] = dict
    classstack = [] # stack of (class, indent) pairs
    src = f.read()
    f.close()
-    # To avoid having to stop the regexp at each newline, instead
+    g = tokenize.generate_tokens(f.readline)
-    # when we need a line number we simply count the number of
+    try:
-    # newlines in the string since the last time we did this; i.e.,
+        for tokentype, token, start, end, line in g:
-    #    lineno += src.count('\n', last_lineno_pos, here)
+            if token == 'def':
-    #    last_lineno_pos = here
+                lineno, thisindent = start
-    lineno, last_lineno_pos = 1, 0
+                tokentype, meth_name, start, end, line = g.next()
-    i = 0
+                if tokentype != NAME:
-    while 1:
+                    continue # Syntax error
-        m = _getnext(src, i)
+                # close all classes indented at least as much
-        if not m:
+                while classstack and \
-            break
+                      classstack[-1][1] >= thisindent:
-        start, i = m.span()
+                    del classstack[-1]
-
+                if classstack:
-        if m.start("Method") >= 0:
+                    # it's a class method
-            # found a method definition or function
+                    cur_class = classstack[-1][0]
-            thisindent = _indent(m.group("MethodIndent"))
+                    cur_class._addmethod(meth_name, lineno)
-            meth_name = m.group("MethodName")
+                else:
-            lineno += src.count('\n', last_lineno_pos, start)
+                    # it's a function
-            last_lineno_pos = start
+                    dict[meth_name] = Function(module, meth_name, file, lineno)
-            # close all classes indented at least as much
+            elif token == 'class':
-            while classstack and \
+                lineno, thisindent = start
-                  classstack[-1][1] >= thisindent:
+                tokentype, class_name, start, end, line = g.next()
-                del classstack[-1]
+                if tokentype != NAME:
-            if classstack:
+                    continue # Syntax error
-                # it's a class method
+                # close all classes indented at least as much
-                cur_class = classstack[-1][0]
+                while classstack and \
-                cur_class._addmethod(meth_name, lineno)
+                      classstack[-1][1] >= thisindent:
-            else:
+                    del classstack[-1]
-                # it's a function
+                # parse what follows the class name
-                f = Function(module, meth_name,
+                tokentype, token, start, end, line = g.next()
-                             file, lineno)
+                inherit = None
-                dict[meth_name] = f
+                if token == '(':
-
+                    names = [] # List of superclasses
-        elif m.start("String") >= 0:
+                    # there's a list of superclasses
-            pass
+                    level = 1
-
+                    super = [] # Tokens making up current superclass
-        elif m.start("Class") >= 0:
+                    while True:
-            # we found a class definition
+                        tokentype, token, start, end, line = g.next()
-            thisindent = _indent(m.group("ClassIndent"))
+                        if token in (')', ',') and level == 1:
-            # close all classes indented at least as much
+                            n = "".join(super)
-            while classstack and \
+                            if n in dict:
-                  classstack[-1][1] >= thisindent:
+                                # we know this super class
-                del classstack[-1]
+                                n = dict[n]
-            lineno += src.count('\n', last_lineno_pos, start)
+                            else:
-            last_lineno_pos = start
+                                c = n.split('.')
-            class_name = m.group("ClassName")
+                                if len(c) > 1:
-            inherit = m.group("ClassSupers")
+                                    # super class is of the form
-            if inherit:
+                                    # module.class: look in module for
-                # the class inherits from other classes
+                                    # class
-                inherit = inherit[1:-1].strip()
+                                    m = c[-2]
-                names = []
+                                    c = c[-1]
-                for n in inherit.split(','):
+                                    if m in _modules:
-                    n = n.strip()
+                                        d = _modules[m]
-                    if n in dict:
+                                        if c in d:
-                        # we know this super class
+                                            n = d[c]
-                        n = dict[n]
+                            names.append(n)
-                    else:
+                        if token == '(':
-                        c = n.split('.')
+                            level += 1
-                        if len(c) > 1:
+                        elif token == ')':
-                            # super class
+                            level -= 1
-                            # is of the
+                            if level == 0:
-                            # form module.class:
+                                break
-                            # look in
+                        elif token == ',' and level == 1:
-                            # module for class
+                            pass
-                            m = c[-2]
+                        else:
-                            c = c[-1]
+                            super.append(token)
-                            if m in _modules:
+                    inherit = names
-                                d = _modules[m]
+                cur_class = Class(module, class_name, inherit, file, lineno)
-                                if c in d:
+                dict[class_name] = cur_class
-                                    n = d[c]
+                classstack.append((cur_class, thisindent))
-                    names.append(n)
+            elif token == 'import' and start[1] == 0:
-                inherit = names
+                modules = _getnamelist(g)
-            # remember this class
+                for mod, mod2 in modules:
-            cur_class = Class(module, class_name, inherit,
+                    readmodule_ex(mod, path, inpackage)
-                              file, lineno)
+            elif token == 'from' and start[1] == 0:
-            dict[class_name] = cur_class
+                mod, token = _getname(g)
-            classstack.append((cur_class, thisindent))
+                if not mod or token != "import":
-
+                    continue
-        elif m.start("Import") >= 0:
+                names = _getnamelist(g)
            # import module
            for n in m.group("ImportList").split(','):
                n = n.strip()
                try:
                    # recursively read the imported module
-                    d = readmodule_ex(n, path, inpackage)
+                    d = readmodule_ex(mod, path, inpackage)
                except:
-                    ##print 'module', n, 'not found'
+                    continue
-                    pass
+                # add any classes that were defined in the imported module
-
+                # to our name space if they were mentioned in the list
-        elif m.start("ImportFrom") >= 0:
+                for n, n2 in names:
-            # from module import stuff
+                    if n in d:
-            mod = m.group("ImportFromPath")
+                        dict[n2 or n] = d[n]
-            names = m.group("ImportFromList").split(',')
+                    elif n == '*':
-            try:
+                        # only add a name if not already there (to mimic
-                # recursively read the imported module
+                        # what Python does internally) also don't add
-                d = readmodule_ex(mod, path, inpackage)
+                        # names that start with _
-            except:
+                        for n in d:
-                ##print 'module', mod, 'not found'
+                            if n[0] != '_' and not n in dict:
-                continue
+                                dict[n] = d[n]
-            # add any classes that were defined in the
+    except StopIteration:
-            # imported module to our name space if they
+        pass
            # were mentioned in the list
            for n in names:
                n = n.strip()
                if n in d:
                    dict[n] = d[n]
                elif n == '*':
                    # only add a name if not
                    # already there (to mimic what
                    # Python does internally)
                    # also don't add names that
                    # start with _
                    for n in d:
                        if n[0] != '_' and \
                           not n in dict:
                            dict[n] = d[n]
        else:
            assert 0, "regexp _getnext found something unexpected"
    f.close()
    return dict
-def _indent(ws, _expandtabs=string.expandtabs):
+def _getnamelist(g):
-    return len(_expandtabs(ws, TABWIDTH))
+    # Helper to get a comma-separated list of dotted names plus 'as'
    # clauses.  Return a list of pairs (name, name2) where name2 is
    # the 'as' name, or None if there is no 'as' clause.
    names = []
    while True:
        name, token = _getname(g)
        if not name:
            break
        if token == 'as':
            name2, token = _getname(g)
        else:
            name2 = None
        names.append((name, name2))
        while token != "," and "\n" not in token:
            tokentype, token, start, end, line = g.next()
        if token != ",":
            break
    return names
 def _getname(g):
    # Read one dotted name (e.g. "a.b.c") from the token source g.
    #
    # g must provide a next() method returning 5-item token tuples
    # (type, string, start, end, line); only the first two items are
    # inspected here.  Whatever next() raises (e.g. StopIteration at
    # end of input) is passed on to the caller.
    #
    # Returns a pair (name, token):
    #   - name is the parts joined with '.', or None when the very
    #     first token is neither a NAME nor the '*' wildcard;
    #   - token is the string of the last token read, i.e. the one
    #     that terminated the name.
    advance = g.next
    kind, text, _start, _end, _line = advance()
    if not (kind == NAME or text == '*'):
        return (None, text)
    pieces = [text]
    while True:
        # After a name part, only a '.' keeps the dotted name going.
        kind, text, _start, _end, _line = advance()
        if text != '.':
            break
        # A '.' must be followed by another NAME; anything else ends
        # the name and is handed back as the terminating token.
        kind, text, _start, _end, _line = advance()
        if kind != NAME:
            break
        pieces.append(text)
    dotted = ".".join(pieces)
    return (dotted, text)