"""
 | 
						|
Try to detect suspicious constructs, resembling markup
 | 
						|
that has leaked into the final output.
 | 
						|
 | 
						|
Suspicious lines are reported in a comma-separated-file,
 | 
						|
``suspicious.csv``, located in the output directory.
 | 
						|
 | 
						|
The file is utf-8 encoded, and each line contains four fields:
 | 
						|
 | 
						|
 * document name (normalized)
 | 
						|
 * line number in the source document
 | 
						|
 * problematic text
 | 
						|
 * complete line showing the problematic text in context
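
For illustration only, a hypothetical entry (the document name, line
number and text below are made up) might look like::

 library/os,42,::,Run the command below::and check its output.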

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``ignored.csv`` file
(located in the configuration directory). The file has the same
format as ``suspicious.csv`` with a few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).
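
For instance, a hypothetical rule that would ignore the entry shown
earlier anywhere in that document could look like this (note the empty
line number and the shortened last field)::

 library/os,,::,command below::and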

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder

detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
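
# A few made-up examples of text the pattern above is meant to flag, and of
# text it should leave alone (illustrative strings, not taken from the docs):
#   "see the notes below::here"  -> flags "::" leaking into the output
#   "call :meth: on the object"  -> flags ":meth" (an unprocessed role)
#   "a stray ` backtick"         -> flags "`"
#   ".. note: check this"        -> flags ".. note:" (leaked directive text)
#   "symbol ::= expression"      -> not flagged (grammar notation)
#   "... else:"                  -> not flagged (ellipsis before a keyword)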

py3 = sys.version_info >= (3, 0)


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname # document to which this rule applies
        self.lineno = lineno   # line number in the original source;
                               # this rule matches only near that.
                               # None -> don't care
        self.issue = issue     # the markup fragment that triggered this rule
        self.line = line       # text of the container element (single line only)
        self.used = False

    def __repr__(self):
        return '{0.docname},,{0.issue},{0.line}'.format(self)


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'
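    # Assuming this module is loaded as a Sphinx extension, the builder is
    # selected by the name above; an illustrative invocation (the paths are
    # made up) would be: sphinx-build -b suspicious Doc build/suspicious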

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with Levenshtein distance could work,
            # but that means bringing in other libraries...
            # Ok, relax that requirement: just check whether the rule
            # fragment is contained in the document line.
            if rule.line not in line: continue
            # Check both line numbers. If they're "near",
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.warn('[%s:%d] "%s" found in "%-.120s"' %
                      (self.docname, lineno, issue, text))
        else:
            self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(),'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(),'replace'),
                text.strip().encode(sys.getdefaultencoding(),'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        if py3:
            f = open(self.log_file_name, 'a')
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname, lineno, issue, text.strip()])
            f.close()
        else:
            f = open(self.log_file_name, 'ab')
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname.encode('utf-8'),
                             lineno,
                             issue.encode('utf-8'),
                             text.strip().encode('utf-8')])
            f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r')
            else:
                f = open(filename, 'rb')
        except IOError:
            return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            if lineno:
                lineno = int(lineno)
            else:
                lineno = None
            if not py3:
                docname = docname.decode('utf-8')
                issue = issue.decode('utf-8')
                text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    """text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...   print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode