gh-90110: Fix the c-analyzer Tool (gh-96731)

This includes:

* updating the whitelists
* fixes so we can stop ignoring some of the files
* ensuring that Include/cpython/*.h gets analyzed
2025-08-04 00:48:58 +00:00 · 2022-09-12 11:09:31 -06:00 · 2022-09-12 11:09:31 -06:00 · 1756ffd66a
commit 1756ffd66a
parent 662782e95f
10 changed files with 499 additions and 118 deletions
--- a/Tools/c-analyzer/c_parser/__init__.py
+++ b/Tools/c-analyzer/c_parser/__init__.py
@ -22,8 +22,12 @@ def parse_files(filenames, *,
    if get_file_preprocessor is None:
        get_file_preprocessor = _get_preprocessor()
    for filename in filenames:
-        yield from _parse_file(
-                filename, match_kind, get_file_preprocessor, file_maxsizes)
+        try:
+            yield from _parse_file(
+                    filename, match_kind, get_file_preprocessor, file_maxsizes)
+        except Exception:
+            print(f'# requested file: <{filename}>')
+            raise  # re-raise


 def _parse_file(filename, match_kind, get_file_preprocessor, maxsizes):
--- a/Tools/c-analyzer/c_parser/preprocessor/__init__.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/__init__.py
@ -35,9 +35,11 @@ logger = logging.getLogger(__name__)

 def preprocess(source, *,
               incldirs=None,
+               includes=None,
               macros=None,
               samefiles=None,
               filename=None,
+               cwd=None,
               tool=True,
               ):
    """...
@ -45,17 +47,27 @@ def preprocess(source, *,
    CWD should be the project root and "source" should be relative.
    """
    if tool:
-        logger.debug(f'CWD: {os.getcwd()!r}')
-        logger.debug(f'incldirs: {incldirs!r}')
-        logger.debug(f'macros: {macros!r}')
+        if not cwd:
+            cwd = os.getcwd()
+        logger.debug(f'CWD:       {cwd!r}')
+        logger.debug(f'incldirs:  {incldirs!r}')
+        logger.debug(f'includes:  {includes!r}')
+        logger.debug(f'macros:    {macros!r}')
        logger.debug(f'samefiles: {samefiles!r}')
        _preprocess = _get_preprocessor(tool)
        with _good_file(source, filename) as source:
-            return _preprocess(source, incldirs, macros, samefiles) or ()
+            return _preprocess(
+                source,
+                incldirs,
+                includes,
+                macros,
+                samefiles,
+                cwd,
+            ) or ()
    else:
        source, filename = _resolve_source(source, filename)
        # We ignore "includes", "macros", etc.
-        return _pure.preprocess(source, filename)
+        return _pure.preprocess(source, filename, cwd)

    # if _run() returns just the lines:
 #    text = _run(source)
@ -72,6 +84,7 @@ def preprocess(source, *,

 def get_preprocessor(*,
                     file_macros=None,
+                     file_includes=None,
                     file_incldirs=None,
                     file_same=None,
                     ignore_exc=False,
@ -80,10 +93,12 @@ def get_preprocessor(*,
    _preprocess = preprocess
    if file_macros:
        file_macros = tuple(_parse_macros(file_macros))
+    if file_includes:
+        file_includes = tuple(_parse_includes(file_includes))
    if file_incldirs:
        file_incldirs = tuple(_parse_incldirs(file_incldirs))
    if file_same:
-        file_same = tuple(file_same)
+        file_same = dict(file_same or ())
    if not callable(ignore_exc):
        ignore_exc = (lambda exc, _ig=ignore_exc: _ig)

@ -91,16 +106,26 @@ def get_preprocessor(*,
        filename = filename.strip()
        if file_macros:
            macros = list(_resolve_file_values(filename, file_macros))
+        if file_includes:
+            # There's a small chance we could need to filter out any
+            # includes that import "filename".  It isn't clear that it's
+            # a problem any longer.  If we do end up filtering then
+            # it may make sense to use c_common.fsutil.match_path_tail().
+            includes = [i for i, in _resolve_file_values(filename, file_includes)]
        if file_incldirs:
            incldirs = [v for v, in _resolve_file_values(filename, file_incldirs)]
+        if file_same:
+            samefiles = _resolve_samefiles(filename, file_same)

        def preprocess(**kwargs):
            if file_macros and 'macros' not in kwargs:
                kwargs['macros'] = macros
+            if file_includes and 'includes' not in kwargs:
+                kwargs['includes'] = includes
            if file_incldirs and 'incldirs' not in kwargs:
-                kwargs['incldirs'] = [v for v, in _resolve_file_values(filename, file_incldirs)]
-            if file_same and 'file_same' not in kwargs:
-                kwargs['samefiles'] = file_same
+                kwargs['incldirs'] = incldirs
+            if file_same and 'samefiles' not in kwargs:
+                kwargs['samefiles'] = samefiles
            kwargs.setdefault('filename', filename)
            with handling_errors(ignore_exc, log_err=log_err):
                return _preprocess(filename, **kwargs)
@ -120,6 +145,11 @@ def _parse_macros(macros):
        yield row


+def _parse_includes(includes):
+    for row, srcfile in _parse_table(includes, '\t', 'glob\tinclude', default=None):
+        yield row
+
+
 def _parse_incldirs(incldirs):
    for row, srcfile in _parse_table(incldirs, '\t', 'glob\tdirname', default=None):
        glob, dirname = row
@ -130,6 +160,43 @@ def _parse_incldirs(incldirs):
        yield row


+def _resolve_samefiles(filename, file_same):
+    assert '*' not in filename, (filename,)
+    assert os.path.normpath(filename) == filename, (filename,)
+    _, suffix = os.path.splitext(filename)
+    samefiles = []
+    for patterns, in _resolve_file_values(filename, file_same.items()):
+        for pattern in patterns:
+            same = _resolve_samefile(filename, pattern, suffix)
+            if not same:
+                continue
+            samefiles.append(same)
+    return samefiles
+
+
+def _resolve_samefile(filename, pattern, suffix):
+    if pattern == filename:
+        return None
+    if pattern.endswith(os.path.sep):
+        pattern += f'*{suffix}'
+    assert os.path.normpath(pattern) == pattern, (pattern,)
+    if '*' in os.path.dirname(pattern):
+        raise NotImplementedError((filename, pattern))
+    if '*' not in os.path.basename(pattern):
+        return pattern
+
+    common = os.path.commonpath([filename, pattern])
+    relpattern = pattern[len(common) + len(os.path.sep):]
+    relpatterndir = os.path.dirname(relpattern)
+    relfile = filename[len(common) + len(os.path.sep):]
+    if os.path.basename(pattern) == '*':
+        return os.path.join(common, relpatterndir, relfile)
+    elif os.path.basename(relpattern) == '*' + suffix:
+        return os.path.join(common, relpatterndir, relfile)
+    else:
+        raise NotImplementedError((filename, pattern))
+
+
@contextlib.contextmanager
 def handling_errors(ignore_exc=None, *, log_err=None):
    try:
--- a/Tools/c-analyzer/c_parser/preprocessor/common.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/common.py
@ -44,7 +44,7 @@ def run_cmd(argv, *,
    return proc.stdout


-def preprocess(tool, filename, **kwargs):
+def preprocess(tool, filename, cwd=None, **kwargs):
    argv = _build_argv(tool, filename, **kwargs)
    logger.debug(' '.join(shlex.quote(v) for v in argv))

@ -59,19 +59,24 @@ def preprocess(tool, filename, **kwargs):
        # distutil compiler object's preprocess() method, since that
        # one writes to stdout/stderr and it's simpler to do it directly
        # through subprocess.
-        return run_cmd(argv)
+        return run_cmd(argv, cwd=cwd)


 def _build_argv(
    tool,
    filename,
    incldirs=None,
+    includes=None,
    macros=None,
    preargs=None,
    postargs=None,
    executable=None,
    compiler=None,
 ):
+    if includes:
+        includes = tuple(f'-include{i}' for i in includes)
+        postargs = (includes + postargs) if postargs else includes
+
    compiler = distutils.ccompiler.new_compiler(
        compiler=compiler or tool,
    )
--- a/Tools/c-analyzer/c_parser/preprocessor/gcc.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/gcc.py
@ -7,7 +7,12 @@ from . import common as _common
 TOOL = 'gcc'

 # https://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html
-LINE_MARKER_RE = re.compile(r'^# (\d+) "([^"]+)"(?: [1234])*$')
+# flags:
+#  1  start of a new file
+#  2  returning to a file (after including another)
+#  3  following text comes from a system header file
+#  4  following text should be treated as wrapped in an implicit extern "C" block
+LINE_MARKER_RE = re.compile(r'^# (\d+) "([^"]+)"((?: [1234])*)$')
 PREPROC_DIRECTIVE_RE = re.compile(r'^\s*#\s*(\w+)\b.*')
 COMPILER_DIRECTIVE_RE = re.compile(r'''
    ^
@ -40,32 +45,112 @@ POST_ARGS = (
 )


-def preprocess(filename, incldirs=None, macros=None, samefiles=None):
+def preprocess(filename,
+               incldirs=None,
+               includes=None,
+               macros=None,
+               samefiles=None,
+               cwd=None,
+               ):
+    if not cwd or not os.path.isabs(cwd):
+        cwd = os.path.abspath(cwd or '.')
+    filename = _normpath(filename, cwd)
    text = _common.preprocess(
        TOOL,
        filename,
        incldirs=incldirs,
+        includes=includes,
        macros=macros,
        #preargs=PRE_ARGS,
        postargs=POST_ARGS,
        executable=['gcc'],
        compiler='unix',
+        cwd=cwd,
    )
-    return _iter_lines(text, filename, samefiles)
+    return _iter_lines(text, filename, samefiles, cwd)


-def _iter_lines(text, filename, samefiles, *, raw=False):
+def _iter_lines(text, reqfile, samefiles, cwd, raw=False):
    lines = iter(text.splitlines())

-    # Build the lines and filter out directives.
-    partial = 0  # depth
-    origfile = None
+    # The first line is special.
+    # The next two lines are consistent.
+    for expected in [
+        f'# 1 "{reqfile}"',
+        '# 1 "<built-in>"',
+        '# 1 "<command-line>"',
+    ]:
+        line = next(lines)
+        if line != expected:
+            raise NotImplementedError((line, expected))
+
+    # Do all the CLI-provided includes.
+    filter_reqfile = (lambda f: _filter_reqfile(f, reqfile, samefiles))
+    make_info = (lambda lno: _common.FileInfo(reqfile, lno))
+    last = None
    for line in lines:
-        m = LINE_MARKER_RE.match(line)
-        if m:
-            lno, origfile = m.groups()
-            lno = int(lno)
-        elif _filter_orig_file(origfile, filename, samefiles):
+        assert last != reqfile, (last,)
+        lno, included, flags = _parse_marker_line(line, reqfile)
+        if not included:
+            raise NotImplementedError((line,))
+        if included == reqfile:
+            # This will be the last one.
+            assert not flags, (line, flags)
+        else:
+            assert 1 in flags, (line, flags)
+        yield from _iter_top_include_lines(
+            lines,
+            _normpath(included, cwd),
+            cwd,
+            filter_reqfile,
+            make_info,
+            raw,
+        )
+        last = included
+    # The last one is always the requested file.
+    assert included == reqfile, (line,)
+
+
+def _iter_top_include_lines(lines, topfile, cwd,
+                            filter_reqfile, make_info,
+                            raw):
+    partial = 0  # depth
+    files = [topfile]
+    # We start at 1 in case there are source lines (including blank ones)
+    # before the first marker line.  Also, we already verified in
+    # _parse_marker_line() that the preprocessor reported lno as 1.
+    lno = 1
+    for line in lines:
+        if line == '# 1 "<command-line>" 2':
+            # We're done with this top-level include.
+            return
+
+        _lno, included, flags = _parse_marker_line(line)
+        if included:
+            lno = _lno
+            included = _normpath(included, cwd)
+            # We hit a marker line.
+            if 1 in flags:
+                # We're entering a file.
+                # XXX Cycles are unexpected?
+                #assert included not in files, (line, files)
+                files.append(included)
+            elif 2 in flags:
+                # We're returning to a file.
+                assert files and included in files, (line, files)
+                assert included != files[-1], (line, files)
+                while files[-1] != included:
+                    files.pop()
+                # XXX How can a file return to line 1?
+                #assert lno > 1, (line, lno)
+            else:
+                # It's the next line from the file.
+                assert included == files[-1], (line, files)
+                assert lno > 1, (line, lno)
+        elif not files:
+            raise NotImplementedError((line,))
+        elif filter_reqfile(files[-1]):
+            assert lno is not None, (line, files[-1])
            if (m := PREPROC_DIRECTIVE_RE.match(line)):
                name, = m.groups()
                if name != 'pragma':
@ -74,7 +159,7 @@ def _iter_lines(text, filename, samefiles, *, raw=False):
                if not raw:
                    line, partial = _strip_directives(line, partial=partial)
                yield _common.SourceLine(
-                    _common.FileInfo(filename, lno),
+                    make_info(lno),
                    'source',
                    line or '',
                    None,
@ -82,6 +167,34 @@ def _iter_lines(text, filename, samefiles, *, raw=False):
            lno += 1


+def _parse_marker_line(line, reqfile=None):
+    m = LINE_MARKER_RE.match(line)
+    if not m:
+        return None, None, None
+    lno, origfile, flags = m.groups()
+    lno = int(lno)
+    assert lno > 0, (line, lno)
+    assert origfile not in ('<built-in>', '<command-line>'), (line,)
+    flags = set(int(f) for f in flags.split()) if flags else ()
+
+    if 1 in flags:
+        # We're entering a file.
+        assert lno == 1, (line, lno)
+        assert 2 not in flags, (line,)
+    elif 2 in flags:
+        # We're returning to a file.
+        #assert lno > 1, (line, lno)
+        pass
+    elif reqfile and origfile == reqfile:
+        # We're starting the requested file.
+        assert lno == 1, (line, lno)
+        assert not flags, (line, flags)
+    else:
+        # It's the next line from the file.
+        assert lno > 1, (line, lno)
+    return lno, origfile, flags
+
+
 def _strip_directives(line, partial=0):
    # We assume there are no string literals with parens in directive bodies.
    while partial > 0:
@ -106,18 +219,16 @@ def _strip_directives(line, partial=0):
    return line, partial


-def _filter_orig_file(origfile, current, samefiles):
-    if origfile == current:
+def _filter_reqfile(current, reqfile, samefiles):
+    if current == reqfile:
        return True
-    if origfile == '<stdin>':
+    if current == '<stdin>':
+        return True
+    if current in samefiles:
        return True
-    if os.path.isabs(origfile):
-        return False
-
-    for filename in samefiles or ():
-        if filename.endswith(os.path.sep):
-            filename += os.path.basename(current)
-        if origfile == filename:
-            return True
-
    return False
+
+
+def _normpath(filename, cwd):
+    assert cwd
+    return os.path.normpath(os.path.join(cwd, filename))
--- a/Tools/c-analyzer/c_parser/preprocessor/pure.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/pure.py
@ -4,7 +4,7 @@ from ..source import (
 from . import common as _common


-def preprocess(lines, filename=None):
+def preprocess(lines, filename=None, cwd=None):
    if isinstance(lines, str):
        with _open_source(lines, filename) as (lines, filename):
            yield from preprocess(lines, filename)