bpo-36876: Fix the C analyzer tool. (GH-22841)

The original tool wasn't working right and it was simpler to create a new one, partially re-using some of the old code. At this point the tool runs properly on the master. (Try: ./python Tools/c-analyzer/c-analyzer.py analyze.)  It takes ~40 seconds on my machine to analyze the full CPython code base.

Note that we'll need to iron out some OS-specific stuff (e.g. preprocessor). We're okay though since this tool isn't used yet in our workflow. We will also need to verify the analysis results in detail before activating the check in CI, though I'm pretty sure it's close.

https://bugs.python.org/issue36876
This commit is contained in:
Eric Snow 2020-10-22 18:42:51 -06:00 committed by GitHub
parent ec388cfb4e
commit 345cd37abe
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
92 changed files with 8868 additions and 10539 deletions

View file

@ -0,0 +1,190 @@
import contextlib
import distutils.ccompiler
import logging
import os.path
from c_common.fsutil import match_glob as _match_glob
from c_common.tables import parse_table as _parse_table
from ..source import (
resolve as _resolve_source,
good_file as _good_file,
)
from . import errors as _errors
from . import (
pure as _pure,
gcc as _gcc,
)
logger = logging.getLogger(__name__)
# Supported "source":
# * filename (string)
# * lines (iterable)
# * text (string)
# Supported return values:
# * iterator of SourceLine
# * sequence of SourceLine
# * text (string)
# * something that combines all those
# XXX Add the missing support from above.
# XXX Add more low-level functions to handle permutations?
def preprocess(source, *,
               incldirs=None,
               macros=None,
               samefiles=None,
               filename=None,
               tool=True,
               ):
    """Return the preprocessed lines for the given source.

    CWD should be the project root and "source" should be relative.

    With a truthy "tool" (True selects the platform default) the matching
    external preprocessor is run; otherwise the pure-Python fallback is
    used, which ignores "incldirs", "macros", and "samefiles".
    """
    if tool:
        logger.debug(f'CWD: {os.getcwd()!r}')
        logger.debug(f'incldirs: {incldirs!r}')
        logger.debug(f'macros: {macros!r}')
        logger.debug(f'samefiles: {samefiles!r}')
        _preprocess = _get_preprocessor(tool)
        # _good_file() vets the file before we hand it to the tool.
        with _good_file(source, filename) as source:
            # Normalize a falsy result to an empty tuple.
            return _preprocess(source, incldirs, macros, samefiles) or ()
    else:
        source, filename = _resolve_source(source, filename)
        # We ignore "includes", "macros", etc.
        return _pure.preprocess(source, filename)
def get_preprocessor(*,
                     file_macros=None,
                     file_incldirs=None,
                     file_same=None,
                     ignore_exc=False,
                     log_err=None,
                     ):
    """Return a factory that maps a filename to its preprocess function.

    "file_macros" and "file_incldirs" are per-file-glob tables; "file_same"
    lists equivalent files.  "ignore_exc" may be a bool or a callable taking
    the exception; ignored failures are reported via "log_err".
    """
    _preprocess = preprocess
    if file_macros:
        file_macros = tuple(_parse_macros(file_macros))
    if file_incldirs:
        file_incldirs = tuple(_parse_incldirs(file_incldirs))
    if file_same:
        file_same = tuple(file_same)
    if not callable(ignore_exc):
        # Wrap the bool so callers can always just call it.
        ignore_exc = (lambda exc, _ig=ignore_exc: _ig)

    def get_file_preprocessor(filename):
        filename = filename.strip()
        if file_macros:
            macros = list(_resolve_file_values(filename, file_macros))
        if file_incldirs:
            incldirs = [v for v, in _resolve_file_values(filename, file_incldirs)]

        def preprocess(**kwargs):
            if file_macros and 'macros' not in kwargs:
                kwargs['macros'] = macros
            if file_incldirs and 'incldirs' not in kwargs:
                # BUG FIX: re-use the values resolved above instead of
                # re-resolving them on every call (the precomputed
                # "incldirs" was never used).
                kwargs['incldirs'] = incldirs
            # BUG FIX: the guard checked the key 'file_same' but the kwarg
            # actually set (and consumed downstream) is 'samefiles'.
            if file_same and 'samefiles' not in kwargs:
                kwargs['samefiles'] = file_same
            kwargs.setdefault('filename', filename)
            with handling_errors(ignore_exc, log_err=log_err):
                return _preprocess(filename, **kwargs)
        return preprocess
    return get_file_preprocessor
def _resolve_file_values(filename, file_values):
    """Yield the value tuple of every pattern row matching *filename*.

    The filename and all patterns are expected to be absolute paths.
    """
    if not file_values:
        return
    for pattern, *value in file_values:
        if not _match_glob(filename, pattern):
            continue
        yield value
def _parse_macros(macros):
    """Yield (glob, name, value) rows parsed from the macros table."""
    rows = _parse_table(macros, '\t', 'glob\tname\tvalue', rawsep='=', default=None)
    for row, _srcfile in rows:
        yield row
def _parse_incldirs(incldirs):
    """Yield (glob, dirname) rows parsed from the include-dirs table."""
    for row, _srcfile in _parse_table(incldirs, '\t', 'glob\tdirname', default=None):
        glob, dirname = row
        if dirname is None:
            # A bare entry is a directory that applies to every file.
            yield ('*', glob.strip())
        else:
            yield row
@contextlib.contextmanager
def handling_errors(ignore_exc=None, *, log_err=None):
    """Convert ignorable preprocessor failures into a silent no-op.

    "ignore_exc" is called with the exception; a falsy result re-raises.
    Ignored failures are reported through "log_err" when provided.
    """
    try:
        yield
    except _errors.OSMismatchError as exc:
        if not ignore_exc(exc):
            raise  # re-raise
        if log_err is not None:
            log_err(f'<OS mismatch (expected {" or ".join(exc.expected)})>')
        return None
    except _errors.MissingDependenciesError as exc:
        if not ignore_exc(exc):
            raise  # re-raise
        if log_err is not None:
            # BUG FIX: the message was missing its closing ">"
            # (cf. the OS-mismatch message above).
            log_err(f'<missing dependency {exc.missing}>')
        return None
    except _errors.ErrorDirectiveError as exc:
        if not ignore_exc(exc):
            raise  # re-raise
        if log_err is not None:
            log_err(exc)
        return None
##################################
# tools

# Map a tool/compiler name to its preprocess function (None = unsupported).
_COMPILERS = {
    # matching distutils.ccompiler.compiler_class:
    'unix': _gcc.preprocess,
    'msvc': None,
    'cygwin': None,
    'mingw32': None,
    'bcpp': None,
    # aliases/extras:
    'gcc': _gcc.preprocess,
    'clang': None,
}
def _get_preprocessor(tool):
    """Return the preprocess function registered for *tool*.

    A tool of True means the platform's default compiler.
    Raises ValueError for unknown or unsupported tools.
    """
    if tool is True:
        tool = distutils.ccompiler.get_default_compiler()
    try:
        found = _COMPILERS[tool]
    except KeyError:
        found = None
    if found is None:
        raise ValueError(f'unsupported tool {tool}')
    return found
##################################
# aliases
from .errors import (
PreprocessorError,
PreprocessorFailure,
ErrorDirectiveError,
MissingDependenciesError,
OSMismatchError,
)
from .common import FileInfo, SourceLine

View file

@ -0,0 +1,196 @@
import logging
import sys
from c_common.scriptutil import (
CLIArgSpec as Arg,
add_verbosity_cli,
add_traceback_cli,
add_kind_filtering_cli,
add_files_cli,
add_failure_filtering_cli,
add_commands_cli,
process_args_by_key,
configure_logger,
get_prog,
main_for_filenames,
)
from . import (
errors as _errors,
get_preprocessor as _get_preprocessor,
)
# Failure categories selectable from the CLI, mapped to the exception
# type each one corresponds to.
FAIL = {
    'err': _errors.ErrorDirectiveError,
    'deps': _errors.MissingDependenciesError,
    'os': _errors.OSMismatchError,
}
# By default every category is a hard failure except OS mismatches.
FAIL_DEFAULT = tuple(v for v in FAIL if v != 'os')

logger = logging.getLogger(__name__)
##################################
# CLI helpers
def add_common_cli(parser, *, get_preprocessor=_get_preprocessor):
    """Add the preprocessor options shared by the commands.

    Returns the args-processing hook, which turns those options into a
    per-file preprocessor factory stored as "get_file_preprocessor".
    """
    parser.add_argument('--macros', action='append')
    parser.add_argument('--incldirs', action='append')
    parser.add_argument('--same', action='append')
    process_fail_arg = add_failure_filtering_cli(parser, FAIL)

    def process_args(args):
        namespace = vars(args)
        process_fail_arg(args)
        ignore_exc = namespace.pop('ignore_exc')
        # We later pass ignore_exc to _get_preprocessor().
        factory = get_preprocessor(
            file_macros=namespace.pop('macros'),
            file_incldirs=namespace.pop('incldirs'),
            file_same=namespace.pop('same'),
            ignore_exc=ignore_exc,
            log_err=print,
        )
        args.get_file_preprocessor = factory
    return process_args
def _iter_preprocessed(filename, *,
get_preprocessor,
match_kind=None,
pure=False,
):
preprocess = get_preprocessor(filename)
for line in preprocess(tool=not pure) or ():
if match_kind is not None and not match_kind(line.kind):
continue
yield line
#######################################
# the commands
def _cli_preprocess(parser, excluded=None, **prepr_kwargs):
    """Add the "preprocess" command's CLI args; return its arg processors."""
    parser.add_argument('--pure', action='store_true')
    parser.add_argument('--no-pure', dest='pure', action='store_const', const=False)
    process_kinds = add_kind_filtering_cli(parser)
    process_common = add_common_cli(parser, **prepr_kwargs)
    parser.add_argument('--raw', action='store_true')
    process_files = add_files_cli(parser, excluded=excluded)
    return [process_kinds, process_common, process_files]
def cmd_preprocess(filenames, *,
                   raw=False,
                   iter_filenames=None,
                   **kwargs
                   ):
    """Preprocess each file and print the resulting lines.

    With "raw", lines are printed as-is; otherwise each is shown with
    its line number, kind, and (possibly truncated) text.
    """
    if 'get_file_preprocessor' not in kwargs:
        kwargs['get_file_preprocessor'] = _get_preprocessor()
    if raw:
        def show_file(filename, lines):
            for line in lines:
                print(line)
    else:
        def show_file(filename, lines):
            for line in lines:
                # NOTE(review): "linefile" is computed but never printed —
                # presumably it was meant to be appended to the output line.
                linefile = ''
                if line.filename != filename:
                    linefile = f' ({line.filename})'
                text = line.data
                if line.kind == 'comment':
                    text = '/* ' + line.data.splitlines()[0]
                    # BUG FIX: the condition was inverted — a multi-line
                    # comment gets the truncation marker, a single-line
                    # comment gets the plain close.
                    text += r'\n... */' if '\n' in line.data else ' */'
                print(f' {line.lno:>4} {line.kind:10} | {text}')

    filenames = main_for_filenames(filenames, iter_filenames)
    for filename in filenames:
        lines = _iter_preprocessed(filename, **kwargs)
        show_file(filename, lines)
def _cli_data(parser):
...
return None
def cmd_data(filenames,
             **kwargs
             ):
    """Check/manage local data (e.g. excludes, macros) — not implemented."""
    # XXX
    raise NotImplementedError
# Map each command name to (description, cli-arg processors, implementation).
COMMANDS = {
    'preprocess': (
        'preprocess the given C source & header files',
        [_cli_preprocess],
        cmd_preprocess,
    ),
    'data': (
        'check/manage local data (e.g. excludes, macros)',
        [_cli_data],
        cmd_data,
    ),
}
#######################################
# the script
def parse_args(argv=None, prog=None, *,
               subset='preprocess',
               excluded=None,
               **prepr_kwargs
               ):
    """Parse the CLI and return (cmd, cmd_kwargs, verbosity, traceback_cm).

    "argv"/"prog" default to the current sys.argv.  (BUG FIX: previously
    they were captured once at import time via default-argument
    evaluation, so later changes to sys.argv were ignored.)
    """
    import argparse
    if argv is None:
        argv = sys.argv[1:]
    if prog is None:
        prog = sys.argv[0]
    parser = argparse.ArgumentParser(
        prog=prog or get_prog(),
    )
    # NOTE(review): "excluded" and **prepr_kwargs are accepted but unused
    # here — confirm whether they should be forwarded to the CLI setup.
    processors = add_commands_cli(
        parser,
        commands={k: v[1] for k, v in COMMANDS.items()},
        commonspecs=[
            add_verbosity_cli,
            add_traceback_cli,
        ],
        subset=subset,
    )
    args = parser.parse_args(argv)
    ns = vars(args)
    cmd = ns.pop('cmd')
    verbosity, traceback_cm = process_args_by_key(
        args,
        processors[cmd],
        ['verbosity', 'traceback_cm'],
    )
    return cmd, ns, verbosity, traceback_cm
def main(cmd, cmd_kwargs):
    """Look up and run the implementation for *cmd*.

    Raises ValueError for an unknown command.
    """
    try:
        # BUG FIX: the implementation is the *last* element of the command
        # tuple (description, cli-processors, implementation); index 0 is
        # the description string and is not callable.
        run_cmd = COMMANDS[cmd][-1]
    except KeyError:
        raise ValueError(f'unsupported cmd {cmd!r}')
    run_cmd(**cmd_kwargs)
if __name__ == '__main__':
    # Script entry point: parse args, configure logging, then dispatch,
    # with tracebacks handled by the context manager from the CLI opts.
    cmd, cmd_kwargs, verbosity, traceback_cm = parse_args()
    configure_logger(verbosity)
    with traceback_cm:
        main(cmd, cmd_kwargs)

View file

@ -0,0 +1,173 @@
import contextlib
import distutils.ccompiler
import logging
import shlex
import subprocess
import sys
from ..info import FileInfo, SourceLine
from .errors import (
PreprocessorFailure,
ErrorDirectiveError,
MissingDependenciesError,
OSMismatchError,
)
logger = logging.getLogger(__name__)
# XXX Add aggregate "source" class(es)?
# * expose all lines as single text string
# * expose all lines as sequence
# * iterate all lines
def run_cmd(argv, *,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
            **kwargs
            ):
    """Run the command and return its captured stdout.

    A "stderr" of the string 'stdout' merges stderr into stdout.  Extra
    keyword arguments are passed through to subprocess.run(), which
    raises CalledProcessError on a non-zero exit (check=True).
    """
    if isinstance(stderr, str) and stderr.lower() == 'stdout':
        stderr = subprocess.STDOUT
    # Pass the explicit options through directly instead of scraping
    # locals(), which silently breaks if a new local is ever introduced.
    kwargs.update(
        stdout=stdout,
        stderr=stderr,
        text=text,
        check=check,
    )
    proc = subprocess.run(argv, **kwargs)
    return proc.stdout
def preprocess(tool, filename, **kwargs):
    """Run the given tool's preprocessor on the file; return its output."""
    argv = _build_argv(tool, filename, **kwargs)
    logger.debug(' '.join(shlex.quote(v) for v in argv))

    # Make sure the OS is supported for this file.
    if (_expected := is_os_mismatch(filename)):
        error = None
        # BUG FIX: this raised with an undefined global "TOOL"; the tool
        # name is the local parameter.
        raise OSMismatchError(filename, _expected, argv, error, tool)

    # Run the command.
    with converted_error(tool, argv, filename):
        # We use subprocess directly here, instead of calling the
        # distutil compiler object's preprocess() method, since that
        # one writes to stdout/stderr and it's simpler to do it directly
        # through subprocess.
        return run_cmd(argv)
def _build_argv(
    tool,
    filename,
    incldirs=None,
    macros=None,
    preargs=None,
    postargs=None,
    executable=None,
    compiler=None,
):
    """Return the preprocessor command line distutils would run.

    The compiler's spawn() hook is hijacked so the command is captured
    instead of actually executed.
    """
    compiler = distutils.ccompiler.new_compiler(
        compiler=compiler or tool,
    )
    if executable:
        compiler.set_executable('preprocessor', executable)

    captured = None

    def _capture(_argv):
        # Record the argv instead of spawning the process.
        nonlocal captured
        captured = _argv
    compiler.spawn = _capture

    compiler.preprocess(
        filename,
        macros=[tuple(v) for v in macros or ()],
        include_dirs=incldirs or (),
        extra_preargs=preargs or (),
        extra_postargs=postargs or (),
    )
    return captured
@contextlib.contextmanager
def converted_error(tool, argv, filename):
    """Translate a CalledProcessError into the matching preprocessor error."""
    try:
        yield
    except subprocess.CalledProcessError as exc:
        convert_error(tool, argv, filename, exc.stderr, exc.returncode)
def convert_error(tool, argv, filename, stderr, rc):
    """Re-raise a failed preprocessor run as a specific error type.

    Inspects the captured stderr to classify the failure as an OS
    mismatch, a missing dependency, a #error directive, or a generic
    preprocessor failure.
    """
    # Default error detail: the first stderr line plus the exit code.
    error = (stderr.splitlines()[0], rc)
    if (_expected := is_os_mismatch(filename, stderr)):
        logger.debug(stderr.strip())
        raise OSMismatchError(filename, _expected, argv, error, tool)
    elif (_missing := is_missing_dep(stderr)):
        logger.debug(stderr.strip())
        raise MissingDependenciesError(filename, (_missing,), argv, error, tool)
    elif '#error' in stderr:
        # XXX Ignore incompatible files.
        # The second stderr line carries the #error directive's message.
        error = (stderr.splitlines()[1], rc)
        logger.debug(stderr.strip())
        raise ErrorDirectiveError(filename, argv, error, tool)
    else:
        # Try one more time, with stderr written to the terminal.
        try:
            output = run_cmd(argv, stderr=None)
        except subprocess.CalledProcessError:
            raise PreprocessorFailure(filename, argv, error, tool)
        # NOTE(review): if the retry unexpectedly succeeds, this returns
        # None and the original failure is silently dropped ("output" is
        # unused) — confirm that is intended.
def is_os_mismatch(filename, errtext=None):
    """Return the expected platforms if the file doesn't match the host OS.

    Returns False when no mismatch can be established (including when no
    error text is available).
    See: https://docs.python.org/3/library/sys.html#sys.platform
    """
    actual = sys.platform
    if actual == 'unknown':
        raise NotImplementedError

    if errtext is None:
        return False
    missing = is_missing_dep(errtext)
    if missing:
        matching = get_matching_oses(missing, filename)
        if actual not in matching:
            return matching
    return False
def get_matching_oses(missing, filename):
    """Return the sys.platform values implied by a missing dependency.

    Both the filename and the missing header are consulted; an empty
    tuple means the dependency is unrecognized.
    """
    # OSX
    if 'darwin' in filename or 'osx' in filename:
        return ('darwin',)
    if missing == 'SystemConfiguration/SystemConfiguration.h':
        return ('darwin',)
    # Windows
    if missing in ('windows.h', 'winsock2.h'):
        return ('win32',)
    # other
    if missing == 'sys/ldr.h':
        return ('aix',)
    if missing == 'dl.h':
        # XXX The existence of Python/dynload_dl.c implies others...
        # Note that hpux isn't actually supported any more.
        return ('hpux', '???')
    # unrecognized
    return ()
def is_missing_dep(errtext):
    """Return the missing dependency named in errtext, else False."""
    marker = ': No such file or directory'
    if marker not in errtext:
        return False
    # The token right before the marker names the missing file.
    return errtext.split(marker)[0].split()[-1]

View file

@ -0,0 +1,110 @@
import sys
# The host platform identifier, captured once for use in error messages.
OS = sys.platform
def _as_tuple(items):
if isinstance(items, str):
return tuple(items.strip().replace(',', ' ').split())
elif items:
return tuple(items)
else:
return ()
class PreprocessorError(Exception):
    """Something preprocessor-related went wrong."""

    @classmethod
    def _msg(cls, filename, reason, **ignored):
        # Build the core message; subclasses override with more detail.
        msg = 'failure while preprocessing'
        if reason:
            msg = f'{msg} ({reason})'
        return msg

    def __init__(self, filename, preprocessor=None, reason=None):
        if isinstance(reason, str):
            reason = reason.strip()
        self.filename = filename
        self.preprocessor = preprocessor or None
        self.reason = str(reason) if reason else None
        msg = self._msg(**vars(self))
        # BUG FIX: include the actual filename in the message instead of
        # the hard-coded "((unknown))" placeholder.
        msg = f'({filename}) {msg}'
        if preprocessor:
            msg = f'[{preprocessor}] {msg}'
        super().__init__(msg)
class PreprocessorFailure(PreprocessorError):
    """The preprocessor command failed."""

    @classmethod
    def _msg(cls, error, **ignored):
        msg = 'preprocessor command failed'
        return f'{msg} {error}' if error else msg

    def __init__(self, filename, argv, error=None, preprocessor=None):
        # "error" may be (text, exitcode), plain text, or any object.
        exitcode = -1
        if isinstance(error, tuple):
            if len(error) == 2:
                error, exitcode = error
            else:
                error = str(error)
        if isinstance(error, str):
            error = error.strip()
        self.argv = _as_tuple(argv) or None
        self.error = error if error else None
        self.exitcode = exitcode
        super().__init__(filename, preprocessor, str(self.error))
class ErrorDirectiveError(PreprocessorFailure):
    """The file hit a #error directive."""

    @classmethod
    def _msg(cls, error, **ignored):
        return f'#error directive hit ({error})'

    def __init__(self, filename, argv, error, *args, **kwargs):
        # Same as PreprocessorFailure, except "error" is required here.
        super().__init__(filename, argv, error, *args, **kwargs)
class MissingDependenciesError(PreprocessorFailure):
    """The preprocessor did not have access to all the target's dependencies."""

    @classmethod
    def _msg(cls, missing, **ignored):
        msg = 'preprocessing failed due to missing dependencies'
        if missing:
            msg = f'{msg} ({", ".join(missing)})'
        return msg

    def __init__(self, filename, missing=None, *args, **kwargs):
        # Normalize to a tuple of names (None when nothing is known).
        self.missing = _as_tuple(missing) or None
        super().__init__(filename, *args, **kwargs)
class OSMismatchError(MissingDependenciesError):
    """The target is not compatible with the host OS."""

    @classmethod
    def _msg(cls, expected, **ignored):
        return f'OS is {OS} but expected {expected or "???"}'

    def __init__(self, filename, expected=None, *args, **kwargs):
        if isinstance(expected, str):
            expected = expected.strip()
        self.actual = OS
        self.expected = expected if expected else None
        # No specific dependency is known here, hence the explicit None
        # for the base class's "missing".
        super().__init__(filename, None, *args, **kwargs)

View file

@ -0,0 +1,123 @@
import os.path
import re
from . import common as _common
TOOL = 'gcc'
# https://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html
LINE_MARKER_RE = re.compile(r'^# (\d+) "([^"]+)"(?: [1234])*$')
PREPROC_DIRECTIVE_RE = re.compile(r'^\s*#\s*(\w+)\b.*')
COMPILER_DIRECTIVE_RE = re.compile(r'''
^
(.*?) # <before>
(__\w+__) # <directive>
\s*
[(] [(]
(
[^()]*
(?:
[(]
[^()]*
[)]
[^()]*
)*
) # <args>
( [)] [)] )? # <closed>
''', re.VERBOSE)
POST_ARGS = (
'-pthread',
'-std=c99',
#'-g',
#'-Og',
#'-Wno-unused-result',
#'-Wsign-compare',
#'-Wall',
#'-Wextra',
'-E',
)
def preprocess(filename, incldirs=None, macros=None, samefiles=None):
    """Run gcc -E on the file and return an iterator of SourceLines."""
    kwargs = dict(
        incldirs=incldirs,
        macros=macros,
        #preargs=PRE_ARGS,
        postargs=POST_ARGS,
        executable=['gcc'],
        compiler='unix',
    )
    text = _common.preprocess(TOOL, filename, **kwargs)
    return _iter_lines(text, filename, samefiles)
def _iter_lines(text, filename, samefiles, *, raw=False):
    """Yield a SourceLine for each kept line of gcc -E output.

    Linemarkers ("# <lno> \"<file>\" ...") update the current origin file
    and line number; only lines attributed to the target file (or one of
    its "samefiles" equivalents) are kept.  Unless "raw", compiler
    directives are stripped from each kept line.
    """
    lines = iter(text.splitlines())

    # Build the lines and filter out directives.
    partial = 0  # depth of an unclosed "((...))" directive body
    origfile = None
    for line in lines:
        m = LINE_MARKER_RE.match(line)
        if m:
            # A linemarker: switch to the named file/line.
            # NOTE(review): "lno" is only bound here, so a kept line
            # before the first linemarker would raise NameError —
            # presumably gcc output always starts with one; confirm.
            lno, origfile = m.groups()
            lno = int(lno)
        elif _filter_orig_file(origfile, filename, samefiles):
            if (m := PREPROC_DIRECTIVE_RE.match(line)):
                # Only #pragma is expected to survive preprocessing.
                name, = m.groups()
                if name != 'pragma':
                    raise Exception(line)
            else:
                if not raw:
                    line, partial = _strip_directives(line, partial=partial)
                yield _common.SourceLine(
                    _common.FileInfo(filename, lno),
                    'source',
                    line or '',
                    None,
                )
            lno += 1
def _strip_directives(line, partial=0):
    """Remove compiler "__attr__((...))" directives from the line.

    "partial" is the paren depth of a directive body left unclosed on a
    previous line; returns (stripped_line_or_None, remaining_depth).
    """
    # We assume there are no string literals with parens in directive bodies.
    while partial > 0:
        # Consume text until the next paren to track the open depth.
        # NOTE(review): the class is [^{}]* (not [^()]*), so backtracking
        # makes this match the *last* paren on the line — confirm whether
        # that is intended; [^()]* would find the first one.
        if not (m := re.match(r'[^{}]*([()])', line)):
            return None, partial
        delim, = m.groups()
        partial += 1 if delim == '(' else -1  # opened/closed
        line = line[m.end():]

    line = re.sub(r'__extension__', '', line)
    while (m := COMPILER_DIRECTIVE_RE.match(line)):
        before, _, _, closed = m.groups()
        if closed:
            # Fully closed directive: drop it and keep scanning.
            line = f'{before} {line[m.end():]}'
        else:
            # Unterminated directive: strip the rest recursively
            # (depth 2 for the already-open double paren).
            after, partial = _strip_directives(line[m.end():], 2)
            line = f'{before} {after or ""}'
            if partial:
                break
    return line, partial
def _filter_orig_file(origfile, current, samefiles):
if origfile == current:
return True
if origfile == '<stdin>':
return True
if os.path.isabs(origfile):
return False
for filename in samefiles or ():
if filename.endswith(os.path.sep):
filename += os.path.basename(current)
if origfile == filename:
return True
return False

View file

@ -0,0 +1,23 @@
from ..source import (
opened as _open_source,
)
from . import common as _common
def preprocess(lines, filename=None):
    """Pure-Python "preprocessing": yield each line as a plain SourceLine.

    "lines" may be an iterable of lines or a string naming a source to
    open, in which case the opened lines are processed recursively.
    """
    if isinstance(lines, str):
        with _open_source(lines, filename) as (lines, filename):
            yield from preprocess(lines, filename)
        return

    # XXX actually preprocess...
    for lno, line in enumerate(lines, 1):
        yield _common.SourceLine(
            _common.FileInfo(filename, lno),
            'source',
            line,
            None,
        )