bpo-36876: Fix the C analyzer tool. (GH-22841)

The original tool wasn't working right and it was simpler to create a new one, partially re-using some of the old code. At this point the tool runs properly on the master. (Try: ./python Tools/c-analyzer/c-analyzer.py analyze.) It takes ~40 seconds on my machine to analyze the full CPython code base. Note that we'll need to iron out some OS-specific stuff (e.g. preprocessor). We're okay though since this tool isn't used yet in our workflow. We will also need to verify the analysis results in detail before activating the check in CI, though I'm pretty sure it's close. https://bugs.python.org/issue36876
2025-08-04 00:48:58 +00:00 · 2020-10-22 18:42:51 -06:00 · 2020-10-22 18:42:51 -06:00 · 345cd37abe
commit 345cd37abe
parent ec388cfb4e
92 changed files with 8868 additions and 10539 deletions
--- a/Tools/c-analyzer/c_parser/init.py
+++ b/Tools/c-analyzer/c_parser/init.py
@ -0,0 +1,46 @@
+from .parser import parse as _parse
+from .preprocessor import get_preprocessor as _get_preprocessor
+
+
+def parse_file(filename, *,
+               match_kind=None,
+               get_file_preprocessor=None,
+               ):
+    """Yield each item parsed out of the given C source file.
+
+    "match_kind", if provided, is called with each item's kind; only
+    the items for which it returns true are yielded.  If no
+    "get_file_preprocessor" is provided then a default one is created.
+    """
+    preprocessor_for = (_get_preprocessor()
+                        if get_file_preprocessor is None
+                        else get_file_preprocessor)
+    yield from _parse_file(filename, match_kind, preprocessor_for)
+
+
+def parse_files(filenames, *,
+                match_kind=None,
+                get_file_preprocessor=None,
+                ):
+    """Yield each item parsed out of the given C source files, in order.
+
+    A single preprocessor factory is shared by all the files; a default
+    one is created if "get_file_preprocessor" is not provided.
+    """
+    preprocessor_for = (_get_preprocessor()
+                        if get_file_preprocessor is None
+                        else get_file_preprocessor)
+    for each in filenames:
+        yield from _parse_file(each, match_kind, preprocessor_for)
+
+
+def _parse_file(filename, match_kind, get_file_preprocessor):
+    """Preprocess one file, parse the result, and yield the wanted items.
+
+    Nothing is yielded if the preprocessor returns None for the file.
+    A matching item without a filename is treated as unsupported.
+    """
+    # Preprocess the file.
+    run_preprocessor = get_file_preprocessor(filename)
+    lines = run_preprocessor()
+    if lines is None:
+        return
+
+    # Parse the lines (only the "source" ones matter here).
+    srclines = ((line.file, line.data)
+                for line in lines
+                if line.kind == 'source')
+    for parsed in _parse(srclines):
+        if match_kind is None or match_kind(parsed.kind):
+            if not parsed.filename:
+                raise NotImplementedError(repr(parsed))
+            yield parsed
+
+
+def parse_signature(text):
+    """Placeholder for parsing a signature from text (not implemented)."""
+    raise NotImplementedError
+
+
+# aliases
+from .info import resolve_parsed
--- a/Tools/c-analyzer/c_parser/main.py
+++ b/Tools/c-analyzer/c_parser/main.py
@ -0,0 +1,261 @@
+import logging
+import os.path
+import sys
+
+from c_common.scriptutil import (
+    CLIArgSpec as Arg,
+    add_verbosity_cli,
+    add_traceback_cli,
+    add_kind_filtering_cli,
+    add_files_cli,
+    add_commands_cli,
+    process_args_by_key,
+    configure_logger,
+    get_prog,
+    main_for_filenames,
+)
+from .preprocessor import get_preprocessor
+from .preprocessor.__main__ import (
+    add_common_cli as add_preprocessor_cli,
+)
+from .info import KIND
+from . import parse_file as _iter_parsed
+
+
+logger = logging.getLogger(__name__)
+
+
+def _format_vartype(vartype):
+    """Render a variable type as a single string.
+
+    A str is returned unchanged.  Otherwise "vartype" is treated as a
+    mapping: either the vartype mapping itself or one that holds it
+    under the "vartype" key.  The result has the form
+    "[storage ][typequal ]typespec abstract".
+    """
+    if isinstance(vartype, str):
+        return vartype
+
+    data = vartype
+    try:
+        vartype = data['vartype']
+    except KeyError:
+        # "data" is the vartype mapping itself.  The unpacking relies on
+        # it having exactly four values, in this order.
+        storage, typequal, typespec, abstract = vartype.values()
+    else:
+        # A truthy outer "storage" takes precedence over the inner one.
+        storage = data.get('storage')
+        if storage:
+            _, typequal, typespec, abstract = vartype.values()
+        else:
+            storage, typequal, typespec, abstract = vartype.values()
+
+    vartype = f'{typespec} {abstract}'
+    if typequal:
+        vartype = f'{typequal} {vartype}'
+    if storage:
+        vartype = f'{storage} {vartype}'
+    return vartype
+
+
+def _get_preprocessor(filename=None, **kwargs):
+    """Return the result of get_preprocessor(), with errors logged via print.
+
+    "filename" is optional (cmd_parse() calls this without arguments);
+    when given it is forwarded positionally, as before.  Any extra
+    keyword arguments are passed through unchanged.
+    """
+    # The imported helper is "get_preprocessor"; "get_processor" is not
+    # defined anywhere in this module.
+    args = () if filename is None else (filename,)
+    return get_preprocessor(*args,
+                            log_err=print,
+                            **kwargs
+                            )
+
+
+#######################################
+# the formats
+
+def fmt_raw(filename, item, *, showfwd=None):
+    """Yield one line: the str() of the item converted to a tuple.
+
+    "filename" and "showfwd" are accepted for parity with the other
+    formats but are not used.
+    """
+    as_tuple = tuple(item)
+    yield str(as_tuple)
+
+
+def fmt_summary(filename, item, *, showfwd=None):
+    """Yield the summary line(s) for a single parsed item.
+
+    A "> <filename>" line is emitted first when the item comes from
+    a file other than "./<filename>".
+
+    "showfwd" filters on forward declarations: None shows everything
+    (and adds a marker column), a false value hides forward
+    declarations, and a true value shows only forward declarations.
+
+    Raises NotImplementedError for an unrecognized kind.
+    """
+    if item.filename and item.filename != os.path.join('.', filename):
+        yield f'> {item.filename}'
+    # The "fwd" marker column is only part of the line when no filtering
+    # on forward declarations was requested.
+    if showfwd is None:
+        LINE = ' {lno:>5} {kind:10} {funcname:40} {fwd:1} {name:40} {data}'
+    else:
+        LINE = ' {lno:>5} {kind:10} {funcname:40} {name:40} {data}'
+    # Format with every field empty to measure the width of the fixed
+    # columns; it is used below to indent continuation lines.
+    lno = kind = funcname = fwd = name = data = ''
+    MIN_LINE = len(LINE.format(**locals()))
+
+    fileinfo, kind, funcname, name, data = item
+    lno = fileinfo.lno if fileinfo and fileinfo.lno >= 0 else ''
+    funcname = funcname or ' --'
+    name = name or ' --'
+    isforward = False
+    if kind is KIND.FUNCTION:
+        # This unpacking relies on the order of the values in "data".
+        storage, inline, params, returntype, isforward = data.values()
+        returntype = _format_vartype(returntype)
+        data = returntype + params
+        if inline:
+            data = f'inline {data}'
+        if storage:
+            data = f'{storage} {data}'
+    elif kind is KIND.VARIABLE:
+        data = _format_vartype(data)
+    elif kind is KIND.STRUCT or kind is KIND.UNION:
+        # No data means there is no body, i.e. a forward declaration.
+        if data is None:
+            isforward = True
+        else:
+            # Show the field names, five per line.
+            fields = data
+            data = f'({len(data)}) {{ '
+            indent = ',\n' + ' ' * (MIN_LINE + len(data))
+            data += ', '.join(f.name for f in fields[:5])
+            fields = fields[5:]
+            while fields:
+                data = f'{data}{indent}{", ".join(f.name for f in fields[:5])}'
+                fields = fields[5:]
+            data += ' }'
+    elif kind is KIND.ENUM:
+        if data is None:
+            isforward = True
+        else:
+            # Enumerators may be plain strings or objects with a "name".
+            names = [d if isinstance(d, str) else d.name
+                     for d in data]
+            data = f'({len(data)}) {{ '
+            indent = ',\n' + ' ' * (MIN_LINE + len(data))
+            data += ', '.join(names[:5])
+            names = names[5:]
+            while names:
+                data = f'{data}{indent}{", ".join(names[:5])}'
+                names = names[5:]
+            data += ' }'
+    elif kind is KIND.TYPEDEF:
+        data = f'typedef {data}'
+    elif kind == KIND.STATEMENT:
+        pass
+    else:
+        raise NotImplementedError(item)
+    if isforward:
+        fwd = '*'
+        # showfwd is false but not None: forward declarations are hidden.
+        if not showfwd and showfwd is not None:
+            return
+    elif showfwd:
+        # Only forward declarations were requested.
+        return
+    kind = kind.value
+    # The format fields are filled in from the local variables above.
+    yield LINE.format(**locals())
+
+
+def fmt_full(filename, item, *, showfwd=None):
+    """Placeholder for the "full" output format (not implemented)."""
+    raise NotImplementedError
+
+
+FORMATS = {
+    'raw': fmt_raw,
+    'summary': fmt_summary,
+    'full': fmt_full,
+}
+
+
+def add_output_cli(parser):
+    """Add the output-related CLI options to the given argparse parser.
+
+    Returns a "process_args(args)" callback, which currently does nothing.
+    """
+    parser.add_argument(
+        '--format',
+        dest='fmt',
+        default='summary',
+        choices=tuple(FORMATS),
+    )
+    # Both flags share the "showfwd" dest; None means "not specified".
+    parser.add_argument(
+        '--showfwd',
+        action='store_true',
+        default=None,
+    )
+    parser.add_argument(
+        '--no-showfwd',
+        dest='showfwd',
+        action='store_false',
+        default=None,
+    )
+
+    def process_args(args):
+        pass
+    return process_args
+
+
+#######################################
+# the commands
+
+def _cli_parse(parser, excluded=None, **prepr_kwargs):
+    """Add the CLI options for the "parse" command to the parser.
+
+    Returns the list of arg-processing callbacks, in the order in which
+    the option groups were added (output, kinds, preprocessor, files).
+    """
+    # The list items are evaluated in order, so the options get added
+    # to the parser in this same order.
+    return [
+        add_output_cli(parser),
+        add_kind_filtering_cli(parser),
+        add_preprocessor_cli(parser, **prepr_kwargs),
+        add_files_cli(parser, excluded=excluded),
+    ]
+
+
+def cmd_parse(filenames, *,
+              fmt='summary',
+              showfwd=None,
+              iter_filenames=None,
+              **kwargs
+              ):
+    """Parse the given files and print each item in the requested format.
+
+    "fmt" must be one of the keys of FORMATS; otherwise ValueError is
+    raised.  "showfwd" is passed along to the formatter.  Any remaining
+    keyword arguments are passed through to the parser; if they do not
+    include "get_file_preprocessor" then a default one (which reports
+    errors via print) is created.
+    """
+    if 'get_file_preprocessor' not in kwargs:
+        # Create the factory directly.  The module-level wrapper requires
+        # a filename, which is not available (or wanted) at this point.
+        kwargs['get_file_preprocessor'] = get_preprocessor(log_err=print)
+    try:
+        do_fmt = FORMATS[fmt]
+    except KeyError:
+        raise ValueError(f'unsupported fmt {fmt!r}')
+    for filename in main_for_filenames(filenames, iter_filenames):
+        for item in _iter_parsed(filename, **kwargs):
+            for line in do_fmt(filename, item, showfwd=showfwd):
+                print(line)
+
+
+def _cli_data(parser):
+    """Placeholder for the "data" command's CLI; adds no options yet.
+
+    Returns an empty list of arg-processing callbacks.
+    """
+    ...
+
+    return []
+
+
+def cmd_data(filenames,
+             **kwargs
+             ):
+    """Placeholder for the "data" command (not implemented)."""
+    # XXX
+    raise NotImplementedError
+
+
+# Each value is a 3-tuple:
+#   (description, [functions that add the CLI options], command function)
+COMMANDS = {
+    'parse': (
+        'parse the given C source & header files',
+        [_cli_parse],
+        cmd_parse,
+    ),
+    'data': (
+        'check/manage local data (e.g. excludes, macros)',
+        [_cli_data],
+        cmd_data,
+    ),
+}
+
+
+#######################################
+# the script
+
+def parse_args(argv=sys.argv[1:], prog=sys.argv[0], *, subset='parse'):
+    """Parse the command-line args for the selected command.
+
+    Returns (cmd, cmd_kwargs, verbosity, traceback_cm), where
+    "cmd_kwargs" is the dict of the remaining parsed args (with "cmd"
+    removed).  The "verbosity" and "traceback_cm" values are extracted
+    via process_args_by_key().
+    """
+    import argparse
+    parser = argparse.ArgumentParser(
+        # argparse needs a string here, so the fallback helper must be
+        # called rather than passed along as a function object.
+        prog=prog or get_prog(),
+    )
+
+    processors = add_commands_cli(
+        parser,
+        # Only the CLI spec of each command is needed here.
+        commands={k: v[1] for k, v in COMMANDS.items()},
+        commonspecs=[
+            add_verbosity_cli,
+            add_traceback_cli,
+        ],
+        subset=subset,
+    )
+
+    args = parser.parse_args(argv)
+    # "ns" is the namespace's own dict, so changes made to "args" by
+    # the processors below are visible through "ns" too.
+    ns = vars(args)
+
+    cmd = ns.pop('cmd')
+
+    verbosity, traceback_cm = process_args_by_key(
+        args,
+        processors[cmd],
+        ['verbosity', 'traceback_cm'],
+    )
+
+    return cmd, ns, verbosity, traceback_cm
+
+
+def main(cmd, cmd_kwargs):
+    """Run the named command with the given keyword arguments.
+
+    Raises ValueError if "cmd" is not one of the keys of COMMANDS.
+    """
+    try:
+        # Each COMMANDS value is (description, CLI specs, function);
+        # index 0 is the description string, which is not callable.
+        run_cmd = COMMANDS[cmd][2]
+    except KeyError:
+        raise ValueError(f'unsupported cmd {cmd!r}')
+    run_cmd(**cmd_kwargs)
+
+
+if __name__ == '__main__':
+    cmd, cmd_kwargs, verbosity, traceback_cm = parse_args()
+    configure_logger(verbosity)
+    with traceback_cm:
+        main(cmd, cmd_kwargs)
--- a/Tools/c-analyzer/c_parser/_state_machine.py
+++ b/Tools/c-analyzer/c_parser/_state_machine.py
@ -0,0 +1,244 @@
+
+# Notes on the declaration forms to handle.  This is deliberately a plain
+# string rather than an f-string: none of the placeholder names are
+# defined in this module, so formatting it would fail at import time.
+'''
+    struct {ANON_IDENTIFIER};
+    struct { ... }
+    struct {IDENTIFIER} { ... }
+
+    union {ANON_IDENTIFIER};
+    union { ... }
+    union {IDENTIFIER} { ... }
+
+    enum {ANON_IDENTIFIER};
+    enum { ... }
+    enum {IDENTIFIER} { ... }
+
+    typedef {VARTYPE} {IDENTIFIER};
+    typedef {IDENTIFIER};
+    typedef {IDENTIFIER};
+    typedef {IDENTIFIER};
+'''
+
+
+def parse(srclines):
+    """Unfinished entry point for the state-machine parser.
+
+    Passing a filename (a str) raises NotImplementedError; for anything
+    else nothing is done yet and None is returned.
+    """
+    if isinstance(srclines, str):  # a filename
+        raise NotImplementedError
+
+    
+
+# This only handles at most 10 nested levels.
+#MATCHED_PARENS = textwrap.dedent(rf'''
+#    # matched parens
+#    (?:
+#        [(]  # level 0
+#        (?:
+#            [^()]*
+#            [(]  # level 1
+#            (?:
+#                [^()]*
+#                [(]  # level 2
+#                (?:
+#                    [^()]*
+#                    [(]  # level 3
+#                    (?:
+#                        [^()]*
+#                        [(]  # level 4
+#                        (?:
+#                            [^()]*
+#                            [(]  # level 5
+#                            (?:
+#                                [^()]*
+#                                [(]  # level 6
+#                                (?:
+#                                    [^()]*
+#                                    [(]  # level 7
+#                                    (?:
+#                                        [^()]*
+#                                        [(]  # level 8
+#                                        (?:
+#                                            [^()]*
+#                                            [(]  # level 9
+#                                            (?:
+#                                                [^()]*
+#                                                [(]  # level 10
+#                                                [^()]*
+#                                                [)]
+#                                             )*
+#                                            [^()]*
+#                                            [)]
+#                                         )*
+#                                        [^()]*
+#                                        [)]
+#                                     )*
+#                                    [^()]*
+#                                    [)]
+#                                 )*
+#                                [^()]*
+#                                [)]
+#                             )*
+#                            [^()]*
+#                            [)]
+#                         )*
+#                        [^()]*
+#                        [)]
+#                     )*
+#                    [^()]*
+#                    [)]
+#                 )*
+#                [^()]*
+#                [)]
+#             )*
+#            [^()]*
+#            [)]
+#         )*
+#        [^()]*
+#        [)]
+#     )
+#    # end matched parens
+#    ''')
+
+'''
+        # for loop
+        (?:
+            \s* \b for
+            \s* [(]
+            (
+                [^;]* ;
+                [^;]* ;
+                .*?
+             )  # <header>
+            [)]
+            \s*
+            (?:
+                (?:
+                    (
+                        {_ind(SIMPLE_STMT, 6)}
+                     )  # <stmt>
+                    ;
+                 )
+                |
+                ( {{ )  # <open>
+             )
+         )
+        |
+
+
+
+            (
+                (?:
+                    (?:
+                        (?:
+                            {_ind(SIMPLE_STMT, 6)}
+                         )?
+                        return \b \s*
+                        {_ind(INITIALIZER, 5)}
+                     )
+                    |
+                    (?:
+                        (?:
+                            {IDENTIFIER} \s*
+                            (?: . | -> ) \s*
+                         )*
+                        {IDENTIFIER}
+                        \s* = \s*
+                        {_ind(INITIALIZER, 5)}
+                     )
+                    |
+                    (?:
+                        {_ind(SIMPLE_STMT, 5)}
+                     )
+                 )
+                |
+                # cast compound literal
+                (?:
+                    (?:
+                        [^'"{{}};]*
+                        {_ind(STRING_LITERAL, 5)}
+                     )*
+                    [^'"{{}};]*?
+                    [^'"{{}};=]
+                    =
+                    \s* [(] [^)]* [)]
+                    \s* {{ [^;]* }}
+                 )
+             )  # <stmt>
+
+
+
+        # compound statement
+        (?:
+            (
+                (?:
+
+                    # "for" statements are handled separately above.
+                    (?: (?: else \s+ )? if | switch | while ) \s*
+                    {_ind(COMPOUND_HEAD, 5)}
+                 )
+                |
+                (?: else | do )
+                # We do not worry about compound statements for labels,
+                # "case", or "default".
+             )?  # <header>
+            \s*
+            ( {{ )  # <open>
+         )
+
+
+
+            (
+                (?:
+                    [^'"{{}};]*
+                    {_ind(STRING_LITERAL, 5)}
+                 )*
+                [^'"{{}};]*
+                # Presumably we will not see "== {{".
+                [^\s='"{{}};]
+             )?  # <header>
+
+
+
+            (
+                \b
+                (?:
+                    # We don't worry about labels with a compound statement.
+                    (?:
+                        switch \s* [(] [^{{]* [)]
+                     )
+                    |
+                    (?:
+                        case \b \s* [^:]+ [:]
+                     )
+                    |
+                    (?:
+                        default \s* [:]
+                     )
+                    |
+                    (?:
+                        do
+                     )
+                    |
+                    (?:
+                        while \s* [(] [^{{]* [)]
+                     )
+                    |
+                    #(?:
+                    #    for \s* [(] [^{{]* [)]
+                    # )
+                    #|
+                    (?:
+                        if \s* [(]
+                        (?: [^{{]* [^)] \s* {{ )* [^{{]*
+                        [)]
+                     )
+                    |
+                    (?:
+                        else
+                        (?:
+                            \s*
+                            if \s* [(]
+                            (?: [^{{]* [^)] \s* {{ )* [^{{]*
+                            [)]
+                         )?
+                     )
+                 )
+             )?  # <header>
+'''
--- a/Tools/c-analyzer/c_parser/datafiles.py
+++ b/Tools/c-analyzer/c_parser/datafiles.py
@ -0,0 +1,150 @@
+import os.path
+
+import c_common.tables as _tables
+import c_parser.info as _info
+
+
+BASE_COLUMNS = [
+    'filename',
+    'funcname',
+    'name',
+    'kind',
+]
+END_COLUMNS = {
+    'parsed': 'data',
+    'decls': 'declaration',
+}
+
+
+def _get_columns(group, extra=None):
+    """Return the list of column names for the given group.
+
+    The base columns come first, then any extra columns, and finally
+    the group's own last column.  An unknown group raises KeyError.
+    """
+    return [
+        *BASE_COLUMNS,
+        *(extra or ()),
+        END_COLUMNS[group],
+    ]
+
+
+#############################
+# high-level
+
+def read_parsed(infile):
+    """Yield a ParsedItem for each row of the tab-separated "infile"."""
+    # XXX Support other formats than TSV?
+    colnames = _get_columns('parsed')
+    table = _tables.read_table(infile, colnames, sep='\t', fix='-')
+    for record in table:
+        yield _info.ParsedItem.from_row(record, colnames)
+
+
+def write_parsed(items, outfile):
+    """Write the given parsed items to "outfile" as a tab-separated table."""
+    # XXX Support other formats than TSV?
+    colnames = _get_columns('parsed')
+    _tables.write_table(
+        outfile,
+        colnames,
+        (parsed.as_row(colnames) for parsed in items),
+        sep='\t',
+        fix='-',
+    )
+
+
+def read_decls(infile, fmt=None):
+    """Yield each declaration read from "infile".
+
+    If "fmt" is not given then it is derived from the file's name.
+    Any extra per-row data produced by the reader is discarded.
+    """
+    if fmt is None:
+        fmt = _get_format(infile)
+    reader, _writer = _get_format_handlers('decls', fmt)
+    for decl, _extra in reader(infile):
+        yield decl
+
+
+def write_decls(decls, outfile, fmt=None, *, backup=False):
+    """Write the given declarations to "outfile".
+
+    If "fmt" is not given then it is derived from the name of
+    "outfile".  "backup" is passed through to the format's writer.
+    """
+    if fmt is None:
+        # The only file this function knows about is "outfile".
+        fmt = _get_format(outfile)
+    _, write_all = _get_format_handlers('decls', fmt)
+    write_all(decls, outfile, backup=backup)
+
+
+#############################
+# formats
+
+def _get_format(file, default='tsv'):
+    """Return the data format implied by the file's extension.
+
+    "file" may be a filename or an object with a "name" attribute (a
+    missing attribute is treated as an empty name).  If there is no
+    extension then "default" is returned.
+    """
+    name = file if isinstance(file, str) else getattr(file, 'name', '')
+    suffix = os.path.splitext(name)[1]
+    if not suffix:
+        return default
+    # Drop the leading dot.
+    return suffix[1:]
+
+
+def _get_format_handlers(group, fmt):
+    """Return the (reader, writer) pair for the given group and format.
+
+    Only the "decls" group in the "tsv" format is supported; anything
+    else raises NotImplementedError.
+    """
+    # XXX Use a registry.
+    if group != 'decls':
+        raise NotImplementedError(group)
+    if fmt != 'tsv':
+        raise NotImplementedError(fmt)
+    return (_iter_decls_tsv, _write_decls_tsv)
+
+
+# tsv
+
+def iter_decls_tsv(infile, extracolumns=None, relroot=None):
+    """Yield (Declaration, extra) for each row of the TSV file.
+
+    "extra" is whatever the low-level reader produced for the row's
+    extra columns.
+    """
+    raw_rows = _iter_decls_tsv(infile, extracolumns, relroot)
+    for declinfo, extra in raw_rows:
+        yield _info.Declaration.from_row(declinfo), extra
+
+
+def write_decls_tsv(decls, outfile, extracolumns=None, *,
+                    relroot=None,
+                    **kwargs
+                    ):
+    """Write the given declarations to "outfile" as a TSV table.
+
+    Any extra keyword arguments are handed, as a dict, to the low-level
+    writer, which forwards them to the table-writing helper.
+    """
+    # XXX Move the row rendering here.
+    # The low-level writer renders the rows itself, so it gets the decls.
+    _write_decls_tsv(decls, outfile, extracolumns, relroot, kwargs)
+
+
+def _iter_decls_tsv(infile, extracolumns=None, relroot=None):
+    """Yield (declinfo, extra) for each row of the TSV file.
+
+    With "extracolumns", "declinfo" is the first four values plus the
+    last one and "extra" is everything in between; otherwise "declinfo"
+    is the whole row and "extra" is None.
+    """
+    columns = _get_columns('decls', extracolumns)
+    for row in _tables.read_table(infile, columns, sep='\t'):
+        if extracolumns:
+            declinfo = row[:4] + row[-1:]
+            extra = row[4:-1]
+        else:
+            declinfo = row
+            extra = None
+        if relroot:
+            # NOTE(review): "-" values are only turned into None when
+            # "relroot" is given -- confirm that this is intentional.
+            # XXX Use something like tables.fix_row() here.
+            declinfo = [None if v == '-' else v
+                        for v in declinfo]
+            # The first column is the filename.
+            declinfo[0] = os.path.join(relroot, declinfo[0])
+        yield declinfo, extra
+
+
+def _write_decls_tsv(decls, outfile, extracolumns, relroot,kwargs):
+    """Render the given decls and write them to "outfile" as a TSV table.
+
+    With "extracolumns", each item may be a (decl, *extra) tuple; the
+    extra values go between the base columns and the final column,
+    with "???" filling in for any that are missing.  "kwargs" is a dict
+    of additional keyword arguments for the table-writing helper.
+    """
+    columns = _get_columns('decls', extracolumns)
+    if extracolumns:
+        # This takes the same arguments as _render_known_decl() so that
+        # the single call site below works for either renderer.
+        def render_decl(decl, relroot):
+            if type(decl) is tuple:
+                decl, *extra = decl
+            else:
+                extra = []
+            # Pad with placeholders up to the number of extra columns.
+            extra += ['???'] * (len(extracolumns) - len(extra))
+            *row, declaration = _render_known_decl(decl, relroot)
+            return [*row, *extra, declaration]
+    else:
+        render_decl = _render_known_decl
+    _tables.write_table(
+        outfile,
+        header='\t'.join(columns),
+        rows=(render_decl(d, relroot) for d in decls),
+        sep='\t',
+        **kwargs
+    )
+
+
+def _render_known_decl(decl, relroot, *,
+                       # These match BASE_COLUMNS + END_COLUMNS[group].
+                       _columns = 'filename parent name kind data'.split(),
+                       ):
+    """Return the row (a list of values) for the given declaration.
+
+    If "relroot" is set then the filename is made relative to it.
+    Empty values are rendered as "-".
+    """
+    # Anything other than a Declaration (e.g. Analyzed) is expected
+    # to wrap one in its "decl" attribute.
+    known = decl if isinstance(decl, _info.Declaration) else decl.decl
+    rendered = known.render_rowdata(_columns)
+    if relroot:
+        rendered['filename'] = os.path.relpath(rendered['filename'], relroot)
+    row = []
+    for colname in _columns:
+        row.append(rendered[colname] or '-')
+    return row
+    # XXX
+    #return _tables.fix_row(rowdata[c] for c in columns)
--- a/Tools/c-analyzer/c_parser/info.py
+++ b/Tools/c-analyzer/c_parser/info.py
--- a/Tools/c-analyzer/c_parser/parser/init.py
+++ b/Tools/c-analyzer/c_parser/parser/init.py
@ -0,0 +1,212 @@
+"""A simple non-validating parser for C99.
+
+The functions and regex patterns here are not entirely suitable for
+validating C syntax.  Please rely on a proper compiler for that.
+Instead our goal here is merely matching and extracting information from
+valid C code.
+
+Furthermore, the grammar rules for the C syntax (particularly as
+described in the K&R book) actually describe a superset, of which the
+full C language is a proper subset.  Here are some of the extra
+conditions that must be applied when parsing C code:
+
+* ...
+
+(see: http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf)
+
+We have taken advantage of the elements of the C grammar that are used
+only in a few limited contexts, mostly as delimiters.  They allow us to
+focus the regex patterns confidently.  Here are the relevant tokens and
+in which grammar rules they are used:
+
+separators:
+* ";"
+   + (decl) struct/union:  at end of each member decl
+   + (decl) declaration:  at end of each (non-compound) decl
+   + (stmt) expr stmt:  at end of each stmt
+   + (stmt) for:  between exprs in "header"
+   + (stmt) goto:  at end
+   + (stmt) continue:  at end
+   + (stmt) break:  at end
+   + (stmt) return:  at end
+* ","
+   + (decl) struct/union:  between member declators
+   + (decl) param-list:  between params
+   + (decl) enum: between enumerators
+   + (decl) initializer (compound):  between initializers
+   + (expr) postfix:  between func call args
+   + (expr) expression:  between "assignment" exprs
+* ":"
+   + (decl) struct/union:  in member declators
+   + (stmt) label:  between label and stmt
+   + (stmt) case:  between expression and stmt
+   + (stmt) default:  between "default" and stmt
+* "="
+   + (decl) declaration:  between decl and initializer
+   + (decl) enumerator:  between identifier and "initializer"
+   + (expr) assignment:  between "var" and expr
+
+wrappers:
+* "(...)"
+   + (decl) declarator (func ptr):  to wrap ptr/name
+   + (decl) declarator (func ptr):  around params
+   + (decl) declarator:  around sub-declarator (for readability)
+   + (expr) postfix (func call):  around args
+   + (expr) primary:  around sub-expr
+   + (stmt) if:  around condition
+   + (stmt) switch:  around source expr
+   + (stmt) while:  around condition
+   + (stmt) do-while:  around condition
+   + (stmt) for:  around "header"
+* "{...}"
+   + (decl) enum:  around enumerators
+   + (decl) func:  around body
+   + (stmt) compound:  around stmts
+* "[...]"
+   * (decl) declarator:  for arrays
+   * (expr) postfix:  array access
+
+other:
+* "*"
+   + (decl) declarator:  for pointer types
+   + (expr) unary:  for pointer deref
+
+
+To simplify the regular expressions used here, we've taken some
+shortcuts and made certain assumptions about the code we are parsing.
+Some of these allow us to skip context-sensitive matching (e.g. braces)
+or otherwise still match arbitrary C code unambiguously.  However, in
+some cases there are certain corner cases where the patterns are
+ambiguous relative to arbitrary C code.  However, they are still
+unambiguous in the specific code we are parsing.
+
+Here are the cases where we've taken shortcuts or made assumptions:
+
+* there is no overlap syntactically between the local context (func
+  bodies) and the global context (other than variable decls), so we
+  do not need to worry about ambiguity due to the overlap:
+   + the global context has no expressions or statements
+   + the local context has no function definitions or type decls
+* no "inline" type declarations (struct, union, enum) in function
+  parameters ~(including function pointers)~
+* no "inline" type decls in function return types
+* no superfluous parentheses in declarators
+* var decls in for loops are always "simple" (e.g. no inline types)
+* only inline struct/union/enum decls may be anonymous (without a name)
+* no function pointers in function pointer parameters
+* for loop "headers" do not have curly braces (e.g. compound init)
+* syntactically, variable decls do not overlap with stmts/exprs, except
+  in the following case:
+    spam (*eggs) (...)
+  This could be either a function pointer variable named "eggs"
+  or a call to a function named "spam", which returns a function
+  pointer that gets called.  The only differentiator is the
+  syntax used in the "..." part.  It will be comma-separated
+  parameters for the former and comma-separated expressions for
+  the latter.  Thus, if we expect such decls or calls then we must
+  parse the decl params.
+"""
+
+"""
+TODO:
+* extract CPython-specific code
+* drop include injection (or only add when needed)
+* track position instead of slicing "text"
+* Parser class instead of the _iter_source() mess
+* alt impl using a state machine (& tokenizer or split on delimiters)
+"""
+
+from ..info import ParsedItem
+from ._info import SourceInfo
+
+
+def parse(srclines):
+    """Yield a ParsedItem for each item parsed out of the source lines.
+
+    Passing a filename (a str) is not supported.
+    """
+    if isinstance(srclines, str):  # a filename
+        raise NotImplementedError
+
+    # A fresh generator of names for anonymous items, for this run only.
+    new_anon_name = anonymous_names()
+    raw_results = _parse(srclines, new_anon_name)
+    yield from (ParsedItem.from_raw(raw) for raw in raw_results)
+
+
+# XXX Later: Add a separate function to deal with preprocessor directives
+# parsed out of raw source.
+
+
+def anonymous_names():
+    """Return a function that produces a new "<prefix><N>" name per call.
+
+    N starts at 1 and is shared by all prefixes.  Each call to
+    anonymous_names() starts a separate sequence.
+    """
+    next_index = 1
+
+    def anon_name(prefix='anon-'):
+        nonlocal next_index
+        index, next_index = next_index, next_index + 1
+        return f'{prefix}{index}'
+
+    return anon_name
+
+
+#############################
+# internal impl
+
+import logging
+
+
+_logger = logging.getLogger(__name__)
+
+
+def _parse(srclines, anon_name):
+    """Yield each result of parsing the global scope of the source lines."""
+    from ._global import parse_globals
+
+    source = _iter_source(srclines)
+    #source = _iter_source(srclines, showtext=True)
+    for result in parse_globals(source, anon_name):
+        # XXX Handle blocks here instead of in parse_globals().
+        yield result
+
+
+def _iter_source(lines, *, maxtext=20_000, maxlines=700, showtext=False):
+    """Feed the lines into per-file SourceInfo objects and yield those.
+
+    "lines" is an iterable of (fileinfo, text) pairs.  After each line
+    is added, the file's SourceInfo is yielded repeatedly for as long
+    as its _used() returns true.
+
+    Raises Exception ("unmatched text") if the pending text grows past
+    "maxtext" characters or "maxlines" lines, or if the final
+    SourceInfo is still marked "_ready" once the lines run out.
+    """
+    # "filestack" is the stack of files currently being read and
+    # "allinfo" maps each of those filenames to its SourceInfo.
+    filestack = []
+    allinfo = {}
+    # "lines" should be (fileinfo, data), as produced by the preprocessor code.
+    for fileinfo, line in lines:
+        if fileinfo.filename in filestack:
+            # Back in a file seen earlier: drop everything above it.
+            while fileinfo.filename != filestack[-1]:
+                filename = filestack.pop()
+                del allinfo[filename]
+            filename = fileinfo.filename
+            srcinfo = allinfo[filename]
+        else:
+            # A file not on the stack: start tracking it.
+            filename = fileinfo.filename
+            srcinfo = SourceInfo(filename)
+            filestack.append(filename)
+            allinfo[filename] = srcinfo
+
+        _logger.debug(f'-> {line}')
+        srcinfo._add_line(line, fileinfo.lno)
+        # Too much pending text: bail out to the error at the bottom.
+        if len(srcinfo.text) > maxtext:
+            break
+        if srcinfo.end - srcinfo.start > maxlines:
+            break
+        while srcinfo._used():
+            yield srcinfo
+            if showtext:
+                _logger.debug(f'=> {srcinfo.text}')
+    else:
+        # The lines ran out (no "break" above).
+        if not filestack:
+            srcinfo = SourceInfo('???')
+        else:
+            filename = filestack[-1]
+            srcinfo = allinfo[filename]
+            while srcinfo._used():
+                yield srcinfo
+                if showtext:
+                    _logger.debug(f'=> {srcinfo.text}')
+        # Yield one final time before checking the "_ready" flag.
+        yield srcinfo
+        if showtext:
+            _logger.debug(f'=> {srcinfo.text}')
+        if not srcinfo._ready:
+            return
+    # At this point either the file ended prematurely
+    # or there's "too much" text.
+    filename, lno, text = srcinfo.filename, srcinfo._start, srcinfo.text
+    if len(text) > 500:
+        text = text[:500] + '...'
+    raise Exception(f'unmatched text ({filename} starting at line {lno}):\n{text}')
--- a/Tools/c-analyzer/c_parser/parser/_alt.py
+++ b/Tools/c-analyzer/c_parser/parser/_alt.py
@ -0,0 +1,6 @@
+
+def _parse(srclines, anon_name):
+    """Join all the source lines with spaces and parse the text at once."""
+    pieces = (data for _, data in srclines)
+    joined = ' '.join(pieces)
+
+    from ._delim import parse as parse_text
+    yield from parse_text(joined, anon_name)
--- a/Tools/c-analyzer/c_parser/parser/_common.py
+++ b/Tools/c-analyzer/c_parser/parser/_common.py
@ -0,0 +1,115 @@
+import re
+
+from ._regexes import (
+    _ind,
+    STRING_LITERAL,
+    VAR_DECL as _VAR_DECL,
+)
+
+
+def log_match(group, m):
+    """Log, at debug level, which group matched and the matched text."""
+    from . import _logger
+    matched = m.group(0)
+    _logger.debug(f'matched <{group}> ({matched})')
+
+
+#############################
+# regex utils
+
+def set_capture_group(pattern, group, *, strict=True):
+    """Turn the first non-capturing group tagged "<group>" into a capturing one.
+
+    The tagged group is the text "(?:  # <group>" in the pattern.  If
+    it is missing then ValueError is raised, unless "strict" is false,
+    in which case the pattern is returned unchanged.
+    """
+    marker = f'(?:  # <{group}>'
+    if strict:
+        if marker not in pattern:
+            raise ValueError(f'{marker!r} not found in pattern')
+    capturing = f'(  # <{group}>'
+    return pattern.replace(marker, capturing, 1)
+
+
+def set_capture_groups(pattern, groups, *, strict=True):
+    """Apply set_capture_group() to the pattern for each group, in order."""
+    updated = pattern
+    for name in groups:
+        updated = set_capture_group(updated, name, strict=strict)
+    return updated
+
+
+#############################
+# syntax-related utils
+
+# Matches everything up to and including the next paren, skipping over
+# string literals.  Exactly one of the two groups is set: the first for
+# an open paren, the second for a close paren.
+_PAREN_RE = re.compile(rf'''
+    (?:
+        (?:
+            [^'"()]*
+            {_ind(STRING_LITERAL, 3)}
+         )*
+        [^'"()]*
+        (?:
+            ( [(] )
+            |
+            ( [)] )
+         )
+     )
+    ''', re.VERBOSE)
+
+
+def match_paren(text, depth=0):
+    """Return the index in "text" just past the paren that closes depth 0.
+
+    "depth" is the nesting level at the start of the text; each open
+    paren increments it and each close paren decrements it.  Raises
+    ValueError if the text runs out of parens before the depth gets
+    back to 0.
+    """
+    pos = 0
+    while (m := _PAREN_RE.match(text, pos)):
+        pos = m.end()
+        _open, _close = m.groups()
+        if _open:
+            depth += 1
+        else:  # _close
+            depth -= 1
+            if depth == 0:
+                return pos
+    else:
+        # No more parens were found.
+        raise ValueError(f'could not find matching parens for {text!r}')
+
+
+# The variable declaration pattern, with the parts wanted by
+# parse_var_decl() turned into capturing groups (in this order).
+VAR_DECL = set_capture_groups(_VAR_DECL, (
+    'STORAGE',
+    'TYPE_QUAL',
+    'TYPE_SPEC',
+    'DECLARATOR',
+    'IDENTIFIER',
+    'WRAPPED_IDENTIFIER',
+    'FUNC_IDENTIFIER',
+))
+
+
+def parse_var_decl(decl):
+    """Return (kind, name, vartype) for the given variable declaration.
+
+    "kind" is "simple", "wrapped", or "funcptr", depending on which of
+    the identifier groups matched.  "vartype" is a dict with the keys
+    "storage", "typequal", "typespec", and "abstract" (the declarator
+    with the name removed).
+
+    Raises NotImplementedError if none of the identifier groups matched.
+    """
+    # NOTE(review): there is no check for a failed match; in that case
+    # "m" is None and the next statement raises AttributeError.
+    m = re.match(VAR_DECL, decl, re.VERBOSE)
+    (storage, typequal, typespec, declarator,
+     name,
+     wrappedname,
+     funcptrname,
+     ) = m.groups()
+    if name:
+        kind = 'simple'
+    elif wrappedname:
+        kind = 'wrapped'
+        name = wrappedname
+    elif funcptrname:
+        kind = 'funcptr'
+        name = funcptrname
+    else:
+        raise NotImplementedError
+    # Note that this removes every occurrence of the name in the declarator.
+    abstract = declarator.replace(name, '')
+    vartype = {
+        'storage': storage,
+        'typequal': typequal,
+        'typespec': typespec,
+        'abstract': abstract,
+    }
+    return (kind, name, vartype)
+
+
+#############################
+# parser state utils
+
+# XXX Drop this or use it!
+def iter_results(results):
+    """Yield each (result, text) pair that has a truthy result.
+
+    "results" may be an iterable of pairs, a callable that returns one,
+    or a false value (e.g. None), for which nothing is yielded.
+    """
+    if not results:
+        return
+    if callable(results):
+        results = results()
+
+    # "results" is already the iterable at this point; calling it again
+    # would fail for anything that is not itself a callable.
+    for result, text in results:
+        if result:
+            yield result, text
--- a/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py
+++ b/Tools/c-analyzer/c_parser/parser/_compound_decl_body.py
@ -0,0 +1,158 @@
+import re
+
+from ._regexes import (
+    STRUCT_MEMBER_DECL as _STRUCT_MEMBER_DECL,
+    ENUM_MEMBER_DECL as _ENUM_MEMBER_DECL,
+)
+from ._common import (
+    log_match,
+    parse_var_decl,
+    set_capture_groups,
+)
+
+
+#############################
+# struct / union
+
+# The struct/union member pattern with its capture markers turned into
+# real groups.  _parse_struct_next() unpacks m.groups() in this order.
+STRUCT_MEMBER_DECL = set_capture_groups(_STRUCT_MEMBER_DECL, (
+    'COMPOUND_TYPE_KIND',
+    'COMPOUND_TYPE_NAME',
+    'SPECIFIER_QUALIFIER',
+    'DECLARATOR',
+    'SIZE',
+    'ENDING',
+    'CLOSE',
+))
+# Anchored at the start of the pending text, allowing leading whitespace.
+STRUCT_MEMBER_RE = re.compile(rf'^ \s* {STRUCT_MEMBER_DECL}', re.VERBOSE)
+
+
+def parse_struct_body(source, anon_name, parent):
+    """Yield the items found in the body of a struct or union.
+
+    "source" is iterated until the pending text matches
+    STRUCT_MEMBER_RE; the match is then handed to _parse_struct_next().
+    Callables produced by that helper are nested body parsers and are
+    run against the same source.  Parsing stops once a match produces
+    nothing (e.g. the closing of the body) or the source is exhausted.
+    """
+    # Keep this bound even if "source" never yields anything.
+    srcinfo = None
+    done = False
+    while not done:
+        done = True
+        for srcinfo in source:
+            m = STRUCT_MEMBER_RE.match(srcinfo.text)
+            if m:
+                break
+        else:
+            # We ran out of lines.
+            if srcinfo is not None:
+                srcinfo.done()
+            return
+        for item in _parse_struct_next(m, srcinfo, anon_name, parent):
+            if callable(item):
+                parse_body = item
+                yield from parse_body(source)
+            else:
+                yield item
+            done = False
+
+
+def _parse_struct_next(m, srcinfo, anon_name, parent):
+    """Handle a single STRUCT_MEMBER_RE match against srcinfo.text.
+
+    Depending on what matched, this:
+
+    * close: advances past the match and yields nothing;
+    * inline compound type: yields a forward declaration followed by a
+      callable that parses the nested body from the source;
+    * regular member: yields a 'field' item and advances.
+
+    Raises NotImplementedError for a member without a
+    specifier/qualifier.
+    """
+    (inline_kind, inline_name,
+     qualspec, declarator,
+     size,
+     ending,
+     close,
+     ) = m.groups()
+    remainder = srcinfo.text[m.end():]
+
+    if close:
+        log_match('compound close', m)
+        srcinfo.advance(remainder)
+
+    elif inline_kind:
+        log_match('compound inline', m)
+        kind = inline_kind
+        name = inline_name or anon_name('inline-')
+        # Immediately emit a forward declaration.
+        yield srcinfo.resolve(kind, name=name, data=None)
+
+        # un-inline the decl.  Note that it might not actually be inline.
+        # We handle the case in the "maybe_inline_actual" branch.
+        srcinfo.nest(
+            remainder,
+            f'{kind} {name}',
+        )
+        def parse_body(source):
+            _parse_body = DECL_BODY_PARSERS[kind]
+
+            data = []  # members
+            ident = f'{kind} {name}'
+            for item in _parse_body(source, anon_name, ident):
+                if item.kind == 'field':
+                    data.append(item)
+                else:
+                    yield item
+            # XXX Should "parent" really be None for inline type decls?
+            yield srcinfo.resolve(kind, data, name, parent=None)
+
+            srcinfo.resume()
+        yield parse_body
+
+    else:
+        # not inline (member)
+        log_match('compound member', m)
+        if not qualspec:
+            # This shouldn't happen (we expect each field to have a name).
+            raise NotImplementedError
+        _, name, data = parse_var_decl(f'{qualspec} {declarator}')
+        if not name:
+            name = anon_name('struct-field-')
+        if size:
+            data['size'] = int(size)
+
+        yield srcinfo.resolve('field', data, name, parent)  # XXX Restart?
+        if ending == ',':
+            # A multi-declaration: re-queue the type for the next member.
+            remainder = rf'{qualspec} {remainder}'
+        srcinfo.advance(remainder)
+
+
+#############################
+# enum
+
+# The enum member pattern with its capture markers turned into real
+# groups.  parse_enum_body() unpacks m.groups() in this same order.
+ENUM_MEMBER_DECL = set_capture_groups(_ENUM_MEMBER_DECL, (
+    'CLOSE',
+    'NAME',
+    'INIT',
+    'ENDING',
+))
+ENUM_MEMBER_RE = re.compile(rf'{ENUM_MEMBER_DECL}', re.VERBOSE)
+
+
+def parse_enum_body(source, _anon_name, _parent):
+    """Yield a 'field' item for each member in the body of an enum.
+
+    "source" is iterated until the pending text matches ENUM_MEMBER_RE.
+    Each member's data is its initializer (or None).  Parsing stops at
+    the closing of the body or when the source is exhausted.
+    """
+    # Keep this bound even if "source" never yields anything.
+    srcinfo = None
+    ending = None
+    while ending != '}':
+        for srcinfo in source:
+            m = ENUM_MEMBER_RE.match(srcinfo.text)
+            if m:
+                break
+        else:
+            # We ran out of lines.
+            if srcinfo is not None:
+                srcinfo.done()
+            return
+        remainder = srcinfo.text[m.end():]
+
+        (close,
+         name, init, ending,
+         ) = m.groups()
+        if close:
+            ending = '}'
+        else:
+            data = init
+            yield srcinfo.resolve('field', data, name, _parent)
+        srcinfo.advance(remainder)
+
+
+#############################
+
+# Maps the kind of a compound type to the generator function that parses
+# its body.  Unions share the struct body parser.
+DECL_BODY_PARSERS = {
+    'struct': parse_struct_body,
+    'union': parse_struct_body,
+    'enum': parse_enum_body,
+}
--- a/Tools/c-analyzer/c_parser/parser/_delim.py
+++ b/Tools/c-analyzer/c_parser/parser/_delim.py
@ -0,0 +1,54 @@
+import re
+import textwrap
+
+from ._regexes import _ind, STRING_LITERAL
+
+
+def parse(text, anon_name):
+    """Yield the results of handling each delimited segment of "text".
+
+    Every DELIMITER_RE match is dispatched, based on the current context
+    and the delimiter found, to a handler looked up in _HANDLERS.  The
+    handler returns a (result, context, data) triple; truthy results
+    are yielded and the context/data carry over to the next segment.
+
+    NOTE(review): _HANDLERS currently only holds placeholders, so this
+    is not usable yet.  "anon_name" is accepted but not used.
+    """
+    context = None
+    data = None
+    for m in DELIMITER_RE.finditer(text):
+        before, opened, closed = m.groups()
+        delim = opened or closed
+
+        handle_segment = _HANDLERS[context][delim]
+        result, context, data = handle_segment(before, delim, data)
+        if result:
+            yield result
+
+
+# Matches an optional run of text (which may contain string/char
+# literals) followed by an optional opening or closing delimiter.
+# Groups: 1 = <before>, 2 = <open>, 3 = <close>.
+#
+# This is an f-string, so literal braces must be doubled.
+DELIMITER = textwrap.dedent(rf'''
+    (
+        (?:
+            [^'"()\[\]{{}};]*
+            {_ind(STRING_LITERAL, 3)}
+         )*
+        [^'"()\[\]{{}};]+
+     )?  # <before>
+    (?:
+        (
+            [(\[{{]
+         )  # <open>
+        |
+        (
+            [)\]}};]
+         )  # <close>
+     )?
+    ''')
+DELIMITER_RE = re.compile(DELIMITER, re.VERBOSE)
+
+# Dispatch table keyed first by context and then by delimiter.
+# NOTE(review): the values are placeholders (Ellipsis/None), not
+# callables, so nothing can be dispatched through this table yet.
+_HANDLERS = {
+    None: {  # global
+        # opened
+        '{': ...,
+        '[': None,
+        '(': None,
+        # closed
+        '}': None,
+        ']': None,
+        ')': None,
+        ';': ...,
+    },
+    '': {
+    },
+}
--- a/Tools/c-analyzer/c_parser/parser/_func_body.py
+++ b/Tools/c-analyzer/c_parser/parser/_func_body.py
@ -0,0 +1,278 @@
+import re
+
+from ._regexes import (
+    LOCAL as _LOCAL,
+    LOCAL_STATICS as _LOCAL_STATICS,
+)
+from ._common import (
+    log_match,
+    parse_var_decl,
+    set_capture_groups,
+    match_paren,
+)
+from ._compound_decl_body import DECL_BODY_PARSERS
+
+
+# The function-body statement pattern with its capture markers turned
+# into real groups.  The 6-argument parse_function_body() unpacks
+# m.groups() in this same order.
+LOCAL = set_capture_groups(_LOCAL, (
+    'EMPTY',
+    'INLINE_LEADING',
+    'INLINE_PRE',
+    'INLINE_KIND',
+    'INLINE_NAME',
+    'STORAGE',
+    'VAR_DECL',
+    'VAR_INIT',
+    'VAR_ENDING',
+    'COMPOUND_BARE',
+    'COMPOUND_LABELED',
+    'COMPOUND_PAREN',
+    'BLOCK_LEADING',
+    'BLOCK_OPEN',
+    'SIMPLE_STMT',
+    'SIMPLE_ENDING',
+    'BLOCK_CLOSE',
+))
+# Anchored at the start of the pending text, allowing leading whitespace.
+LOCAL_RE = re.compile(rf'^ \s* {LOCAL}', re.VERBOSE)
+
+
+# Note that parse_function_body() still has trouble with a few files
+# in the CPython codebase.
+
+def parse_function_body(source, name, anon_name):
+    """Placeholder for a full function body parser (not implemented).
+
+    NOTE: this definition is replaced by the later function of the same
+    name in this module, so it is never the one that gets called.
+    """
+    # XXX
+    raise NotImplementedError
+
+
+def parse_function_body(name, text, resolve, source, anon_name, parent):
+    """Parse the statements of a function body (currently disabled).
+
+    This is a generator function, so the NotImplementedError below is
+    raised on the first iteration rather than at call time.  Everything
+    after that "raise" is unreachable; it is the draft of a parser that
+    matches LOCAL_RE against "text" and yields (item, text) pairs while
+    tracking the block depth.
+
+    NOTE(review): the draft calls continue_text(), which is neither
+    defined nor imported in this module, and it calls the body parsers
+    from DECL_BODY_PARSERS with different arguments than their
+    (source, anon_name, parent) signatures.
+    """
+    raise NotImplementedError
+    # For now we do not worry about locals declared in for loop "headers".
+    depth = 1;
+    while depth > 0:
+        m = LOCAL_RE.match(text)
+        while not m:
+            text, resolve = continue_text(source, text or '{', resolve)
+            m = LOCAL_RE.match(text)
+        text = text[m.end():]
+        (
+         empty,
+         inline_leading, inline_pre, inline_kind, inline_name,
+         storage, decl,
+         var_init, var_ending,
+         compound_bare, compound_labeled, compound_paren,
+         block_leading, block_open,
+         simple_stmt, simple_ending,
+         block_close,
+         ) = m.groups()
+
+        if empty:
+            log_match('', m)
+            resolve(None, None, None, text)
+            yield None, text
+        elif inline_kind:
+            log_match('', m)
+            kind = inline_kind
+            name = inline_name or anon_name('inline-')
+            data = []  # members
+            # We must set the internal "text" from _iter_source() to the
+            # start of the inline compound body,
+            # Note that this is effectively like a forward reference that
+            # we do not emit.
+            resolve(kind, None, name, text, None)
+            _parse_body = DECL_BODY_PARSERS[kind]
+            before = []
+            ident = f'{kind} {name}'
+            for member, inline, text in _parse_body(text, resolve, source, anon_name, ident):
+                if member:
+                    data.append(member)
+                if inline:
+                    yield from inline
+            # un-inline the decl.  Note that it might not actually be inline.
+            # We handle the case in the "maybe_inline_actual" branch.
+            text = f'{inline_leading or ""} {inline_pre or ""} {kind} {name} {text}'
+            # XXX Should "parent" really be None for inline type decls?
+            yield resolve(kind, data, name, text, None), text
+        elif block_close:
+            log_match('', m)
+            depth -= 1
+            resolve(None, None, None, text)
+            # XXX This isn't great.  Calling resolve() should have
+            # cleared the closing bracket.  However, some code relies
+            # on the yielded value instead of the resolved one.  That
+            # needs to be fixed.
+            yield None, text
+        elif compound_bare:
+            log_match('', m)
+            yield resolve('statement', compound_bare, None, text, parent), text
+        elif compound_labeled:
+            log_match('', m)
+            yield resolve('statement', compound_labeled, None, text, parent), text
+        elif compound_paren:
+            log_match('', m)
+            try:
+                pos = match_paren(text)
+            except ValueError:
+                # The parenthesized head is incomplete; restore the keyword
+                # and pull in more text.
+                text = f'{compound_paren} {text}'
+                #resolve(None, None, None, text)
+                text, resolve = continue_text(source, text, resolve)
+                yield None, text
+            else:
+                head = text[:pos]
+                text = text[pos:]
+                if compound_paren == 'for':
+                    # XXX Parse "head" as a compound statement.
+                    stmt1, stmt2, stmt3 = head.split(';', 2)
+                    data = {
+                        'compound': compound_paren,
+                        'statements': (stmt1, stmt2, stmt3),
+                    }
+                else:
+                    data = {
+                        'compound': compound_paren,
+                        'statement': head,
+                    }
+                yield resolve('statement', data, None, text, parent), text
+        elif block_open:
+            log_match('', m)
+            depth += 1
+            if block_leading:
+                # An inline block: the last evaluated expression is used
+                # in place of the block.
+                # XXX Combine it with the remainder after the block close.
+                stmt = f'{block_open}{{<expr>}}...;'
+                yield resolve('statement', stmt, None, text, parent), text
+            else:
+                resolve(None, None, None, text)
+                yield None, text
+        elif simple_ending:
+            log_match('', m)
+            yield resolve('statement', simple_stmt, None, text, parent), text
+        elif var_ending:
+            log_match('', m)
+            kind = 'variable'
+            _, name, vartype = parse_var_decl(decl)
+            data = {
+                'storage': storage,
+                'vartype': vartype,
+            }
+            after = ()
+            if var_ending == ',':
+                # It was a multi-declaration, so queue up the next one.
+                _, qual, typespec, _ = vartype.values()
+                text = f'{storage or ""} {qual or ""} {typespec} {text}'
+            yield resolve(kind, data, name, text, parent), text
+            if var_init:
+                _data = f'{name} = {var_init.strip()}'
+                yield resolve('statement', _data, None, text, parent), text
+        else:
+            # This should be unreachable.
+            raise NotImplementedError
+
+
+#############################
+# static local variables
+
+# The static-locals pattern with its capture markers turned into real
+# groups.  _parse_next_local_static() unpacks m.groups() in this order.
+LOCAL_STATICS = set_capture_groups(_LOCAL_STATICS, (
+    'INLINE_LEADING',
+    'INLINE_PRE',
+    'INLINE_KIND',
+    'INLINE_NAME',
+    'STATIC_DECL',
+    'STATIC_INIT',
+    'STATIC_ENDING',
+    'DELIM_LEADING',
+    'BLOCK_OPEN',
+    'BLOCK_CLOSE',
+    'STMT_END',
+))
+# Anchored at the start of the pending text, allowing leading whitespace.
+LOCAL_STATICS_RE = re.compile(rf'^ \s* {LOCAL_STATICS}', re.VERBOSE)
+
+
+def parse_function_statics(source, func, anon_name):
+    """Yield the static locals (and inline types) found in a function body.
+
+    "source" is iterated until the pending text matches
+    LOCAL_STATICS_RE; the match is then handed to
+    _parse_next_local_static(), which also reports the updated block
+    depth.  Callables produced by that helper are nested body parsers
+    and are run against the same source.  Parsing stops when the depth
+    drops to zero (the function's closing brace) or the source is
+    exhausted.
+    """
+    # For now we do not worry about locals declared in for loop "headers".
+    # Keep "srcinfo" bound even if "source" never yields anything.
+    srcinfo = None
+    depth = 1
+    while depth > 0:
+        for srcinfo in source:
+            m = LOCAL_STATICS_RE.match(srcinfo.text)
+            if m:
+                break
+        else:
+            # We ran out of lines.
+            if srcinfo is not None:
+                srcinfo.done()
+            return
+        for item, depth in _parse_next_local_static(m, srcinfo,
+                                                    anon_name, func, depth):
+            if callable(item):
+                parse_body = item
+                yield from parse_body(source)
+            elif item is not None:
+                yield item
+
+def _parse_next_local_static(m, srcinfo, anon_name, func, depth):
+    """Handle a single LOCAL_STATICS_RE match against srcinfo.text.
+
+    Yields (item, depth) pairs, where "depth" is the block depth after
+    the match and "item" is one of:
+
+    * a resolved item (an inline type's forward declaration or a
+      static variable whose parent is "func");
+    * a callable that parses an inline type's body from the source;
+    * None, for block delimiters and statement endings.
+
+    Raises NotImplementedError if none of the expected groups matched.
+    """
+    (inline_leading, inline_pre, inline_kind, inline_name,
+     static_decl, static_init, static_ending,
+     _delim_leading,
+     block_open,
+     block_close,
+     stmt_end,
+     ) = m.groups()
+    remainder = srcinfo.text[m.end():]
+
+    if inline_kind:
+        log_match('func inline', m)
+        kind = inline_kind
+        name = inline_name or anon_name('inline-')
+        # Immediately emit a forward declaration.
+        yield srcinfo.resolve(kind, name=name, data=None), depth
+
+        # un-inline the decl.  Note that it might not actually be inline.
+        # We handle the case in the "maybe_inline_actual" branch.
+        srcinfo.nest(
+            remainder,
+            f'{inline_leading or ""} {inline_pre or ""} {kind} {name}'
+        )
+        def parse_body(source):
+            _parse_body = DECL_BODY_PARSERS[kind]
+
+            data = []  # members
+            ident = f'{kind} {name}'
+            for item in _parse_body(source, anon_name, ident):
+                if item.kind == 'field':
+                    data.append(item)
+                else:
+                    yield item
+            # XXX Should "parent" really be None for inline type decls?
+            yield srcinfo.resolve(kind, data, name, parent=None)
+
+            srcinfo.resume()
+        yield parse_body, depth
+
+    elif static_decl:
+        log_match('local variable', m)
+        _, name, data = parse_var_decl(static_decl)
+
+        yield srcinfo.resolve('variable', data, name, parent=func), depth
+
+        if static_init:
+            # Re-queue the initializer as an ordinary statement.
+            srcinfo.advance(f'{name} {static_init} {remainder}')
+        elif static_ending == ',':
+            # It was a multi-declaration, so queue up the next one.
+            # (This relies on the key order of the dict returned
+            # by parse_var_decl().)
+            _, qual, typespec, _ = data.values()
+            srcinfo.advance(f'static {qual or ""} {typespec} {remainder}')
+        else:
+            # NOTE(review): "remainder" is not re-queued here -- confirm
+            # that nothing can follow the match in this case.
+            srcinfo.advance('')
+
+    else:
+        log_match('func other', m)
+        if block_open:
+            depth += 1
+        elif block_close:
+            depth -= 1
+        elif stmt_end:
+            pass
+        else:
+            # This should be unreachable.
+            raise NotImplementedError
+        srcinfo.advance(remainder)
+        yield None, depth
--- a/Tools/c-analyzer/c_parser/parser/_global.py
+++ b/Tools/c-analyzer/c_parser/parser/_global.py
@ -0,0 +1,179 @@
+import re
+
+from ._regexes import (
+    GLOBAL as _GLOBAL,
+)
+from ._common import (
+    log_match,
+    parse_var_decl,
+    set_capture_groups,
+)
+from ._compound_decl_body import DECL_BODY_PARSERS
+#from ._func_body import parse_function_body
+from ._func_body import parse_function_statics as parse_function_body
+
+
+# The top-level declaration pattern with its capture markers turned
+# into real groups.  _parse_next() unpacks m.groups() in this same order.
+GLOBAL = set_capture_groups(_GLOBAL, (
+    'EMPTY',
+    'COMPOUND_LEADING',
+    'COMPOUND_KIND',
+    'COMPOUND_NAME',
+    'FORWARD_KIND',
+    'FORWARD_NAME',
+    'MAYBE_INLINE_ACTUAL',
+    'TYPEDEF_DECL',
+    'TYPEDEF_FUNC_PARAMS',
+    'VAR_STORAGE',
+    'FUNC_INLINE',
+    'VAR_DECL',
+    'FUNC_PARAMS',
+    'FUNC_DELIM',
+    'FUNC_LEGACY_PARAMS',
+    'VAR_INIT',
+    'VAR_ENDING',
+))
+# Anchored at the start of the pending text, allowing leading whitespace.
+GLOBAL_RE = re.compile(rf'^ \s* {GLOBAL}', re.VERBOSE)
+
+
+def parse_globals(source, anon_name):
+    """Yield the items declared at the top level of the source.
+
+    For each step of "source", the pending text is matched against
+    GLOBAL_RE.  If there is no match yet then more text is needed, so
+    we move on to the next step.  Otherwise the match is handed to
+    _parse_next(); callables produced by that helper are nested body
+    parsers and are run against the same source.
+    """
+    # Keep this bound even if "source" never yields anything.
+    srcinfo = None
+    for srcinfo in source:
+        m = GLOBAL_RE.match(srcinfo.text)
+        if not m:
+            # We need more text.
+            continue
+        for item in _parse_next(m, srcinfo, anon_name):
+            if callable(item):
+                parse_body = item
+                yield from parse_body(source)
+            else:
+                yield item
+
+    # We ran out of lines.
+    if srcinfo is not None:
+        srcinfo.done()
+
+
+def _parse_next(m, srcinfo, anon_name):
+    """Handle a single GLOBAL_RE match against srcinfo.text.
+
+    Yields the items resolved from the match and, where a body follows
+    (a compound type or a function definition), a callable that parses
+    that body from the source.  The branches are checked in this order:
+    empty statement, forward/maybe-inline compound declaration,
+    compound type, typedef, function, variable.
+
+    Raises NotImplementedError if none of the expected groups matched.
+    """
+    (
+     empty,
+     # compound type decl (maybe inline)
+     compound_leading, compound_kind, compound_name,
+     forward_kind, forward_name, maybe_inline_actual,
+     # typedef
+     typedef_decl, typedef_func_params,
+     # vars and funcs
+     storage, func_inline, decl,
+     func_params, func_delim, func_legacy_params,
+     var_init, var_ending,
+     ) = m.groups()
+    remainder = srcinfo.text[m.end():]
+
+    if empty:
+        log_match('global empty', m)
+        srcinfo.advance(remainder)
+
+    elif maybe_inline_actual:
+        log_match('maybe_inline_actual', m)
+        # Ignore forward declarations.
+        # XXX Maybe return them too (with an "isforward" flag)?
+        if not maybe_inline_actual.strip().endswith(';'):
+            remainder = maybe_inline_actual + remainder
+        yield srcinfo.resolve(forward_kind, None, forward_name)
+        if maybe_inline_actual.strip().endswith('='):
+            # We use a dummy prefix for a fake typedef.
+            # XXX Ideally this case would not be caught by MAYBE_INLINE_ACTUAL.
+            _, name, data = parse_var_decl(f'{forward_kind} {forward_name} fake_typedef_{forward_name}')
+            yield srcinfo.resolve('typedef', data, name, parent=None)
+            remainder = f'{name} {remainder}'
+        srcinfo.advance(remainder)
+
+    elif compound_kind:
+        kind = compound_kind
+        name = compound_name or anon_name('inline-')
+        # Immediately emit a forward declaration.
+        yield srcinfo.resolve(kind, name=name, data=None)
+
+        # un-inline the decl.  Note that it might not actually be inline.
+        # We handle the case in the "maybe_inline_actual" branch.
+        srcinfo.nest(
+            remainder,
+            f'{compound_leading or ""} {compound_kind} {name}',
+        )
+        def parse_body(source):
+            _parse_body = DECL_BODY_PARSERS[compound_kind]
+
+            data = []  # members
+            ident = f'{kind} {name}'
+            for item in _parse_body(source, anon_name, ident):
+                if item.kind == 'field':
+                    data.append(item)
+                else:
+                    yield item
+            # XXX Should "parent" really be None for inline type decls?
+            yield srcinfo.resolve(kind, data, name, parent=None)
+
+            srcinfo.resume()
+        yield parse_body
+
+    elif typedef_decl:
+        log_match('typedef', m)
+        kind = 'typedef'
+        _, name, data = parse_var_decl(typedef_decl)
+        if typedef_func_params:
+            return_type = data
+            # This matches the data for func declarations.
+            data = {
+                'storage': None,
+                'inline': None,
+                'params': f'({typedef_func_params})',
+                'returntype': return_type,
+                'isforward': True,
+            }
+        yield srcinfo.resolve(kind, data, name, parent=None)
+        srcinfo.advance(remainder)
+
+    elif func_delim or func_legacy_params:
+        log_match('function', m)
+        kind = 'function'
+        _, name, return_type = parse_var_decl(decl)
+        func_params = func_params or func_legacy_params
+        data = {
+            'storage': storage,
+            'inline': func_inline,
+            'params': f'({func_params})',
+            'returntype': return_type,
+            'isforward': func_delim == ';',
+        }
+
+        yield srcinfo.resolve(kind, data, name, parent=None)
+        srcinfo.advance(remainder)
+
+        # A definition (rather than a prototype) has a body to parse.
+        if func_delim == '{' or func_legacy_params:
+            def parse_body(source):
+                yield from parse_function_body(source, name, anon_name)
+            yield parse_body
+
+    elif var_ending:
+        log_match('global variable', m)
+        kind = 'variable'
+        _, name, vartype = parse_var_decl(decl)
+        data = {
+            'storage': storage,
+            'vartype': vartype,
+        }
+        yield srcinfo.resolve(kind, data, name, parent=None)
+
+        if var_ending == ',':
+            # It was a multi-declaration, so queue up the next one.
+            # (This relies on the key order of the dict returned
+            # by parse_var_decl().)
+            _, qual, typespec, _ = vartype.values()
+            remainder = f'{storage or ""} {qual or ""} {typespec} {remainder}'
+        srcinfo.advance(remainder)
+
+        if var_init:
+            _data = f'{name} = {var_init.strip()}'
+            yield srcinfo.resolve('statement', _data, name=None)
+
+    else:
+        # This should be unreachable.
+        raise NotImplementedError
--- a/Tools/c-analyzer/c_parser/parser/_info.py
+++ b/Tools/c-analyzer/c_parser/parser/_info.py
@ -0,0 +1,168 @@
+from ..info import KIND, ParsedItem, FileInfo
+
+
+class TextInfo:
+    """A chunk of source text along with the line numbers it spans.
+
+    "start" is fixed at creation (1 if not given).  "text", "end", and
+    "line" (the most recently added line) change as lines are added.
+    """
+
+    def __init__(self, text, start=None, end=None):
+        srclines = text.splitlines() or ['']
+
+        # immutable:
+        self.start = start or 1
+
+        # mutable:
+        self.text = text.strip()
+        # Without an explicit end, assume one line number per text line.
+        self.end = end or (self.start + len(srclines) - 1)
+        self.line = srclines[-1]
+
+    def __repr__(self):
+        shown = ', '.join(
+            f'{attr}={getattr(self, attr)!r}'
+            for attr in ('text', 'start', 'end')
+        )
+        return f'{type(self).__name__}({shown})'
+
+    def add_line(self, line, lno=None):
+        """Append "line" to the text and make "lno" the new end.
+
+        "lno" may be a line number, a FileInfo, or None (meaning the
+        line right after the current end).
+        """
+        if lno is None:
+            lno = self.end + 1
+        elif isinstance(lno, FileInfo):
+            if lno.filename != self.filename:
+                raise NotImplementedError((lno, self.filename))
+            lno = lno.lno
+        # XXX Should we fail if lno < self.end?
+        stripped = line.lstrip()
+        self.text = f'{self.text} {stripped}'
+        self.line = stripped
+        self.end = lno
+
+
+class SourceInfo:
+    """The pending (not yet consumed) source text for one file.
+
+    The current text is kept in a TextInfo (or None when there is
+    none).  Text that was set aside by nest() is kept on a stack until
+    resume() restores it.
+    """
+
+    _ready = False
+
+    def __init__(self, filename, _current=None):
+        # immutable:
+        self.filename = filename
+        # mutable:
+        if isinstance(_current, str):
+            _current = TextInfo(_current)
+        self._current = _current
+        self._start = _current.start if _current else -1
+        self._nested = []
+        self._set_ready()
+
+    def __repr__(self):
+        args = (f'{a}={getattr(self, a)!r}'
+                for a in ['filename', '_current'])
+        return f'{type(self).__name__}({", ".join(args)})'
+
+    @property
+    def start(self):
+        """The first line number of the current text."""
+        if self._current is None:
+            return self._start
+        return self._current.start
+
+    @property
+    def end(self):
+        """The last line number of the current text."""
+        if self._current is None:
+            return self._start
+        return self._current.end
+
+    @property
+    def text(self):
+        """The current text ('' if there is none)."""
+        if self._current is None:
+            return ''
+        return self._current.text
+
+    def nest(self, text, before, start=None):
+        """Set the current text aside (as "before") and switch to "text".
+
+        Raises Exception if there is no current text.
+        """
+        if self._current is None:
+            raise Exception('nesting requires active source text')
+        current = self._current
+        current.text = before
+        self._nested.append(current)
+        self._replace(text, start)
+
+    def resume(self, remainder=None):
+        """Restore the most recently nested text, followed by "remainder".
+
+        If "remainder" is not given then the current text is used.
+        Raises Exception if nothing is nested or there is no current
+        text.
+        """
+        if not self._nested:
+            raise Exception('no nested text to resume')
+        if self._current is None:
+            raise Exception('un-nesting requires active source text')
+        if remainder is None:
+            remainder = self._current.text
+        self._clear()
+        self._current = self._nested.pop()
+        self._current.text += ' ' + remainder
+        self._set_ready()
+
+    def advance(self, remainder, start=None):
+        """Replace the current text with what is left to parse.
+
+        A blank "remainder" clears the current text, unless text is
+        nested, in which case the current text becomes empty instead.
+        Raises Exception if there is no current text.
+        """
+        if self._current is None:
+            raise Exception('advancing requires active source text')
+        if remainder.strip():
+            self._replace(remainder, start, fixnested=True)
+        else:
+            if self._nested:
+                # XXX Should advancing while nesting be an error?
+                self._replace('', start, fixnested=True)
+            else:
+                self._clear(start)
+
+    def resolve(self, kind, data, name, parent=None):
+        """Return a ParsedItem located at the current start line."""
+        # "field" isn't a top-level kind, so we leave it as-is.
+        if kind and kind != 'field':
+            kind = KIND._from_raw(kind)
+        fileinfo = FileInfo(self.filename, self._start)
+        return ParsedItem(fileinfo, kind, parent, name, data)
+
+    def done(self):
+        """Recompute the "ready" flag from the current text."""
+        self._set_ready()
+
+    def _set_ready(self):
+        # Ready means there is non-blank text left to look at.
+        if self._current is None:
+            self._ready = False
+        else:
+            self._ready = self._current.text.strip() != ''
+
+    def _used(self):
+        # Report the "ready" flag and reset it.
+        ready = self._ready
+        self._ready = False
+        return ready
+
+    def _clear(self, start=None):
+        # Drop the current text (returning it).  The new start defaults
+        # to the end of the text that was dropped.
+        old = self._current
+        if self._current is not None:
+            # XXX Fail if self._current wasn't used up?
+            if start is None:
+                start = self._current.end
+            self._current = None
+        if start is not None:
+            self._start = start
+        self._set_ready()
+        return old
+
+    def _replace(self, text, start=None, *, fixnested=False):
+        # Swap in new text, keeping the old end line.  With "fixnested",
+        # the top of the nesting stack is updated if it was the old text.
+        end = self._current.end
+        old = self._clear(start)
+        self._current = TextInfo(text, self._start, end)
+        if fixnested and self._nested and self._nested[-1] is old:
+            self._nested[-1] = self._current
+        self._set_ready()
+
+    def _add_line(self, line, lno=None):
+        # Add a line to the current text (starting new text if needed).
+        if not line.strip():
+            # We don't worry about multi-line string literals.
+            return
+        if self._current is None:
+            self._start = lno
+            self._current = TextInfo(line, lno)
+        else:
+            # XXX Should we fail if lno < self._current.end
+            # (e.g. for a circular include)?
+            self._current.add_line(line, lno)
+        self._ready = True
--- a/Tools/c-analyzer/c_parser/parser/_regexes.py
+++ b/Tools/c-analyzer/c_parser/parser/_regexes.py
@ -0,0 +1,796 @@
+# Regular expression patterns for C syntax.
+#
+# None of these patterns has any capturing.  However, a number of them
+# have capturing markers compatible with utils.set_capture_groups().
+
+import textwrap
+
+
+def _ind(text, level=1, edges='both'):
+    """Indent "text" by "level" steps of four spaces and adjust its edges.
+
+    "pre" (or "both") makes the result start with a newline followed by
+    the indented first line.  "post" (or "both") strips trailing
+    whitespace and ends the result with a newline plus one less level
+    of indentation.  Any other value leaves the edges alone.
+    """
+    step = '    '
+    prefix = step * level
+    result = textwrap.indent(text, prefix)
+    if edges in ('pre', 'both'):
+        result = f'\n{prefix}{result.lstrip()}'
+    if edges in ('post', 'both'):
+        result = result.rstrip() + '\n' + step * (level - 1)
+    return result
+
+
+#######################################
+# general
+
+# A single hexadecimal digit.  Like the other patterns here it is
+# written with insignificant whitespace (presumably for re.VERBOSE).
+HEX = r'(?: [0-9a-fA-F] )'
+
+# Matches either a character literal (plain, backslash-escaped, \xHH,
+# or one of the digit-escape forms below) or a double-quoted string
+# literal that may contain backslash escapes.
+STRING_LITERAL = textwrap.dedent(rf'''
+    (?:
+        # character literal
+        (?:
+            ['] [^'] [']
+            |
+            ['] \\ . [']
+            |
+            ['] \\x{HEX}{HEX} [']
+            |
+            ['] \\0\d\d [']
+            |
+            (?:
+                ['] \\o[01]\d\d [']
+                |
+                ['] \\o2[0-4]\d [']
+                |
+                ['] \\o25[0-5] [']
+             )
+         )
+        |
+        # string literal
+        (?:
+            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
+         )
+        # end string literal
+     )
+    ''')
+
+# The reserved words, as a word-bounded alternation.  This readable
+# (multi-line) form is what gets embedded in KEYWORD below.
+_KEYWORD = textwrap.dedent(r'''
+    (?:
+        \b
+        (?:
+            auto |
+            extern |
+            register |
+            static |
+            typedef |
+
+            const |
+            volatile |
+
+            signed |
+            unsigned |
+            char |
+            short |
+            int |
+            long |
+            float |
+            double |
+            void |
+
+            struct |
+            union |
+            enum |
+
+            goto |
+            return |
+            sizeof |
+            break |
+            continue |
+            if |
+            else |
+            for |
+            do |
+            while |
+            switch |
+            case |
+            default |
+            entry
+         )
+        \b
+     )
+    ''')
+KEYWORD = rf'''
+    # keyword
+    {_KEYWORD}
+    # end keyword
+    '''
+# From here on _KEYWORD is the same pattern with all whitespace removed,
+# which is the form embedded in the one-line patterns below.
+_KEYWORD = ''.join(_KEYWORD.split())
+
+# A bare identifier; keywords are not excluded.
+IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
+# We use a negative lookahead to filter out keywords.
+STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
+# Like STRICT_IDENTIFIER, but also allows a trailing "-<digits>" suffix.
+ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
+
+
+#######################################
+# types
+
+# A builtin type: "void", a bare "signed"/"unsigned", or an optional
+# sign and optional "long"/"short" followed by a base type name.
+SIMPLE_TYPE = textwrap.dedent(rf'''
+    # simple type
+    (?:
+        \b
+        (?:
+            void
+            |
+            (?: signed | unsigned )  # implies int
+            |
+            (?:
+                (?: (?: signed | unsigned ) \s+ )?
+                (?: (?: long | short ) \s+ )?
+                (?: char | short | int | long | float | double )
+             )
+         )
+        \b
+     )
+    # end simple type
+    ''')
+
+# The keyword introducing a compound type.
+COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
+
+
+#######################################
+# variable declarations
+
+# A storage-class keyword.
+STORAGE_CLASS = r'(?: \b (?: auto | register | static | extern ) \b )'
+# A type qualifier keyword.
+TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
+# One "*" optionally followed by a type qualifier.
+PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'
+
+# A type specifier: a simple type, a typeof(...) expression on a single
+# identifier, a reference to a compound type, or a typedef name.
+TYPE_SPEC = textwrap.dedent(rf'''
+    # type spec
+    (?:
+        {_ind(SIMPLE_TYPE, 2)}
+        |
+        (?:
+            [_]*typeof[_]*
+            \s* [(]
+            (?: \s* [*&] )*
+            \s* {STRICT_IDENTIFIER}
+            \s* [)]
+         )
+        |
+        # reference to a compound type
+        (?:
+            {COMPOUND_TYPE_KIND}
+            (?: \s* {ANON_IDENTIFIER} )?
+         )
+        |
+        # reference to a typedef
+        {STRICT_IDENTIFIER}
+     )
+    # end type spec
+    ''')
+
+# A declarator: optional pointer qualifiers followed by a plain name,
+# a parenthesized name, or a function pointer, each with optional array
+# suffixes.  The "# <NAME>" comments are capture-group markers (see the
+# note at the top of this module).
+DECLARATOR = textwrap.dedent(rf'''
+    # declarator  (possibly abstract)
+    (?:
+        (?: {PTR_QUALIFIER} \s* )*
+        (?:
+            (?:
+                (?:  # <IDENTIFIER>
+                    {STRICT_IDENTIFIER}
+                )
+                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
+             )
+            |
+            (?:
+                [(] \s*
+                (?:  # <WRAPPED_IDENTIFIER>
+                    {STRICT_IDENTIFIER}
+                )
+                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
+                \s* [)]
+             )
+            |
+            # func ptr
+            (?:
+                [(] (?: \s* {PTR_QUALIFIER} )? \s*
+                (?:  # <FUNC_IDENTIFIER>
+                    {STRICT_IDENTIFIER}
+                )
+                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
+                \s* [)]
+                # We allow for a single level of paren nesting in parameters.
+                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
+             )
+         )
+     )
+    # end declarator
+    ''')
+
+# A declaration head: optional storage class, optional type qualifier,
+# a type spec, and a declarator.  No initializer or terminator is
+# included here.
+VAR_DECL = textwrap.dedent(rf'''
+    # var decl (and typedef and func return type)
+    (?:
+        (?:
+            (?:  # <STORAGE>
+                {STORAGE_CLASS}
+            )
+            \s*
+        )?
+        (?:
+            (?:  # <TYPE_QUAL>
+                {TYPE_QUALIFIER}
+            )
+            \s*
+         )?
+        (?:
+            (?:  # <TYPE_SPEC>
+                {_ind(TYPE_SPEC, 4)}
+            )
+         )
+        \s*
+        (?:
+            (?:  # <DECLARATOR>
+                {_ind(DECLARATOR, 4)}
+            )
+         )
+     )
+    # end var decl
+    ''')
+
+# The right-hand side of an initialization: an optional leading
+# parenthesized group, then one or more adjacent string literals,
+# a simple expression, or a brace-delimited literal that must be
+# followed by ";".  Doubled braces are f-string escapes for literal
+# braces.
+INITIALIZER = textwrap.dedent(rf'''
+    # initializer
+    (?:
+        (?:
+            [(]
+            # no nested parens (e.g. func ptr)
+            [^)]*
+            [)]
+            \s*
+         )?
+        (?:
+            # a string literal
+            (?:
+                (?: {_ind(STRING_LITERAL, 4)} \s* )*
+                {_ind(STRING_LITERAL, 4)}
+             )
+            |
+
+            # a simple initializer
+            (?:
+                (?:
+                    [^'",;{{]*
+                    {_ind(STRING_LITERAL, 4)}
+                 )*
+                [^'",;{{]*
+             )
+            |
+
+            # a struct/array literal
+            (?:
+                # We only expect compound initializers with
+                # single-variable declarations.
+                {{
+                (?:
+                    [^'";]*?
+                    {_ind(STRING_LITERAL, 5)}
+                 )*
+                [^'";]*?
+                }}
+                (?= \s* ; )  # Note this lookahead.
+             )
+         )
+     )
+    # end initializer
+    ''')
+
+
+#######################################
+# compound type declarations
+
+# One step inside a struct/union body: the opening of an inline
+# compound type, a member (typed and/or sized) ending in "," or ";",
+# or the closing brace.
+STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
+    (?:
+        # inline compound type decl
+        (?:
+            (?:  # <COMPOUND_TYPE_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            (?:
+                \s+
+                (?:  # <COMPOUND_TYPE_NAME>
+                    {STRICT_IDENTIFIER}
+                 )
+             )?
+            \s* {{
+         )
+        |
+        (?:
+            # typed member
+            (?:
+                # Technically it doesn't have to have a type...
+                (?:  # <SPECIFIER_QUALIFIER>
+                    (?: {TYPE_QUALIFIER} \s* )?
+                    {_ind(TYPE_SPEC, 5)}
+                 )
+                (?:
+                    # If it doesn't have a declarator then it will have
+                    # a size and vice versa.
+                    \s*
+                    (?:  # <DECLARATOR>
+                        {_ind(DECLARATOR, 6)}
+                     )
+                 )?
+            )
+
+            # sized member
+            (?:
+                \s* [:] \s*
+                (?:  # <SIZE>
+                    \d+
+                 )
+             )?
+            \s*
+            (?:  # <ENDING>
+                [,;]
+             )
+         )
+        |
+        (?:
+            \s*
+            (?:  # <CLOSE>
+                }}
+             )
+         )
+     )
+    ''')
+
+# One step inside an enum body: either the closing brace, or a member
+# name with an optional "= value" ending in "," or the closing brace.
+ENUM_MEMBER_DECL = textwrap.dedent(rf'''
+    (?:
+        (?:
+            \s*
+            (?:  # <CLOSE>
+                }}
+             )
+         )
+        |
+        (?:
+            \s*
+            (?:  # <NAME>
+                {IDENTIFIER}
+             )
+            (?:
+                \s* = \s*
+                (?:  # <INIT>
+                    {_ind(STRING_LITERAL, 4)}
+                    |
+                    [^'",}}]+
+                 )
+             )?
+            \s*
+            (?:  # <ENDING>
+                , | }}
+             )
+         )
+     )
+    ''')
+
+
+#######################################
+# statements
+
+# The text of a statement up to (not including) its terminator: any run
+# of characters other than quotes, braces and ";", with string/character
+# literals allowed in between.
+SIMPLE_STMT_BODY = textwrap.dedent(rf'''
+    # simple statement body
+    (?:
+        (?:
+            [^'"{{}};]*
+            {_ind(STRING_LITERAL, 3)}
+         )*
+        [^'"{{}};]*
+        #(?= [;{{] )  # Note this lookahead.
+     )
+    # end simple statement body
+    ''')
+# A statement terminated by ";".  The alternatives are tried in order:
+# a "return" with an initializer-like value, an assignment to a
+# (possibly dereferenced, member-accessed, or indexed) name, any other
+# "return", and finally any simple statement body.
+# Member access is matched with a literal "." or "->"; the dot is
+# written as a character class so it does not match arbitrary
+# characters.
+SIMPLE_STMT = textwrap.dedent(rf'''
+    # simple statement
+    (?:
+        (?:  # <SIMPLE_STMT>
+            # stmt-inline "initializer"
+            (?:
+                return \b
+                (?:
+                    \s*
+                    {_ind(INITIALIZER, 5)}
+                )?
+             )
+            |
+            # variable assignment
+            (?:
+                (?: [*] \s* )?
+                (?:
+                    {STRICT_IDENTIFIER} \s*
+                    (?: [.] | -> ) \s*
+                 )*
+                {STRICT_IDENTIFIER}
+                (?: \s* \[ \s* \d+ \s* \] )?
+                \s* = \s*
+                {_ind(INITIALIZER, 4)}
+             )
+            |
+            # catchall return statement
+            (?:
+                return \b
+                (?:
+                    (?:
+                        [^'";]*
+                        {_ind(STRING_LITERAL, 6)}
+                     )*
+                    \s* [^'";]*
+                 )?
+             )
+            |
+            # simple statement
+            (?:
+                {_ind(SIMPLE_STMT_BODY, 4)}
+             )
+         )
+        \s*
+        (?:  # <SIMPLE_ENDING>
+            ;
+         )
+     )
+    # end simple statement
+    ''')
+# The head of a compound statement: a bare "else"/"do", a label
+# ("case ...", "default" or an identifier) followed by ":", or one of
+# "for"/"while"/"if"/"switch" when an opening paren follows.
+COMPOUND_STMT = textwrap.dedent(rf'''
+    # compound statement
+    (?:
+        \b
+        (?:
+            (?:
+                (?:  # <COMPOUND_BARE>
+                    else | do
+                 )
+                \b
+             )
+            |
+            (?:
+                (?:  # <COMPOUND_LABELED>
+                    (?:
+                        case \b
+                        (?:
+                            [^'":]*
+                            {_ind(STRING_LITERAL, 7)}
+                         )*
+                        \s* [^'":]*
+                     )
+                    |
+                    default
+                    |
+                    {STRICT_IDENTIFIER}
+                 )
+                \s* [:]
+             )
+            |
+            (?:
+                (?:  # <COMPOUND_PAREN>
+                    for | while | if | switch
+                 )
+                \s* (?= [(] )  # Note this lookahead.
+             )
+         )
+        \s*
+     )
+    # end compound statement
+    ''')
+
+
+#######################################
+# function bodies
+
+# One item inside a function body.  The alternatives, in order: an empty
+# statement, the opening of an inline compound type, a variable
+# declaration, a compound statement head, the start of a block, a simple
+# statement, or the end of a block.
+LOCAL = textwrap.dedent(rf'''
+    (?:
+        # an empty statement
+        (?:  # <EMPTY>
+            ;
+         )
+        |
+        # inline type decl
+        (?:
+            (?:
+                (?:  # <INLINE_LEADING>
+                    [^;{{}}]+?
+                 )
+                \s*
+             )?
+            (?:  # <INLINE_PRE>
+                (?: {STORAGE_CLASS} \s* )?
+                (?: {TYPE_QUALIFIER} \s* )?
+             )?  # </INLINE_PRE>
+            (?:  # <INLINE_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            (?:
+                \s+
+                (?:  # <INLINE_NAME>
+                    {STRICT_IDENTIFIER}
+                 )
+             )?
+            \s* {{
+         )
+        |
+        # var decl
+        (?:
+            (?:  # <STORAGE>
+                {STORAGE_CLASS}
+             )?  # </STORAGE>
+            (?:
+                \s*
+                (?:  # <VAR_DECL>
+                    {_ind(VAR_DECL, 5)}
+                 )
+             )
+            (?:
+                (?:
+                    # initializer
+                    # We expect only basic initializers.
+                    \s* = \s*
+                    (?:  # <VAR_INIT>
+                        {_ind(INITIALIZER, 6)}
+                     )
+                 )?
+                (?:
+                    \s*
+                    (?:  # <VAR_ENDING>
+                        [,;]
+                     )
+                 )
+             )
+         )
+        |
+        {_ind(COMPOUND_STMT, 2)}
+        |
+        # start-of-block
+        (?:
+            (?:  # <BLOCK_LEADING>
+                (?:
+                    [^'"{{}};]*
+                    {_ind(STRING_LITERAL, 5)}
+                 )*
+                [^'"{{}};]*
+                # Presumably we will not see "== {{".
+                [^\s='"{{}});]
+                \s*
+             )?  # </BLOCK_LEADING>
+            (?:  # <BLOCK_OPEN>
+                {{
+             )
+         )
+        |
+        {_ind(SIMPLE_STMT, 2)}
+        |
+        # end-of-block
+        (?:  # <BLOCK_CLOSE>
+            }}
+         )
+     )
+    ''')
+
+# A coarser pattern for function bodies that only singles out inline
+# compound type openings and "static" variable declarations.  Anything
+# else is consumed up to the next "{", "}" or ";".
+LOCAL_STATICS = textwrap.dedent(rf'''
+    (?:
+        # inline type decl
+        (?:
+            (?:
+                (?:  # <INLINE_LEADING>
+                    [^;{{}}]+?
+                 )
+                \s*
+             )?
+            (?:  # <INLINE_PRE>
+                (?: {STORAGE_CLASS} \s* )?
+                (?: {TYPE_QUALIFIER} \s* )?
+             )?
+            (?:  # <INLINE_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            (?:
+                \s+
+                (?:  # <INLINE_NAME>
+                    {STRICT_IDENTIFIER}
+                 )
+             )?
+            \s* {{
+         )
+        |
+        # var decl
+        (?:
+            # We only look for static variables.
+            (?:  # <STATIC_DECL>
+                static \b
+                (?: \s* {TYPE_QUALIFIER} )?
+                \s* {_ind(TYPE_SPEC, 4)}
+                \s* {_ind(DECLARATOR, 4)}
+             )
+            \s*
+            (?:
+                (?:  # <STATIC_INIT>
+                    = \s*
+                    {_ind(INITIALIZER, 4)}
+                    \s*
+                    [,;{{]
+                 )
+                |
+                (?:  # <STATIC_ENDING>
+                    [,;]
+                 )
+             )
+         )
+        |
+        # everything else
+        (?:
+            (?:  # <DELIM_LEADING>
+                (?:
+                    [^'"{{}};]*
+                    {_ind(STRING_LITERAL, 4)}
+                 )*
+                \s* [^'"{{}};]*
+             )
+            (?:
+                (?:  # <BLOCK_OPEN>
+                    {{
+                 )
+                |
+                (?:  # <BLOCK_CLOSE>
+                    }}
+                 )
+                |
+                (?:  # <STMT_END>
+                    ;
+                 )
+             )
+         )
+     )
+    ''')
+
+
+#######################################
+# global declarations
+
+# One top-level item.  The alternatives, in order: an empty statement,
+# the opening of a compound type declaration, a reference to a named
+# compound type (the "bogus inline decl artifact"), a typedef, or a
+# function/variable declaration (including old-style function
+# definitions with trailing parameter declarations).
+GLOBAL = textwrap.dedent(rf'''
+    (?:
+        # an empty statement
+        (?:  # <EMPTY>
+            ;
+         )
+        |
+
+        # compound type decl (maybe inline)
+        (?:
+            (?:
+                (?:  # <COMPOUND_LEADING>
+                    [^;{{}}]+?
+                 )
+                 \s*
+             )?
+            (?:  # <COMPOUND_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            (?:
+                \s+
+                (?:  # <COMPOUND_NAME>
+                    {STRICT_IDENTIFIER}
+                 )
+             )?
+            \s* {{
+         )
+        |
+        # bogus inline decl artifact
+        # This simplifies resolving the relative syntactic ambiguity of
+        # inline structs.
+        (?:
+            (?:  # <FORWARD_KIND>
+                {COMPOUND_TYPE_KIND}
+             )
+            \s*
+            (?:  # <FORWARD_NAME>
+                {ANON_IDENTIFIER}
+             )
+            (?:  # <MAYBE_INLINE_ACTUAL>
+                [^=,;({{[*\]]*
+                [=,;({{]
+             )
+         )
+        |
+
+        # typedef
+        (?:
+            \b typedef \b \s*
+            (?:  # <TYPEDEF_DECL>
+                {_ind(VAR_DECL, 4)}
+             )
+            (?:
+                # We expect no inline type definitions in the parameters.
+                \s* [(] \s*
+                (?:  # <TYPEDEF_FUNC_PARAMS>
+                    [^{{;]*
+                 )
+                \s* [)]
+             )?
+            \s* ;
+         )
+        |
+
+        # func decl/definition & var decls
+        # XXX dedicated pattern for funcs (more restricted)?
+        (?:
+            (?:
+                (?:  # <VAR_STORAGE>
+                    {STORAGE_CLASS}
+                 )
+                \s*
+             )?
+            (?:
+                (?:  # <FUNC_INLINE>
+                    \b inline \b
+                 )
+                \s*
+             )?
+            (?:  # <VAR_DECL>
+                {_ind(VAR_DECL, 4)}
+             )
+            (?:
+                # func decl / definition
+                (?:
+                    (?:
+                        # We expect no inline type definitions in the parameters.
+                        \s* [(] \s*
+                        (?:  # <FUNC_PARAMS>
+                            [^{{;]*
+                         )
+                        \s* [)] \s*
+                        (?:  # <FUNC_DELIM>
+                            [{{;]
+                         )
+                     )
+                    |
+                    (?:
+                        # This is some old-school syntax!
+                        \s* [(] \s*
+                        # We throw away the bare names:
+                        {STRICT_IDENTIFIER}
+                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
+                        \s* [)] \s*
+
+                        # We keep the trailing param declarations:
+                        (?:  # <FUNC_LEGACY_PARAMS>
+                            # There's at least one!
+                            (?: {TYPE_QUALIFIER} \s* )?
+                            {_ind(TYPE_SPEC, 7)}
+                            \s*
+                            {_ind(DECLARATOR, 7)}
+                            \s* ;
+                            (?:
+                                \s*
+                                (?: {TYPE_QUALIFIER} \s* )?
+                                {_ind(TYPE_SPEC, 8)}
+                                \s*
+                                {_ind(DECLARATOR, 8)}
+                                \s* ;
+                             )*
+                         )
+                        \s* {{
+                     )
+                 )
+                |
+                # var / typedef
+                (?:
+                    (?:
+                        # initializer
+                        # We expect only basic initializers.
+                        \s* = \s*
+                        (?:  # <VAR_INIT>
+                            {_ind(INITIALIZER, 6)}
+                         )
+                     )?
+                    \s*
+                    (?:  # <VAR_ENDING>
+                        [,;]
+                     )
+                 )
+             )
+         )
+     )
+    ''')
--- a/Tools/c-analyzer/c_parser/preprocessor/init.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/init.py
@ -0,0 +1,190 @@
+import contextlib
+import distutils.ccompiler
+import logging
+import os.path
+
+from c_common.fsutil import match_glob as _match_glob
+from c_common.tables import parse_table as _parse_table
+from ..source import (
+    resolve as _resolve_source,
+    good_file as _good_file,
+)
+from . import errors as _errors
+from . import (
+    pure as _pure,
+    gcc as _gcc,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+# Supported "source":
+#  * filename (string)
+#  * lines (iterable)
+#  * text (string)
+# Supported return values:
+#  * iterator of SourceLine
+#  * sequence of SourceLine
+#  * text (string)
+#  * something that combines all those
+# XXX Add the missing support from above.
+# XXX Add more low-level functions to handle permutations?
+
+def preprocess(source, *,
+               incldirs=None,
+               macros=None,
+               samefiles=None,
+               filename=None,
+               tool=True,
+               ):
+    """Preprocess "source", with an external tool or in pure Python.
+
+    With a truthy "tool" the matching tool-specific preprocess function
+    is looked up and run on the source; a falsy result from it is
+    replaced with an empty tuple.  With a falsy "tool" the source is
+    handed to the pure-Python preprocessor instead.
+
+    CWD should be the project root and "source" should be relative.
+    """
+    if not tool:
+        # We ignore "includes", "macros", etc.
+        source, filename = _resolve_source(source, filename)
+        return _pure.preprocess(source, filename)
+
+    for label, value in (
+        ('CWD', os.getcwd()),
+        ('incldirs', incldirs),
+        ('macros', macros),
+        ('samefiles', samefiles),
+    ):
+        logger.debug(f'{label}: {value!r}')
+    run_tool = _get_preprocessor(tool)
+    with _good_file(source, filename) as source:
+        return run_tool(source, incldirs, macros, samefiles) or ()
+
+    # if _run() returns just the lines:
+#    text = _run(source)
+#    lines = [line + os.linesep for line in text.splitlines()]
+#    lines[-1] = lines[-1].splitlines()[0]
+#
+#    conditions = None
+#    for lno, line in enumerate(lines, 1):
+#        kind = 'source'
+#        directive = None
+#        data = line
+#        yield lno, kind, data, conditions
+
+
+def get_preprocessor(*,
+                     file_macros=None,
+                     file_incldirs=None,
+                     file_same=None,
+                     ignore_exc=False,
+                     log_err=None,
+                     ):
+    """Return a function that builds a preprocess callable for a file.
+
+    "file_macros" and "file_incldirs" are tables (parsed here) mapping
+    filename globs to macros / include dirs; "file_same" is passed on
+    as "samefiles".  "ignore_exc" is either a predicate called with the
+    exception or a plain value used as the predicate's constant result.
+    "log_err" is passed on to handling_errors().
+
+    The returned get_file_preprocessor(filename) returns a function
+    accepting the keyword arguments of preprocess().  Values given
+    explicitly in that call take precedence over the per-file defaults.
+    It returns None when an error was ignored by handling_errors().
+    """
+    # Keep a reference to the module-level function, since the closure
+    # defined below is also named "preprocess".
+    _preprocess = preprocess
+    if file_macros:
+        file_macros = tuple(_parse_macros(file_macros))
+    if file_incldirs:
+        file_incldirs = tuple(_parse_incldirs(file_incldirs))
+    if file_same:
+        file_same = tuple(file_same)
+    if not callable(ignore_exc):
+        ignore_exc = (lambda exc, _ig=ignore_exc: _ig)
+
+    def get_file_preprocessor(filename):
+        filename = filename.strip()
+        # Resolve the per-file defaults once, up front.
+        macros = incldirs = None
+        if file_macros:
+            macros = list(_resolve_file_values(filename, file_macros))
+        if file_incldirs:
+            incldirs = [v for v, in _resolve_file_values(filename, file_incldirs)]
+
+        def preprocess(**kwargs):
+            if file_macros and 'macros' not in kwargs:
+                kwargs['macros'] = macros
+            if file_incldirs and 'incldirs' not in kwargs:
+                # Hand out a copy so each call gets its own list.
+                kwargs['incldirs'] = list(incldirs)
+            # The keyword understood by preprocess() is "samefiles".
+            if file_same and 'samefiles' not in kwargs:
+                kwargs['samefiles'] = file_same
+            kwargs.setdefault('filename', filename)
+            with handling_errors(ignore_exc, log_err=log_err):
+                return _preprocess(filename, **kwargs)
+        return preprocess
+    return get_file_preprocessor
+
+
+def _resolve_file_values(filename, file_values):
+    """Yield the value part of each entry whose glob matches "filename".
+
+    Each entry is (pattern, *value); the yielded value is a list.
+    A missing/empty "file_values" yields nothing.
+    """
+    # We expect the filename and all patterns to be absolute paths.
+    if not file_values:
+        return
+    for entry in file_values:
+        pattern, *value = entry
+        if _match_glob(filename, pattern):
+            yield value
+
+
+def _parse_macros(macros):
+    """Yield each row of the tab-separated "glob, name, value" table."""
+    rows = _parse_table(macros, '\t', 'glob\tname\tvalue', rawsep='=', default=None)
+    yield from (row for row, _srcfile in rows)
+
+
+def _parse_incldirs(incldirs):
+    """Yield (glob, dirname) rows from the tab-separated table.
+
+    A row with only one column is taken to be a dirname that applies
+    to all files.
+    """
+    for row, _srcfile in _parse_table(incldirs, '\t', 'glob\tdirname', default=None):
+        glob, dirname = row
+        if dirname is not None:
+            yield row
+        else:
+            # Match all files.
+            yield ('*', glob.strip())
+
+
+@contextlib.contextmanager
+def handling_errors(ignore_exc=None, *, log_err=None):
+    """Optionally swallow (and log) known preprocessor errors.
+
+    "ignore_exc" is a predicate called with the caught exception; the
+    error is suppressed only if it returns a true value.  Without a
+    predicate nothing is suppressed.  When an error is suppressed and
+    "log_err" is given, it is called with a short description (or, for
+    an "#error" directive failure, with the exception itself).
+
+    Only OSMismatchError, MissingDependenciesError and
+    ErrorDirectiveError are handled; anything else propagates.
+    """
+    def should_ignore(exc):
+        # With no predicate the original exception must propagate,
+        # rather than failing on an attempt to call None.
+        return ignore_exc is not None and ignore_exc(exc)
+
+    try:
+        yield
+    except _errors.OSMismatchError as exc:
+        if not should_ignore(exc):
+            raise  # re-raise
+        if log_err is not None:
+            log_err(f'<OS mismatch (expected {" or ".join(exc.expected)})>')
+    except _errors.MissingDependenciesError as exc:
+        if not should_ignore(exc):
+            raise  # re-raise
+        if log_err is not None:
+            log_err(f'<missing dependency {exc.missing}>')
+    except _errors.ErrorDirectiveError as exc:
+        if not should_ignore(exc):
+            raise  # re-raise
+        if log_err is not None:
+            log_err(exc)
+
+
+##################################
+# tools
+
+# Maps a tool name to its preprocess function.  A value of None means
+# there is no implementation, which _get_preprocessor() reports as
+# unsupported.
+_COMPILERS = {
+    # matching distutils.ccompiler.compiler_class:
+    'unix': _gcc.preprocess,
+    'msvc': None,
+    'cygwin': None,
+    'mingw32': None,
+    'bcpp': None,
+    # aliases/extras:
+    'gcc': _gcc.preprocess,
+    'clang': None,
+}
+
+
+def _get_preprocessor(tool):
+    """Return the preprocess function registered for "tool".
+
+    Passing True selects the default compiler reported by distutils.
+    Raises ValueError for an unknown tool or one without an
+    implementation.
+    """
+    name = distutils.ccompiler.get_default_compiler() if tool is True else tool
+    found = _COMPILERS.get(name)
+    if found is not None:
+        return found
+    raise ValueError(f'unsupported tool {name}')
+
+
+##################################
+# aliases
+
+from .errors import (
+    PreprocessorError,
+    PreprocessorFailure,
+    ErrorDirectiveError,
+    MissingDependenciesError,
+    OSMismatchError,
+)
+from .common import FileInfo, SourceLine
--- a/Tools/c-analyzer/c_parser/preprocessor/main.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/main.py
@ -0,0 +1,196 @@
+import logging
+import sys
+
+from c_common.scriptutil import (
+    CLIArgSpec as Arg,
+    add_verbosity_cli,
+    add_traceback_cli,
+    add_kind_filtering_cli,
+    add_files_cli,
+    add_failure_filtering_cli,
+    add_commands_cli,
+    process_args_by_key,
+    configure_logger,
+    get_prog,
+    main_for_filenames,
+)
+from . import (
+    errors as _errors,
+    get_preprocessor as _get_preprocessor,
+)
+
+
+# Maps short failure names to preprocessor error classes.  It is handed
+# to add_failure_filtering_cli() in add_common_cli().
+FAIL = {
+    'err': _errors.ErrorDirectiveError,
+    'deps': _errors.MissingDependenciesError,
+    'os': _errors.OSMismatchError,
+}
+# Every FAIL name except "os".
+FAIL_DEFAULT = tuple(v for v in FAIL if v != 'os')
+
+
+logger = logging.getLogger(__name__)
+
+
+##################################
+# CLI helpers
+
+def add_common_cli(parser, *, get_preprocessor=_get_preprocessor):
+    """Add the shared preprocessor options to "parser".
+
+    Returns a function that post-processes the parsed args: it removes
+    the raw option values from the namespace and stores the resulting
+    factory as "args.get_file_preprocessor".
+    """
+    for flag in ('--macros', '--incldirs', '--same'):
+        parser.add_argument(flag, action='append')
+    process_fail_arg = add_failure_filtering_cli(parser, FAIL)
+
+    def process_args(args):
+        ns = vars(args)
+
+        process_fail_arg(args)
+        # "ignore_exc" is handed on to get_preprocessor() below.
+        ignore_exc = ns.pop('ignore_exc')
+
+        file_macros = ns.pop('macros')
+        file_incldirs = ns.pop('incldirs')
+        file_same = ns.pop('same')
+        args.get_file_preprocessor = get_preprocessor(
+            file_macros=file_macros,
+            file_incldirs=file_incldirs,
+            file_same=file_same,
+            ignore_exc=ignore_exc,
+            log_err=print,
+        )
+    return process_args
+
+
+def _iter_preprocessed(filename, *,
+                       get_preprocessor,
+                       match_kind=None,
+                       pure=False,
+                       ):
+    """Yield the preprocessed lines of "filename".
+
+    The file's preprocess callable is run with "tool" disabled when
+    "pure" is set.  When "match_kind" is given, only lines whose kind
+    it accepts are yielded.  A falsy preprocess result yields nothing.
+    """
+    preprocess = get_preprocessor(filename)
+    lines = preprocess(tool=not pure) or ()
+    if match_kind is None:
+        yield from lines
+        return
+    for line in lines:
+        if match_kind(line.kind):
+            yield line
+
+
+#######################################
+# the commands
+
+def _cli_preprocess(parser, excluded=None, **prepr_kwargs):
+    """Add the "preprocess" command's options to "parser".
+
+    Returns the arg processors for kinds, common options and files,
+    in that order.
+    """
+    parser.add_argument('--pure', action='store_true')
+    parser.add_argument('--no-pure', dest='pure', action='store_const', const=False)
+    processors = [
+        add_kind_filtering_cli(parser),
+        add_common_cli(parser, **prepr_kwargs),
+    ]
+    parser.add_argument('--raw', action='store_true')
+    processors.append(add_files_cli(parser, excluded=excluded))
+    return processors
+
+
+def cmd_preprocess(filenames, *,
+                   raw=False,
+                   iter_filenames=None,
+                   **kwargs
+                   ):
+    """Preprocess the given files and print the resulting lines.
+
+    With "raw" each line object is printed as-is.  Otherwise each line
+    is printed with its line number and kind, and comments are
+    abbreviated to their first line.  The "get_file_preprocessor"
+    keyword (a factory as returned by get_preprocessor()) is optional;
+    the remaining keyword arguments go to _iter_preprocessed().
+    """
+    # The CLI stores the factory as "get_file_preprocessor", while
+    # _iter_preprocessed() takes it as "get_preprocessor".
+    get_preprocessor = kwargs.pop('get_file_preprocessor', None)
+    if get_preprocessor is None:
+        get_preprocessor = _get_preprocessor()
+    if raw:
+        def show_file(filename, lines):
+            for line in lines:
+                print(line)
+    else:
+        def show_file(filename, lines):
+            for line in lines:
+                text = line.data
+                if line.kind == 'comment':
+                    comment_lines = line.data.splitlines() or ['']
+                    text = '/* ' + comment_lines[0]
+                    # Only mark elided text when there is more than
+                    # the first line.
+                    text += r'\n... */' if len(comment_lines) > 1 else ' */'
+                print(f' {line.lno:>4} {line.kind:10} | {text}')
+
+    filenames = main_for_filenames(filenames, iter_filenames)
+    for filename in filenames:
+        lines = _iter_preprocessed(
+            filename,
+            get_preprocessor=get_preprocessor,
+            **kwargs
+        )
+        show_file(filename, lines)
+
+
+def _cli_data(parser):
+    """Placeholder for the "data" command's options; adds nothing yet."""
+    return None
+
+
+def cmd_data(filenames,
+             **kwargs
+             ):
+    """Not implemented yet; always raises NotImplementedError."""
+    # XXX
+    raise NotImplementedError
+
+
+# Maps each command name to a 3-tuple:
+#   (description, [functions that add the command's CLI], run function)
+COMMANDS = {
+    'preprocess': (
+        'preprocess the given C source & header files',
+        [_cli_preprocess],
+        cmd_preprocess,
+    ),
+    'data': (
+        'check/manage local data (e.g. excludes, macros)',
+        [_cli_data],
+        cmd_data,
+    ),
+}
+
+
+#######################################
+# the script
+
+def parse_args(argv=sys.argv[1:], prog=sys.argv[0], *,
+               subset='preprocess',
+               excluded=None,
+               **prepr_kwargs
+               ):
+    """Parse the command line for this script.
+
+    Returns (cmd, ns, verbosity, traceback_cm), where "ns" is the
+    namespace dict of the parsed args without the "cmd" entry and the
+    last two values come from the processors registered for "cmd".
+    """
+    import argparse
+    parser = argparse.ArgumentParser(
+        prog=prog or get_prog(),
+    )
+
+    cli_specs = {name: entry[1] for name, entry in COMMANDS.items()}
+    processors = add_commands_cli(
+        parser,
+        commands=cli_specs,
+        commonspecs=[
+            add_verbosity_cli,
+            add_traceback_cli,
+        ],
+        subset=subset,
+    )
+
+    args = parser.parse_args(argv)
+    ns = vars(args)
+    cmd = ns.pop('cmd')
+
+    wanted = ['verbosity', 'traceback_cm']
+    verbosity, traceback_cm = process_args_by_key(args, processors[cmd], wanted)
+
+    return cmd, ns, verbosity, traceback_cm
+
+
+def main(cmd, cmd_kwargs):
+    """Run the command named "cmd" with the given keyword arguments.
+
+    Raises ValueError if "cmd" is not a key of COMMANDS.
+    """
+    try:
+        # Each entry is (description, CLI specs, run function), so the
+        # callable is the last item.
+        run_cmd = COMMANDS[cmd][-1]
+    except KeyError:
+        raise ValueError(f'unsupported cmd {cmd!r}')
+    run_cmd(**cmd_kwargs)
+
+
+# Script entry point: parse the command line, set up logging, then run
+# the selected command inside the traceback context manager.
+if __name__ == '__main__':
+    cmd, cmd_kwargs, verbosity, traceback_cm = parse_args()
+    configure_logger(verbosity)
+    with traceback_cm:
+        main(cmd, cmd_kwargs)
--- a/Tools/c-analyzer/c_parser/preprocessor/common.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/common.py
@ -0,0 +1,173 @@
+import contextlib
+import distutils.ccompiler
+import logging
+import shlex
+import subprocess
+import sys
+
+from ..info import FileInfo, SourceLine
+from .errors import (
+    PreprocessorFailure,
+    ErrorDirectiveError,
+    MissingDependenciesError,
+    OSMismatchError,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+# XXX Add aggregate "source" class(es)?
+#  * expose all lines as single text string
+#  * expose all lines as sequence
+#  * iterate all lines
+
+
+def run_cmd(argv, *,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            check=True,
+            **kwargs
+            ):
+    """Run the command with subprocess.run() and return its stdout.
+
+    The keyword-only parameters supply this module's defaults for the
+    matching subprocess.run() options; any other keyword arguments are
+    passed through unchanged.  "stderr" may also be given as the string
+    "stdout" (in any case) to merge stderr into stdout.
+
+    With the default check=True, a non-zero exit status raises
+    subprocess.CalledProcessError.
+    """
+    if isinstance(stderr, str) and stderr.lower() == 'stdout':
+        stderr = subprocess.STDOUT
+
+    # Fold the explicit options in with the pass-through ones.
+    kwargs.update(
+        stdout=stdout,
+        stderr=stderr,
+        text=text,
+        check=check,
+    )
+    return subprocess.run(argv, **kwargs).stdout
+
+
+def preprocess(tool, filename, **kwargs):
+    """Run the preprocessor on "filename" and return its output text.
+
+    "kwargs" are passed through to _build_argv().  If the command
+    fails, converted_error() turns the subprocess error into one of
+    the preprocessor exceptions.
+
+    Raises OSMismatchError if is_os_mismatch() reports that the file
+    is not meant for the host OS.
+    """
+    argv = _build_argv(tool, filename, **kwargs)
+    logger.debug(' '.join(shlex.quote(v) for v in argv))
+
+    # Make sure the OS is supported for this file.
+    if (_expected := is_os_mismatch(filename)):
+        error = None
+        # Report the tool we were given; this module has no TOOL global.
+        raise OSMismatchError(filename, _expected, argv, error, tool)
+
+    # Run the command.
+    with converted_error(tool, argv, filename):
+        # We use subprocess directly here, instead of calling the
+        # distutils compiler object's preprocess() method, since that
+        # one writes to stdout/stderr and it's simpler to do it directly
+        # through subprocess.
+        return run_cmd(argv)
+
+
+def _build_argv(
+    tool,
+    filename,
+    incldirs=None,
+    macros=None,
+    preargs=None,
+    postargs=None,
+    executable=None,
+    compiler=None,
+):
+    """Return the command line a distutils compiler builds for "filename".
+
+    The command is captured rather than executed.
+    NOTE(review): this relies on the compiler's preprocess() handing
+    its command to spawn() -- confirm for compilers other than "unix".
+    """
+    compiler = distutils.ccompiler.new_compiler(
+        compiler=compiler or tool,
+    )
+    if executable:
+        compiler.set_executable('preprocessor', executable)
+
+    # Swap in a fake spawn() that records the command instead of
+    # running it.
+    argv = None
+    def _spawn(_argv):
+        nonlocal argv
+        argv = _argv
+    compiler.spawn = _spawn
+    compiler.preprocess(
+        filename,
+        macros=[tuple(v) for v in macros or ()],
+        include_dirs=incldirs or (),
+        extra_preargs=preargs or (),
+        extra_postargs=postargs or (),
+    )
+    # This is still None if spawn() was never called.
+    return argv
+
+
+@contextlib.contextmanager
+def converted_error(tool, argv, filename):
+    """Convert a subprocess.CalledProcessError raised in the body.
+
+    The failure is handed to convert_error(), which raises the matching
+    preprocessor exception.  If convert_error() returns normally then
+    the original CalledProcessError is suppressed.
+    """
+    try:
+        yield
+    except subprocess.CalledProcessError as exc:
+        convert_error(
+            tool,
+            argv,
+            filename,
+            exc.stderr,
+            exc.returncode,
+        )
+
+
+def convert_error(tool, argv, filename, stderr, rc):
+    """Raise the preprocessor exception that best matches a failure.
+
+    "stderr" is the text the failed command wrote to stderr and "rc"
+    is its exit code.  Depending on that text, this raises
+    OSMismatchError, MissingDependenciesError or ErrorDirectiveError.
+    For any other failure the command is run once more with stderr
+    going to the terminal; PreprocessorFailure is raised if it fails
+    again.  If that second run succeeds then nothing is raised.
+    """
+    # The command may have failed without writing anything to stderr.
+    stderr = stderr or ''
+    lines = stderr.splitlines()
+    error = (lines[0] if lines else '', rc)
+    if (_expected := is_os_mismatch(filename, stderr)):
+        logger.debug(stderr.strip())
+        raise OSMismatchError(filename, _expected, argv, error, tool)
+    elif (_missing := is_missing_dep(stderr)):
+        logger.debug(stderr.strip())
+        raise MissingDependenciesError(filename, (_missing,), argv, error, tool)
+    elif '#error' in stderr:
+        # XXX Ignore incompatible files.
+        # Report the second line, falling back to the first one if
+        # the output has only a single line.
+        error = (lines[1] if len(lines) > 1 else lines[0], rc)
+        logger.debug(stderr.strip())
+        raise ErrorDirectiveError(filename, argv, error, tool)
+    else:
+        # Try one more time, with stderr written to the terminal.
+        try:
+            run_cmd(argv, stderr=None)
+        except subprocess.CalledProcessError:
+            raise PreprocessorFailure(filename, argv, error, tool)
+
+
+def is_os_mismatch(filename, errtext=None):
+    """Return the OSes the file is meant for, if the host is not one.
+
+    Only the preprocessor's error text is used for the check: when it
+    reports a missing dependency, the OSes from get_matching_oses()
+    are returned unless sys.platform is among them.  In every other
+    case the result is False.
+
+    Raises NotImplementedError if sys.platform is "unknown".
+    """
+    # See: https://docs.python.org/3/library/sys.html#sys.platform
+    host = sys.platform
+    if host == 'unknown':
+        raise NotImplementedError
+
+    if errtext is None:
+        return False
+    dep = is_missing_dep(errtext)
+    if not dep:
+        return False
+    candidates = get_matching_oses(dep, filename)
+    if host in candidates:
+        return False
+    return candidates
+
+
+def get_matching_oses(missing, filename):
+    """Return the sys.platform values implied by a missing header.
+
+    "missing" is the name of the header that could not be found and
+    "filename" is the file that was being preprocessed.  The result is
+    a (possibly empty) tuple of platform names.
+    """
+    # OSX
+    looks_like_mac = 'darwin' in filename or 'osx' in filename
+    if looks_like_mac or missing == 'SystemConfiguration/SystemConfiguration.h':
+        return ('darwin',)
+
+    # Windows
+    if missing in ('windows.h', 'winsock2.h'):
+        return ('win32',)
+
+    # other
+    if missing == 'sys/ldr.h':
+        return ('aix',)
+    if missing == 'dl.h':
+        # XXX The existence of Python/dynload_dl.c implies others...
+        # Note that hpux isn't actually supported any more.
+        return ('hpux', '???')
+
+    # unrecognized
+    return ()
+
+
+def is_missing_dep(errtext):
+    """Return the name of the missing file reported in "errtext".
+
+    The name is the last whitespace-separated word in front of the
+    first ": No such file or directory".  False is returned if the
+    text has no such message or if nothing precedes it.
+    """
+    before, found, _ = errtext.partition(': No such file or directory')
+    if not found:
+        return False
+    words = before.split()
+    if not words:
+        # The message is there but no filename comes before it.
+        return False
+    return words[-1]
--- a/Tools/c-analyzer/c_parser/preprocessor/errors.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/errors.py
@ -0,0 +1,110 @@
+import sys
+
+
+# The host platform, captured once at import time.
+OS = sys.platform
+
+
+def _as_tuple(items):
+    """Return "items" as a tuple.
+
+    A str is split on commas and/or whitespace.  Any other truthy
+    value is passed to tuple().  Everything else gives an empty tuple.
+    """
+    if isinstance(items, str):
+        # Commas and whitespace both act as separators.
+        return tuple(items.replace(',', ' ').split())
+    return tuple(items) if items else ()
+
+
+class PreprocessorError(Exception):
+    """Something preprocessor-related went wrong.
+
+    The exception message has the form
+    "[<preprocessor>] (<filename>) <details>", where the first part is
+    left out if no preprocessor was given and the details come from
+    the class's _msg().
+    """
+
+    @classmethod
+    def _msg(cls, filename, reason, **ignored):
+        base = 'failure while preprocessing'
+        return f'{base} ({reason})' if reason else base
+
+    def __init__(self, filename, preprocessor=None, reason=None):
+        if isinstance(reason, str):
+            reason = reason.strip()
+
+        self.filename = filename
+        self.preprocessor = preprocessor or None
+        self.reason = str(reason) if reason else None
+
+        # _msg() gets every instance attribute set so far (including
+        # any set by a subclass) and picks out the ones it wants.
+        details = self._msg(**vars(self))
+        prefix = f'[{preprocessor}] ' if preprocessor else ''
+        super().__init__(f'{prefix}({filename}) {details}')
+
+
+class PreprocessorFailure(PreprocessorError):
+    """The preprocessor command failed.
+
+    Attributes (in addition to those of PreprocessorError):
+      argv      the command, as a tuple (or None)
+      error     the error text (or None)
+      exitcode  the command's exit code (-1 if not provided)
+    """
+
+    @classmethod
+    def _msg(cls, error, **ignored):
+        msg = 'preprocessor command failed'
+        if error:
+            msg = f'{msg} {error}'
+        return msg
+
+    def __init__(self, filename, argv, error=None, preprocessor=None):
+        # "error" may be given as an (error, exitcode) pair.
+        exitcode = -1
+        if isinstance(error, tuple):
+            if len(error) == 2:
+                error, exitcode = error
+            else:
+                error = str(error)
+        if isinstance(error, str):
+            error = error.strip()
+
+        self.argv = _as_tuple(argv) or None
+        self.error = error if error else None
+        self.exitcode = exitcode
+
+        # Only pass a reason along if there is one; str(None) would
+        # otherwise be recorded as the reason "None".
+        reason = str(self.error) if self.error is not None else None
+        super().__init__(filename, preprocessor, reason)
+
+
+class ErrorDirectiveError(PreprocessorFailure):
+    """The file hit a #error directive."""
+
+    @classmethod
+    def _msg(cls, error, **ignored):
+        # Called by PreprocessorError.__init__() with the instance's
+        # attributes as keyword arguments.
+        return f'#error directive hit ({error})'
+
+    def __init__(self, filename, argv, error, *args, **kwargs):
+        # Same as the parent's signature except that "error" is required.
+        super().__init__(filename, argv, error, *args, **kwargs)
+
+
+class MissingDependenciesError(PreprocessorFailure):
+    """The preprocessor did not have access to all the target's dependencies."""
+
+    @classmethod
+    def _msg(cls, missing, **ignored):
+        # Called by PreprocessorError.__init__() with the instance's
+        # attributes as keyword arguments.
+        msg = 'preprocessing failed due to missing dependencies'
+        if missing:
+            msg = f'{msg} ({", ".join(missing)})'
+        return msg
+
+    def __init__(self, filename, missing=None, *args, **kwargs):
+        # This must be set before calling the parent's __init__(),
+        # since that is where _msg() gets called.
+        self.missing = _as_tuple(missing) or None
+
+        # The remaining args are (argv, error, preprocessor).
+        super().__init__(filename, *args, **kwargs)
+
+
+class OSMismatchError(MissingDependenciesError):
+    """The target is not compatible with the host OS."""
+
+    @classmethod
+    def _msg(cls, expected, **ignored):
+        # Called by PreprocessorError.__init__() with the instance's
+        # attributes as keyword arguments.
+        return f'OS is {OS} but expected {expected or "???"}'
+
+    def __init__(self, filename, expected=None, *args, **kwargs):
+        if isinstance(expected, str):
+            expected = expected.strip()
+
+        # These must be set before calling the parent's __init__(),
+        # since that is where _msg() gets called.
+        self.actual = OS
+        self.expected = expected if expected else None
+
+        # No specific missing dependencies are recorded (missing=None).
+        super().__init__(filename, None, *args, **kwargs)
--- a/Tools/c-analyzer/c_parser/preprocessor/gcc.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/gcc.py
@ -0,0 +1,123 @@
+import os.path
+import re
+
+from . import common as _common
+
+
+# The tool name passed along to the common preprocess() helper.
+TOOL = 'gcc'
+
+# A linemarker in the preprocessor output: # <lineno> "<filename>" <flags>
+# https://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html
+LINE_MARKER_RE = re.compile(r'^# (\d+) "([^"]+)"(?: [1234])*$')
+# Any line that starts with "#" followed by a directive name.
+PREPROC_DIRECTIVE_RE = re.compile(r'^\s*#\s*(\w+)\b.*')
+# A "__name__((...))" compiler directive.  The args may hold one level
+# of nested parens.  The closing "))" is optional so that a directive
+# left open on the line can be recognized too.
+COMPILER_DIRECTIVE_RE = re.compile(r'''
+    ^
+    (.*?)  # <before>
+    (__\w+__)  # <directive>
+    \s*
+    [(] [(]
+    (
+        [^()]*
+        (?:
+            [(]
+            [^()]*
+            [)]
+            [^()]*
+         )*
+     )  # <args>
+    ( [)] [)] )?  # <closed>
+''', re.VERBOSE)
+
+# Passed to the compiler as the "postargs" of the preprocess command.
+POST_ARGS = (
+    '-pthread',
+    '-std=c99',
+    #'-g',
+    #'-Og',
+    #'-Wno-unused-result',
+    #'-Wsign-compare',
+    #'-Wall',
+    #'-Wextra',
+    '-E',
+)
+
+
+def preprocess(filename, incldirs=None, macros=None, samefiles=None):
+    """Preprocess "filename" with gcc and return an iterator of its lines.
+
+    The command is built and run by the common preprocess() helper
+    (using POST_ARGS) and its output is then processed by
+    _iter_lines().  "samefiles" is passed on to _iter_lines().
+    """
+    text = _common.preprocess(
+        TOOL,
+        filename,
+        incldirs=incldirs,
+        macros=macros,
+        #preargs=PRE_ARGS,
+        postargs=POST_ARGS,
+        executable=['gcc'],
+        compiler='unix',
+    )
+    return _iter_lines(text, filename, samefiles)
+
+
+def _iter_lines(text, filename, samefiles, *, raw=False):
+    """Yield a SourceLine for each relevant line of preprocessor output.
+
+    Only lines attributed (by the preceding linemarker) to a file
+    accepted by _filter_orig_file() are considered.  "#pragma" lines
+    are dropped and any other directive raises Exception.  Unless
+    "raw" is true, compiler directives are stripped from each line.
+    """
+    lines = iter(text.splitlines())
+
+    # Build the lines and filter out directives.
+    # NOTE(review): "lno" is only ever set by a linemarker, so this
+    # assumes the output starts with one.
+    partial = 0  # depth
+    origfile = None
+    for line in lines:
+        m = LINE_MARKER_RE.match(line)
+        if m:
+            # A linemarker resets the current file and line number.
+            lno, origfile = m.groups()
+            lno = int(lno)
+        elif _filter_orig_file(origfile, filename, samefiles):
+            if (m := PREPROC_DIRECTIVE_RE.match(line)):
+                name, = m.groups()
+                if name != 'pragma':
+                    raise Exception(line)
+            else:
+                if not raw:
+                    line, partial = _strip_directives(line, partial=partial)
+                # Note that the line is reported against "filename",
+                # not "origfile".
+                yield _common.SourceLine(
+                    _common.FileInfo(filename, lno),
+                    'source',
+                    line or '',
+                    None,
+                )
+            lno += 1
+
+
+def _strip_directives(line, partial=0):
+    """Remove compiler directives (e.g. "__attribute__((...))") from line.
+
+    "partial" is the paren depth of a directive left open by an earlier
+    line (0 if there is none).  Returns (line, partial): the stripped
+    text and the paren depth still open at the end of the line.  The
+    text is None if the line ran out (or a brace was reached) before
+    the open directive was closed.
+    """
+    # We assume there are no string literals with parens in directive bodies.
+    while partial > 0:
+        # Step to the very next paren.  Parens are excluded from the
+        # skipped text so that every one of them gets counted; otherwise
+        # the greedy match would jump straight to the last paren.
+        if not (m := re.match(r'[^{}()]*([()])', line)):
+            return None, partial
+        delim, = m.groups()
+        partial += 1 if delim == '(' else -1  # opened/closed
+        line = line[m.end():]
+
+    line = re.sub(r'__extension__', '', line)
+
+    while (m := COMPILER_DIRECTIVE_RE.match(line)):
+        before, _, _, closed = m.groups()
+        if closed:
+            line = f'{before} {line[m.end():]}'
+        else:
+            # The directive is still open after "((", so keep going
+            # from there with a depth of 2.
+            after, partial = _strip_directives(line[m.end():], 2)
+            line = f'{before} {after or ""}'
+            if partial:
+                break
+
+    return line, partial
+
+
+def _filter_orig_file(origfile, current, samefiles):
+    """Return True if "origfile" should be treated as the current file.
+
+    That is the case if it equals "current", is "<stdin>", or is a
+    relative name matching one of "samefiles".  An entry of
+    "samefiles" that ends with the path separator is a directory and
+    gets the basename of "current" appended before it is compared.
+    """
+    if origfile == current or origfile == '<stdin>':
+        return True
+    if os.path.isabs(origfile):
+        return False
+
+    def _expand(candidate):
+        # Complete a directory entry with the current file's basename.
+        if candidate.endswith(os.path.sep):
+            return candidate + os.path.basename(current)
+        return candidate
+
+    return any(origfile == _expand(name) for name in samefiles or ())
--- a/Tools/c-analyzer/c_parser/preprocessor/pure.py
+++ b/Tools/c-analyzer/c_parser/preprocessor/pure.py
@ -0,0 +1,23 @@
+from ..source import (
+    opened as _open_source,
+)
+from . import common as _common
+
+
+def preprocess(lines, filename=None):
+    """Yield a SourceLine for each of the given lines, unmodified.
+
+    "lines" may be an iterable of lines or a str.  A str is first
+    opened/resolved through c_parser.source.opened() and the result is
+    processed instead.  No actual preprocessing is done yet.
+    """
+    if isinstance(lines, str):
+        with _open_source(lines, filename) as (lines, filename):
+            yield from preprocess(lines, filename)
+        return
+
+    # XXX actually preprocess...
+    # Line numbers start at 1.
+    for lno, line in enumerate(lines, 1):
+        kind = 'source'
+        data = line
+        conditions = None
+        yield _common.SourceLine(
+            _common.FileInfo(filename, lno),
+            kind,
+            data,
+            conditions,
+        )
--- a/Tools/c-analyzer/c_parser/source.py
+++ b/Tools/c-analyzer/c_parser/source.py
@ -0,0 +1,64 @@
+import contextlib
+import os.path
+
+
+def resolve(source, filename):
+    """Normalize a (source, filename) pair.
+
+    If "source" looks like a C filename (see _looks_like_filename())
+    then the result is that of _resolve_filename(source, filename).
+    Otherwise a str "source" is split into lines and "filename" is
+    resolved, or replaced with None if it is falsy.
+
+    Raises TypeError for a truthy "filename" that is not a str.
+    """
+    if _looks_like_filename(source):
+        return _resolve_filename(source, filename)
+
+    if isinstance(source, str):
+        source = source.splitlines()
+
+    # At this point "source" is not a str.
+    if not filename:
+        filename = None
+    elif not isinstance(filename, str):
+        raise TypeError(f'filename should be str (or None), got {filename!r}')
+    else:
+        filename, _ = _resolve_filename(filename)
+    return source, filename
+
+
+@contextlib.contextmanager
+def good_file(filename, alt=None):
+    """Yield the resolved filename, reporting a missing file clearly.
+
+    If the body raises an Exception and the file does not exist then
+    FileNotFoundError is raised instead.  Any other exception
+    propagates unchanged.
+
+    Raises ValueError if "filename" does not look like a C filename.
+    """
+    if not _looks_like_filename(filename):
+        raise ValueError(f'expected a filename, got {filename}')
+    filename, _ = _resolve_filename(filename, alt)
+    try:
+        yield filename
+    except Exception:
+        if not os.path.exists(filename):
+            raise FileNotFoundError(f'file not found: {filename}')
+        raise  # re-raise
+
+
+def _looks_like_filename(value):
+    """Return True if "value" is a str ending in ".c" or ".h"."""
+    return isinstance(value, str) and value.endswith(('.c', '.h'))
+
+
+def _resolve_filename(filename, alt=None):
+    """Return the normalized filename, twice, as a 2-tuple.
+
+    A relative filename gets "." joined in front of it; an absolute
+    one is used as-is.  If "alt" is given then it must refer to the
+    same absolute path as "filename", otherwise ValueError is raised.
+    """
+    if not os.path.isabs(filename):
+        filename = os.path.join('.', filename)
+
+    if alt and os.path.abspath(filename) != os.path.abspath(alt):
+        raise ValueError(f'mismatch: {filename} != {alt}')
+    # A matching (or missing) "alt" is replaced by the filename itself.
+    return filename, filename
+
+
+@contextlib.contextmanager
+def opened(source, filename=None):
+    """Yield (lines, filename) for the given source.
+
+    The arguments are first normalized with resolve().  If that leaves
+    a str source (i.e. a filename) then the file is opened for the
+    duration of the block and the file object is yielded.  Otherwise
+    the resolved source is yielded as-is.
+    """
+    source, filename = resolve(source, filename)
+    if isinstance(source, str):
+        with open(source) as srcfile:
+            yield srcfile, filename
+    else:
+        yield source, filename