bpo-36876: Fix the C analyzer tool. (GH-22841)

The original tool wasn't working right and it was simpler to create a new one, partially re-using some of the old code. At this point the tool runs properly on the master. (Try: ./python Tools/c-analyzer/c-analyzer.py analyze.)  It takes ~40 seconds on my machine to analyze the full CPython code base.

Note that we'll need to iron out some OS-specific stuff (e.g. preprocessor). We're okay though since this tool isn't used yet in our workflow. We will also need to verify the analysis results in detail before activating the check in CI, though I'm pretty sure it's close.

https://bugs.python.org/issue36876
This commit is contained in:
Eric Snow 2020-10-22 18:42:51 -06:00 committed by GitHub
parent ec388cfb4e
commit 345cd37abe
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
92 changed files with 8868 additions and 10539 deletions

View file

@ -0,0 +1,46 @@
from .parser import parse as _parse
from .preprocessor import get_preprocessor as _get_preprocessor
def parse_file(filename, *,
               match_kind=None,
               get_file_preprocessor=None,
               ):
    """Yield the parsed items found in one file.

    "match_kind", if given, is called with each item's kind; items for
    which it returns a false value are skipped.  "get_file_preprocessor"
    is called with the filename to get that file's preprocessor; when it
    is omitted the default from get_preprocessor() is used.
    """
    preprocessor_for = get_file_preprocessor
    if preprocessor_for is None:
        preprocessor_for = _get_preprocessor()
    yield from _parse_file(filename, match_kind, preprocessor_for)
def parse_files(filenames, *,
                match_kind=None,
                get_file_preprocessor=None,
                ):
    """Yield the parsed items found in each of the given files, in order.

    The keyword arguments have the same meaning as for parse_file().
    A single preprocessor getter is shared by all the files.
    """
    preprocessor_for = get_file_preprocessor
    if preprocessor_for is None:
        preprocessor_for = _get_preprocessor()
    for path in filenames:
        yield from _parse_file(path, match_kind, preprocessor_for)
def _parse_file(filename, match_kind, get_file_preprocessor):
    """Preprocess one file and yield the items parsed out of it.

    Nothing is yielded if the preprocessor returns None.  Items rejected
    by "match_kind" are skipped.  An item without a filename is not
    supported and raises NotImplementedError.
    """
    # Preprocess the file.
    run_preprocessor = get_file_preprocessor(filename)
    lines = run_preprocessor()
    if lines is None:
        return

    def iter_source_lines():
        # Only "source" lines are handed to the parser.
        for line in lines:
            if line.kind == 'source':
                yield (line.file, line.data)

    # Parse the lines.
    for parsed in _parse(iter_source_lines()):
        wanted = match_kind is None or match_kind(parsed.kind)
        if not wanted:
            continue
        if not parsed.filename:
            raise NotImplementedError(repr(parsed))
        yield parsed
def parse_signature(text):
    """Parse a function signature (not implemented yet)."""
    raise NotImplementedError
# aliases
from .info import resolve_parsed

View file

@ -0,0 +1,261 @@
import logging
import os.path
import sys
from c_common.scriptutil import (
CLIArgSpec as Arg,
add_verbosity_cli,
add_traceback_cli,
add_kind_filtering_cli,
add_files_cli,
add_commands_cli,
process_args_by_key,
configure_logger,
get_prog,
main_for_filenames,
)
from .preprocessor import get_preprocessor
from .preprocessor.__main__ import (
add_common_cli as add_preprocessor_cli,
)
from .info import KIND
from . import parse_file as _iter_parsed
logger = logging.getLogger(__name__)
def _format_vartype(vartype):
if isinstance(vartype, str):
return vartype
data = vartype
try:
vartype = data['vartype']
except KeyError:
storage, typequal, typespec, abstract = vartype.values()
else:
storage = data.get('storage')
if storage:
_, typequal, typespec, abstract = vartype.values()
else:
storage, typequal, typespec, abstract = vartype.values()
vartype = f'{typespec} {abstract}'
if typequal:
vartype = f'{typequal} {vartype}'
if storage:
vartype = f'{storage} {vartype}'
return vartype
def _get_preprocessor(filename, **kwargs):
    """Return get_preprocessor(filename, log_err=print, **kwargs).

    The original body called "get_processor", a name that is neither
    imported nor defined in this module; the imported helper is
    get_preprocessor().
    """
    # NOTE(review): confirm that get_preprocessor() accepts "filename"
    # positionally; elsewhere in the package it is called with no args.
    return get_preprocessor(filename,
                            log_err=print,
                            **kwargs
                            )
#######################################
# the formats
def fmt_raw(filename, item, *, showfwd=None):
    """Yield a single line: the item rendered as a plain tuple.

    "filename" and "showfwd" are accepted for parity with the other
    formatters but are not used.
    """
    as_tuple = tuple(item)
    yield f'{as_tuple!s}'
def fmt_summary(filename, item, *, showfwd=None):
    """Yield the one-line (possibly wrapped) summary for a parsed item.

    A "> <filename>" header line is yielded first when the item has a
    filename that differs from "./<filename>".

    "showfwd" controls forward declarations:
      * None: show everything, with a "*" column marking forward decls
      * False: skip forward declarations
      * True: show only forward declarations

    Raises NotImplementedError for an unrecognized item kind.
    """
    if item.filename and item.filename != os.path.join('.', filename):
        yield f'> {item.filename}'
    # The "fwd" marker column is only included when not filtering.
    if showfwd is None:
        LINE = ' {lno:>5} {kind:10} {funcname:40} {fwd:1} {name:40} {data}'
    else:
        LINE = ' {lno:>5} {kind:10} {funcname:40} {name:40} {data}'
    # Format once with empty values to learn the width of the fixed
    # columns; it is used below to align wrapped member lists.
    # Note that the template is filled from locals(), so the local
    # names here must match the field names in LINE.
    lno = kind = funcname = fwd = name = data = ''
    MIN_LINE = len(LINE.format(**locals()))
    fileinfo, kind, funcname, name, data = item
    lno = fileinfo.lno if fileinfo and fileinfo.lno >= 0 else ''
    funcname = funcname or ' --'
    name = name or ' --'
    isforward = False
    if kind is KIND.FUNCTION:
        storage, inline, params, returntype, isforward = data.values()
        returntype = _format_vartype(returntype)
        data = returntype + params
        if inline:
            data = f'inline {data}'
        if storage:
            data = f'{storage} {data}'
    elif kind is KIND.VARIABLE:
        data = _format_vartype(data)
    elif kind is KIND.STRUCT or kind is KIND.UNION:
        # No data means the item is only a forward declaration.
        if data is None:
            isforward = True
        else:
            # Show the member names, five per line.
            fields = data
            data = f'({len(data)}) {{ '
            indent = ',\n' + ' ' * (MIN_LINE + len(data))
            data += ', '.join(f.name for f in fields[:5])
            fields = fields[5:]
            while fields:
                data = f'{data}{indent}{", ".join(f.name for f in fields[:5])}'
                fields = fields[5:]
            data += ' }'
    elif kind is KIND.ENUM:
        if data is None:
            isforward = True
        else:
            # Enumerators may be plain strings or objects with a "name".
            names = [d if isinstance(d, str) else d.name
                     for d in data]
            data = f'({len(data)}) {{ '
            indent = ',\n' + ' ' * (MIN_LINE + len(data))
            data += ', '.join(names[:5])
            names = names[5:]
            while names:
                data = f'{data}{indent}{", ".join(names[:5])}'
                names = names[5:]
            data += ' }'
    elif kind is KIND.TYPEDEF:
        data = f'typedef {data}'
    elif kind == KIND.STATEMENT:
        pass
    else:
        raise NotImplementedError(item)
    if isforward:
        fwd = '*'
        # showfwd=False: forward declarations are filtered out.
        if not showfwd and showfwd is not None:
            return
    elif showfwd:
        # showfwd=True: everything but forward declarations is filtered out.
        return
    kind = kind.value
    yield LINE.format(**locals())
def fmt_full(filename, item, *, showfwd=None):
    """Render an item in the "full" format (not implemented yet)."""
    raise NotImplementedError
# Maps each supported "--format" value to its formatter.  Each formatter
# is called as fmt(filename, item, showfwd=...) (see cmd_parse()).
FORMATS = {
    'raw': fmt_raw,
    'summary': fmt_summary,
    'full': fmt_full,
}
def add_output_cli(parser):
    """Add the output-related CLI options and return their arg processor.

    The options are "--format" (stored as "fmt") and the pair
    "--showfwd" / "--no-showfwd" (both stored as "showfwd", which is
    None when neither is given).  The returned processor does nothing.
    """
    parser.add_argument(
        '--format',
        dest='fmt',
        default='summary',
        choices=tuple(FORMATS),
    )
    parser.add_argument(
        '--showfwd',
        action='store_true',
        default=None,
    )
    parser.add_argument(
        '--no-showfwd',
        dest='showfwd',
        action='store_false',
        default=None,
    )

    def process_args(args):
        return None
    return process_args
#######################################
# the commands
def _cli_parse(parser, excluded=None, **prepr_kwargs):
    """Add the CLI options for the "parse" command.

    Returns the list of arg processors, in the order the option groups
    were added: output, kind filtering, preprocessor, files.
    """
    processors = []
    processors.append(add_output_cli(parser))
    processors.append(add_kind_filtering_cli(parser))
    processors.append(add_preprocessor_cli(parser, **prepr_kwargs))
    processors.append(add_files_cli(parser, excluded=excluded))
    return processors
def cmd_parse(filenames, *,
              fmt='summary',
              showfwd=None,
              iter_filenames=None,
              **kwargs
              ):
    """Parse the given files and print each item in the requested format.

    "fmt" must be a key of FORMATS (ValueError otherwise).  "showfwd" is
    passed through to the formatter.  Any remaining keyword arguments
    are passed through to parse_file().
    """
    if 'get_file_preprocessor' not in kwargs:
        # The module-local _get_preprocessor() requires a filename, so
        # calling it with no arguments (as was done here) always failed.
        # Build the per-file getter directly instead, the same way the
        # package's parse_file() does, keeping log_err=print.
        kwargs['get_file_preprocessor'] = get_preprocessor(log_err=print)
    try:
        do_fmt = FORMATS[fmt]
    except KeyError:
        raise ValueError(f'unsupported fmt {fmt!r}')
    for filename in main_for_filenames(filenames, iter_filenames):
        for item in _iter_parsed(filename, **kwargs):
            for line in do_fmt(filename, item, showfwd=showfwd):
                print(line)
def _cli_data(parser):
...
return []
def cmd_data(filenames,
             **kwargs
             ):
    """Run the "data" command (not implemented yet)."""
    # XXX
    raise NotImplementedError
# Maps each command name to a 3-tuple:
#   (description, [functions that add the command's CLI], command function)
COMMANDS = {
    'parse': (
        'parse the given C source & header files',
        [_cli_parse],
        cmd_parse,
    ),
    'data': (
        'check/manage local data (e.g. excludes, macros)',
        [_cli_data],
        cmd_data,
    ),
}
#######################################
# the script
def parse_args(argv=sys.argv[1:], prog=sys.argv[0], *, subset='parse'):
    """Parse the command line for this script.

    Returns (cmd, ns, verbosity, traceback_cm) where "cmd" is the
    selected command name and "ns" is the dict of the remaining parsed
    args (suitable as keyword arguments for the command function).
    """
    import argparse
    parser = argparse.ArgumentParser(
        # get_prog is a function; it has to be called to get a name.
        # (Passing the function object itself was a bug.)
        prog=prog or get_prog(),
    )

    processors = add_commands_cli(
        parser,
        commands={k: v[1] for k, v in COMMANDS.items()},
        commonspecs=[
            add_verbosity_cli,
            add_traceback_cli,
        ],
        subset=subset,
    )

    args = parser.parse_args(argv)
    # vars() exposes the namespace's own dict, so popping "cmd" here
    # also removes it from "args".
    ns = vars(args)

    cmd = ns.pop('cmd')

    verbosity, traceback_cm = process_args_by_key(
        args,
        processors[cmd],
        ['verbosity', 'traceback_cm'],
    )

    return cmd, ns, verbosity, traceback_cm
def main(cmd, cmd_kwargs):
    """Run the named command with the given keyword arguments.

    Raises ValueError if "cmd" is not a key of COMMANDS.
    """
    try:
        # Each COMMANDS entry is (description, CLI adders, function).
        # The function is the last element; index 0 is the description
        # string, which is not callable.
        run_cmd = COMMANDS[cmd][-1]
    except KeyError:
        raise ValueError(f'unsupported cmd {cmd!r}')
    run_cmd(**cmd_kwargs)
# Script entry point: parse the args, set up logging, and run the
# selected command inside the traceback context manager.
if __name__ == '__main__':
    cmd, cmd_kwargs, verbosity, traceback_cm = parse_args()
    configure_logger(verbosity)
    with traceback_cm:
        main(cmd, cmd_kwargs)

View file

@ -0,0 +1,244 @@
f'''
struct {ANON_IDENTIFIER};
struct {{ ... }}
struct {IDENTIFIER} {{ ... }}
union {ANON_IDENTIFIER};
union {{ ... }}
union {IDENTIFIER} {{ ... }}
enum {ANON_IDENTIFIER};
enum {{ ... }}
enum {IDENTIFIER} {{ ... }}
typedef {VARTYPE} {IDENTIFIER};
typedef {IDENTIFIER};
typedef {IDENTIFIER};
typedef {IDENTIFIER};
'''
def parse(srclines):
    """Parse the given source lines.

    Only the filename case is handled so far, by rejecting it: passing
    a str raises NotImplementedError.
    """
    given_filename = isinstance(srclines, str)
    if given_filename:
        raise NotImplementedError
# This only handles at most 10 nested levels.
#MATCHED_PARENS = textwrap.dedent(rf'''
# # matched parens
# (?:
# [(] # level 0
# (?:
# [^()]*
# [(] # level 1
# (?:
# [^()]*
# [(] # level 2
# (?:
# [^()]*
# [(] # level 3
# (?:
# [^()]*
# [(] # level 4
# (?:
# [^()]*
# [(] # level 5
# (?:
# [^()]*
# [(] # level 6
# (?:
# [^()]*
# [(] # level 7
# (?:
# [^()]*
# [(] # level 8
# (?:
# [^()]*
# [(] # level 9
# (?:
# [^()]*
# [(] # level 10
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )*
# [^()]*
# [)]
# )
# # end matched parens
# ''')
'''
# for loop
(?:
\s* \b for
\s* [(]
(
[^;]* ;
[^;]* ;
.*?
) # <header>
[)]
\s*
(?:
(?:
(
{_ind(SIMPLE_STMT, 6)}
) # <stmt>
;
)
|
( {{ ) # <open>
)
)
|
(
(?:
(?:
(?:
{_ind(SIMPLE_STMT, 6)}
)?
return \b \s*
{_ind(INITIALIZER, 5)}
)
|
(?:
(?:
{IDENTIFIER} \s*
(?: . | -> ) \s*
)*
{IDENTIFIER}
\s* = \s*
{_ind(INITIALIZER, 5)}
)
|
(?:
{_ind(SIMPLE_STMT, 5)}
)
)
|
# cast compound literal
(?:
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 5)}
)*
[^'"{{}};]*?
[^'"{{}};=]
=
\s* [(] [^)]* [)]
\s* {{ [^;]* }}
)
) # <stmt>
# compound statement
(?:
(
(?:
# "for" statements are handled separately above.
(?: (?: else \s+ )? if | switch | while ) \s*
{_ind(COMPOUND_HEAD, 5)}
)
|
(?: else | do )
# We do not worry about compound statements for labels,
# "case", or "default".
)? # <header>
\s*
( {{ ) # <open>
)
(
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 5)}
)*
[^'"{{}};]*
# Presumably we will not see "== {{".
[^\s='"{{}};]
)? # <header>
(
\b
(?:
# We don't worry about labels with a compound statement.
(?:
switch \s* [(] [^{{]* [)]
)
|
(?:
case \b \s* [^:]+ [:]
)
|
(?:
default \s* [:]
)
|
(?:
do
)
|
(?:
while \s* [(] [^{{]* [)]
)
|
#(?:
# for \s* [(] [^{{]* [)]
# )
#|
(?:
if \s* [(]
(?: [^{{]* [^)] \s* {{ )* [^{{]*
[)]
)
|
(?:
else
(?:
\s*
if \s* [(]
(?: [^{{]* [^)] \s* {{ )* [^{{]*
[)]
)?
)
)
)? # <header>
'''

View file

@ -0,0 +1,150 @@
import os.path
import c_common.tables as _tables
import c_parser.info as _info
# The leading columns shared by every table handled here.
BASE_COLUMNS = [
    'filename',
    'funcname',
    'name',
    'kind',
]
# The final column of each table, keyed by the table "group".
END_COLUMNS = {
    'parsed': 'data',
    'decls': 'declaration',
}
def _get_columns(group, extra=None):
    """Return the column names for the given group of data.

    The result is the base columns, then any "extra" columns, then the
    group's final column.  An unknown group raises KeyError.
    """
    columns = list(BASE_COLUMNS)
    columns.extend(extra or ())
    columns.append(END_COLUMNS[group])
    return columns
#############################
# high-level
def read_parsed(infile):
    """Yield a ParsedItem for each row of the given TSV file."""
    # XXX Support other formats than TSV?
    columns = _get_columns('parsed')
    rows = _tables.read_table(infile, columns, sep='\t', fix='-')
    make_item = _info.ParsedItem.from_row
    for row in rows:
        yield make_item(row, columns)
def write_parsed(items, outfile):
    """Write the given parsed items to a TSV file, one row per item."""
    # XXX Support other formats than TSV?
    columns = _get_columns('parsed')

    def iter_rows():
        for item in items:
            yield item.as_row(columns)

    _tables.write_table(outfile, columns, iter_rows(), sep='\t', fix='-')
def read_decls(infile, fmt=None):
    """Yield each declaration read from the given file.

    If "fmt" is not given then it is derived from the file's name.
    Whatever extra data the reader pairs with each decl is dropped.
    """
    if fmt is None:
        fmt = _get_format(infile)
    handlers = _get_format_handlers('decls', fmt)
    read_all = handlers[0]
    for entry in read_all(infile):
        decl, _extra = entry
        yield decl
def write_decls(decls, outfile, fmt=None, *, backup=False):
    """Write the given declarations to a file.

    If "fmt" is not given then it is derived from the name of "outfile".
    "backup" is passed through to the format's writer.
    """
    if fmt is None:
        # The format comes from the output file.  (The original code
        # referenced "infile", which does not exist in this function.)
        fmt = _get_format(outfile)
    _, write_all = _get_format_handlers('decls', fmt)
    write_all(decls, outfile, backup=backup)
#############################
# formats
def _get_format(file, default='tsv'):
if isinstance(file, str):
filename = file
else:
filename = getattr(file, 'name', '')
_, ext = os.path.splitext(filename)
return ext[1:] if ext else default
def _get_format_handlers(group, fmt):
# XXX Use a registry.
if group != 'decls':
raise NotImplementedError(group)
if fmt == 'tsv':
return (_iter_decls_tsv, _write_decls_tsv)
else:
raise NotImplementedError(fmt)
# tsv
def iter_decls_tsv(infile, extracolumns=None, relroot=None):
    """Yield (Declaration, extra) for each row of the given TSV file."""
    rows = _iter_decls_tsv(infile, extracolumns, relroot)
    make_decl = _info.Declaration.from_row
    for info, extra in rows:
        yield make_decl(info), extra
def write_decls_tsv(decls, outfile, extracolumns=None, *,
                    relroot=None,
                    **kwargs
                    ):
    """Write the given declarations to a TSV file.

    Any extra keyword arguments are handed to _write_decls_tsv() as a
    single dict.
    """
    # XXX Move the row rendering here.
    # The decls are passed through as-is.  (The original code passed
    # "rows", a name that does not exist in this function.)
    _write_decls_tsv(decls, outfile, extracolumns, relroot, kwargs)
def _iter_decls_tsv(infile, extracolumns=None, relroot=None):
    """Yield (declinfo, extra) for each row of the given TSV file.

    "declinfo" holds the base columns plus the final column.  "extra"
    holds the values of the extra columns, or None if there are none.
    If "relroot" is given then "-" values become None and the filename
    (the first value) is joined onto "relroot".
    """
    columns = _get_columns('decls', extracolumns)
    for row in _tables.read_table(infile, columns, sep='\t'):
        if extracolumns:
            # The extra columns sit between the 4 base columns
            # and the final one.
            declinfo, extra = row[:4] + row[-1:], row[4:-1]
        else:
            declinfo, extra = row, None
        if relroot:
            # XXX Use something like tables.fix_row() here.
            fixed = []
            for value in declinfo:
                fixed.append(None if value == '-' else value)
            fixed[0] = os.path.join(relroot, fixed[0])
            declinfo = fixed
        yield declinfo, extra
def _write_decls_tsv(decls, outfile, extracolumns, relroot,kwargs):
    """Render the given declarations and write them out as a TSV table.

    When "extracolumns" is given, each decl may be a plain tuple of
    (decl, *extra values); missing extra values are written as "???".
    "kwargs" is a dict of extra keyword arguments for write_table().

    Fixes relative to the original: the per-row renderer referenced the
    undefined names "row", "extraColumns", and "_render_known_row", and
    it accepted one argument even though it is called with two.
    """
    columns = _get_columns('decls', extracolumns)
    if extracolumns:
        def render_decl(decl, relroot):
            # An exact type check is used on purpose: only a plain tuple
            # is treated as (decl, *extra).
            # NOTE(review): presumably a decl itself may be a tuple
            # subclass - confirm before loosening this to isinstance().
            if type(decl) is tuple:
                decl, *extra = decl
            else:
                extra = []
            # Pad out any missing extra values.
            extra = list(extra)
            extra += ['???'] * (len(extracolumns) - len(extra))
            *row, declaration = _render_known_decl(decl, relroot)
            # The extra columns go between the base columns and the
            # final (declaration) column.
            return [*row, *extra, declaration]
    else:
        render_decl = _render_known_decl
    _tables.write_table(
        outfile,
        header='\t'.join(columns),
        rows=(render_decl(d, relroot) for d in decls),
        sep='\t',
        **kwargs
    )
def _render_known_decl(decl, relroot, *,
                       # These match BASE_COLUMNS + END_COLUMNS[group].
                       _columns = 'filename parent name kind data'.split(),
                       ):
    """Return the row values (a list) for the given declaration.

    Anything that is not a Declaration is expected to wrap one in its
    "decl" attribute.  If "relroot" is given then the filename is made
    relative to it.  Empty values are rendered as "-".
    """
    if not isinstance(decl, _info.Declaration):
        # e.g. Analyzed
        decl = decl.decl
    rowdata = decl.render_rowdata(_columns)
    if relroot:
        relative = os.path.relpath(rowdata['filename'], relroot)
        rowdata['filename'] = relative
    row = []
    for column in _columns:
        row.append(rowdata[column] or '-')
    return row
    # XXX
    #return _tables.fix_row(rowdata[c] for c in columns)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,212 @@
"""A simple non-validating parser for C99.
The functions and regex patterns here are not entirely suitable for
validating C syntax. Please rely on a proper compiler for that.
Instead our goal here is merely matching and extracting information from
valid C code.
Furthermore, the grammar rules for the C syntax (particularly as
described in the K&R book) actually describe a superset, of which the
full C language is a proper subset. Here are some of the extra
conditions that must be applied when parsing C code:
* ...
(see: http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1256.pdf)
We have taken advantage of the elements of the C grammar that are used
only in a few limited contexts, mostly as delimiters. They allow us to
focus the regex patterns confidently. Here are the relevant tokens and
in which grammar rules they are used:
separators:
* ";"
+ (decl) struct/union: at end of each member decl
+ (decl) declaration: at end of each (non-compound) decl
+ (stmt) expr stmt: at end of each stmt
+ (stmt) for: between exprs in "header"
+ (stmt) goto: at end
+ (stmt) continue: at end
+ (stmt) break: at end
+ (stmt) return: at end
* ","
+ (decl) struct/union: between member declators
+ (decl) param-list: between params
+ (decl) enum: between enumerators
+ (decl) initializer (compound): between initializers
+ (expr) postfix: between func call args
+ (expr) expression: between "assignment" exprs
* ":"
+ (decl) struct/union: in member declators
+ (stmt) label: between label and stmt
+ (stmt) case: between expression and stmt
+ (stmt) default: between "default" and stmt
* "="
+ (decl) declaration: between decl and initializer
+ (decl) enumerator: between identifier and "initializer"
+ (expr) assignment: between "var" and expr
wrappers:
* "(...)"
+ (decl) declarator (func ptr): to wrap ptr/name
+ (decl) declarator (func ptr): around params
+ (decl) declarator: around sub-declarator (for readability)
+ (expr) postfix (func call): around args
+ (expr) primary: around sub-expr
+ (stmt) if: around condition
+ (stmt) switch: around source expr
+ (stmt) while: around condition
+ (stmt) do-while: around condition
+ (stmt) for: around "header"
* "{...}"
+ (decl) enum: around enumerators
+ (decl) func: around body
+ (stmt) compound: around stmts
* "[...]"
+ (decl) declarator: for arrays
+ (expr) postfix: array access
other:
* "*"
+ (decl) declarator: for pointer types
+ (expr) unary: for pointer deref
To simplify the regular expressions used here, we've taken some
shortcuts and made certain assumptions about the code we are parsing.
Some of these allow us to skip context-sensitive matching (e.g. braces)
or otherwise still match arbitrary C code unambiguously. However, in
some cases there are certain corner cases where the patterns are
ambiguous relative to arbitrary C code. However, they are still
unambiguous in the specific code we are parsing.
Here are the cases where we've taken shortcuts or made assumptions:
* there is no overlap syntactically between the local context (func
bodies) and the global context (other than variable decls), so we
do not need to worry about ambiguity due to the overlap:
+ the global context has no expressions or statements
+ the local context has no function definitions or type decls
* no "inline" type declarations (struct, union, enum) in function
parameters ~(including function pointers)~
* no "inline" type decls in function return types
* no superfluous parentheses in declarators
* var decls in for loops are always "simple" (e.g. no inline types)
* only inline struct/union/enum decls may be anonymous (without a name)
* no function pointers in function pointer parameters
* for loop "headers" do not have curly braces (e.g. compound init)
* syntactically, variable decls do not overlap with stmts/exprs, except
in the following case:
spam (*eggs) (...)
This could be either a function pointer variable named "eggs"
or a call to a function named "spam", which returns a function
pointer that gets called. The only differentiator is the
syntax used in the "..." part. It will be comma-separated
parameters for the former and comma-separated expressions for
the latter. Thus, if we expect such decls or calls then we must
parse the decl params.
"""
"""
TODO:
* extract CPython-specific code
* drop include injection (or only add when needed)
* track position instead of slicing "text"
* Parser class instead of the _iter_source() mess
* alt impl using a state machine (& tokenizer or split on delimiters)
"""
from ..info import ParsedItem
from ._info import SourceInfo
def parse(srclines):
    """Yield a ParsedItem for each item found in the given source lines.

    Passing a filename (a str) is not supported yet and raises
    NotImplementedError.
    """
    given_filename = isinstance(srclines, str)
    if given_filename:
        raise NotImplementedError

    new_anon_name = anonymous_names()
    for raw in _parse(srclines, new_anon_name):
        yield ParsedItem.from_raw(raw)
# XXX Later: Add a separate function to deal with preprocessor directives
# parsed out of raw source.
def anonymous_names():
    """Return a function that generates unique names.

    Each call to the returned function yields "<prefix><N>", where N
    starts at 1 and increases by one per call, regardless of which
    prefix is used.  Each generator has its own counter.
    """
    issued = 0

    def anon_name(prefix='anon-'):
        nonlocal issued
        issued += 1
        return f'{prefix}{issued}'
    return anon_name
#############################
# internal impl
import logging
_logger = logging.getLogger(__name__)
def _parse(srclines, anon_name):
    """Yield each raw result parsed from the given source lines."""
    from ._global import parse_globals

    #source = _iter_source(srclines, showtext=True)
    # XXX Handle blocks here instead of in parse_globals().
    yield from parse_globals(_iter_source(srclines), anon_name)
def _iter_source(lines, *, maxtext=20_000, maxlines=700, showtext=False):
    """Yield the SourceInfo to parse next, accumulating lines as needed.

    "lines" is an iterable of (fileinfo, text) pairs.  One SourceInfo is
    tracked per file; the current one is yielded again and again for as
    long as its _used() method returns true, and another line is added
    to it when it does not.

    Raises Exception ("unmatched text") if the accumulated text grows
    past "maxtext" characters or "maxlines" lines, or if the lines run
    out while the last SourceInfo is still "_ready".
    """
    # The stack of files currently being processed.
    # NOTE(review): presumably this tracks nested includes - confirm.
    filestack = []
    # Maps each filename on the stack to its SourceInfo.
    allinfo = {}
    # "lines" should be (fileinfo, data), as produced by the preprocessor code.
    for fileinfo, line in lines:
        if fileinfo.filename in filestack:
            # We are back in a file seen earlier, so drop everything
            # that was stacked on top of it.
            while fileinfo.filename != filestack[-1]:
                filename = filestack.pop()
                del allinfo[filename]
            filename = fileinfo.filename
            srcinfo = allinfo[filename]
        else:
            # A file we have not seen yet (or already dropped).
            filename = fileinfo.filename
            srcinfo = SourceInfo(filename)
            filestack.append(filename)
            allinfo[filename] = srcinfo
        _logger.debug(f'-> {line}')
        srcinfo._add_line(line, fileinfo.lno)
        # Bail out (to the error below) if too much text has piled up.
        if len(srcinfo.text) > maxtext:
            break
        if srcinfo.end - srcinfo.start > maxlines:
            break
        while srcinfo._used():
            yield srcinfo
            if showtext:
                _logger.debug(f'=> {srcinfo.text}')
    else:
        # We ran out of lines without hitting either limit.
        if not filestack:
            srcinfo = SourceInfo('???')
        else:
            filename = filestack[-1]
            srcinfo = allinfo[filename]
            while srcinfo._used():
                yield srcinfo
                if showtext:
                    _logger.debug(f'=> {srcinfo.text}')
        # Offer the last source info one more time.
        yield srcinfo
        if showtext:
            _logger.debug(f'=> {srcinfo.text}')
        if not srcinfo._ready:
            return
    # At this point either the file ended prematurely
    # or there's "too much" text.
    filename, lno, text = srcinfo.filename, srcinfo._start, srcinfo.text
    if len(text) > 500:
        text = text[:500] + '...'
    raise Exception(f'unmatched text ({filename} starting at line {lno}):\n{text}')

View file

@ -0,0 +1,6 @@
def _parse(srclines, anon_name):
    """Join the source lines into one text and parse it by delimiters.

    "srclines" is an iterable of pairs; only the second element (the
    line's text) of each pair is used.
    """
    pieces = []
    for _, line in srclines:
        pieces.append(line)
    text = ' '.join(pieces)

    from ._delim import parse
    yield from parse(text, anon_name)

View file

@ -0,0 +1,115 @@
import re
from ._regexes import (
_ind,
STRING_LITERAL,
VAR_DECL as _VAR_DECL,
)
def log_match(group, m):
    """Log (at debug level) the full text matched for the given group."""
    from . import _logger
    matched = m.group(0)
    _logger.debug(f'matched <{group}> ({matched})')
#############################
# regex utils
def set_capture_group(pattern, group, *, strict=True):
    """Return the pattern with the named group made a capturing group.

    The group is located by its "(?: # <NAME>" marker and only the
    first occurrence is changed.  If "strict" is true and the marker is
    not found then ValueError is raised; otherwise the pattern is
    returned unchanged.
    """
    marker = f'(?: # <{group}>'
    if marker not in pattern:
        if strict:
            raise ValueError(f'{marker!r} not found in pattern')
        return pattern
    capturing = f'( # <{group}>'
    return pattern.replace(marker, capturing, 1)
def set_capture_groups(pattern, groups, *, strict=True):
    """Return the pattern with each of the named groups made capturing.

    The groups are processed in the given order.  See
    set_capture_group() for the meaning of "strict".
    """
    updated = pattern
    for name in groups:
        updated = set_capture_group(updated, name, strict=strict)
    return updated
#############################
# syntax-related utils
# Matches everything up to and including the next parenthesis, skipping
# over any string literals on the way.  Group 1 is set for "(" and
# group 2 for ")".
_PAREN_RE = re.compile(rf'''
(?:
(?:
[^'"()]*
{_ind(STRING_LITERAL, 3)}
)*
[^'"()]*
(?:
( [(] )
|
( [)] )
)
)
''', re.VERBOSE)
def match_paren(text, depth=0):
    """Return the position just past the paren that balances the text.

    Each "(" increases the depth and each ")" decreases it.  The result
    is the end position of the ")" that brings the depth back to 0.
    ValueError is raised if the text has no such parenthesis.
    """
    pos = 0
    while True:
        m = _PAREN_RE.match(text, pos)
        if not m:
            raise ValueError(f'could not find matching parens for {text!r}')
        pos = m.end()
        opened, _closed = m.groups()
        if opened:
            depth += 1
            continue
        # Otherwise it was a closing paren.
        depth -= 1
        if depth == 0:
            return pos
# The variable declaration pattern, with the listed groups captured.
# The order here matches the unpacking done in parse_var_decl().
VAR_DECL = set_capture_groups(_VAR_DECL, (
    'STORAGE',
    'TYPE_QUAL',
    'TYPE_SPEC',
    'DECLARATOR',
    'IDENTIFIER',
    'WRAPPED_IDENTIFIER',
    'FUNC_IDENTIFIER',
))
def parse_var_decl(decl):
    """Return (kind, name, vartype) for the given variable declaration.

    "kind" is one of "simple", "wrapped", or "funcptr", depending on
    which identifier group matched.  "vartype" is a dict with the keys
    "storage", "typequal", "typespec", and "abstract" (the declarator
    with the name removed).  NotImplementedError is raised if no name
    was matched.
    """
    m = re.match(VAR_DECL, decl, re.VERBOSE)
    (storage, typequal, typespec, declarator,
     simple_name,
     wrapped_name,
     funcptr_name,
     ) = m.groups()

    candidates = (
        ('simple', simple_name),
        ('wrapped', wrapped_name),
        ('funcptr', funcptr_name),
    )
    for kind, name in candidates:
        if name:
            break
    else:
        raise NotImplementedError

    vartype = {
        'storage': storage,
        'typequal': typequal,
        'typespec': typespec,
        'abstract': declarator.replace(name, ''),
    }
    return (kind, name, vartype)
#############################
# parser state utils
# XXX Drop this or use it!
def iter_results(results):
    """Yield each (result, text) pair that has a truthy result.

    "results" may be an iterable of (result, text) pairs or a callable
    that returns one.  Nothing is yielded for a false value (e.g. None
    or an empty list).

    The original code called "results" a second time in the loop
    header, which broke both supported forms.
    """
    if not results:
        return
    if callable(results):
        # A callable that returns nothing is treated as "no results".
        results = results() or ()
    for result, text in results:
        if result:
            yield result, text

View file

@ -0,0 +1,158 @@
import re
from ._regexes import (
STRUCT_MEMBER_DECL as _STRUCT_MEMBER_DECL,
ENUM_MEMBER_DECL as _ENUM_MEMBER_DECL,
)
from ._common import (
log_match,
parse_var_decl,
set_capture_groups,
)
#############################
# struct / union
# The struct/union member pattern, with the listed groups captured.
# The order here matches the unpacking done in _parse_struct_next().
STRUCT_MEMBER_DECL = set_capture_groups(_STRUCT_MEMBER_DECL, (
    'COMPOUND_TYPE_KIND',
    'COMPOUND_TYPE_NAME',
    'SPECIFIER_QUALIFIER',
    'DECLARATOR',
    'SIZE',
    'ENDING',
    'CLOSE',
))
STRUCT_MEMBER_RE = re.compile(rf'^ \s* {STRUCT_MEMBER_DECL}', re.VERBOSE)
def parse_struct_body(source, anon_name, parent):
    """Yield the items parsed from the body of a struct or union.

    "source" is an iterator of source info objects (each with a "text"
    attribute).  Parsing continues for as long as the previous match
    produced at least one item.  If the source runs out first then the
    last source info seen (if any) is marked as done.
    """
    # The source may already be exhausted, in which case the "for" loop
    # below never binds srcinfo.  (Without this the "is not None" check
    # in the "else" clause raised UnboundLocalError.)
    srcinfo = None
    done = False
    while not done:
        done = True
        for srcinfo in source:
            m = STRUCT_MEMBER_RE.match(srcinfo.text)
            if m:
                break
        else:
            # We ran out of lines.
            if srcinfo is not None:
                srcinfo.done()
            return
        for item in _parse_struct_next(m, srcinfo, anon_name, parent):
            if callable(item):
                # An inline type decl: its body gets parsed right here.
                parse_body = item
                yield from parse_body(source)
            else:
                yield item
            # There may be more members, so keep going.
            done = False
def _parse_struct_next(m, srcinfo, anon_name, parent):
    """Yield the item(s) for a single match of STRUCT_MEMBER_RE.

    Three cases are handled:
      * the closing of the body: nothing is yielded
      * an inline type decl: a forward declaration is yielded, followed
        by a function that parses the inline decl's body
      * a regular member: a "field" item is yielded

    In every case the source info is advanced past the matched text.
    """
    (inline_kind, inline_name,
     qualspec, declarator,
     size,
     ending,
     close,
     ) = m.groups()
    remainder = srcinfo.text[m.end():]
    if close:
        log_match('compound close', m)
        srcinfo.advance(remainder)
    elif inline_kind:
        log_match('compound inline', m)
        kind = inline_kind
        name = inline_name or anon_name('inline-')
        # Immediately emit a forward declaration.
        yield srcinfo.resolve(kind, name=name, data=None)
        # un-inline the decl. Note that it might not actually be inline.
        # We handle the case in the "maybe_inline_actual" branch.
        srcinfo.nest(
            remainder,
            f'{kind} {name}',
        )
        def parse_body(source):
            # Collect the inline decl's own members and pass through
            # anything else that gets parsed out of its body.
            _parse_body = DECL_BODY_PARSERS[kind]
            data = [] # members
            ident = f'{kind} {name}'
            for item in _parse_body(source, anon_name, ident):
                if item.kind == 'field':
                    data.append(item)
                else:
                    yield item
            # XXX Should "parent" really be None for inline type decls?
            yield srcinfo.resolve(kind, data, name, parent=None)
            srcinfo.resume()
        yield parse_body
    else:
        # not inline (member)
        log_match('compound member', m)
        if qualspec:
            _, name, data = parse_var_decl(f'{qualspec} {declarator}')
            if not name:
                name = anon_name('struct-field-')
            if size:
                # data = (data, size)
                data['size'] = int(size)
        else:
            # This shouldn't happen (we expect each field to have a name).
            raise NotImplementedError
            # NOTE(review): the next two lines are unreachable and
            # "sized_name" is not defined anywhere in this function.
            name = sized_name or anon_name('struct-field-')
            data = int(size)
        yield srcinfo.resolve('field', data, name, parent) # XXX Restart?
        if ending == ',':
            # More declarators follow for the same type, so put the
            # specifier/qualifier back in front of the remaining text.
            remainder = rf'{qualspec} {remainder}'
        srcinfo.advance(remainder)
#############################
# enum
# The enumerator pattern, with the listed groups captured.  The order
# here matches the unpacking done in parse_enum_body().
ENUM_MEMBER_DECL = set_capture_groups(_ENUM_MEMBER_DECL, (
    'CLOSE',
    'NAME',
    'INIT',
    'ENDING',
))
ENUM_MEMBER_RE = re.compile(rf'{ENUM_MEMBER_DECL}', re.VERBOSE)
def parse_enum_body(source, _anon_name, _parent):
    """Yield a "field" item for each enumerator in the body of an enum.

    "source" is an iterator of source info objects (each with a "text"
    attribute).  Parsing stops at the closing brace.  If the source
    runs out first then the last source info seen (if any) is marked
    as done.
    """
    # The source may already be exhausted, in which case the "for" loop
    # below never binds srcinfo.  (Without this the "is not None" check
    # in the "else" clause raised UnboundLocalError.)
    srcinfo = None
    ending = None
    while ending != '}':
        for srcinfo in source:
            m = ENUM_MEMBER_RE.match(srcinfo.text)
            if m:
                break
        else:
            # We ran out of lines.
            if srcinfo is not None:
                srcinfo.done()
            return
        remainder = srcinfo.text[m.end():]

        (close,
         name, init, ending,
         ) = m.groups()
        if close:
            ending = '}'
        else:
            # The enumerator's initializer (if any) is its data.
            data = init
            yield srcinfo.resolve('field', data, name, _parent)
        srcinfo.advance(remainder)
#############################
# Maps each compound type kind to the function that parses its body.
# Each one is called as parse(source, anon_name, parent).
DECL_BODY_PARSERS = {
    'struct': parse_struct_body,
    'union': parse_struct_body,
    'enum': parse_enum_body,
}

View file

@ -0,0 +1,54 @@
import re
import textwrap
from ._regexes import _ind, STRING_LITERAL
def parse(text, anon_name):
    """Yield the results of handling each delimited segment of the text.

    The text is split on delimiters and each segment is dispatched to
    the handler registered for the current context and delimiter.  Each
    handler returns (result, context, data); falsy results are dropped.

    Fixes relative to the original: compiled patterns have a
    "finditer" method (not "find_iter") and the handler table in this
    module is named "_HANDLERS" (not "HANDLERS").
    """
    # NOTE(review): "anon_name" is not used yet, and the entries now in
    # _HANDLERS are placeholders rather than callables.
    context = None
    data = None
    for m in DELIMITER_RE.finditer(text):
        before, opened, closed = m.groups()
        delim = opened or closed

        handle_segment = _HANDLERS[context][delim]
        result, context, data = handle_segment(before, delim, data)
        if result:
            yield result
# Matches one segment of text: the (optional) text before a delimiter,
# which may contain string literals, followed by an (optional) opening
# or closing delimiter.
#
# This is an f-string, so literal curly braces have to be doubled.  The
# original left them bare (a syntax error) and closed the repeated
# non-capturing group with "}" instead of ")".
DELIMITER = textwrap.dedent(rf'''
    (
        (?:
            [^'"()\[\]{{}};]*
            {_ind(STRING_LITERAL, 3)}
         )*
        [^'"()\[\]{{}};]+
     )?  # <before>
    (?:
        (
            [(\[{{]
         )  # <open>
        |
        (
            [)\]}};]
         )  # <close>
     )?
    ''')
DELIMITER_RE = re.compile(DELIMITER, re.VERBOSE)
# Maps each parsing context to a table of per-delimiter handlers.
# The entries here are still placeholders (None or Ellipsis).
_HANDLERS = {
    None: { # global
        # opened
        '{': ...,
        '[': None,
        '(': None,
        # closed
        '}': None,
        ']': None,
        ')': None,
        ';': ...,
    },
    '': {
    },
}

View file

@ -0,0 +1,278 @@
import re
from ._regexes import (
LOCAL as _LOCAL,
LOCAL_STATICS as _LOCAL_STATICS,
)
from ._common import (
log_match,
parse_var_decl,
set_capture_groups,
match_paren,
)
from ._compound_decl_body import DECL_BODY_PARSERS
# The function-local pattern, with the listed groups captured.  The
# order here matches the unpacking done in parse_function_body().
LOCAL = set_capture_groups(_LOCAL, (
    'EMPTY',
    'INLINE_LEADING',
    'INLINE_PRE',
    'INLINE_KIND',
    'INLINE_NAME',
    'STORAGE',
    'VAR_DECL',
    'VAR_INIT',
    'VAR_ENDING',
    'COMPOUND_BARE',
    'COMPOUND_LABELED',
    'COMPOUND_PAREN',
    'BLOCK_LEADING',
    'BLOCK_OPEN',
    'SIMPLE_STMT',
    'SIMPLE_ENDING',
    'BLOCK_CLOSE',
))
LOCAL_RE = re.compile(rf'^ \s* {LOCAL}', re.VERBOSE)
# Note that parse_function_body() still has trouble with a few files
# in the CPython codebase.
def parse_function_body(source, name, anon_name):
    """Parse the body of a function (not implemented yet)."""
    # XXX
    raise NotImplementedError
def parse_function_body(name, text, resolve, source, anon_name, parent):
    """Yield (item, text) for each thing found in a function body.

    This is disabled: the generator raises NotImplementedError as soon
    as it is started, so everything after the "raise" is currently
    unreachable.  The rest of the body is kept as the work-in-progress
    implementation.

    Note that this definition replaces the one with the same name
    defined just before it in this module.
    """
    raise NotImplementedError
    # For now we do not worry about locals declared in for loop "headers".
    # "depth" tracks how many blocks we are inside of (the function
    # body itself counts as one).
    depth = 1;
    while depth > 0:
        m = LOCAL_RE.match(text)
        while not m:
            # NOTE(review): continue_text() is neither defined nor
            # imported in this module.
            text, resolve = continue_text(source, text or '{', resolve)
            m = LOCAL_RE.match(text)
        text = text[m.end():]
        (
            empty,
            inline_leading, inline_pre, inline_kind, inline_name,
            storage, decl,
            var_init, var_ending,
            compound_bare, compound_labeled, compound_paren,
            block_leading, block_open,
            simple_stmt, simple_ending,
            block_close,
        ) = m.groups()
        if empty:
            log_match('', m)
            resolve(None, None, None, text)
            yield None, text
        elif inline_kind:
            log_match('', m)
            kind = inline_kind
            name = inline_name or anon_name('inline-')
            data = [] # members
            # We must set the internal "text" from _iter_source() to the
            # start of the inline compound body,
            # Note that this is effectively like a forward reference that
            # we do not emit.
            resolve(kind, None, name, text, None)
            _parse_body = DECL_BODY_PARSERS[kind]
            before = []
            ident = f'{kind} {name}'
            for member, inline, text in _parse_body(text, resolve, source, anon_name, ident):
                if member:
                    data.append(member)
                if inline:
                    yield from inline
            # un-inline the decl. Note that it might not actually be inline.
            # We handle the case in the "maybe_inline_actual" branch.
            text = f'{inline_leading or ""} {inline_pre or ""} {kind} {name} {text}'
            # XXX Should "parent" really be None for inline type decls?
            yield resolve(kind, data, name, text, None), text
        elif block_close:
            log_match('', m)
            depth -= 1
            resolve(None, None, None, text)
            # XXX This isn't great. Calling resolve() should have
            # cleared the closing bracket. However, some code relies
            # on the yielded value instead of the resolved one. That
            # needs to be fixed.
            yield None, text
        elif compound_bare:
            log_match('', m)
            yield resolve('statement', compound_bare, None, text, parent), text
        elif compound_labeled:
            log_match('', m)
            yield resolve('statement', compound_labeled, None, text, parent), text
        elif compound_paren:
            log_match('', m)
            try:
                pos = match_paren(text)
            except ValueError:
                # The closing paren is not in the text yet, so put the
                # keyword back and pull in more text.
                text = f'{compound_paren} {text}'
                #resolve(None, None, None, text)
                text, resolve = continue_text(source, text, resolve)
                yield None, text
            else:
                head = text[:pos]
                text = text[pos:]
                if compound_paren == 'for':
                    # XXX Parse "head" as a compound statement.
                    stmt1, stmt2, stmt3 = head.split(';', 2)
                    data = {
                        'compound': compound_paren,
                        'statements': (stmt1, stmt2, stmt3),
                    }
                else:
                    data = {
                        'compound': compound_paren,
                        'statement': head,
                    }
                yield resolve('statement', data, None, text, parent), text
        elif block_open:
            log_match('', m)
            depth += 1
            if block_leading:
                # An inline block: the last evaluated expression is used
                # in place of the block.
                # XXX Combine it with the remainder after the block close.
                stmt = f'{block_open}{{<expr>}}...;'
                yield resolve('statement', stmt, None, text, parent), text
            else:
                resolve(None, None, None, text)
                yield None, text
        elif simple_ending:
            log_match('', m)
            yield resolve('statement', simple_stmt, None, text, parent), text
        elif var_ending:
            log_match('', m)
            kind = 'variable'
            _, name, vartype = parse_var_decl(decl)
            data = {
                'storage': storage,
                'vartype': vartype,
            }
            after = ()
            if var_ending == ',':
                # It was a multi-declaration, so queue up the next one.
                _, qual, typespec, _ = vartype.values()
                text = f'{storage or ""} {qual or ""} {typespec} {text}'
            yield resolve(kind, data, name, text, parent), text
            if var_init:
                # The initializer is emitted as a separate statement.
                _data = f'{name} = {var_init.strip()}'
                yield resolve('statement', _data, None, text, parent), text
        else:
            # This should be unreachable.
            raise NotImplementedError
#############################
# static local variables
# The pattern for static locals, with the listed groups captured.  The
# order here matches the unpacking done in _parse_next_local_static().
LOCAL_STATICS = set_capture_groups(_LOCAL_STATICS, (
    'INLINE_LEADING',
    'INLINE_PRE',
    'INLINE_KIND',
    'INLINE_NAME',
    'STATIC_DECL',
    'STATIC_INIT',
    'STATIC_ENDING',
    'DELIM_LEADING',
    'BLOCK_OPEN',
    'BLOCK_CLOSE',
    'STMT_END',
))
LOCAL_STATICS_RE = re.compile(rf'^ \s* {LOCAL_STATICS}', re.VERBOSE)
def parse_function_statics(source, func, anon_name):
    """Yield the static locals (and inline types) found in a function body.

    "source" is an iterator of source info objects (each with a "text"
    attribute).  Parsing stops once the block depth drops back to zero,
    i.e. at the end of the function body.  If the source runs out first
    then the last source info seen (if any) is marked as done.
    """
    # The source may already be exhausted, in which case the "for" loop
    # below never binds srcinfo.  (Without this the "is not None" check
    # in the "else" clause raised UnboundLocalError.)
    srcinfo = None
    # For now we do not worry about locals declared in for loop "headers".
    depth = 1
    while depth > 0:
        for srcinfo in source:
            m = LOCAL_STATICS_RE.match(srcinfo.text)
            if m:
                break
        else:
            # We ran out of lines.
            if srcinfo is not None:
                srcinfo.done()
            return
        # The depth is updated as each item is produced.
        for item, depth in _parse_next_local_static(m, srcinfo,
                                                    anon_name, func, depth):
            if callable(item):
                # An inline type decl: its body gets parsed right here.
                parse_body = item
                yield from parse_body(source)
            elif item is not None:
                yield item
def _parse_next_local_static(m, srcinfo, anon_name, func, depth):
    """Yield (item, depth) pairs for a single match of LOCAL_STATICS_RE.

    Three cases are handled:
      * an inline type decl: a forward declaration is yielded, followed
        by a function that parses the inline decl's body
      * a static variable: a "variable" item is yielded, with "func"
        as its parent
      * anything else: (None, depth) is yielded, with the depth updated
        for an opened or closed block

    "depth" is the current block nesting level.
    """
    (inline_leading, inline_pre, inline_kind, inline_name,
     static_decl, static_init, static_ending,
     _delim_leading,
     block_open,
     block_close,
     stmt_end,
     ) = m.groups()
    remainder = srcinfo.text[m.end():]
    if inline_kind:
        log_match('func inline', m)
        kind = inline_kind
        name = inline_name or anon_name('inline-')
        # Immediately emit a forward declaration.
        yield srcinfo.resolve(kind, name=name, data=None), depth
        # un-inline the decl. Note that it might not actually be inline.
        # We handle the case in the "maybe_inline_actual" branch.
        srcinfo.nest(
            remainder,
            f'{inline_leading or ""} {inline_pre or ""} {kind} {name}'
        )
        def parse_body(source):
            # Collect the inline decl's own members and pass through
            # anything else that gets parsed out of its body.
            _parse_body = DECL_BODY_PARSERS[kind]
            data = [] # members
            ident = f'{kind} {name}'
            for item in _parse_body(source, anon_name, ident):
                if item.kind == 'field':
                    data.append(item)
                else:
                    yield item
            # XXX Should "parent" really be None for inline type decls?
            yield srcinfo.resolve(kind, data, name, parent=None)
            srcinfo.resume()
        yield parse_body, depth
    elif static_decl:
        log_match('local variable', m)
        _, name, data = parse_var_decl(static_decl)
        yield srcinfo.resolve('variable', data, name, parent=func), depth
        if static_init:
            # Leave the initializer in the text, to be matched next.
            srcinfo.advance(f'{name} {static_init} {remainder}')
        elif static_ending == ',':
            # It was a multi-declaration, so queue up the next one.
            _, qual, typespec, _ = data.values()
            srcinfo.advance(f'static {qual or ""} {typespec} {remainder}')
        else:
            srcinfo.advance('')
    else:
        log_match('func other', m)
        if block_open:
            depth += 1
        elif block_close:
            depth -= 1
        elif stmt_end:
            pass
        else:
            # This should be unreachable.
            raise NotImplementedError
        srcinfo.advance(remainder)
        yield None, depth

View file

@ -0,0 +1,179 @@
import re
from ._regexes import (
GLOBAL as _GLOBAL,
)
from ._common import (
log_match,
parse_var_decl,
set_capture_groups,
)
from ._compound_decl_body import DECL_BODY_PARSERS
#from ._func_body import parse_function_body
from ._func_body import parse_function_statics as parse_function_body
# Apply the group names to the raw pattern's "# <NAME>" markers.
# The names are listed in the same order in which _parse_next()
# unpacks m.groups().
GLOBAL = set_capture_groups(_GLOBAL, (
    'EMPTY',
    'COMPOUND_LEADING',
    'COMPOUND_KIND',
    'COMPOUND_NAME',
    'FORWARD_KIND',
    'FORWARD_NAME',
    'MAYBE_INLINE_ACTUAL',
    'TYPEDEF_DECL',
    'TYPEDEF_FUNC_PARAMS',
    'VAR_STORAGE',
    'FUNC_INLINE',
    'VAR_DECL',
    'FUNC_PARAMS',
    'FUNC_DELIM',
    'FUNC_LEGACY_PARAMS',
    'VAR_INIT',
    'VAR_ENDING',
))
# The pattern is written free-form, hence re.VERBOSE.  It is anchored to
# the start of the pending text (leading whitespace is skipped).
GLOBAL_RE = re.compile(rf'^ \s* {GLOBAL}', re.VERBOSE)
def parse_globals(source, anon_name):
    """Yield the top-level declarations found in "source".

    "source" is an iterator of source-info objects; each one exposes
    the pending text as ".text".  When the text does not match yet,
    the next object is pulled from "source" (it presumably carries
    more text).  "anon_name" is passed through to _parse_next() for
    naming anonymous types.
    """
    # Pre-bind so the "ran out of lines" branch below also works when
    # "source" yields nothing at all.
    srcinfo = None
    for srcinfo in source:
        m = GLOBAL_RE.match(srcinfo.text)
        if not m:
            # We need more text.
            continue
        for item in _parse_next(m, srcinfo, anon_name):
            if callable(item):
                # A nested body parser (function or compound type).
                parse_body = item
                yield from parse_body(source)
            else:
                yield item
    else:
        # We ran out of lines.
        if srcinfo is not None:
            srcinfo.done()
        return
def _parse_next(m, srcinfo, anon_name):
    """Handle a single GLOBAL_RE match "m" against srcinfo.text.

    Yields either the result of srcinfo.resolve() (a declaration or
    statement) or a callable "parse_body(source)" that the caller must
    drive.  "srcinfo" is advanced/nested as a side effect, so the order
    of the statements relative to the yields matters.
    """
    (
     empty,
     # compound type decl (maybe inline)
     compound_leading, compound_kind, compound_name,
     forward_kind, forward_name, maybe_inline_actual,
     # typedef
     typedef_decl, typedef_func_params,
     # vars and funcs
     storage, func_inline, decl,
     func_params, func_delim, func_legacy_params,
     var_init, var_ending,
     ) = m.groups()
    # Whatever text follows the match is still pending.
    remainder = srcinfo.text[m.end():]

    if empty:
        log_match('global empty', m)
        srcinfo.advance(remainder)

    elif maybe_inline_actual:
        log_match('maybe_inline_actual', m)
        # Ignore forward declarations.
        # XXX Maybe return them too (with an "isforward" flag)?
        if not maybe_inline_actual.strip().endswith(';'):
            # Not a bare forward declaration: keep the trailing text
            # so that it gets parsed next.
            remainder = maybe_inline_actual + remainder
        yield srcinfo.resolve(forward_kind, None, forward_name)
        if maybe_inline_actual.strip().endswith('='):
            # We use a dummy prefix for a fake typedef.
            # XXX Ideally this case would not be caught by MAYBE_INLINE_ACTUAL.
            _, name, data = parse_var_decl(f'{forward_kind} {forward_name} fake_typedef_{forward_name}')
            yield srcinfo.resolve('typedef', data, name, parent=None)
            remainder = f'{name} {remainder}'
        srcinfo.advance(remainder)

    elif compound_kind:
        kind = compound_kind
        name = compound_name or anon_name('inline-')
        # Immediately emit a forward declaration.
        yield srcinfo.resolve(kind, name=name, data=None)

        # un-inline the decl. Note that it might not actually be inline.
        # We handle the case in the "maybe_inline_actual" branch.
        srcinfo.nest(
            remainder,
            f'{compound_leading or ""} {compound_kind} {name}',
        )
        def parse_body(source):
            # Pick the body parser registered for this compound kind.
            _parse_body = DECL_BODY_PARSERS[compound_kind]

            data = []  # members
            ident = f'{kind} {name}'
            for item in _parse_body(source, anon_name, ident):
                if item.kind == 'field':
                    data.append(item)
                else:
                    yield item
            # XXX Should "parent" really be None for inline type decls?
            yield srcinfo.resolve(kind, data, name, parent=None)

            # Switch back to the text that was set aside by nest().
            srcinfo.resume()
        yield parse_body

    elif typedef_decl:
        log_match('typedef', m)
        kind = 'typedef'
        _, name, data = parse_var_decl(typedef_decl)
        if typedef_func_params:
            return_type = data
            # This matches the data for func declarations.
            data = {
                'storage': None,
                'inline': None,
                'params': f'({typedef_func_params})',
                'returntype': return_type,
                'isforward': True,
            }
        yield srcinfo.resolve(kind, data, name, parent=None)
        srcinfo.advance(remainder)

    elif func_delim or func_legacy_params:
        log_match('function', m)
        kind = 'function'
        _, name, return_type = parse_var_decl(decl)
        func_params = func_params or func_legacy_params
        data = {
            'storage': storage,
            'inline': func_inline,
            'params': f'({func_params})',
            'returntype': return_type,
            'isforward': func_delim == ';',
        }

        yield srcinfo.resolve(kind, data, name, parent=None)
        srcinfo.advance(remainder)

        # Only a definition (not a forward declaration) has a body.
        if func_delim == '{' or func_legacy_params:
            def parse_body(source):
                yield from parse_function_body(source, name, anon_name)
            yield parse_body

    elif var_ending:
        log_match('global variable', m)
        kind = 'variable'
        _, name, vartype = parse_var_decl(decl)
        data = {
            'storage': storage,
            'vartype': vartype,
        }
        yield srcinfo.resolve(kind, data, name, parent=None)

        if var_ending == ',':
            # It was a multi-declaration, so queue up the next one.
            _, qual, typespec, _ = vartype.values()
            remainder = f'{storage or ""} {qual or ""} {typespec} {remainder}'
        srcinfo.advance(remainder)

        if var_init:
            # The initializer is reported separately, as a statement.
            _data = f'{name} = {var_init.strip()}'
            yield srcinfo.resolve('statement', _data, name=None)

    else:
        # This should be unreachable.
        raise NotImplementedError

View file

@ -0,0 +1,168 @@
from ..info import KIND, ParsedItem, FileInfo
class TextInfo:
    """A chunk of source text along with the line range it covers.

    Attributes:
      start -- first line number (defaults to 1)
      end   -- last line number (defaults to start + number of lines - 1)
      text  -- the accumulated text, stripped when first stored
      line  -- the most recently added line
    """

    def __init__(self, text, start=None, end=None):
        first = start or 1
        rows = text.splitlines()
        if not rows:
            rows = ['']
        last = end or (first + len(rows) - 1)

        # immutable:
        self.start = first
        # mutable:
        self.text = text.strip()
        self.end = last
        self.line = rows[-1]

    def __repr__(self):
        shown = ', '.join(
            f'{attr}={getattr(self, attr)!r}'
            for attr in ('text', 'start', 'end')
        )
        return f'{type(self).__name__}({shown})'

    def add_line(self, line, lno=None):
        """Append "line" (left-stripped) to the text and move "end" to "lno".

        Without "lno" the line is taken to directly follow the current end.
        """
        if lno is None:
            lno = self.end + 1
        elif isinstance(lno, FileInfo):
            # NOTE(review): nothing in this class sets "self.filename",
            # so this branch looks like it would fail with AttributeError
            # unless the attribute is assigned elsewhere -- confirm.
            where = lno
            if where.filename != self.filename:
                raise NotImplementedError((where, self.filename))
            lno = where.lno
        # XXX Should "lno < self.end" be rejected here?
        stripped = line.lstrip()
        self.text = f'{self.text} {stripped}'
        self.line = stripped
        self.end = lno
class SourceInfo:
    """The pending source text for one file, as consumed by the parsers.

    The current text lives in a TextInfo ("_current", or None when there
    is nothing pending).  Text that was set aside by nest() is kept on
    the "_nested" stack until resume() is called.
    """

    _ready = False

    def __init__(self, filename, _current=None):
        # immutable:
        self.filename = filename
        # mutable:
        if isinstance(_current, str):
            _current = TextInfo(_current)
        self._current = _current
        # -1 means "no line seen yet".
        self._start = _current.start if _current else -1
        self._nested = []
        self._set_ready()

    def __repr__(self):
        args = (f'{a}={getattr(self, a)!r}'
                for a in ['filename', '_current'])
        return f'{type(self).__name__}({", ".join(args)})'

    @property
    def start(self):
        """The first line of the current text (or the last known start)."""
        if self._current is None:
            return self._start
        return self._current.start

    @property
    def end(self):
        """The last line of the current text (or the last known start)."""
        if self._current is None:
            return self._start
        return self._current.end

    @property
    def text(self):
        """The current pending text ('' when there is none)."""
        if self._current is None:
            return ''
        return self._current.text

    def nest(self, text, before, start=None):
        """Set the current text aside (as "before") and switch to "text"."""
        if self._current is None:
            raise Exception('nesting requires active source text')
        current = self._current
        current.text = before
        self._nested.append(current)
        self._replace(text, start)

    def resume(self, remainder=None):
        """Go back to the most recently nested text, appending "remainder".

        "remainder" defaults to whatever is left of the current text.
        """
        if not self._nested:
            raise Exception('no nested text to resume')
        if self._current is None:
            raise Exception('un-nesting requires active source text')
        if remainder is None:
            remainder = self._current.text
        self._clear()
        self._current = self._nested.pop()
        self._current.text += ' ' + remainder
        self._set_ready()

    def advance(self, remainder, start=None):
        """Replace the current text with "remainder" (the unparsed rest)."""
        if self._current is None:
            raise Exception('advancing requires active source text')
        if remainder.strip():
            self._replace(remainder, start, fixnested=True)
        else:
            if self._nested:
                # Keep an (empty) current text so resume() still works.
                self._replace('', start, fixnested=True)
                #raise Exception('cannot advance while nesting')
            else:
                self._clear(start)

    def resolve(self, kind, data, name, parent=None):
        """Return a ParsedItem located at this file and the current start."""
        # "field" isn't a top-level kind, so we leave it as-is.
        if kind and kind != 'field':
            kind = KIND._from_raw(kind)
        fileinfo = FileInfo(self.filename, self._start)
        return ParsedItem(fileinfo, kind, parent, name, data)

    def done(self):
        """Recompute the "ready" flag once the source has run out."""
        self._set_ready()

    def _set_ready(self):
        # "Ready" means there is non-blank text pending.
        if self._current is None:
            self._ready = False
        else:
            self._ready = self._current.text.strip() != ''

    def _used(self):
        # Report the "ready" flag and reset it.
        ready = self._ready
        self._ready = False
        return ready

    def _clear(self, start=None):
        # Drop the current text (returning it) and remember where the
        # next text starts: "start" if given, else the old text's end.
        old = self._current
        if self._current is not None:
            # XXX Fail if self._current wasn't used up?
            if start is None:
                start = self._current.end
            self._current = None
        if start is not None:
            self._start = start
        self._set_ready()
        return old

    def _replace(self, text, start=None, *, fixnested=False):
        # The new text keeps the old text's end line.
        end = self._current.end
        old = self._clear(start)
        self._current = TextInfo(text, self._start, end)
        if fixnested and self._nested and self._nested[-1] is old:
            self._nested[-1] = self._current
        self._set_ready()

    def _add_line(self, line, lno=None):
        if not line.strip():
            # We don't worry about multi-line string literals.
            return
        if self._current is None:
            self._start = lno
            self._current = TextInfo(line, lno)
        else:
            # XXX
            #if lno < self._current.end:
            #    # A circular include?
            #    raise NotImplementedError((lno, self))
            self._current.add_line(line, lno)
        self._ready = True

View file

@ -0,0 +1,796 @@
# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing. However, a number of them
# have capturing markers compatible with utils.set_capture_groups().
import textwrap
def _ind(text, level=1, edges='both'):
indent = ' ' * level
text = textwrap.indent(text, indent)
if edges == 'pre' or edges == 'both':
text = '\n' + indent + text.lstrip()
if edges == 'post' or edges == 'both':
text = text.rstrip() + '\n' + ' ' * (level - 1)
return text
#######################################
# general
# A single hexadecimal digit (previously this matched any alphanumeric
# character, which let invalid escapes like '\xzz' through).
HEX = r'(?: [0-9a-fA-F] )'
# A C character literal or string literal (escapes included).
STRING_LITERAL = textwrap.dedent(rf'''
(?:
# character literal
(?:
['] [^'] [']
|
['] \\ . [']
|
['] \\x{HEX}{HEX} [']
|
['] \\0\d\d [']
|
(?:
['] \\o[01]\d\d [']
|
['] \\o2[0-4]\d [']
|
['] \\o25[0-5] [']
)
)
|
# string literal
(?:
["] (?: [^"\\]* \\ . )* [^"\\]* ["]
)
# end string literal
)
''')
# The words that may not be used as identifiers.
_KEYWORD = textwrap.dedent(r'''
(?:
\b
(?:
auto |
extern |
register |
static |
typedef |
const |
volatile |
signed |
unsigned |
char |
short |
int |
long |
float |
double |
void |
struct |
union |
enum |
goto |
return |
sizeof |
break |
continue |
if |
else |
for |
do |
while |
switch |
case |
default |
entry
)
\b
)
''')
# The multi-line form, wrapped in marker comments.  This must be built
# before _KEYWORD gets compacted below.
KEYWORD = rf'''
# keyword
{_KEYWORD}
# end keyword
'''
# Drop all whitespace so the pattern can be embedded on a single line.
_KEYWORD = ''.join(_KEYWORD.split())
# Any C identifier (keywords included).
IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER but also allows a "-<digits>" suffix.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
#######################################
# types
# A builtin type: void, a bare signed/unsigned, or an optionally
# signed/unsigned and long/short qualified base type.
SIMPLE_TYPE = textwrap.dedent(rf'''
# simple type
(?:
\b
(?:
void
|
(?: signed | unsigned ) # implies int
|
(?:
(?: (?: signed | unsigned ) \s+ )?
(?: (?: long | short ) \s+ )?
(?: char | short | int | long | float | double )
)
)
\b
)
# end simple type
''')
# The keyword that introduces a compound type.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
#######################################
# variable declarations
# The C storage class specifiers.
STORAGE_CLASS = r'(?: \b (?: auto | register | static | extern ) \b )'
# The C type qualifiers.
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A pointer "*", optionally followed by a type qualifier.
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'
# A type specifier: a simple type, a typeof() expression, a reference
# to a compound type, or a reference to a typedef.
TYPE_SPEC = textwrap.dedent(rf'''
# type spec
(?:
{_ind(SIMPLE_TYPE, 2)}
|
(?:
[_]*typeof[_]*
\s* [(]
(?: \s* [*&] )*
\s* {STRICT_IDENTIFIER}
\s* [)]
)
|
# reference to a compound type
(?:
{COMPOUND_TYPE_KIND}
(?: \s* {ANON_IDENTIFIER} )?
)
|
# reference to a typedef
{STRICT_IDENTIFIER}
)
# end type spec
''')
# A declarator: optional pointer qualifiers followed by a plain
# identifier, a parenthesized identifier, or a function pointer.  Each
# form allows trailing array brackets.  The "# <NAME>" comments are
# markers for set_capture_groups().
DECLARATOR = textwrap.dedent(rf'''
# declarator (possibly abstract)
(?:
(?: {PTR_QUALIFIER} \s* )*
(?:
(?:
(?: # <IDENTIFIER>
{STRICT_IDENTIFIER}
)
(?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
)
|
(?:
[(] \s*
(?: # <WRAPPED_IDENTIFIER>
{STRICT_IDENTIFIER}
)
(?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
\s* [)]
)
|
# func ptr
(?:
[(] (?: \s* {PTR_QUALIFIER} )? \s*
(?: # <FUNC_IDENTIFIER>
{STRICT_IDENTIFIER}
)
(?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
\s* [)]
# We allow for a single level of paren nesting in parameters.
\s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
)
)
)
# end declarator
''')
# A variable declaration up to (and including) the declarator:
# optional storage class, optional type qualifier, type spec, declarator.
# The "# <NAME>" comments are markers for set_capture_groups().
VAR_DECL = textwrap.dedent(rf'''
# var decl (and typedef and func return type)
(?:
(?:
(?: # <STORAGE>
{STORAGE_CLASS}
)
\s*
)?
(?:
(?: # <TYPE_QUAL>
{TYPE_QUALIFIER}
)
\s*
)?
(?:
(?: # <TYPE_SPEC>
{_ind(TYPE_SPEC, 4)}
)
)
\s*
(?:
(?: # <DECLARATOR>
{_ind(DECLARATOR, 4)}
)
)
)
# end var decl
''')
# The right-hand side of an initialization: an optional parenthesized
# prefix followed by string literal(s), a simple expression, or a
# brace-enclosed literal.  "{{" and "}}" are literal braces (this is an
# f-string).
INITIALIZER = textwrap.dedent(rf'''
# initializer
(?:
(?:
[(]
# no nested parens (e.g. func ptr)
[^)]*
[)]
\s*
)?
(?:
# a string literal
(?:
(?: {_ind(STRING_LITERAL, 4)} \s* )*
{_ind(STRING_LITERAL, 4)}
)
|
# a simple initializer
(?:
(?:
[^'",;{{]*
{_ind(STRING_LITERAL, 4)}
)*
[^'",;{{]*
)
|
# a struct/array literal
(?:
# We only expect compound initializers with
# single-variable declarations.
{{
(?:
[^'";]*?
{_ind(STRING_LITERAL, 5)}
)*
[^'";]*?
}}
(?= \s* ; ) # Note this lookahead.
)
)
)
# end initializer
''')
#######################################
# compound type declarations
# One step inside a struct/union body: the start of an inline compound
# type, a (possibly sized) member, or the closing brace.  The "# <NAME>"
# comments are markers for set_capture_groups().
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
(?:
# inline compound type decl
(?:
(?: # <COMPOUND_TYPE_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <COMPOUND_TYPE_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
(?:
# typed member
(?:
# Technically it doesn't have to have a type...
(?: # <SPECIFIER_QUALIFIER>
(?: {TYPE_QUALIFIER} \s* )?
{_ind(TYPE_SPEC, 5)}
)
(?:
# If it doesn't have a declarator then it will have
# a size and vice versa.
\s*
(?: # <DECLARATOR>
{_ind(DECLARATOR, 6)}
)
)?
)
# sized member
(?:
\s* [:] \s*
(?: # <SIZE>
\d+
)
)?
\s*
(?: # <ENDING>
[,;]
)
)
|
(?:
\s*
(?: # <CLOSE>
}}
)
)
)
''')
# One step inside an enum body: the closing brace, or a member name
# with an optional "= <init>", ended by "," or the closing brace.
# The "# <NAME>" comments are markers for set_capture_groups().
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
(?:
(?:
\s*
(?: # <CLOSE>
}}
)
)
|
(?:
\s*
(?: # <NAME>
{IDENTIFIER}
)
(?:
\s* = \s*
(?: # <INIT>
{_ind(STRING_LITERAL, 4)}
|
[^'",}}]+
)
)?
\s*
(?: # <ENDING>
, | }}
)
)
)
''')
#######################################
# statements
# The text of a statement up to (not including) the next brace or
# semicolon.  String/char literals are skipped as a unit, so those
# delimiters may appear inside them.
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
# simple statement body
(?:
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 3)}
)*
[^'"{{}};]*
#(?= [;{{] ) # Note this lookahead.
)
# end simple statement body
''')
# A statement ended by ";": a return, a variable assignment, or anything
# else that SIMPLE_STMT_BODY accepts.  The "# <NAME>" comments are
# markers for set_capture_groups().
# In the assignment target, the member-access operator is matched with
# "[.]"; a bare "." would match any character.
SIMPLE_STMT = textwrap.dedent(rf'''
# simple statement
(?:
(?: # <SIMPLE_STMT>
# stmt-inline "initializer"
(?:
return \b
(?:
\s*
{_ind(INITIALIZER, 5)}
)?
)
|
# variable assignment
(?:
(?: [*] \s* )?
(?:
{STRICT_IDENTIFIER} \s*
(?: [.] | -> ) \s*
)*
{STRICT_IDENTIFIER}
(?: \s* \[ \s* \d+ \s* \] )?
\s* = \s*
{_ind(INITIALIZER, 4)}
)
|
# catchall return statement
(?:
return \b
(?:
(?:
[^'";]*
{_ind(STRING_LITERAL, 6)}
)*
\s* [^'";]*
)?
)
|
# simple statement
(?:
{_ind(SIMPLE_STMT_BODY, 4)}
)
)
\s*
(?: # <SIMPLE_ENDING>
;
)
)
# end simple statement
''')
# The start of a compound statement: a bare keyword (else/do), a label
# (case/default/identifier followed by ":"), or a keyword that must be
# followed by "(".  The "# <NAME>" comments are markers for
# set_capture_groups().
COMPOUND_STMT = textwrap.dedent(rf'''
# compound statement
(?:
\b
(?:
(?:
(?: # <COMPOUND_BARE>
else | do
)
\b
)
|
(?:
(?: # <COMPOUND_LABELED>
(?:
case \b
(?:
[^'":]*
{_ind(STRING_LITERAL, 7)}
)*
\s* [^'":]*
)
|
default
|
{STRICT_IDENTIFIER}
)
\s* [:]
)
|
(?:
(?: # <COMPOUND_PAREN>
for | while | if | switch
)
\s* (?= [(] ) # Note this lookahead.
)
)
\s*
)
# end compound statement
''')
#######################################
# function bodies
# One step inside a function body: an empty statement, the start of an
# inline type declaration, a variable declaration, the start of a
# compound statement, a block opening, a simple statement, or a block
# closing.  The "# <NAME>" comments are markers for set_capture_groups().
LOCAL = textwrap.dedent(rf'''
(?:
# an empty statement
(?: # <EMPTY>
;
)
|
# inline type decl
(?:
(?:
(?: # <INLINE_LEADING>
[^;{{}}]+?
)
\s*
)?
(?: # <INLINE_PRE>
(?: {STORAGE_CLASS} \s* )?
(?: {TYPE_QUALIFIER} \s* )?
)? # </INLINE_PRE>
(?: # <INLINE_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <INLINE_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
# var decl
(?:
(?: # <STORAGE>
{STORAGE_CLASS}
)? # </STORAGE>
(?:
\s*
(?: # <VAR_DECL>
{_ind(VAR_DECL, 5)}
)
)
(?:
(?:
# initializer
# We expect only basic initializers.
\s* = \s*
(?: # <VAR_INIT>
{_ind(INITIALIZER, 6)}
)
)?
(?:
\s*
(?: # <VAR_ENDING>
[,;]
)
)
)
)
|
{_ind(COMPOUND_STMT, 2)}
|
# start-of-block
(?:
(?: # <BLOCK_LEADING>
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 5)}
)*
[^'"{{}};]*
# Presumably we will not see "== {{".
[^\s='"{{}});]
\s*
)? # </BLOCK_LEADING>
(?: # <BLOCK_OPEN>
{{
)
)
|
{_ind(SIMPLE_STMT, 2)}
|
# end-of-block
(?: # <BLOCK_CLOSE>
}}
)
)
''')
# A reduced version of LOCAL that only distinguishes inline type
# declarations, "static" variable declarations, and everything else
# (any text up to the next "{", "}" or ";").  The "# <NAME>" comments
# are markers for set_capture_groups().
LOCAL_STATICS = textwrap.dedent(rf'''
(?:
# inline type decl
(?:
(?:
(?: # <INLINE_LEADING>
[^;{{}}]+?
)
\s*
)?
(?: # <INLINE_PRE>
(?: {STORAGE_CLASS} \s* )?
(?: {TYPE_QUALIFIER} \s* )?
)?
(?: # <INLINE_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <INLINE_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
# var decl
(?:
# We only look for static variables.
(?: # <STATIC_DECL>
static \b
(?: \s* {TYPE_QUALIFIER} )?
\s* {_ind(TYPE_SPEC, 4)}
\s* {_ind(DECLARATOR, 4)}
)
\s*
(?:
(?: # <STATIC_INIT>
= \s*
{_ind(INITIALIZER, 4)}
\s*
[,;{{]
)
|
(?: # <STATIC_ENDING>
[,;]
)
)
)
|
# everything else
(?:
(?: # <DELIM_LEADING>
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 4)}
)*
\s* [^'"{{}};]*
)
(?:
(?: # <BLOCK_OPEN>
{{
)
|
(?: # <BLOCK_CLOSE>
}}
)
|
(?: # <STMT_END>
;
)
)
)
)
''')
#######################################
# global declarations
# One top-level declaration step: an empty statement, the start of a
# compound type declaration, the leftover of an un-inlined compound
# declaration, a typedef, a function declaration/definition, or a
# variable declaration.  The "# <NAME>" comments are markers for
# set_capture_groups().
GLOBAL = textwrap.dedent(rf'''
(?:
# an empty statement
(?: # <EMPTY>
;
)
|
# compound type decl (maybe inline)
(?:
(?:
(?: # <COMPOUND_LEADING>
[^;{{}}]+?
)
\s*
)?
(?: # <COMPOUND_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <COMPOUND_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
# bogus inline decl artifact
# This simplifies resolving the relative syntactic ambiguity of
# inline structs.
(?:
(?: # <FORWARD_KIND>
{COMPOUND_TYPE_KIND}
)
\s*
(?: # <FORWARD_NAME>
{ANON_IDENTIFIER}
)
(?: # <MAYBE_INLINE_ACTUAL>
[^=,;({{[*\]]*
[=,;({{]
)
)
|
# typedef
(?:
\b typedef \b \s*
(?: # <TYPEDEF_DECL>
{_ind(VAR_DECL, 4)}
)
(?:
# We expect no inline type definitions in the parameters.
\s* [(] \s*
(?: # <TYPEDEF_FUNC_PARAMS>
[^{{;]*
)
\s* [)]
)?
\s* ;
)
|
# func decl/definition & var decls
# XXX dedicated pattern for funcs (more restricted)?
(?:
(?:
(?: # <VAR_STORAGE>
{STORAGE_CLASS}
)
\s*
)?
(?:
(?: # <FUNC_INLINE>
\b inline \b
)
\s*
)?
(?: # <VAR_DECL>
{_ind(VAR_DECL, 4)}
)
(?:
# func decl / definition
(?:
(?:
# We expect no inline type definitions in the parameters.
\s* [(] \s*
(?: # <FUNC_PARAMS>
[^{{;]*
)
\s* [)] \s*
(?: # <FUNC_DELIM>
[{{;]
)
)
|
(?:
# This is some old-school syntax!
\s* [(] \s*
# We throw away the bare names:
{STRICT_IDENTIFIER}
(?: \s* , \s* {STRICT_IDENTIFIER} )*
\s* [)] \s*
# We keep the trailing param declarations:
(?: # <FUNC_LEGACY_PARAMS>
# There's at least one!
(?: {TYPE_QUALIFIER} \s* )?
{_ind(TYPE_SPEC, 7)}
\s*
{_ind(DECLARATOR, 7)}
\s* ;
(?:
\s*
(?: {TYPE_QUALIFIER} \s* )?
{_ind(TYPE_SPEC, 8)}
\s*
{_ind(DECLARATOR, 8)}
\s* ;
)*
)
\s* {{
)
)
|
# var / typedef
(?:
(?:
# initializer
# We expect only basic initializers.
\s* = \s*
(?: # <VAR_INIT>
{_ind(INITIALIZER, 6)}
)
)?
\s*
(?: # <VAR_ENDING>
[,;]
)
)
)
)
)
''')

View file

@ -0,0 +1,190 @@
import contextlib
import distutils.ccompiler
import logging
import os.path
from c_common.fsutil import match_glob as _match_glob
from c_common.tables import parse_table as _parse_table
from ..source import (
resolve as _resolve_source,
good_file as _good_file,
)
from . import errors as _errors
from . import (
pure as _pure,
gcc as _gcc,
)
logger = logging.getLogger(__name__)
# Supported "source":
# * filename (string)
# * lines (iterable)
# * text (string)
# Supported return values:
# * iterator of SourceLine
# * sequence of SourceLine
# * text (string)
# * something that combines all those
# XXX Add the missing support from above.
# XXX Add more low-level functions to handle permutations?
def preprocess(source, *,
               incldirs=None,
               macros=None,
               samefiles=None,
               filename=None,
               tool=True,
               ):
    """Return the preprocessed form of "source".

    With a true "tool" the matching compiler-backed preprocessor is
    used (True selects the default one); an empty result is returned
    as ().  With a false "tool" the pure-Python preprocessor is used
    and "incldirs", "macros" and "samefiles" are ignored.

    CWD should be the project root and "source" should be relative.
    """
    if not tool:
        source, filename = _resolve_source(source, filename)
        # We ignore "includes", "macros", etc.
        return _pure.preprocess(source, filename)

    logger.debug(f'CWD: {os.getcwd()!r}')
    logger.debug(f'incldirs: {incldirs!r}')
    logger.debug(f'macros: {macros!r}')
    logger.debug(f'samefiles: {samefiles!r}')
    run_tool = _get_preprocessor(tool)
    with _good_file(source, filename) as source:
        result = run_tool(source, incldirs, macros, samefiles)
        return result or ()
# if _run() returns just the lines:
# text = _run(source)
# lines = [line + os.linesep for line in text.splitlines()]
# lines[-1] = lines[-1].splitlines()[0]
#
# conditions = None
# for lno, line in enumerate(lines, 1):
# kind = 'source'
# directive = None
# data = line
# yield lno, kind, data, conditions
def get_preprocessor(*,
                     file_macros=None,
                     file_incldirs=None,
                     file_same=None,
                     ignore_exc=False,
                     log_err=None,
                     ):
    """Return a "get_file_preprocessor(filename)" factory.

    The factory returns a "preprocess(**kwargs)" callable for the given
    file.  Per-file macros and include dirs are looked up by matching
    the filename against the parsed "file_macros" / "file_incldirs"
    tables.  Explicit "macros", "incldirs" and "samefiles" keyword
    arguments always win over these defaults.

    "ignore_exc" is either a callable taking the exception or a plain
    value used as the answer for every exception.  When an error is
    ignored, the preprocess callable returns None.
    """
    _preprocess = preprocess
    if file_macros:
        file_macros = tuple(_parse_macros(file_macros))
    if file_incldirs:
        file_incldirs = tuple(_parse_incldirs(file_incldirs))
    if file_same:
        file_same = tuple(file_same)
    if not callable(ignore_exc):
        ignore_exc = (lambda exc, _ig=ignore_exc: _ig)

    def get_file_preprocessor(filename):
        filename = filename.strip()
        # Resolve the per-file defaults once, up front.
        macros = incldirs = None
        if file_macros:
            macros = list(_resolve_file_values(filename, file_macros))
        if file_incldirs:
            incldirs = [v for v, in _resolve_file_values(filename, file_incldirs)]

        def preprocess(**kwargs):
            if file_macros and 'macros' not in kwargs:
                kwargs['macros'] = macros
            if file_incldirs and 'incldirs' not in kwargs:
                # Hand out a copy so each call gets its own list.
                kwargs['incldirs'] = list(incldirs)
            # The keyword is "samefiles" (not "file_same"); checking the
            # wrong key would clobber an explicitly passed value.
            if file_same and 'samefiles' not in kwargs:
                kwargs['samefiles'] = file_same
            kwargs.setdefault('filename', filename)
            with handling_errors(ignore_exc, log_err=log_err):
                return _preprocess(filename, **kwargs)
        return preprocess
    return get_file_preprocessor
def _resolve_file_values(filename, file_values):
# We expect the filename and all patterns to be absolute paths.
for pattern, *value in file_values or ():
if _match_glob(filename, pattern):
yield value
def _parse_macros(macros):
    """Yield the (glob, name, value) rows parsed from "macros"."""
    table = _parse_table(macros, '\t', 'glob\tname\tvalue', rawsep='=', default=None)
    # The source file of each row is not needed here.
    yield from (row for row, _srcfile in table)
def _parse_incldirs(incldirs):
    """Yield the (glob, dirname) rows parsed from "incldirs".

    A row with only one value is treated as a directory that applies
    to every file.
    """
    table = _parse_table(incldirs, '\t', 'glob\tdirname', default=None)
    for row, _srcfile in table:
        pattern, dirname = row
        if dirname is not None:
            yield row
            continue
        # Match all files.
        yield ('*', pattern.strip())
@contextlib.contextmanager
def handling_errors(ignore_exc=None, *, log_err=None):
    """Optionally swallow (and log) the known preprocessor errors.

    "ignore_exc" is called with the exception and decides whether it
    gets suppressed; with the default (None) nothing is suppressed.
    Suppressed errors are reported through "log_err", if given.
    """
    def ignored(exc):
        # The default must not be called: None is not callable.
        return ignore_exc is not None and ignore_exc(exc)

    try:
        yield
    # OSMismatchError is handled before the other cases so that it gets
    # its own message.
    except _errors.OSMismatchError as exc:
        if not ignored(exc):
            raise  # re-raise
        if log_err is not None:
            log_err(f'<OS mismatch (expected {" or ".join(exc.expected)})>')
        return None
    except _errors.MissingDependenciesError as exc:
        if not ignored(exc):
            raise  # re-raise
        if log_err is not None:
            log_err(f'<missing dependency {exc.missing}>')
        return None
    except _errors.ErrorDirectiveError as exc:
        if not ignored(exc):
            raise  # re-raise
        if log_err is not None:
            log_err(exc)
        return None
##################################
# tools
# Maps a tool name to the function that runs its preprocessor.
# None marks a known tool that is not supported (see _get_preprocessor()).
_COMPILERS = {
    # matching distutils.ccompiler.compiler_class:
    'unix': _gcc.preprocess,
    'msvc': None,
    'cygwin': None,
    'mingw32': None,
    'bcpp': None,
    # aliases/extras:
    'gcc': _gcc.preprocess,
    'clang': None,
}
def _get_preprocessor(tool):
    """Return the preprocess function registered for "tool".

    True selects the default compiler reported by distutils.
    Raises ValueError for an unknown or unsupported tool.
    """
    name = distutils.ccompiler.get_default_compiler() if tool is True else tool
    handler = _COMPILERS.get(name)
    if handler is None:
        raise ValueError(f'unsupported tool {name}')
    return handler
##################################
# aliases
from .errors import (
PreprocessorError,
PreprocessorFailure,
ErrorDirectiveError,
MissingDependenciesError,
OSMismatchError,
)
from .common import FileInfo, SourceLine

View file

@ -0,0 +1,196 @@
import logging
import sys
from c_common.scriptutil import (
CLIArgSpec as Arg,
add_verbosity_cli,
add_traceback_cli,
add_kind_filtering_cli,
add_files_cli,
add_failure_filtering_cli,
add_commands_cli,
process_args_by_key,
configure_logger,
get_prog,
main_for_filenames,
)
from . import (
errors as _errors,
get_preprocessor as _get_preprocessor,
)
# Maps a short failure name to the exception class it stands for.
# It is handed to add_failure_filtering_cli() in add_common_cli().
FAIL = {
    'err': _errors.ErrorDirectiveError,
    'deps': _errors.MissingDependenciesError,
    'os': _errors.OSMismatchError,
}
# All the failure names except 'os'.
FAIL_DEFAULT = tuple(v for v in FAIL if v != 'os')
logger = logging.getLogger(__name__)
##################################
# CLI helpers
def add_common_cli(parser, *, get_preprocessor=_get_preprocessor):
    """Add the preprocessor-related CLI options to "parser".

    Returns a "process_args(args)" function that replaces the raw
    option values on "args" with "args.get_file_preprocessor".
    """
    for flag in ('--macros', '--incldirs', '--same'):
        parser.add_argument(flag, action='append')
    process_fail_arg = add_failure_filtering_cli(parser, FAIL)

    def process_args(args):
        ns = vars(args)

        process_fail_arg(args)
        # We later pass ignore_exc to _get_preprocessor().
        ignore_exc = ns.pop('ignore_exc')
        macros = ns.pop('macros')
        incldirs = ns.pop('incldirs')
        same = ns.pop('same')

        args.get_file_preprocessor = get_preprocessor(
            file_macros=macros,
            file_incldirs=incldirs,
            file_same=same,
            ignore_exc=ignore_exc,
            log_err=print,
        )
    return process_args
def _iter_preprocessed(filename, *,
get_preprocessor,
match_kind=None,
pure=False,
):
preprocess = get_preprocessor(filename)
for line in preprocess(tool=not pure) or ():
if match_kind is not None and not match_kind(line.kind):
continue
yield line
#######################################
# the commands
def _cli_preprocess(parser, excluded=None, **prepr_kwargs):
    """Add the CLI options of the "preprocess" command to "parser".

    Returns the list of arg processors to run after parsing.
    """
    parser.add_argument('--pure', action='store_true')
    parser.add_argument('--no-pure', dest='pure', action='store_const', const=False)
    # The options are registered in this exact order.
    kinds_processor = add_kind_filtering_cli(parser)
    common_processor = add_common_cli(parser, **prepr_kwargs)
    parser.add_argument('--raw', action='store_true')
    files_processor = add_files_cli(parser, excluded=excluded)
    return [kinds_processor, common_processor, files_processor]
def cmd_preprocess(filenames, *,
                   raw=False,
                   iter_filenames=None,
                   **kwargs
                   ):
    """Preprocess each of the given files and print the resulting lines.

    With "raw" each line object is printed as-is; otherwise a summary
    (line number, kind, text) is printed per line.  The remaining
    keyword arguments are passed on to _iter_preprocessed().
    """
    # The CLI stores the factory as "get_file_preprocessor" while
    # _iter_preprocessed() takes it as "get_preprocessor", so translate.
    if 'get_file_preprocessor' in kwargs:
        kwargs.setdefault('get_preprocessor',
                          kwargs.pop('get_file_preprocessor'))
    elif 'get_preprocessor' not in kwargs:
        kwargs['get_preprocessor'] = _get_preprocessor()
    if raw:
        def show_file(filename, lines):
            for line in lines:
                print(line)
                #print(line.raw)
    else:
        def show_file(filename, lines):
            for line in lines:
                # NOTE(review): "linefile" is computed but not printed;
                # presumably it was meant to be part of the output.
                linefile = ''
                if line.filename != filename:
                    linefile = f' ({line.filename})'
                text = line.data
                if line.kind == 'comment':
                    # Only show the first line of a comment; mark the
                    # comment as truncated if there was more.
                    text = '/* ' + line.data.splitlines()[0]
                    text += r'\n... */' if '\n' in line.data else ' */'
                print(f' {line.lno:>4} {line.kind:10} | {text}')

    filenames = main_for_filenames(filenames, iter_filenames)
    for filename in filenames:
        lines = _iter_preprocessed(filename, **kwargs)
        show_file(filename, lines)
def _cli_data(parser):
...
return None
def cmd_data(filenames, **kwargs):
    """Run the "data" command (not implemented yet)."""
    # XXX
    raise NotImplementedError
# Maps a command name to (description, CLI spec functions, command function).
COMMANDS = {
    'preprocess': (
        'preprocess the given C source & header files',
        [_cli_preprocess],
        cmd_preprocess,
    ),
    'data': (
        'check/manage local data (e.g. excludes, macros)',
        [_cli_data],
        cmd_data,
    ),
}
#######################################
# the script
def parse_args(argv=sys.argv[1:], prog=sys.argv[0], *,
               subset='preprocess',
               excluded=None,
               **prepr_kwargs
               ):
    """Parse the command line.

    Returns (cmd, cmd_kwargs, verbosity, traceback_cm).  Note that
    "excluded" and "prepr_kwargs" are accepted but not used in the body.
    """
    import argparse
    parser = argparse.ArgumentParser(prog=prog or get_prog())

    # Only the CLI specs (the middle item) are needed for the parser.
    cli_specs = {name: entry[1] for name, entry in COMMANDS.items()}
    common = [add_verbosity_cli, add_traceback_cli]
    processors = add_commands_cli(
        parser,
        commands=cli_specs,
        commonspecs=common,
        subset=subset,
    )

    args = parser.parse_args(argv)
    ns = vars(args)

    cmd = ns.pop('cmd')

    wanted = ['verbosity', 'traceback_cm']
    verbosity, traceback_cm = process_args_by_key(args, processors[cmd], wanted)

    return cmd, ns, verbosity, traceback_cm
def main(cmd, cmd_kwargs):
    """Run the command named "cmd" with the given keyword arguments.

    Raises ValueError if the command is not in COMMANDS.
    """
    try:
        # Each entry is (description, CLI specs, command function).
        _description, _cli_specs, run_cmd = COMMANDS[cmd]
    except KeyError:
        raise ValueError(f'unsupported cmd {cmd!r}')
    run_cmd(**cmd_kwargs)
# Script entry point: parse the CLI, configure logging, then run the
# command inside the context manager returned by parse_args().
if __name__ == '__main__':
    cmd, cmd_kwargs, verbosity, traceback_cm = parse_args()
    configure_logger(verbosity)
    with traceback_cm:
        main(cmd, cmd_kwargs)

View file

@ -0,0 +1,173 @@
import contextlib
import distutils.ccompiler
import logging
import shlex
import subprocess
import sys
from ..info import FileInfo, SourceLine
from .errors import (
PreprocessorFailure,
ErrorDirectiveError,
MissingDependenciesError,
OSMismatchError,
)
logger = logging.getLogger(__name__)
# XXX Add aggregate "source" class(es)?
# * expose all lines as single text string
# * expose all lines as sequence
# * iterate all lines
def run_cmd(argv, *,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
            **kwargs
            ):
    """Run "argv" with subprocess.run() and return the captured stdout.

    The string 'stdout' (in any case) may be passed as "stderr" to merge
    stderr into stdout.  Any extra keyword arguments are passed through
    to subprocess.run().
    """
    if isinstance(stderr, str) and stderr.lower() == 'stdout':
        stderr = subprocess.STDOUT

    proc = subprocess.run(
        argv,
        stdout=stdout,
        stderr=stderr,
        text=text,
        check=check,
        **kwargs
    )
    return proc.stdout
def preprocess(tool, filename, **kwargs):
    """Run the preprocessor "tool" on "filename" and return its output.

    The keyword arguments are passed on to _build_argv().  A failing
    command is converted into one of the PreprocessorFailure errors
    by converted_error().
    """
    argv = _build_argv(tool, filename, **kwargs)
    logger.debug(' '.join(shlex.quote(v) for v in argv))

    # Make sure the OS is supported for this file.
    if (_expected := is_os_mismatch(filename)):
        error = None
        # Report the tool that was passed in (this module has no "TOOL").
        raise OSMismatchError(filename, _expected, argv, error, tool)

    # Run the command.
    with converted_error(tool, argv, filename):
        # We use subprocess directly here, instead of calling the
        # distutil compiler object's preprocess() method, since that
        # one writes to stdout/stderr and it's simpler to do it directly
        # through subprocess.
        return run_cmd(argv)
def _build_argv(
    tool,
    filename,
    incldirs=None,
    macros=None,
    preargs=None,
    postargs=None,
    executable=None,
    compiler=None,
):
    """Return the command line distutils would run to preprocess "filename".

    The compiler object's spawn() is replaced so that the command is
    recorded instead of executed.  None is returned if spawn() never
    gets called.
    """
    cc = distutils.ccompiler.new_compiler(compiler=compiler or tool)
    if executable:
        cc.set_executable('preprocessor', executable)

    recorded = []

    def _spawn(_argv):
        # Remember the command instead of running it.
        recorded.append(_argv)
    cc.spawn = _spawn

    cc.preprocess(
        filename,
        macros=list(map(tuple, macros or ())),
        include_dirs=incldirs or (),
        extra_preargs=preargs or (),
        extra_postargs=postargs or (),
    )
    # The last recorded command wins.
    return recorded[-1] if recorded else None
@contextlib.contextmanager
def converted_error(tool, argv, filename):
    """Turn a failed preprocessor command into a more specific error.

    A subprocess.CalledProcessError raised in the body is handed to
    convert_error() along with its stderr and return code.
    """
    try:
        yield
    except subprocess.CalledProcessError as failure:
        details = (failure.stderr, failure.returncode)
        convert_error(tool, argv, filename, *details)
def convert_error(tool, argv, filename, stderr, rc):
    """Raise the error that best describes a failed preprocessor run.

    "stderr" is the captured error output (it may be empty or None) and
    "rc" the exit code.  The error text is inspected to tell apart an
    OS mismatch, a missing dependency and an #error directive.  For
    anything else the command is run once more with stderr going to
    the terminal; PreprocessorFailure is raised if it fails again.

    NOTE(review): if that second run succeeds, nothing is raised and
    its output is discarded -- confirm that this is intended.
    """
    stderr = stderr or ''
    lines = stderr.splitlines()
    # There may be no error output at all.
    error = (lines[0] if lines else '', rc)
    if (_expected := is_os_mismatch(filename, stderr)):
        logger.debug(stderr.strip())
        raise OSMismatchError(filename, _expected, argv, error, tool)
    elif (_missing := is_missing_dep(stderr)):
        logger.debug(stderr.strip())
        raise MissingDependenciesError(filename, (_missing,), argv, error, tool)
    elif '#error' in stderr:
        # XXX Ignore incompatible files.
        # Prefer the second line but fall back to the first one if the
        # output has only a single line.
        error = (lines[1] if len(lines) > 1 else lines[0], rc)
        logger.debug(stderr.strip())
        raise ErrorDirectiveError(filename, argv, error, tool)
    else:
        # Try one more time, with stderr written to the terminal.
        try:
            run_cmd(argv, stderr=None)
        except subprocess.CalledProcessError:
            raise PreprocessorFailure(filename, argv, error, tool)
def is_os_mismatch(filename, errtext=None):
    """Return the expected OSes if the file does not fit this OS.

    The check is based on the dependency reported as missing in
    "errtext".  False is returned when there is no error text, no
    missing dependency, or when the current platform is among the
    matching ones.
    """
    # See: https://docs.python.org/3/library/sys.html#sys.platform
    platform = sys.platform
    if platform == 'unknown':
        raise NotImplementedError

    if errtext is None:
        return False
    missing = is_missing_dep(errtext)
    if not missing:
        return False
    candidates = get_matching_oses(missing, filename)
    if platform in candidates:
        return False
    return candidates
def get_matching_oses(missing, filename):
    """Return the sys.platform values that "missing" (a header) implies.

    The filename is checked first: anything with "darwin" or "osx" in
    it is taken to be macOS-only.  An empty tuple means "unrecognized".
    """
    darwin = ('darwin',)

    # OSX
    if 'darwin' in filename or 'osx' in filename:
        return darwin
    if missing == 'SystemConfiguration/SystemConfiguration.h':
        return darwin

    # Windows
    if missing in ('windows.h', 'winsock2.h'):
        return ('win32',)

    # other
    if missing == 'sys/ldr.h':
        return ('aix',)
    if missing == 'dl.h':
        # XXX The existence of Python/dynload_dl.c implies others...
        # Note that hpux isn't actually supported any more.
        return ('hpux', '???')

    # unrecognized
    return ()
def is_missing_dep(errtext):
    """Return the file reported as missing in "errtext" (or False).

    The result is the last whitespace-separated word in front of the
    first ": No such file or directory".
    """
    if 'No such file or directory' not in errtext:
        return False
    before, _, _ = errtext.partition(': No such file or directory')
    return before.split()[-1]

View file

@ -0,0 +1,110 @@
import sys
# The platform identifier of the host; used in OSMismatchError.
OS = sys.platform
def _as_tuple(items):
if isinstance(items, str):
return tuple(items.strip().replace(',', ' ').split())
elif items:
return tuple(items)
else:
return ()
class PreprocessorError(Exception):
    """Something preprocessor-related went wrong."""

    @classmethod
    def _msg(cls, filename, reason, **ignored):
        # Subclasses override this; it is called with all the instance
        # attributes as keyword arguments.
        base = 'failure while preprocessing'
        return f'{base} ({reason})' if reason else base

    def __init__(self, filename, preprocessor=None, reason=None):
        if isinstance(reason, str):
            reason = reason.strip()

        self.filename = filename
        self.preprocessor = preprocessor or None
        self.reason = str(reason) if reason else None

        # The final message is "[<preprocessor>] (<filename>) <details>".
        details = self._msg(**vars(self))
        text = f'({filename}) {details}'
        if preprocessor:
            text = f'[{preprocessor}] {text}'
        super().__init__(text)
class PreprocessorFailure(PreprocessorError):
    """The preprocessor command failed."""

    @classmethod
    def _msg(cls, error, **ignored):
        """Return the core message text, including the error (if any)."""
        msg = 'preprocessor command failed'
        if error:
            msg = f'{msg} {error}'
        return msg

    def __init__(self, filename, argv, error=None, preprocessor=None):
        # "error" may be the error text or a 2-tuple of (text, exitcode).
        # Any other tuple is simply stringified.
        exitcode = -1
        if isinstance(error, tuple):
            if len(error) == 2:
                error, exitcode = error
            else:
                error = str(error)
        if isinstance(error, str):
            error = error.strip()

        # These must be set before calling super().__init__(), since
        # that passes all the instance attributes to _msg().
        self.argv = _as_tuple(argv) or None
        self.error = error if error else None
        self.exitcode = exitcode

        # Do not stringify a missing error: str(None) is "None", which
        # would end up as a bogus (truthy) "reason".
        reason = str(self.error) if self.error is not None else None
        super().__init__(filename, preprocessor, reason)
class ErrorDirectiveError(PreprocessorFailure):
    """The file hit a #error directive."""

    @classmethod
    def _msg(cls, error, **ignored):
        """Return the core message text, which always shows the error."""
        detail = f'({error})'
        return f'#error directive hit {detail}'

    def __init__(self, filename, argv, error, *args, **kwargs):
        # The only difference from the parent is that "error" is required.
        super().__init__(filename, argv, error, *args, **kwargs)
class MissingDependenciesError(PreprocessorFailure):
    """The preprocessor did not have access to all the target's dependencies."""

    @classmethod
    def _msg(cls, missing, **ignored):
        """Return the core message text, listing the missing names (if any)."""
        text = 'preprocessing failed due to missing dependencies'
        if not missing:
            return text
        names = ", ".join(missing)
        return f'{text} ({names})'

    def __init__(self, filename, missing=None, *args, **kwargs):
        # "missing" may be a string (comma/space separated) or an iterable.
        deps = _as_tuple(missing)
        self.missing = deps if deps else None
        # The remaining args are the parent's (starting with "argv").
        super().__init__(filename, *args, **kwargs)
class OSMismatchError(MissingDependenciesError):
    """The target is not compatible with the host OS."""

    @classmethod
    def _msg(cls, expected, **ignored):
        """Return the core message text, showing the host and expected OS."""
        wanted = expected if expected else "???"
        return f'OS is {OS} but expected {wanted}'

    def __init__(self, filename, expected=None, *args, **kwargs):
        if isinstance(expected, str):
            expected = expected.strip()

        self.actual = OS
        self.expected = expected or None
        # No missing dependencies are reported for an OS mismatch.
        super().__init__(filename, None, *args, **kwargs)

View file

@ -0,0 +1,123 @@
import os.path
import re
from . import common as _common
# The tool name passed along to _common.preprocess().
TOOL = 'gcc'
# https://gcc.gnu.org/onlinedocs/cpp/Preprocessor-Output.html
# A linemarker: '# <lineno> "<filename>"', optionally followed by flags
# (each one of 1-4).  The groups are the line number and the filename.
LINE_MARKER_RE = re.compile(r'^# (\d+) "([^"]+)"(?: [1234])*$')
# A line that starts with a "#" directive.  The group is the directive name.
PREPROC_DIRECTIVE_RE = re.compile(r'^\s*#\s*(\w+)\b.*')
# A double-underscore name followed by "((", such as "__attribute__((".
# The args may contain parenthesized parts, but only one level deep.
# The <closed> group is None if the closing "))" was not matched.
COMPILER_DIRECTIVE_RE = re.compile(r'''
^
(.*?) # <before>
(__\w+__) # <directive>
\s*
[(] [(]
(
[^()]*
(?:
[(]
[^()]*
[)]
[^()]*
)*
) # <args>
( [)] [)] )? # <closed>
''', re.VERBOSE)
# The command-line args given to _common.preprocess() as "postargs".
POST_ARGS = (
'-pthread',
'-std=c99',
#'-g',
#'-Og',
#'-Wno-unused-result',
#'-Wsign-compare',
#'-Wall',
#'-Wextra',
'-E',
)
def preprocess(filename, incldirs=None, macros=None, samefiles=None):
    """Preprocess the file with gcc and iterate over the resulting lines.

    "incldirs" and "macros" are passed through to _common.preprocess().
    "samefiles" is passed through to _iter_lines(), whose result is
    returned.
    """
    options = {
        'incldirs': incldirs,
        'macros': macros,
        #'preargs': PRE_ARGS,
        'postargs': POST_ARGS,
        'executable': ['gcc'],
        'compiler': 'unix',
    }
    output = _common.preprocess(TOOL, filename, **options)
    return _iter_lines(output, filename, samefiles)
def _iter_lines(text, filename, samefiles, *, raw=False):
    """Yield a SourceLine for each relevant line of the preprocessor output.

    Linemarkers are consumed to keep track of the current line number
    and originating file.  Lines are skipped unless they originate in
    a file accepted by _filter_orig_file().  "#pragma" lines are dropped
    (though still counted); any other directive raises Exception.

    Unless "raw" is true, compiler directives are stripped from each
    line before it is yielded.
    """
    # Build the lines and filter out directives.
    depth = 0  # of a compiler directive left open by an earlier line
    origfile = None
    for line in text.splitlines():
        marker = LINE_MARKER_RE.match(line)
        if marker:
            lno, origfile = marker.groups()
            lno = int(lno)
            continue
        if not _filter_orig_file(origfile, filename, samefiles):
            continue

        directive = PREPROC_DIRECTIVE_RE.match(line)
        if directive:
            name, = directive.groups()
            if name != 'pragma':
                raise Exception(line)
        else:
            if not raw:
                line, depth = _strip_directives(line, partial=depth)
            fileinfo = _common.FileInfo(filename, lno)
            yield _common.SourceLine(fileinfo, 'source', line or '', None)
        # Every line from a matching file counts, including pragmas.
        lno += 1
def _strip_directives(line, partial=0):
# We assume there are no string literals with parens in directive bodies.
while partial > 0:
if not (m := re.match(r'[^{}]*([()])', line)):
return None, partial
delim, = m.groups()
partial += 1 if delim == '(' else -1 # opened/closed
line = line[m.end():]
line = re.sub(r'__extension__', '', line)
while (m := COMPILER_DIRECTIVE_RE.match(line)):
before, _, _, closed = m.groups()
if closed:
line = f'{before} {line[m.end():]}'
else:
after, partial = _strip_directives(line[m.end():], 2)
line = f'{before} {after or ""}'
if partial:
break
return line, partial
def _filter_orig_file(origfile, current, samefiles):
if origfile == current:
return True
if origfile == '<stdin>':
return True
if os.path.isabs(origfile):
return False
for filename in samefiles or ():
if filename.endswith(os.path.sep):
filename += os.path.basename(current)
if origfile == filename:
return True
return False

View file

@ -0,0 +1,23 @@
from ..source import (
opened as _open_source,
)
from . import common as _common
def preprocess(lines, filename=None):
    """Yield a SourceLine for each of the given lines.

    If "lines" is a string then it is first opened as a source (see
    the "opened" helper imported as _open_source) and the resulting
    lines and filename are used instead.

    No actual preprocessing takes place (yet): every line is yielded
    unchanged, with kind "source" and no conditions.
    """
    if not isinstance(lines, str):
        # XXX actually preprocess...
        for lno, text in enumerate(lines, start=1):
            fileinfo = _common.FileInfo(filename, lno)
            yield _common.SourceLine(fileinfo, 'source', text, None)
        return

    with _open_source(lines, filename) as (lines, filename):
        yield from preprocess(lines, filename)

View file

@ -0,0 +1,64 @@
import contextlib
import os.path
def resolve(source, filename):
    """Return the normalized (source, filename) pair.

    If "source" looks like a filename then the result is whatever
    _resolve_filename() returns for the two values.  Otherwise a string
    source is split into lines and the filename (if any) is resolved.

    Raises TypeError if a (truthy) filename is not a string.
    """
    if _looks_like_filename(source):
        return _resolve_filename(source, filename)

    if isinstance(source, str):
        source = source.splitlines()

    # At this point "source" is not a str.
    if not filename:
        return source, None
    if not isinstance(filename, str):
        raise TypeError(f'filename should be str (or None), got {filename!r}')
    resolved, _ = _resolve_filename(filename)
    return source, resolved
@contextlib.contextmanager
def good_file(filename, alt=None):
    """Yield the resolved filename, reporting the file if it is missing.

    If the "with" block raises an Exception and the file does not exist
    then FileNotFoundError is raised instead.  Otherwise the original
    exception propagates.

    Raises ValueError if "filename" does not look like a filename.
    """
    if not _looks_like_filename(filename):
        raise ValueError(f'expected a filename, got {filename}')
    resolved = _resolve_filename(filename, alt)[0]
    try:
        yield resolved
    except Exception:
        if os.path.exists(resolved):
            raise  # re-raise
        raise FileNotFoundError(f'file not found: {resolved}')
def _looks_like_filename(value):
if not isinstance(value, str):
return False
return value.endswith(('.c', '.h'))
def _resolve_filename(filename, alt=None):
if os.path.isabs(filename):
...
# raise NotImplementedError
else:
filename = os.path.join('.', filename)
if not alt:
alt = filename
elif os.path.abspath(filename) == os.path.abspath(alt):
alt = filename
else:
raise ValueError(f'mismatch: {filename} != {alt}')
return filename, alt
@contextlib.contextmanager
def opened(source, filename=None):
    """Yield (lines, filename) for the source, opening it if necessary.

    The arguments are first normalized by resolve().  If the resulting
    source is a string then it is opened as a file, which is yielded
    (and closed once the "with" block is done).  Otherwise the source
    is yielded as-is.
    """
    source, filename = resolve(source, filename)
    if not isinstance(source, str):
        yield source, filename
        return
    with open(source) as infile:
        yield infile, filename