cpython/Tools/c-analyzer/c_parser/parser/_regexes.py
Eric Snow 345cd37abe
bpo-36876: Fix the C analyzer tool. (GH-22841)
The original tool wasn't working right and it was simpler to create a new one, partially re-using some of the old code. At this point the tool runs properly on the master. (Try: ./python Tools/c-analyzer/c-analyzer.py analyze.)  It takes ~40 seconds on my machine to analyze the full CPython code base.

Note that we'll need to iron out some OS-specific stuff (e.g. preprocessor). We're okay though since this tool isn't used yet in our workflow. We will also need to verify the analysis results in detail before activating the check in CI, though I'm pretty sure it's close.

https://bugs.python.org/issue36876
2020-10-22 18:42:51 -06:00

796 lines
19 KiB
Python

# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing. However, a number of them
# have capturing markers compatible with utils.set_capture_groups().
import textwrap
def _ind(text, level=1, edges='both'):
    """Indent *text* for embedding inside a larger (verbose) pattern.

    Every non-blank line of *text* is prefixed with ``level`` indent
    units (via ``textwrap.indent``).  *edges* then controls how the
    ends of the result are normalized:

    * ``'pre'``  -- strip leading whitespace and start the text on a
      fresh line at the full indent.
    * ``'post'`` -- strip trailing whitespace and finish with a newline
      followed by one indent level less than *level*.
    * ``'both'`` -- apply both of the above (the default).
    * any other value -- leave both ends untouched.

    Returns the adjusted string.
    """
    # NOTE(review): the indent unit is a single space here; confirm that
    # this is the intended width.
    indent = ' ' * level
    text = textwrap.indent(text, indent)
    if edges in ('pre', 'both'):
        text = '\n' + indent + text.lstrip()
    if edges in ('post', 'both'):
        # The closing line sits one level shallower than the body.
        text = text.rstrip() + '\n' + ' ' * (level - 1)
    return text
#######################################
# general
# A single hexadecimal digit.  Only 0-9, a-f and A-F are accepted; a
# wider letter range would also match non-hex characters like "g" or "Z".
HEX = r'(?: [0-9a-fA-F] )'
# A C character literal or a C string literal (verbose-mode pattern).
#
# Character literal alternatives, in order:
#  * a single character other than a single quote
#  * a backslash followed by any one character
#  * "\x" followed by exactly two HEX digits
#  * "\0" followed by two digits
#  * "\o" followed by a three-digit value in the range 000-255
# String literal: a double-quoted run in which a backslash escapes the
# character that follows it.
STRING_LITERAL = textwrap.dedent(rf'''
(?:
# character literal
(?:
['] [^'] [']
|
['] \\ . [']
|
['] \\x{HEX}{HEX} [']
|
['] \\0\d\d [']
|
(?:
['] \\o[01]\d\d [']
|
['] \\o2[0-4]\d [']
|
['] \\o25[0-5] [']
)
)
|
# string literal
(?:
["] (?: [^"\\]* \\ . )* [^"\\]* ["]
)
# end string literal
)
''')
# The reserved words that must never be treated as identifiers.
_reserved = (
    'auto', 'extern', 'register', 'static', 'typedef',
    'const', 'volatile',
    'signed', 'unsigned', 'char', 'short', 'int', 'long',
    'float', 'double', 'void',
    'struct', 'union', 'enum',
    'goto', 'return', 'sizeof', 'break', 'continue',
    'if', 'else', 'for', 'do', 'while', 'switch', 'case', 'default',
    'entry',
)
# First lay the alternation out one word per line; this multi-line form
# is only used for embedding in the public KEYWORD pattern.
_KEYWORD = '\n'.join([
    '',
    '(?:',
    r'\b',
    '(?:',
    ' |\n'.join(_reserved),
    ')',
    r'\b',
    ')',
    '',
])
# Verbose-mode pattern matching any one reserved word as a whole word.
KEYWORD = '\n'.join(['', '# keyword', _KEYWORD, '# end keyword', ''])
# From here on _KEYWORD is the same alternation with all whitespace
# removed, which keeps the lookaheads below compact.
_KEYWORD = r'(?:\b(?:' + '|'.join(_reserved) + r')\b)'
del _reserved
# Any C identifier (keywords included).
IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# An identifier that is not a keyword: the negative lookahead rejects
# any position where a whole reserved word starts.
STRICT_IDENTIFIER = ' '.join([
    '(?:', '(?!', _KEYWORD, ')', r'\b', IDENTIFIER, r'\b', ')',
])
# Same as STRICT_IDENTIFIER, but also allowing a "-<digits>" suffix.
ANON_IDENTIFIER = ' '.join([
    '(?:', '(?!', _KEYWORD, ')', r'\b', IDENTIFIER,
    '(?:', '-', r'\d+', ')?', r'\b', ')',
])
#######################################
# types
# A built-in (non-compound) type name, matched as whole words.  It is one of:
#  * "void"
#  * a bare "signed" or "unsigned" (the pattern's own note: implies int)
#  * an optional "signed"/"unsigned", then an optional "long"/"short",
#    then one of char, short, int, long, float, double
# Only a single "long"/"short" prefix is allowed before the base type.
SIMPLE_TYPE = textwrap.dedent(rf'''
# simple type
(?:
\b
(?:
void
|
(?: signed | unsigned ) # implies int
|
(?:
(?: (?: signed | unsigned ) \s+ )?
(?: (?: long | short ) \s+ )?
(?: char | short | int | long | float | double )
)
)
\b
)
# end simple type
''')
# The keyword that introduces a compound type: struct, union or enum.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
#######################################
# variable declarations
# One storage-class specifier, matched as a whole word.
STORAGE_CLASS = (
    r'(?: \b (?: '
    + ' | '.join(['auto', 'register', 'static', 'extern'])
    + r' ) \b )'
)
# One type qualifier, matched as a whole word.
TYPE_QUALIFIER = (
    r'(?: \b (?: '
    + ' | '.join(['const', 'volatile'])
    + r' ) \b )'
)
# A pointer star, optionally followed by a type qualifier.
PTR_QUALIFIER = ' '.join([
    '(?:', '[*]', '(?:', r'\s*', TYPE_QUALIFIER, ')?', ')',
])
# A type specifier.  The alternatives are tried in this order:
#  1. a SIMPLE_TYPE
#  2. a "typeof(...)" form ("typeof" may carry leading/trailing
#     underscores) wrapping optional "*"/"&" characters and a single
#     STRICT_IDENTIFIER
#  3. a compound-type reference: struct/union/enum, optionally followed
#     by an ANON_IDENTIFIER
#  4. a bare STRICT_IDENTIFIER (a reference to a typedef)
TYPE_SPEC = textwrap.dedent(rf'''
# type spec
(?:
{_ind(SIMPLE_TYPE, 2)}
|
(?:
[_]*typeof[_]*
\s* [(]
(?: \s* [*&] )*
\s* {STRICT_IDENTIFIER}
\s* [)]
)
|
# reference to a compound type
(?:
{COMPOUND_TYPE_KIND}
(?: \s* {ANON_IDENTIFIER} )?
)
|
# reference to a typedef
{STRICT_IDENTIFIER}
)
# end type spec
''')
# A declarator: zero or more PTR_QUALIFIERs followed by one of
#  * an identifier with optional "[...]" array suffixes  (<IDENTIFIER>)
#  * the same wrapped in one pair of parentheses  (<WRAPPED_IDENTIFIER>)
#  * a function pointer: "(" optional PTR_QUALIFIER, identifier, optional
#    array suffixes, ")" and then a parenthesized parameter list
#    (<FUNC_IDENTIFIER>); the parameter list may contain one level of
#    nested parentheses.
# The "# <NAME>" comments inside the pattern are capture-group markers
# (see the module header; presumably consumed by
# utils.set_capture_groups()).
DECLARATOR = textwrap.dedent(rf'''
# declarator (possibly abstract)
(?:
(?: {PTR_QUALIFIER} \s* )*
(?:
(?:
(?: # <IDENTIFIER>
{STRICT_IDENTIFIER}
)
(?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
)
|
(?:
[(] \s*
(?: # <WRAPPED_IDENTIFIER>
{STRICT_IDENTIFIER}
)
(?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
\s* [)]
)
|
# func ptr
(?:
[(] (?: \s* {PTR_QUALIFIER} )? \s*
(?: # <FUNC_IDENTIFIER>
{STRICT_IDENTIFIER}
)
(?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
\s* [)]
# We allow for a single level of paren nesting in parameters.
\s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
)
)
)
# end declarator
''')
# The leading part of a variable declaration (per the pattern's own note
# it is also used for typedefs and function return types):
#   [STORAGE_CLASS] [TYPE_QUALIFIER] TYPE_SPEC DECLARATOR
# Each piece sits in its own marked group (<STORAGE>, <TYPE_QUAL>,
# <TYPE_SPEC>, <DECLARATOR>).  No initializer or terminator is included.
VAR_DECL = textwrap.dedent(rf'''
# var decl (and typedef and func return type)
(?:
(?:
(?: # <STORAGE>
{STORAGE_CLASS}
)
\s*
)?
(?:
(?: # <TYPE_QUAL>
{TYPE_QUALIFIER}
)
\s*
)?
(?:
(?: # <TYPE_SPEC>
{_ind(TYPE_SPEC, 4)}
)
)
\s*
(?:
(?: # <DECLARATOR>
{_ind(DECLARATOR, 4)}
)
)
)
# end var decl
''')
# The right-hand side of an initialization.  An optional leading
# parenthesized group (no nested parentheses) is followed by one of:
#  * one or more adjacent string/character literals
#  * a "simple" initializer: text free of quotes, commas, semicolons and
#    "{", interleaved with string literals
#  * a brace-delimited struct/array literal, which must be followed by
#    ";" (checked with a lookahead, so the ";" is not consumed)
# Doubled braces ("{{", "}}") are f-string escapes for literal braces.
INITIALIZER = textwrap.dedent(rf'''
# initializer
(?:
(?:
[(]
# no nested parens (e.g. func ptr)
[^)]*
[)]
\s*
)?
(?:
# a string literal
(?:
(?: {_ind(STRING_LITERAL, 4)} \s* )*
{_ind(STRING_LITERAL, 4)}
)
|
# a simple initializer
(?:
(?:
[^'",;{{]*
{_ind(STRING_LITERAL, 4)}
)*
[^'",;{{]*
)
|
# a struct/array literal
(?:
# We only expect compound initializers with
# single-variable declarations.
{{
(?:
[^'";]*?
{_ind(STRING_LITERAL, 5)}
)*
[^'";]*?
}}
(?= \s* ; ) # Note this lookahead.
)
)
)
# end initializer
''')
#######################################
# compound type declarations
# One item inside a struct/union body.  It matches one of:
#  * the opening of an inline compound type: struct/union/enum, an
#    optional name, then "{"  (<COMPOUND_TYPE_KIND>, <COMPOUND_TYPE_NAME>)
#  * a member: optional qualifier + TYPE_SPEC (<SPECIFIER_QUALIFIER>),
#    an optional DECLARATOR, an optional ": <digits>" size (<SIZE>),
#    and a terminating "," or ";"  (<ENDING>)
#  * the closing "}"  (<CLOSE>)
# Doubled braces ("{{", "}}") are f-string escapes for literal braces.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
(?:
# inline compound type decl
(?:
(?: # <COMPOUND_TYPE_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <COMPOUND_TYPE_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
(?:
# typed member
(?:
# Technically it doesn't have to have a type...
(?: # <SPECIFIER_QUALIFIER>
(?: {TYPE_QUALIFIER} \s* )?
{_ind(TYPE_SPEC, 5)}
)
(?:
# If it doesn't have a declarator then it will have
# a size and vice versa.
\s*
(?: # <DECLARATOR>
{_ind(DECLARATOR, 6)}
)
)?
)
# sized member
(?:
\s* [:] \s*
(?: # <SIZE>
\d+
)
)?
\s*
(?: # <ENDING>
[,;]
)
)
|
(?:
\s*
(?: # <CLOSE>
}}
)
)
)
''')
# One item inside an enum body.  It matches either
#  * the closing "}"  (<CLOSE>), or
#  * a member: a name (<NAME>, a plain IDENTIFIER rather than a
#    STRICT_IDENTIFIER), an optional "= value" (<INIT>: a string/char
#    literal or a run free of quotes, commas and "}"), and a
#    terminating "," or "}"  (<ENDING>).
# Doubled braces ("}}") are f-string escapes for a literal "}".
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
(?:
(?:
\s*
(?: # <CLOSE>
}}
)
)
|
(?:
\s*
(?: # <NAME>
{IDENTIFIER}
)
(?:
\s* = \s*
(?: # <INIT>
{_ind(STRING_LITERAL, 4)}
|
[^'",}}]+
)
)?
\s*
(?: # <ENDING>
, | }}
)
)
)
''')
#######################################
# statements
# The text of a simple statement, excluding its terminator: any run of
# characters other than quotes, braces and semicolons, interleaved with
# string/character literals (so those characters may appear inside
# literals).  The pattern can match the empty string.  The trailing
# lookahead inside the pattern is commented out and so is inactive.
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
# simple statement body
(?:
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 3)}
)*
[^'"{{}};]*
#(?= [;{{] ) # Note this lookahead.
)
# end simple statement body
''')
# A complete simple statement, terminated by ";" (<SIMPLE_ENDING>).
# The statement text (<SIMPLE_STMT>) is tried against, in order:
#  1. "return" with an optional INITIALIZER-shaped value
#  2. a variable assignment: optional leading "*", a chain of
#     identifiers joined by "." or "->", an optional "[<digits>]"
#     index, "=", then an INITIALIZER
#  3. a catch-all "return ..." containing anything but ";" outside of
#     string literals
#  4. a generic SIMPLE_STMT_BODY
# In the assignment branch the member-access dot is written as "[.]" so
# that it matches only a literal "."; a bare "." would match any
# character between two identifiers.
SIMPLE_STMT = textwrap.dedent(rf'''
# simple statement
(?:
(?: # <SIMPLE_STMT>
# stmt-inline "initializer"
(?:
return \b
(?:
\s*
{_ind(INITIALIZER, 5)}
)?
)
|
# variable assignment
(?:
(?: [*] \s* )?
(?:
{STRICT_IDENTIFIER} \s*
(?: [.] | -> ) \s*
)*
{STRICT_IDENTIFIER}
(?: \s* \[ \s* \d+ \s* \] )?
\s* = \s*
{_ind(INITIALIZER, 4)}
)
|
# catchall return statement
(?:
return \b
(?:
(?:
[^'";]*
{_ind(STRING_LITERAL, 6)}
)*
\s* [^'";]*
)?
)
|
# simple statement
(?:
{_ind(SIMPLE_STMT_BODY, 4)}
)
)
\s*
(?: # <SIMPLE_ENDING>
;
)
)
# end simple statement
''')
# The head of a compound statement (the body is not matched).  Starting
# at a word boundary it matches one of:
#  * "else" or "do"  (<COMPOUND_BARE>)
#  * a label followed by ":" -- "case <expr>", "default", or an
#    identifier  (<COMPOUND_LABELED>)
#  * "for", "while", "if" or "switch", which must be followed by "("
#    (checked with a lookahead, so the "(" is not consumed)
#    (<COMPOUND_PAREN>)
# Trailing whitespace after the head is consumed.
COMPOUND_STMT = textwrap.dedent(rf'''
# compound statement
(?:
\b
(?:
(?:
(?: # <COMPOUND_BARE>
else | do
)
\b
)
|
(?:
(?: # <COMPOUND_LABELED>
(?:
case \b
(?:
[^'":]*
{_ind(STRING_LITERAL, 7)}
)*
\s* [^'":]*
)
|
default
|
{STRICT_IDENTIFIER}
)
\s* [:]
)
|
(?:
(?: # <COMPOUND_PAREN>
for | while | if | switch
)
\s* (?= [(] ) # Note this lookahead.
)
)
\s*
)
# end compound statement
''')
#######################################
# function bodies
# One syntactic item inside a function body.  The alternatives are tried
# in this order:
#  1. an empty statement ";"  (<EMPTY>)
#  2. the opening of an inline struct/union/enum definition, with
#     optional leading text, storage class and qualifier
#     (<INLINE_LEADING>, <INLINE_PRE>, <INLINE_KIND>, <INLINE_NAME>)
#  3. a variable declaration: optional storage class, VAR_DECL, an
#     optional "= INITIALIZER", ending in "," or ";"
#     (<STORAGE>, <VAR_DECL>, <VAR_INIT>, <VAR_ENDING>)
#  4. the head of a compound statement (COMPOUND_STMT)
#  5. the start of a block: optional leading text followed by "{"
#     (<BLOCK_LEADING>, <BLOCK_OPEN>)
#  6. a simple statement (SIMPLE_STMT)
#  7. the end of a block "}"  (<BLOCK_CLOSE>)
# Doubled braces ("{{", "}}") are f-string escapes for literal braces.
LOCAL = textwrap.dedent(rf'''
(?:
# an empty statement
(?: # <EMPTY>
;
)
|
# inline type decl
(?:
(?:
(?: # <INLINE_LEADING>
[^;{{}}]+?
)
\s*
)?
(?: # <INLINE_PRE>
(?: {STORAGE_CLASS} \s* )?
(?: {TYPE_QUALIFIER} \s* )?
)? # </INLINE_PRE>
(?: # <INLINE_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <INLINE_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
# var decl
(?:
(?: # <STORAGE>
{STORAGE_CLASS}
)? # </STORAGE>
(?:
\s*
(?: # <VAR_DECL>
{_ind(VAR_DECL, 5)}
)
)
(?:
(?:
# initializer
# We expect only basic initializers.
\s* = \s*
(?: # <VAR_INIT>
{_ind(INITIALIZER, 6)}
)
)?
(?:
\s*
(?: # <VAR_ENDING>
[,;]
)
)
)
)
|
{_ind(COMPOUND_STMT, 2)}
|
# start-of-block
(?:
(?: # <BLOCK_LEADING>
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 5)}
)*
[^'"{{}};]*
# Presumably we will not see "== {{".
[^\s='"{{}});]
\s*
)? # </BLOCK_LEADING>
(?: # <BLOCK_OPEN>
{{
)
)
|
{_ind(SIMPLE_STMT, 2)}
|
# end-of-block
(?: # <BLOCK_CLOSE>
}}
)
)
''')
# A reduced pattern for function bodies that only singles out "static"
# variable declarations.  The alternatives are tried in this order:
#  1. the opening of an inline struct/union/enum definition
#     (<INLINE_LEADING>, <INLINE_PRE>, <INLINE_KIND>, <INLINE_NAME>)
#  2. a static variable declaration: "static", optional qualifier,
#     TYPE_SPEC and DECLARATOR (<STATIC_DECL>), followed either by
#     "= INITIALIZER" ending in ",", ";" or "{" (<STATIC_INIT>) or by a
#     bare "," or ";"  (<STATIC_ENDING>)
#  3. everything else: any text (string literals allowed) up to the
#     next "{", "}" or ";"
#     (<DELIM_LEADING>, <BLOCK_OPEN>, <BLOCK_CLOSE>, <STMT_END>)
# Doubled braces ("{{", "}}") are f-string escapes for literal braces.
LOCAL_STATICS = textwrap.dedent(rf'''
(?:
# inline type decl
(?:
(?:
(?: # <INLINE_LEADING>
[^;{{}}]+?
)
\s*
)?
(?: # <INLINE_PRE>
(?: {STORAGE_CLASS} \s* )?
(?: {TYPE_QUALIFIER} \s* )?
)?
(?: # <INLINE_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <INLINE_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
# var decl
(?:
# We only look for static variables.
(?: # <STATIC_DECL>
static \b
(?: \s* {TYPE_QUALIFIER} )?
\s* {_ind(TYPE_SPEC, 4)}
\s* {_ind(DECLARATOR, 4)}
)
\s*
(?:
(?: # <STATIC_INIT>
= \s*
{_ind(INITIALIZER, 4)}
\s*
[,;{{]
)
|
(?: # <STATIC_ENDING>
[,;]
)
)
)
|
# everything else
(?:
(?: # <DELIM_LEADING>
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 4)}
)*
\s* [^'"{{}};]*
)
(?:
(?: # <BLOCK_OPEN>
{{
)
|
(?: # <BLOCK_CLOSE>
}}
)
|
(?: # <STMT_END>
;
)
)
)
)
''')
#######################################
# global declarations
# One top-level (file-scope) item.  The alternatives are tried in this
# order:
#  1. an empty statement ";"  (<EMPTY>)
#  2. the opening of a struct/union/enum definition, with optional
#     leading text  (<COMPOUND_LEADING>, <COMPOUND_KIND>, <COMPOUND_NAME>)
#  3. a compound-type keyword plus name followed by text up to one of
#     "=", ",", ";", "(" or "{" -- described in the pattern as a "bogus
#     inline decl artifact"
#     (<FORWARD_KIND>, <FORWARD_NAME>, <MAYBE_INLINE_ACTUAL>)
#  4. a typedef: "typedef", VAR_DECL, an optional parenthesized
#     parameter list, then ";"  (<TYPEDEF_DECL>, <TYPEDEF_FUNC_PARAMS>)
#  5. a function declaration/definition or a variable declaration:
#     optional storage class (<VAR_STORAGE>), optional "inline"
#     (<FUNC_INLINE>), VAR_DECL (<VAR_DECL>), and then either
#       - a parameter list followed by "{" or ";"
#         (<FUNC_PARAMS>, <FUNC_DELIM>), or
#       - an old-style definition: bare parameter names in parentheses,
#         one or more "type declarator;" lines (<FUNC_LEGACY_PARAMS>),
#         then "{", or
#       - an optional "= INITIALIZER" ending in "," or ";"
#         (<VAR_INIT>, <VAR_ENDING>)
# Doubled braces ("{{", "}}") are f-string escapes for literal braces.
GLOBAL = textwrap.dedent(rf'''
(?:
# an empty statement
(?: # <EMPTY>
;
)
|
# compound type decl (maybe inline)
(?:
(?:
(?: # <COMPOUND_LEADING>
[^;{{}}]+?
)
\s*
)?
(?: # <COMPOUND_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <COMPOUND_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
# bogus inline decl artifact
# This simplifies resolving the relative syntactic ambiguity of
# inline structs.
(?:
(?: # <FORWARD_KIND>
{COMPOUND_TYPE_KIND}
)
\s*
(?: # <FORWARD_NAME>
{ANON_IDENTIFIER}
)
(?: # <MAYBE_INLINE_ACTUAL>
[^=,;({{[*\]]*
[=,;({{]
)
)
|
# typedef
(?:
\b typedef \b \s*
(?: # <TYPEDEF_DECL>
{_ind(VAR_DECL, 4)}
)
(?:
# We expect no inline type definitions in the parameters.
\s* [(] \s*
(?: # <TYPEDEF_FUNC_PARAMS>
[^{{;]*
)
\s* [)]
)?
\s* ;
)
|
# func decl/definition & var decls
# XXX dedicated pattern for funcs (more restricted)?
(?:
(?:
(?: # <VAR_STORAGE>
{STORAGE_CLASS}
)
\s*
)?
(?:
(?: # <FUNC_INLINE>
\b inline \b
)
\s*
)?
(?: # <VAR_DECL>
{_ind(VAR_DECL, 4)}
)
(?:
# func decl / definition
(?:
(?:
# We expect no inline type definitions in the parameters.
\s* [(] \s*
(?: # <FUNC_PARAMS>
[^{{;]*
)
\s* [)] \s*
(?: # <FUNC_DELIM>
[{{;]
)
)
|
(?:
# This is some old-school syntax!
\s* [(] \s*
# We throw away the bare names:
{STRICT_IDENTIFIER}
(?: \s* , \s* {STRICT_IDENTIFIER} )*
\s* [)] \s*
# We keep the trailing param declarations:
(?: # <FUNC_LEGACY_PARAMS>
# There's at least one!
(?: {TYPE_QUALIFIER} \s* )?
{_ind(TYPE_SPEC, 7)}
\s*
{_ind(DECLARATOR, 7)}
\s* ;
(?:
\s*
(?: {TYPE_QUALIFIER} \s* )?
{_ind(TYPE_SPEC, 8)}
\s*
{_ind(DECLARATOR, 8)}
\s* ;
)*
)
\s* {{
)
)
|
# var / typedef
(?:
(?:
# initializer
# We expect only basic initializers.
\s* = \s*
(?: # <VAR_INIT>
{_ind(INITIALIZER, 6)}
)
)?
\s*
(?: # <VAR_ENDING>
[,;]
)
)
)
)
)
''')