mirror of
				https://github.com/python/cpython.git
				synced 2025-10-25 15:58:57 +00:00 
			
		
		
		
	 e4561e0501
			
		
	
	
		e4561e0501
		
			
		
	
	
	
	
		
			
			This replaces the old `TIER_{ONE,TWO}_ONLY` macros. Note that `specialized` implies `tier1`.
Co-authored-by: Alex Waygood <Alex.Waygood@Gmail.com>
		
	
			
		
			
				
	
	
		
			375 lines
		
	
	
	
		
			8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			375 lines
		
	
	
	
		
			8 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # Parser for C code
 | |
| # Originally by Mark Shannon (mark@hotpy.org)
 | |
| # https://gist.github.com/markshannon/db7ab649440b5af765451bb77c7dba34
 | |
| 
 | |
| import re
 | |
| from dataclasses import dataclass
 | |
| from collections.abc import Iterator
 | |
| 
 | |
| 
 | |
def choice(*opts: str) -> str:
    """Combine regex fragments into one alternation.

    Each fragment is wrapped in its own parenthesised group and the groups
    are joined with ``|``, preserving the order in which they were passed.
    """
    return "|".join(f"({fragment})" for fragment in opts)
 | |
| 
 | |
| 
 | |
# Regexes
#
# Every ALL-CAPS name in this section is first bound to the regex source for
# one operator or delimiter.  The names are then harvested into `operators`
# (see the end of this section) and rebound to their own name, which is what
# the tokenizer uses as the token kind.

# Longer operators must go before shorter ones.
# (`operators` keeps definition order and its values are spliced, in that
# order, into the alternation compiled as `matcher`; regex alternation takes
# the first alternative that matches.)

PLUSPLUS = r"\+\+"
MINUSMINUS = r"--"

# ->
ARROW = r"->"
ELLIPSIS = r"\.\.\."

# Assignment operators
TIMESEQUAL = r"\*="
DIVEQUAL = r"/="
MODEQUAL = r"%="
PLUSEQUAL = r"\+="
MINUSEQUAL = r"-="
LSHIFTEQUAL = r"<<="
RSHIFTEQUAL = r">>="
ANDEQUAL = r"&="
OREQUAL = r"\|="
XOREQUAL = r"\^="

# Operators
PLUS = r"\+"
MINUS = r"-"
TIMES = r"\*"
DIVIDE = r"/"
MOD = r"%"
NOT = r"~"
XOR = r"\^"
LOR = r"\|\|"
LAND = r"&&"
LSHIFT = r"<<"
RSHIFT = r">>"
LE = r"<="
GE = r">="
EQ = r"=="
NE = r"!="
LT = r"<"
GT = r">"
LNOT = r"!"
OR = r"\|"
AND = r"&"
EQUALS = r"="

# ?
CONDOP = r"\?"

# Delimiters
LPAREN = r"\("
RPAREN = r"\)"
LBRACKET = r"\["
RBRACKET = r"\]"
LBRACE = r"\{"
RBRACE = r"\}"
COMMA = r","
PERIOD = r"\."
SEMI = r";"
COLON = r":"
BACKSLASH = r"\\"

# Collect every module global whose name is entirely upper-case, mapping
# name -> regex source.  This relies on no other ALL-CAPS global having been
# defined yet, so it must stay ahead of the token-kind constants below.
operators = {op: pattern for op, pattern in globals().items() if op == op.upper()}
# Rebind each operator name to its own name (e.g. PLUS == "PLUS") so that the
# names can be used as token kinds from here on.
for op in operators:
    globals()[op] = op
# Map the literal operator text to its kind.  The text is recovered by
# stripping the regex backslashes; BACKSLASH strips down to the empty string,
# so it falls back to a single backslash.
opmap = {pattern.replace("\\", "") or "\\": op for op, pattern in operators.items()}
 | |
| 
 | |
# Macros
# Only the `#` and the directive name are matched, not the rest of the line.
# Longer directive names are listed before their prefixes ("ifdef"/"ifndef"
# before "if") because regex alternation takes the first match.
macro = r"# *(ifdef|ifndef|undef|define|error|endif|if|else|include|#)"
CMACRO = "CMACRO"

id_re = r"[a-zA-Z_][0-9a-zA-Z_]*"
IDENTIFIER = "IDENTIFIER"


# Integer literals.  `suffix` is an optional `u`/`U` followed by up to two
# `l`/`L`; it applies to all three integer forms.
suffix = r"([uU]?[lL]?[lL]?)"
octal = r"0[0-7]+" + suffix
# The suffix was previously missing here, so e.g. `0xFFu` was split into the
# number `0xFF` followed by an identifier `u`.
hex = r"0[xX][0-9a-fA-F]+" + suffix
decimal_digits = r"(0|[1-9][0-9]*)"
decimal = decimal_digits + suffix


# Floating-point literals: either a fraction with an optional exponent, or
# plain digits with a mandatory exponent, followed by an optional type suffix.
exponent = r"""([eE][-+]?[0-9]+)"""
fraction = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
float = "((((" + fraction + ")" + exponent + "?)|([0-9]+" + exponent + "))[FfLl]?)"
 | |
| 
 | |
# A numeric literal is any one of the integer or floating-point forms.
# `choice` joins them into a regex alternation, so they are tried in the
# order given here.
number_re = choice(octal, hex, float, decimal)
NUMBER = "NUMBER"
 | |
| 
 | |
# Escape sequences: a backslash followed by a single escape character, a run
# of decimal digits, or `x` plus hex digits.
simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
decimal_escape = r"""(\d+)"""
hex_escape = r"""(x[0-9a-fA-F]+)"""
escape_sequence = (
    r"""(\\(""" + simple_escape + "|" + decimal_escape + "|" + hex_escape + "))"
)
# A string is a double-quoted run of escape sequences and of characters that
# are not a quote, a backslash or a newline.
string_char = r"""([^"\\\n]|""" + escape_sequence + ")"
str_re = '"' + string_char + '*"'
STRING = "STRING"
# A character constant holds exactly one plain character or one escape
# sequence, so that '\n', '\'' and '\\' are each lexed as a single token.
char = r"'([^'\\\n]|" + escape_sequence + r")'"
CHARACTER = "CHARACTER"

# Either a `//` comment running to the end of the line, or a `/* ... */`
# block.  Inside a block a `*` is only consumed when it is not followed by
# `/`; this makes the block end at the first `*/` and lets closers preceded
# by extra stars (e.g. `/***/` or `/* text **/`) match.
comment_re = r"(//.*)|/\*([^*]|\*(?!/))*\*/"
COMMENT = "COMMENT"

newline = r"\n"
invalid = (
    r"\S"  # A single non-space character that's not caught by any of the other patterns
)
 | |
# Master pattern used by tokenize().  At each position the alternatives are
# tried in the order listed, so:
#   * comment_re comes before the operator patterns, otherwise `//` and `/*`
#     would be taken as division/multiplication operators;
#   * the operator patterns keep their definition order (longer first);
#   * `invalid` comes last and catches any other non-space character so that
#     tokenize() can report it instead of silently skipping it.
matcher = re.compile(
    choice(
        id_re,
        number_re,
        str_re,
        char,
        newline,
        macro,
        comment_re,
        *operators.values(),
        invalid,
    )
)
# Matches a leading identifier character; tokenize() uses it to classify a
# match as an identifier once keywords and annotations have been ruled out.
letter = re.compile(r"[a-zA-Z_]")
 | |
| 
 | |
| 
 | |
# Keyword token kinds.  The value of each constant is its own name.
AUTO = "AUTO"
BREAK = "BREAK"
CASE = "CASE"
CHAR = "CHAR"
CONST = "CONST"
CONTINUE = "CONTINUE"
DEFAULT = "DEFAULT"
DO = "DO"
DOUBLE = "DOUBLE"
ELSE = "ELSE"
ENUM = "ENUM"
EXTERN = "EXTERN"
FLOAT = "FLOAT"
FOR = "FOR"
GOTO = "GOTO"
IF = "IF"
INLINE = "INLINE"
INT = "INT"
LONG = "LONG"
OFFSETOF = "OFFSETOF"
RESTRICT = "RESTRICT"
RETURN = "RETURN"
SHORT = "SHORT"
SIGNED = "SIGNED"
SIZEOF = "SIZEOF"
STATIC = "STATIC"
STRUCT = "STRUCT"
SWITCH = "SWITCH"
TYPEDEF = "TYPEDEF"
UNION = "UNION"
UNSIGNED = "UNSIGNED"
VOID = "VOID"
VOLATILE = "VOLATILE"
WHILE = "WHILE"
# An instruction in the DSL
INST = "INST"
# A micro-op in the DSL
OP = "OP"
# A macro in the DSL
MACRO = "MACRO"

# All keyword kinds, in definition order.
kwds = [
    AUTO,
    BREAK,
    CASE,
    CHAR,
    CONST,
    CONTINUE,
    DEFAULT,
    DO,
    DOUBLE,
    ELSE,
    ENUM,
    EXTERN,
    FLOAT,
    FOR,
    GOTO,
    IF,
    INLINE,
    INT,
    LONG,
    OFFSETOF,
    RESTRICT,
    RETURN,
    SHORT,
    SIGNED,
    SIZEOF,
    STATIC,
    STRUCT,
    SWITCH,
    TYPEDEF,
    UNION,
    UNSIGNED,
    VOID,
    VOLATILE,
    WHILE,
    INST,
    OP,
    MACRO,
]
# Source spelling (lower case) -> token kind.
keywords = {kind.lower(): kind for kind in kwds}
 | |
| 
 | |
# Token kind given to every word listed in `annotations`.
ANNOTATION = "ANNOTATION"
# Words that tokenize() reports as ANNOTATION rather than IDENTIFIER.
annotations = {
    "specializing",
    "override",
    "register",
    "replaced",
    "pure",
    "split",
    "replicate",
    "tier1",
    "tier2",
}

# `from lexer import *` exports the keyword kind names only.
__all__ = list(kwds)
 | |
| 
 | |
| 
 | |
| def make_syntax_error(
 | |
|     message: str,
 | |
|     filename: str | None,
 | |
|     line: int,
 | |
|     column: int,
 | |
|     line_text: str,
 | |
| ) -> SyntaxError:
 | |
|     return SyntaxError(message, (filename, line, column, line_text))
 | |
| 
 | |
| 
 | |
| @dataclass(slots=True)
 | |
| class Token:
 | |
|     filename: str
 | |
|     kind: str
 | |
|     text: str
 | |
|     begin: tuple[int, int]
 | |
|     end: tuple[int, int]
 | |
| 
 | |
|     @property
 | |
|     def line(self) -> int:
 | |
|         return self.begin[0]
 | |
| 
 | |
|     @property
 | |
|     def column(self) -> int:
 | |
|         return self.begin[1]
 | |
| 
 | |
|     @property
 | |
|     def end_line(self) -> int:
 | |
|         return self.end[0]
 | |
| 
 | |
|     @property
 | |
|     def end_column(self) -> int:
 | |
|         return self.end[1]
 | |
| 
 | |
|     @property
 | |
|     def width(self) -> int:
 | |
|         return self.end[1] - self.begin[1]
 | |
| 
 | |
|     def replaceText(self, txt: str) -> "Token":
 | |
|         assert isinstance(txt, str)
 | |
|         return Token(self.filename, self.kind, txt, self.begin, self.end)
 | |
| 
 | |
|     def __repr__(self) -> str:
 | |
|         b0, b1 = self.begin
 | |
|         e0, e1 = self.end
 | |
|         if b0 == e0:
 | |
|             return f"{self.kind}({self.text!r}, {b0}:{b1}:{e1})"
 | |
|         else:
 | |
|             return f"{self.kind}({self.text!r}, {b0}:{b1}, {e0}:{e1})"
 | |
| 
 | |
| 
 | |
def tokenize(src: str, line: int = 1, filename: str = "") -> Iterator[Token]:
    """Yield the tokens found in ``src``.

    ``line`` is the line number given to the first line of ``src`` and
    ``filename`` is recorded on every token.  Newlines are tracked to keep
    line and column numbers up to date but are not yielded; other whitespace
    is not matched by the token pattern and is skipped.

    Raises SyntaxError for a character that matches no known token kind.
    """
    # Index in ``src`` of the newline that ends the previous line (-1 before
    # the first line), so that ``start - linestart`` is a 1-based column.
    linestart = -1
    for m in matcher.finditer(src):
        start = m.start()
        text = m.group(0)
        if text in keywords:
            kind = keywords[text]
        elif text in annotations:
            kind = ANNOTATION
        elif letter.match(text):
            kind = IDENTIFIER
        elif text == "...":
            kind = ELLIPSIS
        elif text == ".":
            kind = PERIOD
        elif text[0] in "0123456789.":
            kind = NUMBER
        elif text[0] == '"':
            kind = STRING
        elif text in opmap:
            kind = opmap[text]
        elif text == "\n":
            linestart = start
            line += 1
            kind = "\n"
        elif text[0] == "'":
            kind = CHARACTER
        elif text[0] == "#":
            kind = CMACRO
        # A lone "/" is in opmap and was handled above, so text[1] exists here.
        elif text[0] == "/" and text[1] in "/*":
            kind = COMMENT
        else:
            lineend = src.find("\n", start)
            if lineend == -1:
                lineend = len(src)
            # Report the same 1-based column a Token at this position would
            # get, and the offending line without the preceding newline.
            raise make_syntax_error(
                f"Bad token: {text}",
                filename,
                line,
                start - linestart,
                src[linestart + 1 : lineend],
            )
        # The begin position must be captured before a multi-line comment
        # advances the line bookkeeping.
        begin = line, start - linestart
        if kind == COMMENT:
            newlines = text.count("\n")
            if newlines:
                linestart = start + text.rfind("\n")
                line += newlines
        if kind != "\n":
            yield Token(
                filename, kind, text, begin, (line, start - linestart + len(text))
            )
 | |
| 
 | |
| 
 | |
def to_text(tkns: list[Token], dedent: int = 0) -> str:
    """Rebuild source text from tokens using their recorded positions.

    Newlines and spaces are inserted so that each token lands on its own
    line and column.  ``dedent`` shifts every column to the left by that
    amount; a negative value therefore indents, and in that case the
    continuation lines of multi-line comments are indented as well.
    """
    left_margin = 1 + dedent
    pieces: list[str] = []
    cur_line, cur_col = -1, left_margin
    for tok in tkns:
        tok_line, tok_col = tok.begin
        if cur_line == -1:
            # Output starts on the line of the first token.
            cur_line = tok_line
        # assert(l >= line), (line, txt, start, end)
        if tok_line > cur_line:
            pieces.append("\n" * (tok_line - cur_line))
            cur_col = left_margin
        pieces.append(" " * (tok_col - cur_col))
        body = tok.text
        if dedent < 0 and tok.kind == "COMMENT" and "\n" in body:
            body = body.replace("\n", "\n" + " " * -dedent)
        # TODO: dedent > 0
        pieces.append(body)
        cur_line, cur_col = tok.end
    return "".join(pieces)
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # Debugging aid: print every token of a file, or of a source string
    # given after "-c".
    import sys

    filename = sys.argv[1]
    if filename == "-c":
        src = sys.argv[2]
    else:
        # Close the file as soon as it has been read.
        with open(filename) as f:
            src = f.read()
    # print(to_text(tokenize(src)))
    for tkn in tokenize(src, filename=filename):
        print(tkn)
 |