mirror of
				https://github.com/python/cpython.git
				synced 2025-10-21 22:22:48 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			189 lines
		
	
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			189 lines
		
	
	
	
		
			5.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
 | |
| # Licensed to PSF under a Contributor Agreement.
 | |
| 
 | |
| """This module defines the data structures used to represent a grammar.
 | |
| 
 | |
| These are a bit arcane because they are derived from the data
 | |
| structures used by Python's 'pgen' parser generator.
 | |
| 
 | |
| There's also a table here mapping operators to their names in the
 | |
| token module; the Python tokenize module reports all operators as the
 | |
| fallback token code OP, but the parser needs the actual token code.
 | |
| 
 | |
| """
 | |
| 
 | |
| # Python imports
 | |
| import pickle
 | |
| 
 | |
| # Local imports
 | |
| from . import token
 | |
| 
 | |
| 
 | |
| class Grammar(object):
 | |
|     """Pgen parsing tables conversion class.
 | |
| 
 | |
|     Once initialized, this class supplies the grammar tables for the
 | |
|     parsing engine implemented by parse.py.  The parsing engine
 | |
|     accesses the instance variables directly.  The class here does not
 | |
|     provide initialization of the tables; several subclasses exist to
 | |
|     do this (see the conv and pgen modules).
 | |
| 
 | |
|     The load() method reads the tables from a pickle file, which is
 | |
|     much faster than the other ways offered by subclasses.  The pickle
 | |
|     file is written by calling dump() (after loading the grammar
 | |
|     tables using a subclass).  The report() method prints a readable
 | |
|     representation of the tables to stdout, for debugging.
 | |
| 
 | |
|     The instance variables are as follows:
 | |
| 
 | |
|     symbol2number -- a dict mapping symbol names to numbers.  Symbol
 | |
|                      numbers are always 256 or higher, to distinguish
 | |
|                      them from token numbers, which are between 0 and
 | |
|                      255 (inclusive).
 | |
| 
 | |
|     number2symbol -- a dict mapping numbers to symbol names;
 | |
|                      these two are each other's inverse.
 | |
| 
 | |
|     states        -- a list of DFAs, where each DFA is a list of
 | |
|                      states, each state is a list of arcs, and each
 | |
|                      arc is a (i, j) pair where i is a label and j is
 | |
|                      a state number.  The DFA number is the index into
 | |
|                      this list.  (This name is slightly confusing.)
 | |
|                      Final states are represented by a special arc of
 | |
|                      the form (0, j) where j is its own state number.
 | |
| 
 | |
|     dfas          -- a dict mapping symbol numbers to (DFA, first)
 | |
|                      pairs, where DFA is an item from the states list
 | |
|                      above, and first is a set of tokens that can
 | |
|                      begin this grammar rule (represented by a dict
 | |
|                      whose values are always 1).
 | |
| 
 | |
|     labels        -- a list of (x, y) pairs where x is either a token
 | |
|                      number or a symbol number, and y is either None
 | |
|                      or a string; the strings are keywords.  The label
 | |
|                      number is the index in this list; label numbers
 | |
|                      are used to mark state transitions (arcs) in the
 | |
|                      DFAs.
 | |
| 
 | |
|     start         -- the number of the grammar's start symbol.
 | |
| 
 | |
|     keywords      -- a dict mapping keyword strings to arc labels.
 | |
| 
 | |
|     tokens        -- a dict mapping token numbers to arc labels.
 | |
| 
 | |
|     """
 | |
| 
 | |
|     def __init__(self):
 | |
|         self.symbol2number = {}
 | |
|         self.number2symbol = {}
 | |
|         self.states = []
 | |
|         self.dfas = {}
 | |
|         self.labels = [(0, "EMPTY")]
 | |
|         self.keywords = {}
 | |
|         self.tokens = {}
 | |
|         self.symbol2label = {}
 | |
|         self.start = 256
 | |
| 
 | |
|     def dump(self, filename):
 | |
|         """Dump the grammar tables to a pickle file."""
 | |
|         with open(filename, "wb") as f:
 | |
|             pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)
 | |
| 
 | |
|     def load(self, filename):
 | |
|         """Load the grammar tables from a pickle file."""
 | |
|         with open(filename, "rb") as f:
 | |
|             d = pickle.load(f)
 | |
|         self.__dict__.update(d)
 | |
| 
 | |
|     def loads(self, pkl):
 | |
|         """Load the grammar tables from a pickle bytes object."""
 | |
|         self.__dict__.update(pickle.loads(pkl))
 | |
| 
 | |
|     def copy(self):
 | |
|         """
 | |
|         Copy the grammar.
 | |
|         """
 | |
|         new = self.__class__()
 | |
|         for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
 | |
|                           "tokens", "symbol2label"):
 | |
|             setattr(new, dict_attr, getattr(self, dict_attr).copy())
 | |
|         new.labels = self.labels[:]
 | |
|         new.states = self.states[:]
 | |
|         new.start = self.start
 | |
|         return new
 | |
| 
 | |
|     def report(self):
 | |
|         """Dump the grammar tables to standard output, for debugging."""
 | |
|         from pprint import pprint
 | |
|         print("s2n")
 | |
|         pprint(self.symbol2number)
 | |
|         print("n2s")
 | |
|         pprint(self.number2symbol)
 | |
|         print("states")
 | |
|         pprint(self.states)
 | |
|         print("dfas")
 | |
|         pprint(self.dfas)
 | |
|         print("labels")
 | |
|         pprint(self.labels)
 | |
|         print("start", self.start)
 | |
| 
 | |
| 
 | |
| # Map from operator to number (since tokenize doesn't do this)
 | |
| 
 | |
| opmap_raw = """
 | |
| ( LPAR
 | |
| ) RPAR
 | |
| [ LSQB
 | |
| ] RSQB
 | |
| : COLON
 | |
| , COMMA
 | |
| ; SEMI
 | |
| + PLUS
 | |
| - MINUS
 | |
| * STAR
 | |
| / SLASH
 | |
| | VBAR
 | |
| & AMPER
 | |
| < LESS
 | |
| > GREATER
 | |
| = EQUAL
 | |
| . DOT
 | |
| % PERCENT
 | |
| ` BACKQUOTE
 | |
| { LBRACE
 | |
| } RBRACE
 | |
| @ AT
 | |
| @= ATEQUAL
 | |
| == EQEQUAL
 | |
| != NOTEQUAL
 | |
| <> NOTEQUAL
 | |
| <= LESSEQUAL
 | |
| >= GREATEREQUAL
 | |
| ~ TILDE
 | |
| ^ CIRCUMFLEX
 | |
| << LEFTSHIFT
 | |
| >> RIGHTSHIFT
 | |
| ** DOUBLESTAR
 | |
| += PLUSEQUAL
 | |
| -= MINEQUAL
 | |
| *= STAREQUAL
 | |
| /= SLASHEQUAL
 | |
| %= PERCENTEQUAL
 | |
| &= AMPEREQUAL
 | |
| |= VBAREQUAL
 | |
| ^= CIRCUMFLEXEQUAL
 | |
| <<= LEFTSHIFTEQUAL
 | |
| >>= RIGHTSHIFTEQUAL
 | |
| **= DOUBLESTAREQUAL
 | |
| // DOUBLESLASH
 | |
| //= DOUBLESLASHEQUAL
 | |
| -> RARROW
 | |
| := COLONEQUAL
 | |
| """
 | |
| 
 | |
| opmap = {}
 | |
| for line in opmap_raw.splitlines():
 | |
|     if line:
 | |
|         op, name = line.split()
 | |
|         opmap[op] = getattr(token, name)
 | |
| del line, op, name
 | 
