mirror of
				https://github.com/python/cpython.git
				synced 2025-11-04 11:49:12 +00:00 
			
		
		
		
	* test_grammar.py, testall.out: added test for funny things in string literals * token.py, symbol.py: definitions used with built-in parser module. * tokenize.py: added double-quote recognition
		
			
				
	
	
		
			63 lines
		
	
	
	
		
			1.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			63 lines
		
	
	
	
		
			1.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# This module compiles a regular expression that recognizes Python tokens.
# It is designed to match the workings of the Python tokenizer exactly.
# It takes care of everything except indentation;
# note that un-escaped newlines are tokens, too.
# tokenprog.regs[3] gives the location of the token without whitespace.
# It also defines various subexpressions, but doesn't compile them.
# See the function test() below for an example of how to use it.
 | 
						|
 | 
						|
import regex
 | 
						|
 | 
						|
# Note: in the regexp syntax used below, grouping is written \( ... \) and
# alternation is written \|.  Every backslash that belongs to the regexp
# is doubled in the string literals, so that no literal depends on Python
# passing an unrecognized escape sequence through unchanged.
# Consequently, to get a quoted backslash in a regexp, it must be quadrupled.

# Leading blanks, any number of backslash-newline continuations, and an
# optional comment.  This contributes groups 1 and 2 to Token.
Ignore = '[ \t]*\\(\\\\\n[ \t]*\\)*\\(#.*\\)?'

Name = '[a-zA-Z_][a-zA-Z0-9_]*'

# Integer literals, each with an optional long suffix.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = Hexnumber + '\\|' + Octnumber + '\\|' + Decnumber

# Floating point literals: either with a decimal point and an optional
# exponent, or digits followed by a mandatory exponent.
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = '\\([0-9]+\\.[0-9]*\\|\\.[0-9]+\\)\\(' + Exponent + '\\)?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = Pointfloat + '\\|' + Expfloat
Number = Floatnumber + '\\|' + Intnumber

# A quoted string: backslash followed by any character, or any character
# other than backslash, newline and the closing quote.
String = '\'\\(\\\\.\\|[^\\\n\']\\)*\'' + '\\|' + '"\\(\\\\.\\|[^\\\n"]\\)*"'
# Note: this module *recognizes* double quotes, but for backward
# compatibility, it doesn't *use* them!

Operator = '~\\|\\+\\|-\\|\\*\\|/\\|%\\|\\^\\|&\\||\\|<<\\|>>\\|==\\|<=\\|<>\\|!=\\|>=\\|=\\|<\\|>'
Bracket = '[][(){}]'
Special = '[:;.,`\n]'
Funny = Operator + '\\|' + Bracket + '\\|' + Special

PlainToken = Name + '\\|' + Number + '\\|' + String + '\\|' + Funny

# The token proper is wrapped in its own group, which follows the two
# groups opened by Ignore.
Token = Ignore + '\\(' + PlainToken + '\\)'
 | 
						|
 | 
						|
# Compile Token under syntax 0, then put back whatever syntax setting
# set_syntax() reported as being in effect before.
# The syntax is switched *before* entering the try block: if set_syntax()
# itself fails there is nothing to restore, and the finally clause must
# not trip over an unbound save_syntax and hide the real error.
save_syntax = regex.set_syntax(0) # Use default syntax
try:
	tokenprog = regex.compile(Token)
finally:
	if save_syntax != 0:
		dummy = regex.set_syntax(save_syntax) # Restore original syntax
 | 
						|
 | 
						|
 | 
						|
def test(file):
	"""Tokenize the named file and print every token found.

	file is a path that is opened for reading.  Each line is scanned
	from left to right with tokenprog; for every match the text of
	group 3 (the token without the ignored prefix) is printed.  Where
	nothing matches, a 'No token at' message showing up to 20
	characters is printed and scanning resumes one character further on.
	"""
	f = open(file, 'r')
	try:
		while 1:
			line = f.readline()
			if not line: break
			i, n = 0, len(line)
			while i < n:
				# match() yields the length of the match, negative on failure
				j = tokenprog.match(line, i)
				if j < 0:
					# A single pre-joined string prints the same text
					# under both the print statement and print().
					print('No token at ' + repr(line[i:i+20]) + '...')
					i = i+1
				else:
					i = i+j
					a, b = tokenprog.regs[3]
					if a < b:
						print('Token: ' + repr(line[a:b]))
	finally:
		# Release the file even if reading or matching raises.
		f.close()