mirror of
https://github.com/python/cpython.git
synced 2025-07-12 05:45:15 +00:00
Issue #3665: \u and \U escapes are now supported in unicode regular expressions.
Patch by Serhiy Storchaka.
This commit is contained in:
parent
c9aa8425c4
commit
463badf06c
4 changed files with 144 additions and 34 deletions
|
@ -177,6 +177,7 @@ class SubPattern:
|
|||
|
||||
class Tokenizer:
|
||||
def __init__(self, string):
    """Start tokenizing *string* from its first position.

    Records the pattern, whether it is a ``str`` (``istext``), and a
    zero read offset, then advances once via ``__next``.
    """
    self.index = 0
    self.string = string
    self.istext = isinstance(string, str)
    # __next reads self.string, self.index and self.istext, so all three
    # must be assigned before this call.
    self.__next()
|
||||
|
@ -187,14 +188,14 @@ class Tokenizer:
|
|||
char = self.string[self.index:self.index+1]
|
||||
# Special case for the str8, since indexing returns an integer
|
||||
# XXX This is only needed for test_bug_926075 in test_re.py
|
||||
if char and isinstance(char, bytes):
|
||||
if char and not self.istext:
|
||||
char = chr(char[0])
|
||||
if char == "\\":
|
||||
try:
|
||||
c = self.string[self.index + 1]
|
||||
except IndexError:
|
||||
raise error("bogus escape (end of line)")
|
||||
if isinstance(self.string, bytes):
|
||||
if not self.istext:
|
||||
c = chr(c)
|
||||
char = char + c
|
||||
self.index = self.index + len(char)
|
||||
|
@ -209,6 +210,15 @@ class Tokenizer:
|
|||
this = self.next
|
||||
self.__next()
|
||||
return this
|
||||
def getwhile(self, n, charset):
    """Consume at most *n* consecutive tokens that belong to *charset*.

    Scanning stops early at the first pending token that is not a member
    of *charset*; that token is left unconsumed.  Returns the consumed
    tokens concatenated into one string (empty if none matched).
    """
    pieces = []
    for _ in range(n):
        token = self.next
        if token in charset:
            pieces.append(token)
            # Move on to the following token before the next membership test.
            self.__next()
        else:
            break
    return ''.join(pieces)
|
||||
def tell(self):
    """Return the current state as an ``(index, next)`` tuple."""
    position = (self.index, self.next)
    return position
|
||||
def seek(self, index):
|
||||
|
@ -241,20 +251,30 @@ def _class_escape(source, escape):
|
|||
c = escape[1:2]
|
||||
if c == "x":
|
||||
# hexadecimal escape (exactly two digits)
|
||||
while source.next in HEXDIGITS and len(escape) < 4:
|
||||
escape = escape + source.get()
|
||||
escape = escape[2:]
|
||||
if len(escape) != 2:
|
||||
raise error("bogus escape: %s" % repr("\\" + escape))
|
||||
return LITERAL, int(escape, 16) & 0xff
|
||||
escape += source.getwhile(2, HEXDIGITS)
|
||||
if len(escape) != 4:
|
||||
raise ValueError
|
||||
return LITERAL, int(escape[2:], 16) & 0xff
|
||||
elif c == "u" and source.istext:
|
||||
# unicode escape (exactly four digits)
|
||||
escape += source.getwhile(4, HEXDIGITS)
|
||||
if len(escape) != 6:
|
||||
raise ValueError
|
||||
return LITERAL, int(escape[2:], 16)
|
||||
elif c == "U" and source.istext:
|
||||
# unicode escape (exactly eight digits)
|
||||
escape += source.getwhile(8, HEXDIGITS)
|
||||
if len(escape) != 10:
|
||||
raise ValueError
|
||||
c = int(escape[2:], 16)
|
||||
chr(c) # raise ValueError for invalid code
|
||||
return LITERAL, c
|
||||
elif c in OCTDIGITS:
|
||||
# octal escape (up to three digits)
|
||||
while source.next in OCTDIGITS and len(escape) < 4:
|
||||
escape = escape + source.get()
|
||||
escape = escape[1:]
|
||||
return LITERAL, int(escape, 8) & 0xff
|
||||
escape += source.getwhile(2, OCTDIGITS)
|
||||
return LITERAL, int(escape[1:], 8) & 0xff
|
||||
elif c in DIGITS:
|
||||
raise error("bogus escape: %s" % repr(escape))
|
||||
raise ValueError
|
||||
if len(escape) == 2:
|
||||
return LITERAL, ord(escape[1])
|
||||
except ValueError:
|
||||
|
@ -273,15 +293,27 @@ def _escape(source, escape, state):
|
|||
c = escape[1:2]
|
||||
if c == "x":
|
||||
# hexadecimal escape
|
||||
while source.next in HEXDIGITS and len(escape) < 4:
|
||||
escape = escape + source.get()
|
||||
escape += source.getwhile(2, HEXDIGITS)
|
||||
if len(escape) != 4:
|
||||
raise ValueError
|
||||
return LITERAL, int(escape[2:], 16) & 0xff
|
||||
elif c == "u" and source.istext:
|
||||
# unicode escape (exactly four digits)
|
||||
escape += source.getwhile(4, HEXDIGITS)
|
||||
if len(escape) != 6:
|
||||
raise ValueError
|
||||
return LITERAL, int(escape[2:], 16)
|
||||
elif c == "U" and source.istext:
|
||||
# unicode escape (exactly eight digits)
|
||||
escape += source.getwhile(8, HEXDIGITS)
|
||||
if len(escape) != 10:
|
||||
raise ValueError
|
||||
c = int(escape[2:], 16)
|
||||
chr(c) # raise ValueError for invalid code
|
||||
return LITERAL, c
|
||||
elif c == "0":
|
||||
# octal escape
|
||||
while source.next in OCTDIGITS and len(escape) < 4:
|
||||
escape = escape + source.get()
|
||||
escape += source.getwhile(2, OCTDIGITS)
|
||||
return LITERAL, int(escape[1:], 8) & 0xff
|
||||
elif c in DIGITS:
|
||||
# octal escape *or* decimal group reference (sigh)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue