mirror of
https://github.com/python/cpython.git
synced 2025-08-04 08:59:19 +00:00
#2834: Change re module semantics, so that str and bytes mixing is forbidden,
and str (unicode) patterns get full unicode matching by default. The re.ASCII flag is also introduced to ask for ASCII matching instead.
This commit is contained in:
parent
3ad7ba10a2
commit
fd036451bf
37 changed files with 280 additions and 163 deletions
|
@ -11,9 +11,13 @@
|
||||||
|
|
||||||
|
|
||||||
This module provides regular expression matching operations similar to
|
This module provides regular expression matching operations similar to
|
||||||
those found in Perl. Both patterns and strings to be searched can be
|
those found in Perl. The :mod:`re` module is always available.
|
||||||
Unicode strings as well as 8-bit strings. The :mod:`re` module is
|
|
||||||
always available.
|
Both patterns and strings to be searched can be Unicode strings as well as
|
||||||
|
8-bit strings. However, Unicode strings and 8-bit strings cannot be mixed:
|
||||||
|
that is, you cannot match a Unicode string with a byte pattern or
|
||||||
|
vice-versa; similarly, when asking for a substitution, the replacement
|
||||||
|
string must be of the same type as both the pattern and the search string.
|
||||||
|
|
||||||
Regular expressions use the backslash character (``'\'``) to indicate
|
Regular expressions use the backslash character (``'\'``) to indicate
|
||||||
special forms or to allow special characters to be used without invoking
|
special forms or to allow special characters to be used without invoking
|
||||||
|
@ -212,12 +216,12 @@ The special characters are:
|
||||||
group; ``(?P<name>...)`` is the only exception to this rule. Following are the
|
group; ``(?P<name>...)`` is the only exception to this rule. Following are the
|
||||||
currently supported extensions.
|
currently supported extensions.
|
||||||
|
|
||||||
``(?iLmsux)``
|
``(?aiLmsux)``
|
||||||
(One or more letters from the set ``'i'``, ``'L'``, ``'m'``, ``'s'``,
|
(One or more letters from the set ``'a'``, ``'i'``, ``'L'``, ``'m'``,
|
||||||
``'u'``, ``'x'``.) The group matches the empty string; the letters
|
``'s'``, ``'u'``, ``'x'``.) The group matches the empty string; the
|
||||||
set the corresponding flags: :const:`re.I` (ignore case),
|
letters set the corresponding flags: :const:`re.A` (ASCII-only matching),
|
||||||
:const:`re.L` (locale dependent), :const:`re.M` (multi-line),
|
:const:`re.I` (ignore case), :const:`re.L` (locale dependent),
|
||||||
:const:`re.S` (dot matches all), :const:`re.U` (Unicode dependent),
|
:const:`re.M` (multi-line), :const:`re.S` (dot matches all),
:const:`re.U` (Unicode dependent; redundant for str patterns),
|
||||||
and :const:`re.X` (verbose), for the entire regular expression. (The
|
and :const:`re.X` (verbose), for the entire regular expression. (The
|
||||||
flags are described in :ref:`contents-of-module-re`.) This
|
flags are described in :ref:`contents-of-module-re`.) This
|
||||||
is useful if you wish to include the flags as part of the regular
|
is useful if you wish to include the flags as part of the regular
|
||||||
|
@ -324,56 +328,62 @@ the second character. For example, ``\$`` matches the character ``'$'``.
|
||||||
word is indicated by whitespace or a non-alphanumeric, non-underscore character.
|
word is indicated by whitespace or a non-alphanumeric, non-underscore character.
|
||||||
Note that ``\b`` is defined as the boundary between ``\w`` and ``\W``, so the
|
Note that ``\b`` is defined as the boundary between ``\w`` and ``\W``, so the
|
||||||
precise set of characters deemed to be alphanumeric depends on the values of the
|
precise set of characters deemed to be alphanumeric depends on the values of the
|
||||||
``UNICODE`` and ``LOCALE`` flags. Inside a character range, ``\b`` represents
|
``ASCII`` and ``LOCALE`` flags. Inside a character range, ``\b`` represents
|
||||||
the backspace character, for compatibility with Python's string literals.
|
the backspace character, for compatibility with Python's string literals.
|
||||||
|
|
||||||
``\B``
|
``\B``
|
||||||
Matches the empty string, but only when it is *not* at the beginning or end of a
|
Matches the empty string, but only when it is *not* at the beginning or end of a
|
||||||
word. This is just the opposite of ``\b``, so is also subject to the settings
|
word. This is just the opposite of ``\b``, so is also subject to the settings
|
||||||
of ``LOCALE`` and ``UNICODE``.
|
of ``ASCII`` and ``LOCALE``.
|
||||||
|
|
||||||
``\d``
|
``\d``
|
||||||
When the :const:`UNICODE` flag is not specified, matches any decimal digit; this
|
For Unicode (str) patterns:
|
||||||
is equivalent to the set ``[0-9]``. With :const:`UNICODE`, it will match
|
When the :const:`ASCII` flag is specified, matches any decimal digit; this
|
||||||
whatever is classified as a digit in the Unicode character properties database.
|
is equivalent to the set ``[0-9]``. Otherwise, it will match whatever
|
||||||
|
is classified as a digit in the Unicode character properties database
|
||||||
|
(but this does include the standard ASCII digits and is thus a superset
|
||||||
|
of ``[0-9]``).
|
||||||
|
For 8-bit (bytes) patterns:
|
||||||
|
Matches any decimal digit; this is equivalent to the set ``[0-9]``.
|
||||||
|
|
||||||
``\D``
|
``\D``
|
||||||
When the :const:`UNICODE` flag is not specified, matches any non-digit
|
Matches any character which is not a decimal digit. This is the
|
||||||
character; this is equivalent to the set ``[^0-9]``. With :const:`UNICODE`, it
|
opposite of ``\d`` and is therefore similarly subject to the settings of
|
||||||
will match anything other than character marked as digits in the Unicode
|
``ASCII`` and ``LOCALE``.
|
||||||
character properties database.
|
|
||||||
|
|
||||||
``\s``
|
``\s``
|
||||||
When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
|
For Unicode (str) patterns:
|
||||||
any whitespace character; this is equivalent to the set ``[ \t\n\r\f\v]``. With
|
When the :const:`ASCII` flag is specified, matches only ASCII whitespace
|
||||||
:const:`LOCALE`, it will match this set plus whatever characters are defined as
|
characters; this is equivalent to the set ``[ \t\n\r\f\v]``. Otherwise,
|
||||||
space for the current locale. If :const:`UNICODE` is set, this will match the
|
it will match this set plus whatever is classified as space in the Unicode
|
||||||
characters ``[ \t\n\r\f\v]`` plus whatever is classified as space in the Unicode
|
character properties database (including for example the non-breaking
|
||||||
character properties database.
|
spaces mandated by typography rules in many languages).
|
||||||
|
For 8-bit (bytes) patterns:
|
||||||
|
Matches characters considered whitespace in the ASCII character set;
|
||||||
|
this is equivalent to the set ``[ \t\n\r\f\v]``.
|
||||||
|
|
||||||
``\S``
|
``\S``
|
||||||
When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
|
Matches any character which is not a whitespace character. This is the
|
||||||
any non-whitespace character; this is equivalent to the set ``[^ \t\n\r\f\v]``
|
opposite of ``\s`` and is therefore similarly subject to the settings of
|
||||||
With :const:`LOCALE`, it will match any character not in this set, and not
|
``ASCII`` and ``LOCALE``.
|
||||||
defined as space in the current locale. If :const:`UNICODE` is set, this will
|
|
||||||
match anything other than ``[ \t\n\r\f\v]`` and characters marked as space in
|
|
||||||
the Unicode character properties database.
|
|
||||||
|
|
||||||
``\w``
|
``\w``
|
||||||
When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
|
For Unicode (str) patterns:
|
||||||
any alphanumeric character and the underscore; this is equivalent to the set
|
When the :const:`ASCII` flag is specified, this is equivalent to the set
|
||||||
``[a-zA-Z0-9_]``. With :const:`LOCALE`, it will match the set ``[0-9_]`` plus
|
``[a-zA-Z0-9_]``. Otherwise, it will match whatever is classified as
|
||||||
whatever characters are defined as alphanumeric for the current locale. If
|
alphanumeric in the Unicode character properties database (it will
|
||||||
:const:`UNICODE` is set, this will match the characters ``[0-9_]`` plus whatever
|
include most characters that can be part of a word in whatever language,
|
||||||
is classified as alphanumeric in the Unicode character properties database.
|
as well as numbers and the underscore sign).
|
||||||
|
For 8-bit (bytes) patterns:
|
||||||
|
Matches characters considered alphanumeric in the ASCII character set;
|
||||||
|
this is equivalent to the set ``[a-zA-Z0-9_]``. With :const:`LOCALE`,
|
||||||
|
it will additionally match whatever characters are defined as
|
||||||
|
alphanumeric for the current locale.
|
||||||
|
|
||||||
``\W``
|
``\W``
|
||||||
When the :const:`LOCALE` and :const:`UNICODE` flags are not specified, matches
|
Matches any character which is not an alphanumeric character. This is the
|
||||||
any non-alphanumeric character; this is equivalent to the set ``[^a-zA-Z0-9_]``.
|
opposite of ``\w`` and is therefore similarly subject to the settings of
|
||||||
With :const:`LOCALE`, it will match any character not in the set ``[0-9_]``, and
|
``ASCII`` and ``LOCALE``.
|
||||||
not defined as alphanumeric for the current locale. If :const:`UNICODE` is set,
|
|
||||||
this will match anything other than ``[0-9_]`` and characters marked as
|
|
||||||
alphanumeric in the Unicode character properties database.
|
|
||||||
|
|
||||||
``\Z``
|
``\Z``
|
||||||
Matches only at the end of the string.
|
Matches only at the end of the string.
|
||||||
|
@ -454,6 +464,25 @@ form.
|
||||||
expression at a time needn't worry about compiling regular expressions.)
|
expression at a time needn't worry about compiling regular expressions.)
|
||||||
|
|
||||||
|
|
||||||
|
.. data:: A
|
||||||
|
ASCII
|
||||||
|
|
||||||
|
Make ``\w``, ``\W``, ``\b``, ``\B``, ``\d``, ``\D``, ``\s`` and ``\S`` perform ASCII-only
|
||||||
|
matching instead of full Unicode matching. This is only meaningful for
|
||||||
|
Unicode patterns, and is ignored for byte patterns.
|
||||||
|
|
||||||
|
Note that the :const:`re.U` flag still exists (as well as its synonym
|
||||||
|
:const:`re.UNICODE` and its embedded counterpart ``(?u)``), but it has
|
||||||
|
become useless in Python 3.0.
|
||||||
|
In previous Python versions, it was used to specify that
|
||||||
|
matching had to be Unicode dependent (the default was ASCII matching in
|
||||||
|
all circumstances). Starting from Python 3.0, the default is Unicode
|
||||||
|
matching for Unicode strings (which can be changed by specifying the
|
||||||
|
``'a'`` flag), and ASCII matching for 8-bit strings. Further, Unicode
|
||||||
|
dependent matching for 8-bit strings isn't allowed anymore and results
|
||||||
|
in a :exc:`ValueError`.
|
||||||
|
|
||||||
|
|
||||||
.. data:: I
|
.. data:: I
|
||||||
IGNORECASE
|
IGNORECASE
|
||||||
|
|
||||||
|
@ -465,7 +494,10 @@ form.
|
||||||
LOCALE
|
LOCALE
|
||||||
|
|
||||||
Make ``\w``, ``\W``, ``\b``, ``\B``, ``\s`` and ``\S`` dependent on the
|
Make ``\w``, ``\W``, ``\b``, ``\B``, ``\s`` and ``\S`` dependent on the
|
||||||
current locale.
|
current locale. The use of this flag is discouraged as the locale mechanism
|
||||||
|
is very unreliable, and it only handles one "culture" at a time anyway;
|
||||||
|
you should use Unicode matching instead, which is the default in Python 3.0
|
||||||
|
for Unicode (str) patterns.
|
||||||
|
|
||||||
|
|
||||||
.. data:: M
|
.. data:: M
|
||||||
|
@ -486,13 +518,6 @@ form.
|
||||||
newline; without this flag, ``'.'`` will match anything *except* a newline.
|
newline; without this flag, ``'.'`` will match anything *except* a newline.
|
||||||
|
|
||||||
|
|
||||||
.. data:: U
|
|
||||||
UNICODE
|
|
||||||
|
|
||||||
Make ``\w``, ``\W``, ``\b``, ``\B``, ``\d``, ``\D``, ``\s`` and ``\S`` dependent
|
|
||||||
on the Unicode character properties database.
|
|
||||||
|
|
||||||
|
|
||||||
.. data:: X
|
.. data:: X
|
||||||
VERBOSE
|
VERBOSE
|
||||||
|
|
||||||
|
@ -511,6 +536,8 @@ form.
|
||||||
b = re.compile(r"\d+\.\d*")
|
b = re.compile(r"\d+\.\d*")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
.. function:: search(pattern, string[, flags])
|
.. function:: search(pattern, string[, flags])
|
||||||
|
|
||||||
Scan through *string* looking for a location where the regular expression
|
Scan through *string* looking for a location where the regular expression
|
||||||
|
|
|
@ -14,7 +14,7 @@ import time
|
||||||
import locale
|
import locale
|
||||||
import calendar
|
import calendar
|
||||||
from re import compile as re_compile
|
from re import compile as re_compile
|
||||||
from re import IGNORECASE
|
from re import IGNORECASE, ASCII
|
||||||
from re import escape as re_escape
|
from re import escape as re_escape
|
||||||
from datetime import date as datetime_date
|
from datetime import date as datetime_date
|
||||||
try:
|
try:
|
||||||
|
@ -262,7 +262,7 @@ class TimeRE(dict):
|
||||||
|
|
||||||
def compile(self, format):
|
def compile(self, format):
|
||||||
"""Return a compiled re object for the format string."""
|
"""Return a compiled re object for the format string."""
|
||||||
return re_compile(self.pattern(format), IGNORECASE)
|
return re_compile(self.pattern(format), IGNORECASE | ASCII)
|
||||||
|
|
||||||
_cache_lock = _thread_allocate_lock()
|
_cache_lock = _thread_allocate_lock()
|
||||||
# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
|
# DO NOT modify _TimeRE_cache or _regex_cache without acquiring the cache lock
|
||||||
|
|
|
@ -39,7 +39,7 @@ def _translate(s, altchars):
|
||||||
return s.translate(translation)
|
return s.translate(translation)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Base64 encoding/decoding uses binascii
|
# Base64 encoding/decoding uses binascii
|
||||||
|
|
||||||
def b64encode(s, altchars=None):
|
def b64encode(s, altchars=None):
|
||||||
|
@ -126,7 +126,7 @@ def urlsafe_b64decode(s):
|
||||||
return b64decode(s, b'-_')
|
return b64decode(s, b'-_')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Base32 encoding/decoding must be done in Python
|
# Base32 encoding/decoding must be done in Python
|
||||||
_b32alphabet = {
|
_b32alphabet = {
|
||||||
0: b'A', 9: b'J', 18: b'S', 27: b'3',
|
0: b'A', 9: b'J', 18: b'S', 27: b'3',
|
||||||
|
@ -225,7 +225,7 @@ def b32decode(s, casefold=False, map01=None):
|
||||||
# characters because this will tell us how many null bytes to remove from
|
# characters because this will tell us how many null bytes to remove from
|
||||||
# the end of the decoded string.
|
# the end of the decoded string.
|
||||||
padchars = 0
|
padchars = 0
|
||||||
mo = re.search('(?P<pad>[=]*)$', s)
|
mo = re.search(b'(?P<pad>[=]*)$', s)
|
||||||
if mo:
|
if mo:
|
||||||
padchars = len(mo.group('pad'))
|
padchars = len(mo.group('pad'))
|
||||||
if padchars > 0:
|
if padchars > 0:
|
||||||
|
@ -262,7 +262,7 @@ def b32decode(s, casefold=False, map01=None):
|
||||||
return b''.join(parts)
|
return b''.join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
|
# RFC 3548, Base 16 Alphabet specifies uppercase, but hexlify() returns
|
||||||
# lowercase. The RFC also recommends against accepting input case
|
# lowercase. The RFC also recommends against accepting input case
|
||||||
# insensitively.
|
# insensitively.
|
||||||
|
@ -291,12 +291,12 @@ def b16decode(s, casefold=False):
|
||||||
raise TypeError("expected bytes, not %s" % s.__class__.__name__)
|
raise TypeError("expected bytes, not %s" % s.__class__.__name__)
|
||||||
if casefold:
|
if casefold:
|
||||||
s = s.upper()
|
s = s.upper()
|
||||||
if re.search('[^0-9A-F]', s):
|
if re.search(b'[^0-9A-F]', s):
|
||||||
raise binascii.Error('Non-base16 digit found')
|
raise binascii.Error('Non-base16 digit found')
|
||||||
return binascii.unhexlify(s)
|
return binascii.unhexlify(s)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Legacy interface. This code could be cleaned up since I don't believe
|
# Legacy interface. This code could be cleaned up since I don't believe
|
||||||
# binascii has any line length limitations. It just doesn't seem worth it
|
# binascii has any line length limitations. It just doesn't seem worth it
|
||||||
# though. The files should be opened in binary mode.
|
# though. The files should be opened in binary mode.
|
||||||
|
@ -353,7 +353,7 @@ def decodestring(s):
|
||||||
return binascii.a2b_base64(s)
|
return binascii.a2b_base64(s)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Usable as a script...
|
# Usable as a script...
|
||||||
def main():
|
def main():
|
||||||
"""Small main program"""
|
"""Small main program"""
|
||||||
|
|
|
@ -5415,7 +5415,7 @@ ExtendedContext = Context(
|
||||||
# 2. For finite numbers (not infinities and NaNs) the body of the
|
# 2. For finite numbers (not infinities and NaNs) the body of the
|
||||||
# number between the optional sign and the optional exponent must have
|
# number between the optional sign and the optional exponent must have
|
||||||
# at least one decimal digit, possibly after the decimal point. The
|
# at least one decimal digit, possibly after the decimal point. The
|
||||||
# lookahead expression '(?=\d|\.\d)' checks this.
|
# lookahead expression '(?=[0-9]|\.[0-9])' checks this.
|
||||||
#
|
#
|
||||||
# As the flag UNICODE is not enabled here, we're explicitly avoiding any
|
# As the flag UNICODE is not enabled here, we're explicitly avoiding any
|
||||||
# other meaning for \d than the numbers [0-9].
|
# other meaning for \d than the numbers [0-9].
|
||||||
|
|
|
@ -409,7 +409,7 @@ def get_versions():
|
||||||
out = os.popen(gcc_exe + ' -dumpversion','r')
|
out = os.popen(gcc_exe + ' -dumpversion','r')
|
||||||
out_string = out.read()
|
out_string = out.read()
|
||||||
out.close()
|
out.close()
|
||||||
result = re.search('(\d+\.\d+(\.\d+)*)',out_string)
|
result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
|
||||||
if result:
|
if result:
|
||||||
gcc_version = StrictVersion(result.group(1))
|
gcc_version = StrictVersion(result.group(1))
|
||||||
else:
|
else:
|
||||||
|
@ -421,7 +421,7 @@ def get_versions():
|
||||||
out = os.popen(ld_exe + ' -v','r')
|
out = os.popen(ld_exe + ' -v','r')
|
||||||
out_string = out.read()
|
out_string = out.read()
|
||||||
out.close()
|
out.close()
|
||||||
result = re.search('(\d+\.\d+(\.\d+)*)',out_string)
|
result = re.search('(\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
|
||||||
if result:
|
if result:
|
||||||
ld_version = StrictVersion(result.group(1))
|
ld_version = StrictVersion(result.group(1))
|
||||||
else:
|
else:
|
||||||
|
@ -433,7 +433,7 @@ def get_versions():
|
||||||
out = os.popen(dllwrap_exe + ' --version','r')
|
out = os.popen(dllwrap_exe + ' --version','r')
|
||||||
out_string = out.read()
|
out_string = out.read()
|
||||||
out.close()
|
out.close()
|
||||||
result = re.search(' (\d+\.\d+(\.\d+)*)',out_string)
|
result = re.search(' (\d+\.\d+(\.\d+)*)', out_string, re.ASCII)
|
||||||
if result:
|
if result:
|
||||||
dllwrap_version = StrictVersion(result.group(1))
|
dllwrap_version = StrictVersion(result.group(1))
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -300,7 +300,7 @@ def get_versions():
|
||||||
out = os.popen(gcc_exe + ' -dumpversion','r')
|
out = os.popen(gcc_exe + ' -dumpversion','r')
|
||||||
out_string = out.read()
|
out_string = out.read()
|
||||||
out.close()
|
out.close()
|
||||||
result = re.search('(\d+\.\d+\.\d+)',out_string)
|
result = re.search('(\d+\.\d+\.\d+)', out_string, re.ASCII)
|
||||||
if result:
|
if result:
|
||||||
gcc_version = StrictVersion(result.group(1))
|
gcc_version = StrictVersion(result.group(1))
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -512,7 +512,7 @@ def get_config_vars(*args):
|
||||||
# patched up as well.
|
# patched up as well.
|
||||||
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
|
'CFLAGS', 'PY_CFLAGS', 'BLDSHARED'):
|
||||||
flags = _config_vars[key]
|
flags = _config_vars[key]
|
||||||
flags = re.sub('-arch\s+\w+\s', ' ', flags)
|
flags = re.sub('-arch\s+\w+\s', ' ', flags, re.ASCII)
|
||||||
flags = re.sub('-isysroot [^ \t]*', ' ', flags)
|
flags = re.sub('-isysroot [^ \t]*', ' ', flags)
|
||||||
_config_vars[key] = flags
|
_config_vars[key] = flags
|
||||||
|
|
||||||
|
|
|
@ -81,7 +81,7 @@ def get_platform ():
|
||||||
return "%s-%s.%s" % (osname, version, release)
|
return "%s-%s.%s" % (osname, version, release)
|
||||||
elif osname[:6] == "cygwin":
|
elif osname[:6] == "cygwin":
|
||||||
osname = "cygwin"
|
osname = "cygwin"
|
||||||
rel_re = re.compile (r'[\d.]+')
|
rel_re = re.compile (r'[\d.]+', re.ASCII)
|
||||||
m = rel_re.match(release)
|
m = rel_re.match(release)
|
||||||
if m:
|
if m:
|
||||||
release = m.group()
|
release = m.group()
|
||||||
|
|
|
@ -134,7 +134,7 @@ class StrictVersion (Version):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
version_re = re.compile(r'^(\d+) \. (\d+) (\. (\d+))? ([ab](\d+))?$',
|
version_re = re.compile(r'^(\d+) \. (\d+) (\. (\d+))? ([ab](\d+))?$',
|
||||||
re.VERBOSE)
|
re.VERBOSE | re.ASCII)
|
||||||
|
|
||||||
|
|
||||||
def parse (self, vstring):
|
def parse (self, vstring):
|
||||||
|
|
|
@ -5,7 +5,8 @@ import distutils.version
|
||||||
import operator
|
import operator
|
||||||
|
|
||||||
|
|
||||||
re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)")
|
re_validPackage = re.compile(r"(?i)^\s*([a-z_]\w*(?:\.[a-z_]\w*)*)(.*)",
|
||||||
|
re.ASCII)
|
||||||
# (package) (rest)
|
# (package) (rest)
|
||||||
|
|
||||||
re_paren = re.compile(r"^\s*\((.*)\)\s*$") # (list) inside of parentheses
|
re_paren = re.compile(r"^\s*\((.*)\)\s*$") # (list) inside of parentheses
|
||||||
|
@ -153,7 +154,8 @@ def split_provision(value):
|
||||||
global _provision_rx
|
global _provision_rx
|
||||||
if _provision_rx is None:
|
if _provision_rx is None:
|
||||||
_provision_rx = re.compile(
|
_provision_rx = re.compile(
|
||||||
"([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$")
|
"([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)*)(?:\s*\(\s*([^)\s]+)\s*\))?$",
|
||||||
|
re.ASCII)
|
||||||
value = value.strip()
|
value = value.strip()
|
||||||
m = _provision_rx.match(value)
|
m = _provision_rx.match(value)
|
||||||
if not m:
|
if not m:
|
||||||
|
|
|
@ -70,7 +70,7 @@ for c in (b' !"#$%&\'()*+,-./0123456789:;<>'
|
||||||
_QUOPRI_BODY_MAP[c] = chr(c)
|
_QUOPRI_BODY_MAP[c] = chr(c)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Helpers
|
# Helpers
|
||||||
def header_check(octet):
|
def header_check(octet):
|
||||||
"""Return True if the octet should be escaped with header quopri."""
|
"""Return True if the octet should be escaped with header quopri."""
|
||||||
|
@ -125,7 +125,7 @@ def quote(c):
|
||||||
return '=%02X' % ord(c)
|
return '=%02X' % ord(c)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def header_encode(header_bytes, charset='iso-8859-1'):
|
def header_encode(header_bytes, charset='iso-8859-1'):
|
||||||
"""Encode a single header line with quoted-printable (like) encoding.
|
"""Encode a single header line with quoted-printable (like) encoding.
|
||||||
|
|
||||||
|
@ -149,7 +149,7 @@ def header_encode(header_bytes, charset='iso-8859-1'):
|
||||||
return '=?%s?q?%s?=' % (charset, EMPTYSTRING.join(encoded))
|
return '=?%s?q?%s?=' % (charset, EMPTYSTRING.join(encoded))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def body_encode(body, maxlinelen=76, eol=NL):
|
def body_encode(body, maxlinelen=76, eol=NL):
|
||||||
"""Encode with quoted-printable, wrapping at maxlinelen characters.
|
"""Encode with quoted-printable, wrapping at maxlinelen characters.
|
||||||
|
|
||||||
|
@ -225,7 +225,7 @@ def body_encode(body, maxlinelen=76, eol=NL):
|
||||||
return encoded_body
|
return encoded_body
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# BAW: I'm not sure if the intent was for the signature of this function to be
|
# BAW: I'm not sure if the intent was for the signature of this function to be
|
||||||
# the same as base64MIME.decode() or not...
|
# the same as base64MIME.decode() or not...
|
||||||
def decode(encoded, eol=NL):
|
def decode(encoded, eol=NL):
|
||||||
|
@ -280,7 +280,7 @@ body_decode = decode
|
||||||
decodestring = decode
|
decodestring = decode
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _unquote_match(match):
|
def _unquote_match(match):
|
||||||
"""Turn a match in the form =AB to the ASCII character with value 0xab"""
|
"""Turn a match in the form =AB to the ASCII character with value 0xab"""
|
||||||
s = match.group(0)
|
s = match.group(0)
|
||||||
|
@ -296,4 +296,4 @@ def header_decode(s):
|
||||||
the high level email.Header class for that functionality.
|
the high level email.Header class for that functionality.
|
||||||
"""
|
"""
|
||||||
s = s.replace('_', ' ')
|
s = s.replace('_', ' ')
|
||||||
return re.sub(r'=\w{2}', _unquote_match, s)
|
return re.sub(r'=\w{2}', _unquote_match, s, re.ASCII)
|
||||||
|
|
|
@ -52,7 +52,7 @@ specialsre = re.compile(r'[][\\()<>@,:;".]')
|
||||||
escapesre = re.compile(r'[][\\()"]')
|
escapesre = re.compile(r'[][\\()"]')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Helpers
|
# Helpers
|
||||||
|
|
||||||
def formataddr(pair):
|
def formataddr(pair):
|
||||||
|
@ -73,7 +73,7 @@ def formataddr(pair):
|
||||||
return address
|
return address
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def getaddresses(fieldvalues):
|
def getaddresses(fieldvalues):
|
||||||
"""Return a list of (REALNAME, EMAIL) for each fieldvalue."""
|
"""Return a list of (REALNAME, EMAIL) for each fieldvalue."""
|
||||||
all = COMMASPACE.join(fieldvalues)
|
all = COMMASPACE.join(fieldvalues)
|
||||||
|
@ -81,7 +81,7 @@ def getaddresses(fieldvalues):
|
||||||
return a.addresslist
|
return a.addresslist
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
ecre = re.compile(r'''
|
ecre = re.compile(r'''
|
||||||
=\? # literal =?
|
=\? # literal =?
|
||||||
(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
|
(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
|
||||||
|
@ -93,7 +93,7 @@ ecre = re.compile(r'''
|
||||||
''', re.VERBOSE | re.IGNORECASE)
|
''', re.VERBOSE | re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def formatdate(timeval=None, localtime=False, usegmt=False):
|
def formatdate(timeval=None, localtime=False, usegmt=False):
|
||||||
"""Returns a date string as specified by RFC 2822, e.g.:
|
"""Returns a date string as specified by RFC 2822, e.g.:
|
||||||
|
|
||||||
|
@ -146,7 +146,7 @@ def formatdate(timeval=None, localtime=False, usegmt=False):
|
||||||
zone)
|
zone)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def make_msgid(idstring=None):
|
def make_msgid(idstring=None):
|
||||||
"""Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
|
"""Returns a string suitable for RFC 2822 compliant Message-ID, e.g:
|
||||||
|
|
||||||
|
@ -168,7 +168,7 @@ def make_msgid(idstring=None):
|
||||||
return msgid
|
return msgid
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# These functions are in the standalone mimelib version only because they've
|
# These functions are in the standalone mimelib version only because they've
|
||||||
# subsequently been fixed in the latest Python versions. We use this to worm
|
# subsequently been fixed in the latest Python versions. We use this to worm
|
||||||
# around broken older Pythons.
|
# around broken older Pythons.
|
||||||
|
@ -202,7 +202,7 @@ def unquote(str):
|
||||||
return str
|
return str
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# RFC2231-related functions - parameter encoding and decoding
|
# RFC2231-related functions - parameter encoding and decoding
|
||||||
def decode_rfc2231(s):
|
def decode_rfc2231(s):
|
||||||
"""Decode string according to RFC 2231"""
|
"""Decode string according to RFC 2231"""
|
||||||
|
@ -227,7 +227,8 @@ def encode_rfc2231(s, charset=None, language=None):
|
||||||
return "%s'%s'%s" % (charset, language, s)
|
return "%s'%s'%s" % (charset, language, s)
|
||||||
|
|
||||||
|
|
||||||
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$')
|
rfc2231_continuation = re.compile(r'^(?P<name>\w+)\*((?P<num>[0-9]+)\*?)?$',
|
||||||
|
re.ASCII)
|
||||||
|
|
||||||
def decode_params(params):
|
def decode_params(params):
|
||||||
"""Decode parameters list according to RFC 2231.
|
"""Decode parameters list according to RFC 2231.
|
||||||
|
|
|
@ -176,12 +176,10 @@ class Codec(codecs.Codec):
|
||||||
return "", 0
|
return "", 0
|
||||||
|
|
||||||
# IDNA allows decoding to operate on Unicode strings, too.
|
# IDNA allows decoding to operate on Unicode strings, too.
|
||||||
if isinstance(input, bytes):
|
if not isinstance(input, bytes):
|
||||||
labels = dots.split(input)
|
# XXX obviously wrong, see #3232
|
||||||
else:
|
|
||||||
# Force to bytes
|
|
||||||
input = bytes(input)
|
input = bytes(input)
|
||||||
labels = input.split(b".")
|
labels = input.split(b".")
|
||||||
|
|
||||||
if labels and len(labels[-1]) == 0:
|
if labels and len(labels[-1]) == 0:
|
||||||
trailing_dot = '.'
|
trailing_dot = '.'
|
||||||
|
|
|
@ -590,7 +590,8 @@ def parse150(resp):
|
||||||
global _150_re
|
global _150_re
|
||||||
if _150_re is None:
|
if _150_re is None:
|
||||||
import re
|
import re
|
||||||
_150_re = re.compile("150 .* \((\d+) bytes\)", re.IGNORECASE)
|
_150_re = re.compile(
|
||||||
|
"150 .* \((\d+) bytes\)", re.IGNORECASE | re.ASCII)
|
||||||
m = _150_re.match(resp)
|
m = _150_re.match(resp)
|
||||||
if not m:
|
if not m:
|
||||||
return None
|
return None
|
||||||
|
@ -613,7 +614,7 @@ def parse227(resp):
|
||||||
global _227_re
|
global _227_re
|
||||||
if _227_re is None:
|
if _227_re is None:
|
||||||
import re
|
import re
|
||||||
_227_re = re.compile(r'(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)')
|
_227_re = re.compile(r'(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)', re.ASCII)
|
||||||
m = _227_re.search(resp)
|
m = _227_re.search(resp)
|
||||||
if not m:
|
if not m:
|
||||||
raise error_proto(resp)
|
raise error_proto(resp)
|
||||||
|
|
|
@ -385,4 +385,4 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
return '&'+s+';'
|
return '&'+s+';'
|
||||||
|
|
||||||
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
|
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
|
||||||
replaceEntities, s)
|
replaceEntities, s, re.ASCII)
|
||||||
|
|
|
@ -121,7 +121,7 @@ def time2netscape(t=None):
|
||||||
|
|
||||||
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
|
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}
|
||||||
|
|
||||||
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
|
TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$", re.ASCII)
|
||||||
def offset_from_tz_string(tz):
|
def offset_from_tz_string(tz):
|
||||||
offset = None
|
offset = None
|
||||||
if tz in UTC_ZONES:
|
if tz in UTC_ZONES:
|
||||||
|
@ -191,9 +191,9 @@ def _str2time(day, mon, yr, hr, min, sec, tz):
|
||||||
|
|
||||||
STRICT_DATE_RE = re.compile(
|
STRICT_DATE_RE = re.compile(
|
||||||
r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
|
r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
|
||||||
"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
|
"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$", re.ASCII)
|
||||||
WEEKDAY_RE = re.compile(
|
WEEKDAY_RE = re.compile(
|
||||||
r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
|
r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I | re.ASCII)
|
||||||
LOOSE_HTTP_DATE_RE = re.compile(
|
LOOSE_HTTP_DATE_RE = re.compile(
|
||||||
r"""^
|
r"""^
|
||||||
(\d\d?) # day
|
(\d\d?) # day
|
||||||
|
@ -210,7 +210,7 @@ LOOSE_HTTP_DATE_RE = re.compile(
|
||||||
([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
|
([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
|
||||||
\s*
|
\s*
|
||||||
(?:\(\w+\))? # ASCII representation of timezone in parens.
|
(?:\(\w+\))? # ASCII representation of timezone in parens.
|
||||||
\s*$""", re.X)
|
\s*$""", re.X | re.ASCII)
|
||||||
def http2time(text):
|
def http2time(text):
|
||||||
"""Returns time in seconds since epoch of time represented by a string.
|
"""Returns time in seconds since epoch of time represented by a string.
|
||||||
|
|
||||||
|
@ -282,7 +282,7 @@ ISO_DATE_RE = re.compile(
|
||||||
\s*
|
\s*
|
||||||
([-+]?\d\d?:?(:?\d\d)?
|
([-+]?\d\d?:?(:?\d\d)?
|
||||||
|Z|z)? # timezone (Z is "zero meridian", i.e. GMT)
|
|Z|z)? # timezone (Z is "zero meridian", i.e. GMT)
|
||||||
\s*$""", re.X)
|
\s*$""", re.X | re. ASCII)
|
||||||
def iso2time(text):
|
def iso2time(text):
|
||||||
"""
|
"""
|
||||||
As for http2time, but parses the ISO 8601 formats:
|
As for http2time, but parses the ISO 8601 formats:
|
||||||
|
@ -489,7 +489,7 @@ def parse_ns_headers(ns_headers):
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
IPV4_RE = re.compile(r"\.\d+$")
|
IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
|
||||||
def is_HDN(text):
|
def is_HDN(text):
|
||||||
"""Return True if text is a host domain name."""
|
"""Return True if text is a host domain name."""
|
||||||
# XXX
|
# XXX
|
||||||
|
@ -574,7 +574,7 @@ def user_domain_match(A, B):
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
cut_port_re = re.compile(r":\d+$")
|
cut_port_re = re.compile(r":\d+$", re.ASCII)
|
||||||
def request_host(request):
|
def request_host(request):
|
||||||
"""Return request-host, as defined by RFC 2965.
|
"""Return request-host, as defined by RFC 2965.
|
||||||
|
|
||||||
|
@ -1207,7 +1207,7 @@ class CookieJar:
|
||||||
domain_re = re.compile(r"[^.]*")
|
domain_re = re.compile(r"[^.]*")
|
||||||
dots_re = re.compile(r"^\.+")
|
dots_re = re.compile(r"^\.+")
|
||||||
|
|
||||||
magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"
|
magic_re = re.compile(r"^\#LWP-Cookies-(\d+\.\d+)", re.ASCII)
|
||||||
|
|
||||||
def __init__(self, policy=None):
|
def __init__(self, policy=None):
|
||||||
if policy is None:
|
if policy is None:
|
||||||
|
@ -1856,7 +1856,7 @@ class LWPCookieJar(FileCookieJar):
|
||||||
|
|
||||||
def _really_load(self, f, filename, ignore_discard, ignore_expires):
|
def _really_load(self, f, filename, ignore_discard, ignore_expires):
|
||||||
magic = f.readline()
|
magic = f.readline()
|
||||||
if not re.search(self.magic_re, magic):
|
if not self.magic_re.search(magic):
|
||||||
msg = ("%r does not look like a Set-Cookie3 (LWP) format "
|
msg = ("%r does not look like a Set-Cookie3 (LWP) format "
|
||||||
"file" % filename)
|
"file" % filename)
|
||||||
raise LoadError(msg)
|
raise LoadError(msg)
|
||||||
|
@ -1965,7 +1965,7 @@ class MozillaCookieJar(FileCookieJar):
|
||||||
header by default (Mozilla can cope with that).
|
header by default (Mozilla can cope with that).
|
||||||
|
|
||||||
"""
|
"""
|
||||||
magic_re = "#( Netscape)? HTTP Cookie File"
|
magic_re = re.compile("#( Netscape)? HTTP Cookie File")
|
||||||
header = """\
|
header = """\
|
||||||
# Netscape HTTP Cookie File
|
# Netscape HTTP Cookie File
|
||||||
# http://www.netscape.com/newsref/std/cookie_spec.html
|
# http://www.netscape.com/newsref/std/cookie_spec.html
|
||||||
|
@ -1977,7 +1977,7 @@ class MozillaCookieJar(FileCookieJar):
|
||||||
now = time.time()
|
now = time.time()
|
||||||
|
|
||||||
magic = f.readline()
|
magic = f.readline()
|
||||||
if not re.search(self.magic_re, magic):
|
if not self.magic_re.search(magic):
|
||||||
f.close()
|
f.close()
|
||||||
raise LoadError(
|
raise LoadError(
|
||||||
"%r does not look like a Netscape format cookies file" %
|
"%r does not look like a Netscape format cookies file" %
|
||||||
|
|
|
@ -445,7 +445,7 @@ _CookiePattern = re.compile(
|
||||||
""+ _LegalCharsPatt +"*" # Any word or empty string
|
""+ _LegalCharsPatt +"*" # Any word or empty string
|
||||||
r")" # End of group 'val'
|
r")" # End of group 'val'
|
||||||
r"\s*;?" # Probably ending in a semi-colon
|
r"\s*;?" # Probably ending in a semi-colon
|
||||||
)
|
, re.ASCII) # May be removed if safe.
|
||||||
|
|
||||||
|
|
||||||
# At long last, here is the cookie class.
|
# At long last, here is the cookie class.
|
||||||
|
|
|
@ -88,11 +88,12 @@ InternalDate = re.compile(r'.*INTERNALDATE "'
|
||||||
r' (?P<hour>[0-9][0-9]):(?P<min>[0-9][0-9]):(?P<sec>[0-9][0-9])'
|
r' (?P<hour>[0-9][0-9]):(?P<min>[0-9][0-9]):(?P<sec>[0-9][0-9])'
|
||||||
r' (?P<zonen>[-+])(?P<zoneh>[0-9][0-9])(?P<zonem>[0-9][0-9])'
|
r' (?P<zonen>[-+])(?P<zoneh>[0-9][0-9])(?P<zonem>[0-9][0-9])'
|
||||||
r'"')
|
r'"')
|
||||||
Literal = re.compile(r'.*{(?P<size>\d+)}$')
|
Literal = re.compile(r'.*{(?P<size>\d+)}$', re.ASCII)
|
||||||
MapCRLF = re.compile(r'\r\n|\r|\n')
|
MapCRLF = re.compile(r'\r\n|\r|\n')
|
||||||
Response_code = re.compile(r'\[(?P<type>[A-Z-]+)( (?P<data>[^\]]*))?\]')
|
Response_code = re.compile(r'\[(?P<type>[A-Z-]+)( (?P<data>[^\]]*))?\]')
|
||||||
Untagged_response = re.compile(r'\* (?P<type>[A-Z-]+)( (?P<data>.*))?')
|
Untagged_response = re.compile(r'\* (?P<type>[A-Z-]+)( (?P<data>.*))?')
|
||||||
Untagged_status = re.compile(r'\* (?P<data>\d+) (?P<type>[A-Z-]+)( (?P<data2>.*))?')
|
Untagged_status = re.compile(
|
||||||
|
r'\* (?P<data>\d+) (?P<type>[A-Z-]+)( (?P<data2>.*))?', re.ASCII)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -146,7 +147,7 @@ class IMAP4:
|
||||||
class abort(error): pass # Service errors - close and retry
|
class abort(error): pass # Service errors - close and retry
|
||||||
class readonly(abort): pass # Mailbox status changed to READ-ONLY
|
class readonly(abort): pass # Mailbox status changed to READ-ONLY
|
||||||
|
|
||||||
mustquote = re.compile(r"[^\w!#$%&'*+,.:;<=>?^`|~-]")
|
mustquote = re.compile(r"[^\w!#$%&'*+,.:;<=>?^`|~-]", re.ASCII)
|
||||||
|
|
||||||
def __init__(self, host = '', port = IMAP4_PORT):
|
def __init__(self, host = '', port = IMAP4_PORT):
|
||||||
self.debug = Debug
|
self.debug = Debug
|
||||||
|
@ -168,7 +169,7 @@ class IMAP4:
|
||||||
self.tagpre = Int2AP(random.randint(4096, 65535))
|
self.tagpre = Int2AP(random.randint(4096, 65535))
|
||||||
self.tagre = re.compile(r'(?P<tag>'
|
self.tagre = re.compile(r'(?P<tag>'
|
||||||
+ self.tagpre
|
+ self.tagpre
|
||||||
+ r'\d+) (?P<type>[A-Z]+) (?P<data>.*)')
|
+ r'\d+) (?P<type>[A-Z]+) (?P<data>.*)', re.ASCII)
|
||||||
|
|
||||||
# Get server welcome message,
|
# Get server welcome message,
|
||||||
# request and store CAPABILITY response.
|
# request and store CAPABILITY response.
|
||||||
|
|
|
@ -67,7 +67,7 @@ def JSONNumber(match, context):
|
||||||
fn = getattr(context, 'parse_int', None) or int
|
fn = getattr(context, 'parse_int', None) or int
|
||||||
res = fn(integer)
|
res = fn(integer)
|
||||||
return res, None
|
return res, None
|
||||||
pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber)
|
pattern(r'(-?(?:0|[1-9][0-9]*))(\.[0-9]+)?([eE][-+]?[0-9]+)?')(JSONNumber)
|
||||||
|
|
||||||
|
|
||||||
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
|
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
|
||||||
|
|
|
@ -199,7 +199,7 @@ class TimedRotatingFileHandler(BaseRotatingHandler):
|
||||||
else:
|
else:
|
||||||
raise ValueError("Invalid rollover interval specified: %s" % self.when)
|
raise ValueError("Invalid rollover interval specified: %s" % self.when)
|
||||||
|
|
||||||
self.extMatch = re.compile(self.extMatch)
|
self.extMatch = re.compile(self.extMatch, re.ASCII)
|
||||||
self.interval = self.interval * interval # multiply by units requested
|
self.interval = self.interval * interval # multiply by units requested
|
||||||
self.rolloverAt = currentTime + self.interval
|
self.rolloverAt = currentTime + self.interval
|
||||||
|
|
||||||
|
|
|
@ -118,7 +118,7 @@ _libc_search = re.compile(r'(__libc_init)'
|
||||||
'|'
|
'|'
|
||||||
'(GLIBC_([0-9.]+))'
|
'(GLIBC_([0-9.]+))'
|
||||||
'|'
|
'|'
|
||||||
'(libc(_\w+)?\.so(?:\.(\d[0-9.]*))?)')
|
'(libc(_\w+)?\.so(?:\.(\d[0-9.]*))?)', re.ASCII)
|
||||||
|
|
||||||
def libc_ver(executable=sys.executable,lib='',version='',
|
def libc_ver(executable=sys.executable,lib='',version='',
|
||||||
|
|
||||||
|
@ -223,15 +223,15 @@ def _dist_try_harder(distname,version,id):
|
||||||
|
|
||||||
return distname,version,id
|
return distname,version,id
|
||||||
|
|
||||||
_release_filename = re.compile(r'(\w+)[-_](release|version)')
|
_release_filename = re.compile(r'(\w+)[-_](release|version)', re.ASCII)
|
||||||
_lsb_release_version = re.compile(r'(.+)'
|
_lsb_release_version = re.compile(r'(.+)'
|
||||||
' release '
|
' release '
|
||||||
'([\d.]+)'
|
'([\d.]+)'
|
||||||
'[^(]*(?:\((.+)\))?')
|
'[^(]*(?:\((.+)\))?', re.ASCII)
|
||||||
_release_version = re.compile(r'([^0-9]+)'
|
_release_version = re.compile(r'([^0-9]+)'
|
||||||
'(?: release )?'
|
'(?: release )?'
|
||||||
'([\d.]+)'
|
'([\d.]+)'
|
||||||
'[^(]*(?:\((.+)\))?')
|
'[^(]*(?:\((.+)\))?', re.ASCII)
|
||||||
|
|
||||||
# See also http://www.novell.com/coolsolutions/feature/11251.html
|
# See also http://www.novell.com/coolsolutions/feature/11251.html
|
||||||
# and http://linuxmafia.com/faq/Admin/release-files.html
|
# and http://linuxmafia.com/faq/Admin/release-files.html
|
||||||
|
@ -464,7 +464,7 @@ def _norm_version(version, build=''):
|
||||||
|
|
||||||
_ver_output = re.compile(r'(?:([\w ]+) ([\w.]+) '
|
_ver_output = re.compile(r'(?:([\w ]+) ([\w.]+) '
|
||||||
'.*'
|
'.*'
|
||||||
'Version ([\d.]+))')
|
'Version ([\d.]+))', re.ASCII)
|
||||||
|
|
||||||
def _syscmd_ver(system='', release='', version='',
|
def _syscmd_ver(system='', release='', version='',
|
||||||
|
|
||||||
|
@ -1253,16 +1253,16 @@ def processor():
|
||||||
_sys_version_parser = re.compile(
|
_sys_version_parser = re.compile(
|
||||||
r'([\w.+]+)\s*'
|
r'([\w.+]+)\s*'
|
||||||
'\(#?([^,]+),\s*([\w ]+),\s*([\w :]+)\)\s*'
|
'\(#?([^,]+),\s*([\w ]+),\s*([\w :]+)\)\s*'
|
||||||
'\[([^\]]+)\]?')
|
'\[([^\]]+)\]?', re.ASCII)
|
||||||
|
|
||||||
_jython_sys_version_parser = re.compile(
|
_jython_sys_version_parser = re.compile(
|
||||||
r'([\d\.]+)')
|
r'([\d\.]+)', re.ASCII)
|
||||||
|
|
||||||
_ironpython_sys_version_parser = re.compile(
|
_ironpython_sys_version_parser = re.compile(
|
||||||
r'IronPython\s*'
|
r'IronPython\s*'
|
||||||
'([\d\.]+)'
|
'([\d\.]+)'
|
||||||
'(?: \(([\d\.]+)\))?'
|
'(?: \(([\d\.]+)\))?'
|
||||||
' on (.NET [\d\.]+)')
|
' on (.NET [\d\.]+)', re.ASCII)
|
||||||
|
|
||||||
_sys_version_cache = {}
|
_sys_version_cache = {}
|
||||||
|
|
||||||
|
|
|
@ -147,7 +147,7 @@ class DumbXMLWriter:
|
||||||
# Contents should conform to a subset of ISO 8601
|
# Contents should conform to a subset of ISO 8601
|
||||||
# (in particular, YYYY '-' MM '-' DD 'T' HH ':' MM ':' SS 'Z'. Smaller units may be omitted with
|
# (in particular, YYYY '-' MM '-' DD 'T' HH ':' MM ':' SS 'Z'. Smaller units may be omitted with
|
||||||
# a loss of precision)
|
# a loss of precision)
|
||||||
_dateParser = re.compile(r"(?P<year>\d\d\d\d)(?:-(?P<month>\d\d)(?:-(?P<day>\d\d)(?:T(?P<hour>\d\d)(?::(?P<minute>\d\d)(?::(?P<second>\d\d))?)?)?)?)?Z")
|
_dateParser = re.compile(r"(?P<year>\d\d\d\d)(?:-(?P<month>\d\d)(?:-(?P<day>\d\d)(?:T(?P<hour>\d\d)(?::(?P<minute>\d\d)(?::(?P<second>\d\d))?)?)?)?)?Z", re.ASCII)
|
||||||
|
|
||||||
def _dateFromString(s):
|
def _dateFromString(s):
|
||||||
order = ('year', 'month', 'day', 'hour', 'minute', 'second')
|
order = ('year', 'month', 'day', 'hour', 'minute', 'second')
|
||||||
|
|
|
@ -241,7 +241,7 @@ def expandvars(path):
|
||||||
return path
|
return path
|
||||||
if not _varprog:
|
if not _varprog:
|
||||||
import re
|
import re
|
||||||
_varprog = re.compile(r'\$(\w+|\{[^}]*\})')
|
_varprog = re.compile(r'\$(\w+|\{[^}]*\})', re.ASCII)
|
||||||
i = 0
|
i = 0
|
||||||
while True:
|
while True:
|
||||||
m = _varprog.search(path, i)
|
m = _varprog.search(path, i)
|
||||||
|
|
|
@ -86,7 +86,7 @@ def read_encoding(file, default):
|
||||||
line = f.readline()
|
line = f.readline()
|
||||||
if not line:
|
if not line:
|
||||||
break
|
break
|
||||||
m = re.match(r".*\bcoding:\s*(\S+)\b", line)
|
m = re.match(br".*\bcoding:\s*(\S+)\b", line)
|
||||||
if m:
|
if m:
|
||||||
return m.group(1).decode("ascii")
|
return m.group(1).decode("ascii")
|
||||||
return default
|
return default
|
||||||
|
|
31
Lib/re.py
31
Lib/re.py
|
@ -44,7 +44,7 @@ The special characters are:
|
||||||
"|" A|B, creates an RE that will match either A or B.
|
"|" A|B, creates an RE that will match either A or B.
|
||||||
(...) Matches the RE inside the parentheses.
|
(...) Matches the RE inside the parentheses.
|
||||||
The contents can be retrieved or matched later in the string.
|
The contents can be retrieved or matched later in the string.
|
||||||
(?iLmsux) Set the I, L, M, S, U, or X flag for the RE (see below).
|
(?aiLmsux) Set the A, I, L, M, S, U, or X flag for the RE (see below).
|
||||||
(?:...) Non-grouping version of regular parentheses.
|
(?:...) Non-grouping version of regular parentheses.
|
||||||
(?P<name>...) The substring matched by the group is accessible by name.
|
(?P<name>...) The substring matched by the group is accessible by name.
|
||||||
(?P=name) Matches the text matched earlier by the group named name.
|
(?P=name) Matches the text matched earlier by the group named name.
|
||||||
|
@ -64,11 +64,18 @@ resulting RE will match the second character.
|
||||||
\Z Matches only at the end of the string.
|
\Z Matches only at the end of the string.
|
||||||
\b Matches the empty string, but only at the start or end of a word.
|
\b Matches the empty string, but only at the start or end of a word.
|
||||||
\B Matches the empty string, but not at the start or end of a word.
|
\B Matches the empty string, but not at the start or end of a word.
|
||||||
\d Matches any decimal digit; equivalent to the set [0-9].
|
\d Matches any decimal digit; equivalent to the set [0-9] in
|
||||||
\D Matches any non-digit character; equivalent to the set [^0-9].
|
bytes patterns or string patterns with the ASCII flag.
|
||||||
|
In string patterns without the ASCII flag, it will match the whole
|
||||||
|
range of Unicode digits.
|
||||||
|
\D Matches any non-digit character; equivalent to [^\d].
|
||||||
\s Matches any whitespace character; equivalent to [ \t\n\r\f\v].
|
\s Matches any whitespace character; equivalent to [ \t\n\r\f\v].
|
||||||
\S Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
|
\S Matches any non-whitespace character; equiv. to [^ \t\n\r\f\v].
|
||||||
\w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_].
|
\w Matches any alphanumeric character; equivalent to [a-zA-Z0-9_]
|
||||||
|
in bytes patterns or string patterns with the ASCII flag.
|
||||||
|
In string patterns without the ASCII flag, it will match the
|
||||||
|
range of Unicode alphanumeric characters (letters plus digits
|
||||||
|
plus underscore).
|
||||||
With LOCALE, it will match the set [0-9_] plus characters defined
|
With LOCALE, it will match the set [0-9_] plus characters defined
|
||||||
as letters for the current locale.
|
as letters for the current locale.
|
||||||
\W Matches the complement of \w.
|
\W Matches the complement of \w.
|
||||||
|
@ -87,6 +94,12 @@ This module exports the following functions:
|
||||||
escape Backslash all non-alphanumerics in a string.
|
escape Backslash all non-alphanumerics in a string.
|
||||||
|
|
||||||
Some of the functions in this module take flags as optional parameters:
|
Some of the functions in this module take flags as optional parameters:
|
||||||
|
A ASCII For string patterns, make \w, \W, \b, \B, \d, \D
|
||||||
|
match the corresponding ASCII character categories
|
||||||
|
(rather than the whole Unicode categories, which is the
|
||||||
|
default).
|
||||||
|
For bytes patterns, this flag is the only available
|
||||||
|
behaviour and needn't be specified.
|
||||||
I IGNORECASE Perform case-insensitive matching.
|
I IGNORECASE Perform case-insensitive matching.
|
||||||
L LOCALE Make \w, \W, \b, \B dependent on the current locale.
|
L LOCALE Make \w, \W, \b, \B dependent on the current locale.
|
||||||
M MULTILINE "^" matches the beginning of lines (after a newline)
|
M MULTILINE "^" matches the beginning of lines (after a newline)
|
||||||
|
@ -95,7 +108,8 @@ Some of the functions in this module takes flags as optional parameters:
|
||||||
as the end of the string.
|
as the end of the string.
|
||||||
S DOTALL "." matches any character at all, including the newline.
|
S DOTALL "." matches any character at all, including the newline.
|
||||||
X VERBOSE Ignore whitespace and comments for nicer looking RE's.
|
X VERBOSE Ignore whitespace and comments for nicer looking RE's.
|
||||||
U UNICODE Make \w, \W, \b, \B, dependent on the Unicode locale.
|
U UNICODE For compatibility only. Ignored for string patterns (it
|
||||||
|
is the default), and forbidden for bytes patterns.
|
||||||
|
|
||||||
This module also defines an exception 'error'.
|
This module also defines an exception 'error'.
|
||||||
|
|
||||||
|
@ -107,16 +121,17 @@ import sre_parse
|
||||||
|
|
||||||
# public symbols
|
# public symbols
|
||||||
__all__ = [ "match", "search", "sub", "subn", "split", "findall",
|
__all__ = [ "match", "search", "sub", "subn", "split", "findall",
|
||||||
"compile", "purge", "template", "escape", "I", "L", "M", "S", "X",
|
"compile", "purge", "template", "escape", "A", "I", "L", "M", "S", "X",
|
||||||
"U", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
|
"U", "ASCII", "IGNORECASE", "LOCALE", "MULTILINE", "DOTALL", "VERBOSE",
|
||||||
"UNICODE", "error" ]
|
"UNICODE", "error" ]
|
||||||
|
|
||||||
__version__ = "2.2.1"
|
__version__ = "2.2.1"
|
||||||
|
|
||||||
# flags
|
# flags
|
||||||
|
A = ASCII = sre_compile.SRE_FLAG_ASCII # assume ascii "locale"
|
||||||
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
|
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE # ignore case
|
||||||
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
|
L = LOCALE = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
|
||||||
U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode locale
|
U = UNICODE = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale"
|
||||||
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
|
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
|
||||||
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
|
S = DOTALL = sre_compile.SRE_FLAG_DOTALL # make dot match newline
|
||||||
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
|
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
|
||||||
|
|
|
@ -207,9 +207,10 @@ SRE_FLAG_IGNORECASE = 2 # case insensitive
|
||||||
SRE_FLAG_LOCALE = 4 # honour system locale
|
SRE_FLAG_LOCALE = 4 # honour system locale
|
||||||
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
|
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
|
||||||
SRE_FLAG_DOTALL = 16 # treat target as a single string
|
SRE_FLAG_DOTALL = 16 # treat target as a single string
|
||||||
SRE_FLAG_UNICODE = 32 # use unicode locale
|
SRE_FLAG_UNICODE = 32 # use unicode "locale"
|
||||||
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
|
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
|
||||||
SRE_FLAG_DEBUG = 128 # debugging
|
SRE_FLAG_DEBUG = 128 # debugging
|
||||||
|
SRE_FLAG_ASCII = 256 # use ascii "locale"
|
||||||
|
|
||||||
# flags for INFO primitive
|
# flags for INFO primitive
|
||||||
SRE_INFO_PREFIX = 1 # has prefix
|
SRE_INFO_PREFIX = 1 # has prefix
|
||||||
|
|
|
@ -64,6 +64,7 @@ FLAGS = {
|
||||||
"s": SRE_FLAG_DOTALL,
|
"s": SRE_FLAG_DOTALL,
|
||||||
"x": SRE_FLAG_VERBOSE,
|
"x": SRE_FLAG_VERBOSE,
|
||||||
# extensions
|
# extensions
|
||||||
|
"a": SRE_FLAG_ASCII,
|
||||||
"t": SRE_FLAG_TEMPLATE,
|
"t": SRE_FLAG_TEMPLATE,
|
||||||
"u": SRE_FLAG_UNICODE,
|
"u": SRE_FLAG_UNICODE,
|
||||||
}
|
}
|
||||||
|
@ -672,6 +673,18 @@ def _parse(source, state):
|
||||||
|
|
||||||
return subpattern
|
return subpattern
|
||||||
|
|
||||||
|
def fix_flags(src, flags):
    # Validate the flags against the pattern type and return the
    # (possibly adjusted) flags:
    #   - bytes patterns may not carry the UNICODE flag;
    #   - str patterns get UNICODE switched on unless ASCII was requested;
    #   - str patterns may not combine ASCII with UNICODE.
    unicode_requested = flags & SRE_FLAG_UNICODE

    if not isinstance(src, str):
        # Bytes (or other non-str) pattern: ASCII matching only.
        if unicode_requested:
            raise ValueError("can't use UNICODE flag with a bytes pattern")
        return flags

    if not flags & SRE_FLAG_ASCII:
        # Unicode matching is the default for str patterns.
        return flags | SRE_FLAG_UNICODE

    if unicode_requested:
        raise ValueError("ASCII and UNICODE flags are incompatible")
    return flags
|
||||||
|
|
||||||
def parse(str, flags=0, pattern=None):
|
def parse(str, flags=0, pattern=None):
|
||||||
# parse 're' pattern into list of (opcode, argument) tuples
|
# parse 're' pattern into list of (opcode, argument) tuples
|
||||||
|
|
||||||
|
@ -683,6 +696,7 @@ def parse(str, flags=0, pattern=None):
|
||||||
pattern.str = str
|
pattern.str = str
|
||||||
|
|
||||||
p = _parse_sub(source, pattern, 0)
|
p = _parse_sub(source, pattern, 0)
|
||||||
|
p.pattern.flags = fix_flags(str, p.pattern.flags)
|
||||||
|
|
||||||
tail = source.get()
|
tail = source.get()
|
||||||
if tail == ")":
|
if tail == ")":
|
||||||
|
|
|
@ -1368,7 +1368,7 @@ class TarInfo(object):
|
||||||
# "%d %s=%s\n" % (length, keyword, value). length is the size
|
# "%d %s=%s\n" % (length, keyword, value). length is the size
|
||||||
# of the complete record including the length field itself and
|
# of the complete record including the length field itself and
|
||||||
# the newline. keyword and value are both UTF-8 encoded strings.
|
# the newline. keyword and value are both UTF-8 encoded strings.
|
||||||
regex = re.compile(r"(\d+) ([^=]+)=", re.U)
|
regex = re.compile(br"(\d+) ([^=]+)=")
|
||||||
pos = 0
|
pos = 0
|
||||||
while True:
|
while True:
|
||||||
match = regex.match(buf, pos)
|
match = regex.match(buf, pos)
|
||||||
|
|
|
@ -667,4 +667,4 @@ tests.extend([
|
||||||
(r'\b.\b', 'a', SUCCEED, 'found', 'a'),
|
(r'\b.\b', 'a', SUCCEED, 'found', 'a'),
|
||||||
(r'(?u)\b.\b', u, SUCCEED, 'found', u),
|
(r'(?u)\b.\b', u, SUCCEED, 'found', u),
|
||||||
(r'(?u)\w', u, SUCCEED, 'found', u),
|
(r'(?u)\w', u, SUCCEED, 'found', u),
|
||||||
])
|
])
|
||||||
|
|
|
@ -506,7 +506,7 @@ class ByteArrayTest(BaseBytesTest):
|
||||||
def by(s):
|
def by(s):
|
||||||
return bytearray(map(ord, s))
|
return bytearray(map(ord, s))
|
||||||
b = by("Hello, world")
|
b = by("Hello, world")
|
||||||
self.assertEqual(re.findall(r"\w+", b), [by("Hello"), by("world")])
|
self.assertEqual(re.findall(br"\w+", b), [by("Hello"), by("world")])
|
||||||
|
|
||||||
def test_setitem(self):
|
def test_setitem(self):
|
||||||
b = bytearray([1, 2, 3])
|
b = bytearray([1, 2, 3])
|
||||||
|
|
|
@ -54,7 +54,7 @@ class MmapTests(unittest.TestCase):
|
||||||
m.flush()
|
m.flush()
|
||||||
|
|
||||||
# Test doing a regular expression match in an mmap'ed file
|
# Test doing a regular expression match in an mmap'ed file
|
||||||
match = re.search('[A-Za-z]+', m)
|
match = re.search(b'[A-Za-z]+', m)
|
||||||
if match is None:
|
if match is None:
|
||||||
self.fail('regex match on mmap failed!')
|
self.fail('regex match on mmap failed!')
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -83,23 +83,6 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
|
self.assertEqual(re.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
|
||||||
'abc\ndef\n')
|
'abc\ndef\n')
|
||||||
|
|
||||||
def test_bug_1140(self):
|
|
||||||
# re.sub(x, y, b'') should return b'', not '', and
|
|
||||||
# re.sub(x, y, '') should return '', not b''.
|
|
||||||
# Also:
|
|
||||||
# re.sub(x, y, str(x)) should return str(y), and
|
|
||||||
# re.sub(x, y, bytes(x)) should return
|
|
||||||
# str(y) if isinstance(y, str) else unicode(y).
|
|
||||||
for x in 'x', b'x':
|
|
||||||
for y in 'y', b'y':
|
|
||||||
z = re.sub(x, y, b'')
|
|
||||||
self.assertEqual(z, b'')
|
|
||||||
self.assertEqual(type(z), bytes)
|
|
||||||
#
|
|
||||||
z = re.sub(x, y, '')
|
|
||||||
self.assertEqual(z, '')
|
|
||||||
self.assertEqual(type(z), str)
|
|
||||||
|
|
||||||
def test_bug_1661(self):
|
def test_bug_1661(self):
|
||||||
# Verify that flags do not get silently ignored with compiled patterns
|
# Verify that flags do not get silently ignored with compiled patterns
|
||||||
pattern = re.compile('.')
|
pattern = re.compile('.')
|
||||||
|
@ -327,7 +310,7 @@ class ReTests(unittest.TestCase):
|
||||||
|
|
||||||
def test_getattr(self):
|
def test_getattr(self):
|
||||||
self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
|
self.assertEqual(re.compile("(?i)(a)(b)").pattern, "(?i)(a)(b)")
|
||||||
self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I)
|
self.assertEqual(re.compile("(?i)(a)(b)").flags, re.I | re.U)
|
||||||
self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
|
self.assertEqual(re.compile("(?i)(a)(b)").groups, 2)
|
||||||
self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
|
self.assertEqual(re.compile("(?i)(a)(b)").groupindex, {})
|
||||||
self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
|
self.assertEqual(re.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
|
||||||
|
@ -614,8 +597,8 @@ class ReTests(unittest.TestCase):
|
||||||
import array
|
import array
|
||||||
for typecode in 'bBuhHiIlLfd':
|
for typecode in 'bBuhHiIlLfd':
|
||||||
a = array.array(typecode)
|
a = array.array(typecode)
|
||||||
self.assertEqual(re.compile("bla").match(a), None)
|
self.assertEqual(re.compile(b"bla").match(a), None)
|
||||||
self.assertEqual(re.compile("").match(a).groups(), ())
|
self.assertEqual(re.compile(b"").match(a).groups(), ())
|
||||||
|
|
||||||
def test_inline_flags(self):
|
def test_inline_flags(self):
|
||||||
# Bug #1700
|
# Bug #1700
|
||||||
|
@ -658,6 +641,48 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
|
self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
|
||||||
self.assertEqual(pattern.sub('#', '\n'), '#\n#')
|
self.assertEqual(pattern.sub('#', '\n'), '#\n#')
|
||||||
|
|
||||||
|
def test_bytes_str_mixing(self):
    # A str pattern must only be used with str arguments and a bytes
    # pattern only with bytes arguments; every mixed combination below
    # has to raise TypeError.
    str_pattern = re.compile('.')
    bytes_pattern = re.compile(b'.')
    mismatched_calls = (
        # (bound method, positional arguments)
        (str_pattern.match, (b'b',)),
        (bytes_pattern.match, ('b',)),
        (str_pattern.sub, (b'b', 'c')),
        (str_pattern.sub, ('b', b'c')),
        (str_pattern.sub, (b'b', b'c')),
        (bytes_pattern.sub, (b'b', 'c')),
        (bytes_pattern.sub, ('b', b'c')),
        (bytes_pattern.sub, ('b', 'c')),
    )
    for method, arguments in mismatched_calls:
        self.assertRaises(TypeError, method, *arguments)
|
||||||
|
|
||||||
|
def test_ascii_and_unicode_flag(self):
    # '\xc0' and '\xe0' are the upper- and lower-case forms of the same
    # non-ASCII letter, so they only match each other case-insensitively
    # (and only count as \w) under Unicode matching.

    # String patterns: Unicode matching is the default, with or without
    # an explicit re.UNICODE flag.
    for flags in (0, re.UNICODE):
        pat = re.compile('\xc0', flags | re.IGNORECASE)
        self.assertNotEqual(pat.match('\xe0'), None)
        pat = re.compile(r'\w', flags)
        self.assertNotEqual(pat.match('\xe0'), None)
    # String patterns with ASCII matching requested, either through the
    # flags argument or through the inline (?a) flag.
    pat = re.compile('\xc0', re.ASCII | re.IGNORECASE)
    self.assertEqual(pat.match('\xe0'), None)
    pat = re.compile('(?a)\xc0', re.IGNORECASE)
    self.assertEqual(pat.match('\xe0'), None)
    pat = re.compile(r'\w', re.ASCII)
    self.assertEqual(pat.match('\xe0'), None)
    pat = re.compile(r'(?a)\w')
    self.assertEqual(pat.match('\xe0'), None)
    # Bytes patterns: ASCII matching is the only behaviour.  `flags` is
    # passed through so that both the implicit (0) and the explicit
    # (re.ASCII) spelling are actually exercised.
    for flags in (0, re.ASCII):
        pat = re.compile(b'\xc0', flags | re.IGNORECASE)
        self.assertEqual(pat.match(b'\xe0'), None)
        pat = re.compile(br'\w', flags)
        self.assertEqual(pat.match(b'\xe0'), None)
    # Incompatibilities: UNICODE on a bytes pattern, and ASCII combined
    # with UNICODE on a string pattern, must be rejected however the
    # flags are spelled (argument, inline, or a mix of both).
    self.assertRaises(ValueError, re.compile, br'\w', re.UNICODE)
    self.assertRaises(ValueError, re.compile, br'(?u)\w')
    self.assertRaises(ValueError, re.compile, r'\w', re.UNICODE | re.ASCII)
    self.assertRaises(ValueError, re.compile, r'(?u)\w', re.ASCII)
    self.assertRaises(ValueError, re.compile, r'(?a)\w', re.UNICODE)
    self.assertRaises(ValueError, re.compile, r'(?au)\w')
|
||||||
|
|
||||||
|
|
||||||
def run_re_tests():
|
def run_re_tests():
|
||||||
from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR
|
from test.re_tests import benchmarks, tests, SUCCEED, FAIL, SYNTAX_ERROR
|
||||||
|
|
|
@ -47,21 +47,23 @@ def group(*choices): return '(' + '|'.join(choices) + ')'
|
||||||
def any(*choices): return group(*choices) + '*'
|
def any(*choices): return group(*choices) + '*'
|
||||||
def maybe(*choices): return group(*choices) + '?'
|
def maybe(*choices): return group(*choices) + '?'
|
||||||
|
|
||||||
|
# Note: we use unicode matching for names ("\w") but ascii matching for
|
||||||
|
# number literals.
|
||||||
Whitespace = r'[ \f\t]*'
|
Whitespace = r'[ \f\t]*'
|
||||||
Comment = r'#[^\r\n]*'
|
Comment = r'#[^\r\n]*'
|
||||||
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
|
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
|
||||||
Name = r'[a-zA-Z_]\w*'
|
Name = r'[a-zA-Z_]\w*'
|
||||||
|
|
||||||
Hexnumber = r'0[xX][\da-fA-F]+'
|
Hexnumber = r'0[xX][0-9a-fA-F]+'
|
||||||
Binnumber = r'0[bB][01]+'
|
Binnumber = r'0[bB][01]+'
|
||||||
Octnumber = r'0[oO][0-7]+'
|
Octnumber = r'0[oO][0-7]+'
|
||||||
Decnumber = r'(?:0+|[1-9]\d*)'
|
Decnumber = r'(?:0+|[1-9][0-9]*)'
|
||||||
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
|
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
|
||||||
Exponent = r'[eE][-+]?\d+'
|
Exponent = r'[eE][-+]?[0-9]+'
|
||||||
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
|
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
|
||||||
Expfloat = r'\d+' + Exponent
|
Expfloat = r'[0-9]+' + Exponent
|
||||||
Floatnumber = group(Pointfloat, Expfloat)
|
Floatnumber = group(Pointfloat, Expfloat)
|
||||||
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
|
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
|
||||||
Number = group(Imagnumber, Floatnumber, Intnumber)
|
Number = group(Imagnumber, Floatnumber, Intnumber)
|
||||||
|
|
||||||
# Tail end of ' string.
|
# Tail end of ' string.
|
||||||
|
|
|
@ -141,7 +141,7 @@ def urlcleanup():
|
||||||
_opener = None
|
_opener = None
|
||||||
|
|
||||||
# copied from cookielib.py
|
# copied from cookielib.py
|
||||||
_cut_port_re = re.compile(r":\d+$")
|
_cut_port_re = re.compile(r":\d+$", re.ASCII)
|
||||||
def request_host(request):
|
def request_host(request):
|
||||||
"""Return request-host, as defined by RFC 2965.
|
"""Return request-host, as defined by RFC 2965.
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,14 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #2834: update the regular expression library to match the unicode
|
||||||
|
standards of py3k. In other words, mixing bytes and unicode strings
|
||||||
|
(be it as pattern, search string or replacement string) raises a TypeError.
|
||||||
|
Moreover, the re.UNICODE flag is enabled automatically for unicode patterns,
|
||||||
|
and can be disabled by specifying a new re.ASCII flag; as for bytes
|
||||||
|
patterns, ASCII matching is the only option and trying to specify re.UNICODE
|
||||||
|
for such patterns raises a ValueError.
|
||||||
|
|
||||||
- Issue #3300: make urllib.parse.[un]quote() default to UTF-8.
|
- Issue #3300: make urllib.parse.[un]quote() default to UTF-8.
|
||||||
Code contributed by Matt Giuca. quote() now encodes the input
|
Code contributed by Matt Giuca. quote() now encodes the input
|
||||||
before quoting, unquote() decodes after unquoting. There are
|
before quoting, unquote() decodes after unquoting. There are
|
||||||
|
|
|
@ -1691,7 +1691,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
|
||||||
/* get pointer to string buffer */
|
/* get pointer to string buffer */
|
||||||
view.len = -1;
|
view.len = -1;
|
||||||
buffer = Py_TYPE(string)->tp_as_buffer;
|
buffer = Py_TYPE(string)->tp_as_buffer;
|
||||||
if (!buffer || !buffer->bf_getbuffer ||
|
if (!buffer || !buffer->bf_getbuffer ||
|
||||||
(*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
|
(*buffer->bf_getbuffer)(string, &view, PyBUF_SIMPLE) < 0) {
|
||||||
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
|
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -1717,7 +1717,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
|
||||||
if (PyBytes_Check(string) || bytes == size)
|
if (PyBytes_Check(string) || bytes == size)
|
||||||
charsize = 1;
|
charsize = 1;
|
||||||
#if defined(HAVE_UNICODE)
|
#if defined(HAVE_UNICODE)
|
||||||
else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
|
else if (bytes == (Py_ssize_t) (size * sizeof(Py_UNICODE)))
|
||||||
charsize = sizeof(Py_UNICODE);
|
charsize = sizeof(Py_UNICODE);
|
||||||
#endif
|
#endif
|
||||||
else {
|
else {
|
||||||
|
@ -1729,7 +1729,7 @@ getstring(PyObject* string, Py_ssize_t* p_length, int* p_charsize)
|
||||||
*p_charsize = charsize;
|
*p_charsize = charsize;
|
||||||
|
|
||||||
if (ptr == NULL) {
|
if (ptr == NULL) {
|
||||||
PyErr_SetString(PyExc_ValueError,
|
PyErr_SetString(PyExc_ValueError,
|
||||||
"Buffer is NULL");
|
"Buffer is NULL");
|
||||||
}
|
}
|
||||||
return ptr;
|
return ptr;
|
||||||
|
@ -1754,6 +1754,17 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
|
||||||
if (!ptr)
|
if (!ptr)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
if (charsize == 1 && pattern->charsize > 1) {
|
||||||
|
PyErr_SetString(PyExc_TypeError,
|
||||||
|
"can't use a string pattern on a bytes-like object");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (charsize > 1 && pattern->charsize == 1) {
|
||||||
|
PyErr_SetString(PyExc_TypeError,
|
||||||
|
"can't use a bytes pattern on a string-like object");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
/* adjust boundaries */
|
/* adjust boundaries */
|
||||||
if (start < 0)
|
if (start < 0)
|
||||||
start = 0;
|
start = 0;
|
||||||
|
@ -2682,6 +2693,16 @@ _compile(PyObject* self_, PyObject* args)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (pattern == Py_None)
|
||||||
|
self->charsize = -1;
|
||||||
|
else {
|
||||||
|
Py_ssize_t p_length;
|
||||||
|
if (!getstring(pattern, &p_length, &self->charsize)) {
|
||||||
|
PyObject_DEL(self);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Py_INCREF(pattern);
|
Py_INCREF(pattern);
|
||||||
self->pattern = pattern;
|
self->pattern = pattern;
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,7 @@ typedef struct {
|
||||||
PyObject* pattern; /* pattern source (or None) */
|
PyObject* pattern; /* pattern source (or None) */
|
||||||
int flags; /* flags used when compiling pattern source */
|
int flags; /* flags used when compiling pattern source */
|
||||||
PyObject *weakreflist; /* List of weak references */
|
PyObject *weakreflist; /* List of weak references */
|
||||||
|
int charsize; /* pattern charsize (or -1) */
|
||||||
/* pattern code */
|
/* pattern code */
|
||||||
Py_ssize_t codesize;
|
Py_ssize_t codesize;
|
||||||
SRE_CODE code[1];
|
SRE_CODE code[1];
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue