mirror of
https://github.com/python/cpython.git
synced 2025-09-18 22:50:26 +00:00
bpo-32308: Replace empty matches adjacent to a previous non-empty match in re.sub(). (#4846)
This commit is contained in:
parent
0cc99c8cd7
commit
fbb490fd2f
6 changed files with 35 additions and 25 deletions
|
@ -1140,12 +1140,12 @@ new string value and the number of replacements that were performed::
|
||||||
>>> p.subn('colour', 'no colours at all')
|
>>> p.subn('colour', 'no colours at all')
|
||||||
('no colours at all', 0)
|
('no colours at all', 0)
|
||||||
|
|
||||||
Empty matches are replaced only when they're not adjacent to a previous match.
|
Empty matches are replaced only when they're not adjacent to a previous empty match.
|
||||||
::
|
::
|
||||||
|
|
||||||
>>> p = re.compile('x*')
|
>>> p = re.compile('x*')
|
||||||
>>> p.sub('-', 'abxd')
|
>>> p.sub('-', 'abxd')
|
||||||
'-a-b-d-'
|
'-a-b--d-'
|
||||||
|
|
||||||
If *replacement* is a string, any backslash escapes in it are processed. That
|
If *replacement* is a string, any backslash escapes in it are processed. That
|
||||||
is, ``\n`` is converted to a single newline character, ``\r`` is converted to a
|
is, ``\n`` is converted to a single newline character, ``\r`` is converted to a
|
||||||
|
|
|
@ -708,12 +708,15 @@ form.
|
||||||
That way, separator components are always found at the same relative
|
That way, separator components are always found at the same relative
|
||||||
indices within the result list.
|
indices within the result list.
|
||||||
|
|
||||||
The pattern can match empty strings. ::
|
Empty matches for the pattern split the string only when not adjacent
|
||||||
|
to a previous empty match.
|
||||||
|
|
||||||
>>> re.split(r'\b', 'Words, words, words.')
|
>>> re.split(r'\b', 'Words, words, words.')
|
||||||
['', 'Words', ', ', 'words', ', ', 'words', '.']
|
['', 'Words', ', ', 'words', ', ', 'words', '.']
|
||||||
|
>>> re.split(r'\W*', '...words...')
|
||||||
|
['', '', 'w', 'o', 'r', 'd', 's', '', '']
|
||||||
>>> re.split(r'(\W*)', '...words...')
|
>>> re.split(r'(\W*)', '...words...')
|
||||||
['', '...', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '']
|
['', '...', '', '', 'w', '', 'o', '', 'r', '', 'd', '', 's', '...', '', '', '']
|
||||||
|
|
||||||
.. versionchanged:: 3.1
|
.. versionchanged:: 3.1
|
||||||
Added the optional flags argument.
|
Added the optional flags argument.
|
||||||
|
@ -778,8 +781,8 @@ form.
|
||||||
The optional argument *count* is the maximum number of pattern occurrences to be
|
The optional argument *count* is the maximum number of pattern occurrences to be
|
||||||
replaced; *count* must be a non-negative integer. If omitted or zero, all
|
replaced; *count* must be a non-negative integer. If omitted or zero, all
|
||||||
occurrences will be replaced. Empty matches for the pattern are replaced only
|
occurrences will be replaced. Empty matches for the pattern are replaced only
|
||||||
when not adjacent to a previous match, so ``sub('x*', '-', 'abc')`` returns
|
when not adjacent to a previous empty match, so ``sub('x*', '-', 'abxd')`` returns
|
||||||
``'-a-b-c-'``.
|
``'-a-b--d-'``.
|
||||||
|
|
||||||
In string-type *repl* arguments, in addition to the character escapes and
|
In string-type *repl* arguments, in addition to the character escapes and
|
||||||
backreferences described above,
|
backreferences described above,
|
||||||
|
@ -805,6 +808,9 @@ form.
|
||||||
Unknown escapes in *repl* consisting of ``'\'`` and an ASCII letter
|
Unknown escapes in *repl* consisting of ``'\'`` and an ASCII letter
|
||||||
now are errors.
|
now are errors.
|
||||||
|
|
||||||
|
Empty matches for the pattern are replaced when adjacent to a previous
|
||||||
|
non-empty match.
|
||||||
|
|
||||||
|
|
||||||
.. function:: subn(pattern, repl, string, count=0, flags=0)
|
.. function:: subn(pattern, repl, string, count=0, flags=0)
|
||||||
|
|
||||||
|
|
|
@ -881,8 +881,9 @@ Changes in the Python API
|
||||||
* The result of splitting a string on a :mod:`regular expression <re>`
|
* The result of splitting a string on a :mod:`regular expression <re>`
|
||||||
that could match an empty string has been changed. For example
|
that could match an empty string has been changed. For example
|
||||||
splitting on ``r'\s*'`` will now split not only on whitespaces as it
|
splitting on ``r'\s*'`` will now split not only on whitespaces as it
|
||||||
did previously, but also between any pair of non-whitespace
|
did previously, but also on empty strings before all non-whitespace
|
||||||
characters. The previous behavior can be restored by changing the pattern
|
characters and just before the end of the string.
|
||||||
|
The previous behavior can be restored by changing the pattern
|
||||||
to ``r'\s+'``. A :exc:`FutureWarning` was emitted for such patterns since
|
to ``r'\s+'``. A :exc:`FutureWarning` was emitted for such patterns since
|
||||||
Python 3.5.
|
Python 3.5.
|
||||||
|
|
||||||
|
@ -893,7 +894,13 @@ Changes in the Python API
|
||||||
positions 2--3. To match only blank lines, the pattern should be rewritten
|
positions 2--3. To match only blank lines, the pattern should be rewritten
|
||||||
as ``r'(?m)^[^\S\n]*$'``.
|
as ``r'(?m)^[^\S\n]*$'``.
|
||||||
|
|
||||||
(Contributed by Serhiy Storchaka in :issue:`25054`.)
|
:func:`re.sub()` now replaces empty matches adjacent to a previous
|
||||||
|
non-empty match. For example, ``re.sub('x*', '-', 'abxd')`` now returns
|
||||||
|
``'-a-b--d-'`` instead of ``'-a-b-d-'`` (the first minus between 'b' and
|
||||||
|
'd' replaces 'x', and the second minus replaces an empty string between
|
||||||
|
'x' and 'd').
|
||||||
|
|
||||||
|
(Contributed by Serhiy Storchaka in :issue:`25054` and :issue:`32308`.)
|
||||||
|
|
||||||
* :class:`tracemalloc.Traceback` frames are now sorted from oldest to most
|
* :class:`tracemalloc.Traceback` frames are now sorted from oldest to most
|
||||||
recent to be more consistent with :mod:`traceback`.
|
recent to be more consistent with :mod:`traceback`.
|
||||||
|
|
|
@ -213,11 +213,6 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
|
self.assertEqual(re.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
|
||||||
'hello there')
|
'hello there')
|
||||||
|
|
||||||
def test_bug_462270(self):
|
|
||||||
# Test for empty sub() behaviour, see SF bug #462270
|
|
||||||
self.assertEqual(re.sub('x*', '-', 'abxd'), '-a-b-d-')
|
|
||||||
self.assertEqual(re.sub('x+', '-', 'abxd'), 'ab-d')
|
|
||||||
|
|
||||||
def test_symbolic_groups(self):
|
def test_symbolic_groups(self):
|
||||||
re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
|
re.compile(r'(?P<a>x)(?P=a)(?(a)y)')
|
||||||
re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
|
re.compile(r'(?P<a1>x)(?P=a1)(?(a1)y)')
|
||||||
|
@ -331,10 +326,10 @@ class ReTests(unittest.TestCase):
|
||||||
['', 'a', '', '', 'c'])
|
['', 'a', '', '', 'c'])
|
||||||
|
|
||||||
for sep, expected in [
|
for sep, expected in [
|
||||||
(':*', ['', 'a', 'b', 'c', '']),
|
(':*', ['', '', 'a', '', 'b', '', 'c', '']),
|
||||||
('(?::*)', ['', 'a', 'b', 'c', '']),
|
('(?::*)', ['', '', 'a', '', 'b', '', 'c', '']),
|
||||||
('(:*)', ['', ':', 'a', ':', 'b', '::', 'c', '', '']),
|
('(:*)', ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', '', 'c', '', '']),
|
||||||
('(:)*', ['', ':', 'a', ':', 'b', ':', 'c', None, '']),
|
('(:)*', ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', None, 'c', None, '']),
|
||||||
]:
|
]:
|
||||||
with self.subTest(sep=sep):
|
with self.subTest(sep=sep):
|
||||||
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
|
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
|
||||||
|
@ -357,7 +352,7 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
|
self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
|
||||||
['', ':', 'a', ':', 'b::c'])
|
['', ':', 'a', ':', 'b::c'])
|
||||||
self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
|
self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
|
||||||
['', ':', 'a', ':', 'b::c'])
|
['', ':', '', '', 'a:b::c'])
|
||||||
|
|
||||||
def test_re_findall(self):
|
def test_re_findall(self):
|
||||||
self.assertEqual(re.findall(":+", "abc"), [])
|
self.assertEqual(re.findall(":+", "abc"), [])
|
||||||
|
@ -1753,13 +1748,13 @@ class ReTests(unittest.TestCase):
|
||||||
def test_zerowidth(self):
|
def test_zerowidth(self):
|
||||||
# Issues 852532, 1647489, 3262, 25054.
|
# Issues 852532, 1647489, 3262, 25054.
|
||||||
self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
|
self.assertEqual(re.split(r"\b", "a::bc"), ['', 'a', '::', 'bc', ''])
|
||||||
self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', 'bc', ''])
|
self.assertEqual(re.split(r"\b|:+", "a::bc"), ['', 'a', '', '', 'bc', ''])
|
||||||
self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', 'bc'])
|
self.assertEqual(re.split(r"(?<!\w)(?=\w)|:+", "a::bc"), ['', 'a', '', 'bc'])
|
||||||
self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
|
self.assertEqual(re.split(r"(?<=\w)(?!\w)|:+", "a::bc"), ['a', '', 'bc', ''])
|
||||||
|
|
||||||
self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
|
self.assertEqual(re.sub(r"\b", "-", "a::bc"), '-a-::-bc-')
|
||||||
self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a--bc-')
|
self.assertEqual(re.sub(r"\b|:+", "-", "a::bc"), '-a---bc-')
|
||||||
self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::]bc[]')
|
self.assertEqual(re.sub(r"(\b|:+)", r"[\1]", "a::bc"), '[]a[][::][]bc[]')
|
||||||
|
|
||||||
self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
|
self.assertEqual(re.findall(r"\b|:+", "a::bc"), ['', '', '::', '', ''])
|
||||||
self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
|
self.assertEqual(re.findall(r"\b|\w+", "a::bc"),
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
:func:`re.sub()` now replaces empty matches adjacent to a previous non-empty
|
||||||
|
match.
|
|
@ -955,7 +955,7 @@ _sre_SRE_Pattern_split_impl(PatternObject *self, PyObject *string,
|
||||||
}
|
}
|
||||||
|
|
||||||
n = n + 1;
|
n = n + 1;
|
||||||
state.must_advance = 1;
|
state.must_advance = (state.ptr == state.start);
|
||||||
last = state.start = state.ptr;
|
last = state.start = state.ptr;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1109,7 +1109,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
|
||||||
|
|
||||||
i = e;
|
i = e;
|
||||||
n = n + 1;
|
n = n + 1;
|
||||||
state.must_advance = 1;
|
state.must_advance = (state.ptr == state.start);
|
||||||
state.start = state.ptr;
|
state.start = state.ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue